176 lines
7.2 KiB
Python
176 lines
7.2 KiB
Python
|
import os
|
||
|
import torch
|
||
|
|
||
|
from torch.utils.data import Sampler
|
||
|
|
||
|
from transformers import Trainer
|
||
|
from transformers.trainer import (
|
||
|
has_length,
|
||
|
)
|
||
|
from typing import List, Optional
|
||
|
|
||
|
|
||
|
def maybe_zero_3(param, ignore_status=False, name=None):
    """Return a detached CPU clone of ``param``, gathering it via DeepSpeed when needed.

    Args:
        param: Tensor-like object. When it carries a ``ds_id`` attribute it is
            handled as a DeepSpeed ZeRO-managed parameter; otherwise it is
            copied directly.
        ignore_status: When false, a message is printed if the parameter's
            ``ds_status`` equals ``ZeroParamStatus.NOT_AVAILABLE``.
        name: Label used only in that printed message.

    Returns:
        The result of ``detach().cpu().clone()`` on the parameter (or on its
        ``.data`` for the DeepSpeed path).
    """
    if not hasattr(param, "ds_id"):
        # Plain tensors never touch DeepSpeed, so do not require the package
        # to be importable on this path.
        return param.detach().cpu().clone()

    # Imported lazily: only needed for parameters that carry a ``ds_id``.
    from deepspeed import zero
    from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus

    if param.ds_status == ZeroParamStatus.NOT_AVAILABLE and not ignore_status:
        print(name, 'no ignore status')

    # The copy is taken inside the GatheredParameters context.
    with zero.GatheredParameters([param]):
        param = param.data.detach().cpu().clone()
    return param
|
||
|
|
||
|
|
||
|
def get_mm_adapter_state_maybe_zero_3(named_params, keys_to_match):
    """Build a ``{name: CPU tensor}`` dict for parameters matching ``keys_to_match``.

    Args:
        named_params: Iterable of ``(name, parameter)`` pairs.
        keys_to_match: Collection of substrings; a parameter is kept when its
            name contains at least one of them.

    Returns:
        Dict mapping each selected name to the copy produced by
        ``maybe_zero_3(..., ignore_status=True)`` followed by ``.cpu()``.
    """
    def _is_selected(param_name):
        return any(fragment in param_name for fragment in keys_to_match)

    # First pass: filter by name. Second pass: materialise the copies.
    selected = {param_name: tensor for param_name, tensor in named_params if _is_selected(param_name)}

    adapter_state = {}
    for param_name, tensor in selected.items():
        adapter_state[param_name] = maybe_zero_3(tensor, ignore_status=True, name=param_name).cpu()
    return adapter_state
|
||
|
|
||
|
|
||
|
def split_to_even_chunks(indices, lengths, num_chunks):
    """
    Split a list of indices into `num_chunks` chunks of roughly equal total length.

    When ``len(indices)`` is not a multiple of ``num_chunks`` the indices are
    dealt out by striding (``indices[i::num_chunks]``). Otherwise each index is
    appended, in order, to the chunk with the smallest accumulated length
    (``lengths[index]`` is the weight), and a chunk stops receiving indices
    once it holds ``len(indices) // num_chunks`` of them.
    """
    per_chunk, leftover = divmod(len(indices), num_chunks)
    if leftover:
        return [indices[offset::num_chunks] for offset in range(num_chunks)]

    buckets = [[] for _ in range(num_chunks)]
    loads = [0] * num_chunks
    for idx in indices:
        # First bucket with the smallest load wins ties.
        target = min(range(num_chunks), key=loads.__getitem__)
        bucket = buckets[target]
        bucket.append(idx)
        loads[target] += lengths[idx]
        if len(bucket) == per_chunk:
            # A full bucket must never be picked again.
            loads[target] = float("inf")

    return buckets
|
||
|
|
||
|
|
||
|
def get_modality_length_grouped_indices(lengths, batch_size, world_size, generator=None):
    """Return dataset indices grouped by modality and, within a modality, by length.

    The sign of each entry in ``lengths`` selects the group: positive entries
    are handled as multimodal samples, negative entries as language samples
    (their absolute value is used as the length). Each group is ordered with
    ``get_length_grouped_indices`` and cut into megabatches of
    ``world_size * batch_size`` indices. The full megabatches of both groups
    are shuffled together; the trailing megabatch of each group is pooled into
    one extra batch that is placed around the shuffled ones.

    Args:
        lengths: Signed, non-zero per-sample lengths.
        batch_size: Per-process batch size.
        world_size: Number of processes.
        generator: Optional ``torch.Generator`` used for every random
            permutation drawn here; ``None`` uses torch's global RNG.

    Returns:
        Flat list of dataset indices.
    """
    # We need to use torch for the random part as a distributed sampler will set the random seed for torch.
    assert all(l != 0 for l in lengths), "Should not have zero length."

    mm_samples = [(i, l) for i, l in enumerate(lengths) if l > 0]
    lang_samples = [(i, -l) for i, l in enumerate(lengths) if l < 0]
    if not mm_samples or not lang_samples:
        # Only one modality is present (or the input is empty). Unpacking
        # zip(*[]) would raise, so fall back to plain length grouping.
        # abs() keeps the longest-first ordering for all-negative input.
        return get_length_grouped_indices(
            [abs(l) for l in lengths], batch_size, world_size, generator=generator
        )

    mm_indices, mm_lengths = zip(*mm_samples)
    lang_indices, lang_lengths = zip(*lang_samples)

    # Pass the caller's generator through so a seeded generator controls the
    # within-modality shuffles too.
    mm_shuffle = [
        mm_indices[i]
        for i in get_length_grouped_indices(mm_lengths, batch_size, world_size, generator=generator)
    ]
    lang_shuffle = [
        lang_indices[i]
        for i in get_length_grouped_indices(lang_lengths, batch_size, world_size, generator=generator)
    ]
    megabatch_size = world_size * batch_size
    mm_megabatches = [mm_shuffle[i : i + megabatch_size] for i in range(0, len(mm_shuffle), megabatch_size)]
    lang_megabatches = [lang_shuffle[i : i + megabatch_size] for i in range(0, len(lang_shuffle), megabatch_size)]

    # The last megabatch of each modality may be short, so the two are pooled.
    additional_batch = mm_megabatches[-1] + lang_megabatches[-1]
    megabatches = mm_megabatches[:-1] + lang_megabatches[:-1]
    megabatch_order = torch.randperm(len(megabatches), generator=generator).tolist()
    megabatches = [megabatches[i] for i in megabatch_order]

    if len(additional_batch) >= megabatch_size:
        # Enough pooled samples for one full megabatch: it goes first.
        megabatches = [additional_batch[:megabatch_size]] + megabatches
        additional_batch = additional_batch[megabatch_size:]

    if additional_batch:
        # Whatever is left over goes last.
        megabatches.append(additional_batch)

    return [i for megabatch in megabatches for i in megabatch]
|
||
|
|
||
|
|
||
|
def get_length_grouped_indices(lengths, batch_size, world_size, generator=None, merge=True):
    """Return a random ordering of indices in which each megabatch is length-balanced.

    A random permutation of ``range(len(lengths))`` is cut into megabatches of
    ``world_size * batch_size`` indices. Each megabatch is sorted by descending
    length and handed to ``split_to_even_chunks`` with ``world_size`` chunks;
    the chunks are then concatenated in order.

    Args:
        lengths: Per-sample lengths, indexable by dataset index.
        batch_size: Per-process batch size.
        world_size: Number of processes.
        generator: Optional ``torch.Generator`` for the permutation.
        merge: Accepted but not used by this function.

    Returns:
        Flat list of integer indices.
    """
    # We need to use torch for the random part as a distributed sampler will set the random seed for torch.
    permutation = torch.randperm(len(lengths), generator=generator)
    megabatch_size = world_size * batch_size

    ordered = []
    for start in range(0, len(lengths), megabatch_size):
        megabatch = permutation[start : start + megabatch_size].tolist()
        # Longest samples first within the megabatch.
        megabatch.sort(key=lambda idx: lengths[idx], reverse=True)
        for chunk in split_to_even_chunks(megabatch, lengths, world_size):
            ordered.extend(chunk)

    return ordered
|
||
|
|
||
|
|
||
|
class LengthGroupedSampler(Sampler):
    r"""
    Sampler that yields dataset indices so that features of roughly the same length end up together, while
    still keeping some randomness in the ordering.

    With ``group_by_modality`` set, the ordering comes from ``get_modality_length_grouped_indices``;
    otherwise from ``get_length_grouped_indices``.
    """

    def __init__(
        self,
        batch_size: int,
        world_size: int,
        lengths: Optional[List[int]] = None,
        generator=None,
        group_by_modality: bool = False,
    ):
        # ``lengths`` is optional in the signature only; it is required in practice.
        if lengths is None:
            raise ValueError("Lengths must be provided.")

        self.batch_size, self.world_size = batch_size, world_size
        self.lengths = lengths
        self.generator = generator
        self.group_by_modality = group_by_modality

    def __len__(self):
        # One index is produced per entry in ``lengths``.
        return len(self.lengths)

    def __iter__(self):
        # A fresh ordering is computed on every iteration.
        grouping_fn = (
            get_modality_length_grouped_indices if self.group_by_modality else get_length_grouped_indices
        )
        ordering = grouping_fn(self.lengths, self.batch_size, self.world_size, generator=self.generator)
        return iter(ordering)
|
||
|
|
||
|
|
||
|
class LLaVATrainer(Trainer):
    """Trainer with modality-aware length-grouped sampling and adapter-only checkpoints.

    Optional flags are read from ``self.args`` with a ``False`` default:
    ``group_by_modality_length``, ``tune_mm_mlp_adapter`` and
    ``use_im_start_end``.
    """

    def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]:
        """Return the training sampler, or ``None`` when the dataset has no usable length."""
        if self.train_dataset is None or not has_length(self.train_dataset):
            return None

        # Read with getattr, like the other optional flags in this class, so
        # argument objects without the field fall back to the default sampler.
        if getattr(self.args, "group_by_modality_length", False):
            lengths = self.train_dataset.modality_lengths
            return LengthGroupedSampler(
                # TODO: seems that we should not have gradient_accumulation_steps
                # (i.e. not train_batch_size * gradient_accumulation_steps).
                self.args.train_batch_size,
                world_size=self.args.world_size,
                lengths=lengths,
                group_by_modality=True,
            )

        return super()._get_train_sampler()

    def _save_checkpoint(self, model, trial, metrics=None):
        """Save a checkpoint; with ``tune_mm_mlp_adapter`` only the adapter weights are written."""
        if not getattr(self.args, 'tune_mm_mlp_adapter', False):
            super()._save_checkpoint(model, trial, metrics)
            return

        from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
        checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}"

        run_dir = self._get_output_dir(trial=trial)
        output_dir = os.path.join(run_dir, checkpoint_folder)

        # Only save Adapter
        keys_to_match = ['mm_projector', 'vision_resampler']
        if getattr(self.args, "use_im_start_end", False):
            keys_to_match.extend(['embed_tokens', 'embed_in'])

        # Collected before the rank check, so it runs on every process.
        weight_to_save = get_mm_adapter_state_maybe_zero_3(self.model.named_parameters(), keys_to_match)

        # Only local rank 0, or -1, writes to disk.
        if self.args.local_rank == 0 or self.args.local_rank == -1:
            self.model.config.save_pretrained(output_dir)
            torch.save(weight_to_save, os.path.join(output_dir, 'mm_projector.bin'))

    def _save(self, output_dir: Optional[str] = None, state_dict=None):
        """Delegate to the parent ``_save`` unless ``tune_mm_mlp_adapter`` is set."""
        if getattr(self.args, 'tune_mm_mlp_adapter', False):
            # In adapter-tuning mode the weights are written by _save_checkpoint.
            return
        super()._save(output_dir, state_dict)
|