# SUPIR/llava/train/llava_trainer.py

import os
import torch

from torch.utils.data import Sampler

from transformers import Trainer
from transformers.trainer import (
    has_length,
)
from typing import List, Optional


def maybe_zero_3(param, ignore_status=False, name=None):
    """Return a detached CPU clone of ``param``, gathering it first if DeepSpeed manages it.

    Args:
        param: Tensor or parameter to copy. When it carries a ``ds_id`` attribute
            it is wrapped in ``deepspeed.zero.GatheredParameters`` while being copied.
        ignore_status: When false, a message is printed if the parameter's
            ``ds_status`` equals ``ZeroParamStatus.NOT_AVAILABLE``.
        name: Label that is used only in that printed message.

    Returns:
        A detached clone of the parameter data located on the CPU.
    """
    if not hasattr(param, "ds_id"):
        # Plain tensor: DeepSpeed is not needed, so it is not imported here.
        # This keeps the function usable in environments without deepspeed.
        return param.detach().cpu().clone()

    # Imported lazily: only parameters tagged with ``ds_id`` require DeepSpeed.
    from deepspeed import zero
    from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus

    if param.ds_status == ZeroParamStatus.NOT_AVAILABLE and not ignore_status:
        print(name, 'no ignore status')
    with zero.GatheredParameters([param]):
        # Clone inside the context so the copy is taken while the data is gathered.
        return param.data.detach().cpu().clone()


def get_mm_adapter_state_maybe_zero_3(named_params, keys_to_match):
    """Collect CPU copies of every parameter whose name contains one of ``keys_to_match``.

    Args:
        named_params: Iterable of ``(name, parameter)`` pairs.
        keys_to_match: Substrings; a parameter is kept when any of them occurs in its name.

    Returns:
        Dict mapping each selected name to the value produced by ``maybe_zero_3``
        (called with ``ignore_status=True``) moved to the CPU.
    """
    # First pass: pick the matching parameters without touching their data.
    selected = {}
    for param_name, tensor in named_params:
        if any(pattern in param_name for pattern in keys_to_match):
            selected[param_name] = tensor

    # Second pass: materialise a CPU copy of each selected parameter.
    gathered = {}
    for param_name, tensor in selected.items():
        gathered[param_name] = maybe_zero_3(tensor, ignore_status=True, name=param_name).cpu()
    return gathered


def split_to_even_chunks(indices, lengths, num_chunks):
    """
    Split a list of indices into `num_chunks` chunks of roughly equal total length.

    When `len(indices)` is not a multiple of `num_chunks`, the indices are simply
    dealt out with a stride of `num_chunks`. Otherwise each index is appended, in
    order, to the chunk whose accumulated `lengths` sum is currently smallest,
    and every chunk ends up with exactly `len(indices) // num_chunks` entries.
    """
    if len(indices) % num_chunks:
        return [indices[offset::num_chunks] for offset in range(num_chunks)]

    quota = len(indices) // num_chunks
    buckets = [[] for _ in range(num_chunks)]
    loads = [0] * num_chunks

    for idx in indices:
        # First bucket with the smallest accumulated load wins ties.
        target = min(range(num_chunks), key=loads.__getitem__)
        bucket = buckets[target]
        bucket.append(idx)
        loads[target] += lengths[idx]
        if len(bucket) == quota:
            # A full bucket must never be selected again.
            loads[target] = float("inf")

    return buckets


def get_modality_length_grouped_indices(lengths, batch_size, world_size, generator=None):
    """
    Build a sampling order in which each megabatch holds samples of a single modality.

    The sign of each entry in `lengths` encodes the modality: positive entries are
    treated as multimodal samples and negative entries as language samples, with
    the absolute value used as the length. Zero is rejected.

    Args:
        lengths: Signed per-sample lengths.
        batch_size: Per-rank batch size; a megabatch holds `world_size * batch_size` samples.
        world_size: Number of ranks.
        generator: Optional `torch.Generator` used for every random permutation
            drawn here, including the per-modality shuffles.

    Returns:
        A flat list of dataset indices. An empty `lengths` yields an empty list;
        when only one modality is present the result is plain length grouping
        over the absolute lengths.
    """
    # We need to use torch for the random part as a distributed sampler will set the random seed for torch.
    assert all(l != 0 for l in lengths), "Should not have zero length."
    if len(lengths) == 0:
        return []

    mm_pairs = [(i, l) for i, l in enumerate(lengths) if l > 0]
    lang_pairs = [(i, -l) for i, l in enumerate(lengths) if l < 0]
    if not mm_pairs or not lang_pairs:
        # Only one modality present: unpacking an empty zip() would raise, so
        # fall back to ordinary length grouping over the absolute lengths.
        return get_length_grouped_indices(
            [abs(l) for l in lengths], batch_size, world_size, generator=generator
        )

    mm_indices, mm_lengths = zip(*mm_pairs)
    lang_indices, lang_lengths = zip(*lang_pairs)

    # Forward the caller's generator so a seeded run is reproducible end to end.
    mm_shuffle = [mm_indices[i] for i in get_length_grouped_indices(mm_lengths, batch_size, world_size, generator=generator)]
    lang_shuffle = [lang_indices[i] for i in get_length_grouped_indices(lang_lengths, batch_size, world_size, generator=generator)]
    megabatch_size = world_size * batch_size
    mm_megabatches = [mm_shuffle[i : i + megabatch_size] for i in range(0, len(mm_shuffle), megabatch_size)]
    lang_megabatches = [lang_shuffle[i : i + megabatch_size] for i in range(0, len(lang_shuffle), megabatch_size)]

    # The last megabatch of each modality is pulled out and merged into one
    # mixed-modality remainder; all other megabatches are shuffled together.
    last_mm = mm_megabatches[-1]
    last_lang = lang_megabatches[-1]
    additional_batch = last_mm + last_lang
    megabatches = mm_megabatches[:-1] + lang_megabatches[:-1]
    megabatch_indices = torch.randperm(len(megabatches), generator=generator)
    megabatches = [megabatches[i] for i in megabatch_indices]

    if len(additional_batch) >= megabatch_size:
        # A full megabatch carved from the remainder goes to the front.
        megabatches = [additional_batch[:megabatch_size]] + megabatches
        additional_batch = additional_batch[megabatch_size:]

    if len(additional_batch) > 0:
        # Whatever is left goes to the very end.
        megabatches.append(additional_batch)

    return [i for megabatch in megabatches for i in megabatch]


def get_length_grouped_indices(lengths, batch_size, world_size, generator=None, merge=True):
    """
    Return a random index order in which each megabatch is sorted by length.

    A random permutation of `range(len(lengths))` is cut into megabatches of
    `world_size * batch_size` indices. Each megabatch is sorted by descending
    length, split into `world_size` chunks with `split_to_even_chunks`, and the
    chunks are concatenated in order. `merge` is accepted but not used.
    """
    # We need to use torch for the random part as a distributed sampler will set the random seed for torch.
    total = len(lengths)
    permutation = torch.randperm(total, generator=generator)
    span = world_size * batch_size

    ordered = []
    for start in range(0, total, span):
        window = permutation[start : start + span].tolist()
        window.sort(key=lambda idx: lengths[idx], reverse=True)
        for shard in split_to_even_chunks(window, lengths, world_size):
            ordered.extend(shard)
    return ordered


class LengthGroupedSampler(Sampler):
    r"""
    Sampler yielding dataset indices so that features of roughly the same length
    end up together, while still keeping a bit of randomness.

    With `group_by_modality` set, the order comes from
    `get_modality_length_grouped_indices`; otherwise from
    `get_length_grouped_indices`.
    """

    def __init__(
        self,
        batch_size: int,
        world_size: int,
        lengths: Optional[List[int]] = None,
        generator=None,
        group_by_modality: bool = False,
    ):
        # `lengths` has a default only for signature reasons; it is mandatory.
        if lengths is None:
            raise ValueError("Lengths must be provided.")

        self.lengths = lengths
        self.batch_size = batch_size
        self.world_size = world_size
        self.group_by_modality = group_by_modality
        self.generator = generator

    def __len__(self):
        # One index is produced per entry of `lengths`.
        return len(self.lengths)

    def __iter__(self):
        # Choose the grouping strategy once, then call it with the shared arguments.
        if self.group_by_modality:
            build_order = get_modality_length_grouped_indices
        else:
            build_order = get_length_grouped_indices
        order = build_order(self.lengths, self.batch_size, self.world_size, generator=self.generator)
        return iter(order)


class LLaVATrainer(Trainer):
    """Trainer subclass adding modality-aware sampling and adapter-only checkpoints."""

    def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]:
        """Return the training sampler, or None when the dataset is missing or unsized.

        When `args.group_by_modality_length` is set, a `LengthGroupedSampler` built
        from the dataset's `modality_lengths` is returned; otherwise the parent
        implementation decides.
        """
        if self.train_dataset is None or not has_length(self.train_dataset):
            return None

        if not self.args.group_by_modality_length:
            return super()._get_train_sampler()

        modality_lengths = self.train_dataset.modality_lengths
        # Only the train batch size is passed; it is not multiplied by
        # gradient_accumulation_steps (original TODO: seems that we should not
        # have gradient_accumulation_steps).
        return LengthGroupedSampler(
            self.args.train_batch_size,
            world_size=self.args.world_size,
            lengths=modality_lengths,
            group_by_modality=True,
        )

    def _save_checkpoint(self, model, trial, metrics=None):
        """Save a checkpoint; with `tune_mm_mlp_adapter` set, only adapter weights are written.

        In adapter mode the parameters whose names match the adapter keys are
        collected, and on local rank 0 or -1 the model config and a
        `mm_projector.bin` file are written to the checkpoint folder. Otherwise
        the parent implementation is used.
        """
        if not getattr(self.args, 'tune_mm_mlp_adapter', False):
            super()._save_checkpoint(model, trial, metrics)
            return

        from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR

        folder_name = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}"
        output_dir = os.path.join(self._get_output_dir(trial=trial), folder_name)

        # Only save Adapter
        adapter_keys = ['mm_projector', 'vision_resampler']
        if getattr(self.args, "use_im_start_end", False):
            adapter_keys += ['embed_tokens', 'embed_in']

        # Collected before the rank check, so this runs on every rank.
        adapter_state = get_mm_adapter_state_maybe_zero_3(self.model.named_parameters(), adapter_keys)

        if self.args.local_rank in (0, -1):
            self.model.config.save_pretrained(output_dir)
            torch.save(adapter_state, os.path.join(output_dir, 'mm_projector.bin'))

    def _save(self, output_dir: Optional[str] = None, state_dict=None):
        """Delegate to the parent `_save` unless `tune_mm_mlp_adapter` is set, in which case do nothing."""
        if getattr(self.args, 'tune_mm_mlp_adapter', False):
            return
        super()._save(output_dir, state_dict)
20240125 2024-01-25 15:42:59 +01:00			`import os`
			`import torch`

			`from torch.utils.data import Sampler`

			`from transformers import Trainer`
			`from transformers.trainer import (`
			`has_length,`
			`)`
			`from typing import List, Optional`


			`def maybe_zero_3(param, ignore_status=False, name=None):`
			`from deepspeed import zero`
			`from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus`
			`if hasattr(param, "ds_id"):`
			`if param.ds_status == ZeroParamStatus.NOT_AVAILABLE:`
			`if not ignore_status:`
			`print(name, 'no ignore status')`
			`with zero.GatheredParameters([param]):`
			`param = param.data.detach().cpu().clone()`
			`else:`
			`param = param.detach().cpu().clone()`
			`return param`


			`def get_mm_adapter_state_maybe_zero_3(named_params, keys_to_match):`
			`to_return = {k: t for k, t in named_params if any(key_match in k for key_match in keys_to_match)}`
			`to_return = {k: maybe_zero_3(v, ignore_status=True, name=k).cpu() for k, v in to_return.items()}`
			`return to_return`


			`def split_to_even_chunks(indices, lengths, num_chunks):`
			`"""`
			Split a list of indices into `chunks` chunks of roughly equal lengths.
			`"""`

			`if len(indices) % num_chunks != 0:`
			`return [indices[i::num_chunks] for i in range(num_chunks)]`

			`num_indices_per_chunk = len(indices) // num_chunks`

			`chunks = [[] for _ in range(num_chunks)]`
			`chunks_lengths = [0 for _ in range(num_chunks)]`
			`for index in indices:`
			`shortest_chunk = chunks_lengths.index(min(chunks_lengths))`
			`chunks[shortest_chunk].append(index)`
			`chunks_lengths[shortest_chunk] += lengths[index]`
			`if len(chunks[shortest_chunk]) == num_indices_per_chunk:`
			`chunks_lengths[shortest_chunk] = float("inf")`

			`return chunks`


			`def get_modality_length_grouped_indices(lengths, batch_size, world_size, generator=None):`
			`# We need to use torch for the random part as a distributed sampler will set the random seed for torch.`
			`assert all(l != 0 for l in lengths), "Should not have zero length."`
			`mm_indices, mm_lengths = zip(*[(i, l) for i, l in enumerate(lengths) if l > 0])`
			`lang_indices, lang_lengths = zip(*[(i, -l) for i, l in enumerate(lengths) if l < 0])`

			`assert len(mm_indices) > 0, "Should have at least one multimodal sample."`
			`assert len(lang_indices) > 0, "Should have at least one language sample."`

			`mm_shuffle = [mm_indices[i] for i in get_length_grouped_indices(mm_lengths, batch_size, world_size, generator=None)]`
			`lang_shuffle = [lang_indices[i] for i in get_length_grouped_indices(lang_lengths, batch_size, world_size, generator=None)]`
			`megabatch_size = world_size * batch_size`
			`mm_megabatches = [mm_shuffle[i : i + megabatch_size] for i in range(0, len(mm_shuffle), megabatch_size)]`
			`lang_megabatches = [lang_shuffle[i : i + megabatch_size] for i in range(0, len(lang_shuffle), megabatch_size)]`

			`last_mm = mm_megabatches[-1]`
			`last_lang = lang_megabatches[-1]`
			`additional_batch = last_mm + last_lang`
			`megabatches = mm_megabatches[:-1] + lang_megabatches[:-1]`
			`megabatch_indices = torch.randperm(len(megabatches), generator=generator)`
			`megabatches = [megabatches[i] for i in megabatch_indices]`

			`if len(additional_batch) >= megabatch_size:`
			`megabatches = [additional_batch[:megabatch_size]] + megabatches`
			`additional_batch = additional_batch[megabatch_size:]`

			`if len(additional_batch) > 0:`
			`megabatches.append(additional_batch)`

			`return [i for megabatch in megabatches for i in megabatch]`


			`def get_length_grouped_indices(lengths, batch_size, world_size, generator=None, merge=True):`
			`# We need to use torch for the random part as a distributed sampler will set the random seed for torch.`
			`indices = torch.randperm(len(lengths), generator=generator)`
			`megabatch_size = world_size * batch_size`
			`megabatches = [indices[i : i + megabatch_size].tolist() for i in range(0, len(lengths), megabatch_size)]`
			`megabatches = [sorted(megabatch, key=lambda i: lengths[i], reverse=True) for megabatch in megabatches]`
			`megabatches = [split_to_even_chunks(megabatch, lengths, world_size) for megabatch in megabatches]`

			`return [i for megabatch in megabatches for batch in megabatch for i in batch]`


			`class LengthGroupedSampler(Sampler):`
			`r"""`
			`Sampler that samples indices in a way that groups together features of the dataset of roughly the same length while`
			`keeping a bit of randomness.`
			`"""`

			`def __init__(`
			`self,`
			`batch_size: int,`
			`world_size: int,`
			`lengths: Optional[List[int]] = None,`
			`generator=None,`
			`group_by_modality: bool = False,`
			`):`
			`if lengths is None:`
			`raise ValueError("Lengths must be provided.")`

			`self.batch_size = batch_size`
			`self.world_size = world_size`
			`self.lengths = lengths`
			`self.generator = generator`
			`self.group_by_modality = group_by_modality`

			`def __len__(self):`
			`return len(self.lengths)`

			`def __iter__(self):`
			`if self.group_by_modality:`
			`indices = get_modality_length_grouped_indices(self.lengths, self.batch_size, self.world_size, generator=self.generator)`
			`else:`
			`indices = get_length_grouped_indices(self.lengths, self.batch_size, self.world_size, generator=self.generator)`
			`return iter(indices)`


			`class LLaVATrainer(Trainer):`

			`def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]:`
			`if self.train_dataset is None or not has_length(self.train_dataset):`
			`return None`

			`if self.args.group_by_modality_length:`
			`lengths = self.train_dataset.modality_lengths`
			`return LengthGroupedSampler(`
			`# self.args.train_batch_size * self.args.gradient_accumulation_steps, # TODO: seems that we should not have gradient_accumulation_steps`
			`self.args.train_batch_size,`
			`world_size=self.args.world_size,`
			`lengths=lengths,`
			`group_by_modality=True,`
			`)`
			`else:`
			`return super()._get_train_sampler()`

			`def _save_checkpoint(self, model, trial, metrics=None):`
			`if getattr(self.args, 'tune_mm_mlp_adapter', False):`
			`from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR`
			`checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}"`

			`run_dir = self._get_output_dir(trial=trial)`
			`output_dir = os.path.join(run_dir, checkpoint_folder)`

			`# Only save Adapter`
			`keys_to_match = ['mm_projector', 'vision_resampler']`
			`if getattr(self.args, "use_im_start_end", False):`
			`keys_to_match.extend(['embed_tokens', 'embed_in'])`

			`weight_to_save = get_mm_adapter_state_maybe_zero_3(self.model.named_parameters(), keys_to_match)`

			`if self.args.local_rank == 0 or self.args.local_rank == -1:`
			`self.model.config.save_pretrained(output_dir)`
			`torch.save(weight_to_save, os.path.join(output_dir, f'mm_projector.bin'))`
			`else:`
			`super(LLaVATrainer, self)._save_checkpoint(model, trial, metrics)`

			`def _save(self, output_dir: Optional[str] = None, state_dict=None):`
			`if getattr(self.args, 'tune_mm_mlp_adapter', False):`
			`pass`
			`else:`
			`super(LLaVATrainer, self)._save(output_dir, state_dict)`