NONE OF VISION MODELS ARE WORKING FOR FINE-TUNES #1505

Open · yukiarimo opened this issue Jan 5, 2025 · 0 comments

@yukiarimo
Hello. I tried fine-tuning Pixtral today! It is not possible to do so!

The previous code, for LLaMA 3.1 8B, worked as expected:

from unsloth import FastLanguageModel
from datasets import load_dataset

# Define the hyperparams
max_seq_length = 16384 # 8192
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "/content/himitsu-tamer-donemodel",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit
)

model = FastLanguageModel.get_peft_model(
    model,
    r = 256, # 128 or 256
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",
                      "embed_tokens", "lm_head"
                      ],
    lora_alpha = 256, # 128 or 256
    lora_dropout = 0.1,
    bias = "all", # "all" or "lora_only"
    use_gradient_checkpointing = "unsloth", # "unsloth" or True; don't use False
    random_state = 42,
    use_rslora = True,
    loftq_config = None, # And LoftQ
)

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Data prep
def formatting_prompts_func(examples):
    texts = examples["text"]
    return {"text": texts}

dataset = load_dataset("json", data_files="/content/drive/MyDrive/datasets/all-prompts-combined-and-himitsu-16k.jsonl")

# This is where the magic happens
from unsloth import UnslothTrainer, UnslothTrainingArguments

trainer = UnslothTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    packing=False,
    args=UnslothTrainingArguments(
        output_dir="outputs",
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        max_steps=300,
        warmup_steps=20,
        warmup_ratio=0.1,
        learning_rate=1e-5,
        embedding_learning_rate=5e-6,  # 2-10x smaller than learning_rate (e.g. 1e-6)
        bf16=True,
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.1,  # 0.01
        lr_scheduler_type="constant",
        seed=42,
        max_grad_norm=1.0,
        group_by_length=True,
        report_to="tensorboard",
        save_steps=10,
        gradient_checkpointing=True,
    ),
)

trainer_stats = trainer.train()

And this is the new code for Pixtral.

Pin these versions first, because otherwise the model does not load (it fails with a LLaVA error):

!pip install "unsloth==2024.12.11"
!pip install "unsloth-zoo==2024.12.6"
from unsloth import FastVisionModel # FastLanguageModel for LLMs
import torch

model, tokenizer = FastVisionModel.from_pretrained(
    "/content/yuna-h-base",
    load_in_4bit = True, # Use 4bit to reduce memory use. False for 16bit LoRA.
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for long context
    max_seq_length = 16384
)

model = FastVisionModel.get_peft_model(
    model,
    # We do NOT finetune the vision layers since Pixtral uses more memory!
    finetune_vision_layers     = False, # False if not finetuning vision layers
    finetune_language_layers   = True,  # False if not finetuning language layers
    finetune_attention_modules = True, # False if not finetuning attention layers
    finetune_mlp_modules       = True,  # False if not finetuning MLP layers

    r = 128,           # The larger, the higher the accuracy, but might overfit
    lora_alpha = 128,  # Recommended alpha == r at least
    lora_dropout = 0.1,
    bias = "none",
    random_state = 3407,
    use_rslora = True,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",
                      "embed_tokens", "lm_head"
                      ],
)

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')


from datasets import load_dataset
from unsloth import is_bf16_supported
from transformers import default_data_collator
from trl import SFTTrainer, SFTConfig

# Data prep - no need to format for "messages" anymore
def formatting_prompts_func(examples):
    return {"text": examples["text"]}

dataset = load_dataset("json", data_files="/content/drive/MyDrive/datasets/pack-1-all.jsonl")

# Tokenization with padding and truncation
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",  # Ensures all sequences are padded to max_seq_length
        truncation=True,       # Ensures sequences longer than max_seq_length are truncated
        max_length=16384,      # Match the max_seq_length in SFTConfig
    )

tokenized_dataset = dataset.map(formatting_prompts_func, batched=True).map(tokenize_function, batched=True)

# Enable model for training
FastVisionModel.for_training(model)

# Use default_data_collator instead of UnslothVisionDataCollator as there are no images
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=default_data_collator,  # Use the default data collator for text-only
    train_dataset=tokenized_dataset["train"],
    args=SFTConfig(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        max_steps=100,
        learning_rate=5e-5,
        fp16=not is_bf16_supported(),
        bf16=is_bf16_supported(),
        logging_steps=1,
        optim="paged_adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        report_to="tensorboard",
        max_seq_length=16384,
    ),
)

trainer_stats = trainer.train()

Error:

Map: 100% 246/246 [00:04<00:00, 50.09 examples/s]
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 246 | Num Epochs = 2
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 4
\        /    Total batch size = 4 | Total steps = 100
 "-____-"     Number of trainable parameters = 545,917,312
🦥 Unsloth needs about 1-3 minutes to load everything - please wait!
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-23-a26efd3a589f> in <cell line: 51>()
     49 )
     50 
---> 51 trainer_stats = trainer.train()

23 frames
/usr/local/lib/python3.10/dist-packages/transformers/models/llava/modeling_llava.py in _merge_input_ids_with_image_features(self, image_features, inputs_embeds, input_ids, attention_mask, labels)
    301 
    302     def _merge_input_ids_with_image_features(self, image_features, inputs_embeds, input_ids, attention_mask, labels):
--> 303         num_images, num_image_patches, embed_dim = image_features.shape
    304         batch_size, sequence_length = input_ids.shape
    305         left_padding = not torch.sum(input_ids[:, -1] == torch.tensor(self.pad_token_id))

AttributeError: 'NoneType' object has no attribute 'shape'
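
As far as I can tell from the traceback, the text-only batch still goes through LLaVA's image-merge step with image_features=None, which is why .shape blows up. Here is a minimal check of what default_data_collator actually hands the model from the tokenized dataset above (the sample/batch names are just for illustration, not from my notebook):

# Minimal check: inspect one collated batch to see which keys the model receives.
from transformers import default_data_collator

sample = [tokenized_dataset["train"][i] for i in range(2)]
batch = default_data_collator(sample)
print(batch.keys())               # expected: input_ids and attention_mask only - no pixel_values, no labels
print(batch["input_ids"].shape)   # expected: torch.Size([2, 16384]) because of padding="max_length"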

Tried the other way:

def formatting_prompts_func(examples):
    return {"text": examples["text"]}

dataset = load_dataset("json", data_files="/content/drive/MyDrive/datasets/pack-1-all.jsonl")
formatted_dataset = dataset.map(formatting_prompts_func)

FastVisionModel.for_training(model) # Enable for training!

# Use default_data_collator instead of UnslothVisionDataCollator
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=default_data_collator,  # Changed to default_data_collator
    train_dataset=formatted_dataset["train"],
    args=SFTConfig(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        max_steps=100,
        learning_rate=5e-5,
        fp16=not is_bf16_supported(),
        bf16=is_bf16_supported(),
        logging_steps=1,
        optim="paged_adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        report_to="tensorboard",
        max_seq_length=16384,
    ),
)

trainer_stats = trainer.train()

Error:

Map:   0% 0/246 [00:02<?, ? examples/s]
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py in convert_to_tensors(self, tensor_type, prepend_batch_axis)
    776                 if not is_tensor(value):
--> 777                     tensor = as_tensor(value)
    778 

18 frames
ValueError: expected sequence of length 15276 at dim 1 (got 15227)

The above exception was the direct cause of the following exception:

ValueError                                Traceback (most recent call last)
/usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py in convert_to_tensors(self, tensor_type, prepend_batch_axis)
    791                         "Please see if a fast version of this tokenizer is available to have this feature available."
    792                     ) from e
--> 793                 raise ValueError(
    794                     "Unable to create tensor, you should probably activate truncation and/or padding with"
    795                     " 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your"

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`input_ids` in this case) have excessive nesting (inputs type `list` where type `int` is expected).
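
This second failure looks like the generic "unequal sequence lengths cannot be stacked without padding" error the message describes (15276 vs 15227 tokens), since the raw text goes in without any padding or truncation. The padding setup the error points at would be a dynamic-padding collator; this is a sketch only, an assumption on my side, and it does not address the image_features problem from the first attempt:

# Sketch only (assumption, not verified against the Pixtral path): pad each batch
# to its longest sequence and build labels, instead of default_data_collator,
# which cannot stack lists of different lengths. Assumes a pad token is set.
from transformers import DataCollatorForLanguageModeling

# FastVisionModel returns a processor; grab its underlying text tokenizer if present.
text_tokenizer = getattr(tokenizer, "tokenizer", tokenizer)
text_collator = DataCollatorForLanguageModeling(tokenizer=text_tokenizer, mlm=False)
# ...then pass data_collator=text_collator to SFTTrainer in place of default_data_collator.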

Please help! Why doesn't it work? Yes, the dataset is text-only, like this:

{"text": "Raw text here"}
{"text": "Raw text here"}
{"text": "Raw text here"}