Description
--- VLM Config ---
VLMConfig(vit_hidden_dim=768, vit_inter_dim=3072, vit_patch_size=16, vit_img_size=512, vit_n_heads=12, vit_dropout=0.0, vit_n_blocks=12, vit_ln_eps=1e-06, vit_cls_flag=False, vit_model_type='modelhub/google/siglip2-base-patch16-512', lm_hidden_dim=960, lm_inter_dim=2560, lm_rms_eps=1e-05, lm_re_base=100000, lm_max_position_embeddings=8192, lm_base_vocab_size=49152, extra_token_amount=17, lm_vocab_size=49169, lm_n_heads=15, lm_n_kv_heads=5, lm_dropout=0.0, lm_n_blocks=32, lm_attn_scaling=1.0, lm_max_length=1024, lm_use_tokens=False, lm_tie_weights=True, lm_model_type='modelhub/HuggingFaceTB/SmolLM2-360M-Instruct', lm_tokenizer='modelhub/HuggingFaceTB/cosmo2-tokenizer', lm_chat_template="{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", mp_pixel_shuffle_factor=4, mp_image_token_length=64, max_img_size=1024, vlm_extra_tokens={'image_token': '<|image|>', 'r1c1': '<row_1_col_1>', 'r1c2': '<row_1_col_2>', 'r1c3': '<row_1_col_3>', 'r1c4': '<row_1_col_4>', 'r2c1': '<row_2_col_1>', 'r2c2': '<row_2_col_2>', 'r2c3': '<row_2_col_3>', 'r2c4': '<row_2_col_4>', 'r3c1': '<row_3_col_1>', 'r3c2': '<row_3_col_2>', 'r3c3': '<row_3_col_3>', 'r3c4': '<row_3_col_4>', 'r4c1': '<row_4_col_1>', 'r4c2': '<row_4_col_2>', 'r4c3': '<row_4_col_3>', 'r4c4': '<row_4_col_4>'}, vlm_load_backbone_weights=True, vlm_checkpoint_path='checkpoints', hf_repo_name='nanoVLM')
--- Train Config ---
TrainConfig(lr_mp=0.00512, lr_backbones=5e-05, data_cutoff_idx=None, val_ratio=0.025, batch_size=8, gradient_accumulation_steps=8, max_grad_norm=1.0, eval_in_epochs=True, eval_interval=800, stats_log_interval=200, max_training_steps=5000, max_images_per_example=4, max_images_per_knapsack=18, max_sample_length=1024, compile=False, resume_from_vlm_checkpoint=False, train_dataset_path='datasets/HuggingFaceM4/the_cauldron', train_dataset_name=('all',), wandb_entity='HuggingFace', log_wandb=False, use_lmms_eval=True, lmms_eval_tasks='mmstar,mmmu,ocrbench,textvqa', lmms_eval_limit=2000, lmms_eval_batch_size=128)
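For reference, the run above should be reproducible by overriding only the non-default fields. The sketch below assumes nanoVLM's dataclass configs live in `models/config.py`; the `modelhub/...` paths are this environment's local mirrors, not canonical Hub IDs, and every field name and value is taken from the dump above.

```python
# Minimal sketch, assuming VLMConfig/TrainConfig are the dataclasses in
# models/config.py; field names and values come from the logged config dump.
from models.config import VLMConfig, TrainConfig

vlm_cfg = VLMConfig(
    vit_model_type="modelhub/google/siglip2-base-patch16-512",
    lm_model_type="modelhub/HuggingFaceTB/SmolLM2-360M-Instruct",
    lm_tokenizer="modelhub/HuggingFaceTB/cosmo2-tokenizer",
    vlm_load_backbone_weights=True,
)
train_cfg = TrainConfig(
    batch_size=8,
    gradient_accumulation_steps=8,   # effective batch = 8 * 8 = 64
    max_training_steps=5000,
    train_dataset_path="datasets/HuggingFaceM4/the_cauldron",
    log_wandb=False,
)
```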
tokenizer_init_kwargs: {'use_fast': True, 'extra_special_tokens': {'image_token': '<|image|>', 'r1c1': '<row_1_col_1>', 'r1c2': '<row_1_col_2>', 'r1c3': '<row_1_col_3>', 'r1c4': '<row_1_col_4>', 'r2c1': '<row_2_col_1>', 'r2c2': '<row_2_col_2>', 'r2c3': '<row_2_col_3>', 'r2c4': '<row_2_col_4>', 'r3c1': '<row_3_col_1>', 'r3c2': '<row_3_col_2>', 'r3c3': '<row_3_col_3>', 'r3c4': '<row_3_col_4>', 'r4c1': '<row_4_col_1>', 'r4c2': '<row_4_col_2>', 'r4c3': '<row_4_col_3>', 'r4c4': '<row_4_col_4>'}, 'chat_template': "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"}
lm_tokenizer: modelhub/HuggingFaceTB/cosmo2-tokenizer
Loading from backbone weights
Successfully loaded modelhub/google/siglip2-base-patch16-512 weights from safetensors. Model has 86,433,024 parameters.
Extending token embeddings from torch.Size([49152, 960]) to torch.Size([49169, 960])
Initialized 17 new token embeddings
Successfully loaded modelhub/HuggingFaceTB/SmolLM2-360M-Instruct weights from safetensors. Model has 361,837,440 parameters.
nanoVLM initialized with 460,066,944 parameters
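The logged parameter counts are internally consistent if the modality projector is a single bias-free linear layer mapping the pixel-shuffled ViT features (vit_hidden_dim * mp_pixel_shuffle_factor**2 = 12288) to lm_hidden_dim (960); that layout is an assumption used only for the arithmetic check below.

```python
# Hedged consistency check of the logged parameter counts (arithmetic only).
vit_params = 86_433_024                    # logged for siglip2-base-patch16-512
lm_params = 361_837_440                    # logged for SmolLM2-360M after the vocab extension
mp_params = 768 * 4**2 * 960               # 11,796,480, assuming a bias-free Linear projector
assert vit_params + lm_params + mp_params == 460_066_944   # logged nanoVLM total

new_embed_rows = (49_169 - 49_152) * 960   # 16,320 params added by the 17 new token embeddings
```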
Training summary: 607 samples, 75 batches/epoch, batch size 64
Validation summary: 60 samples, 7 batches/epoch, batch size 64
Using device: cuda
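One hedged reading of the summary lines against the train config: "batch size 64" appears to be the effective batch (per-step batch 8 times 8 accumulation steps), while "batches/epoch" seems to count micro-batches of 8.

```python
# Hedged reading of the training/validation summary, assuming micro-batches of
# batch_size=8 and an effective batch of batch_size * gradient_accumulation_steps.
batch_size, grad_accum = 8, 8
effective_batch = batch_size * grad_accum   # 64 -> "batch size 64"
train_batches = 607 // batch_size           # 75 -> "75 batches/epoch"
val_batches = 60 // batch_size              # 7  -> "7 batches/epoch"
```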
tokenizer: GPT2TokenizerFast(name_or_path='modelhub/HuggingFaceTB/cosmo2-tokenizer', vocab_size=49152, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|endoftext|>', '<|im_start|>', '<|im_end|>', '<repo_name>', '<reponame>', '<file_sep>', '<filename>', '<gh_stars>', '<issue_start>', '<issue_comment>', '<issue_closed>', '<jupyter_start>', '<jupyter_text>', '<jupyter_code>', '<jupyter_output>', '<jupyter_script>', '<empty_output>']}, clean_up_tokenization_spaces=False), added_tokens_decoder={
0: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
1: AddedToken("<|im_start|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
2: AddedToken("<|im_end|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
3: AddedToken("<repo_name>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
4: AddedToken("<reponame>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
5: AddedToken("<file_sep>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
6: AddedToken("<filename>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
7: AddedToken("<gh_stars>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
8: AddedToken("<issue_start>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
9: AddedToken("<issue_comment>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
10: AddedToken("<issue_closed>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
11: AddedToken("<jupyter_start>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
12: AddedToken("<jupyter_text>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
13: AddedToken("<jupyter_code>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
14: AddedToken("<jupyter_output>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
15: AddedToken("<jupyter_script>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
16: AddedToken("<empty_output>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
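For completeness, the tokenizer above can be reloaded with the same 17 extra tokens. The sketch mirrors the logged tokenizer_init_kwargs and assumes a transformers version that accepts the `extra_special_tokens` kwarg at load time.

```python
# Hedged sketch: reload the cosmo2 tokenizer with the extra tokens from the
# logged tokenizer_init_kwargs (assumes transformers supports the
# `extra_special_tokens` kwarg; otherwise register them via add_special_tokens).
from transformers import AutoTokenizer

extra_tokens = {
    "image_token": "<|image|>",
    **{f"r{r}c{c}": f"<row_{r}_col_{c}>" for r in range(1, 5) for c in range(1, 5)},
}

tokenizer = AutoTokenizer.from_pretrained(
    "modelhub/HuggingFaceTB/cosmo2-tokenizer",  # local mirror path from the log
    use_fast=True,
    extra_special_tokens=extra_tokens,
)
print(len(tokenizer))  # expected 49169 = 49152 base vocab + 17 new tokens
```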