Merge remote-tracking branch 'upstream/main' into insop/kld
insop committed Jan 30, 2025
2 parents d5576e2 + be4ff50 commit b5b3f79
Showing 258 changed files with 2,197 additions and 1,270 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/gpu_test.yaml
@@ -46,7 +46,7 @@ jobs:
         run: python -m pip install --upgrade pip
       - name: Install torch nightly
         if: ${{ matrix.torch-version == 'nightly' }}
-        run: python -m pip install --pre torch torchvision torchao --index-url https://download.pytorch.org/whl/nightly/cu121
+        run: python -m pip install --pre torch torchvision torchao --index-url https://download.pytorch.org/whl/nightly/cu126
       - name: Install torch stable
         if: ${{ matrix.torch-version == 'stable' }}
         run: python -m pip install torch torchvision torchao
10 changes: 5 additions & 5 deletions .pre-commit-config.yaml
@@ -5,7 +5,7 @@ default_language_version:
 
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: 6306a48f7dae5861702d573c9c247e4e9498e867
+    rev: v5.0.0
     hooks:
       - id: trailing-whitespace
       - id: check-ast
@@ -18,7 +18,7 @@ repos:
         exclude: '^(.*\.svg)$'
 
   - repo: https://github.com/Lucas-C/pre-commit-hooks
-    rev: v1.5.4
+    rev: v1.5.5
     hooks:
       - id: insert-license
         files: \.py$|\.sh$
@@ -27,7 +27,7 @@ repos:
           - docs/license_header.txt
 
   - repo: https://github.com/pycqa/flake8
-    rev: 34cbf8ef3950f43d09b85e2e45c15ae5717dc37b
+    rev: 7.1.1
     hooks:
       - id: flake8
         additional_dependencies:
@@ -37,15 +37,15 @@ repos:
         args: ['--config=.flake8']
 
   - repo: https://github.com/omnilib/ufmt
-    rev: v2.3.0
+    rev: v2.8.0
     hooks:
       - id: ufmt
         additional_dependencies:
          - black == 22.12.0
          - usort == 1.0.5
 
   - repo: https://github.com/jsh9/pydoclint
-    rev: 94efc5f989adbea30f3534b476b2931a02c1af90
+    rev: 0.5.12
    hooks:
      - id: pydoclint
        args: [--config=pyproject.toml]
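
With the hook revisions now pinned to tagged releases rather than raw commit SHAs, the bumped hooks can be exercised locally before committing. A minimal sketch, assuming ``pre-commit`` is already installed in the environment:

```bash
# Re-run all hooks (at their newly pinned revs) across the whole repo.
pre-commit run --all-files
```
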
2 changes: 1 addition & 1 deletion README.md
@@ -170,7 +170,7 @@ pip install torchtune
 
 ```bash
 # Install PyTorch, torchvision, torchao nightlies
-pip install --pre --upgrade torch torchvision torchao --index-url https://download.pytorch.org/whl/nightly/cu121 # full options are cpu/cu118/cu121/cu124
+pip install --pre --upgrade torch torchvision torchao --index-url https://download.pytorch.org/whl/nightly/cu126 # full options are cpu/cu118/cu121/cu124/cu126
 pip install --pre --upgrade torchtune --extra-index-url https://download.pytorch.org/whl/nightly/cpu
 ```
 
12 changes: 6 additions & 6 deletions docs/source/api_ref_modules.rst
@@ -48,10 +48,10 @@ model specific tokenizers.
    :toctree: generated/
    :nosignatures:
 
-    tokenizers.SentencePieceBaseTokenizer
-    tokenizers.TikTokenBaseTokenizer
-    tokenizers.ModelTokenizer
-    tokenizers.BaseTokenizer
+    transforms.tokenizers.SentencePieceBaseTokenizer
+    transforms.tokenizers.TikTokenBaseTokenizer
+    transforms.tokenizers.ModelTokenizer
+    transforms.tokenizers.BaseTokenizer
 
 Tokenizer Utilities
 -------------------
@@ -61,8 +61,8 @@ These are helper methods that can be used by any tokenizer.
    :toctree: generated/
    :nosignatures:
 
-    tokenizers.tokenize_messages_no_special_tokens
-    tokenizers.parse_hf_tokenizer_json
+    transforms.tokenizers.tokenize_messages_no_special_tokens
+    transforms.tokenizers.parse_hf_tokenizer_json
 
 
 PEFT Components
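
These documentation updates track the relocation of the tokenizer utilities from ``torchtune.modules.tokenizers`` to ``torchtune.modules.transforms.tokenizers``. A minimal sketch of migrating an import to the new namespace, using only names that appear in the diffs in this commit:

```python
# Before this commit:
# from torchtune.modules.tokenizers import ModelTokenizer

# After this commit, the same classes live under transforms.tokenizers:
from torchtune.modules.transforms.tokenizers import (
    BaseTokenizer,
    ModelTokenizer,
    SentencePieceBaseTokenizer,
)
```
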
1 change: 0 additions & 1 deletion docs/source/api_ref_rlhf.rst
@@ -16,4 +16,3 @@ Components and losses for RLHF algorithms like PPO and DPO.
     loss.PPOLoss
     loss.DPOLoss
     loss.RSOLoss
-    loss.SimPOLoss
2 changes: 1 addition & 1 deletion docs/source/basics/custom_components.rst
@@ -117,7 +117,7 @@ our models in torchtune - see :func:`~torchtune.models.llama3_2_vision.llama3_2_
     #
     from torchtune.datasets import SFTDataset, PackedDataset
     from torchtune.data import InputOutputToMessages
-    from torchtune.modules.tokenizers import ModelTokenizer
+    from torchtune.modules.transforms.tokenizers import ModelTokenizer
 
     # Example builder function for a custom code instruct dataset not in torchtune, but using
     # different dataset building blocks from torchtune
1 change: 1 addition & 0 deletions docs/source/basics/message_transforms.rst
@@ -95,6 +95,7 @@ Example message transforms
 --------------------------
 - Instruct
   - :class:`~torchtune.data.InputOutputToMessages`
+  - :class:`~torchtune.data.AlpacaToMessages`
 - Chat
   - :class:`~torchtune.data.ShareGPTToMessages`
   - :class:`~torchtune.data.OpenAIToMessages`
2 changes: 1 addition & 1 deletion docs/source/basics/model_transforms.rst
@@ -101,7 +101,7 @@ The following methods are required on the model transform:
 
 .. code-block:: python
 
-    from torchtune.modules.tokenizers import ModelTokenizer
+    from torchtune.modules.transforms.tokenizers import ModelTokenizer
     from torchtune.modules.transforms import Transform
 
     class MyMultimodalTransform(ModelTokenizer, Transform):
10 changes: 5 additions & 5 deletions docs/source/basics/tokenizers.rst
@@ -168,7 +168,7 @@ For example, here we change the ``"<|begin_of_text|>"`` and ``"<|end_of_text|>"`
 Base tokenizers
 ---------------
 
-:class:`~torchtune.modules.tokenizers.BaseTokenizer` are the underlying byte-pair encoding modules that perform the actual raw string to token ID conversion and back.
+:class:`~torchtune.modules.transforms.tokenizers.BaseTokenizer` are the underlying byte-pair encoding modules that perform the actual raw string to token ID conversion and back.
 In torchtune, they are required to implement ``encode`` and ``decode`` methods, which are called by the :ref:`model_tokenizers` to convert
 between raw text and token IDs.
 
@@ -202,13 +202,13 @@ between raw text and token IDs.
         """
         pass
 
-If you load any :ref:`model_tokenizers`, you can see that it calls its underlying :class:`~torchtune.modules.tokenizers.BaseTokenizer`
+If you load any :ref:`model_tokenizers`, you can see that it calls its underlying :class:`~torchtune.modules.transforms.tokenizers.BaseTokenizer`
 to do the actual encoding and decoding.
 
 .. code-block:: python
 
     from torchtune.models.mistral import mistral_tokenizer
-    from torchtune.modules.tokenizers import SentencePieceBaseTokenizer
+    from torchtune.modules.transforms.tokenizers import SentencePieceBaseTokenizer
 
     m_tokenizer = mistral_tokenizer("/tmp/Mistral-7B-v0.1/tokenizer.model")
     # Mistral uses SentencePiece for its underlying BPE
@@ -227,7 +227,7 @@ to do the actual encoding and decoding.
 Model tokenizers
 ----------------
 
-:class:`~torchtune.modules.tokenizers.ModelTokenizer` are specific to a particular model. They are required to implement the ``tokenize_messages`` method,
+:class:`~torchtune.modules.transforms.tokenizers.ModelTokenizer` are specific to a particular model. They are required to implement the ``tokenize_messages`` method,
 which converts a list of Messages into a list of token IDs.
 
 .. code-block:: python
@@ -259,7 +259,7 @@ is because they add all the necessary special tokens or prompt templates require
 .. code-block:: python
 
     from torchtune.models.mistral import mistral_tokenizer
-    from torchtune.modules.tokenizers import SentencePieceBaseTokenizer
+    from torchtune.modules.transforms.tokenizers import SentencePieceBaseTokenizer
     from torchtune.data import Message
 
     m_tokenizer = mistral_tokenizer("/tmp/Mistral-7B-v0.1/tokenizer.model")
4 changes: 2 additions & 2 deletions docs/source/install.rst
@@ -19,7 +19,7 @@ nightly versions with the following commands:
 
     pip install torch torchvision torchao
 
     # Or nightly install for latest features
-    pip install --pre torch torchvision torchao --index-url https://download.pytorch.org/whl/nightly/cu121 # full options are cpu/cu118/cu121/cu124
+    pip install --pre torch torchvision torchao --index-url https://download.pytorch.org/whl/nightly/cu126 # full options are cpu/cu118/cu121/cu124/cu126
 
 Install via PyPI
@@ -88,4 +88,4 @@ to the package *without* installing via ``git clone``, you can install with the
 If you already have PyTorch installed, torchtune will default to using that version. However, if you want to
 use the nightly version of PyTorch, you can append the ``--force-reinstall`` option to the above command. If you
 opt for this install method, you will likely need to change the "cpu" suffix in the index url to match your CUDA
-version. For example, if you are running CUDA 12, your index url would be "https://download.pytorch.org/whl/nightly/cu121".
+version. For example, if you are running CUDA 12, your index url would be "https://download.pytorch.org/whl/nightly/cu126".
2 changes: 0 additions & 2 deletions docs/source/recipes/dpo.rst
@@ -56,8 +56,6 @@ To use any of these, simply use the ``loss`` config entry or flag through the :r
     loss=torchtune.modules.loss.RSOLoss \
     gamma=0.5
 
-.. todo (@SalmanMohammadi) point to an example repo for SimPO
-
 For a deeper understanding of the different levers you can pull when using this recipe,
 see our documentation for the different PEFT training paradigms we support:
14 changes: 8 additions & 6 deletions docs/source/tutorials/e2e_flow.rst
@@ -275,18 +275,20 @@ Let's first copy over the config to our local working directory so we can make c
 
     $ tune cp generation ./custom_generation_config.yaml
     Copied file to custom_generation_config.yaml
+    $ mkdir /tmp/torchtune/llama3_2_3B/lora_single_device/out
 
 Let's modify ``custom_generation_config.yaml`` to include the following changes. Again, you only need
 to replace two fields: ``output_dir`` and ``checkpoint_files``
 
 .. code-block:: yaml
 
-    output_dir: /tmp/torchtune/llama3_2_3B/lora_single_device/epoch_0
+    checkpoint_dir: /tmp/torchtune/llama3_2_3B/lora_single_device/epoch_0
+    output_dir: /tmp/torchtune/llama3_2_3B/lora_single_device/out
 
     # Tokenizer
     tokenizer:
       _component_: torchtune.models.llama3.llama3_tokenizer
-      path: ${output_dir}/original/tokenizer.model
+      path: ${checkpoint_dir}/original/tokenizer.model
       prompt_template: null
 
     model:
@@ -295,7 +297,7 @@ Let's modify ``custom_generation_config.yaml`` to include the following changes.
 
     checkpointer:
       _component_: torchtune.training.FullModelHFCheckpointer
-      checkpoint_dir: ${output_dir}
+      checkpoint_dir: ${checkpoint_dir}
       checkpoint_files: [
         ft-model-00001-of-00002.safetensors,
         ft-model-00002-of-00002.safetensors,
@@ -312,8 +314,8 @@ Let's modify ``custom_generation_config.yaml`` to include the following changes.
 
     # Generation arguments; defaults taken from gpt-fast
     prompt:
-    system: null
-    user: "Tell me a joke. "
+      system: null
+      user: "Tell me a joke. "
     max_new_tokens: 300
     temperature: 0.6 # 0.8 and 0.6 are popular values to try
     top_k: 300
@@ -330,7 +332,7 @@ these parameters.
 
 .. code-block:: text
 
-    $ tune run generate --config ./custom_generation_config.yaml prompt="tell me a joke. "
+    $ tune run generate --config ./custom_generation_config.yaml prompt.user="Tell me a joke. "
 
     Tell me a joke. Here's a joke for you:
 
     What do you call a fake noodle?
2 changes: 1 addition & 1 deletion docs/source/tutorials/llama3.rst
@@ -230,7 +230,7 @@ Running generation with our LoRA-finetuned model, we see the following output:
 
 .. code-block:: bash
 
     tune run generate --config ./custom_generation_config.yaml \
-    prompt="Hello, my name is"
+    prompt.user="Hello, my name is"
 
 [generate.py:122] Hello, my name is Sarah and I am a busy working mum of two young children, living in the North East of England.
 ...
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -87,7 +87,7 @@ target-version = ["py38"]
 
 [tool.pydoclint]
 style = 'google'
 check-return-types = 'False'
-exclude = 'tests/torchtune/models/(\w+)/scripts/'
+exclude = 'tests/torchtune/models/(\w+)/scripts/|recipes/|torchtune/modules/_export'
 
 [tool.pytest.ini_options]
 addopts = ["--showlocals", "--import-mode=prepend", "--without-integration", "--without-slow-integration"]
1 change: 1 addition & 0 deletions recipes/configs/code_llama2/7B_full_low_memory.yaml
@@ -64,6 +64,7 @@ optimizer:
 optimizer_in_bwd: True # True saves memory. Requires gradient_accumulation_steps=1
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env
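
This and the following recipe configs gain an explicit ``clip_grad_norm: null`` entry, which makes the disabled-by-default gradient clipping visible. A sketch of enabling it, assuming the recipe forwards this value as the max norm to ``torch.nn.utils.clip_grad_norm_`` as is conventional in torchtune recipes:

```yaml
# Hypothetical override: clip gradients to a max L2 norm of 1.0.
# The default added in this commit, null, leaves clipping disabled.
clip_grad_norm: 1.0
```
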
1 change: 1 addition & 0 deletions recipes/configs/code_llama2/7B_lora_single_device.yaml
@@ -72,6 +72,7 @@ lr_scheduler:
   num_warmup_steps: 100
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env
1 change: 1 addition & 0 deletions recipes/configs/code_llama2/7B_qlora_single_device.yaml
@@ -71,6 +71,7 @@ lr_scheduler:
   num_warmup_steps: 100
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env
2 changes: 2 additions & 0 deletions recipes/configs/code_llama2/evaluation.yaml
@@ -3,6 +3,8 @@
 # To launch, run the following command:
 # tune run eleuther_eval --config code_llama2/evaluation
 
+output_dir: ./ # Not needed
+
 # Model arguments
 model:
   _component_: torchtune.models.code_llama2.code_llama2_7b
1 change: 1 addition & 0 deletions recipes/configs/gemma/2B_full.yaml
@@ -57,6 +57,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1
 
1 change: 1 addition & 0 deletions recipes/configs/gemma/2B_lora.yaml
@@ -69,6 +69,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env
1 change: 1 addition & 0 deletions recipes/configs/gemma/2B_lora_single_device.yaml
@@ -68,6 +68,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env
1 change: 1 addition & 0 deletions recipes/configs/gemma/2B_qlora_single_device.yaml
@@ -68,6 +68,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env
1 change: 1 addition & 0 deletions recipes/configs/gemma/7B_full.yaml
@@ -59,6 +59,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1
 
1 change: 1 addition & 0 deletions recipes/configs/gemma/7B_lora.yaml
@@ -71,6 +71,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env
1 change: 1 addition & 0 deletions recipes/configs/gemma/7B_lora_single_device.yaml
@@ -70,6 +70,7 @@ batch_size: 8
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env
1 change: 1 addition & 0 deletions recipes/configs/gemma/7B_qlora_single_device.yaml
@@ -70,6 +70,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env
1 change: 1 addition & 0 deletions recipes/configs/gemma2/27B_full.yaml
@@ -56,6 +56,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1
 
1 change: 1 addition & 0 deletions recipes/configs/gemma2/27B_lora.yaml
@@ -68,6 +68,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env
1 change: 1 addition & 0 deletions recipes/configs/gemma2/27B_lora_single_device.yaml
@@ -67,6 +67,7 @@ batch_size: 2
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env
1 change: 1 addition & 0 deletions recipes/configs/gemma2/27B_qlora_single_device.yaml
@@ -67,6 +67,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env
1 change: 1 addition & 0 deletions recipes/configs/gemma2/2B_full.yaml
@@ -58,6 +58,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1
 