Text generation #647

Open
wants to merge 41 commits into main

Changes from all commits (41 commits)
4df0bba
Copied all my changes to this repository
Jan 9, 2025
91a7054
fixed a bug
Jan 10, 2025
f10f24d
Format changes
Jan 13, 2025
eff5a2f
Format changes
Jan 13, 2025
174bca2
Format changes
Jan 13, 2025
6147697
Format changes
Jan 13, 2025
6dd260b
Format changes
Jan 13, 2025
f37bcfd
Format changes
Jan 13, 2025
b57eb67
Format changes
Jan 13, 2025
c64710a
Format changes
Jan 13, 2025
4cb4d34
Format changes
Jan 13, 2025
dd66329
Format changes
Jan 13, 2025
51222e6
Format changes
Jan 14, 2025
915e67f
Format changes
Jan 14, 2025
f91ae61
Format changes
Jan 14, 2025
b5c1a6d
Format changes
Jan 14, 2025
e8ca218
Format changes
Jan 14, 2025
082abc2
Format changes
Jan 14, 2025
80e0a6d
changed the code according to feedback
Jan 21, 2025
7c65830
changed the code according to feedback
Jan 21, 2025
61ef6b8
Format changes
sjohn4 Jan 21, 2025
a3eb57a
Format changes
sjohn4 Jan 21, 2025
1b1d9e6
Format changes
sjohn4 Jan 21, 2025
225f21c
Format changes
sjohn4 Jan 22, 2025
e9b3093
lets keep the docformatter happy
sjohn4 Jan 22, 2025
923f627
lets keep the docformatter happy
sjohn4 Jan 22, 2025
20c33bf
lets keep the docformatter happy
sjohn4 Jan 22, 2025
8a64e64
Fixed some errors and added some tests
sjohn4 Jan 22, 2025
266b3a8
Fixed some errors and added some tests
sjohn4 Jan 22, 2025
91e92f9
Fixed some errors and added some tests
sjohn4 Jan 22, 2025
900504a
Fixed some errors and added some tests
sjohn4 Jan 22, 2025
789ac93
Fixed some errors and added some tests
sjohn4 Jan 22, 2025
be606b4
Fixed some errors and added some tests
sjohn4 Jan 22, 2025
ed95f67
Fixed some errors and added some tests
sjohn4 Jan 22, 2025
63e9cb5
Fixed some errors and added some tests
sjohn4 Jan 22, 2025
e296816
Fixed some errors and added some tests
sjohn4 Jan 22, 2025
18f4896
Fixed format
sjohn4 Jan 22, 2025
401cf82
Fixed format
sjohn4 Jan 22, 2025
2f5bcaf
I think everything should truly work now
sjohn4 Jan 22, 2025
9e3a9e8
I think everything should truly work now
sjohn4 Jan 22, 2025
ed4dcb4
This is not gonna pass any test but I want my changes today in the ot…
sjohn4 Jan 27, 2025
1 change: 1 addition & 0 deletions modyn/common/grpc/grpc_helpers.py
@@ -251,6 +251,7 @@ def prepare_start_training_request(
enable_accurate_gpu_measurements=training_config.enable_accurate_gpu_measurements,
record_loss_every=training_config.record_loss_every,
drop_last_batch=training_config.drop_last_batch,
generative=training_config.generative,
)

def start_training(
49 changes: 45 additions & 4 deletions modyn/config/examples/modyn_config.yaml
@@ -22,10 +22,11 @@ storage:
filesystem_wrapper_type: "LocalFilesystemWrapper",
file_wrapper_type: "SingleSampleFileWrapper",
file_wrapper_config:
{ file_extension: ".png", label_file_extension: ".label" },
{ file_extension: ".png", label_file_extension: ".label",has_labels: true },
ignore_last_timestamp: false,
file_watcher_interval: 5,
selector_batch_size: 128,

},
# ----------------------------------- CRITEO ----------------------------------- #
{
@@ -41,10 +42,12 @@
record_size: 160,
label_size: 4,
file_extension: ".bin",
has_labels: true,
},
ignore_last_timestamp: false,
file_watcher_interval: 5,
selector_batch_size: 2000000,

},
# ---------------------------------- YEARBOOK ---------------------------------- #
{
@@ -60,10 +63,12 @@
record_size: 12292,
label_size: 4,
file_extension: ".bin",
has_labels: true,
},
ignore_last_timestamp: false,
file_watcher_interval: 5,
selector_batch_size: 256,

},
{
name: "yearbook_train",
@@ -78,6 +83,7 @@
record_size: 12292,
label_size: 4,
file_extension: ".bin",
has_labels: true,
},
ignore_last_timestamp: false,
file_watcher_interval: 5,
@@ -96,6 +102,8 @@
record_size: 12292,
label_size: 4,
file_extension: ".bin",
has_labels: true,

},
ignore_last_timestamp: false,
file_watcher_interval: 5,
@@ -110,7 +118,7 @@
filesystem_wrapper_type: "LocalFilesystemWrapper",
file_wrapper_type: "SingleSampleFileWrapper",
file_wrapper_config:
{ file_extension: ".png", label_file_extension: ".label" },
{ file_extension: ".png", label_file_extension: ".label",has_labels: true, },
ignore_last_timestamp: false,
file_watcher_interval: 5,
selector_batch_size: 1024,
@@ -127,6 +135,7 @@
file_extension: ".csv",
separator: "\t", #tsv best option here since headlines contain commas and semicolons
label_index: 1,
has_labels: true,
},
ignore_last_timestamp: false,
file_watcher_interval: 5,
@@ -144,6 +153,7 @@
file_extension: ".csv",
separator: "\t", #tsv best option here since headlines contain commas and semicolons
label_index: 1,
has_labels: true,
},
ignore_last_timestamp: false,
file_watcher_interval: 5,
@@ -160,10 +170,12 @@
file_extension: ".csv",
separator: "\t", #tsv best option here since headlines contain commas and semicolons
label_index: 1,
has_labels: true,
},
ignore_last_timestamp: false,
file_watcher_interval: 5,
selector_batch_size: 4096,

},
# ------------------------------------ ARXIV ----------------------------------- #
{
@@ -177,10 +189,12 @@
file_extension: ".csv",
separator: "\t", #tsv best option here since sentences contain commas and semicolons
label_index: 1,
has_labels: true
},
ignore_last_timestamp: false,
file_watcher_interval: 5,
selector_batch_size: 4096,

},
{
name: "arxiv_test",
@@ -193,10 +207,12 @@
file_extension: ".csv",
separator: "\t", #tsv best option here since sentences contain commas and semicolons
label_index: 1,
has_labels: true,
},
ignore_last_timestamp: false,
file_watcher_interval: 5,
selector_batch_size: 4096,

},
# -------------------------------- ARXIV KAGGLE -------------------------------- #
{
@@ -210,10 +226,12 @@
file_extension: ".csv",
separator: "\t", #tsv best option here since sentences contain commas and semicolons
label_index: 1,
has_labels: true,
},
ignore_last_timestamp: false,
file_watcher_interval: 5,
selector_batch_size: 4096,

},
{
name: "arxiv_kaggle_test",
@@ -226,10 +244,12 @@
file_extension: ".csv",
separator: "\t", #tsv best option here since sentences contain commas and semicolons
label_index: 1,
has_labels: true,
},
ignore_last_timestamp: false,
file_watcher_interval: 5,
selector_batch_size: 4096,

},
# ------------------------------------ CLOC ------------------------------------ #
{
@@ -240,10 +260,31 @@
filesystem_wrapper_type: "LocalFilesystemWrapper",
file_wrapper_type: "SingleSampleFileWrapper",
file_wrapper_config:
{ file_extension: ".jpg", label_file_extension: ".label" },
{ file_extension: ".jpg", label_file_extension: ".label",
has_labels: true,
},
ignore_last_timestamp: false,
file_watcher_interval: 999999999,
selector_batch_size: 100000,

},
# ------------------------------------ Wikipedia ------------------------------------ #
{
name: "Wikipedia",
description: "Wikipedia text dump from 2021",
version: "0.0.1",
base_path: "/datasets/readablewiki",
filesystem_wrapper_type: "LocalFilesystemWrapper",
file_wrapper_type: "CsvFileWrapper",
file_wrapper_config: {
file_extension: ".csv",
separator: "\t", #tsv best option here since sentences contain commas and semicolons
has_labels: false,
},
ignore_last_timestamp: false,
file_watcher_interval: 5,
selector_batch_size: 4096,

},
]
database:
@@ -278,7 +319,7 @@ selector:
local_storage_directory: "/tmp/local_storage"
local_storage_max_samples_in_file: 1000000
cleanup_storage_directories_after_shutdown: true
ignore_existing_trigger_samples: false
ignore_existing_trigger_samples: true

trainer_server:
hostname: "trainer_server"
9 changes: 8 additions & 1 deletion modyn/config/schema/pipeline/training/config.py
@@ -7,7 +7,7 @@

from modyn.config.schema.base_model import ModynBaseModel

OptimizerSource = Literal["PyTorch", "APEX"]
OptimizerSource = Literal["PyTorch", "APEX", "HuggingFace"]


class OptimizerParamGroup(ModynBaseModel):
@@ -119,6 +119,13 @@ class TrainingConfig(ModynBaseModel):
"we start with random weights. If initial_model is 'pretrained', cannot be False."
)
)
generative: bool = Field(
False,
description=(
"If True then, then the training pipeline goes into the generative branch, data is sampled without expecting labels."
),
)

seed: int | None = Field(
None,
description=(
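Note that generative defaults to False, so existing labeled pipelines are unaffected. As a rough illustration (not part of this diff), a generative pipeline's training section only needs to flip the flag; the dict below stands in for the full TrainingConfig, which has further required fields, and uses only field names that already appear in this PR:

# Illustrative excerpt of a pipeline's training section; a real pipeline must also
# set the remaining required TrainingConfig fields.
training_section = {
    "generative": True,        # take the generative branch: samples are fetched without labels
    "drop_last_batch": True,   # existing field, forwarded in grpc_helpers.py above
    "record_loss_every": 10,   # existing field, forwarded in grpc_helpers.py above
}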
6 changes: 5 additions & 1 deletion modyn/config/schema/system/config.py
@@ -58,9 +58,10 @@ class DatasetCsvFileWrapperConfig(_DatasetBaseFileWrapperConfig):
quoted_linebreaks: bool = Field(True, description="Whether linebreaks are quoted in CSV files.")

label_index: int = Field(
-1,
description=(
"Column index of the label. For columns 'width, 'height, 'age', 'label' you should set label_index to 3."
)
),
)
ignore_first_line: bool = Field(
False, description="If the first line is the table header, you can skip it setting this parameter to True."
@@ -73,6 +74,7 @@ class DatasetCsvFileWrapperConfig(_DatasetBaseFileWrapperConfig):
"rows are the same size and that the 'label' column exists."
),
)
has_labels: bool = Field(True, description=("Describes whether the dataset contains a label field or not"))


class DatasetBinaryFileWrapperConfig(_DatasetBaseFileWrapperConfig):
@@ -83,12 +85,14 @@
)
record_size: int = Field(description="The size of each full record in bytes (label + features).")
label_size: int = Field(description="The size of the label field in bytes for a binary file wrapper.")
has_labels: bool = Field(True, description=("Describes whether the dataset contains a label field or not"))


class DatasetPngFileWrapperConfig(_DatasetBaseFileWrapperConfig):
"""Represents a png dataset file used by modyn."""

label_file_extension: str = Field(description="The label file extension of the dataset", pattern=r"^\..*$")
has_labels: bool = Field(True, description=("Describes whether the dataset contains a label field or not"))


DatasetFileWrapperConfig = Union[ # noqa: UP007
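Since has_labels defaults to True in all three wrapper configs, existing datasets keep their behaviour; only unlabeled (generative) datasets need to opt out. For illustration, the wrapper config of the unlabeled Wikipedia dataset added to modyn_config.yaml maps to the following plain dict (a sketch, not code from this PR):

# Mirrors the Wikipedia entry above: a TSV text dump with no label column.
wikipedia_file_wrapper_config = {
    "file_extension": ".csv",
    "separator": "\t",
    "has_labels": False,  # no label column; label_index stays at its new default of -1
}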
1 change: 1 addition & 0 deletions modyn/models/__init__.py
@@ -6,6 +6,7 @@
from .dlrm.dlrm import DLRM # noqa: F401
from .dummy.dummy import Dummy # noqa: F401
from .fmownet.fmownet import FmowNet # noqa: F401
from .gpt2.gpt2 import Gpt2 # noqa: F401
from .resnet18.resnet18 import ResNet18 # noqa: F401
from .resnet50.resnet50 import ResNet50 # noqa: F401
from .resnet152.resnet152 import ResNet152 # noqa: F401
5 changes: 5 additions & 0 deletions modyn/models/gpt2/_init_.py
@@ -0,0 +1,5 @@
import os

files = os.listdir(os.path.dirname(__file__))
files.remove("__init__.py")
__all__ = [f[:-3] for f in files if f.endswith(".py")]
56 changes: 56 additions & 0 deletions modyn/models/gpt2/gpt2.py
@@ -0,0 +1,56 @@
from typing import Any

import torch
from torch import nn
from transformers import GPT2LMHeadModel

from modyn.models.coreset_methods_support import CoresetSupportingModule


class Gpt2:
# pylint: disable-next=unused-argument
def __init__(self, hparams: Any, device: str, amp: bool) -> None:
self.model = Gpt2Modyn(hparams)
self.model.to(device)


"""
Adapted from an example implementation of a GPT-2 model.
This implementation uses the GPT-2 tokenizer from Hugging Face's Transformers library:
https://huggingface.co/docs/transformers/model_doc/gpt2
"""


class Gpt2Modyn(CoresetSupportingModule):
def __init__(self, hparams: Any) -> None:
super().__init__()

self.model = GPT2LMHeadModel.from_pretrained("gpt2-large") # hparams.model_name_or_path

def forward(self, data: torch.Tensor, labels: torch.Tensor = None) -> torch.Tensor:
"""Forward method for text generation or language modeling tasks.

Args:
- data (torch.Tensor): Tensor of shape (batch_size, seq_len, 2), where
the last dimension contains token IDs and attention masks.
- labels (torch.Tensor, optional): Tensor of labels for language modeling tasks.

Returns:
- output: The output logits or loss from the GPT-2 model.
"""
# Split input into token IDs and attention masks
input_ids = data[:, :, 0]
attention_mask = data[:, :, 1]
# Forward pass through GPT-2

output = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

return output.logits

def get_last_layer(self) -> nn.Module:
"""Retrieve the last layer (lm_head) of the model.

Returns:
The final linear layer of the GPT-2 model.
"""
return self.model.lm_head
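For orientation, a short usage sketch (not part of the PR) of the forward contract: the model consumes the stacked (input_ids, attention_mask) tensor produced by the tokenizer transforms and returns per-token logits. Constructing the model downloads the gpt2-large checkpoint, so this is illustrative only:

import torch

model = Gpt2Modyn(hparams=None)                          # hparams is currently unused

input_ids = torch.randint(0, 50257, (2, 16))             # 2 samples, 16 tokens, GPT-2 vocab size
attention_mask = torch.ones_like(input_ids)              # no padding in this toy batch
data = torch.stack((input_ids, attention_mask), dim=2)   # shape (2, 16, 2), as forward() expects

logits = model(data)                                     # shape (2, 16, vocab_size)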
4 changes: 3 additions & 1 deletion modyn/models/tokenizers/__init__.py
@@ -1,8 +1,10 @@
"""Bert Tokenizer for NLP tasks."""
"""Tokenizer for NLP tasks."""

import os

from .distill_bert_tokenizer import DistilBertTokenizerTransform # noqa: F401
from .gpt2_tokenizer import GPT2TokenizerTransform # noqa: F401
from .hf_tokenizer import HFTokenizerTransform # noqa: F401

files = os.listdir(os.path.dirname(__file__))
files.remove("__init__.py")
30 changes: 10 additions & 20 deletions modyn/models/tokenizers/distill_bert_tokenizer.py
@@ -1,24 +1,14 @@
import torch
from transformers import DistilBertTokenizer

from .hf_tokenizer import HFTokenizerTransform

class DistilBertTokenizerTransform:
"""
Adapted from WildTime's initialize_distilbert_transform
Here you can find the original implementation:
https://github.com/huaxiuyao/Wild-Time/blob/main/wildtime/data/utils.py
"""

def __init__(self, max_token_length: int = 300) -> None:
self.max_token_length = max_token_length
self.tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

def __call__(self, sample: str) -> torch.Tensor:
# make the class Callable to use it as Torch Transform
tokens = self.tokenizer(
sample, padding="max_length", truncation=True, max_length=self.max_token_length, return_tensors="pt"
)
# create a tensor whose first dimension is the input_ids and the second is the attention_mask
data = torch.stack((tokens["input_ids"], tokens["attention_mask"]), dim=2)
data = torch.squeeze(data, dim=0) # First shape dim is always 1, since the input is just one string
return data
class DistilBertTokenizerTransform(HFTokenizerTransform):
def __init__(self, max_token_length: int = 300):
"""
Adapted from WildTime's initialize_distilbert_transform
Here you can find the original implementation:
https://github.com/huaxiuyao/Wild-Time/blob/main/wildtime/data/utils.py
"""
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
super().__init__(tokenizer, max_token_length)
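The shared base class lives in modyn/models/tokenizers/hf_tokenizer.py, which is not included in this excerpt. Reconstructed from the logic removed above, it presumably looks roughly like this (a sketch, not the PR's actual file):

import torch


class HFTokenizerTransform:
    """Sketch of the shared Hugging Face tokenizer transform, inferred from the removed code."""

    def __init__(self, tokenizer, max_token_length: int) -> None:
        self.tokenizer = tokenizer
        self.max_token_length = max_token_length

    def __call__(self, sample: str) -> torch.Tensor:
        # Make the class callable so it can be used as a Torch transform.
        tokens = self.tokenizer(
            sample, padding="max_length", truncation=True,
            max_length=self.max_token_length, return_tensors="pt",
        )
        # Stack token IDs and attention mask into a tensor of shape (max_token_length, 2).
        data = torch.stack((tokens["input_ids"], tokens["attention_mask"]), dim=2)
        return torch.squeeze(data, dim=0)  # the leading batch dimension is always 1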
17 changes: 17 additions & 0 deletions modyn/models/tokenizers/gpt2_tokenizer.py
@@ -0,0 +1,17 @@
from transformers import GPT2Tokenizer

from .hf_tokenizer import HFTokenizerTransform


class GPT2TokenizerTransform(HFTokenizerTransform):
def __init__(self, max_token_length: int = 512):
"""Adapted from an example implementation of a GPT-2 tokenizer.

This implementation uses the GPT-2 tokenizer from Hugging Face's
Transformers library:
https://huggingface.co/docs/transformers/model_doc/gpt2
"""
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-large")
tokenizer.pad_token = tokenizer.eos_token # Set pad token to eos token to avoid padding errors
tokenizer.padding_side = "right"
super().__init__(tokenizer, max_token_length)
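For context, a small usage sketch (assuming the base transform keeps the stacking behaviour sketched above): the transform turns a raw string into the (max_token_length, 2) tensor that Gpt2Modyn.forward expects once a batch dimension is added.

transform = GPT2TokenizerTransform(max_token_length=32)
sample = transform("Modyn trains models on growing datasets.")  # any raw text sample
print(sample.shape)           # torch.Size([32, 2]): token IDs and attention mask stacked
batch = sample.unsqueeze(0)   # (1, 32, 2), ready to feed into Gpt2Modyn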