eth-easl · sjohn4 · Dec 11, 2024 · Jan 6, 2025 · Jan 6, 2025 · Jan 6, 2025
diff --git a/benchmark/mnist/mnist.yaml b/benchmark/mnist/mnist.yaml
@@ -12,6 +12,7 @@ model_storage:
 training:
   gpus: 1
   device: "cuda:0"
+  generative: False
   dataloader_workers: 2
   use_previous_model: True
   initial_model: random

diff --git a/benchmark/wildtime_benchmarks/example_pipelines/arxiv.yaml b/benchmark/wildtime_benchmarks/example_pipelines/arxiv.yaml
@@ -17,6 +17,7 @@ training:
   initial_model: random
   batch_size: 128
   shuffle: True
+  generative: False
   optimizers:
     - name: "default"
       algorithm: "SGD"

diff --git a/benchmark/wildtime_benchmarks/example_pipelines/data_drift_trigger/arxiv_datadrift.yaml b/benchmark/wildtime_benchmarks/example_pipelines/data_drift_trigger/arxiv_datadrift.yaml
@@ -13,6 +13,7 @@ training:
   gpus: 1
   device: "cuda:0"
   dataloader_workers: 2
+  generative: False
   use_previous_model: True
   initial_model: random
   batch_size: 96

diff --git a/benchmark/wildtime_benchmarks/example_pipelines/data_drift_trigger/huffpost_datadrift.yaml b/benchmark/wildtime_benchmarks/example_pipelines/data_drift_trigger/huffpost_datadrift.yaml
@@ -13,6 +13,7 @@ training:
   gpus: 1
   device: "cuda:0"
   dataloader_workers: 2
+  generative: False
   use_previous_model: True
   initial_model: random
   batch_size: 64

diff --git a/benchmark/wildtime_benchmarks/example_pipelines/data_drift_trigger/yearbook_datadrift.yaml b/benchmark/wildtime_benchmarks/example_pipelines/data_drift_trigger/yearbook_datadrift.yaml
@@ -14,6 +14,7 @@ training:
   gpus: 1
   device: "cuda:0"
   dataloader_workers: 2
+  generative: False
   use_previous_model: True
   initial_model: random
   batch_size: 64

diff --git a/benchmark/wildtime_benchmarks/example_pipelines/fmow.yaml b/benchmark/wildtime_benchmarks/example_pipelines/fmow.yaml
@@ -13,6 +13,8 @@ training:
   gpus: 1
   device: "cuda:0"
   dataloader_workers: 2
+  generative: False
+
   use_previous_model: True
   initial_model: random
   batch_size: 64

diff --git a/benchmark/wildtime_benchmarks/example_pipelines/huffpost.yaml b/benchmark/wildtime_benchmarks/example_pipelines/huffpost.yaml
@@ -12,6 +12,7 @@ model_storage:
 training:
   gpus: 1
   device: "cuda:0"
+  generative: False
   dataloader_workers: 2
   use_previous_model: True
   initial_model: random

diff --git a/benchmark/wildtime_benchmarks/example_pipelines/yearbook.yaml b/benchmark/wildtime_benchmarks/example_pipelines/yearbook.yaml
@@ -14,6 +14,7 @@ training:
   gpus: 1
   device: "cuda:0"
   dataloader_workers: 2
+  generative: False
   use_previous_model: True
   initial_model: random
   batch_size: 64

diff --git a/environment.yml b/environment.yml
@@ -30,7 +30,7 @@ dependencies:
   - psycopg2
   - sqlalchemy>=2.0
   - pyaml
-  - pydantic
+  - pydantic==2.9.2
   - numpy==1.26.*
   - pandas
   - bitstring
@@ -43,11 +43,10 @@ dependencies:
   - nltk
   - pytorch::pytorch=2.2.1
   - pytorch::torchvision
-  - pytorch::cpuonly # comment out if commenting in lines below for CUDA
-#  - pytorch::pytorch-cuda=12.1
-#  - nvidia::cuda-libraries-dev=12.1.*
-#  - nvidia::cuda-nvcc=12.1.*
-#  - nvidia::cuda-nvtx=12.1.*
-#  - nvidia::cuda-cupti=12.1.*
-#  - nvidia::cuda-cudart-dev=12.1.*
-#  - nvidia::cuda-profiler-api=12.1.*
+  - pytorch::pytorch-cuda=12.1
+  - nvidia::cuda-libraries-dev=12.1.*
+  - nvidia::cuda-nvcc=12.1.*
+  - nvidia::cuda-nvtx=12.1.*
+  - nvidia::cuda-cupti=12.1.*
+  - nvidia::cuda-cudart-dev=12.1.*
+  - nvidia::cuda-profiler-api=12.1.*
diff --git a/integrationtests/config/dummy.yaml b/integrationtests/config/dummy.yaml
@@ -12,6 +12,7 @@ model_storage:
 training:
   gpus: 1
   device: "cpu"
+  generative: False
   dataloader_workers: 1
   use_previous_model: True
   initial_model: random

diff --git a/integrationtests/config/rho_loss.yaml b/integrationtests/config/rho_loss.yaml
@@ -13,6 +13,7 @@ training:
   gpus: 1
   device: "cpu"
   dataloader_workers: 2
+  generative: False
   use_previous_model: False
   initial_model: random
   batch_size: 4
@@ -60,6 +61,7 @@ selection_strategy:
       il_model_config:
         num_classes: 10
       device: "cpu"
+      generative: False
       dataloader_workers: 1
       use_previous_model: False
       batch_size: 2
@@ -75,4 +77,4 @@ selection_strategy:
                 lr: 0.1
                 momentum: 0.001
       optimization_criterion:
-        name: "CrossEntropyLoss"
+        name: "CrossEntropyLoss"
diff --git a/modyn/common/grpc/grpc_helpers.py b/modyn/common/grpc/grpc_helpers.py
@@ -251,6 +251,7 @@ def prepare_start_training_request(
             enable_accurate_gpu_measurements=training_config.enable_accurate_gpu_measurements,
             record_loss_every=training_config.record_loss_every,
             drop_last_batch=training_config.drop_last_batch,
+            generative=training_config.generative,
         )
 
     def start_training(

diff --git a/modyn/config/examples/modyn_config.yaml b/modyn/config/examples/modyn_config.yaml
@@ -278,7 +278,7 @@ selector:
   local_storage_directory: "/tmp/local_storage"
   local_storage_max_samples_in_file: 1000000
   cleanup_storage_directories_after_shutdown: true
-  ignore_existing_trigger_samples: false
+  ignore_existing_trigger_samples: true
 
 trainer_server:
   hostname: "trainer_server"

diff --git a/modyn/config/schema/pipeline/training/config.py b/modyn/config/schema/pipeline/training/config.py
@@ -119,6 +119,11 @@ class TrainingConfig(ModynBaseModel):
             "we start with random weights. If initial_model is 'pretrained', cannot be False."
         )
     )
+    generative: bool = Field(False,
+        description=(
+            "If True then, then the training pipeline goes into the generative branch, data is sampled without expecting labels."
+        )
+    )
     seed: int | None = Field(
         None,
         description=(

diff --git a/modyn/config/schema/system/config.py b/modyn/config/schema/system/config.py
@@ -255,7 +255,7 @@ class SelectorConfig(HostnamePortMixin):
         ),
     )
     ignore_existing_trigger_samples: bool = Field(
-        False,
+        True,
         description=(
             "Whether to ignore existing trigger samples when starting the selector. If set to false, the trigger "
             "sample directory has to be empty upon startup. May lead to unexpected behaviour if set to true and the "

diff --git a/modyn/models/GPT2/GPT2.py b/modyn/models/GPT2/GPT2.py
@@ -0,0 +1,70 @@
+import torch
+from torch import nn
+from transformers import GPT2LMHeadModel
+from typing import Any
+from modyn.models.coreset_methods_support import CoresetSupportingModule
+
+
+
+
+#from modyn.models.GPT2.RecAdam import RecAdam
+
+
+#from deepspeed.runtime.lr_schedules import WarmupDecayLR
+#import deepspeed
+  #as GPT2_Lora
+
+
+class GPT2:
+    # pylint: disable-next=unused-argument
+    def __init__(self, hparams: Any, device: str, amp: bool) -> None:
+        self.model = GPT2Modyn(hparams)
+        self.model.to(device)
+
+
+# the following class wraps the pretrained GPT-2 language-modeling head model
+# (GPT2LMHeadModel) from the Hugging Face transformers package
+
+
+class GPT2Modyn(CoresetSupportingModule):
+
+    def __init__(self, hparams: Any) -> None:
+        super().__init__()
+
+
+        self.model = GPT2LMHeadModel.from_pretrained("gpt2-large")  # hparams.model_name_or_path
+
+
+
+    def forward(self, data: torch.Tensor, labels: torch.Tensor = None) -> torch.Tensor:
+        """
+        Forward method for text generation or language modeling tasks.
+
+        Args:
+        - data (torch.Tensor): Tensor of shape (batch_size, seq_len, 2), where
+          the last dimension contains token IDs and attention masks.
+        - labels (torch.Tensor, optional): Tensor of labels for language modeling tasks.
+
+        Returns:
+        - output: The output logits or loss from the GPT-2 model.
+        """
+        # Split input into token IDs and attention masks
+        input_ids = data[:, :, 0]
+        attention_mask = data[:, :, 1]   
+        # Forward pass through GPT-2
+        output = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            labels=labels
+        )
+        return output [0]
+
+    def get_last_layer(self) -> nn.Module:
+      """
+      Retrieve the last layer (lm_head) of the model.
+
+      Returns:
+          The final linear layer of the GPT-2 model.
+      """
+      return self.model.lm_head
+
diff --git a/modyn/models/GPT2/__init__.py b/modyn/models/GPT2/__init__.py
@@ -0,0 +1,5 @@
+import os
+
+files = os.listdir(os.path.dirname(__file__))
+files.remove("__init__.py")
+__all__ = [f[:-3] for f in files if f.endswith(".py")]
diff --git a/modyn/models/__init__.py b/modyn/models/__init__.py
@@ -6,6 +6,7 @@
 from .dlrm.dlrm import DLRM  # noqa: F401
 from .dummy.dummy import Dummy  # noqa: F401
 from .fmownet.fmownet import FmowNet  # noqa: F401
+from .GPT2.GPT2 import GPT2  #noqa: F401
 from .resnet18.resnet18 import ResNet18  # noqa: F401
 from .resnet50.resnet50 import ResNet50  # noqa: F401
 from .resnet152.resnet152 import ResNet152  # noqa: F401

diff --git a/modyn/models/tokenizers/__init__.py b/modyn/models/tokenizers/__init__.py
@@ -3,6 +3,7 @@
 import os
 
 from .distill_bert_tokenizer import DistilBertTokenizerTransform  # noqa: F401
+from .gpt2_tokenizer import GPT2TokenizerTransform  # noqa: F401
 
 files = os.listdir(os.path.dirname(__file__))
 files.remove("__init__.py")

diff --git a/modyn/models/tokenizers/gpt2_tokenizer.py b/modyn/models/tokenizers/gpt2_tokenizer.py
@@ -0,0 +1,22 @@
+import torch
+from transformers import GPT2Tokenizer
+
+
+class GPT2TokenizerTransform:
+    def __init__(self, max_token_length: int = 256):
+        # Load the GPT-2 tokenizer
+        self.max_token_length = max_token_length
+        self.tokenizer = GPT2Tokenizer.from_pretrained("gpt2-large")
+        # Set the pad token to the eos token to avoid padding errors
+        self.tokenizer.pad_token = self.tokenizer.eos_token
+        self.tokenizer.padding_side = "right"
+
+    def __call__(self, sample: str) -> torch.Tensor:
+        # Make the class callable to use it as Torch Transform
+        tokens = self.tokenizer(
+            sample, padding="max_length", truncation=True, max_length=self.max_token_length, return_tensors="pt"
+        )
+        # Create a tensor whose first dimension is the input_ids and the second is the attention_mask
+        data = torch.stack((tokens["input_ids"], tokens["attention_mask"]), dim=2)
+        data = torch.squeeze(data, dim=0)  # First shape dim is always 1, since the input is just one string
+        return data
diff --git a/modyn/protos/storage.proto b/modyn/protos/storage.proto
@@ -4,6 +4,7 @@ package modyn.storage;
 
 service Storage {
   rpc Get(GetRequest) returns (stream GetResponse) {}
+  rpc GetNL(GetRequest) returns (stream GetResponse) {}
   rpc GetNewDataSince(GetNewDataSinceRequest)
       returns (stream GetNewDataSinceResponse) {}
   rpc GetDataInInterval(GetDataInIntervalRequest)
@@ -25,6 +26,7 @@ service Storage {
 message GetRequest {
   string dataset_id = 1;
   repeated int64 keys = 2;
+  bool include_labels = 3;  // presumably toggles whether labels are returned with the samples — TODO confirm against the server
 }
 
 message GetResponse {
@@ -33,6 +35,13 @@ message GetResponse {
   repeated int64 labels = 3;
 }
 
+
+message GetResponseNoLabels {
+  repeated bytes samples = 1;
+  repeated int64 keys = 2;
+  // NOTE(review): none of the rpcs visible here use this message (GetNL streams GetResponse) — confirm the intended return type.
+}
+
 // https://github.com/grpc/grpc/issues/15937
 message GetCurrentTimestampRequest {}
 

diff --git a/modyn/protos/trainer_server.proto b/modyn/protos/trainer_server.proto
@@ -59,6 +59,7 @@ message StartTrainingRequest {
   bool enable_accurate_gpu_measurements = 25;
   int64 record_loss_every = 26;
   bool drop_last_batch = 27;
+  bool generative=28;
 }
 
 message StartTrainingResponse {

diff --git a/modyn/storage/include/internal/file_wrapper/binary_file_wrapper.hpp b/modyn/storage/include/internal/file_wrapper/binary_file_wrapper.hpp
@@ -37,7 +37,7 @@ class BinaryFileWrapper : public FileWrapper {
   std::vector<int64_t> get_all_labels() override;
   std::vector<unsigned char> get_sample(uint64_t index) override;
   std::vector<std::vector<unsigned char>> get_samples(uint64_t start, uint64_t end) override;
-  std::vector<std::vector<unsigned char>> get_samples_from_indices(const std::vector<uint64_t>& indices) override;
+  std::vector<std::vector<unsigned char>> get_samples_from_indices(const std::vector<uint64_t>& indices,  bool generative = false) override;
   void validate_file_extension() override;
   void delete_samples(const std::vector<uint64_t>& indices) override;
   void set_file_path(const std::string& path) override;

diff --git a/modyn/storage/include/internal/file_wrapper/csv_file_wrapper.hpp b/modyn/storage/include/internal/file_wrapper/csv_file_wrapper.hpp
@@ -66,7 +66,7 @@ class CsvFileWrapper : public FileWrapper {
   std::vector<int64_t> get_all_labels() override;
   std::vector<unsigned char> get_sample(uint64_t index) override;
   std::vector<std::vector<unsigned char>> get_samples(uint64_t start, uint64_t end) override;
-  std::vector<std::vector<unsigned char>> get_samples_from_indices(const std::vector<uint64_t>& indices) override;
+  std::vector<std::vector<unsigned char>> get_samples_from_indices(const std::vector<uint64_t>& indices,  bool generative = false) override;
   void validate_file_extension() override;
   void delete_samples(const std::vector<uint64_t>& indices) override;
   void set_file_path(const std::string& path) override;

diff --git a/modyn/storage/include/internal/file_wrapper/file_wrapper.hpp b/modyn/storage/include/internal/file_wrapper/file_wrapper.hpp
@@ -21,7 +21,7 @@ class FileWrapper {
   virtual std::vector<int64_t> get_all_labels() = 0;
   virtual std::vector<unsigned char> get_sample(uint64_t index) = 0;
   virtual std::vector<std::vector<unsigned char>> get_samples(uint64_t start, uint64_t end) = 0;
-  virtual std::vector<std::vector<unsigned char>> get_samples_from_indices(const std::vector<uint64_t>& indices) = 0;
+  virtual std::vector<std::vector<unsigned char>> get_samples_from_indices(const std::vector<uint64_t>& indices, bool generative = false) = 0;
   virtual void validate_file_extension() = 0;
   virtual void delete_samples(const std::vector<uint64_t>& indices) = 0;
   virtual void set_file_path(const std::string& path) = 0;

diff --git a/modyn/storage/include/internal/file_wrapper/single_sample_file_wrapper.hpp b/modyn/storage/include/internal/file_wrapper/single_sample_file_wrapper.hpp
@@ -18,7 +18,7 @@ class SingleSampleFileWrapper : public FileWrapper {
   std::vector<int64_t> get_all_labels() override;
   std::vector<unsigned char> get_sample(uint64_t index) override;
   std::vector<std::vector<unsigned char>> get_samples(uint64_t start, uint64_t end) override;
-  std::vector<std::vector<unsigned char>> get_samples_from_indices(const std::vector<uint64_t>& indices) override;
+  std::vector<std::vector<unsigned char>> get_samples_from_indices(const std::vector<uint64_t>& indices, bool generative = false) override;  // same default as the base declaration
   void validate_file_extension() override;
   void delete_samples(const std::vector<uint64_t>& indices) override;
   void set_file_path(const std::string& path) override { file_path_ = path; }