Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Text generation #643

Open
wants to merge 13 commits into
base: main
Choose a base branch
from
Open
1 change: 1 addition & 0 deletions benchmark/mnist/mnist.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ model_storage:
training:
gpus: 1
device: "cuda:0"
generative: False
dataloader_workers: 2
use_previous_model: True
initial_model: random
Expand Down
1 change: 1 addition & 0 deletions benchmark/wildtime_benchmarks/example_pipelines/arxiv.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ training:
initial_model: random
batch_size: 128
shuffle: True
generative: False
optimizers:
- name: "default"
algorithm: "SGD"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ training:
gpus: 1
device: "cuda:0"
dataloader_workers: 2
generative: False
use_previous_model: True
initial_model: random
batch_size: 96
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ training:
gpus: 1
device: "cuda:0"
dataloader_workers: 2
generative: False
use_previous_model: True
initial_model: random
batch_size: 64
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ training:
gpus: 1
device: "cuda:0"
dataloader_workers: 2
generative: False
use_previous_model: True
initial_model: random
batch_size: 64
Expand Down
2 changes: 2 additions & 0 deletions benchmark/wildtime_benchmarks/example_pipelines/fmow.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ training:
gpus: 1
device: "cuda:0"
dataloader_workers: 2
generative: False

use_previous_model: True
initial_model: random
batch_size: 64
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ model_storage:
training:
gpus: 1
device: "cuda:0"
generative: False
dataloader_workers: 2
use_previous_model: True
initial_model: random
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ training:
gpus: 1
device: "cuda:0"
dataloader_workers: 2
generative: False
use_previous_model: True
initial_model: random
batch_size: 64
Expand Down
17 changes: 8 additions & 9 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ dependencies:
- psycopg2
- sqlalchemy>=2.0
- pyaml
- pydantic
- pydantic==2.9.2
- numpy==1.26.*
- pandas
- bitstring
Expand All @@ -43,11 +43,10 @@ dependencies:
- nltk
- pytorch::pytorch=2.2.1
- pytorch::torchvision
- pytorch::cpuonly # comment out if commenting in lines below for CUDA
# - pytorch::pytorch-cuda=12.1
# - nvidia::cuda-libraries-dev=12.1.*
# - nvidia::cuda-nvcc=12.1.*
# - nvidia::cuda-nvtx=12.1.*
# - nvidia::cuda-cupti=12.1.*
# - nvidia::cuda-cudart-dev=12.1.*
# - nvidia::cuda-profiler-api=12.1.*
- pytorch::pytorch-cuda=12.1
- nvidia::cuda-libraries-dev=12.1.*
- nvidia::cuda-nvcc=12.1.*
- nvidia::cuda-nvtx=12.1.*
- nvidia::cuda-cupti=12.1.*
- nvidia::cuda-cudart-dev=12.1.*
- nvidia::cuda-profiler-api=12.1.*
1 change: 1 addition & 0 deletions integrationtests/config/dummy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ model_storage:
training:
gpus: 1
device: "cpu"
generative: False
dataloader_workers: 1
use_previous_model: True
initial_model: random
Expand Down
4 changes: 3 additions & 1 deletion integrationtests/config/rho_loss.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ training:
gpus: 1
device: "cpu"
dataloader_workers: 2
generative: False
use_previous_model: False
initial_model: random
batch_size: 4
Expand Down Expand Up @@ -60,6 +61,7 @@ selection_strategy:
il_model_config:
num_classes: 10
device: "cpu"
generative: False
dataloader_workers: 1
use_previous_model: False
batch_size: 2
Expand All @@ -75,4 +77,4 @@ selection_strategy:
lr: 0.1
momentum: 0.001
optimization_criterion:
name: "CrossEntropyLoss"
name: "CrossEntropyLoss"
1 change: 1 addition & 0 deletions modyn/common/grpc/grpc_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,7 @@ def prepare_start_training_request(
enable_accurate_gpu_measurements=training_config.enable_accurate_gpu_measurements,
record_loss_every=training_config.record_loss_every,
drop_last_batch=training_config.drop_last_batch,
generative=training_config.generative,
)

def start_training(
Expand Down
2 changes: 1 addition & 1 deletion modyn/config/examples/modyn_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -278,7 +278,7 @@ selector:
local_storage_directory: "/tmp/local_storage"
local_storage_max_samples_in_file: 1000000
cleanup_storage_directories_after_shutdown: true
ignore_existing_trigger_samples: false
ignore_existing_trigger_samples: true

trainer_server:
hostname: "trainer_server"
Expand Down
5 changes: 5 additions & 0 deletions modyn/config/schema/pipeline/training/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,11 @@ class TrainingConfig(ModynBaseModel):
"we start with random weights. If initial_model is 'pretrained', cannot be False."
)
)
# Switches the training pipeline to the generative (label-free) branch.
generative: bool = Field(
    False,
    description=(
        "If True, the training pipeline goes into the generative branch, data is sampled without expecting labels."
    ),
)
seed: int | None = Field(
None,
description=(
Expand Down
2 changes: 1 addition & 1 deletion modyn/config/schema/system/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,7 @@ class SelectorConfig(HostnamePortMixin):
),
)
ignore_existing_trigger_samples: bool = Field(
False,
True,
description=(
"Whether to ignore existing trigger samples when starting the selector. If set to false, the trigger "
"sample directory has to be empty upon startup. May lead to unexpected behaviour if set to true and the "
Expand Down
70 changes: 70 additions & 0 deletions modyn/models/GPT2/GPT2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import torch
from torch import nn
from transformers import GPT2LMHeadModel
from typing import Any
from modyn.models.coreset_methods_support import CoresetSupportingModule




#from modyn.models.GPT2.RecAdam import RecAdam


#from deepspeed.runtime.lr_schedules import WarmupDecayLR
#import deepspeed
#as GPT2_Lora


class GPT2:
    """Wrapper that builds a ``GPT2Modyn`` network and moves it to ``device``.

    Args:
        hparams: Hyperparameter object, forwarded unchanged to ``GPT2Modyn``.
        device: Device string handed to ``.to()`` (e.g. ``"cpu"`` or ``"cuda:0"``).
        amp: Not used by this wrapper.
    """

    # pylint: disable-next=unused-argument
    def __init__(self, hparams: Any, device: str, amp: bool) -> None:
        network = GPT2Modyn(hparams)
        network.to(device)
        self.model = network

Check warning on line 22 in modyn/models/GPT2/GPT2.py

View check run for this annotation

Codecov / codecov/patch

modyn/models/GPT2/GPT2.py#L21-L22

Added lines #L21 - L22 were not covered by tests


# The following class wraps Hugging Face's GPT2LMHeadModel (transformers)
# so that it can be used as a Modyn CoresetSupportingModule.


class GPT2Modyn(CoresetSupportingModule):
    """GPT-2 language model (``GPT2LMHeadModel``) exposed as a ``CoresetSupportingModule``."""

    def __init__(self, hparams: Any) -> None:
        """Load the pretrained ``gpt2-large`` checkpoint.

        Args:
            hparams: Currently not read; the checkpoint name is hard-coded
                (``hparams.model_name_or_path`` is not consulted yet).
        """
        super().__init__()
        self.model = GPT2LMHeadModel.from_pretrained("gpt2-large")

    def forward(self, data: torch.Tensor, labels: torch.Tensor = None) -> torch.Tensor:
        """Run the wrapped GPT-2 model for text generation / language modeling.

        Args:
            data: Tensor of shape (batch_size, seq_len, 2); index 0 of the last
                dimension holds the token IDs, index 1 the attention mask.
            labels: Optional tensor of labels for language modeling; passed
                through to the wrapped model as-is.

        Returns:
            The first element of the wrapped model's output (logits or loss).
        """
        # Separate the two channels packed into the last dimension.
        token_ids, mask = data[:, :, 0], data[:, :, 1]
        result = self.model(input_ids=token_ids, attention_mask=mask, labels=labels)
        return result[0]

    def get_last_layer(self) -> nn.Module:
        """Return the final layer of the wrapped model, i.e. its ``lm_head``."""
        return self.model.lm_head

Check warning on line 69 in modyn/models/GPT2/GPT2.py

View check run for this annotation

Codecov / codecov/patch

modyn/models/GPT2/GPT2.py#L69

Added line #L69 was not covered by tests

5 changes: 5 additions & 0 deletions modyn/models/GPT2/_init_.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
import os

# Collect the module names in this package directory for ``__all__``.
# The package initializer is filtered out instead of being removed with
# ``list.remove`` so that a listing without an ``__init__.py`` entry does
# not raise ``ValueError``.
files = [f for f in os.listdir(os.path.dirname(__file__)) if f != "__init__.py"]
# Strip the ".py" suffix to turn file names into importable module names.
__all__ = [f[:-3] for f in files if f.endswith(".py")]
1 change: 1 addition & 0 deletions modyn/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from .dlrm.dlrm import DLRM # noqa: F401
from .dummy.dummy import Dummy # noqa: F401
from .fmownet.fmownet import FmowNet # noqa: F401
from .GPT2.GPT2 import GPT2 #noqa: F401
from .resnet18.resnet18 import ResNet18 # noqa: F401
from .resnet50.resnet50 import ResNet50 # noqa: F401
from .resnet152.resnet152 import ResNet152 # noqa: F401
Expand Down
1 change: 1 addition & 0 deletions modyn/models/tokenizers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import os

from .distill_bert_tokenizer import DistilBertTokenizerTransform # noqa: F401
from .gpt2_tokenizer import GPT2TokenizerTransform # noqa: F401

files = os.listdir(os.path.dirname(__file__))
files.remove("__init__.py")
Expand Down
22 changes: 22 additions & 0 deletions modyn/models/tokenizers/gpt2_tokenizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import torch
from transformers import GPT2Tokenizer


class GPT2TokenizerTransform:
    """Callable transform that tokenizes a single string with the GPT-2 tokenizer."""

    def __init__(self, max_token_length: int = 256):
        """Load the ``gpt2-large`` tokenizer and configure padding.

        Args:
            max_token_length: Length every sample is padded/truncated to.
        """
        self.max_token_length = max_token_length
        self.tokenizer = GPT2Tokenizer.from_pretrained("gpt2-large")
        # Reuse the EOS token as the pad token to avoid padding errors.
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.tokenizer.padding_side = "right"

    def __call__(self, sample: str) -> torch.Tensor:
        """Tokenize ``sample``; being callable lets the object act as a Torch transform.

        Returns:
            A tensor in which ``input_ids`` and ``attention_mask`` are stacked
            along the last dimension, with the leading dimension squeezed away.
        """
        encoded = self.tokenizer(
            sample,
            padding="max_length",
            truncation=True,
            max_length=self.max_token_length,
            return_tensors="pt",
        )
        # Stack on dim=2 so index 0 of the last axis is input_ids and index 1 the attention_mask.
        stacked = torch.stack((encoded["input_ids"], encoded["attention_mask"]), dim=2)
        # The first shape dim is always 1 because the input is a single string.
        return torch.squeeze(stacked, dim=0)

Check warning on line 22 in modyn/models/tokenizers/gpt2_tokenizer.py

View check run for this annotation

Codecov / codecov/patch

modyn/models/tokenizers/gpt2_tokenizer.py#L20-L22

Added lines #L20 - L22 were not covered by tests
9 changes: 9 additions & 0 deletions modyn/protos/storage.proto
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ package modyn.storage;

service Storage {
rpc Get(GetRequest) returns (stream GetResponse) {}
rpc GetNL(GetRequest) returns (stream GetResponse) {}
rpc GetNewDataSince(GetNewDataSinceRequest)
returns (stream GetNewDataSinceResponse) {}
rpc GetDataInInterval(GetDataInIntervalRequest)
Expand All @@ -25,6 +26,7 @@ service Storage {
// Request for the samples of a dataset identified by their keys.
message GetRequest {
string dataset_id = 1;
repeated int64 keys = 2;
bool include_labels = 3; // presumably selects whether labels are returned with the samples; TODO confirm server-side handling
}

message GetResponse {
Expand All @@ -33,6 +35,13 @@ message GetResponse {
repeated int64 labels = 3;
}


// Response variant that carries only samples and keys; it has no labels field.
// NOTE(review): no rpc in the visible part of this file returns this message; confirm intended use.
message GetResponseNoLabels {
repeated bytes samples = 1;
repeated int64 keys = 2;

}

// https://github.com/grpc/grpc/issues/15937
message GetCurrentTimestampRequest {}

Expand Down
1 change: 1 addition & 0 deletions modyn/protos/trainer_server.proto
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ message StartTrainingRequest {
bool enable_accurate_gpu_measurements = 25;
int64 record_loss_every = 26;
bool drop_last_batch = 27;
bool generative=28;
}

message StartTrainingResponse {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ class BinaryFileWrapper : public FileWrapper {
std::vector<int64_t> get_all_labels() override;
std::vector<unsigned char> get_sample(uint64_t index) override;
std::vector<std::vector<unsigned char>> get_samples(uint64_t start, uint64_t end) override;
std::vector<std::vector<unsigned char>> get_samples_from_indices(const std::vector<uint64_t>& indices) override;
std::vector<std::vector<unsigned char>> get_samples_from_indices(const std::vector<uint64_t>& indices, bool generative = false) override;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

let's be consistent in naming here: either include_labels (preferred I think for these calls semantically) or generative.

void validate_file_extension() override;
void delete_samples(const std::vector<uint64_t>& indices) override;
void set_file_path(const std::string& path) override;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ class CsvFileWrapper : public FileWrapper {
std::vector<int64_t> get_all_labels() override;
std::vector<unsigned char> get_sample(uint64_t index) override;
std::vector<std::vector<unsigned char>> get_samples(uint64_t start, uint64_t end) override;
std::vector<std::vector<unsigned char>> get_samples_from_indices(const std::vector<uint64_t>& indices) override;
std::vector<std::vector<unsigned char>> get_samples_from_indices(const std::vector<uint64_t>& indices, bool generative = false) override;
void validate_file_extension() override;
void delete_samples(const std::vector<uint64_t>& indices) override;
void set_file_path(const std::string& path) override;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ class FileWrapper {
virtual std::vector<int64_t> get_all_labels() = 0;
virtual std::vector<unsigned char> get_sample(uint64_t index) = 0;
virtual std::vector<std::vector<unsigned char>> get_samples(uint64_t start, uint64_t end) = 0;
virtual std::vector<std::vector<unsigned char>> get_samples_from_indices(const std::vector<uint64_t>& indices) = 0;
virtual std::vector<std::vector<unsigned char>> get_samples_from_indices(const std::vector<uint64_t>& indices, bool generative = false) = 0;
virtual void validate_file_extension() = 0;
virtual void delete_samples(const std::vector<uint64_t>& indices) = 0;
virtual void set_file_path(const std::string& path) = 0;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ class SingleSampleFileWrapper : public FileWrapper {
std::vector<int64_t> get_all_labels() override;
std::vector<unsigned char> get_sample(uint64_t index) override;
std::vector<std::vector<unsigned char>> get_samples(uint64_t start, uint64_t end) override;
std::vector<std::vector<unsigned char>> get_samples_from_indices(const std::vector<uint64_t>& indices) override;
// Returns the samples at the given indices. The `generative` default mirrors the
// FileWrapper base declaration and the Binary/Csv overrides, so calls made through
// this concrete type can omit the flag just like calls through the base.
std::vector<std::vector<unsigned char>> get_samples_from_indices(const std::vector<uint64_t>& indices, bool generative = false) override;
void validate_file_extension() override;
void delete_samples(const std::vector<uint64_t>& indices) override;
void set_file_path(const std::string& path) override { file_path_ = path; }
Expand Down
Loading
Loading