Merge pull request #6 from prajjwal1/meta
added Meta Trainer, tests, example
prajjwal1 authored Jun 29, 2020
2 parents 8fc1394 + 5a012b7 commit b5412bd
Showing 13 changed files with 544 additions and 14 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/main.yml
@@ -25,7 +25,8 @@ jobs:
run: |
python -m pip install --upgrade pip
pip install flake8 pytest black isort
pip install torch transformers sklearn torchvision
pip install torch sklearn torchvision pandas
pip install git+https://github.com/huggingface/transformers
python3 setup.py install --user
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
- name: Lint with flake8
3 changes: 3 additions & 0 deletions .gitignore
@@ -62,6 +62,9 @@ coverage.xml
*.mo
*.pot

wandb/
*.lock
cache*
# Django stuff:
*.log
local_settings.py
6 changes: 3 additions & 3 deletions Makefile
@@ -1,8 +1,8 @@
style:
isort --recursive --multi-line=3 --trailing-comma --force-grid-wrap=0 --use-parentheses --line-width=88 fluence tests
black fluence tests
isort --recursive --multi-line=3 --trailing-comma --force-grid-wrap=0 --use-parentheses --line-width=88 fluence tests examples
black fluence tests examples

quality:
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
black --check fluence tests
black --check fluence tests examples

11 changes: 8 additions & 3 deletions README.md
@@ -1,5 +1,5 @@
# Fluence
> Fluence is a Pytorch based deep learning library focussed on providing computationally efficient, low resource methods and algorithms. Although the main focus is to provide support with transformers, it can be extended with other architectures as well.
> Fluence is a PyTorch-based deep learning library focused on providing computationally efficient, low-resource methods and algorithms. Although the main focus is to provide support for transformers on NLP tasks, it can be extended to other domains and architectures as well. Currently in the pre-alpha stage.

![badge](https://github.com/prajjwal1/fluence/workflows/build/badge.svg)
@@ -14,6 +14,7 @@ pip3 install --user fluence
For development version:
```bash
git clone https://github.com/prajjwal1/fluence
cd fluence
python3 setup.py install --user
```

@@ -33,10 +33,14 @@ The library contains implementation for the following approaches (many more to c
- Clustering


# Documentation
### Documentation
Please head to this [link](https://github.com/prajjwal1/fluence/wiki) to learn how you can integrate fluence with your workflow. Since it's an early release, there might be bugs here and there. Please file an issue if you encounter one.

# Tests
### Contribution
I'd really appreciate it if you file an issue or send a PR when you encounter a bug or want a feature added. Please check out the [contributing guide](https://github.com/prajjwal1/fluence/blob/master/CONTRIBUTING.md) for more details.


### Tests
Tests can be run with pytest
```
pytest tests/
6 changes: 6 additions & 0 deletions examples/README.md
@@ -0,0 +1,6 @@
### Running MAML

Usage
```bash
python3 examples/run_maml_glue.py --model_name_or_path bert-base-uncased --do_train --do_eval --max_seq_length 128 --per_device_train_batch_size 1 --learning_rate 2e-5 --output_dir /home/nlp/experiments/fluence_exp/ --overwrite_output_dir --per_device_eval_batch_size 4096 --data_dir $GLUE_DIR --train_task mrpc --eval_task sst-2 --save_steps=10000 --num_train_epochs=1 --output_file_name check --eval_method every_2
```
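
The command assumes `$GLUE_DIR` contains one folder per task, using the folder names hard-coded in the `data_dir` mapping of `examples/run_maml_glue.py` below. A minimal sketch for checking that layout (folder names are copied from the script; adjust them if your local GLUE download differs):

```python
import os

# Folder names taken from the data_dir mapping in examples/run_maml_glue.py;
# rename them if your local GLUE download uses different ones (e.g. CoLA).
expected = {"mrpc": "MRPC", "sst-2": "SST-2", "cola": "Cola", "sts-b": "STS-B"}

glue_dir = os.environ.get("GLUE_DIR", ".")
for task, folder in expected.items():
    path = os.path.join(glue_dir, folder)
    print(f"{task}: {path} -> {'found' if os.path.isdir(path) else 'missing'}")
```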
200 changes: 200 additions & 0 deletions examples/run_maml_glue.py
@@ -0,0 +1,200 @@
import dataclasses
import logging
import os
from dataclasses import dataclass, field
from typing import Callable, Dict, List, Optional, Tuple

import numpy as np
import torch
from torch.utils.data.dataloader import DataLoader
from torch.utils.data.dataset import Dataset
from tqdm import tqdm, trange
from transformers import (
AutoConfig,
AutoModelForSequenceClassification,
AutoTokenizer,
EvalPrediction,
GlueDataset,
)
from transformers import GlueDataTrainingArguments as DataTrainingArguments
from transformers import (
HfArgumentParser,
Trainer,
TrainingArguments,
default_data_collator,
glue_compute_metrics,
glue_output_modes,
glue_tasks_num_labels,
set_seed,
)
from transformers.data.data_collator import DataCollator

from fluence.meta import MetaDataset, MetaTrainer

logger = logging.getLogger(__name__)


@dataclass
class ModelArguments:
"""
Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
"""

model_name_or_path: str = field(
metadata={
"help": (
"Path to pretrained model or model identifier from"
" huggingface.co/models"
)
}
)
config_name: Optional[str] = field(
default=None,
metadata={
"help": "Pretrained config name or path if not the same as model_name"
},
)
tokenizer_name: Optional[str] = field(
default=None,
metadata={
"help": "Pretrained tokenizer name or path if not the same as model_name"
},
)
cache_dir: Optional[str] = field(
default=None,
metadata={
"help": (
"Where do you want to store the pretrained models downloaded from s3"
)
},
)


@dataclass
class MetaArguments(TrainingArguments):
train_task: Optional[str] = field(
default=None, metadata={"help": "Support dataset"}
)
eval_task: Optional[str] = field(default=None, metadata={"help": "Query dataset"})
data_dir: Optional[str] = field(default=None)
inner_learning_rate: float = field(default=2e-5)
learning_rate: Optional[float] = field(default=2e-5) # Outer
max_len: int = field(default=80)
eval_method: Optional[str] = field(default=None)
max_seq_length: int = field(
default=128,
metadata={
"help": (
"The maximum total input sequence length after tokenization. Sequences"
" longer than this will be truncated, sequences shorter will be padded."
)
},
)
overwrite_cache: bool = field(
default=False,
metadata={"help": "Overwrite the cached training and evaluation sets"},
)
output_file_name: Optional[str] = field(default="results")


def main():
parser = HfArgumentParser((ModelArguments, MetaArguments))
model_args, training_args = parser.parse_args_into_dataclasses()
if (
os.path.exists(training_args.output_dir)
and os.listdir(training_args.output_dir)
and training_args.do_train
and not training_args.overwrite_output_dir
):
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not"
" empty. Use --overwrite_output_dir to overcome."
)

# Set seed
set_seed(training_args.seed)

# Setup logging
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
)
logger.warning(
"Process rank: %s, device: %s, n_gpu: %s, distributed training: %s,"
" 16-bits training: %s",
training_args.local_rank,
training_args.device,
training_args.n_gpu,
bool(training_args.local_rank != -1),
training_args.fp16,
)
logger.info("Training/evaluation parameters %s", training_args)
set_seed(training_args.seed)

try:
num_labels = glue_tasks_num_labels[training_args.train_task]
output_mode = glue_output_modes[training_args.train_task]
except KeyError:
raise ValueError("Task not found: %s" % (training_args.train_task))

config = AutoConfig.from_pretrained(
model_args.config_name
if model_args.config_name
else model_args.model_name_or_path,
num_labels=num_labels,
finetuning_task=training_args.train_task,
cache_dir=model_args.cache_dir,
)
tokenizer = AutoTokenizer.from_pretrained(
model_args.tokenizer_name
if model_args.tokenizer_name
else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
)
model = AutoModelForSequenceClassification.from_pretrained(
model_args.model_name_or_path,
from_tf=bool(".ckpt" in model_args.model_name_or_path),
config=config,
cache_dir=model_args.cache_dir,
)

def build_compute_metrics_fn(task_name: str) -> Callable[[EvalPrediction], Dict]:
def compute_metrics_fn(p: EvalPrediction) -> Dict:
if output_mode == "classification":
preds = np.argmax(p.predictions, axis=1)
elif output_mode == "regression":
preds = np.squeeze(p.predictions)
return glue_compute_metrics(task_name, preds, p.label_ids)

return compute_metrics_fn

data_dir = {
"mrpc": training_args.data_dir + "/MRPC",
"sst-2": training_args.data_dir + "/SST-2",
"cola": training_args.data_dir + "/Cola",
"sts-b": training_args.data_dir + "/STS-B",
}

training_args.task_name = training_args.train_task
training_args.data_dir = data_dir[training_args.task_name]
train_dataset = GlueDataset(training_args, tokenizer=tokenizer)
meta_dataset = MetaDataset(train_dataset)
training_args.task_name = training_args.eval_task
training_args.data_dir = data_dir[training_args.task_name]
eval_dataset = GlueDataset(training_args, tokenizer=tokenizer, mode="dev")

meta_trainer = MetaTrainer(
model=model,
args=training_args,
train_dataset=meta_dataset,
eval_dataset=eval_dataset,
train_data_collator=torch.utils.data._utils.collate.default_collate,
eval_data_collator=default_data_collator,
)

meta_trainer.train()


if __name__ == "__main__":
main()
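
For quick reference, the meta-training wiring in the script above reduces to the sketch below. The class names and `MetaTrainer` constructor arguments are taken from the script; `model`, `tokenizer`, and `training_args` are assumed to be built as shown there (including switching `data_dir` per task):

```python
import torch
from transformers import GlueDataset, default_data_collator

from fluence.meta import MetaDataset, MetaTrainer

# Assumes model, tokenizer and training_args are constructed as in the script
# above, with train_task/eval_task set and data_dir pointing at the right task.
training_args.task_name = training_args.train_task  # support task, e.g. mrpc
support_set = MetaDataset(GlueDataset(training_args, tokenizer=tokenizer))

training_args.task_name = training_args.eval_task   # query task, e.g. sst-2
query_set = GlueDataset(training_args, tokenizer=tokenizer, mode="dev")

meta_trainer = MetaTrainer(
    model=model,
    args=training_args,
    train_dataset=support_set,
    eval_dataset=query_set,
    train_data_collator=torch.utils.data._utils.collate.default_collate,
    eval_data_collator=default_data_collator,
)
meta_trainer.train()
```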
2 changes: 1 addition & 1 deletion fluence/__init__.py
@@ -1 +1 @@
__version__ = "0.1.4"
__version__ = "0.1.5"
2 changes: 2 additions & 0 deletions fluence/meta/__init__.py
@@ -1 +1,3 @@
from .meta_args import MetaArguments
from .meta_dataset import MetaDataset
from .meta_trainer import MetaTrainer
28 changes: 28 additions & 0 deletions fluence/meta/meta_args.py
@@ -0,0 +1,28 @@
from dataclasses import dataclass, field
from typing import Callable, Dict, List, Optional, Tuple

from transformers import TrainingArguments


@dataclass
class MetaArguments(TrainingArguments):
train_task: Optional[str] = field(default=None, metadata={"help": "Support dataset"})
eval_task: Optional[str] = field(default=None, metadata={"help": "Query dataset"})
data_dir: str = field(default=None)
inner_learning_rate: float = field(default=1e-3)
outer_learning_rate: float = field(default=2e-5)
max_len: int = field(default=80)
eval_method: str = field(default=None)
max_seq_length: int = field(
default=128,
metadata={
"help": (
"The maximum total input sequence length after tokenization. Sequences"
" longer than this will be truncated, sequences shorter will be padded."
)
},
)
overwrite_cache: bool = field(
default=False,
metadata={"help": "Overwrite the cached training and evaluation sets"},
)
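
Because `MetaArguments` subclasses `TrainingArguments`, it can be parsed straight from the command line with `HfArgumentParser`, mirroring how the example script parses its own argument dataclasses. A minimal sketch (flag names follow the field names above; `--output_dir` is still required by `TrainingArguments`):

```python
from transformers import HfArgumentParser

from fluence.meta import MetaArguments

# Exposes --train_task, --eval_task, --inner_learning_rate,
# --outer_learning_rate, --eval_method, etc., alongside the usual
# TrainingArguments flags such as --output_dir.
parser = HfArgumentParser(MetaArguments)
(meta_args,) = parser.parse_args_into_dataclasses()
print(meta_args.train_task, meta_args.eval_task, meta_args.inner_learning_rate)
```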