From cf3f7bcaac4824c9af365fed38e616fc87408b7a Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Sat, 21 Mar 2020 16:25:05 +0900 Subject: [PATCH 01/66] Feat: Add SemEval2010T1NER dataset --- .../test_datasets/test_sequential_labeling.py | 31 ++++-- torchglyph/datasets/__init__.py | 1 + torchglyph/datasets/sequential_labeling.py | 99 +++++++++++++++++-- 3 files changed, 115 insertions(+), 16 deletions(-) diff --git a/tests/test_datasets/test_sequential_labeling.py b/tests/test_datasets/test_sequential_labeling.py index d76f233..94a7e6d 100644 --- a/tests/test_datasets/test_sequential_labeling.py +++ b/tests/test_datasets/test_sequential_labeling.py @@ -1,14 +1,29 @@ -from torchglyph.datasets.sequential_labeling import CoNLL2000Chunking, CoNLL2003NER +from torchglyph.datasets import CoNLL2000Chunking, CoNLL2003NER +from torchglyph.datasets import SemEval2010T1NERCatalan, SemEval2010T1NERSpanish -def test_conll2000_chunking() -> None: +def test_conll2000_chunking(): train, test = CoNLL2000Chunking.new(batch_size=1, word_dim=None) - assert len(train) == 8936 - assert len(test) == 2012 + assert len(train.dataset) == 8936 + assert len(test.dataset) == 2012 -def test_conll2003_ner() -> None: +def test_conll2003_ner(): train, dev, test = CoNLL2003NER.new(batch_size=1, word_dim=None) - assert len(train) == 14987 - assert len(dev) == 3466 - assert len(test) == 3684 + assert len(train.dataset) == 14987 + assert len(dev.dataset) == 3466 + assert len(test.dataset) == 3684 + + +def test_semeval2010_catalan(): + train, dev, test = SemEval2010T1NERCatalan.new(batch_size=1, word_dim=None) + assert len(train.dataset) == 8709 + assert len(dev.dataset) == 1445 + assert len(test.dataset) == 1698 + + +def test_semeval2010_spanish(): + train, dev, test = SemEval2010T1NERSpanish.new(batch_size=1, word_dim=None) + assert len(train.dataset) == 9022 + assert len(dev.dataset) == 1419 + assert len(test.dataset) == 1705 diff --git a/torchglyph/datasets/__init__.py b/torchglyph/datasets/__init__.py index b5b186c..0c2ab4f 100644 --- a/torchglyph/datasets/__init__.py +++ b/torchglyph/datasets/__init__.py @@ -1,2 +1,3 @@ from torchglyph.datasets.sequential_labeling import CoNLL2000Chunking, CoNLL2003NER +from torchglyph.datasets.sequential_labeling import SemEval2010T1NERCatalan, SemEval2010T1NERSpanish from torchglyph.datasets.text_classification import AgNews diff --git a/torchglyph/datasets/sequential_labeling.py b/torchglyph/datasets/sequential_labeling.py index f91ed5c..b5f2405 100644 --- a/torchglyph/datasets/sequential_labeling.py +++ b/torchglyph/datasets/sequential_labeling.py @@ -1,14 +1,17 @@ import logging from pathlib import Path -from typing import Iterable, List, Any, Tuple, Optional, NamedTuple, TextIO +from typing import Iterable, Any +from typing import Optional, List, Tuple, NamedTuple +from typing import TextIO from tqdm import tqdm from torchglyph.dataset import Dataset, DataLoader from torchglyph.formats import conllx -from torchglyph.pipe import PackedTokSeqPipe, SeqLengthTensorPipe, RawPipe, PackedTokPtrSeqPipe +from torchglyph.pipe import PackedTokSeqPipe, SeqLengthTensorPipe, RawPipe, PackedTokPtrSeqPipe, PackedPtrSeqPipe, \ + ToSubList, UpdateCounter, Lift from torchglyph.pipe import PaddedTokSeqPipe, PackedTokBlockPipe -from torchglyph.proc import ToLower, ReplaceDigits, Identity, LoadGlove +from torchglyph.proc import ReplaceDigits, Identity, LoadGlove, LoadFastText, Prepend class CoNLL2000Chunking(Dataset): @@ -20,7 +23,7 @@ class CoNLL2000Chunking(Dataset): @classmethod def load(cls, path: Path) 
-> Iterable[List[Any]]:
         for sent in tqdm(conllx.load(path, sep=' '), desc=f'reading {path}'):
-            word, pos, chunk = list(zip(*sent))
+            word, pos, chunk = map(list, zip(*sent))
             yield [word, pos, chunk]

     def dump(self, fp: TextIO, batch: NamedTuple, prediction: List[List[int]], *args, **kwargs) -> None:
@@ -33,7 +36,7 @@ def dump(self, fp: TextIO, batch: NamedTuple, prediction: List[List[int]], *args
     @classmethod
     def new(cls, batch_size: int, word_dim: Optional[int], device: int = -1) -> Tuple[DataLoader, ...]:
         word = PackedTokSeqPipe(device=device, unk_token='<unk>').with_(
-            pre=ToLower() + ReplaceDigits(repl_token='<digits>') + ...,
+            pre=ReplaceDigits(repl_token='<digits>') + ...,
             vocab=... + (Identity() if word_dim is None else LoadGlove('6B', word_dim, str.lower)),
         )
         length = SeqLengthTensorPipe(device=device)
@@ -76,7 +79,7 @@ class CoNLL2003NER(Dataset):
     @classmethod
     def load(cls, path: Path) -> Iterable[List[Any]]:
         for sent in tqdm(conllx.load(path, sep=' '), desc=f'reading {path}', unit=' sents'):
-            word, pos, chunk, ner = list(zip(*sent))
+            word, pos, chunk, ner = map(list, zip(*sent))
             yield [word, pos, chunk, ner]

     def dump(self, fp: TextIO, batch: NamedTuple, prediction: List[List[int]], *args, **kwargs) -> None:
@@ -89,8 +92,8 @@ def dump(self, fp: TextIO, batch: NamedTuple, prediction: List[List[int]], *args
     @classmethod
     def new(cls, batch_size: int, word_dim: Optional[int], device: int = -1) -> Tuple[DataLoader, ...]:
         word = PackedTokSeqPipe(device=device, unk_token='<unk>').with_(
-            pre=ToLower() + ReplaceDigits(repl_token='<digits>') + ...,
-            vocab=... + (Identity() if word_dim is None else LoadGlove(name='6B', dim=word_dim)),
+            pre=ReplaceDigits(repl_token='<digits>') + ...,
+            vocab=... + (Identity() if word_dim is None else LoadGlove('6B', word_dim, str.lower)),
         )
         length = SeqLengthTensorPipe(device=device)
         char = PackedTokBlockPipe(device=device, unk_token='<unk>')
@@ -124,3 +127,83 @@ def new(cls, batch_size: int, word_dim: Optional[int], device: int = -1) -> Tupl
         (train, dev, test),
         batch_size=batch_size, shuffle=True,
     )
+
+
+class SemEval2010T1NER(Dataset):
+    lang: str
+
+    @classmethod
+    def load(cls, path: Path, **kwargs) -> Iterable[Any]:
+        for sent in tqdm(conllx.load(path, sep='\t'), desc=f'reading {path}', unit=' sentences'):
+            _, word, _, pos, _, _, head, drel, _, _, ner = map(list, zip(*sent))
+            yield [word, pos, [int(h) for h in head], drel, ner]
+
+    def dump(self, fp: TextIO, batch: NamedTuple, prediction: List[Any], *args, **kwargs) -> None:
+        ner_vocab = self.pipes['ner'].vocab.stoi
+        for raw_word, raw_pos, raw_ner, pred in \
+                zip(batch.raw_word, batch.raw_pos, batch.raw_ner, prediction):
+            assert len(raw_word) == len(raw_pos) == len(raw_ner) == len(pred)
+
+            pred_ner = [ner_vocab[p] for p in pred]
+            conllx.dump(zip(raw_word, raw_pos, raw_ner, pred_ner), fp, sep=' ')
+
+    @classmethod
+    def new(cls, batch_size: int, word_dim: Optional[int], device: int = -1) -> Tuple['DataLoader', ...]:
+        word = PackedTokSeqPipe(device=device, unk_token='<unk>').with_(
+            pre=Prepend('<root>', 1) + ReplaceDigits(repl_token='<digits>') + ...,
+            vocab=... + (Identity() if word_dim is None else LoadFastText(cls.lang, str.lower)),
+        )
+        length = SeqLengthTensorPipe(device=device).with_(pre=Prepend('<root>', 1) + ...)
+        char = PackedTokBlockPipe(device=device, unk_token='<unk>').with_(
+            pre=ToSubList() + Lift(Prepend('<root>', 1)) + Lift(UpdateCounter()),
+        )
+        word_ptr = PackedTokPtrSeqPipe(device=device, reverse=False).with_(pre=Prepend(0, 1) + ...)
+        pos = PackedTokSeqPipe(device=device, unk_token='<unk>').with_(pre=Prepend('<root>', 1) + ...)
+        head = PackedPtrSeqPipe(device=device).with_(pre=Prepend(0, 1) + ...)
+        drel = PackedTokSeqPipe(device=device, unk_token='root').with_(pre=Prepend('<root>', 1) + ...)
+        ner = PaddedTokSeqPipe(device=device, unk_token='O', pad_token='O')
+
+        pipes = [
+            dict(word=word, length=length, char=char, word_ptr=word_ptr, raw_word=RawPipe()),
+            dict(pos=pos, raw_pos=RawPipe()),
+            dict(head=head),
+            dict(drel=drel, raw_drel=RawPipe()),
+            dict(ner=ner, raw_ner=RawPipe()),
+        ]
+
+        train, dev, test = cls.paths()
+        train = cls(path=train, pipes=pipes)
+        dev = cls(path=dev, pipes=pipes)
+        test = cls(path=test, pipes=pipes)
+
+        for name, pipe in train.pipes.items():
+            logging.info(f'{name} => {pipe}')
+
+        word.build_vocab(train, dev, test, name='word')
+        char.build_vocab(train, dev, test, name='char')
+        pos.build_vocab(train, name='pos')
+        drel.build_vocab(train, name='drel')
+        ner.build_vocab(train, name='ner')
+
+        return DataLoader.new(
+            (train, dev, test),
+            batch_size=batch_size, shuffle=True,
+        )
+
+
+class SemEval2010T1NERCatalan(SemEval2010T1NER):
+    urls = [
+        ('https://www.dropbox.com/s/nqedh3zmk5k80n7/train.sd.conllx?dl=1', 'train.sd.conllx'),
+        ('https://www.dropbox.com/s/027umbuks3njwry/dev.sd.conllx?dl=1', 'dev.sd.conllx'),
+        ('https://www.dropbox.com/s/ldwn6z1xl5vki4y/test.sd.conllx?dl=1', 'test.sd.conllx'),
+    ]
+    lang = 'ca'
+
+
+class SemEval2010T1NERSpanish(SemEval2010T1NER):
+    urls = [
+        ('https://www.dropbox.com/s/lyxgvc161ai20v0/train.sd.conllx?dl=1', 'train.sd.conllx'),
+        ('https://www.dropbox.com/s/8tmbi7ki6ctasez/dev.sd.conllx?dl=1', 'dev.sd.conllx'),
+        ('https://www.dropbox.com/s/nnj94hdmlq3jjm8/test.sd.conllx?dl=1', 'test.sd.conllx'),
+    ]
+    lang = 'es'

From 9bbcb26408a9492e5aa7515546ae590f917f16c0 Mon Sep 17 00:00:00 2001
From: speedcell4
Date: Wed, 25 Mar 2020 01:53:50 +0900
Subject: [PATCH 02/66] Feat: Add BertTokenizer and so on

---
 setup.py                      |  1 +
 torchglyph/proc/contextual.py | 49 +++++++++++++++++++++++++++++++++++
 2 files changed, 50 insertions(+)
 create mode 100644 torchglyph/proc/contextual.py

diff --git a/setup.py b/setup.py
index 736ca61..63a49ce 100644
--- a/setup.py
+++ b/setup.py
@@ -17,6 +17,7 @@
         'tqdm',
         'numpy',
         'einops',
+        'transformers',
     ],
     extras_require={
         'dev': [
diff --git a/torchglyph/proc/contextual.py b/torchglyph/proc/contextual.py
new file mode 100644
index 0000000..21a6571
--- /dev/null
+++ b/torchglyph/proc/contextual.py
@@ -0,0 +1,49 @@
+from typing import Union, List
+
+import transformers
+
+from torchglyph.pipe import Proc
+
+
+class TokenizerProc(Proc):
+    def __init__(self, weights: str) -> None:
+        super(TokenizerProc, self).__init__()
+        self.weights = weights
+
+    def extra_repr(self) -> str:
+        return f'weights={self.weights}'
+
+    def __call__(self, data: Union[str, List[str]], **kwargs) -> List[int]:
+        if not isinstance(data, str):
+            data = ' '.join(data)
+        return self.tokenizer.encode(data)
+
+
+class BertTokenizer(TokenizerProc):
+    def __init__(self, weights: str = 'bert-base-uncased'):
+        super(BertTokenizer, self).__init__(weights=weights)
+        self.tokenizer = transformers.BertTokenizer.from_pretrained(weights)
+
+
+class CTRLTokenizer(TokenizerProc):
+    def __init__(self, weights: str = 'ctrl'):
+        super(CTRLTokenizer, self).__init__(weights=weights)
+        self.tokenizer = transformers.CTRLTokenizer.from_pretrained(weights)
+
+
+class XLNetTokenizer(TokenizerProc):
+    def __init__(self, weights: str = 'xlnet-base-cased'):
+        super(XLNetTokenizer, self).__init__(weights=weights)
+        self.tokenizer = transformers.XLNetTokenizer.from_pretrained(weights)
+
+
+class XLMTokenizer(TokenizerProc): + def __init__(self, weights: str = 'xlm-mlm-enfr-1024'): + super(XLMTokenizer, self).__init__(weights=weights) + self.tokenizer = transformers.XLMTokenizer.from_pretrained(weights) + + +class BartTokenizer(TokenizerProc): + def __init__(self, weights: str = 'bart-large'): + super(BartTokenizer, self).__init__(weights=weights) + self.tokenizer = transformers.BartTokenizer.from_pretrained(weights) From 8bd94f8b7d0351c03041a048b5a863541d8f667f Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Wed, 25 Mar 2020 14:47:17 +0900 Subject: [PATCH 03/66] Feat: Support ELMo --- setup.py | 2 +- torchglyph/__init__.py | 2 +- torchglyph/io.py | 7 ++- torchglyph/nn/__init__.py | 5 +- torchglyph/nn/contextual.py | 57 +++++++++++++++++++ torchglyph/proc/__init__.py | 1 + torchglyph/proc/collecting.py | 9 +++ .../proc/{contextual.py => tokenizer.py} | 13 +++++ 8 files changed, 91 insertions(+), 5 deletions(-) create mode 100644 torchglyph/nn/contextual.py rename torchglyph/proc/{contextual.py => tokenizer.py} (74%) diff --git a/setup.py b/setup.py index 63a49ce..87d14a0 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ 'tqdm', 'numpy', 'einops', - 'transformers', + 'transformers', 'allennlp' ], extras_require={ 'dev': [ diff --git a/torchglyph/__init__.py b/torchglyph/__init__.py index f1f93f6..3c126bf 100644 --- a/torchglyph/__init__.py +++ b/torchglyph/__init__.py @@ -4,7 +4,7 @@ import torch from torch.nn.utils.rnn import PackedSequence -data_path = Path.home() / '.torchglyph' +data_path = (Path.home() / '.torchglyph').expanduser().absolute() if not data_path.exists(): data_path.mkdir(parents=True, exist_ok=True) diff --git a/torchglyph/io.py b/torchglyph/io.py index fc018d1..271ab87 100644 --- a/torchglyph/io.py +++ b/torchglyph/io.py @@ -50,7 +50,10 @@ def inner(b=1, bsize=1, tsize=None) -> None: # copied and modified from https://github.com/pytorch/text -def download_and_unzip(url: str, dest: Path) -> None: +def download_and_unzip(url: str, dest: Path) -> Path: + if dest.exists(): + return dest + if not dest.parent.exists(): dest.parent.mkdir(parents=True, exist_ok=True) @@ -73,3 +76,5 @@ def download_and_unzip(url: str, dest: Path) -> None: with gzip.open(dest, mode='rb') as fsrc: with dest.with_suffix('').open(mode='wb') as fdst: shutil.copyfileobj(fsrc, fdst) + + return dest diff --git a/torchglyph/nn/__init__.py b/torchglyph/nn/__init__.py index a879623..fe4a92c 100644 --- a/torchglyph/nn/__init__.py +++ b/torchglyph/nn/__init__.py @@ -1,2 +1,3 @@ -from torchglyph.nn.embedding import * -from torchglyph.nn.rnn import * +from torchglyph.nn.embedding import TokEmbedding, SubLstmEmbedding +from torchglyph.nn.rnn import ContextualLSTM +from torchglyph.nn.contextual import ELMo diff --git a/torchglyph/nn/contextual.py b/torchglyph/nn/contextual.py new file mode 100644 index 0000000..9675c97 --- /dev/null +++ b/torchglyph/nn/contextual.py @@ -0,0 +1,57 @@ +import logging +from typing import Union + +from allennlp.data.dataset import Batch as AllenBatch +from allennlp.modules import Elmo as AllenELMo +from torch import Tensor +from torch.nn.utils.rnn import PackedSequence, pack_padded_sequence + +from torchglyph import data_path +from torchglyph.io import download_and_unzip + + +class ELMo(AllenELMo): + root = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/' + name = { + 'small': '2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_', + 'medium': '2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_', + 'original': 
'2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_', + '5.5B': '2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_', + } + + def __init__(self, options_file, weight_file, *args, pack_output, **kwargs): + super(ELMo, self).__init__( + *args, options_file=options_file, weight_file=weight_file, **kwargs) + self.pack_output = pack_output + logging.info(f'loading pretrained {self.__class__.__name__} from {weight_file}') + + @classmethod + def from_pretrained(cls, weight: str, pack_output: bool = True, + num_output_representations: int = 2, + dropout: float = 0., freeze: bool = True) -> 'ELMo': + elmo_path = data_path / cls.__name__.lower() + options_file = download_and_unzip( + url=cls.root + (cls.name[weight] + 'options.json'), + dest=elmo_path / (cls.name[weight] + 'options.json'), + ) + weight_file = download_and_unzip( + url=cls.root + (cls.name[weight] + 'weights.hdf5'), + dest=elmo_path / (cls.name[weight] + 'weights.hdf5'), + ) + return cls( + options_file=str(options_file), weight_file=str(weight_file), + num_output_representations=num_output_representations, + requires_grad=not freeze, dropout=dropout, pack_output=pack_output, + ) + + def forward(self, batch: AllenBatch) -> Union[Tensor, PackedSequence]: + outputs = super(ELMo, self).forward(batch) + elmo_representations, *_ = outputs['elmo_representations'] + if not self.pack_output: + return elmo_representations + else: + lengths = outputs['mask'].long().sum(dim=-1) + return pack_padded_sequence( + elmo_representations, lengths, + batch_first=True, enforce_sorted=False, + ) diff --git a/torchglyph/proc/__init__.py b/torchglyph/proc/__init__.py index 9416a5a..625b9dc 100644 --- a/torchglyph/proc/__init__.py +++ b/torchglyph/proc/__init__.py @@ -4,3 +4,4 @@ from torchglyph.proc.recur import * from torchglyph.proc.shape import * from torchglyph.proc.vocab import * +from torchglyph.proc.tokenizer import * diff --git a/torchglyph/proc/collecting.py b/torchglyph/proc/collecting.py index 1ce3fc5..f93a38b 100644 --- a/torchglyph/proc/collecting.py +++ b/torchglyph/proc/collecting.py @@ -1,6 +1,8 @@ from typing import Any, Union, List, Tuple import torch +from allennlp.data import Instance as AllenInstance, Vocabulary as AllenVocabulary +from allennlp.data.dataset import Batch as AllenBatch from torch import Tensor from torch.nn.utils.rnn import pad_sequence, PackedSequence, pack_sequence, pad_packed_sequence @@ -94,6 +96,13 @@ def __call__(self, data: List[Tensor], vocab: Vocab, **kwargs) -> Tensor: ) +class PadELMo(Proc): + def __call__(self, data: List[AllenInstance], *args, **kwargs) -> Tensor: + dataset = AllenBatch(data) + dataset.index_instances(AllenVocabulary()) + return dataset.as_tensor_dict()['elmo']['character_ids'] + + class PackSeq(Proc): def __init__(self, enforce_sorted: bool) -> None: super(PackSeq, self).__init__() diff --git a/torchglyph/proc/contextual.py b/torchglyph/proc/tokenizer.py similarity index 74% rename from torchglyph/proc/contextual.py rename to torchglyph/proc/tokenizer.py index 21a6571..310e354 100644 --- a/torchglyph/proc/contextual.py +++ b/torchglyph/proc/tokenizer.py @@ -1,6 +1,9 @@ from typing import Union, List import transformers +from allennlp.data import Token as AllenToken, Instance as AllenInstance +from allennlp.data.fields import TextField as AllenTextField +from allennlp.data.token_indexers import ELMoTokenCharactersIndexer from torchglyph.pipe import Proc @@ -19,6 +22,16 @@ def __call__(self, data: Union[str, List[str]], **kwargs) -> List[int]: return 
self.tokenizer.encode(data) +class ELMoTokenizer(Proc): + def __init__(self): + super(ELMoTokenizer, self).__init__() + self.tokenizer = ELMoTokenCharactersIndexer() + + def __call__(self, data: List[str], *args, **kwargs): + data = [AllenToken(token) for token in data] + return AllenInstance({"elmo": AllenTextField(data, {'character_ids': self.tokenizer})}) + + class BertTokenizer(TokenizerProc): def __init__(self, weights: str = 'bert-base-uncased'): super(BertTokenizer, self).__init__(weights=weights) From c6a8dd88de42c9e8f8e1b97ac4a1e263a4ffaa72 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Wed, 25 Mar 2020 15:04:28 +0900 Subject: [PATCH 04/66] Feat: Update extra_repr and __repr__ for ELMo --- setup.py | 3 ++- torchglyph/nn/contextual.py | 24 +++++++++++++++++++++--- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 87d14a0..7faba6e 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,8 @@ 'tqdm', 'numpy', 'einops', - 'transformers', 'allennlp' + 'transformers', + 'allennlp', ], extras_require={ 'dev': [ diff --git a/torchglyph/nn/contextual.py b/torchglyph/nn/contextual.py index 9675c97..7b85eba 100644 --- a/torchglyph/nn/contextual.py +++ b/torchglyph/nn/contextual.py @@ -19,11 +19,15 @@ class ELMo(AllenELMo): '5.5B': '2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_', } - def __init__(self, options_file, weight_file, *args, pack_output, **kwargs): + def __init__(self, *, options_file, weight_file, pack_output, **kwargs): + logging.info(f'loading pretrained {self.__class__.__name__} from {weight_file}') + super(ELMo, self).__init__( - *args, options_file=options_file, weight_file=weight_file, **kwargs) + options_file=options_file, weight_file=weight_file, **kwargs, + ) + self.pack_output = pack_output - logging.info(f'loading pretrained {self.__class__.__name__} from {weight_file}') + self.embedding_dim = self.get_output_dim() @classmethod def from_pretrained(cls, weight: str, pack_output: bool = True, @@ -44,6 +48,20 @@ def from_pretrained(cls, weight: str, pack_output: bool = True, requires_grad=not freeze, dropout=dropout, pack_output=pack_output, ) + def extra_repr(self) -> str: + args = [ + f'{self._elmo_lstm._elmo_lstm.input_size}', + f'{self._elmo_lstm._elmo_lstm.hidden_size}', + f'num_layers={self._elmo_lstm.num_layers}', + f'dropout={self._dropout.p}', + ] + if not self._elmo_lstm._requires_grad: + args.append('frozen') + return ', '.join(args) + + def __repr__(self) -> str: + return f'{self.__class__.__name__}({self.extra_repr()})' + def forward(self, batch: AllenBatch) -> Union[Tensor, PackedSequence]: outputs = super(ELMo, self).forward(batch) elmo_representations, *_ = outputs['elmo_representations'] From c284e6f782b96832b13eb1b10beba0e0e56b1a99 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Wed, 25 Mar 2020 15:09:52 +0900 Subject: [PATCH 05/66] Fix: Import path bug --- torchglyph/proc/collecting.py | 6 +++--- torchglyph/proc/tokenizer.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/torchglyph/proc/collecting.py b/torchglyph/proc/collecting.py index f93a38b..a564d45 100644 --- a/torchglyph/proc/collecting.py +++ b/torchglyph/proc/collecting.py @@ -98,9 +98,9 @@ def __call__(self, data: List[Tensor], vocab: Vocab, **kwargs) -> Tensor: class PadELMo(Proc): def __call__(self, data: List[AllenInstance], *args, **kwargs) -> Tensor: - dataset = AllenBatch(data) - dataset.index_instances(AllenVocabulary()) - return dataset.as_tensor_dict()['elmo']['character_ids'] + batch = AllenBatch(data) 
+ batch.index_instances(AllenVocabulary()) + return batch.as_tensor_dict()['elmo']['character_ids'] class PackSeq(Proc): diff --git a/torchglyph/proc/tokenizer.py b/torchglyph/proc/tokenizer.py index 310e354..f1904bc 100644 --- a/torchglyph/proc/tokenizer.py +++ b/torchglyph/proc/tokenizer.py @@ -5,7 +5,7 @@ from allennlp.data.fields import TextField as AllenTextField from allennlp.data.token_indexers import ELMoTokenCharactersIndexer -from torchglyph.pipe import Proc +from torchglyph.proc import Proc class TokenizerProc(Proc): From 10c385c2bfeed54b88608ac847143eaf9024065b Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Wed, 25 Mar 2020 15:14:45 +0900 Subject: [PATCH 06/66] Refactor: Unify naming style --- torchglyph/nn/__init__.py | 2 +- torchglyph/nn/contextual.py | 13 ++++++------- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/torchglyph/nn/__init__.py b/torchglyph/nn/__init__.py index fe4a92c..f50bd31 100644 --- a/torchglyph/nn/__init__.py +++ b/torchglyph/nn/__init__.py @@ -1,3 +1,3 @@ from torchglyph.nn.embedding import TokEmbedding, SubLstmEmbedding from torchglyph.nn.rnn import ContextualLSTM -from torchglyph.nn.contextual import ELMo +from torchglyph.nn.contextual import ELMoModel diff --git a/torchglyph/nn/contextual.py b/torchglyph/nn/contextual.py index 7b85eba..75b3a76 100644 --- a/torchglyph/nn/contextual.py +++ b/torchglyph/nn/contextual.py @@ -1,7 +1,6 @@ import logging from typing import Union -from allennlp.data.dataset import Batch as AllenBatch from allennlp.modules import Elmo as AllenELMo from torch import Tensor from torch.nn.utils.rnn import PackedSequence, pack_padded_sequence @@ -10,7 +9,7 @@ from torchglyph.io import download_and_unzip -class ELMo(AllenELMo): +class ELMoModel(AllenELMo): root = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/' name = { 'small': '2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_', @@ -19,10 +18,10 @@ class ELMo(AllenELMo): '5.5B': '2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_', } - def __init__(self, *, options_file, weight_file, pack_output, **kwargs): + def __init__(self, *, options_file: str, weight_file: str, pack_output, **kwargs) -> None: logging.info(f'loading pretrained {self.__class__.__name__} from {weight_file}') - super(ELMo, self).__init__( + super(ELMoModel, self).__init__( options_file=options_file, weight_file=weight_file, **kwargs, ) @@ -32,7 +31,7 @@ def __init__(self, *, options_file, weight_file, pack_output, **kwargs): @classmethod def from_pretrained(cls, weight: str, pack_output: bool = True, num_output_representations: int = 2, - dropout: float = 0., freeze: bool = True) -> 'ELMo': + dropout: float = 0., freeze: bool = True) -> 'ELMoModel': elmo_path = data_path / cls.__name__.lower() options_file = download_and_unzip( url=cls.root + (cls.name[weight] + 'options.json'), @@ -62,8 +61,8 @@ def extra_repr(self) -> str: def __repr__(self) -> str: return f'{self.__class__.__name__}({self.extra_repr()})' - def forward(self, batch: AllenBatch) -> Union[Tensor, PackedSequence]: - outputs = super(ELMo, self).forward(batch) + def forward(self, batch: Tensor, word_inputs: Tensor = None) -> Union[Tensor, PackedSequence]: + outputs = super(ELMoModel, self).forward(batch, word_inputs=word_inputs) elmo_representations, *_ = outputs['elmo_representations'] if not self.pack_output: return elmo_representations From 0a7a492c239f2f1bae96e4f684f686771427e103 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Wed, 25 Mar 2020 15:27:14 +0900 Subject: [PATCH 07/66] 
Feat: Add tokenizers of transformers

---
 torchglyph/proc/tokenizer.py | 100 ++++++++++++++++++++++++-----------
 1 file changed, 68 insertions(+), 32 deletions(-)

diff --git a/torchglyph/proc/tokenizer.py b/torchglyph/proc/tokenizer.py
index f1904bc..f14ebd3 100644
--- a/torchglyph/proc/tokenizer.py
+++ b/torchglyph/proc/tokenizer.py
@@ -8,13 +8,23 @@ from torchglyph.proc import Proc
-class TokenizerProc(Proc):
-    def __init__(self, weights: str) -> None:
-        super(TokenizerProc, self).__init__()
-        self.weights = weights
+class ELMoTokenizer(Proc):
+    def __init__(self):
+        super(ELMoTokenizer, self).__init__()
+        self.tokenizer = ELMoTokenCharactersIndexer()
+
+    def __call__(self, data: List[str], *args, **kwargs):
+        data = [AllenToken(token) for token in data]
+        return AllenInstance({"elmo": AllenTextField(data, {'character_ids': self.tokenizer})})
+
+
+class TransformerTokenizerProc(Proc):
+    def __init__(self, weight: str) -> None:
+        super(TransformerTokenizerProc, self).__init__()
+        self.weight = weight

     def extra_repr(self) -> str:
         return f'weight={self.weight}'

     def __call__(self, data: Union[str, List[str]], **kwargs) -> List[int]:
         if not isinstance(data, str):
             data = ' '.join(data)
         return self.tokenizer.encode(data)


-class BertTokenizer(TokenizerProc):
-    def __init__(self, weights: str = 'bert-base-uncased'):
-        super(BertTokenizer, self).__init__(weights=weights)
-        self.tokenizer = transformers.BertTokenizer.from_pretrained(weights)
+class BertTokenizer(TransformerTokenizerProc):
+    def __init__(self, weight: str = 'bert-base-uncased'):
+        super(BertTokenizer, self).__init__(weight=weight)
+        self.tokenizer = transformers.BertTokenizer.from_pretrained(weight)
+
+
+class OpenAIGPTTokenizer(TransformerTokenizerProc):
+    def __init__(self, weight: str = 'openai-gpt'):
+        super(OpenAIGPTTokenizer, self).__init__(weight=weight)
+        self.tokenizer = transformers.OpenAIGPTTokenizer.from_pretrained(weight)
+
+
+class GPT2Tokenizer(TransformerTokenizerProc):
+    def __init__(self, weight: str = 'gpt2'):
+        super(GPT2Tokenizer, self).__init__(weight=weight)
+        self.tokenizer = transformers.GPT2Tokenizer.from_pretrained(weight)
+
+
+class CTRLTokenizer(TransformerTokenizerProc):
+    def __init__(self, weight: str = 'ctrl'):
+        super(CTRLTokenizer, self).__init__(weight=weight)
+        self.tokenizer = transformers.CTRLTokenizer.from_pretrained(weight)
+
+
+class TransfoXLTokenizer(TransformerTokenizerProc):
+    def __init__(self, weight: str = 'transfo-xl-wt103'):
+        super(TransfoXLTokenizer, self).__init__(weight=weight)
+        self.tokenizer = transformers.TransfoXLTokenizer.from_pretrained(weight)
+
+
+class XLNetTokenizer(TransformerTokenizerProc):
+    def __init__(self, weight: str = 'xlnet-base-cased'):
+        super(XLNetTokenizer, self).__init__(weight=weight)
+        self.tokenizer = transformers.XLNetTokenizer.from_pretrained(weight)


-class CTRLTokenizer(TokenizerProc):
-    def
__init__(self, weights: str = 'ctrl'): - super(CTRLTokenizer, self).__init__(weights=weights) - self.tokenizer = transformers.CTRLTokenizer.from_pretrained(weights) +class DistilBertTokenizer(TransformerTokenizerProc): + def __init__(self, weight: str = 'distilbert-base-cased'): + super(DistilBertTokenizer, self).__init__(weight=weight) + self.tokenizer = transformers.DistilBertTokenizer.from_pretrained(weight) -class XLNetTokenizer(TokenizerProc): - def __init__(self, weights: str = 'xlnet-base-cased'): - super(XLNetTokenizer, self).__init__(weights=weights) - self.tokenizer = transformers.XLNetTokenizer.from_pretrained(weights) +class RobertaTokenizer(TransformerTokenizerProc): + def __init__(self, weight: str = 'roberta-base'): + super(RobertaTokenizer, self).__init__(weight=weight) + self.tokenizer = transformers.RobertaTokenizer.from_pretrained(weight) -class XLMTokenizer(TokenizerProc): - def __init__(self, weights: str = 'xlm-mlm-enfr-1024'): - super(XLMTokenizer, self).__init__(weights=weights) - self.tokenizer = transformers.XLMTokenizer.from_pretrained(weights) +class XLMRobertaTokenizer(TransformerTokenizerProc): + def __init__(self, weight: str = 'xlm-roberta-base'): + super(XLMRobertaTokenizer, self).__init__(weight=weight) + self.tokenizer = transformers.XLMRobertaTokenizer.from_pretrained(weight) -class BartTokenizer(TokenizerProc): - def __init__(self, weights: str = 'bart-large'): - super(BartTokenizer, self).__init__(weights=weights) - self.tokenizer = transformers.BartTokenizer.from_pretrained(weights) +class BartTokenizer(TransformerTokenizerProc): + def __init__(self, weight: str = 'bart-large'): + super(BartTokenizer, self).__init__(weight=weight) + self.tokenizer = transformers.BartTokenizer.from_pretrained(weight) From db1e7bde1bbd68efa5e55d0fcd7e7f900c776655 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Thu, 26 Mar 2020 17:36:12 +0900 Subject: [PATCH 08/66] Feat: Add ELMoPipe --- torchglyph/nn/__init__.py | 1 - torchglyph/pipe/ctx.py | 18 ++++++++++++++++++ torchglyph/proc/__init__.py | 1 - 3 files changed, 18 insertions(+), 2 deletions(-) create mode 100644 torchglyph/pipe/ctx.py diff --git a/torchglyph/nn/__init__.py b/torchglyph/nn/__init__.py index f50bd31..249b291 100644 --- a/torchglyph/nn/__init__.py +++ b/torchglyph/nn/__init__.py @@ -1,3 +1,2 @@ from torchglyph.nn.embedding import TokEmbedding, SubLstmEmbedding from torchglyph.nn.rnn import ContextualLSTM -from torchglyph.nn.contextual import ELMoModel diff --git a/torchglyph/pipe/ctx.py b/torchglyph/pipe/ctx.py new file mode 100644 index 0000000..1adc83f --- /dev/null +++ b/torchglyph/pipe/ctx.py @@ -0,0 +1,18 @@ +from typing import Union + +import torch + +from torchglyph.pipe import Pipe +from torchglyph.proc import PadELMo +from torchglyph.proc import ToDevice +from torchglyph.proc.tokenizer import ELMoTokenizer + + +class ELMoPipe(Pipe): + def __init__(self, device: Union[int, torch.device]): + super(ELMoPipe, self).__init__( + pre=ELMoTokenizer(), + vocab=None, + post=None, + batch=PadELMo() + ToDevice(device=device), + ) diff --git a/torchglyph/proc/__init__.py b/torchglyph/proc/__init__.py index 625b9dc..9416a5a 100644 --- a/torchglyph/proc/__init__.py +++ b/torchglyph/proc/__init__.py @@ -4,4 +4,3 @@ from torchglyph.proc.recur import * from torchglyph.proc.shape import * from torchglyph.proc.vocab import * -from torchglyph.proc.tokenizer import * From 63989bba7f6b5db6864dbea7a052a28043b5da76 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Fri, 27 Mar 2020 01:04:22 +0900 Subject: [PATCH 09/66] 
Feat: Update ToDevice --- torchglyph/proc/collecting.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/torchglyph/proc/collecting.py b/torchglyph/proc/collecting.py index a564d45..9b561ab 100644 --- a/torchglyph/proc/collecting.py +++ b/torchglyph/proc/collecting.py @@ -11,7 +11,8 @@ class ToDevice(Proc): - Batch = Union[Tensor, PackedSequence, Tuple[Union[Tensor, PackedSequence], ...]] + Item = Union[int, float, bool, Tensor, PackedSequence] + Batch = Union[Item, Tuple[Item, ...]] def __init__(self, device: Union[int, torch.device]) -> None: super(ToDevice, self).__init__() @@ -26,6 +27,8 @@ def extra_repr(self) -> str: return f'{self.device}' def __call__(self, batch: Batch, vocab: Vocab, **kwargs) -> Batch: + if isinstance(batch, (int, float, str, bool)): + return batch if isinstance(batch, (PackedSequence, Tensor)): return batch.to(self.device) return type(batch)([self(e, vocab=vocab) for e in batch]) From e04864641dfbe87a07e77905517ec7fb6a5ce137 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Tue, 31 Mar 2020 17:19:50 +0900 Subject: [PATCH 10/66] Feat: Add connection.py --- torchglyph/nn/connection.py | 81 +++++++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100644 torchglyph/nn/connection.py diff --git a/torchglyph/nn/connection.py b/torchglyph/nn/connection.py new file mode 100644 index 0000000..81b2df2 --- /dev/null +++ b/torchglyph/nn/connection.py @@ -0,0 +1,81 @@ +from typing import Union + +import torch +from torch import Tensor +from torch import nn +from torch.nn.utils.rnn import PackedSequence + + +class ResNorm(nn.Module): + """ + https://www.cv-foundation.org/openaccess/content_cvpr_2016/papers/He_Deep_Residual_Learning_CVPR_2016_paper.pdf + """ + + def __init__(self, input_dim: int, *, sub_layer: nn.Module) -> None: + super(ResNorm, self).__init__() + self.input_dim = input_dim + self.output_dim = input_dim + + self.sub_layer = sub_layer + self.layer_norm = nn.LayerNorm(input_dim) + + def forward(self, x: Union[Tensor, PackedSequence], *args, **kwargs) -> Union[Tensor, PackedSequence]: + z = self.sub_layer(x, *args, **kwargs) + if torch.is_tensor(z): + return self.layer_norm(x + z) + elif isinstance(z, PackedSequence): + return z._replace(data=self.layer_norm(x.data + z.data)) + else: + raise NotImplementedError + + +class DenseNorm(nn.Module): + """ + http://openaccess.thecvf.com/content_cvpr_2017/papers/Huang_Densely_Connected_Convolutional_CVPR_2017_paper.pdf + """ + + def __init__(self, input_dim: int, *, sub_layer: nn.Module) -> None: + super(DenseNorm, self).__init__() + self.input_dim = input_dim + self.output_dim = input_dim * 2 + + self.sub_layer = sub_layer + self.layer_norm = nn.LayerNorm(input_dim * 2) + + def forward(self, x: Union[Tensor, PackedSequence], *args, **kwargs) -> Union[Tensor, PackedSequence]: + z = self.sub_layer(x, *args, **kwargs) + if torch.is_tensor(z): + return self.layer_norm(torch.cat([x, z], dim=-1)) + elif isinstance(z, PackedSequence): + return z._replace(data=self.layer_norm(torch.cat([x.data, z.data], dim=-1))) + else: + raise NotImplementedError + + +class ReZero(nn.Module): + """ + https://arxiv.org/pdf/2003.04887.pdf + """ + + def __init__(self, input_dim: int, *, sub_layer: nn.Module) -> None: + super(ReZero, self).__init__() + self.input_dim = input_dim + self.output_dim = input_dim + + self.sub_layer = sub_layer + self.scale = nn.Parameter( + torch.tensor([0.], dtype=torch.float32), + requires_grad=True, + ) + + def extra_repr(self) -> str: + return f'(scale): 
Parameter({self.scale.data})' + + def forward(self, x: Union[Tensor, PackedSequence], *args, **kwargs) -> Union[Tensor, PackedSequence]: + z = self.sub_layer(x, *args, **kwargs) + if torch.is_tensor(z): + return x + z * self.scale + elif isinstance(z, PackedSequence): + return z._replace(data=x.data + z.data * self.scale) + else: + raise NotImplementedError From de1f1d7ad3afda44923a9a8f9576a3028b614595 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Tue, 31 Mar 2020 17:23:57 +0900 Subject: [PATCH 11/66] Test: Add unit test for connections --- tests/test_nn/test_connection.py | 44 ++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 tests/test_nn/test_connection.py diff --git a/tests/test_nn/test_connection.py b/tests/test_nn/test_connection.py new file mode 100644 index 0000000..05550a3 --- /dev/null +++ b/tests/test_nn/test_connection.py @@ -0,0 +1,44 @@ +import torch +from hypothesis import given, strategies as st +from torch import nn + +from torchglyph.nn.connection import ResNorm, DenseNorm, ReZero + + +@given( + batch_sizes=st.lists(st.integers(1, 10), min_size=0, max_size=4), + input_dim=st.integers(1, 20), +) +def test_resnorm_shape_grad(batch_sizes, input_dim): + layer = ResNorm(input_dim=input_dim, sub_layer=nn.Linear(input_dim, input_dim)) + x = torch.rand((*batch_sizes, input_dim), requires_grad=True) + y = layer(x) + + assert y.size() == (*batch_sizes, layer.output_dim) + assert y.requires_grad + + +@given( + batch_sizes=st.lists(st.integers(1, 10), min_size=0, max_size=4), + input_dim=st.integers(1, 20), +) +def test_densenorm_shape_grad(batch_sizes, input_dim): + layer = DenseNorm(input_dim=input_dim, sub_layer=nn.Linear(input_dim, input_dim)) + x = torch.rand((*batch_sizes, input_dim), requires_grad=True) + y = layer(x) + + assert y.size() == (*batch_sizes, layer.output_dim) + assert y.requires_grad + + +@given( + batch_sizes=st.lists(st.integers(1, 10), min_size=0, max_size=4), + input_dim=st.integers(1, 20), +) +def test_rezero_shape_grad(batch_sizes, input_dim): + layer = ReZero(input_dim=input_dim, sub_layer=nn.Linear(input_dim, input_dim)) + x = torch.rand((*batch_sizes, input_dim), requires_grad=True) + y = layer(x) + + assert y.size() == (*batch_sizes, layer.output_dim) + assert y.requires_grad From 1d5a61bd8d402811129cf383f90b558c9fdb06dc Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Wed, 1 Apr 2020 14:25:30 +0900 Subject: [PATCH 12/66] Feat: Add from_numpy to ToTensor --- torchglyph/proc/collecting.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/torchglyph/proc/collecting.py b/torchglyph/proc/collecting.py index 9b561ab..e95f5c0 100644 --- a/torchglyph/proc/collecting.py +++ b/torchglyph/proc/collecting.py @@ -1,5 +1,6 @@ from typing import Any, Union, List, Tuple +import numpy as np import torch from allennlp.data import Instance as AllenInstance, Vocabulary as AllenVocabulary from allennlp.data.dataset import Batch as AllenBatch @@ -44,7 +45,9 @@ def extra_repr(self) -> str: def __call__(self, data: Any, **kwargs) -> Tensor: try: - return torch.tensor(data, dtype=self.dtype, requires_grad=False) + if isinstance(data, np.ndarray): + return torch.from_numpy(data).to(dtype=self.dtype).requires_grad_(False) + return torch.tensor(data, dtype=self.dtype).requires_grad_(False) except ValueError as err: if err.args[0] == "too many dimensions 'str'": raise ValueError(f"'{data}' can not be converted to {Tensor.__name__}") From c30f2a199739d994f74cd1ad6cedaa9c9430e46e Mon Sep 17 00:00:00 2001 From: speedcell4 Date: 
Wed, 1 Apr 2020 15:09:25 +0900 Subject: [PATCH 13/66] Feat: Add ELMoForManyLanguage --- setup.py | 1 + torchglyph/nn/contextual.py | 97 +++++++++++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+) diff --git a/setup.py b/setup.py index 7faba6e..16ae9e6 100644 --- a/setup.py +++ b/setup.py @@ -19,6 +19,7 @@ 'einops', 'transformers', 'allennlp', + 'elmoformanylangs', ], extras_require={ 'dev': [ diff --git a/torchglyph/nn/contextual.py b/torchglyph/nn/contextual.py index 75b3a76..8c58a5d 100644 --- a/torchglyph/nn/contextual.py +++ b/torchglyph/nn/contextual.py @@ -1,13 +1,22 @@ +import json import logging +from pathlib import Path +from typing import List from typing import Union from allennlp.modules import Elmo as AllenELMo +from elmoformanylangs.elmo import read_list, create_batches, recover +from elmoformanylangs.frontend import Model +from elmoformanylangs.modules.embedding_layer import EmbeddingLayer from torch import Tensor from torch.nn.utils.rnn import PackedSequence, pack_padded_sequence +from torch.nn.utils.rnn import pack_sequence from torchglyph import data_path from torchglyph.io import download_and_unzip +logging.getLogger('elmoformanylangs').disabled = True + class ELMoModel(AllenELMo): root = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/' @@ -72,3 +81,91 @@ def forward(self, batch: Tensor, word_inputs: Tensor = None) -> Union[Tensor, Pa elmo_representations, lengths, batch_first=True, enforce_sorted=False, ) + + +class ELMoForManyLanguage(Model): + def __init__(self, *, options_file: Path, weight_file: Path, pack_output) -> None: + with options_file.open('r', encoding='utf-8') as fp: + config = json.load(fp) + + if config['token_embedder']['char_dim'] > 0: + char_lexicon = {} + with (weight_file / 'char.dic').open('r', encoding='utf-8') as fp: + for raw in fp: + tokens = raw.strip().split('\t') + if len(tokens) == 1: + tokens.insert(0, '\u3000') + token, index = tokens + char_lexicon[token] = int(index) + char_emb_layer = EmbeddingLayer( + config['token_embedder']['char_dim'], char_lexicon, + fix_emb=False, embs=None, + ) + else: + char_lexicon = None + char_emb_layer = None + + if config['token_embedder']['word_dim'] > 0: + word_lexicon = {} + with (weight_file / 'word.dic').open('r', encoding='utf-8') as fp: + for raw in fp: + tokens = raw.strip().split('\t') + if len(tokens) == 1: + tokens.insert(0, '\u3000') + token, index = tokens + word_lexicon[token] = int(index) + word_emb_layer = EmbeddingLayer( + config['token_embedder']['word_dim'], word_lexicon, + fix_emb=False, embs=None, + ) + else: + word_lexicon = None + word_emb_layer = None + + super(ELMoForManyLanguage, self).__init__( + config=config, word_emb_layer=word_emb_layer, + char_emb_layer=char_emb_layer, use_cuda=False, + ) + self.load_model(path=weight_file) + self.char_lexicon = char_lexicon + self.word_lexicon = word_lexicon + self.pack_output = pack_output + self.encoding_dim = self.output_dim * 2 + + @classmethod + def from_pretraiend(cls, path: Path, pack_output: bool = True): + with (path / 'config.json').open('r', encoding='utf-8') as fp: + args = json.load(fp) + return cls( + options_file=path / args['config_path'], + weight_file=path, pack_output=pack_output, + ) + + def forward(self, batch: List[List[str]], output_layer: int = -1) -> Union[Tensor, PackedSequence]: + if self.config['token_embedder']['name'].lower() == 'cnn': + pad, text = read_list(batch, self.config['token_embedder']['max_characters_per_token']) + else: + pad, text = read_list(batch) + + pad_w, pad_c, pad_ln, 
pad_mask, pad_text, recover_idx = create_batches( + pad, len(text), self.word_lexicon, self.char_lexicon, self.config, text=text) + + ans = [] + for word, char, length, mask, pads in zip(pad_w, pad_c, pad_ln, pad_mask, pad_text): + output = super(ELMoForManyLanguage, self).forward(word, char, mask) + for index, text in enumerate(pads): + if self.config['encoder']['name'].lower() == 'lstm': + data = output[index, 1:length[index] - 1, :] + elif self.config['encoder']['name'].lower() == 'elmo': + data = output[:, index, 1:length[index] - 1, :] + + if output_layer == -1: + payload = data.mean(dim=0) + else: + payload = data[output_layer] + ans.append(payload) + + ans = recover(ans, recover_idx) + if self.pack_output: + ans = pack_sequence(ans, enforce_sorted=False) + return ans From dbc38bc1709a777728f3f1617600e6c8a4c68d63 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Wed, 1 Apr 2020 15:26:10 +0900 Subject: [PATCH 14/66] Feat: Handle download for ELMoForManyLanguage --- torchglyph/nn/contextual.py | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/torchglyph/nn/contextual.py b/torchglyph/nn/contextual.py index 8c58a5d..6465cc2 100644 --- a/torchglyph/nn/contextual.py +++ b/torchglyph/nn/contextual.py @@ -83,7 +83,18 @@ def forward(self, batch: Tensor, word_inputs: Tensor = None) -> Union[Tensor, Pa ) -class ELMoForManyLanguage(Model): +class ELMoForManyLanguages(Model): + root = 'http://vectors.nlpl.eu/repository/11/' + configs = [ + 'https://raw.githubusercontent.com/HIT-SCIR/ELMoForManyLangs/master/configs/cnn_0_100_512_4096_sample.json', + 'https://raw.githubusercontent.com/HIT-SCIR/ELMoForManyLangs/master/configs/cnn_50_100_512_4096_sample.json', + ] + names = { + 'ca': '138', + 'es': '145', + 'zh': '179', + } + def __init__(self, *, options_file: Path, weight_file: Path, pack_output) -> None: with options_file.open('r', encoding='utf-8') as fp: config = json.load(fp) @@ -122,7 +133,7 @@ def __init__(self, *, options_file: Path, weight_file: Path, pack_output) -> Non word_lexicon = None word_emb_layer = None - super(ELMoForManyLanguage, self).__init__( + super(ELMoForManyLanguages, self).__init__( config=config, word_emb_layer=word_emb_layer, char_emb_layer=char_emb_layer, use_cuda=False, ) @@ -133,7 +144,20 @@ def __init__(self, *, options_file: Path, weight_file: Path, pack_output) -> Non self.encoding_dim = self.output_dim * 2 @classmethod - def from_pretraiend(cls, path: Path, pack_output: bool = True): + def from_pretraiend(cls, lang: str, pack_output: bool = True) -> 'ELMoForManyLanguages': + download_and_unzip( + url=cls.configs[0], + dest=data_path / cls.__name__.lower() / 'configs' / Path(cls.configs[0]).name, + ) + download_and_unzip( + url=cls.configs[1], + dest=data_path / cls.__name__.lower() / 'configs' / Path(cls.configs[1]).name, + ) + path = download_and_unzip( + url=cls.root + f'{cls.names[lang]}.zip', + dest=data_path / cls.__name__.lower() / lang / f'{lang}.zip', + ).parent + with (path / 'config.json').open('r', encoding='utf-8') as fp: args = json.load(fp) return cls( @@ -152,7 +176,7 @@ def forward(self, batch: List[List[str]], output_layer: int = -1) -> Union[Tenso ans = [] for word, char, length, mask, pads in zip(pad_w, pad_c, pad_ln, pad_mask, pad_text): - output = super(ELMoForManyLanguage, self).forward(word, char, mask) + output = super(ELMoForManyLanguages, self).forward(word, char, mask) for index, text in enumerate(pads): if self.config['encoder']['name'].lower() == 'lstm': data = output[index, 
1:length[index] - 1, :] From 82fd821a55346fbec93d192f2312f493633b146f Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Wed, 1 Apr 2020 15:38:23 +0900 Subject: [PATCH 15/66] Feat: Update extra_repr for both ELMo and ELMoForManyLanguages --- torchglyph/nn/contextual.py | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/torchglyph/nn/contextual.py b/torchglyph/nn/contextual.py index 6465cc2..6896559 100644 --- a/torchglyph/nn/contextual.py +++ b/torchglyph/nn/contextual.py @@ -35,7 +35,7 @@ def __init__(self, *, options_file: str, weight_file: str, pack_output, **kwargs ) self.pack_output = pack_output - self.embedding_dim = self.get_output_dim() + self.encoding_dim = self.get_output_dim() @classmethod def from_pretrained(cls, weight: str, pack_output: bool = True, @@ -58,8 +58,7 @@ def from_pretrained(cls, weight: str, pack_output: bool = True, def extra_repr(self) -> str: args = [ - f'{self._elmo_lstm._elmo_lstm.input_size}', - f'{self._elmo_lstm._elmo_lstm.hidden_size}', + f'encoding_dim={self.encoding_dim}', f'num_layers={self._elmo_lstm.num_layers}', f'dropout={self._dropout.p}', ] @@ -95,7 +94,7 @@ class ELMoForManyLanguages(Model): 'zh': '179', } - def __init__(self, *, options_file: Path, weight_file: Path, pack_output) -> None: + def __init__(self, *, options_file: Path, weight_file: Path, pack_output: bool, requires_grad: bool) -> None: with options_file.open('r', encoding='utf-8') as fp: config = json.load(fp) @@ -140,11 +139,14 @@ def __init__(self, *, options_file: Path, weight_file: Path, pack_output) -> Non self.load_model(path=weight_file) self.char_lexicon = char_lexicon self.word_lexicon = word_lexicon + + self.lang = weight_file.name + self.requires_grad = requires_grad self.pack_output = pack_output self.encoding_dim = self.output_dim * 2 @classmethod - def from_pretraiend(cls, lang: str, pack_output: bool = True) -> 'ELMoForManyLanguages': + def from_pretraiend(cls, lang: str, pack_output: bool = True, freeze: bool = True) -> 'ELMoForManyLanguages': download_and_unzip( url=cls.configs[0], dest=data_path / cls.__name__.lower() / 'configs' / Path(cls.configs[0]).name, @@ -161,10 +163,23 @@ def from_pretraiend(cls, lang: str, pack_output: bool = True) -> 'ELMoForManyLan with (path / 'config.json').open('r', encoding='utf-8') as fp: args = json.load(fp) return cls( - options_file=path / args['config_path'], + options_file=path / args['config_path'], requires_grad=not freeze, weight_file=path, pack_output=pack_output, ) + def extra_repr(self) -> str: + args = [ + f'lang={self.lang}', f'encoding_dim={self.encoding_dim}', + f'word_vocab={len(self.word_lexicon) if self.word_lexicon is not None else None}', + f'char_vocab={len(self.char_lexicon) if self.char_lexicon is not None else None}', + ] + if not self.requires_grad: + args.append('frozen') + return ', '.join(args) + + def __repr__(self) -> str: + return f'{self.__class__.__name__}({self.extra_repr()})' + def forward(self, batch: List[List[str]], output_layer: int = -1) -> Union[Tensor, PackedSequence]: if self.config['token_embedder']['name'].lower() == 'cnn': pad, text = read_list(batch, self.config['token_embedder']['max_characters_per_token']) @@ -187,7 +202,7 @@ def forward(self, batch: List[List[str]], output_layer: int = -1) -> Union[Tenso payload = data.mean(dim=0) else: payload = data[output_layer] - ans.append(payload) + ans.append(payload if self.requires_grad else payload.detach()) ans = recover(ans, recover_idx) if self.pack_output: From 
4d61b5a7c6e06c118afbc63ae2f81b0ec51c6d38 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Wed, 1 Apr 2020 15:54:30 +0900 Subject: [PATCH 16/66] Feat: Toggle loggers of allennlp --- torchglyph/io.py | 9 ++++++++- torchglyph/nn/contextual.py | 5 +++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/torchglyph/io.py b/torchglyph/io.py index 271ab87..99d683e 100644 --- a/torchglyph/io.py +++ b/torchglyph/io.py @@ -1,12 +1,13 @@ import gzip import logging import os +import re import shutil import tarfile import zipfile from contextlib import contextmanager from pathlib import Path -from typing import Union, TextIO +from typing import Union, TextIO, Pattern from urllib.request import urlretrieve from tqdm import tqdm @@ -78,3 +79,9 @@ def download_and_unzip(url: str, dest: Path) -> Path: shutil.copyfileobj(fsrc, fdst) return dest + + +def toggle_loggers(pattern: Union[str, Pattern], enable: bool) -> None: + for name in logging.root.manager.loggerDict: # type:str + if re.match(pattern, name) is not None: + logging.getLogger(name).disabled = not enable diff --git a/torchglyph/nn/contextual.py b/torchglyph/nn/contextual.py index 6896559..d959001 100644 --- a/torchglyph/nn/contextual.py +++ b/torchglyph/nn/contextual.py @@ -13,9 +13,10 @@ from torch.nn.utils.rnn import pack_sequence from torchglyph import data_path -from torchglyph.io import download_and_unzip +from torchglyph.io import download_and_unzip, toggle_loggers -logging.getLogger('elmoformanylangs').disabled = True +toggle_loggers('allennlp', False) +toggle_loggers('elmoformanylangs', False) class ELMoModel(AllenELMo): From 85c92bfbbf0a7e41d3c521958009592558c89fbd Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Wed, 1 Apr 2020 15:58:57 +0900 Subject: [PATCH 17/66] Refactor: Separate fetch --- torchglyph/nn/contextual.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/torchglyph/nn/contextual.py b/torchglyph/nn/contextual.py index d959001..2c6aea3 100644 --- a/torchglyph/nn/contextual.py +++ b/torchglyph/nn/contextual.py @@ -39,9 +39,7 @@ def __init__(self, *, options_file: str, weight_file: str, pack_output, **kwargs self.encoding_dim = self.get_output_dim() @classmethod - def from_pretrained(cls, weight: str, pack_output: bool = True, - num_output_representations: int = 2, - dropout: float = 0., freeze: bool = True) -> 'ELMoModel': + def fetch(cls, weight: str): elmo_path = data_path / cls.__name__.lower() options_file = download_and_unzip( url=cls.root + (cls.name[weight] + 'options.json'), @@ -51,6 +49,13 @@ def from_pretrained(cls, weight: str, pack_output: bool = True, url=cls.root + (cls.name[weight] + 'weights.hdf5'), dest=elmo_path / (cls.name[weight] + 'weights.hdf5'), ) + return options_file, weight_file + + @classmethod + def from_pretrained(cls, weight: str, pack_output: bool = True, + num_output_representations: int = 2, + dropout: float = 0., freeze: bool = True) -> 'ELMoModel': + options_file, weight_file = cls.fetch(weight=weight) return cls( options_file=str(options_file), weight_file=str(weight_file), num_output_representations=num_output_representations, @@ -147,7 +152,7 @@ def __init__(self, *, options_file: Path, weight_file: Path, pack_output: bool, self.encoding_dim = self.output_dim * 2 @classmethod - def from_pretraiend(cls, lang: str, pack_output: bool = True, freeze: bool = True) -> 'ELMoForManyLanguages': + def fetch(cls, lang: str): download_and_unzip( url=cls.configs[0], dest=data_path / cls.__name__.lower() / 'configs' / Path(cls.configs[0]).name, @@ 
-156,11 +161,15 @@ def from_pretraiend(cls, lang: str, pack_output: bool = True, freeze: bool = Tru url=cls.configs[1], dest=data_path / cls.__name__.lower() / 'configs' / Path(cls.configs[1]).name, ) - path = download_and_unzip( + return download_and_unzip( url=cls.root + f'{cls.names[lang]}.zip', dest=data_path / cls.__name__.lower() / lang / f'{lang}.zip', ).parent + @classmethod + def from_pretraiend(cls, lang: str, pack_output: bool = True, freeze: bool = True) -> 'ELMoForManyLanguages': + path = cls.fetch(lang=lang) + with (path / 'config.json').open('r', encoding='utf-8') as fp: args = json.load(fp) return cls( From 588d082457634ddd0a42b098752ea9bce6c8f5e5 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Wed, 1 Apr 2020 16:32:37 +0900 Subject: [PATCH 18/66] Refactor: Turn off allennlp and transformers --- torchglyph/proc/collecting.py | 3 +++ torchglyph/proc/tokenizer.py | 5 ++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/torchglyph/proc/collecting.py b/torchglyph/proc/collecting.py index e95f5c0..5f723f8 100644 --- a/torchglyph/proc/collecting.py +++ b/torchglyph/proc/collecting.py @@ -7,9 +7,12 @@ from torch import Tensor from torch.nn.utils.rnn import pad_sequence, PackedSequence, pack_sequence, pad_packed_sequence +from torchglyph.io import toggle_loggers from torchglyph.proc import Proc, Chain, stoi from torchglyph.vocab import Vocab +toggle_loggers('allennlp', False) + class ToDevice(Proc): Item = Union[int, float, bool, Tensor, PackedSequence] diff --git a/torchglyph/proc/tokenizer.py b/torchglyph/proc/tokenizer.py index f14ebd3..e2fe321 100644 --- a/torchglyph/proc/tokenizer.py +++ b/torchglyph/proc/tokenizer.py @@ -5,7 +5,10 @@ from allennlp.data.fields import TextField as AllenTextField from allennlp.data.token_indexers import ELMoTokenCharactersIndexer -from torchglyph.proc import Proc +from torchglyph.proc import Proc, toggle_loggers + +toggle_loggers('allennlp', False) +toggle_loggers('transformers', False) class ELMoTokenizer(Proc): From 497d368192128646ce538c021bf3438c44c9550e Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Fri, 3 Apr 2020 21:13:50 +0900 Subject: [PATCH 19/66] Fix: typo --- torchglyph/nn/contextual.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchglyph/nn/contextual.py b/torchglyph/nn/contextual.py index 2c6aea3..fdf102c 100644 --- a/torchglyph/nn/contextual.py +++ b/torchglyph/nn/contextual.py @@ -167,7 +167,7 @@ def fetch(cls, lang: str): ).parent @classmethod - def from_pretraiend(cls, lang: str, pack_output: bool = True, freeze: bool = True) -> 'ELMoForManyLanguages': + def from_pretrained(cls, lang: str, pack_output: bool = True, freeze: bool = True) -> 'ELMoForManyLanguages': path = cls.fetch(lang=lang) with (path / 'config.json').open('r', encoding='utf-8') as fp: From cf9daf48beee0a8ad06390d5ae98b26a1bb05e6f Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Fri, 3 Apr 2020 22:08:49 +0900 Subject: [PATCH 20/66] Refactor: Change num_output_representations default value --- torchglyph/nn/contextual.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchglyph/nn/contextual.py b/torchglyph/nn/contextual.py index fdf102c..3ee8d41 100644 --- a/torchglyph/nn/contextual.py +++ b/torchglyph/nn/contextual.py @@ -53,7 +53,7 @@ def fetch(cls, weight: str): @classmethod def from_pretrained(cls, weight: str, pack_output: bool = True, - num_output_representations: int = 2, + num_output_representations: int = 1, dropout: float = 0., freeze: bool = True) -> 'ELMoModel': options_file, weight_file = 
cls.fetch(weight=weight) return cls( From 25e51b00155e2fe12d0c0a5fa672ad5900cbcca7 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Fri, 3 Apr 2020 22:34:06 +0900 Subject: [PATCH 21/66] Refactor: Update ToDevice --- torchglyph/proc/collecting.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/torchglyph/proc/collecting.py b/torchglyph/proc/collecting.py index 5f723f8..62b2d68 100644 --- a/torchglyph/proc/collecting.py +++ b/torchglyph/proc/collecting.py @@ -31,11 +31,11 @@ def extra_repr(self) -> str: return f'{self.device}' def __call__(self, batch: Batch, vocab: Vocab, **kwargs) -> Batch: - if isinstance(batch, (int, float, str, bool)): - return batch - if isinstance(batch, (PackedSequence, Tensor)): - return batch.to(self.device) - return type(batch)([self(e, vocab=vocab) for e in batch]) + if isinstance(batch, (Tensor, PackedSequence)): + return batch.to(device=self.device) + if isinstance(batch, (list, tuple)): + return type(batch)([self(e, vocab=vocab) for e in batch]) + return batch class ToTensor(Proc): From 107425c64e1e88f59ccfe30551d0b2afa95ca772 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Fri, 10 Apr 2020 17:15:56 +0900 Subject: [PATCH 22/66] Feat: Add remove_missing flag --- .../test_datasets/test_sequential_labeling.py | 8 ++-- .../test_datasets/test_text_classification.py | 2 +- torchglyph/datasets/sequential_labeling.py | 27 ++++++++++--- torchglyph/datasets/text_classification.py | 9 ++++- torchglyph/proc/vocab.py | 25 +++++++----- torchglyph/vocab.py | 38 +++---------------- 6 files changed, 53 insertions(+), 56 deletions(-) diff --git a/tests/test_datasets/test_sequential_labeling.py b/tests/test_datasets/test_sequential_labeling.py index 94a7e6d..fab9eb7 100644 --- a/tests/test_datasets/test_sequential_labeling.py +++ b/tests/test_datasets/test_sequential_labeling.py @@ -3,27 +3,27 @@ def test_conll2000_chunking(): - train, test = CoNLL2000Chunking.new(batch_size=1, word_dim=None) + train, test = CoNLL2000Chunking.new(batch_size=1, word_dim=None, remove_missing=True) assert len(train.dataset) == 8936 assert len(test.dataset) == 2012 def test_conll2003_ner(): - train, dev, test = CoNLL2003NER.new(batch_size=1, word_dim=None) + train, dev, test = CoNLL2003NER.new(batch_size=1, word_dim=None, remove_missing=True) assert len(train.dataset) == 14987 assert len(dev.dataset) == 3466 assert len(test.dataset) == 3684 def test_semeval2010_catalan(): - train, dev, test = SemEval2010T1NERCatalan.new(batch_size=1, word_dim=None) + train, dev, test = SemEval2010T1NERCatalan.new(batch_size=1, word_dim=None, remove_missing=True) assert len(train.dataset) == 8709 assert len(dev.dataset) == 1445 assert len(test.dataset) == 1698 def test_semeval2010_spanish(): - train, dev, test = SemEval2010T1NERSpanish.new(batch_size=1, word_dim=None) + train, dev, test = SemEval2010T1NERSpanish.new(batch_size=1, word_dim=None, remove_missing=True) assert len(train.dataset) == 9022 assert len(dev.dataset) == 1419 assert len(test.dataset) == 1705 diff --git a/tests/test_datasets/test_text_classification.py b/tests/test_datasets/test_text_classification.py index 587e0c1..f130e80 100644 --- a/tests/test_datasets/test_text_classification.py +++ b/tests/test_datasets/test_text_classification.py @@ -2,6 +2,6 @@ def test_agnews(): - train, test = AgNews.new(batch_size=1, word_dim=None) + train, test = AgNews.new(batch_size=1, word_dim=None, remove_missing=True) assert len(train) == 120000 assert len(test) == 7600 diff --git a/torchglyph/datasets/sequential_labeling.py 
b/torchglyph/datasets/sequential_labeling.py index b5f2405..369437c 100644 --- a/torchglyph/datasets/sequential_labeling.py +++ b/torchglyph/datasets/sequential_labeling.py @@ -34,10 +34,15 @@ def dump(self, fp: TextIO, batch: NamedTuple, prediction: List[List[int]], *args conllx.dump(zip(raw_word, raw_pos, raw_chunk, pred_chunk), fp, sep=' ') @classmethod - def new(cls, batch_size: int, word_dim: Optional[int], device: int = -1) -> Tuple[DataLoader, ...]: + def new(cls, batch_size: int, word_dim: Optional[int], + remove_missing: bool, device: int = -1) -> Tuple[DataLoader, ...]: + if word_dim is not None: + vectors = LoadGlove(name='6B', dim=word_dim, remove_missing=remove_missing) + else: + vectors = Identity() word = PackedTokSeqPipe(device=device, unk_token='').with_( pre=ReplaceDigits(repl_token='') + ..., - vocab=... + (Identity() if word_dim is None else LoadGlove('6B', word_dim, str.lower)), + vocab=... + vectors, ) length = SeqLengthTensorPipe(device=device) char = PackedTokBlockPipe(device=device, unk_token='') @@ -90,10 +95,15 @@ def dump(self, fp: TextIO, batch: NamedTuple, prediction: List[List[int]], *args conllx.dump(zip(raw_word, raw_pos, raw_chunk, raw_ner, pred_ner), fp, sep=' ') @classmethod - def new(cls, batch_size: int, word_dim: Optional[int], device: int = -1) -> Tuple[DataLoader, ...]: + def new(cls, batch_size: int, word_dim: Optional[int], + remove_missing: bool, device: int = -1) -> Tuple[DataLoader, ...]: + if word_dim is not None: + vectors = LoadGlove(name='6B', dim=word_dim, remove_missing=remove_missing) + else: + vectors = Identity() word = PackedTokSeqPipe(device=device, unk_token='').with_( pre=ReplaceDigits(repl_token='') + ..., - vocab=... + (Identity() if word_dim is None else LoadGlove('6B', word_dim, str.lower)), + vocab=... + vectors, ) length = SeqLengthTensorPipe(device=device) char = PackedTokBlockPipe(device=device, unk_token='') @@ -148,10 +158,15 @@ def dump(self, fp: TextIO, batch: NamedTuple, prediction: List[Any], *args, **kw conllx.dump(zip(raw_word, raw_pos, raw_ner, pred_ner), fp, sep=' ') @classmethod - def new(cls, batch_size: int, word_dim: Optional[int], device: int = -1) -> Tuple['DataLoader', ...]: + def new(cls, batch_size: int, word_dim: Optional[int], + remove_missing: bool, device: int = -1) -> Tuple['DataLoader', ...]: + if word_dim is not None: + vectors = LoadFastText(str.lower, lang=cls.lang, remove_missing=remove_missing) + else: + vectors = Identity() word = PackedTokSeqPipe(device=device, unk_token='').with_( pre=Prepend('', 1) + ReplaceDigits(repl_token='') + ..., - vocab=... + (Identity() if word_dim is None else LoadFastText(cls.lang, str.lower)), + vocab=... + vectors, ) length = SeqLengthTensorPipe(device=device).with_(pre=Prepend('', 1) + ...) 
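[Editor's note] The `remove_missing` flag threaded through these `new` constructors prunes the vocabulary down to tokens the pretrained table can actually embed, directly or through a fallback such as `str.lower`. A minimal standalone sketch of that pruning idea, using only the standard library (the helper name `prune_to_pretrained` is made up for illustration; the library's own version is `Vocab.union`, refined later in this series by PATCH 24):

```python
from collections import Counter
from typing import Callable, Set

def prune_to_pretrained(freq: Counter, pretrained: Set[str],
                        *fallback_fns: Callable[[str], str]) -> Counter:
    # keep tokens that have a vector, directly or via a fallback alias
    kept = Counter()
    for token, count in freq.items():
        if token in pretrained:
            kept[token] += count
            continue
        for fallback_fn in fallback_fns:
            alias = fallback_fn(token)
            if alias in pretrained:
                kept[alias] += count
                break
    return kept

freq = Counter({'Dog': 3, 'cat': 2, 'xyzzy': 1})
print(prune_to_pretrained(freq, {'dog', 'cat'}, str.lower))
# Counter({'dog': 3, 'cat': 2}) -- 'xyzzy' has no vector and is dropped
```
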
char = PackedTokBlockPipe(device=device, unk_token='').with_( diff --git a/torchglyph/datasets/text_classification.py b/torchglyph/datasets/text_classification.py index 013709f..169b78d 100644 --- a/torchglyph/datasets/text_classification.py +++ b/torchglyph/datasets/text_classification.py @@ -39,9 +39,14 @@ def dump(self, fp: TextIO, batch: NamedTuple, prediction: List[int], *args, **kw csv.dump((' '.join(raw_title), ' '.join(raw_text), raw_target, vocab.itos[pred]), fp) @classmethod - def new(cls, batch_size: int, word_dim: Optional[int], device: int = -1) -> Tuple['DataLoader', ...]: + def new(cls, batch_size: int, word_dim: Optional[int], + remove_missing: bool, device: int = -1) -> Tuple['DataLoader', ...]: + if word_dim is not None: + vectors = LoadGlove(name='6B', dim=word_dim, remove_missing=remove_missing) + else: + vectors = Identity() word = PackedTokSeqPipe(device=device, unk_token='').with_( - vocab=... + (Identity() if word_dim is None else LoadGlove(name='6B', dim=word_dim)), + vocab=... + vectors, ) target = TokTensorPipe(device=device, unk_token=None) diff --git a/torchglyph/proc/vocab.py b/torchglyph/proc/vocab.py index f42f2a3..b34bda2 100644 --- a/torchglyph/proc/vocab.py +++ b/torchglyph/proc/vocab.py @@ -76,21 +76,24 @@ def __call__(self, vocab: Vocab, name: str, *args, **kwargs) -> Vocab: class LoadVectors(Proc): - def __init__(self, vectors: Vectors, *fallbacks) -> None: + def __init__(self, *fallback_fns, vectors: Vectors, remove_missing: bool) -> None: super(LoadVectors, self).__init__() + self.fallback_fns = fallback_fns self.vectors = vectors - self.fallbacks = fallbacks + self.remove_missing = remove_missing def extra_repr(self) -> str: return ', '.join([ f'{self.vectors.extra_repr()}', - *[f'{f.__name__}' for f in self.fallbacks], + *[f'{f.__name__}' for f in self.fallback_fns], ]) def __call__(self, vocab: Vocab, name: str, *args, **kwargs) -> Vocab: assert vocab is not None, f"did you forget '{BuildVocab.__name__}' before '{LoadVectors.__name__}'?" 
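[Editor's note] The signature change that accompanies the flag is worth spelling out: fallback functions now come first as positionals, while `name`, `dim`, `lang`, and `remove_missing` are keyword-only. Call sites under the refactored signatures read as below; this is illustrative only (the vectors are downloaded on first use, and `dim=300` / `lang='es'` are example values, not defaults):

```python
from torchglyph.proc import LoadGlove, LoadFastText

# GloVe for the English datasets, dropping tokens without a pretrained vector
glove = LoadGlove(str.lower, name='6B', dim=300, remove_missing=True)

# fastText for the SemEval-2010 languages, same fallback and pruning behaviour
fasttext = LoadFastText(str.lower, lang='es', remove_missing=True)
```

Making everything but the fallbacks keyword-only means `name` and `dim` can no longer be swapped by position silently, which is presumably the point of the refactor.
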
- tok, occ = vocab.load_vectors(self.vectors, *self.fallbacks) + if self.remove_missing: + vocab &= self.vectors + tok, occ = vocab.load_vectors(self.vectors, *self.fallback_fns) tok = tok / max(1, len(vocab.freq.values())) * 100 occ = occ / max(1, sum(vocab.freq.values())) * 100 logging.info(f"{self.vectors} hits {tok:.1f}% tokens and {occ:.1f}% occurrences of {Vocab.__name__} '{name}'") @@ -98,16 +101,18 @@ def __call__(self, vocab: Vocab, name: str, *args, **kwargs) -> Vocab: class LoadGlove(LoadVectors): - def __init__(self, name: str, dim: int, *fallbacks) -> None: + def __init__(self, *fallback_fns, name: str, dim: int, remove_missing: bool) -> None: super(LoadGlove, self).__init__( - Glove(name=name, dim=dim), - *fallbacks, + *fallback_fns, + vectors=Glove(name=name, dim=dim), + remove_missing=remove_missing, ) class LoadFastText(LoadVectors): - def __init__(self, lang: str, *fallbacks) -> None: + def __init__(self, *fallback_fns, lang: str, remove_missing: bool) -> None: super(LoadFastText, self).__init__( - FastTest(lang=lang), - *fallbacks, + *fallback_fns, + vectors=FastTest(lang=lang), + remove_missing=remove_missing, ) diff --git a/torchglyph/vocab.py b/torchglyph/vocab.py index ca7b0f6..cb4ae8d 100644 --- a/torchglyph/vocab.py +++ b/torchglyph/vocab.py @@ -81,45 +81,15 @@ def __and__(self, rhs: Union['Counter', 'Vocab']) -> 'Vocab': if isinstance(rhs, Vocab): rhs = rhs.freq return Vocab( - counter=Counter({ - token: freq - for token, freq in self.freq.items() - if token in rhs - }), + counter=self.freq & rhs, unk_token=self.unk_token, pad_token=self.pad_token, special_tokens=self.special_tokens, max_size=self.max_size, min_freq=self.min_freq, ) - def __add__(self, rhs: Union['Counter', 'Vocab']) -> 'Vocab': - if isinstance(rhs, Vocab): - rhs = rhs.freq - return Vocab( - counter=Counter({ - token: self.freq[token] + rhs[token] - for token in {*self.freq.keys(), *rhs.keys()} - }), - unk_token=self.unk_token, - pad_token=self.pad_token, - special_tokens=self.special_tokens, - max_size=self.max_size, min_freq=self.min_freq, - ) - - def __sub__(self, rhs: Union['Counter', 'Vocab']) -> 'Vocab': - if isinstance(rhs, Vocab): - rhs = rhs.freq - return Vocab( - counter=Counter({ - token: freq - for token, freq in self.freq.items() - if token not in rhs - }), - unk_token=self.unk_token, - pad_token=self.pad_token, - special_tokens=self.special_tokens, - max_size=self.max_size, min_freq=self.min_freq, - ) + def __iand__(self, rhs: Union['Counter', 'Vocab']) -> 'Vocab': + return self.__and__(rhs=rhs) @property def pad_idx(self) -> Optional[int]: @@ -168,6 +138,8 @@ def __init__(self, urls_dest: List[Tuple[str, Path]], path: Path, self.vectors = [] self.unk_init_ = unk_init_ + print(f'path => {path}') + pt_path = path.with_suffix('.pt') if not pt_path.exists(): if not path.exists(): From f48ac4ba8242ca427c12a3a6f06c37ab7786b1fc Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Fri, 10 Apr 2020 17:28:06 +0900 Subject: [PATCH 23/66] Feat: Update remove_missing flag --- torchglyph/proc/vocab.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/torchglyph/proc/vocab.py b/torchglyph/proc/vocab.py index b34bda2..b85dfde 100644 --- a/torchglyph/proc/vocab.py +++ b/torchglyph/proc/vocab.py @@ -84,15 +84,16 @@ def __init__(self, *fallback_fns, vectors: Vectors, remove_missing: bool) -> Non def extra_repr(self) -> str: return ', '.join([ - f'{self.vectors.extra_repr()}', *[f'{f.__name__}' for f in self.fallback_fns], + f'{self.vectors.extra_repr()}', + 
f'remove_missing={self.remove_missing}', ]) def __call__(self, vocab: Vocab, name: str, *args, **kwargs) -> Vocab: assert vocab is not None, f"did you forget '{BuildVocab.__name__}' before '{LoadVectors.__name__}'?" if self.remove_missing: - vocab &= self.vectors + vocab = vocab & self.vectors tok, occ = vocab.load_vectors(self.vectors, *self.fallback_fns) tok = tok / max(1, len(vocab.freq.values())) * 100 occ = occ / max(1, sum(vocab.freq.values())) * 100 From 7b250ed73cff729e13a4d24b552f11b2c2f1cee4 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Sat, 11 Apr 2020 21:22:28 +0900 Subject: [PATCH 24/66] Fix: Bug of remove_missing flag --- torchglyph/proc/vocab.py | 4 ++-- torchglyph/vocab.py | 38 ++++++++++++++++++++++---------------- 2 files changed, 24 insertions(+), 18 deletions(-) diff --git a/torchglyph/proc/vocab.py b/torchglyph/proc/vocab.py index b85dfde..2b72b74 100644 --- a/torchglyph/proc/vocab.py +++ b/torchglyph/proc/vocab.py @@ -93,8 +93,8 @@ def __call__(self, vocab: Vocab, name: str, *args, **kwargs) -> Vocab: assert vocab is not None, f"did you forget '{BuildVocab.__name__}' before '{LoadVectors.__name__}'?" if self.remove_missing: - vocab = vocab & self.vectors - tok, occ = vocab.load_vectors(self.vectors, *self.fallback_fns) + vocab = vocab.union(self.vectors, *self.fallback_fns) + tok, occ = vocab.load_vectors(*self.fallback_fns, vectors=self.vectors) tok = tok / max(1, len(vocab.freq.values())) * 100 occ = occ / max(1, sum(vocab.freq.values())) * 100 logging.info(f"{self.vectors} hits {tok:.1f}% tokens and {occ:.1f}% occurrences of {Vocab.__name__} '{name}'") diff --git a/torchglyph/vocab.py b/torchglyph/vocab.py index cb4ae8d..6eeb0f4 100644 --- a/torchglyph/vocab.py +++ b/torchglyph/vocab.py @@ -2,7 +2,7 @@ from collections import Counter from collections import defaultdict from pathlib import Path -from typing import Union, Optional, Tuple, Callable, List +from typing import Optional, Tuple, Callable, List import torch from torch import Tensor @@ -77,20 +77,28 @@ def __len__(self) -> int: def __contains__(self, token: str) -> bool: return token in self.stoi - def __and__(self, rhs: Union['Counter', 'Vocab']) -> 'Vocab': - if isinstance(rhs, Vocab): - rhs = rhs.freq + def union(self, rhs: 'Vocab', *fallback_fns) -> 'Vocab': + counter = Counter() + + for token, freq in self.freq.items(): + if token in rhs.stoi: + counter[token] = freq + else: + for fallback_fn in fallback_fns: + new_token = fallback_fn(token) + if new_token in rhs.stoi: + counter[new_token] = freq + break + return Vocab( - counter=self.freq & rhs, + counter=counter, unk_token=self.unk_token, pad_token=self.pad_token, special_tokens=self.special_tokens, - max_size=self.max_size, min_freq=self.min_freq, + max_size=self.max_size, + min_freq=self.min_freq, ) - def __iand__(self, rhs: Union['Counter', 'Vocab']) -> 'Vocab': - return self.__and__(rhs=rhs) - @property def pad_idx(self) -> Optional[int]: if self.pad_token is None: @@ -103,12 +111,12 @@ def vec_dim(self) -> int: return 0 return self.vectors.size(1) - def load_vectors(self, vectors: 'Vectors', *fallbacks) -> Tuple[int, int]: + def load_vectors(self, *fallback_fns, vectors: 'Vectors') -> Tuple[int, int]: self.vectors = torch.empty((len(self), vectors.vec_dim), dtype=torch.float32) tok, occ = 0, 0 for token, index in self.stoi.items(): - if vectors.query_(token, self.vectors[index], *fallbacks): + if vectors.query_(token, self.vectors[index], *fallback_fns): tok += 1 occ += self.freq[token] @@ -138,8 +146,6 @@ def __init__(self, urls_dest: 
List[Tuple[str, Path]], path: Path, self.vectors = [] self.unk_init_ = unk_init_ - print(f'path => {path}') - pt_path = path.with_suffix('.pt') if not pt_path.exists(): if not path.exists(): @@ -171,12 +177,12 @@ def __init__(self, urls_dest: List[Tuple[str, Path]], path: Path, self.load(pt_path) @torch.no_grad() - def query_(self, token: str, vector: Tensor, *fallbacks) -> bool: + def query_(self, token: str, vector: Tensor, *fallback_fns) -> bool: if token in self: vector[:] = self.vectors[self.stoi[token]] return True - for fallback in fallbacks: - new_token = fallback(token) + for fallback_fn in fallback_fns: + new_token = fallback_fn(token) if new_token in self: vector[:] = self.vectors[self.stoi[new_token]] return True From 6131949266d3e514a6af6e91448d2cd52524fe41 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Tue, 28 Apr 2020 22:30:57 +0900 Subject: [PATCH 25/66] Feat: Add NLPLVectors --- torchglyph/vocab.py | 63 +++++++++++++++++++++++++++------------------ 1 file changed, 38 insertions(+), 25 deletions(-) diff --git a/torchglyph/vocab.py b/torchglyph/vocab.py index 6eeb0f4..a791dca 100644 --- a/torchglyph/vocab.py +++ b/torchglyph/vocab.py @@ -136,45 +136,46 @@ def load(self, path: Path) -> None: class Vectors(Vocab): def __init__(self, urls_dest: List[Tuple[str, Path]], path: Path, - has_head_info: bool, unk_init_: Callable[[Tensor], Tensor] = init.normal_) -> None: + heading: bool, unicode_error: str = 'replace', dtype: torch.dtype = torch.float32, + unk_init_: Callable[[Tensor], Tensor] = init.normal_) -> None: super(Vectors, self).__init__( counter=Counter(), unk_token=None, pad_token=None, special_tokens=(), max_size=None, min_freq=1, ) - self.vectors = [] + vectors = [] self.unk_init_ = unk_init_ - pt_path = path.with_suffix('.pt') - if not pt_path.exists(): + dump_path = path.with_suffix('.pt') + if not dump_path.exists(): if not path.exists(): for url, dest in urls_dest: download_and_unzip(url, dest) with path.open('rb') as fp: - vec_dim = None + vector_dim = None - iteration = tqdm(fp, desc=f'reading {path}', unit=' tokens') - for raw in iteration: # type:bytes - if has_head_info: - _, vec_dim = map(int, raw.strip().split(b' ')) - has_head_info = False + iteration = tqdm(fp, desc=f'reading {path}', unit=' lines') + for raw in iteration: # type: bytes + if heading: + _, vector_dim = map(int, raw.strip().split(b' ')) + heading = False continue token, *vs = raw.rstrip().split(b' ') - if vec_dim is None: - vec_dim = len(vs) - elif vec_dim != len(vs): - raise ValueError(f'vector dimensions are not consistent, {vec_dim} != {len(vs)}') + if vector_dim is None: + vector_dim = len(vs) + elif vector_dim != len(vs): + raise ValueError(f'vector dimensions are not consistent, {vector_dim} != {len(vs)} :: {token}') - self.add_token_(str(token, encoding='utf-8')) - self.vectors.append(torch.tensor([float(v) for v in vs], dtype=torch.float32)) + self.add_token_(str(token, encoding='utf-8', errors=unicode_error)) + vectors.append(torch.tensor([float(v) for v in vs], dtype=dtype)) - self.vectors = torch.stack(self.vectors, 0) - self.save(pt_path) + self.vectors = torch.stack(vectors, 0) + self.save(dump_path) else: - self.load(pt_path) + self.load(dump_path) @torch.no_grad() def query_(self, token: str, vector: Tensor, *fallback_fns) -> bool: @@ -192,23 +193,35 @@ def query_(self, token: str, vector: Tensor, *fallback_fns) -> bool: class Glove(Vectors): def __init__(self, name: str, dim: int) -> None: + path = data_path / f'glove.{name}' super(Glove, self).__init__( urls_dest=[( 
f'http://nlp.stanford.edu/data/glove.{name}.zip', - data_path / f'glove.{name}' / f'glove.{name}.zip' + path / f'glove.{name}.zip' )], - path=data_path / f'glove.{name}' / f'glove.{name}.{dim}d.txt', - has_head_info=False, + path=path / f'glove.{name}.{dim}d.txt', heading=False, ) class FastTest(Vectors): def __init__(self, lang: str) -> None: + path = data_path / 'fasttext' super(FastTest, self).__init__( urls_dest=[( f'https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.{lang}.vec', - data_path / 'fasttext' / f'wiki.{lang}.vec', + path / f'wiki.{lang}.vec', )], - path=data_path / 'fasttext' / f'wiki.{lang}.vec', - has_head_info=True, + path=path / f'wiki.{lang}.vec', heading=True, + ) + + +class NLPLVectors(Vectors): + def __init__(self, index: int, name: str = 'model.txt', heading: bool = False) -> None: + path = data_path / 'nlpl' / f'{index}' + super(NLPLVectors, self).__init__( + urls_dest=[( + f'http://vectors.nlpl.eu/repository/20/{index}.zip', + path / f'{index}.zip', + )], + path=path / name, heading=heading, ) From 224a00e6c6a128ae16007c24f7dad5dfbfa13fa3 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Tue, 28 Apr 2020 23:16:55 +0900 Subject: [PATCH 26/66] Chore: Ignore dimension inconsistent token --- torchglyph/vocab.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/torchglyph/vocab.py b/torchglyph/vocab.py index a791dca..a9e4095 100644 --- a/torchglyph/vocab.py +++ b/torchglyph/vocab.py @@ -156,10 +156,9 @@ def __init__(self, urls_dest: List[Tuple[str, Path]], path: Path, with path.open('rb') as fp: vector_dim = None - iteration = tqdm(fp, desc=f'reading {path}', unit=' lines') - for raw in iteration: # type: bytes + for raw in tqdm(fp, desc=f'reading {path}', unit=' lines'): # type: bytes if heading: - _, vector_dim = map(int, raw.strip().split(b' ')) + _, vector_dim = map(int, raw.rstrip().split(b' ')) heading = False continue token, *vs = raw.rstrip().split(b' ') @@ -167,7 +166,8 @@ def __init__(self, urls_dest: List[Tuple[str, Path]], path: Path, if vector_dim is None: vector_dim = len(vs) elif vector_dim != len(vs): - raise ValueError(f'vector dimensions are not consistent, {vector_dim} != {len(vs)} :: {token}') + logging.error(f'vector dimensions are not consistent, {vector_dim} != {len(vs)} :: {token}') + continue self.add_token_(str(token, encoding='utf-8', errors=unicode_error)) vectors.append(torch.tensor([float(v) for v in vs], dtype=dtype)) From c725cfcaa98aa8dea748552199ace8c9305998d6 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Fri, 1 May 2020 16:54:31 +0900 Subject: [PATCH 27/66] Fix: Resolve bug on unzipping .tar.gz files --- torchglyph/io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchglyph/io.py b/torchglyph/io.py index 99d683e..5eedf6d 100644 --- a/torchglyph/io.py +++ b/torchglyph/io.py @@ -69,7 +69,7 @@ def download_and_unzip(url: str, dest: Path) -> Path: logging.info(f'extracting {dest}') with zipfile.ZipFile(dest, "r") as fp: fp.extractall(path=dest.parent) - elif dest.suffixes[:-2] == ['.tar', '.gz']: + elif dest.suffixes[-2:] == ['.tar', '.gz']: logging.info(f'extracting {dest}') with tarfile.open(dest, 'r:gz') as fp: fp.extractall(path=dest.parent) From a0318e481f907636fc699a183ede5dc791eec702 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Fri, 1 May 2020 17:04:43 +0900 Subject: [PATCH 28/66] Feat: Support name property for Dataset --- torchglyph/dataset.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/torchglyph/dataset.py b/torchglyph/dataset.py index 
e451fbe..e7f11d3 100644 --- a/torchglyph/dataset.py +++ b/torchglyph/dataset.py @@ -2,7 +2,7 @@ import uuid from collections import namedtuple from pathlib import Path -from typing import Iterable, Any, TextIO +from typing import Iterable, Any, TextIO, Optional from typing import Union, List, Type, Tuple, NamedTuple, Dict from torch.utils import data @@ -14,6 +14,7 @@ class Dataset(data.Dataset): + name: Optional[str] urls: List[Union[Tuple[str, ...]]] def __init__(self, pipes: List[Dict[str, Pipe]], **load_kwargs) -> None: @@ -62,14 +63,16 @@ def collate_fn(self, batch: List[NamedTuple]) -> NamedTuple: @classmethod def paths(cls, root: Path = data_path) -> Tuple[Path, ...]: + root = root / getattr(cls, 'name', cls.__name__).lower() + ans = [] for url, name, *filenames in cls.urls: if len(filenames) == 0: filenames = [name] - if any(not (root / cls.__name__.lower() / n).exists() for n in filenames): - download_and_unzip(url, root / cls.__name__.lower() / name) - for n in filenames: - ans.append(root / cls.__name__.lower() / n) + if any(not (root / filename).exists() for filename in filenames): + download_and_unzip(url, root / name) + for filename in filenames: + ans.append(root / filename) return tuple(ans) From e73d7d6c897ae118c3af0029cc0fe36a00c8db86 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Fri, 1 May 2020 17:36:01 +0900 Subject: [PATCH 29/66] Chore: Separate ctx dependencies --- .github/workflows/unit-tests.yml | 2 +- setup.py | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index ef5ea86..c4e9f3f 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -17,7 +17,7 @@ jobs: run: | python -m pip install --upgrade pip python -m pip install torch - python -m pip install -e '.[dev]' + python -m pip install -e '.[dev, ctx]' - name: Test with pytest run: | python -m pytest tests diff --git a/setup.py b/setup.py index 16ae9e6..6a077a5 100644 --- a/setup.py +++ b/setup.py @@ -17,14 +17,16 @@ 'tqdm', 'numpy', 'einops', - 'transformers', - 'allennlp', - 'elmoformanylangs', ], extras_require={ 'dev': [ 'pytest', 'hypothesis', ], + 'ctx': [ + 'transformers', + 'allennlp', + 'elmoformanylangs', + ] } ) From ea37c19e4adb1fcfe477578b8fda12f3becf654e Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Fri, 1 May 2020 20:28:21 +0900 Subject: [PATCH 30/66] Refactor: Rewrite open_io function --- torchglyph/io.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/torchglyph/io.py b/torchglyph/io.py index 5eedf6d..fb9c57a 100644 --- a/torchglyph/io.py +++ b/torchglyph/io.py @@ -17,15 +17,14 @@ @contextmanager def open_io(f: IO, mode: str, encoding: str): - if isinstance(f, (str, Path)): - fp = open(f, mode=mode, encoding=encoding) - else: - fp = f try: - yield fp + if isinstance(f, (str, Path)): + with open(f, mode=mode, encoding=encoding) as fp: + yield fp + else: + yield f finally: - if isinstance(f, Path): - fp.close() + pass # copied and modified from https://github.com/pytorch/text @@ -74,9 +73,9 @@ def download_and_unzip(url: str, dest: Path) -> Path: with tarfile.open(dest, 'r:gz') as fp: fp.extractall(path=dest.parent) elif dest.suffix == '.gz': - with gzip.open(dest, mode='rb') as fsrc: - with dest.with_suffix('').open(mode='wb') as fdst: - shutil.copyfileobj(fsrc, fdst) + with gzip.open(dest, mode='rb') as fs: + with dest.with_suffix('').open(mode='wb') as fd: + shutil.copyfileobj(fs, fd) return dest From 
a23c75193125c41391cedc756102f3670330cacb Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Fri, 1 May 2020 20:36:05 +0900 Subject: [PATCH 31/66] Refactor: Use local logger --- torchglyph/datasets/sequential_labeling.py | 8 +++++--- torchglyph/datasets/text_classification.py | 4 +++- torchglyph/io.py | 7 +++++-- torchglyph/nn/contextual.py | 4 +++- torchglyph/proc/vocab.py | 9 +++++---- torchglyph/vocab.py | 8 +++++--- 6 files changed, 26 insertions(+), 14 deletions(-) diff --git a/torchglyph/datasets/sequential_labeling.py b/torchglyph/datasets/sequential_labeling.py index 369437c..85cf6eb 100644 --- a/torchglyph/datasets/sequential_labeling.py +++ b/torchglyph/datasets/sequential_labeling.py @@ -13,6 +13,8 @@ from torchglyph.pipe import PaddedTokSeqPipe, PackedTokBlockPipe from torchglyph.proc import ReplaceDigits, Identity, LoadGlove, LoadFastText, Prepend +logger = logging.Logger(__name__) + class CoNLL2000Chunking(Dataset): urls = [ @@ -61,7 +63,7 @@ def new(cls, batch_size: int, word_dim: Optional[int], test = cls(path=test, pipes=pipes) for name, pipe in train.pipes.items(): - logging.info(f'{name} => {pipe}') + logger.info(f'{name} => {pipe}') word.build_vocab(train, test, name='word') char.build_vocab(train, test, name='char') @@ -125,7 +127,7 @@ def new(cls, batch_size: int, word_dim: Optional[int], test = cls(path=test, pipes=pipes) for name, pipe in train.pipes.items(): - logging.info(f'{name} => {pipe}') + logger.info(f'{name} => {pipe}') word.build_vocab(train, dev, test, name='word') char.build_vocab(train, dev, test, name='char') @@ -192,7 +194,7 @@ def new(cls, batch_size: int, word_dim: Optional[int], test = cls(path=test, pipes=pipes) for name, pipe in train.pipes.items(): - logging.info(f'{name} => {pipe}') + logger.info(f'{name} => {pipe}') word.build_vocab(train, dev, test, name='word') char.build_vocab(train, dev, test, name='char') diff --git a/torchglyph/datasets/text_classification.py b/torchglyph/datasets/text_classification.py index 169b78d..9789580 100644 --- a/torchglyph/datasets/text_classification.py +++ b/torchglyph/datasets/text_classification.py @@ -12,6 +12,8 @@ from torchglyph.pipe import PackedTokSeqPipe, TokTensorPipe, RawPipe from torchglyph.proc import Identity, LoadGlove +logger = logging.Logger(__name__) + class AgNews(Dataset): urls = [ @@ -61,7 +63,7 @@ def new(cls, batch_size: int, word_dim: Optional[int], test = cls(path=test, target_vocab=target_vocab, pipes=pipes) for name, pipe in train.pipes.items(): - logging.info(f'{name} => {pipe}') + logger.info(f'{name} => {pipe}') word.build_vocab(train, test, name='word') target.build_vocab(train, test, name='target') diff --git a/torchglyph/io.py b/torchglyph/io.py index fb9c57a..cf1cf15 100644 --- a/torchglyph/io.py +++ b/torchglyph/io.py @@ -12,6 +12,8 @@ from tqdm import tqdm +logger = logging.Logger(__name__) + IO = Union[str, Path, TextIO] @@ -65,14 +67,15 @@ def download_and_unzip(url: str, dest: Path) -> Path: raise err if dest.suffix == '.zip': - logging.info(f'extracting {dest}') + logger.info(f'extracting {dest}') with zipfile.ZipFile(dest, "r") as fp: fp.extractall(path=dest.parent) elif dest.suffixes[-2:] == ['.tar', '.gz']: - logging.info(f'extracting {dest}') + logger.info(f'extracting {dest}') with tarfile.open(dest, 'r:gz') as fp: fp.extractall(path=dest.parent) elif dest.suffix == '.gz': + logger.info(f'extracting {dest}') with gzip.open(dest, mode='rb') as fs: with dest.with_suffix('').open(mode='wb') as fd: shutil.copyfileobj(fs, fd) diff --git a/torchglyph/nn/contextual.py 
b/torchglyph/nn/contextual.py index 3ee8d41..af42d5b 100644 --- a/torchglyph/nn/contextual.py +++ b/torchglyph/nn/contextual.py @@ -18,6 +18,8 @@ toggle_loggers('allennlp', False) toggle_loggers('elmoformanylangs', False) +logger = logging.Logger(__name__) + class ELMoModel(AllenELMo): root = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/' @@ -29,7 +31,7 @@ class ELMoModel(AllenELMo): } def __init__(self, *, options_file: str, weight_file: str, pack_output, **kwargs) -> None: - logging.info(f'loading pretrained {self.__class__.__name__} from {weight_file}') + logger.info(f'loading pretrained {self.__class__.__name__} from {weight_file}') super(ELMoModel, self).__init__( options_file=options_file, weight_file=weight_file, **kwargs, diff --git a/torchglyph/proc/vocab.py b/torchglyph/proc/vocab.py index 2b72b74..1e08056 100644 --- a/torchglyph/proc/vocab.py +++ b/torchglyph/proc/vocab.py @@ -5,6 +5,7 @@ from torchglyph.proc import Proc from torchglyph.vocab import Vocab, Vectors, Glove, FastTest +logger = logging.Logger(__name__) class UpdateCounter(Proc): def __call__(self, data: Union[str, List[str]], counter: Counter, *args, **kwargs) -> Union[str, List[str]]: @@ -61,14 +62,14 @@ def __call__(self, vocab: Vocab, name: str, *args, **kwargs) -> Vocab: occ_avg = sum(vocab.freq.values()) / max(1, tok_cnt) name = f"{vocab.__class__.__name__} '{name}'" - logging.info(f"{name} has {tok_cnt} token(s) => " + logger.info(f"{name} has {tok_cnt} token(s) => " f"{occ_avg:.1f} occurrence(s)/token [" f"{occ_max} :: '{tok_max}', " f"{occ_min} :: '{tok_min}']") if tok_cnt <= self.threshold: - logging.info(f'{name} => [{", ".join(vocab.itos)}]') + logger.info(f'{name} => [{", ".join(vocab.itos)}]') else: - logging.info(f'{name} => [' + logger.info(f'{name} => [' f'{", ".join(vocab.itos[:self.threshold // 2])}, ..., ' f'{", ".join(vocab.itos[-self.threshold // 2:])}]') @@ -97,7 +98,7 @@ def __call__(self, vocab: Vocab, name: str, *args, **kwargs) -> Vocab: tok, occ = vocab.load_vectors(*self.fallback_fns, vectors=self.vectors) tok = tok / max(1, len(vocab.freq.values())) * 100 occ = occ / max(1, sum(vocab.freq.values())) * 100 - logging.info(f"{self.vectors} hits {tok:.1f}% tokens and {occ:.1f}% occurrences of {Vocab.__name__} '{name}'") + logger.info(f"{self.vectors} hits {tok:.1f}% tokens and {occ:.1f}% occurrences of {Vocab.__name__} '{name}'") return vocab diff --git a/torchglyph/vocab.py b/torchglyph/vocab.py index a9e4095..a1959e8 100644 --- a/torchglyph/vocab.py +++ b/torchglyph/vocab.py @@ -12,6 +12,8 @@ from torchglyph import data_path from torchglyph.io import download_and_unzip +logger = logging.Logger(__name__) + class Vocab(object): def __init__(self, counter: Counter, @@ -126,11 +128,11 @@ def load_vectors(self, *fallback_fns, vectors: 'Vectors') -> Tuple[int, int]: return tok, occ def save(self, path: Path) -> None: - logging.info(f'saving {self.__class__.__name__} to {path}') + logger.info(f'saving {self.__class__.__name__} to {path}') torch.save((self.stoi, self.itos, self.vectors), path) def load(self, path: Path) -> None: - logging.info(f'loading {self.__class__.__name__} from {path}') + logger.info(f'loading {self.__class__.__name__} from {path}') self.stoi, self.itos, self.vectors = torch.load(path) @@ -166,7 +168,7 @@ def __init__(self, urls_dest: List[Tuple[str, Path]], path: Path, if vector_dim is None: vector_dim = len(vs) elif vector_dim != len(vs): - logging.error(f'vector dimensions are not consistent, {vector_dim} != {len(vs)} :: {token}') + logger.error(f'vector 
dimensions are not consistent, {vector_dim} != {len(vs)} :: {token}') continue self.add_token_(str(token, encoding='utf-8', errors=unicode_error)) From dff85fd8abd926623b8900604ce2fb14674ada9b Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Sun, 3 May 2020 18:04:38 +0900 Subject: [PATCH 32/66] Chore: Filter packages --- setup.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/setup.py b/setup.py index 6a077a5..d8a455c 100644 --- a/setup.py +++ b/setup.py @@ -1,18 +1,16 @@ from setuptools import setup, find_packages -with open('README.md', 'r', encoding='utf-8') as fp: - long_description = fp.read() +name = 'torchglyph' setup( - name='torchglyph', + name=name, version='0.1.0', - packages=find_packages(), - url='https://github.com/speedcell4/torchglyph', + packages=[package for package in find_packages() if package.startswith(name)], + url=f'https://github.com/speedcell4/{name}', license='MIT', author='speedcell4', author_email='speedcell4@gmail.com', description='Data Processor Combinators for Natural Language Processing', - long_description=long_description, install_requires=[ 'tqdm', 'numpy', From aacce26af79311ce79dbfe45933bae423477324a Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Sun, 3 May 2020 18:06:55 +0900 Subject: [PATCH 33/66] Refactor: Separate PadELMo --- torchglyph/pipe/ctx.py | 2 +- torchglyph/proc/collecting.py | 12 ------------ torchglyph/proc/ctx.py | 17 +++++++++++++++++ torchglyph/proc/vocab.py | 11 ++++++----- 4 files changed, 24 insertions(+), 18 deletions(-) create mode 100644 torchglyph/proc/ctx.py diff --git a/torchglyph/pipe/ctx.py b/torchglyph/pipe/ctx.py index 1adc83f..58ae589 100644 --- a/torchglyph/pipe/ctx.py +++ b/torchglyph/pipe/ctx.py @@ -3,8 +3,8 @@ import torch from torchglyph.pipe import Pipe -from torchglyph.proc import PadELMo from torchglyph.proc import ToDevice +from torchglyph.proc.ctx import PadELMo from torchglyph.proc.tokenizer import ELMoTokenizer diff --git a/torchglyph/proc/collecting.py b/torchglyph/proc/collecting.py index 62b2d68..2783c6f 100644 --- a/torchglyph/proc/collecting.py +++ b/torchglyph/proc/collecting.py @@ -2,17 +2,12 @@ import numpy as np import torch -from allennlp.data import Instance as AllenInstance, Vocabulary as AllenVocabulary -from allennlp.data.dataset import Batch as AllenBatch from torch import Tensor from torch.nn.utils.rnn import pad_sequence, PackedSequence, pack_sequence, pad_packed_sequence -from torchglyph.io import toggle_loggers from torchglyph.proc import Proc, Chain, stoi from torchglyph.vocab import Vocab -toggle_loggers('allennlp', False) - class ToDevice(Proc): Item = Union[int, float, bool, Tensor, PackedSequence] @@ -105,13 +100,6 @@ def __call__(self, data: List[Tensor], vocab: Vocab, **kwargs) -> Tensor: ) -class PadELMo(Proc): - def __call__(self, data: List[AllenInstance], *args, **kwargs) -> Tensor: - batch = AllenBatch(data) - batch.index_instances(AllenVocabulary()) - return batch.as_tensor_dict()['elmo']['character_ids'] - - class PackSeq(Proc): def __init__(self, enforce_sorted: bool) -> None: super(PackSeq, self).__init__() diff --git a/torchglyph/proc/ctx.py b/torchglyph/proc/ctx.py new file mode 100644 index 0000000..0e2b786 --- /dev/null +++ b/torchglyph/proc/ctx.py @@ -0,0 +1,17 @@ +from typing import List + +from allennlp.data import Instance as AllenInstance, Vocabulary as AllenVocabulary +from allennlp.data.dataset import Batch as AllenBatch +from torch import Tensor + +from torchglyph.io import toggle_loggers +from torchglyph.proc import Proc + 
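[Editor's note] `toggle_loggers` itself is imported from `torchglyph.io` but never shown in this series. A plausible shape for it, offered purely as an assumption about its behaviour (silencing or re-enabling every logger registered under a name prefix), would be:

```python
import logging

def toggle_loggers(prefix: str, enabled: bool) -> None:
    # assumed behaviour: disable (or re-enable) all loggers whose name
    # starts with the given prefix, e.g. 'allennlp' or 'transformers'
    for name in list(logging.root.manager.loggerDict):
        if name.startswith(prefix):
            logging.getLogger(name).disabled = not enabled
```
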
+toggle_loggers('allennlp', False) + + +class PadELMo(Proc): + def __call__(self, data: List[AllenInstance], *args, **kwargs) -> Tensor: + batch = AllenBatch(data) + batch.index_instances(AllenVocabulary()) + return batch.as_tensor_dict()['elmo']['character_ids'] diff --git a/torchglyph/proc/vocab.py b/torchglyph/proc/vocab.py index 1e08056..e3cf351 100644 --- a/torchglyph/proc/vocab.py +++ b/torchglyph/proc/vocab.py @@ -7,6 +7,7 @@ logger = logging.Logger(__name__) + class UpdateCounter(Proc): def __call__(self, data: Union[str, List[str]], counter: Counter, *args, **kwargs) -> Union[str, List[str]]: if isinstance(data, str): @@ -63,15 +64,15 @@ def __call__(self, vocab: Vocab, name: str, *args, **kwargs) -> Vocab: name = f"{vocab.__class__.__name__} '{name}'" logger.info(f"{name} has {tok_cnt} token(s) => " - f"{occ_avg:.1f} occurrence(s)/token [" - f"{occ_max} :: '{tok_max}', " - f"{occ_min} :: '{tok_min}']") + f"{occ_avg:.1f} occurrence(s)/token [" + f"{occ_max} :: '{tok_max}', " + f"{occ_min} :: '{tok_min}']") if tok_cnt <= self.threshold: logger.info(f'{name} => [{", ".join(vocab.itos)}]') else: logger.info(f'{name} => [' - f'{", ".join(vocab.itos[:self.threshold // 2])}, ..., ' - f'{", ".join(vocab.itos[-self.threshold // 2:])}]') + f'{", ".join(vocab.itos[:self.threshold // 2])}, ..., ' + f'{", ".join(vocab.itos[-self.threshold // 2:])}]') return vocab From 603f0674a34f6209f945ffefd27c329eeca53239 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Sun, 3 May 2020 18:07:51 +0900 Subject: [PATCH 34/66] Style: PEP8 them all --- torchglyph/nn/embedding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchglyph/nn/embedding.py b/torchglyph/nn/embedding.py index b227825..0870190 100644 --- a/torchglyph/nn/embedding.py +++ b/torchglyph/nn/embedding.py @@ -39,7 +39,7 @@ def _padded_forward(self, sub: Tensor, tok_lengths: Tensor) -> Tensor: rearrange(tok_lengths.clamp_min(1), 'a b -> (a b)'), batch_first=self.rnn.batch_first, enforce_sorted=False, ) - + embedding = pack._replace(data=self.dropout(self.embedding(pack.data))) _, (encoding, _) = self.rnn(embedding) From 2395ad167a8ff16fef75572a7da2f8a0f300815a Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Sun, 3 May 2020 18:14:37 +0900 Subject: [PATCH 35/66] Fix: Update SupportPack --- torchglyph/functional.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torchglyph/functional.py b/torchglyph/functional.py index ce2ea73..484699a 100644 --- a/torchglyph/functional.py +++ b/torchglyph/functional.py @@ -1,5 +1,5 @@ import functools -from typing import Union, Tuple, Dict, Any +from typing import Any, Union, Tuple, Dict import torch from torch import Tensor @@ -19,7 +19,7 @@ def wrap(x: Union[Tensor, PackedSequence], *args, **kwargs) -> Union[Tensor, Pac class SupportPack(type): def __new__(cls, name: str, bases: Tuple[type, ...], attrs: Dict[str, Any]): - forward_fn = bases[0].forward + forward_fn = attrs.get('forward', bases[0].forward) @functools.wraps(forward_fn) def forward(self, x: Union[Tensor, PackedSequence], *args, **kwargs) -> Union[Tensor, PackedSequence]: From 151ea7610b5ca5a79665cc4bc3f9833c4b10bcf0 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Sun, 3 May 2020 18:22:55 +0900 Subject: [PATCH 36/66] Feat: Add repository flag for NLPLVectors --- torchglyph/vocab.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torchglyph/vocab.py b/torchglyph/vocab.py index a1959e8..baa9ad5 100644 --- a/torchglyph/vocab.py +++ b/torchglyph/vocab.py @@ -218,11 +218,11 @@ def 
__init__(self, lang: str) -> None: class NLPLVectors(Vectors): - def __init__(self, index: int, name: str = 'model.txt', heading: bool = False) -> None: + def __init__(self, index: int, repository: str = '20', name: str = 'model.txt', heading: bool = False) -> None: path = data_path / 'nlpl' / f'{index}' super(NLPLVectors, self).__init__( urls_dest=[( - f'http://vectors.nlpl.eu/repository/20/{index}.zip', + f'http://vectors.nlpl.eu/repository/{repository}/{index}.zip', path / f'{index}.zip', )], path=path / name, heading=heading, From 98b71344ba1b68276e81ab268f5d9b6cf7b9f233 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Sun, 3 May 2020 18:25:22 +0900 Subject: [PATCH 37/66] Fix: Resolve toggle_loggers import path --- torchglyph/proc/tokenizer.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/torchglyph/proc/tokenizer.py b/torchglyph/proc/tokenizer.py index e2fe321..8d244dc 100644 --- a/torchglyph/proc/tokenizer.py +++ b/torchglyph/proc/tokenizer.py @@ -5,14 +5,15 @@ from allennlp.data.fields import TextField as AllenTextField from allennlp.data.token_indexers import ELMoTokenCharactersIndexer -from torchglyph.proc import Proc, toggle_loggers +from torchglyph.io import toggle_loggers +from torchglyph.proc import Proc toggle_loggers('allennlp', False) toggle_loggers('transformers', False) class ELMoTokenizer(Proc): - def __init__(self): + def __init__(self) -> None: super(ELMoTokenizer, self).__init__() self.tokenizer = ELMoTokenCharactersIndexer() @@ -36,66 +37,66 @@ def __call__(self, data: Union[str, List[str]], **kwargs) -> List[int]: class BertTokenizer(TransformerTokenizerProc): - def __init__(self, weight: str = 'bert-base-uncased'): + def __init__(self, weight: str = 'bert-base-uncased') -> None: super(BertTokenizer, self).__init__(weight=weight) self.tokenizer = transformers.BertTokenizer.from_pretrained(weight) class OpenAIGPTTokenizer(TransformerTokenizerProc): - def __init__(self, weight: str = 'openai-gpt'): + def __init__(self, weight: str = 'openai-gpt') -> None: super(OpenAIGPTTokenizer, self).__init__(weight=weight) self.tokenizer = transformers.OpenAIGPTTokenizer.from_pretrained(weight) class GPT2Tokenizer(TransformerTokenizerProc): - def __init__(self, weight: str = 'gpt2'): + def __init__(self, weight: str = 'gpt2') -> None: super(GPT2Tokenizer, self).__init__(weight=weight) self.tokenizer = transformers.GPT2Tokenizer.from_pretrained(weight) class CTRLTokenizer(TransformerTokenizerProc): - def __init__(self, weight: str = 'ctrl'): + def __init__(self, weight: str = 'ctrl') -> None: super(CTRLTokenizer, self).__init__(weight=weight) self.tokenizer = transformers.CTRLTokenizer.from_pretrained(weight) class TransfoXLTokenizer(TransformerTokenizerProc): - def __init__(self, weight: str = 'transfo-xl-wt103'): + def __init__(self, weight: str = 'transfo-xl-wt103') -> None: super(TransfoXLTokenizer, self).__init__(weight=weight) self.tokenizer = transformers.TransfoXLTokenizer.from_pretrained(weight) class XLNetTokenizer(TransformerTokenizerProc): - def __init__(self, weight: str = 'xlnet-base-cased'): + def __init__(self, weight: str = 'xlnet-base-cased') -> None: super(XLNetTokenizer, self).__init__(weight=weight) self.tokenizer = transformers.XLNetTokenizer.from_pretrained(weight) class XLMTokenizer(TransformerTokenizerProc): - def __init__(self, weight: str = 'xlm-mlm-enfr-1024'): + def __init__(self, weight: str = 'xlm-mlm-enfr-1024') -> None: super(XLMTokenizer, self).__init__(weight=weight) self.tokenizer = 
transformers.XLMTokenizer.from_pretrained(weight) class DistilBertTokenizer(TransformerTokenizerProc): - def __init__(self, weight: str = 'distilbert-base-cased'): + def __init__(self, weight: str = 'distilbert-base-cased') -> None: super(DistilBertTokenizer, self).__init__(weight=weight) self.tokenizer = transformers.DistilBertTokenizer.from_pretrained(weight) class RobertaTokenizer(TransformerTokenizerProc): - def __init__(self, weight: str = 'roberta-base'): + def __init__(self, weight: str = 'roberta-base') -> None: super(RobertaTokenizer, self).__init__(weight=weight) self.tokenizer = transformers.RobertaTokenizer.from_pretrained(weight) class XLMRobertaTokenizer(TransformerTokenizerProc): - def __init__(self, weight: str = 'xlm-roberta-base'): + def __init__(self, weight: str = 'xlm-roberta-base') -> None: super(XLMRobertaTokenizer, self).__init__(weight=weight) self.tokenizer = transformers.XLMRobertaTokenizer.from_pretrained(weight) class BartTokenizer(TransformerTokenizerProc): - def __init__(self, weight: str = 'bart-large'): + def __init__(self, weight: str = 'bart-large') -> None: super(BartTokenizer, self).__init__(weight=weight) self.tokenizer = transformers.BartTokenizer.from_pretrained(weight) From f15e9b5628079bb80072c429dda1702495f62d15 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Sun, 3 May 2020 18:34:19 +0900 Subject: [PATCH 38/66] Chore: Run github action on both Python 3.7 and 3.8 --- .github/workflows/unit-tests.yml | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index c4e9f3f..5920a61 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -7,12 +7,16 @@ jobs: runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.7", "3.8"] + steps: - - uses: actions/checkout@v1 - - name: Set up Python 3.7 + - uses: actions/checkout@v2 + - name: Set up Python "${{ matrix.python-version }}" uses: actions/setup-python@v1 with: - python-version: 3.7 + python-version: "${{ matrix.python-version }}" - name: Install dependencies run: | python -m pip install --upgrade pip From 6369729850fa23575b83312afe35058b3a852270 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Sun, 3 May 2020 18:38:23 +0900 Subject: [PATCH 39/66] Style: Update unit-tests.yml style --- .github/workflows/unit-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 5920a61..92535fd 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -13,7 +13,7 @@ jobs: steps: - uses: actions/checkout@v2 - - name: Set up Python "${{ matrix.python-version }}" + - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v1 with: python-version: "${{ matrix.python-version }}" From 08a27dca8feeac490f9c2435fe7f4fdbad238968 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Mon, 4 May 2020 02:25:03 +0900 Subject: [PATCH 40/66] Docs: Init document --- .gitignore | 1 + docs/docs/index.md | 46 ++++++++++++++++++++++++++++++++++++++++++++++ docs/mkdocs.yml | 4 ++++ setup.py | 4 ++++ 4 files changed, 55 insertions(+) create mode 100644 docs/docs/index.md create mode 100644 docs/mkdocs.yml diff --git a/.gitignore b/.gitignore index a4b0cc6..d7d336d 100644 --- a/.gitignore +++ b/.gitignore @@ -640,3 +640,4 @@ GitHub.sublime-settings *.ptx *.cubin *.fatbin +!/docs/site/ diff --git a/docs/docs/index.md b/docs/docs/index.md new file mode 100644 index 0000000..7a995d3 
--- /dev/null +++ b/docs/docs/index.md @@ -0,0 +1,46 @@ +# Welcome to TorchGlyph + +[![Actions Status](https://github.com/speedcell4/torchglyph/workflows/unit-tests/badge.svg)](https://github.com/speedcell4/torchglyph/actions) + +Data Processor Combinators for Natural Language Processing + +## Installation + +Simply run this command in your terminal: + +```bash +pip install torchglyph +``` + +## Quickstart + +The minimal data processor of TorchGlyph is called `Proc`. Compose operator `+` is provided to produce a more complex processor by composing two simple `Proc`s. + +```python +ToLower() + ReplaceDigits(repl_token='') +``` + +Composed `Proc`s act like data `Pipe`lines, in which raw input textual data is processed incrementally. According to the stages, they are roughly categories into four-groups: + ++ `pre` for processing *before* building vocabulary; ++ `vocab` for building and updating *vocabulary*; ++ `post` for precessing *after* building vocabulary; ++ `batch` for collating examples to build *batches*. + +Defining the `Pipe`s of your dataset you can build it from scratch, or you can simply manipulate existing `Pipe`s by calling `.with_` method. + +```python +class PackedTokSeqPipe(PackedIdxSeqPipe): + def __init__(self, device, unk_token, special_tokens = (), + threshold = THRESHOLD, dtype = torch.long) -> None: + super(PackedTokSeqPipe, self).__init__(device=device, dtype=dtype) + self.with_( + pre=UpdateCounter(), + vocab=[ + BuildVocab(unk_token=unk_token, pad_token=None, + special_tokens=special_tokens), + StatsVocab(threshold=threshold), + ], + post=Numbering() + ..., + ) +``` \ No newline at end of file diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml new file mode 100644 index 0000000..07f06a4 --- /dev/null +++ b/docs/mkdocs.yml @@ -0,0 +1,4 @@ +site_name: TorchGlyph +nav: + - Home: index.md +theme: alabaster \ No newline at end of file diff --git a/setup.py b/setup.py index d8a455c..886efde 100644 --- a/setup.py +++ b/setup.py @@ -25,6 +25,10 @@ 'transformers', 'allennlp', 'elmoformanylangs', + ], + 'docs': [ + 'mkdocs', + 'mkdocs-alabaster', ] } ) From 223a41d8137fc865a0d08f0dd3e1e266bd33b435 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Mon, 4 May 2020 02:38:46 +0900 Subject: [PATCH 41/66] Docs: Move to top-level directory --- .gitignore | 2 +- docs/{docs => }/index.md | 0 docs/mkdocs.yml => mkdocs.yml | 0 3 files changed, 1 insertion(+), 1 deletion(-) rename docs/{docs => }/index.md (100%) rename docs/mkdocs.yml => mkdocs.yml (100%) diff --git a/.gitignore b/.gitignore index d7d336d..5149fdb 100644 --- a/.gitignore +++ b/.gitignore @@ -640,4 +640,4 @@ GitHub.sublime-settings *.ptx *.cubin *.fatbin -!/docs/site/ +!/site/ diff --git a/docs/docs/index.md b/docs/index.md similarity index 100% rename from docs/docs/index.md rename to docs/index.md diff --git a/docs/mkdocs.yml b/mkdocs.yml similarity index 100% rename from docs/mkdocs.yml rename to mkdocs.yml From b752cf0c0f7361e7e5cbdfee1d302a52e0301990 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Mon, 4 May 2020 02:53:18 +0900 Subject: [PATCH 42/66] Chore: Add mkdocs.yml --- .github/workflows/mkdocs.yml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 .github/workflows/mkdocs.yml diff --git a/.github/workflows/mkdocs.yml b/.github/workflows/mkdocs.yml new file mode 100644 index 0000000..c88bdea --- /dev/null +++ b/.github/workflows/mkdocs.yml @@ -0,0 +1,14 @@ +name: mkdocs +on: + push: + branches: + - develop + +jobs: + build: + name: Deploy docs + runs-on: ubuntu-latest + steps: + - name: Deploy 
MkDocs + uses: mhausenblas/mkdocs-deploy-gh-pages@1.11 + From 2265e5e1ba26f66aab626c284a7d1fa702e60b2c Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Mon, 4 May 2020 02:54:43 +0900 Subject: [PATCH 43/66] Fix: Update mkdocs.yml --- .github/workflows/mkdocs.yml | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/.github/workflows/mkdocs.yml b/.github/workflows/mkdocs.yml index c88bdea..74633f8 100644 --- a/.github/workflows/mkdocs.yml +++ b/.github/workflows/mkdocs.yml @@ -9,6 +9,11 @@ jobs: name: Deploy docs runs-on: ubuntu-latest steps: - - name: Deploy MkDocs - uses: mhausenblas/mkdocs-deploy-gh-pages@1.11 + - name: Checkout develop + uses: actions/checkout@v1 + - name: Deploy docs + uses: mhausenblas/mkdocs-deploy-gh-pages@master + env: + PERSONAL_TOKEN: ${{ secrets.PERSONAL_TOKEN }} + CUSTOM_DOMAIN: optionaldomain.com \ No newline at end of file From c5895e69914e5d48201d6d866d8b0595ee81709e Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Mon, 4 May 2020 02:57:41 +0900 Subject: [PATCH 44/66] Fix: Install Python and dependencies --- .github/workflows/mkdocs.yml | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/mkdocs.yml b/.github/workflows/mkdocs.yml index 74633f8..c6b9fcb 100644 --- a/.github/workflows/mkdocs.yml +++ b/.github/workflows/mkdocs.yml @@ -11,9 +11,13 @@ jobs: steps: - name: Checkout develop uses: actions/checkout@v1 - + - name: Set up Python 3.7 + uses: actions/setup-python@v1 + with: + python-version: 3.7 + - name: Install dependencies + run: python -m pip install -e '.[docs]' - name: Deploy docs uses: mhausenblas/mkdocs-deploy-gh-pages@master env: - PERSONAL_TOKEN: ${{ secrets.PERSONAL_TOKEN }} - CUSTOM_DOMAIN: optionaldomain.com \ No newline at end of file + PERSONAL_TOKEN: ${{ secrets.PERSONAL_TOKEN }} \ No newline at end of file From 7af526e625587684c17f81a612ad76ec96b3b8ee Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Mon, 4 May 2020 02:58:29 +0900 Subject: [PATCH 45/66] Fix: Resolve indent --- .github/workflows/mkdocs.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/mkdocs.yml b/.github/workflows/mkdocs.yml index c6b9fcb..2fcdaa6 100644 --- a/.github/workflows/mkdocs.yml +++ b/.github/workflows/mkdocs.yml @@ -12,9 +12,9 @@ jobs: - name: Checkout develop uses: actions/checkout@v1 - name: Set up Python 3.7 - uses: actions/setup-python@v1 - with: - python-version: 3.7 + uses: actions/setup-python@v1 + with: + python-version: 3.7 - name: Install dependencies run: python -m pip install -e '.[docs]' - name: Deploy docs From 02c7e3b54819e478e42f47105517fafbbb25d9d2 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Mon, 4 May 2020 02:58:57 +0900 Subject: [PATCH 46/66] Fix: Resolve indent again --- .github/workflows/mkdocs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/mkdocs.yml b/.github/workflows/mkdocs.yml index 2fcdaa6..770eeff 100644 --- a/.github/workflows/mkdocs.yml +++ b/.github/workflows/mkdocs.yml @@ -16,7 +16,7 @@ jobs: with: python-version: 3.7 - name: Install dependencies - run: python -m pip install -e '.[docs]' + run: python -m pip install -e '.[docs]' - name: Deploy docs uses: mhausenblas/mkdocs-deploy-gh-pages@master env: From b1b65fc5bbf9c9635b10c1b18c62b961147b3b50 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Mon, 4 May 2020 03:03:04 +0900 Subject: [PATCH 47/66] Fix: Update Deploy --- .github/workflows/mkdocs.yml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git 
a/.github/workflows/mkdocs.yml b/.github/workflows/mkdocs.yml index 770eeff..d1d089f 100644 --- a/.github/workflows/mkdocs.yml +++ b/.github/workflows/mkdocs.yml @@ -17,7 +17,8 @@ jobs: python-version: 3.7 - name: Install dependencies run: python -m pip install -e '.[docs]' - - name: Deploy docs - uses: mhausenblas/mkdocs-deploy-gh-pages@master - env: - PERSONAL_TOKEN: ${{ secrets.PERSONAL_TOKEN }} \ No newline at end of file + - name: Deploy + uses: peaceiris/actions-gh-pages@v3 + with: + github_token: ${{ secrets.PERSONAL_TOKEN }} + publish_dir: ./site \ No newline at end of file From dc6159e4c5f24a06de1bb2039a310dcdc451a3bf Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Mon, 4 May 2020 03:05:16 +0900 Subject: [PATCH 48/66] Fix: Update Build --- .github/workflows/mkdocs.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/mkdocs.yml b/.github/workflows/mkdocs.yml index d1d089f..f3634bc 100644 --- a/.github/workflows/mkdocs.yml +++ b/.github/workflows/mkdocs.yml @@ -17,6 +17,8 @@ jobs: python-version: 3.7 - name: Install dependencies run: python -m pip install -e '.[docs]' + - name: Build + run: mkdocs build - name: Deploy uses: peaceiris/actions-gh-pages@v3 with: From 2f44bda3431e41bb8993dc970f780ea37e90d154 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Mon, 4 May 2020 03:06:52 +0900 Subject: [PATCH 49/66] Docs: Update comma --- docs/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/index.md b/docs/index.md index 7a995d3..4e8b91e 100644 --- a/docs/index.md +++ b/docs/index.md @@ -6,7 +6,7 @@ Data Processor Combinators for Natural Language Processing ## Installation -Simply run this command in your terminal: +Simply run this command in your terminal, ```bash pip install torchglyph From 274d7c69c2c696fb7c7f9027a2d33009f6f21bbb Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Mon, 4 May 2020 03:09:49 +0900 Subject: [PATCH 50/66] Docs: Add PackedIdxSeqPipe --- docs/index.md | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/docs/index.md b/docs/index.md index 4e8b91e..e90c554 100644 --- a/docs/index.md +++ b/docs/index.md @@ -27,7 +27,20 @@ Composed `Proc`s act like data `Pipe`lines, in which raw input textual data is p + `post` for precessing *after* building vocabulary; + `batch` for collating examples to build *batches*. -Defining the `Pipe`s of your dataset you can build it from scratch, or you can simply manipulate existing `Pipe`s by calling `.with_` method. +Defining the `Pipe`s of your dataset you can build it from scratch, + +```python +class PackedIdxSeqPipe(Pipe): + def __init__(self, device, dtype = torch.long) -> None: + super(PackedIdxSeqPipe, self).__init__( + pre=None, + vocab=None, + post=ToTensor(dtype=dtype), + batch=PackSeq(enforce_sorted=False) + ToDevice(device=device), + ) +``` + +or you can simply manipulate existing `Pipe`s by calling `.with_` method. 
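[Editor's note] As a companion to the Quickstart being written in these docs patches, the `+` composition it leans on can be pictured with a toy re-implementation. This is hypothetical scaffolding, not the library code; the real `Proc` and `Chain` live in `torchglyph.proc`:

```python
from typing import Any, List


class Proc(object):
    def __call__(self, x: Any, **kwargs) -> Any:
        raise NotImplementedError

    def __add__(self, other: 'Proc') -> 'Chain':
        # `a + b` builds a pipeline that applies a first, then b
        return Chain([self, other])


class Chain(Proc):
    def __init__(self, procs: List[Proc]) -> None:
        super(Chain, self).__init__()
        self.procs = procs

    def __call__(self, x: Any, **kwargs) -> Any:
        for proc in self.procs:
            x = proc(x, **kwargs)
        return x


class ToLower(Proc):
    def __call__(self, x: str, **kwargs) -> str:
        return x.lower()


class Strip(Proc):
    def __call__(self, x: str, **kwargs) -> str:
        return x.strip()


print((ToLower() + Strip())('  Hello '))  # 'hello'
```
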
```python class PackedTokSeqPipe(PackedIdxSeqPipe): From 7847157f939f7ac63c764ec9e46cf4fe6a1775b7 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Mon, 4 May 2020 03:14:56 +0900 Subject: [PATCH 51/66] Docs: Fix some typos --- docs/index.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/index.md b/docs/index.md index e90c554..9357b0e 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,9 +1,9 @@ # Welcome to TorchGlyph -[![Actions Status](https://github.com/speedcell4/torchglyph/workflows/unit-tests/badge.svg)](https://github.com/speedcell4/torchglyph/actions) - Data Processor Combinators for Natural Language Processing +[![Actions Status](https://github.com/speedcell4/torchglyph/workflows/unit-tests/badge.svg)](https://github.com/speedcell4/torchglyph/actions) + ## Installation Simply run this command in your terminal, @@ -14,20 +14,20 @@ pip install torchglyph ## Quickstart -The minimal data processor of TorchGlyph is called `Proc`. Compose operator `+` is provided to produce a more complex processor by composing two simple `Proc`s. +The atomic data processor of TorchGlyph is called `Proc`. Compose operator `+` is provided to produce complex `Proc` by composing two simple `Proc`s. ```python ToLower() + ReplaceDigits(repl_token='') ``` -Composed `Proc`s act like data `Pipe`lines, in which raw input textual data is processed incrementally. According to the stages, they are roughly categories into four-groups: +Composed `Proc`s act like data `Pipe`lines, where raw textual data is processed incrementally. According to the stages, they are roughly categorized into four-groups: + `pre` for processing *before* building vocabulary; + `vocab` for building and updating *vocabulary*; + `post` for precessing *after* building vocabulary; + `batch` for collating examples to build *batches*. -Defining the `Pipe`s of your dataset you can build it from scratch, +Defining the `Pipe`s of your dataset is the first step to build a dataset, you can build it from scratch, ```python class PackedIdxSeqPipe(Pipe): From acc0c9c87b6da652429471ae38ffc2d89131095f Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Mon, 4 May 2020 03:22:19 +0900 Subject: [PATCH 52/66] Docs: PEP8 them all --- docs/index.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/index.md b/docs/index.md index 9357b0e..f1a0311 100644 --- a/docs/index.md +++ b/docs/index.md @@ -31,7 +31,7 @@ Defining the `Pipe`s of your dataset is the first step to build a dataset, you c ```python class PackedIdxSeqPipe(Pipe): - def __init__(self, device, dtype = torch.long) -> None: + def __init__(self, device, dtype=torch.long) -> None: super(PackedIdxSeqPipe, self).__init__( pre=None, vocab=None, @@ -44,13 +44,13 @@ or you can simply manipulate existing `Pipe`s by calling `.with_` method. 
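[Editor's note] One idiom worth spelling out before the `.with_` example below: throughout this series `...` (Ellipsis) appears inside `.with_` arguments, e.g. `vocab=... + vectors` or `pre=ReplaceDigits(repl_token='') + ...`. As the diffs use it, `...` is a placeholder for the procs already installed at that stage, so composition extends a pipeline instead of restating it. A toy model of the splicing, under that assumption (not the library's actual implementation):

```python
def splice(new, old):
    # substitute the Ellipsis placeholder with the procs already in place
    if new is ...:
        return list(old)                 # stage left untouched
    if isinstance(new, list):
        out = []
        for proc in new:
            out.extend(old if proc is ... else [proc])
        return out
    return [new]                         # plain proc: overwrite the stage


existing = ['UpdateCounter']
print(splice(['ReplaceDigits', ...], existing))
# ['ReplaceDigits', 'UpdateCounter'] -- the new proc is prepended
```
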
```python class PackedTokSeqPipe(PackedIdxSeqPipe): - def __init__(self, device, unk_token, special_tokens = (), - threshold = THRESHOLD, dtype = torch.long) -> None: + def __init__(self, device, unk_token, special_tokens=(), + threshold=THRESHOLD, dtype=torch.long) -> None: super(PackedTokSeqPipe, self).__init__(device=device, dtype=dtype) self.with_( pre=UpdateCounter(), vocab=[ - BuildVocab(unk_token=unk_token, pad_token=None, + BuildVocab(unk_token=unk_token, pad_token=None, special_tokens=special_tokens), StatsVocab(threshold=threshold), ], From dcb28fe11f4bfe0ea8cfd8c6d723e5c856953a12 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Mon, 4 May 2020 03:50:09 +0900 Subject: [PATCH 53/66] Chore: Remove Python 3.8 unit test --- .github/workflows/unit-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 92535fd..0915eec 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -9,7 +9,7 @@ jobs: strategy: matrix: - python-version: ["3.7", "3.8"] + python-version: ["3.7"] steps: - uses: actions/checkout@v2 From 050f32e77e5094777dcfb6397b920c26918d64a9 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Mon, 4 May 2020 03:51:28 +0900 Subject: [PATCH 54/66] Chore: Update homepage --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 886efde..414abe1 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ name=name, version='0.1.0', packages=[package for package in find_packages() if package.startswith(name)], - url=f'https://github.com/speedcell4/{name}', + url=f'https://speedcell4.github.io/torchglyph', license='MIT', author='speedcell4', author_email='speedcell4@gmail.com', From e9bf982fa78993c574d39f9566f78a05699dd9ad Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Mon, 4 May 2020 18:08:02 +0900 Subject: [PATCH 55/66] Feat: Add Itos --- torchglyph/proc/infer.py | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 torchglyph/proc/infer.py diff --git a/torchglyph/proc/infer.py b/torchglyph/proc/infer.py new file mode 100644 index 0000000..d83faf1 --- /dev/null +++ b/torchglyph/proc/infer.py @@ -0,0 +1,9 @@ +from typing import List + +from torchglyph.proc import Proc +from torchglyph.vocab import Vocab + + +class RevVocab(Proc): + def __call__(self, xs: List[int], vocab: Vocab, **kwargs) -> List[str]: + return [vocab.itos[x] for x in xs] From d021730b8e26d75713440d7be623bb9b301753d6 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Fri, 8 May 2020 15:24:04 +0900 Subject: [PATCH 56/66] Fix: Resolve vocabulary issue of PackedSeqPtrSeqPipe --- torchglyph/pipe/seq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchglyph/pipe/seq.py b/torchglyph/pipe/seq.py index a5b9ff2..f0f7c6e 100644 --- a/torchglyph/pipe/seq.py +++ b/torchglyph/pipe/seq.py @@ -90,7 +90,7 @@ class PackedSeqPtrSeqPipe(PackedIdxSeqPipe): def __init__(self, device: Union[int, torch.device], dtype: torch.dtype = torch.long) -> None: super(PackedSeqPtrSeqPipe, self).__init__(device=device, dtype=dtype) self.with_( - pre=GetMask(token=0), + post=GetMask(token=0) + ..., batch=Scan(fn=cum_seq, init=0) + ..., ) From 8d658b4d409658fa06a01cf46373a5c478bf33fc Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Tue, 12 May 2020 02:17:31 +0900 Subject: [PATCH 57/66] Feat: Add PackedContiguousSubPipe and PackedContiguousSubPtrPipe --- torchglyph/pipe/contiguous.py | 33 ++++++++++++++++++++ torchglyph/proc/contiguous.py | 59 
From 8d658b4d409658fa06a01cf46373a5c478bf33fc Mon Sep 17 00:00:00 2001
From: speedcell4
Date: Tue, 12 May 2020 02:17:31 +0900
Subject: [PATCH 57/66] Feat: Add PackedContiguousSubPipe and PackedContiguousSubPtrPipe

---
 torchglyph/pipe/contiguous.py | 33 ++++++++++++++++++++
 torchglyph/proc/contiguous.py | 59 +++++++++++++++++++++++++++++++++++
 2 files changed, 92 insertions(+)
 create mode 100644 torchglyph/pipe/contiguous.py
 create mode 100644 torchglyph/proc/contiguous.py

diff --git a/torchglyph/pipe/contiguous.py b/torchglyph/pipe/contiguous.py
new file mode 100644
index 0000000..25e0ae8
--- /dev/null
+++ b/torchglyph/pipe/contiguous.py
@@ -0,0 +1,33 @@
+from typing import Union, Optional, Tuple
+
+import torch
+
+from torchglyph.pipe import PackedTokSeqPipe
+from torchglyph.pipe import Pipe
+from torchglyph.pipe import THRESHOLD
+from torchglyph.proc import GetLength, Lift, ToTensor
+from torchglyph.proc.collecting import ToDevice
+from torchglyph.proc.contiguous import BuildContiguousSub, BuildContiguousSubPtr, PackContiguousSubPtr
+
+
+class PackedContiguousSubPipe(PackedTokSeqPipe):
+    def __init__(self, device: Union[int, torch.device], unk_token: Optional[str],
+                 seq_token: str, special_tokens: Tuple[Optional[str], ...] = (),
+                 threshold: int = THRESHOLD, dtype: torch.dtype = torch.long) -> None:
+        super(PackedContiguousSubPipe, self).__init__(
+            device=device, unk_token=unk_token, special_tokens=special_tokens,
+            threshold=threshold, dtype=dtype,
+        )
+        self.with_(
+            pre=BuildContiguousSub(seq_token=seq_token) + ...,
+        )
+
+
+class PackedContiguousSubPtrPipe(Pipe):
+    def __init__(self, device: Union[int, torch.device], dtype: torch.dtype = torch.long) -> None:
+        super(PackedContiguousSubPtrPipe, self).__init__(
+            pre=Lift(GetLength()) + BuildContiguousSubPtr() + Lift(ToTensor(dtype=dtype)),
+            vocab=None,
+            post=None,
+            batch=PackContiguousSubPtr(enforce_sorted=False) + ToDevice(device=device),
+        )
diff --git a/torchglyph/proc/contiguous.py b/torchglyph/proc/contiguous.py
new file mode 100644
index 0000000..6b73c66
--- /dev/null
+++ b/torchglyph/proc/contiguous.py
@@ -0,0 +1,59 @@
+from typing import List, Tuple
+
+import torch
+from torch import Tensor
+from torch.nn.utils.rnn import PackedSequence
+from torch.nn.utils.rnn import pack_sequence
+from torch.nn.utils.rnn import pad_packed_sequence
+
+from torchglyph.proc.abc import Proc
+
+
+class BuildContiguousSub(Proc):
+    def __init__(self, seq_token: str) -> None:
+        super(BuildContiguousSub, self).__init__()
+        self.seq_token = seq_token
+
+    def extra_repr(self) -> str:
+        return repr(self.seq_token)
+
+    def __call__(self, tokens: List[str], **kwargs) -> List[str]:
+        zs = []
+        for token in tokens:
+            zs.extend(list(token))
+            zs.append(self.seq_token)
+        return zs[:-1]
+
+
+class BuildContiguousSubPtr(Proc):
+    def __call__(self, lengths: List[int], **kwargs) -> Tuple[List[int], List[int]]:
+        indices = [0]
+        for length in lengths:
+            indices.append(indices[-1] + length + 1)
+        return [index - 2 for index in indices[1:]], indices[:-1]
+
+
+class PackContiguousSubPtr(Proc):
+    def __init__(self, enforce_sorted: bool) -> None:
+        super(PackContiguousSubPtr, self).__init__()
+        self.enforce_sorted = enforce_sorted
+
+    def extra_repr(self) -> str:
+        return f'enforce_sorted={self.enforce_sorted}'
+
+    def __call__(self, data: List[Tuple[Tensor, Tensor]], **kwargs) -> Tuple[PackedSequence, PackedSequence]:
+        fs, bs = zip(*data)
+
+        pack = pack_sequence([
+            torch.empty((f.max().item() + 1,), dtype=torch.long) for f in fs
+        ], enforce_sorted=self.enforce_sorted)
+        indices = pack._replace(data=torch.arange(pack.data.size(0), device=pack.data.device))
+        indices, _ = pad_packed_sequence(indices, batch_first=True)
+
+        fs = pack_sequence([
+            indices[i, f] for i, f in enumerate(fs)
+        ], enforce_sorted=self.enforce_sorted)
+        bs = pack_sequence([
+            indices[i, b] for i, b in enumerate(bs)
+        ], enforce_sorted=self.enforce_sorted)
+        return fs, bs
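The pointer arithmetic in `BuildContiguousSubPtr` is easier to follow on a concrete input. The following plain-Python sketch replays what the two procs above compute for a two-token sentence (the `<sep>` token is an arbitrary choice):

```python
# Worked example for PATCH 57 (a sketch of the same logic, not the procs themselves).
tokens = ['cat', 'sat']

# BuildContiguousSub flattens the tokens into one character stream with separators:
chars = []
for token in tokens:
    chars.extend(list(token))
    chars.append('<sep>')
chars = chars[:-1]
assert chars == ['c', 'a', 't', '<sep>', 's', 'a', 't']

# BuildContiguousSubPtr turns the token lengths into pointers into that stream:
lengths = [len(token) for token in tokens]  # [3, 3]
indices = [0]
for length in lengths:
    indices.append(indices[-1] + length + 1)  # the +1 skips the separator
fidx = [index - 2 for index in indices[1:]]   # last character of each token
bidx = indices[:-1]                           # first character of each token
assert (fidx, bidx) == ([2, 6], [0, 4])
```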
From de8a5fcb6dff2c6ad84de513d96864c4ace2bc55 Mon Sep 17 00:00:00 2001
From: speedcell4
Date: Tue, 12 May 2020 03:03:50 +0900
Subject: [PATCH 58/66] Feat: Add ContiguousSubLstmEmbedding

---
 torchglyph/nn/embedding.py | 32 +++++++++++++++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/torchglyph/nn/embedding.py b/torchglyph/nn/embedding.py
index 0870190..de93ce7 100644
--- a/torchglyph/nn/embedding.py
+++ b/torchglyph/nn/embedding.py
@@ -1,4 +1,4 @@
-from typing import Union
+from typing import Union, Tuple
 
 import torch
 from einops import rearrange
@@ -57,3 +57,33 @@ def forward(self, sub: Union[Tensor, PackedSequence], *args) -> Union[Tensor, PackedSequence]:
             return self._padded_forward(sub, *args)
         else:
             return self._packed_forward(sub, *args)
+
+
+class ContiguousSubLstmEmbedding(nn.Module):
+    def __init__(self, num_embeddings: int, embedding_dim: int,
+                 hidden_dim: int, dropout: float, num_layers: int = 1,
+                 bias: bool = True, batch_first: bool = True,
+                 bidirectional: bool = True, padding_idx: int = None) -> None:
+        super(ContiguousSubLstmEmbedding, self).__init__()
+
+        self.embedding = nn.Embedding(
+            num_embeddings=num_embeddings,
+            embedding_dim=embedding_dim,
+            padding_idx=padding_idx,
+        )
+        self.dropout = nn.Dropout(dropout)
+        self.rnn = nn.LSTM(
+            input_size=self.embedding.embedding_dim,
+            hidden_size=hidden_dim, num_layers=num_layers, bias=bias,
+            batch_first=batch_first, bidirectional=bidirectional,
+        )
+
+        self.embedding_dim = self.rnn.hidden_size * (2 if self.rnn.bidirectional else 1)
+
+    def forward(self, sub: PackedSequence, indices: Tuple[PackedSequence, PackedSequence]) -> PackedSequence:
+        embedding = sub._replace(data=self.dropout(self.embedding(sub.data)))
+        encoding, _ = self.rnn(embedding)  # type: (PackedSequence, _)
+
+        fidx, bidx = indices
+        fenc, benc = encoding.data.chunk(2, dim=-1)
+        return fidx._replace(data=torch.cat([fenc[fidx.data], benc[bidx.data]], dim=-1))

From ba5029df6af43b8801abf227d510fe3be1e5bee6 Mon Sep 17 00:00:00 2001
From: speedcell4
Date: Tue, 12 May 2020 15:21:35 +0900
Subject: [PATCH 59/66] Refactor: Rename PackContiguousSubPtr

---
 torchglyph/proc/contiguous.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/torchglyph/proc/contiguous.py b/torchglyph/proc/contiguous.py
index 6b73c66..8cd5b11 100644
--- a/torchglyph/proc/contiguous.py
+++ b/torchglyph/proc/contiguous.py
@@ -41,19 +41,19 @@ def __init__(self, enforce_sorted: bool) -> None:
     def extra_repr(self) -> str:
         return f'enforce_sorted={self.enforce_sorted}'
 
-    def __call__(self, data: List[Tuple[Tensor, Tensor]], **kwargs) -> Tuple[PackedSequence, PackedSequence]:
-        fs, bs = zip(*data)
+    def __call__(self, indices: List[Tuple[Tensor, Tensor]], **kwargs) -> Tuple[PackedSequence, PackedSequence]:
+        fidx, bidx = zip(*indices)
 
         pack = pack_sequence([
-            torch.empty((f.max().item() + 1,), dtype=torch.long) for f in fs
+            torch.empty((f.max().item() + 1,), dtype=torch.long) for f in fidx
         ], enforce_sorted=self.enforce_sorted)
         indices = pack._replace(data=torch.arange(pack.data.size(0), device=pack.data.device))
         indices, _ = pad_packed_sequence(indices, batch_first=True)
 
-        fs = pack_sequence([
-            indices[i, f] for i, f in enumerate(fs)
+        fidx = pack_sequence([
+            indices[i, f] for i, f in enumerate(fidx)
         ], enforce_sorted=self.enforce_sorted)
-        bs = pack_sequence([
-            indices[i, b] for i, b in enumerate(bs)
+        bidx = pack_sequence([
+            indices[i, b] for i, b in enumerate(bidx)
         ], enforce_sorted=self.enforce_sorted)
-        return fs, bs
+        return fidx, bidx
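Given those pointers, `ContiguousSubLstmEmbedding.forward` reads the forward LSTM direction at token-final characters (`fidx`) and the backward direction at token-initial characters (`bidx`). A small tensor sketch of that gather, using random placeholders for the character encodings:

```python
import torch

# 7 characters ('c a t <sep> s a t'), BiLSTM output with hidden size 5 per direction
encoding = torch.randn(7, 2 * 5)
fenc, benc = encoding.chunk(2, dim=-1)  # split into forward/backward directions

# pointers from the worked example above
fidx, bidx = torch.tensor([2, 6]), torch.tensor([0, 4])

token_repr = torch.cat([fenc[fidx], benc[bidx]], dim=-1)
assert token_repr.shape == (2, 10)  # one 10-dimensional vector per original token
```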
From 11a347d0ff9de4a455e40d63810042c40007cff7 Mon Sep 17 00:00:00 2001
From: speedcell4
Date: Tue, 12 May 2020 15:50:30 +0900
Subject: [PATCH 60/66] Feat: Add head_pack and prepend_pack

---
 torchglyph/functional.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/torchglyph/functional.py b/torchglyph/functional.py
index 484699a..e963971 100644
--- a/torchglyph/functional.py
+++ b/torchglyph/functional.py
@@ -1,5 +1,6 @@
 import functools
-from typing import Any, Union, Tuple, Dict
+from typing import Any
+from typing import Union, Tuple, Dict
 
 import torch
 from torch import Tensor
@@ -29,3 +30,16 @@ def forward(self, x: Union[Tensor, PackedSequence], *args, **kwargs) -> Union[Tensor, PackedSequence]:
             return x._replace(data=forward_fn(self, x.data, *args, **kwargs))
 
         return type(name, bases, {**attrs, 'forward': forward})
+
+
+def head_pack(pack: PackedSequence) -> Tensor:
+    return pack.data[:pack.batch_sizes[0].item()]
+
+
+def prepend_pack(pack: PackedSequence, value: Union[int, bool, float, Tensor]) -> PackedSequence:
+    if not torch.is_tensor(value):
+        value = torch.full_like(head_pack(pack), fill_value=value)
+    return pack._replace(
+        data=torch.cat([value, pack.data], dim=0),
+        batch_sizes=torch.cat([pack.batch_sizes[:1], pack.batch_sizes], dim=0),
+    )

From ba8670dba5f60231fc5cd79098ea4ab1ae3d3d9b Mon Sep 17 00:00:00 2001
From: speedcell4
Date: Tue, 12 May 2020 15:56:24 +0900
Subject: [PATCH 61/66] Feat: Add SupportPack and rename SupportPackMeta

---
 torchglyph/functional.py   | 15 ++++++++++++++-
 torchglyph/nn/embedding.py |  4 ++--
 2 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/torchglyph/functional.py b/torchglyph/functional.py
index e963971..e1ed678 100644
--- a/torchglyph/functional.py
+++ b/torchglyph/functional.py
@@ -4,6 +4,7 @@
 
 import torch
 from torch import Tensor
+from torch import nn
 from torch.nn.utils.rnn import PackedSequence
 
 
@@ -18,7 +19,19 @@ def wrap(x: Union[Tensor, PackedSequence], *args, **kwargs) -> Union[Tensor, PackedSequence]:
     return wrap
 
 
-class SupportPack(type):
+class SupportPack(nn.Module):
+    def __init__(self, module: nn.Module) -> None:
+        super(SupportPack, self).__init__()
+        self.module = module
+
+    def __repr__(self) -> str:
+        return f'Packed{self.module.__repr__()}'
+
+    def forward(self, x: Union[Tensor, PackedSequence], *args, **kwargs) -> Union[Tensor, PackedSequence]:
+        return support_pack(self.module)(x)
+
+
+class SupportPackMeta(type):
     def __new__(cls, name: str, bases: Tuple[type, ...], attrs: Dict[str, Any]):
         forward_fn = attrs.get('forward', bases[0].forward)
 
diff --git a/torchglyph/nn/embedding.py b/torchglyph/nn/embedding.py
index de93ce7..f97096c 100644
--- a/torchglyph/nn/embedding.py
+++ b/torchglyph/nn/embedding.py
@@ -5,10 +5,10 @@
 from torch import nn, Tensor
 from torch.nn.utils.rnn import pack_padded_sequence, PackedSequence
 
-from torchglyph.functional import SupportPack
+from torchglyph.functional import SupportPackMeta
 
 
-class TokEmbedding(nn.Embedding, metaclass=SupportPack):
+class TokEmbedding(nn.Embedding, metaclass=SupportPackMeta):
     pass
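The new `SupportPack` wrapper lifts an element-wise module so that it maps over `PackedSequence.data` while leaving the pack layout untouched. A hypothetical usage sketch (it assumes this revision of `torchglyph.functional` is importable):

```python
import torch
from torch import nn
from torch.nn.utils.rnn import pack_sequence

from torchglyph.functional import SupportPack  # the class added in PATCH 61

wrapped = SupportPack(nn.Linear(8, 4))
pack = pack_sequence([torch.randn(3, 8), torch.randn(2, 8)], enforce_sorted=True)

out = wrapped(pack)
assert out.data.shape == (5, 4)  # 3 + 2 time steps, each projected to 4 dims
assert torch.equal(out.batch_sizes, pack.batch_sizes)  # pack structure preserved
```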
From 90a3b142864b706ed527615f770fcb1911c87a04 Mon Sep 17 00:00:00 2001
From: speedcell4
Date: Mon, 18 May 2020 12:57:03 +0900
Subject: [PATCH 62/66] Feat: Add tail_pack and append_pack

---
 torchglyph/functional.py | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/torchglyph/functional.py b/torchglyph/functional.py
index e1ed678..5462117 100644
--- a/torchglyph/functional.py
+++ b/torchglyph/functional.py
@@ -5,7 +5,8 @@
 import torch
 from torch import Tensor
 from torch import nn
-from torch.nn.utils.rnn import PackedSequence
+from torch.nn.utils.rnn import PackedSequence, pad_packed_sequence
+from torch.nn.utils.rnn import pack_padded_sequence
 
 
 def support_pack(fn):
@@ -56,3 +57,20 @@ def prepend_pack(pack: PackedSequence, value: Union[int, bool, float, Tensor]) -> PackedSequence:
         data=torch.cat([value, pack.data], dim=0),
         batch_sizes=torch.cat([pack.batch_sizes[:1], pack.batch_sizes], dim=0),
     )
+
+
+def tail_pack(pack: PackedSequence) -> Tensor:
+    data, lengths = pad_packed_sequence(pack, batch_first=True)  # type: (Tensor, Tensor)
+    indices = torch.arange(lengths.size(0), dtype=torch.long, device=data.device)
+    return data[indices, lengths - 1]
+
+
+def append_pack(pack: PackedSequence, value: Union[int, bool, float, Tensor]) -> PackedSequence:
+    if not torch.is_tensor(value):
+        value = torch.full_like(head_pack(pack), fill_value=value)
+    data, lengths = pad_packed_sequence(pack, batch_first=True)  # type: (Tensor, Tensor)
+    indices = torch.arange(lengths.size(0), dtype=torch.long, device=data.device)
+    return pack_padded_sequence(
+        torch.cat([data, value[:, None]], dim=1).index_put((indices, lengths), value),
+        lengths + 1, batch_first=True, enforce_sorted=False,
+    )
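Together with `head_pack` and `prepend_pack` from PATCH 60, these helpers make it cheap to add BOS/EOS-style markers to packed batches without unpacking them by hand. A hypothetical usage sketch (again assuming this revision of `torchglyph.functional` is importable):

```python
import torch
from torch.nn.utils.rnn import pack_sequence, pad_packed_sequence

from torchglyph.functional import append_pack, prepend_pack, tail_pack

pack = pack_sequence([torch.tensor([1, 2, 3]), torch.tensor([4, 5])], enforce_sorted=True)
with_bos = prepend_pack(pack, value=0)  # every sequence now starts with 0
with_eos = append_pack(pack, value=9)   # every sequence now ends with 9

data, lengths = pad_packed_sequence(with_bos, batch_first=True)
assert data[:, 0].eq(0).all() and lengths.tolist() == [4, 3]
assert tail_pack(with_eos).eq(9).all()
```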
From 0570d4d4b5b6fec9852530dc6daf5f10a1a896e1 Mon Sep 17 00:00:00 2001
From: speedcell4
Date: Mon, 25 May 2020 17:15:29 +0900
Subject: [PATCH 63/66] Feat: Add unk_idx for TokEmbedding

---
 torchglyph/nn/embedding.py | 28 ++++++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/torchglyph/nn/embedding.py b/torchglyph/nn/embedding.py
index f97096c..0d99723 100644
--- a/torchglyph/nn/embedding.py
+++ b/torchglyph/nn/embedding.py
@@ -2,21 +2,34 @@
 
 import torch
 from einops import rearrange
-from torch import nn, Tensor
-from torch.nn.utils.rnn import pack_padded_sequence, PackedSequence
+from torch import Tensor
+from torch import nn
+from torch.nn.utils.rnn import PackedSequence, pack_sequence, pack_padded_sequence
 
 from torchglyph.functional import SupportPackMeta
 
 
 class TokEmbedding(nn.Embedding, metaclass=SupportPackMeta):
-    pass
+    def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int = None, unk_idx: int = None,
+                 max_norm: float = None, norm_type: float = 2., scale_grad_by_freq: bool = False,
+                 sparse: bool = False, _weight: Tensor = None):
+        super(TokEmbedding, self).__init__(
+            num_embeddings=num_embeddings, embedding_dim=embedding_dim,
+            padding_idx=padding_idx, max_norm=max_norm, norm_type=norm_type,
+            scale_grad_by_freq=scale_grad_by_freq, sparse=sparse, _weight=_weight,
+        )
+        self._unk_idx = unk_idx
+
+    @property
+    def unk(self) -> Tensor:
+        return self.weight[self._unk_idx]
 
 
 class SubLstmEmbedding(nn.Module):
     def __init__(self, num_embeddings: int, embedding_dim: int,
                  hidden_dim: int, dropout: float, num_layers: int = 1,
                  bias: bool = True, batch_first: bool = True,
-                 bidirectional: bool = True, padding_idx: int = None) -> None:
+                 bidirectional: bool = True, padding_idx: int = None, unk_idx: int = None) -> None:
         super(SubLstmEmbedding, self).__init__()
 
         self.embedding = nn.Embedding(
@@ -32,6 +45,13 @@ def __init__(self, num_embeddings: int, embedding_dim: int,
         )
 
         self.embedding_dim = self.rnn.hidden_size * (2 if self.rnn.bidirectional else 1)
+        self._unk_idx = unk_idx
+
+    @property
+    def unk(self) -> Tensor:
+        embedding = self.embedding.weight[None, self._unk_idx]
+        _, (encoding, _) = self.rnn(pack_sequence([embedding], enforce_sorted=True))
+        return rearrange(encoding, '(l d) a h -> l a (d h)', l=self.rnn.num_layers)[0, 0, :]
 
     def _padded_forward(self, sub: Tensor, tok_lengths: Tensor) -> Tensor:
         pack = pack_padded_sequence(

From 3ffe20539ffa273accd6109336896da199a5d245 Mon Sep 17 00:00:00 2001
From: speedcell4
Date: Mon, 25 May 2020 17:26:18 +0900
Subject: [PATCH 64/66] Feat: Add unk_idx for SubLstmEmbedding

---
 torchglyph/nn/embedding.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/torchglyph/nn/embedding.py b/torchglyph/nn/embedding.py
index 0d99723..87fd302 100644
--- a/torchglyph/nn/embedding.py
+++ b/torchglyph/nn/embedding.py
@@ -18,11 +18,11 @@ def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int = None, unk_idx: int = None,
             padding_idx=padding_idx, max_norm=max_norm, norm_type=norm_type,
             scale_grad_by_freq=scale_grad_by_freq, sparse=sparse, _weight=_weight,
         )
-        self._unk_idx = unk_idx
+        self.unk_idx = unk_idx
 
     @property
     def unk(self) -> Tensor:
-        return self.weight[self._unk_idx]
+        return self.weight[self.unk_idx]
 
 
 class SubLstmEmbedding(nn.Module):
@@ -45,11 +45,11 @@ def __init__(self, num_embeddings: int, embedding_dim: int,
         )
 
         self.embedding_dim = self.rnn.hidden_size * (2 if self.rnn.bidirectional else 1)
-        self._unk_idx = unk_idx
+        self.unk_idx = unk_idx
 
     @property
     def unk(self) -> Tensor:
-        embedding = self.embedding.weight[None, self._unk_idx]
+        embedding = self.embedding.weight[None, self.unk_idx]
         _, (encoding, _) = self.rnn(pack_sequence([embedding], enforce_sorted=True))
         return rearrange(encoding, '(l d) a h -> l a (d h)', l=self.rnn.num_layers)[0, 0, :]
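The `unk` property added by these two patches exposes the unknown-token representation as a differentiable view, which is handy as a fallback vector for out-of-vocabulary lookups. A hypothetical usage sketch (assuming this revision of `torchglyph.nn.embedding` is importable):

```python
import torch

from torchglyph.nn.embedding import TokEmbedding  # as defined in PATCH 63/64

emb = TokEmbedding(num_embeddings=100, embedding_dim=16, unk_idx=0)

fallback = emb.unk  # the <unk> row of the embedding matrix
assert fallback.shape == (16,)
assert torch.equal(fallback, emb.weight[0])
```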
From 5a855b4e060c1e3d739b6b25d255e453e2fca8b6 Mon Sep 17 00:00:00 2001
From: speedcell4
Date: Thu, 4 Jun 2020 23:00:26 +0900
Subject: [PATCH 65/66] Fix: Use getLogger

---
 torchglyph/datasets/sequential_labeling.py | 2 +-
 torchglyph/datasets/text_classification.py | 2 +-
 torchglyph/io.py                           | 2 +-
 torchglyph/nn/contextual.py                | 2 +-
 torchglyph/proc/vocab.py                   | 2 +-
 torchglyph/vocab.py                        | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/torchglyph/datasets/sequential_labeling.py b/torchglyph/datasets/sequential_labeling.py
index 85cf6eb..edaa85a 100644
--- a/torchglyph/datasets/sequential_labeling.py
+++ b/torchglyph/datasets/sequential_labeling.py
@@ -13,7 +13,7 @@
 from torchglyph.pipe import PaddedTokSeqPipe, PackedTokBlockPipe
 from torchglyph.proc import ReplaceDigits, Identity, LoadGlove, LoadFastText, Prepend
 
-logger = logging.Logger(__name__)
+logger = logging.getLogger(__name__)
 
 
 class CoNLL2000Chunking(Dataset):
diff --git a/torchglyph/datasets/text_classification.py b/torchglyph/datasets/text_classification.py
index 9789580..a228479 100644
--- a/torchglyph/datasets/text_classification.py
+++ b/torchglyph/datasets/text_classification.py
@@ -12,7 +12,7 @@
 from torchglyph.pipe import PackedTokSeqPipe, TokTensorPipe, RawPipe
 from torchglyph.proc import Identity, LoadGlove
 
-logger = logging.Logger(__name__)
+logger = logging.getLogger(__name__)
 
 
 class AgNews(Dataset):
diff --git a/torchglyph/io.py b/torchglyph/io.py
index cf1cf15..e2b7025 100644
--- a/torchglyph/io.py
+++ b/torchglyph/io.py
@@ -12,7 +12,7 @@
 
 from tqdm import tqdm
 
-logger = logging.Logger(__name__)
+logger = logging.getLogger(__name__)
 
 IO = Union[str, Path, TextIO]
diff --git a/torchglyph/nn/contextual.py b/torchglyph/nn/contextual.py
index af42d5b..9ec87c4 100644
--- a/torchglyph/nn/contextual.py
+++ b/torchglyph/nn/contextual.py
@@ -18,7 +18,7 @@
 toggle_loggers('allennlp', False)
 toggle_loggers('elmoformanylangs', False)
 
-logger = logging.Logger(__name__)
+logger = logging.getLogger(__name__)
 
 
 class ELMoModel(AllenELMo):
diff --git a/torchglyph/proc/vocab.py b/torchglyph/proc/vocab.py
index e3cf351..ad47e9c 100644
--- a/torchglyph/proc/vocab.py
+++ b/torchglyph/proc/vocab.py
@@ -5,7 +5,7 @@
 from torchglyph.proc import Proc
 from torchglyph.vocab import Vocab, Vectors, Glove, FastTest
 
-logger = logging.Logger(__name__)
+logger = logging.getLogger(__name__)
 
 
 class UpdateCounter(Proc):
diff --git a/torchglyph/vocab.py b/torchglyph/vocab.py
index baa9ad5..428dea8 100644
--- a/torchglyph/vocab.py
+++ b/torchglyph/vocab.py
@@ -12,7 +12,7 @@
 
 from torchglyph import data_path
 from torchglyph.io import download_and_unzip
 
-logger = logging.Logger(__name__)
+logger = logging.getLogger(__name__)
 
 
 class Vocab(object):
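The one-line change above matters because `logging.Logger(__name__)` constructs a logger that bypasses the logging manager: it is never returned by later `getLogger` calls and ignores configuration applied to the logging hierarchy, whereas `logging.getLogger(__name__)` registers the logger once and reuses it. A small sketch of the difference:

```python
import logging

detached = logging.Logger('torchglyph.demo')    # not registered with the manager
managed = logging.getLogger('torchglyph.demo')  # registered and reused

assert detached is not logging.getLogger('torchglyph.demo')
assert managed is logging.getLogger('torchglyph.demo')
```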
From 843837a2e03e1f21015b2238c93becce85f09413 Mon Sep 17 00:00:00 2001
From: speedcell4
Date: Mon, 27 Jul 2020 22:15:16 +0900
Subject: [PATCH 66/66] Feat: Update Github Action settings

---
 .github/workflows/mkdocs.yml         |  8 +++----
 .github/workflows/python-publish.yml | 31 ++++++++++++++++++++++++++++
 .github/workflows/unit-tests.yml     |  4 ++--
 README.md                            |  3 ++-
 setup.py                             |  2 +-
 5 files changed, 40 insertions(+), 8 deletions(-)
 create mode 100644 .github/workflows/python-publish.yml

diff --git a/.github/workflows/mkdocs.yml b/.github/workflows/mkdocs.yml
index f3634bc..5e041c6 100644
--- a/.github/workflows/mkdocs.yml
+++ b/.github/workflows/mkdocs.yml
@@ -1,4 +1,4 @@
-name: mkdocs
+name: Build Document by Mkdocs
 on:
   push:
     branches:
@@ -10,9 +10,9 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout develop
-        uses: actions/checkout@v1
+        uses: actions/checkout@v2
       - name: Set up Python 3.7
-        uses: actions/setup-python@v1
+        uses: actions/setup-python@v2
        with:
           python-version: 3.7
       - name: Install dependencies
@@ -22,5 +22,5 @@ jobs:
       - name: Deploy
         uses: peaceiris/actions-gh-pages@v3
         with:
-          github_token: ${{ secrets.PERSONAL_TOKEN }}
+          github_token: ${{ secrets.GITHUB_TOKEN }}
           publish_dir: ./site
\ No newline at end of file
diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
new file mode 100644
index 0000000..7ba9c7b
--- /dev/null
+++ b/.github/workflows/python-publish.yml
@@ -0,0 +1,31 @@
+# This workflow will upload a Python Package using Twine when a release is created
+# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
+
+name: Upload Python Package
+
+on:
+  release:
+    types: [created]
+
+jobs:
+  deploy:
+
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: '3.7'
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install setuptools wheel twine
+      - name: Build and publish
+        env:
+          TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
+          TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
+        run: |
+          python setup.py sdist bdist_wheel
+          twine upload dist/*
diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
index 0915eec..9f4cd89 100644
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@@ -1,4 +1,4 @@
-name: unit-tests
+name: Unit Tests
 
 on: [push]
 
@@ -14,7 +14,7 @@ jobs:
     steps:
       - uses: actions/checkout@v2
       - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v1
+        uses: actions/setup-python@v2
         with:
           python-version: "${{ matrix.python-version }}"
       - name: Install dependencies
diff --git a/README.md b/README.md
index 5365e39..eb43acc 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,7 @@
 # TorchGlyph
 
-[![Actions Status](https://github.com/speedcell4/torchglyph/workflows/unit-tests/badge.svg)](https://github.com/speedcell4/torchglyph/actions)
+![Unit Tests](https://github.com/speedcell4/torchglyph/workflows/Unit%20Tests/badge.svg)
+![Upload Python Package](https://github.com/speedcell4/torchglyph/workflows/Upload%20Python%20Package/badge.svg)
 
 ## Requirements
diff --git a/setup.py b/setup.py
index 414abe1..eddd429 100644
--- a/setup.py
+++ b/setup.py
@@ -4,7 +4,7 @@
 setup(
     name=name,
-    version='0.1.0',
+    version='0.1.1',
     packages=[package for package in find_packages() if package.startswith(name)],
     url=f'https://speedcell4.github.io/torchglyph',
     license='MIT',