From cf3f7bcaac4824c9af365fed38e616fc87408b7a Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Sat, 21 Mar 2020 16:25:05 +0900 Subject: [PATCH 01/66] Feat: Add SemEval2010T1NER dataset --- .../test_datasets/test_sequential_labeling.py | 31 ++++-- torchglyph/datasets/__init__.py | 1 + torchglyph/datasets/sequential_labeling.py | 99 +++++++++++++++++-- 3 files changed, 115 insertions(+), 16 deletions(-) diff --git a/tests/test_datasets/test_sequential_labeling.py b/tests/test_datasets/test_sequential_labeling.py index d76f233..94a7e6d 100644 --- a/tests/test_datasets/test_sequential_labeling.py +++ b/tests/test_datasets/test_sequential_labeling.py @@ -1,14 +1,29 @@ -from torchglyph.datasets.sequential_labeling import CoNLL2000Chunking, CoNLL2003NER +from torchglyph.datasets import CoNLL2000Chunking, CoNLL2003NER +from torchglyph.datasets import SemEval2010T1NERCatalan, SemEval2010T1NERSpanish -def test_conll2000_chunking() -> None: +def test_conll2000_chunking(): train, test = CoNLL2000Chunking.new(batch_size=1, word_dim=None) - assert len(train) == 8936 - assert len(test) == 2012 + assert len(train.dataset) == 8936 + assert len(test.dataset) == 2012 -def test_conll2003_ner() -> None: +def test_conll2003_ner(): train, dev, test = CoNLL2003NER.new(batch_size=1, word_dim=None) - assert len(train) == 14987 - assert len(dev) == 3466 - assert len(test) == 3684 + assert len(train.dataset) == 14987 + assert len(dev.dataset) == 3466 + assert len(test.dataset) == 3684 + + +def test_semeval2010_catalan(): + train, dev, test = SemEval2010T1NERCatalan.new(batch_size=1, word_dim=None) + assert len(train.dataset) == 8709 + assert len(dev.dataset) == 1445 + assert len(test.dataset) == 1698 + + +def test_semeval2010_spanish(): + train, dev, test = SemEval2010T1NERSpanish.new(batch_size=1, word_dim=None) + assert len(train.dataset) == 9022 + assert len(dev.dataset) == 1419 + assert len(test.dataset) == 1705 diff --git a/torchglyph/datasets/__init__.py b/torchglyph/datasets/__init__.py index b5b186c..0c2ab4f 100644 --- a/torchglyph/datasets/__init__.py +++ b/torchglyph/datasets/__init__.py @@ -1,2 +1,3 @@ from torchglyph.datasets.sequential_labeling import CoNLL2000Chunking, CoNLL2003NER +from torchglyph.datasets.sequential_labeling import SemEval2010T1NERCatalan, SemEval2010T1NERSpanish from torchglyph.datasets.text_classification import AgNews diff --git a/torchglyph/datasets/sequential_labeling.py b/torchglyph/datasets/sequential_labeling.py index f91ed5c..b5f2405 100644 --- a/torchglyph/datasets/sequential_labeling.py +++ b/torchglyph/datasets/sequential_labeling.py @@ -1,14 +1,17 @@ import logging from pathlib import Path -from typing import Iterable, List, Any, Tuple, Optional, NamedTuple, TextIO +from typing import Iterable, Any +from typing import Optional, List, Tuple, NamedTuple +from typing import TextIO from tqdm import tqdm from torchglyph.dataset import Dataset, DataLoader from torchglyph.formats import conllx -from torchglyph.pipe import PackedTokSeqPipe, SeqLengthTensorPipe, RawPipe, PackedTokPtrSeqPipe +from torchglyph.pipe import PackedTokSeqPipe, SeqLengthTensorPipe, RawPipe, PackedTokPtrSeqPipe, PackedPtrSeqPipe, \ + ToSubList, UpdateCounter, Lift from torchglyph.pipe import PaddedTokSeqPipe, PackedTokBlockPipe -from torchglyph.proc import ToLower, ReplaceDigits, Identity, LoadGlove +from torchglyph.proc import ReplaceDigits, Identity, LoadGlove, LoadFastText, Prepend class CoNLL2000Chunking(Dataset): @@ -20,7 +23,7 @@ class CoNLL2000Chunking(Dataset): @classmethod def load(cls, path: Path) 
-> Iterable[List[Any]]:
         for sent in tqdm(conllx.load(path, sep=' '), desc=f'reading {path}'):
-            word, pos, chunk = list(zip(*sent))
+            word, pos, chunk = map(list, zip(*sent))
             yield [word, pos, chunk]

     def dump(self, fp: TextIO, batch: NamedTuple, prediction: List[List[int]], *args, **kwargs) -> None:
@@ -33,7 +36,7 @@ def dump(self, fp: TextIO, batch: NamedTuple, prediction: List[List[int]], *args
     @classmethod
     def new(cls, batch_size: int, word_dim: Optional[int], device: int = -1) -> Tuple[DataLoader, ...]:
         word = PackedTokSeqPipe(device=device, unk_token='<unk>').with_(
-            pre=ToLower() + ReplaceDigits(repl_token='<digits>') + ...,
+            pre=ReplaceDigits(repl_token='<digits>') + ...,
             vocab=... + (Identity() if word_dim is None else LoadGlove('6B', word_dim, str.lower)),
         )
         length = SeqLengthTensorPipe(device=device)
@@ -76,7 +79,7 @@ class CoNLL2003NER(Dataset):
     @classmethod
     def load(cls, path: Path) -> Iterable[List[Any]]:
         for sent in tqdm(conllx.load(path, sep=' '), desc=f'reading {path}', unit=' sents'):
-            word, pos, chunk, ner = list(zip(*sent))
+            word, pos, chunk, ner = map(list, zip(*sent))
             yield [word, pos, chunk, ner]

     def dump(self, fp: TextIO, batch: NamedTuple, prediction: List[List[int]], *args, **kwargs) -> None:
@@ -89,8 +92,8 @@ def dump(self, fp: TextIO, batch: NamedTuple, prediction: List[List[int]], *args
     @classmethod
     def new(cls, batch_size: int, word_dim: Optional[int], device: int = -1) -> Tuple[DataLoader, ...]:
         word = PackedTokSeqPipe(device=device, unk_token='<unk>').with_(
-            pre=ToLower() + ReplaceDigits(repl_token='<digits>') + ...,
-            vocab=... + (Identity() if word_dim is None else LoadGlove(name='6B', dim=word_dim)),
+            pre=ReplaceDigits(repl_token='<digits>') + ...,
+            vocab=... + (Identity() if word_dim is None else LoadGlove('6B', word_dim, str.lower)),
         )
         length = SeqLengthTensorPipe(device=device)
         char = PackedTokBlockPipe(device=device, unk_token='<unk>')
@@ -124,3 +127,83 @@ def new(cls, batch_size: int, word_dim: Optional[int], device: int = -1) -> Tupl
         (train, dev, test),
         batch_size=batch_size, shuffle=True,
     )
+
+
+class SemEval2010T1NER(Dataset):
+    lang: str
+
+    @classmethod
+    def load(cls, path: Path, **kwargs) -> Iterable[Any]:
+        for sent in tqdm(conllx.load(path, sep='\t'), desc=f'reading {path}', unit=' sentences'):
+            _, word, _, pos, _, _, head, drel, _, _, ner = map(list, zip(*sent))
+            yield [word, pos, [int(h) for h in head], drel, ner]
+
+    def dump(self, fp: TextIO, batch: NamedTuple, prediction: List[Any], *args, **kwargs) -> None:
+        ner_vocab = self.pipes['ner'].vocab.stoi
+        for raw_word, raw_pos, raw_ner, pred in \
+                zip(batch.raw_word, batch.raw_pos, batch.raw_ner, prediction):
+            assert len(raw_word) == len(raw_pos) == len(raw_ner) == len(pred)
+
+            pred_ner = [ner_vocab[p] for p in pred]
+            conllx.dump(zip(raw_word, raw_pos, raw_ner, pred_ner), fp, sep=' ')
+
+    @classmethod
+    def new(cls, batch_size: int, word_dim: Optional[int], device: int = -1) -> Tuple['DataLoader', ...]:
+        word = PackedTokSeqPipe(device=device, unk_token='<unk>').with_(
+            pre=Prepend('<root>', 1) + ReplaceDigits(repl_token='<digits>') + ...,
+            vocab=... + (Identity() if word_dim is None else LoadFastText(cls.lang, str.lower)),
+        )
+        length = SeqLengthTensorPipe(device=device).with_(pre=Prepend('<root>', 1) + ...)
+        char = PackedTokBlockPipe(device=device, unk_token='<unk>').with_(
+            pre=ToSubList() + Lift(Prepend('<root>', 1)) + Lift(UpdateCounter()),
+        )
+        word_ptr = PackedTokPtrSeqPipe(device=device, reverse=False).with_(pre=Prepend(0, 1) + ...)
+        pos = PackedTokSeqPipe(device=device, unk_token='<unk>').with_(pre=Prepend('<root>', 1) + ...)
+        head = PackedPtrSeqPipe(device=device).with_(pre=Prepend(0, 1) + ...)
+        drel = PackedTokSeqPipe(device=device, unk_token='root').with_(pre=Prepend('<root>', 1) + ...)
+        ner = PaddedTokSeqPipe(device=device, unk_token='O', pad_token='O')
+
+        pipes = [
+            dict(word=word, length=length, char=char, word_ptr=word_ptr, raw_word=RawPipe()),
+            dict(pos=pos, raw_pos=RawPipe()),
+            dict(head=head),
+            dict(drel=drel, raw_drel=RawPipe()),
+            dict(ner=ner, raw_ner=RawPipe()),
+        ]
+
+        train, dev, test = cls.paths()
+        train = cls(path=train, pipes=pipes)
+        dev = cls(path=dev, pipes=pipes)
+        test = cls(path=test, pipes=pipes)
+
+        for name, pipe in train.pipes.items():
+            logging.info(f'{name} => {pipe}')
+
+        word.build_vocab(train, dev, test, name='word')
+        char.build_vocab(train, dev, test, name='char')
+        pos.build_vocab(train, name='pos')
+        drel.build_vocab(train, name='drel')
+        ner.build_vocab(train, name='ner')
+
+        return DataLoader.new(
+            (train, dev, test),
+            batch_size=batch_size, shuffle=True,
+        )
+
+
+class SemEval2010T1NERCatalan(SemEval2010T1NER):
+    urls = [
+        ('https://www.dropbox.com/s/nqedh3zmk5k80n7/train.sd.conllx?dl=1', 'train.sd.conllx'),
+        ('https://www.dropbox.com/s/027umbuks3njwry/dev.sd.conllx?dl=1', 'dev.sd.conllx'),
+        ('https://www.dropbox.com/s/ldwn6z1xl5vki4y/test.sd.conllx?dl=1', 'test.sd.conllx'),
+    ]
+    lang = 'ca'
+
+
+class SemEval2010T1NERSpanish(SemEval2010T1NER):
+    urls = [
+        ('https://www.dropbox.com/s/lyxgvc161ai20v0/train.sd.conllx?dl=1', 'train.sd.conllx'),
+        ('https://www.dropbox.com/s/8tmbi7ki6ctasez/dev.sd.conllx?dl=1', 'dev.sd.conllx'),
+        ('https://www.dropbox.com/s/nnj94hdmlq3jjm8/test.sd.conllx?dl=1', 'test.sd.conllx'),
+    ]
+    lang = 'es'

From 9bbcb26408a9492e5aa7515546ae590f917f16c0 Mon Sep 17 00:00:00 2001
From: speedcell4
Date: Wed, 25 Mar 2020 01:53:50 +0900
Subject: [PATCH 02/66] Feat: Add BertTokenizer and so on

---
 setup.py                      |  1 +
 torchglyph/proc/contextual.py | 49 +++++++++++++++++++++++++++++++++++
 2 files changed, 50 insertions(+)
 create mode 100644 torchglyph/proc/contextual.py

diff --git a/setup.py b/setup.py
index 736ca61..63a49ce 100644
--- a/setup.py
+++ b/setup.py
@@ -17,6 +17,7 @@
         'tqdm',
         'numpy',
         'einops',
+        'transformers',
     ],
     extras_require={
         'dev': [
diff --git a/torchglyph/proc/contextual.py b/torchglyph/proc/contextual.py
new file mode 100644
index 0000000..21a6571
--- /dev/null
+++ b/torchglyph/proc/contextual.py
@@ -0,0 +1,49 @@
+from typing import Union, List
+
+import transformers
+
+from torchglyph.pipe import Proc
+
+
+class TokenizerProc(Proc):
+    def __init__(self, weights: str) -> None:
+        super(TokenizerProc, self).__init__()
+        self.weights = weights
+
+    def extra_repr(self) -> str:
+        return f'weights={self.weights}'
+
+    def __call__(self, data: Union[str, List[str]], **kwargs) -> List[int]:
+        if not isinstance(data, str):
+            data = ' '.join(data)
+        return self.tokenizer.encode(data)
+
+
+class BertTokenizer(TokenizerProc):
+    def __init__(self, weights: str = 'bert-base-uncased'):
+        super(BertTokenizer, self).__init__(weights=weights)
+        self.tokenizer = transformers.BertTokenizer.from_pretrained(weights)
+
+
+class CTRLTokenizer(TokenizerProc):
+    def __init__(self, weights: str = 'ctrl'):
+        super(CTRLTokenizer, self).__init__(weights=weights)
+        self.tokenizer = transformers.CTRLTokenizer.from_pretrained(weights)
+
+
+class XLNetTokenizer(TokenizerProc):
+    def __init__(self, weights: str = 'xlnet-base-cased'):
+        super(XLNetTokenizer, self).__init__(weights=weights)
+        self.tokenizer = transformers.XLNetTokenizer.from_pretrained(weights)
+
+
+class XLMTokenizer(TokenizerProc): + def __init__(self, weights: str = 'xlm-mlm-enfr-1024'): + super(XLMTokenizer, self).__init__(weights=weights) + self.tokenizer = transformers.XLMTokenizer.from_pretrained(weights) + + +class BartTokenizer(TokenizerProc): + def __init__(self, weights: str = 'bart-large'): + super(BartTokenizer, self).__init__(weights=weights) + self.tokenizer = transformers.BartTokenizer.from_pretrained(weights) From 8bd94f8b7d0351c03041a048b5a863541d8f667f Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Wed, 25 Mar 2020 14:47:17 +0900 Subject: [PATCH 03/66] Feat: Support ELMo --- setup.py | 2 +- torchglyph/__init__.py | 2 +- torchglyph/io.py | 7 ++- torchglyph/nn/__init__.py | 5 +- torchglyph/nn/contextual.py | 57 +++++++++++++++++++ torchglyph/proc/__init__.py | 1 + torchglyph/proc/collecting.py | 9 +++ .../proc/{contextual.py => tokenizer.py} | 13 +++++ 8 files changed, 91 insertions(+), 5 deletions(-) create mode 100644 torchglyph/nn/contextual.py rename torchglyph/proc/{contextual.py => tokenizer.py} (74%) diff --git a/setup.py b/setup.py index 63a49ce..87d14a0 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ 'tqdm', 'numpy', 'einops', - 'transformers', + 'transformers', 'allennlp' ], extras_require={ 'dev': [ diff --git a/torchglyph/__init__.py b/torchglyph/__init__.py index f1f93f6..3c126bf 100644 --- a/torchglyph/__init__.py +++ b/torchglyph/__init__.py @@ -4,7 +4,7 @@ import torch from torch.nn.utils.rnn import PackedSequence -data_path = Path.home() / '.torchglyph' +data_path = (Path.home() / '.torchglyph').expanduser().absolute() if not data_path.exists(): data_path.mkdir(parents=True, exist_ok=True) diff --git a/torchglyph/io.py b/torchglyph/io.py index fc018d1..271ab87 100644 --- a/torchglyph/io.py +++ b/torchglyph/io.py @@ -50,7 +50,10 @@ def inner(b=1, bsize=1, tsize=None) -> None: # copied and modified from https://github.com/pytorch/text -def download_and_unzip(url: str, dest: Path) -> None: +def download_and_unzip(url: str, dest: Path) -> Path: + if dest.exists(): + return dest + if not dest.parent.exists(): dest.parent.mkdir(parents=True, exist_ok=True) @@ -73,3 +76,5 @@ def download_and_unzip(url: str, dest: Path) -> None: with gzip.open(dest, mode='rb') as fsrc: with dest.with_suffix('').open(mode='wb') as fdst: shutil.copyfileobj(fsrc, fdst) + + return dest diff --git a/torchglyph/nn/__init__.py b/torchglyph/nn/__init__.py index a879623..fe4a92c 100644 --- a/torchglyph/nn/__init__.py +++ b/torchglyph/nn/__init__.py @@ -1,2 +1,3 @@ -from torchglyph.nn.embedding import * -from torchglyph.nn.rnn import * +from torchglyph.nn.embedding import TokEmbedding, SubLstmEmbedding +from torchglyph.nn.rnn import ContextualLSTM +from torchglyph.nn.contextual import ELMo diff --git a/torchglyph/nn/contextual.py b/torchglyph/nn/contextual.py new file mode 100644 index 0000000..9675c97 --- /dev/null +++ b/torchglyph/nn/contextual.py @@ -0,0 +1,57 @@ +import logging +from typing import Union + +from allennlp.data.dataset import Batch as AllenBatch +from allennlp.modules import Elmo as AllenELMo +from torch import Tensor +from torch.nn.utils.rnn import PackedSequence, pack_padded_sequence + +from torchglyph import data_path +from torchglyph.io import download_and_unzip + + +class ELMo(AllenELMo): + root = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/' + name = { + 'small': '2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_', + 'medium': '2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_', + 'original': 
'2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_', + '5.5B': '2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_', + } + + def __init__(self, options_file, weight_file, *args, pack_output, **kwargs): + super(ELMo, self).__init__( + *args, options_file=options_file, weight_file=weight_file, **kwargs) + self.pack_output = pack_output + logging.info(f'loading pretrained {self.__class__.__name__} from {weight_file}') + + @classmethod + def from_pretrained(cls, weight: str, pack_output: bool = True, + num_output_representations: int = 2, + dropout: float = 0., freeze: bool = True) -> 'ELMo': + elmo_path = data_path / cls.__name__.lower() + options_file = download_and_unzip( + url=cls.root + (cls.name[weight] + 'options.json'), + dest=elmo_path / (cls.name[weight] + 'options.json'), + ) + weight_file = download_and_unzip( + url=cls.root + (cls.name[weight] + 'weights.hdf5'), + dest=elmo_path / (cls.name[weight] + 'weights.hdf5'), + ) + return cls( + options_file=str(options_file), weight_file=str(weight_file), + num_output_representations=num_output_representations, + requires_grad=not freeze, dropout=dropout, pack_output=pack_output, + ) + + def forward(self, batch: AllenBatch) -> Union[Tensor, PackedSequence]: + outputs = super(ELMo, self).forward(batch) + elmo_representations, *_ = outputs['elmo_representations'] + if not self.pack_output: + return elmo_representations + else: + lengths = outputs['mask'].long().sum(dim=-1) + return pack_padded_sequence( + elmo_representations, lengths, + batch_first=True, enforce_sorted=False, + ) diff --git a/torchglyph/proc/__init__.py b/torchglyph/proc/__init__.py index 9416a5a..625b9dc 100644 --- a/torchglyph/proc/__init__.py +++ b/torchglyph/proc/__init__.py @@ -4,3 +4,4 @@ from torchglyph.proc.recur import * from torchglyph.proc.shape import * from torchglyph.proc.vocab import * +from torchglyph.proc.tokenizer import * diff --git a/torchglyph/proc/collecting.py b/torchglyph/proc/collecting.py index 1ce3fc5..f93a38b 100644 --- a/torchglyph/proc/collecting.py +++ b/torchglyph/proc/collecting.py @@ -1,6 +1,8 @@ from typing import Any, Union, List, Tuple import torch +from allennlp.data import Instance as AllenInstance, Vocabulary as AllenVocabulary +from allennlp.data.dataset import Batch as AllenBatch from torch import Tensor from torch.nn.utils.rnn import pad_sequence, PackedSequence, pack_sequence, pad_packed_sequence @@ -94,6 +96,13 @@ def __call__(self, data: List[Tensor], vocab: Vocab, **kwargs) -> Tensor: ) +class PadELMo(Proc): + def __call__(self, data: List[AllenInstance], *args, **kwargs) -> Tensor: + dataset = AllenBatch(data) + dataset.index_instances(AllenVocabulary()) + return dataset.as_tensor_dict()['elmo']['character_ids'] + + class PackSeq(Proc): def __init__(self, enforce_sorted: bool) -> None: super(PackSeq, self).__init__() diff --git a/torchglyph/proc/contextual.py b/torchglyph/proc/tokenizer.py similarity index 74% rename from torchglyph/proc/contextual.py rename to torchglyph/proc/tokenizer.py index 21a6571..310e354 100644 --- a/torchglyph/proc/contextual.py +++ b/torchglyph/proc/tokenizer.py @@ -1,6 +1,9 @@ from typing import Union, List import transformers +from allennlp.data import Token as AllenToken, Instance as AllenInstance +from allennlp.data.fields import TextField as AllenTextField +from allennlp.data.token_indexers import ELMoTokenCharactersIndexer from torchglyph.pipe import Proc @@ -19,6 +22,16 @@ def __call__(self, data: Union[str, List[str]], **kwargs) -> List[int]: return 
self.tokenizer.encode(data) +class ELMoTokenizer(Proc): + def __init__(self): + super(ELMoTokenizer, self).__init__() + self.tokenizer = ELMoTokenCharactersIndexer() + + def __call__(self, data: List[str], *args, **kwargs): + data = [AllenToken(token) for token in data] + return AllenInstance({"elmo": AllenTextField(data, {'character_ids': self.tokenizer})}) + + class BertTokenizer(TokenizerProc): def __init__(self, weights: str = 'bert-base-uncased'): super(BertTokenizer, self).__init__(weights=weights) From c6a8dd88de42c9e8f8e1b97ac4a1e263a4ffaa72 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Wed, 25 Mar 2020 15:04:28 +0900 Subject: [PATCH 04/66] Feat: Update extra_repr and __repr__ for ELMo --- setup.py | 3 ++- torchglyph/nn/contextual.py | 24 +++++++++++++++++++++--- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 87d14a0..7faba6e 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,8 @@ 'tqdm', 'numpy', 'einops', - 'transformers', 'allennlp' + 'transformers', + 'allennlp', ], extras_require={ 'dev': [ diff --git a/torchglyph/nn/contextual.py b/torchglyph/nn/contextual.py index 9675c97..7b85eba 100644 --- a/torchglyph/nn/contextual.py +++ b/torchglyph/nn/contextual.py @@ -19,11 +19,15 @@ class ELMo(AllenELMo): '5.5B': '2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_', } - def __init__(self, options_file, weight_file, *args, pack_output, **kwargs): + def __init__(self, *, options_file, weight_file, pack_output, **kwargs): + logging.info(f'loading pretrained {self.__class__.__name__} from {weight_file}') + super(ELMo, self).__init__( - *args, options_file=options_file, weight_file=weight_file, **kwargs) + options_file=options_file, weight_file=weight_file, **kwargs, + ) + self.pack_output = pack_output - logging.info(f'loading pretrained {self.__class__.__name__} from {weight_file}') + self.embedding_dim = self.get_output_dim() @classmethod def from_pretrained(cls, weight: str, pack_output: bool = True, @@ -44,6 +48,20 @@ def from_pretrained(cls, weight: str, pack_output: bool = True, requires_grad=not freeze, dropout=dropout, pack_output=pack_output, ) + def extra_repr(self) -> str: + args = [ + f'{self._elmo_lstm._elmo_lstm.input_size}', + f'{self._elmo_lstm._elmo_lstm.hidden_size}', + f'num_layers={self._elmo_lstm.num_layers}', + f'dropout={self._dropout.p}', + ] + if not self._elmo_lstm._requires_grad: + args.append('frozen') + return ', '.join(args) + + def __repr__(self) -> str: + return f'{self.__class__.__name__}({self.extra_repr()})' + def forward(self, batch: AllenBatch) -> Union[Tensor, PackedSequence]: outputs = super(ELMo, self).forward(batch) elmo_representations, *_ = outputs['elmo_representations'] From c284e6f782b96832b13eb1b10beba0e0e56b1a99 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Wed, 25 Mar 2020 15:09:52 +0900 Subject: [PATCH 05/66] Fix: Import path bug --- torchglyph/proc/collecting.py | 6 +++--- torchglyph/proc/tokenizer.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/torchglyph/proc/collecting.py b/torchglyph/proc/collecting.py index f93a38b..a564d45 100644 --- a/torchglyph/proc/collecting.py +++ b/torchglyph/proc/collecting.py @@ -98,9 +98,9 @@ def __call__(self, data: List[Tensor], vocab: Vocab, **kwargs) -> Tensor: class PadELMo(Proc): def __call__(self, data: List[AllenInstance], *args, **kwargs) -> Tensor: - dataset = AllenBatch(data) - dataset.index_instances(AllenVocabulary()) - return dataset.as_tensor_dict()['elmo']['character_ids'] + batch = AllenBatch(data) 
+ batch.index_instances(AllenVocabulary()) + return batch.as_tensor_dict()['elmo']['character_ids'] class PackSeq(Proc): diff --git a/torchglyph/proc/tokenizer.py b/torchglyph/proc/tokenizer.py index 310e354..f1904bc 100644 --- a/torchglyph/proc/tokenizer.py +++ b/torchglyph/proc/tokenizer.py @@ -5,7 +5,7 @@ from allennlp.data.fields import TextField as AllenTextField from allennlp.data.token_indexers import ELMoTokenCharactersIndexer -from torchglyph.pipe import Proc +from torchglyph.proc import Proc class TokenizerProc(Proc): From 10c385c2bfeed54b88608ac847143eaf9024065b Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Wed, 25 Mar 2020 15:14:45 +0900 Subject: [PATCH 06/66] Refactor: Unify naming style --- torchglyph/nn/__init__.py | 2 +- torchglyph/nn/contextual.py | 13 ++++++------- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/torchglyph/nn/__init__.py b/torchglyph/nn/__init__.py index fe4a92c..f50bd31 100644 --- a/torchglyph/nn/__init__.py +++ b/torchglyph/nn/__init__.py @@ -1,3 +1,3 @@ from torchglyph.nn.embedding import TokEmbedding, SubLstmEmbedding from torchglyph.nn.rnn import ContextualLSTM -from torchglyph.nn.contextual import ELMo +from torchglyph.nn.contextual import ELMoModel diff --git a/torchglyph/nn/contextual.py b/torchglyph/nn/contextual.py index 7b85eba..75b3a76 100644 --- a/torchglyph/nn/contextual.py +++ b/torchglyph/nn/contextual.py @@ -1,7 +1,6 @@ import logging from typing import Union -from allennlp.data.dataset import Batch as AllenBatch from allennlp.modules import Elmo as AllenELMo from torch import Tensor from torch.nn.utils.rnn import PackedSequence, pack_padded_sequence @@ -10,7 +9,7 @@ from torchglyph.io import download_and_unzip -class ELMo(AllenELMo): +class ELMoModel(AllenELMo): root = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/' name = { 'small': '2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_', @@ -19,10 +18,10 @@ class ELMo(AllenELMo): '5.5B': '2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_', } - def __init__(self, *, options_file, weight_file, pack_output, **kwargs): + def __init__(self, *, options_file: str, weight_file: str, pack_output, **kwargs) -> None: logging.info(f'loading pretrained {self.__class__.__name__} from {weight_file}') - super(ELMo, self).__init__( + super(ELMoModel, self).__init__( options_file=options_file, weight_file=weight_file, **kwargs, ) @@ -32,7 +31,7 @@ def __init__(self, *, options_file, weight_file, pack_output, **kwargs): @classmethod def from_pretrained(cls, weight: str, pack_output: bool = True, num_output_representations: int = 2, - dropout: float = 0., freeze: bool = True) -> 'ELMo': + dropout: float = 0., freeze: bool = True) -> 'ELMoModel': elmo_path = data_path / cls.__name__.lower() options_file = download_and_unzip( url=cls.root + (cls.name[weight] + 'options.json'), @@ -62,8 +61,8 @@ def extra_repr(self) -> str: def __repr__(self) -> str: return f'{self.__class__.__name__}({self.extra_repr()})' - def forward(self, batch: AllenBatch) -> Union[Tensor, PackedSequence]: - outputs = super(ELMo, self).forward(batch) + def forward(self, batch: Tensor, word_inputs: Tensor = None) -> Union[Tensor, PackedSequence]: + outputs = super(ELMoModel, self).forward(batch, word_inputs=word_inputs) elmo_representations, *_ = outputs['elmo_representations'] if not self.pack_output: return elmo_representations From 0a7a492c239f2f1bae96e4f684f686771427e103 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Wed, 25 Mar 2020 15:27:14 +0900 Subject: [PATCH 07/66] 
Feat: Add tokenizers of transformers

---
 torchglyph/proc/tokenizer.py | 100 ++++++++++++++++++++++++-----------
 1 file changed, 68 insertions(+), 32 deletions(-)

diff --git a/torchglyph/proc/tokenizer.py b/torchglyph/proc/tokenizer.py
index f1904bc..f14ebd3 100644
--- a/torchglyph/proc/tokenizer.py
+++ b/torchglyph/proc/tokenizer.py
@@ -8,13 +8,23 @@ from torchglyph.proc import Proc
-class TokenizerProc(Proc):
-    def __init__(self, weights: str) -> None:
-        super(TokenizerProc, self).__init__()
-        self.weights = weights
+class ELMoTokenizer(Proc):
+    def __init__(self):
+        super(ELMoTokenizer, self).__init__()
+        self.tokenizer = ELMoTokenCharactersIndexer()
+
+    def __call__(self, data: List[str], *args, **kwargs):
+        data = [AllenToken(token) for token in data]
+        return AllenInstance({"elmo": AllenTextField(data, {'character_ids': self.tokenizer})})
+
+
+class TransformerTokenizerProc(Proc):
+    def __init__(self, weight: str) -> None:
+        super(TransformerTokenizerProc, self).__init__()
+        self.weight = weight

     def extra_repr(self) -> str:
         return f'weight={self.weight}'

     def __call__(self, data: Union[str, List[str]], **kwargs) -> List[int]:
         if not isinstance(data, str):
             data = ' '.join(data)
         return self.tokenizer.encode(data)


-class BertTokenizer(TokenizerProc):
-    def __init__(self, weights: str = 'bert-base-uncased'):
-        super(BertTokenizer, self).__init__(weights=weights)
-        self.tokenizer = transformers.BertTokenizer.from_pretrained(weights)
+class BertTokenizer(TransformerTokenizerProc):
+    def __init__(self, weight: str = 'bert-base-uncased'):
+        super(BertTokenizer, self).__init__(weight=weight)
+        self.tokenizer = transformers.BertTokenizer.from_pretrained(weight)
+
+
+class OpenAIGPTTokenizer(TransformerTokenizerProc):
+    def __init__(self, weight: str = 'openai-gpt'):
+        super(OpenAIGPTTokenizer, self).__init__(weight=weight)
+        self.tokenizer = transformers.OpenAIGPTTokenizer.from_pretrained(weight)
+
+
+class GPT2Tokenizer(TransformerTokenizerProc):
+    def __init__(self, weight: str = 'gpt2'):
+        super(GPT2Tokenizer, self).__init__(weight=weight)
+        self.tokenizer = transformers.GPT2Tokenizer.from_pretrained(weight)
+
+
+class CTRLTokenizer(TransformerTokenizerProc):
+    def __init__(self, weight: str = 'ctrl'):
+        super(CTRLTokenizer, self).__init__(weight=weight)
+        self.tokenizer = transformers.CTRLTokenizer.from_pretrained(weight)
+
+
+class TransfoXLTokenizer(TransformerTokenizerProc):
+    def __init__(self, weight: str = 'transfo-xl-wt103'):
+        super(TransfoXLTokenizer, self).__init__(weight=weight)
+        self.tokenizer = transformers.TransfoXLTokenizer.from_pretrained(weight)
+
+
+class XLNetTokenizer(TransformerTokenizerProc):
+    def __init__(self, weight: str = 'xlnet-base-cased'):
+        super(XLNetTokenizer, self).__init__(weight=weight)
+        self.tokenizer = transformers.XLNetTokenizer.from_pretrained(weight)


-class CTRLTokenizer(TokenizerProc):
-    def
__init__(self, weights: str = 'ctrl'): - super(CTRLTokenizer, self).__init__(weights=weights) - self.tokenizer = transformers.CTRLTokenizer.from_pretrained(weights) +class DistilBertTokenizer(TransformerTokenizerProc): + def __init__(self, weight: str = 'distilbert-base-cased'): + super(DistilBertTokenizer, self).__init__(weight=weight) + self.tokenizer = transformers.DistilBertTokenizer.from_pretrained(weight) -class XLNetTokenizer(TokenizerProc): - def __init__(self, weights: str = 'xlnet-base-cased'): - super(XLNetTokenizer, self).__init__(weights=weights) - self.tokenizer = transformers.XLNetTokenizer.from_pretrained(weights) +class RobertaTokenizer(TransformerTokenizerProc): + def __init__(self, weight: str = 'roberta-base'): + super(RobertaTokenizer, self).__init__(weight=weight) + self.tokenizer = transformers.RobertaTokenizer.from_pretrained(weight) -class XLMTokenizer(TokenizerProc): - def __init__(self, weights: str = 'xlm-mlm-enfr-1024'): - super(XLMTokenizer, self).__init__(weights=weights) - self.tokenizer = transformers.XLMTokenizer.from_pretrained(weights) +class XLMRobertaTokenizer(TransformerTokenizerProc): + def __init__(self, weight: str = 'xlm-roberta-base'): + super(XLMRobertaTokenizer, self).__init__(weight=weight) + self.tokenizer = transformers.XLMRobertaTokenizer.from_pretrained(weight) -class BartTokenizer(TokenizerProc): - def __init__(self, weights: str = 'bart-large'): - super(BartTokenizer, self).__init__(weights=weights) - self.tokenizer = transformers.BartTokenizer.from_pretrained(weights) +class BartTokenizer(TransformerTokenizerProc): + def __init__(self, weight: str = 'bart-large'): + super(BartTokenizer, self).__init__(weight=weight) + self.tokenizer = transformers.BartTokenizer.from_pretrained(weight) From db1e7bde1bbd68efa5e55d0fcd7e7f900c776655 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Thu, 26 Mar 2020 17:36:12 +0900 Subject: [PATCH 08/66] Feat: Add ELMoPipe --- torchglyph/nn/__init__.py | 1 - torchglyph/pipe/ctx.py | 18 ++++++++++++++++++ torchglyph/proc/__init__.py | 1 - 3 files changed, 18 insertions(+), 2 deletions(-) create mode 100644 torchglyph/pipe/ctx.py diff --git a/torchglyph/nn/__init__.py b/torchglyph/nn/__init__.py index f50bd31..249b291 100644 --- a/torchglyph/nn/__init__.py +++ b/torchglyph/nn/__init__.py @@ -1,3 +1,2 @@ from torchglyph.nn.embedding import TokEmbedding, SubLstmEmbedding from torchglyph.nn.rnn import ContextualLSTM -from torchglyph.nn.contextual import ELMoModel diff --git a/torchglyph/pipe/ctx.py b/torchglyph/pipe/ctx.py new file mode 100644 index 0000000..1adc83f --- /dev/null +++ b/torchglyph/pipe/ctx.py @@ -0,0 +1,18 @@ +from typing import Union + +import torch + +from torchglyph.pipe import Pipe +from torchglyph.proc import PadELMo +from torchglyph.proc import ToDevice +from torchglyph.proc.tokenizer import ELMoTokenizer + + +class ELMoPipe(Pipe): + def __init__(self, device: Union[int, torch.device]): + super(ELMoPipe, self).__init__( + pre=ELMoTokenizer(), + vocab=None, + post=None, + batch=PadELMo() + ToDevice(device=device), + ) diff --git a/torchglyph/proc/__init__.py b/torchglyph/proc/__init__.py index 625b9dc..9416a5a 100644 --- a/torchglyph/proc/__init__.py +++ b/torchglyph/proc/__init__.py @@ -4,4 +4,3 @@ from torchglyph.proc.recur import * from torchglyph.proc.shape import * from torchglyph.proc.vocab import * -from torchglyph.proc.tokenizer import * From 63989bba7f6b5db6864dbea7a052a28043b5da76 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Fri, 27 Mar 2020 01:04:22 +0900 Subject: [PATCH 09/66] 
Feat: Update ToDevice --- torchglyph/proc/collecting.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/torchglyph/proc/collecting.py b/torchglyph/proc/collecting.py index a564d45..9b561ab 100644 --- a/torchglyph/proc/collecting.py +++ b/torchglyph/proc/collecting.py @@ -11,7 +11,8 @@ class ToDevice(Proc): - Batch = Union[Tensor, PackedSequence, Tuple[Union[Tensor, PackedSequence], ...]] + Item = Union[int, float, bool, Tensor, PackedSequence] + Batch = Union[Item, Tuple[Item, ...]] def __init__(self, device: Union[int, torch.device]) -> None: super(ToDevice, self).__init__() @@ -26,6 +27,8 @@ def extra_repr(self) -> str: return f'{self.device}' def __call__(self, batch: Batch, vocab: Vocab, **kwargs) -> Batch: + if isinstance(batch, (int, float, str, bool)): + return batch if isinstance(batch, (PackedSequence, Tensor)): return batch.to(self.device) return type(batch)([self(e, vocab=vocab) for e in batch]) From e04864641dfbe87a07e77905517ec7fb6a5ce137 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Tue, 31 Mar 2020 17:19:50 +0900 Subject: [PATCH 10/66] Feat: Add connection.py --- torchglyph/nn/connection.py | 81 +++++++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100644 torchglyph/nn/connection.py diff --git a/torchglyph/nn/connection.py b/torchglyph/nn/connection.py new file mode 100644 index 0000000..81b2df2 --- /dev/null +++ b/torchglyph/nn/connection.py @@ -0,0 +1,81 @@ +from typing import Union + +import torch +from torch import Tensor +from torch import nn +from torch.nn.utils.rnn import PackedSequence + + +class ResNorm(nn.Module): + """ + https://www.cv-foundation.org/openaccess/content_cvpr_2016/papers/He_Deep_Residual_Learning_CVPR_2016_paper.pdf + """ + + def __init__(self, input_dim: int, *, sub_layer: nn.Module) -> None: + super(ResNorm, self).__init__() + self.input_dim = input_dim + self.output_dim = input_dim + + self.sub_layer = sub_layer + self.layer_norm = nn.LayerNorm(input_dim) + + def forward(self, x: Union[Tensor, PackedSequence], *args, **kwargs) -> Union[Tensor, PackedSequence]: + z = self.sub_layer(x, *args, **kwargs) + if torch.is_tensor(z): + return self.layer_norm(x + z) + elif isinstance(z, PackedSequence): + return z._replace(data=self.layer_norm(x.data + z.data)) + else: + raise NotImplementedError + + +class DenseNorm(nn.Module): + """ + http://openaccess.thecvf.com/content_cvpr_2017/papers/Huang_Densely_Connected_Convolutional_CVPR_2017_paper.pdf + """ + + def __init__(self, input_dim: int, *, sub_layer: nn.Module) -> None: + super(DenseNorm, self).__init__() + self.input_dim = input_dim + self.output_dim = input_dim * 2 + + self.sub_layer = sub_layer + self.layer_norm = nn.LayerNorm(input_dim * 2) + + def forward(self, x: Union[Tensor, PackedSequence], *args, **kwargs) -> Union[Tensor, PackedSequence]: + z = self.sub_layer(x, *args, **kwargs) + if torch.is_tensor(z): + return self.layer_norm(torch.cat([x, z], dim=-1)) + elif isinstance(z, PackedSequence): + return z._replace(data=self.layer_norm(torch.cat([x.data, z.data], dim=-1))) + else: + raise NotImplementedError + + +class ReZero(nn.Module): + """ + https://arxiv.org/pdf/2003.04887.pdf + """ + + def __init__(self, input_dim: int, *, sub_layer: nn.Module) -> None: + super(ReZero, self).__init__() + self.input_dim = input_dim + self.output_dim = input_dim + + self.sub_layer = sub_layer + self.scale = nn.Parameter( + torch.tensor([0.], dtype=torch.float32), + requires_grad=True, + ) + + def extra_repr(self) -> str: + return f'(scale): 
Parameter({self.scale.data})' + + def forward(self, x: Union[Tensor, PackedSequence], *args, **kwargs) -> Union[Tensor, PackedSequence]: + z = self.sub_layer(x, *args, **kwargs) + if torch.is_tensor(z): + return x + z * self.scale + elif isinstance(z, PackedSequence): + return z._replace(data=x.data + z.data * self.scale) + else: + raise NotImplementedError From de1f1d7ad3afda44923a9a8f9576a3028b614595 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Tue, 31 Mar 2020 17:23:57 +0900 Subject: [PATCH 11/66] Test: Add unit test for connections --- tests/test_nn/test_connection.py | 44 ++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 tests/test_nn/test_connection.py diff --git a/tests/test_nn/test_connection.py b/tests/test_nn/test_connection.py new file mode 100644 index 0000000..05550a3 --- /dev/null +++ b/tests/test_nn/test_connection.py @@ -0,0 +1,44 @@ +import torch +from hypothesis import given, strategies as st +from torch import nn + +from torchglyph.nn.connection import ResNorm, DenseNorm, ReZero + + +@given( + batch_sizes=st.lists(st.integers(1, 10), min_size=0, max_size=4), + input_dim=st.integers(1, 20), +) +def test_resnorm_shape_grad(batch_sizes, input_dim): + layer = ResNorm(input_dim=input_dim, sub_layer=nn.Linear(input_dim, input_dim)) + x = torch.rand((*batch_sizes, input_dim), requires_grad=True) + y = layer(x) + + assert y.size() == (*batch_sizes, layer.output_dim) + assert y.requires_grad + + +@given( + batch_sizes=st.lists(st.integers(1, 10), min_size=0, max_size=4), + input_dim=st.integers(1, 20), +) +def test_densenorm_shape_grad(batch_sizes, input_dim): + layer = DenseNorm(input_dim=input_dim, sub_layer=nn.Linear(input_dim, input_dim)) + x = torch.rand((*batch_sizes, input_dim), requires_grad=True) + y = layer(x) + + assert y.size() == (*batch_sizes, layer.output_dim) + assert y.requires_grad + + +@given( + batch_sizes=st.lists(st.integers(1, 10), min_size=0, max_size=4), + input_dim=st.integers(1, 20), +) +def test_rezero_shape_grad(batch_sizes, input_dim): + layer = ReZero(input_dim=input_dim, sub_layer=nn.Linear(input_dim, input_dim)) + x = torch.rand((*batch_sizes, input_dim), requires_grad=True) + y = layer(x) + + assert y.size() == (*batch_sizes, layer.output_dim) + assert y.requires_grad From 1d5a61bd8d402811129cf383f90b558c9fdb06dc Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Wed, 1 Apr 2020 14:25:30 +0900 Subject: [PATCH 12/66] Feat: Add from_numpy to ToTensor --- torchglyph/proc/collecting.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/torchglyph/proc/collecting.py b/torchglyph/proc/collecting.py index 9b561ab..e95f5c0 100644 --- a/torchglyph/proc/collecting.py +++ b/torchglyph/proc/collecting.py @@ -1,5 +1,6 @@ from typing import Any, Union, List, Tuple +import numpy as np import torch from allennlp.data import Instance as AllenInstance, Vocabulary as AllenVocabulary from allennlp.data.dataset import Batch as AllenBatch @@ -44,7 +45,9 @@ def extra_repr(self) -> str: def __call__(self, data: Any, **kwargs) -> Tensor: try: - return torch.tensor(data, dtype=self.dtype, requires_grad=False) + if isinstance(data, np.ndarray): + return torch.from_numpy(data).to(dtype=self.dtype).requires_grad_(False) + return torch.tensor(data, dtype=self.dtype).requires_grad_(False) except ValueError as err: if err.args[0] == "too many dimensions 'str'": raise ValueError(f"'{data}' can not be converted to {Tensor.__name__}") From c30f2a199739d994f74cd1ad6cedaa9c9430e46e Mon Sep 17 00:00:00 2001 From: speedcell4 Date: 
Wed, 1 Apr 2020 15:09:25 +0900 Subject: [PATCH 13/66] Feat: Add ELMoForManyLanguage --- setup.py | 1 + torchglyph/nn/contextual.py | 97 +++++++++++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+) diff --git a/setup.py b/setup.py index 7faba6e..16ae9e6 100644 --- a/setup.py +++ b/setup.py @@ -19,6 +19,7 @@ 'einops', 'transformers', 'allennlp', + 'elmoformanylangs', ], extras_require={ 'dev': [ diff --git a/torchglyph/nn/contextual.py b/torchglyph/nn/contextual.py index 75b3a76..8c58a5d 100644 --- a/torchglyph/nn/contextual.py +++ b/torchglyph/nn/contextual.py @@ -1,13 +1,22 @@ +import json import logging +from pathlib import Path +from typing import List from typing import Union from allennlp.modules import Elmo as AllenELMo +from elmoformanylangs.elmo import read_list, create_batches, recover +from elmoformanylangs.frontend import Model +from elmoformanylangs.modules.embedding_layer import EmbeddingLayer from torch import Tensor from torch.nn.utils.rnn import PackedSequence, pack_padded_sequence +from torch.nn.utils.rnn import pack_sequence from torchglyph import data_path from torchglyph.io import download_and_unzip +logging.getLogger('elmoformanylangs').disabled = True + class ELMoModel(AllenELMo): root = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/' @@ -72,3 +81,91 @@ def forward(self, batch: Tensor, word_inputs: Tensor = None) -> Union[Tensor, Pa elmo_representations, lengths, batch_first=True, enforce_sorted=False, ) + + +class ELMoForManyLanguage(Model): + def __init__(self, *, options_file: Path, weight_file: Path, pack_output) -> None: + with options_file.open('r', encoding='utf-8') as fp: + config = json.load(fp) + + if config['token_embedder']['char_dim'] > 0: + char_lexicon = {} + with (weight_file / 'char.dic').open('r', encoding='utf-8') as fp: + for raw in fp: + tokens = raw.strip().split('\t') + if len(tokens) == 1: + tokens.insert(0, '\u3000') + token, index = tokens + char_lexicon[token] = int(index) + char_emb_layer = EmbeddingLayer( + config['token_embedder']['char_dim'], char_lexicon, + fix_emb=False, embs=None, + ) + else: + char_lexicon = None + char_emb_layer = None + + if config['token_embedder']['word_dim'] > 0: + word_lexicon = {} + with (weight_file / 'word.dic').open('r', encoding='utf-8') as fp: + for raw in fp: + tokens = raw.strip().split('\t') + if len(tokens) == 1: + tokens.insert(0, '\u3000') + token, index = tokens + word_lexicon[token] = int(index) + word_emb_layer = EmbeddingLayer( + config['token_embedder']['word_dim'], word_lexicon, + fix_emb=False, embs=None, + ) + else: + word_lexicon = None + word_emb_layer = None + + super(ELMoForManyLanguage, self).__init__( + config=config, word_emb_layer=word_emb_layer, + char_emb_layer=char_emb_layer, use_cuda=False, + ) + self.load_model(path=weight_file) + self.char_lexicon = char_lexicon + self.word_lexicon = word_lexicon + self.pack_output = pack_output + self.encoding_dim = self.output_dim * 2 + + @classmethod + def from_pretraiend(cls, path: Path, pack_output: bool = True): + with (path / 'config.json').open('r', encoding='utf-8') as fp: + args = json.load(fp) + return cls( + options_file=path / args['config_path'], + weight_file=path, pack_output=pack_output, + ) + + def forward(self, batch: List[List[str]], output_layer: int = -1) -> Union[Tensor, PackedSequence]: + if self.config['token_embedder']['name'].lower() == 'cnn': + pad, text = read_list(batch, self.config['token_embedder']['max_characters_per_token']) + else: + pad, text = read_list(batch) + + pad_w, pad_c, pad_ln, 
pad_mask, pad_text, recover_idx = create_batches( + pad, len(text), self.word_lexicon, self.char_lexicon, self.config, text=text) + + ans = [] + for word, char, length, mask, pads in zip(pad_w, pad_c, pad_ln, pad_mask, pad_text): + output = super(ELMoForManyLanguage, self).forward(word, char, mask) + for index, text in enumerate(pads): + if self.config['encoder']['name'].lower() == 'lstm': + data = output[index, 1:length[index] - 1, :] + elif self.config['encoder']['name'].lower() == 'elmo': + data = output[:, index, 1:length[index] - 1, :] + + if output_layer == -1: + payload = data.mean(dim=0) + else: + payload = data[output_layer] + ans.append(payload) + + ans = recover(ans, recover_idx) + if self.pack_output: + ans = pack_sequence(ans, enforce_sorted=False) + return ans From dbc38bc1709a777728f3f1617600e6c8a4c68d63 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Wed, 1 Apr 2020 15:26:10 +0900 Subject: [PATCH 14/66] Feat: Handle download for ELMoForManyLanguage --- torchglyph/nn/contextual.py | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/torchglyph/nn/contextual.py b/torchglyph/nn/contextual.py index 8c58a5d..6465cc2 100644 --- a/torchglyph/nn/contextual.py +++ b/torchglyph/nn/contextual.py @@ -83,7 +83,18 @@ def forward(self, batch: Tensor, word_inputs: Tensor = None) -> Union[Tensor, Pa ) -class ELMoForManyLanguage(Model): +class ELMoForManyLanguages(Model): + root = 'http://vectors.nlpl.eu/repository/11/' + configs = [ + 'https://raw.githubusercontent.com/HIT-SCIR/ELMoForManyLangs/master/configs/cnn_0_100_512_4096_sample.json', + 'https://raw.githubusercontent.com/HIT-SCIR/ELMoForManyLangs/master/configs/cnn_50_100_512_4096_sample.json', + ] + names = { + 'ca': '138', + 'es': '145', + 'zh': '179', + } + def __init__(self, *, options_file: Path, weight_file: Path, pack_output) -> None: with options_file.open('r', encoding='utf-8') as fp: config = json.load(fp) @@ -122,7 +133,7 @@ def __init__(self, *, options_file: Path, weight_file: Path, pack_output) -> Non word_lexicon = None word_emb_layer = None - super(ELMoForManyLanguage, self).__init__( + super(ELMoForManyLanguages, self).__init__( config=config, word_emb_layer=word_emb_layer, char_emb_layer=char_emb_layer, use_cuda=False, ) @@ -133,7 +144,20 @@ def __init__(self, *, options_file: Path, weight_file: Path, pack_output) -> Non self.encoding_dim = self.output_dim * 2 @classmethod - def from_pretraiend(cls, path: Path, pack_output: bool = True): + def from_pretraiend(cls, lang: str, pack_output: bool = True) -> 'ELMoForManyLanguages': + download_and_unzip( + url=cls.configs[0], + dest=data_path / cls.__name__.lower() / 'configs' / Path(cls.configs[0]).name, + ) + download_and_unzip( + url=cls.configs[1], + dest=data_path / cls.__name__.lower() / 'configs' / Path(cls.configs[1]).name, + ) + path = download_and_unzip( + url=cls.root + f'{cls.names[lang]}.zip', + dest=data_path / cls.__name__.lower() / lang / f'{lang}.zip', + ).parent + with (path / 'config.json').open('r', encoding='utf-8') as fp: args = json.load(fp) return cls( @@ -152,7 +176,7 @@ def forward(self, batch: List[List[str]], output_layer: int = -1) -> Union[Tenso ans = [] for word, char, length, mask, pads in zip(pad_w, pad_c, pad_ln, pad_mask, pad_text): - output = super(ELMoForManyLanguage, self).forward(word, char, mask) + output = super(ELMoForManyLanguages, self).forward(word, char, mask) for index, text in enumerate(pads): if self.config['encoder']['name'].lower() == 'lstm': data = output[index, 
1:length[index] - 1, :] From 82fd821a55346fbec93d192f2312f493633b146f Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Wed, 1 Apr 2020 15:38:23 +0900 Subject: [PATCH 15/66] Feat: Update extra_repr for both ELMo and ELMoForManyLanguages --- torchglyph/nn/contextual.py | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/torchglyph/nn/contextual.py b/torchglyph/nn/contextual.py index 6465cc2..6896559 100644 --- a/torchglyph/nn/contextual.py +++ b/torchglyph/nn/contextual.py @@ -35,7 +35,7 @@ def __init__(self, *, options_file: str, weight_file: str, pack_output, **kwargs ) self.pack_output = pack_output - self.embedding_dim = self.get_output_dim() + self.encoding_dim = self.get_output_dim() @classmethod def from_pretrained(cls, weight: str, pack_output: bool = True, @@ -58,8 +58,7 @@ def from_pretrained(cls, weight: str, pack_output: bool = True, def extra_repr(self) -> str: args = [ - f'{self._elmo_lstm._elmo_lstm.input_size}', - f'{self._elmo_lstm._elmo_lstm.hidden_size}', + f'encoding_dim={self.encoding_dim}', f'num_layers={self._elmo_lstm.num_layers}', f'dropout={self._dropout.p}', ] @@ -95,7 +94,7 @@ class ELMoForManyLanguages(Model): 'zh': '179', } - def __init__(self, *, options_file: Path, weight_file: Path, pack_output) -> None: + def __init__(self, *, options_file: Path, weight_file: Path, pack_output: bool, requires_grad: bool) -> None: with options_file.open('r', encoding='utf-8') as fp: config = json.load(fp) @@ -140,11 +139,14 @@ def __init__(self, *, options_file: Path, weight_file: Path, pack_output) -> Non self.load_model(path=weight_file) self.char_lexicon = char_lexicon self.word_lexicon = word_lexicon + + self.lang = weight_file.name + self.requires_grad = requires_grad self.pack_output = pack_output self.encoding_dim = self.output_dim * 2 @classmethod - def from_pretraiend(cls, lang: str, pack_output: bool = True) -> 'ELMoForManyLanguages': + def from_pretraiend(cls, lang: str, pack_output: bool = True, freeze: bool = True) -> 'ELMoForManyLanguages': download_and_unzip( url=cls.configs[0], dest=data_path / cls.__name__.lower() / 'configs' / Path(cls.configs[0]).name, @@ -161,10 +163,23 @@ def from_pretraiend(cls, lang: str, pack_output: bool = True) -> 'ELMoForManyLan with (path / 'config.json').open('r', encoding='utf-8') as fp: args = json.load(fp) return cls( - options_file=path / args['config_path'], + options_file=path / args['config_path'], requires_grad=not freeze, weight_file=path, pack_output=pack_output, ) + def extra_repr(self) -> str: + args = [ + f'lang={self.lang}', f'encoding_dim={self.encoding_dim}', + f'word_vocab={len(self.word_lexicon) if self.word_lexicon is not None else None}', + f'char_vocab={len(self.char_lexicon) if self.char_lexicon is not None else None}', + ] + if not self.requires_grad: + args.append('frozen') + return ', '.join(args) + + def __repr__(self) -> str: + return f'{self.__class__.__name__}({self.extra_repr()})' + def forward(self, batch: List[List[str]], output_layer: int = -1) -> Union[Tensor, PackedSequence]: if self.config['token_embedder']['name'].lower() == 'cnn': pad, text = read_list(batch, self.config['token_embedder']['max_characters_per_token']) @@ -187,7 +202,7 @@ def forward(self, batch: List[List[str]], output_layer: int = -1) -> Union[Tenso payload = data.mean(dim=0) else: payload = data[output_layer] - ans.append(payload) + ans.append(payload if self.requires_grad else payload.detach()) ans = recover(ans, recover_idx) if self.pack_output: From 
4d61b5a7c6e06c118afbc63ae2f81b0ec51c6d38 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Wed, 1 Apr 2020 15:54:30 +0900 Subject: [PATCH 16/66] Feat: Toggle loggers of allennlp --- torchglyph/io.py | 9 ++++++++- torchglyph/nn/contextual.py | 5 +++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/torchglyph/io.py b/torchglyph/io.py index 271ab87..99d683e 100644 --- a/torchglyph/io.py +++ b/torchglyph/io.py @@ -1,12 +1,13 @@ import gzip import logging import os +import re import shutil import tarfile import zipfile from contextlib import contextmanager from pathlib import Path -from typing import Union, TextIO +from typing import Union, TextIO, Pattern from urllib.request import urlretrieve from tqdm import tqdm @@ -78,3 +79,9 @@ def download_and_unzip(url: str, dest: Path) -> Path: shutil.copyfileobj(fsrc, fdst) return dest + + +def toggle_loggers(pattern: Union[str, Pattern], enable: bool) -> None: + for name in logging.root.manager.loggerDict: # type:str + if re.match(pattern, name) is not None: + logging.getLogger(name).disabled = not enable diff --git a/torchglyph/nn/contextual.py b/torchglyph/nn/contextual.py index 6896559..d959001 100644 --- a/torchglyph/nn/contextual.py +++ b/torchglyph/nn/contextual.py @@ -13,9 +13,10 @@ from torch.nn.utils.rnn import pack_sequence from torchglyph import data_path -from torchglyph.io import download_and_unzip +from torchglyph.io import download_and_unzip, toggle_loggers -logging.getLogger('elmoformanylangs').disabled = True +toggle_loggers('allennlp', False) +toggle_loggers('elmoformanylangs', False) class ELMoModel(AllenELMo): From 85c92bfbbf0a7e41d3c521958009592558c89fbd Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Wed, 1 Apr 2020 15:58:57 +0900 Subject: [PATCH 17/66] Refactor: Separate fetch --- torchglyph/nn/contextual.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/torchglyph/nn/contextual.py b/torchglyph/nn/contextual.py index d959001..2c6aea3 100644 --- a/torchglyph/nn/contextual.py +++ b/torchglyph/nn/contextual.py @@ -39,9 +39,7 @@ def __init__(self, *, options_file: str, weight_file: str, pack_output, **kwargs self.encoding_dim = self.get_output_dim() @classmethod - def from_pretrained(cls, weight: str, pack_output: bool = True, - num_output_representations: int = 2, - dropout: float = 0., freeze: bool = True) -> 'ELMoModel': + def fetch(cls, weight: str): elmo_path = data_path / cls.__name__.lower() options_file = download_and_unzip( url=cls.root + (cls.name[weight] + 'options.json'), @@ -51,6 +49,13 @@ def from_pretrained(cls, weight: str, pack_output: bool = True, url=cls.root + (cls.name[weight] + 'weights.hdf5'), dest=elmo_path / (cls.name[weight] + 'weights.hdf5'), ) + return options_file, weight_file + + @classmethod + def from_pretrained(cls, weight: str, pack_output: bool = True, + num_output_representations: int = 2, + dropout: float = 0., freeze: bool = True) -> 'ELMoModel': + options_file, weight_file = cls.fetch(weight=weight) return cls( options_file=str(options_file), weight_file=str(weight_file), num_output_representations=num_output_representations, @@ -147,7 +152,7 @@ def __init__(self, *, options_file: Path, weight_file: Path, pack_output: bool, self.encoding_dim = self.output_dim * 2 @classmethod - def from_pretraiend(cls, lang: str, pack_output: bool = True, freeze: bool = True) -> 'ELMoForManyLanguages': + def fetch(cls, lang: str): download_and_unzip( url=cls.configs[0], dest=data_path / cls.__name__.lower() / 'configs' / Path(cls.configs[0]).name, @@ 
-156,11 +161,15 @@ def from_pretraiend(cls, lang: str, pack_output: bool = True, freeze: bool = Tru url=cls.configs[1], dest=data_path / cls.__name__.lower() / 'configs' / Path(cls.configs[1]).name, ) - path = download_and_unzip( + return download_and_unzip( url=cls.root + f'{cls.names[lang]}.zip', dest=data_path / cls.__name__.lower() / lang / f'{lang}.zip', ).parent + @classmethod + def from_pretraiend(cls, lang: str, pack_output: bool = True, freeze: bool = True) -> 'ELMoForManyLanguages': + path = cls.fetch(lang=lang) + with (path / 'config.json').open('r', encoding='utf-8') as fp: args = json.load(fp) return cls( From 588d082457634ddd0a42b098752ea9bce6c8f5e5 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Wed, 1 Apr 2020 16:32:37 +0900 Subject: [PATCH 18/66] Refactor: Turn off allennlp and transformers --- torchglyph/proc/collecting.py | 3 +++ torchglyph/proc/tokenizer.py | 5 ++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/torchglyph/proc/collecting.py b/torchglyph/proc/collecting.py index e95f5c0..5f723f8 100644 --- a/torchglyph/proc/collecting.py +++ b/torchglyph/proc/collecting.py @@ -7,9 +7,12 @@ from torch import Tensor from torch.nn.utils.rnn import pad_sequence, PackedSequence, pack_sequence, pad_packed_sequence +from torchglyph.io import toggle_loggers from torchglyph.proc import Proc, Chain, stoi from torchglyph.vocab import Vocab +toggle_loggers('allennlp', False) + class ToDevice(Proc): Item = Union[int, float, bool, Tensor, PackedSequence] diff --git a/torchglyph/proc/tokenizer.py b/torchglyph/proc/tokenizer.py index f14ebd3..e2fe321 100644 --- a/torchglyph/proc/tokenizer.py +++ b/torchglyph/proc/tokenizer.py @@ -5,7 +5,10 @@ from allennlp.data.fields import TextField as AllenTextField from allennlp.data.token_indexers import ELMoTokenCharactersIndexer -from torchglyph.proc import Proc +from torchglyph.proc import Proc, toggle_loggers + +toggle_loggers('allennlp', False) +toggle_loggers('transformers', False) class ELMoTokenizer(Proc): From 497d368192128646ce538c021bf3438c44c9550e Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Fri, 3 Apr 2020 21:13:50 +0900 Subject: [PATCH 19/66] Fix: typo --- torchglyph/nn/contextual.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchglyph/nn/contextual.py b/torchglyph/nn/contextual.py index 2c6aea3..fdf102c 100644 --- a/torchglyph/nn/contextual.py +++ b/torchglyph/nn/contextual.py @@ -167,7 +167,7 @@ def fetch(cls, lang: str): ).parent @classmethod - def from_pretraiend(cls, lang: str, pack_output: bool = True, freeze: bool = True) -> 'ELMoForManyLanguages': + def from_pretrained(cls, lang: str, pack_output: bool = True, freeze: bool = True) -> 'ELMoForManyLanguages': path = cls.fetch(lang=lang) with (path / 'config.json').open('r', encoding='utf-8') as fp: From cf9daf48beee0a8ad06390d5ae98b26a1bb05e6f Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Fri, 3 Apr 2020 22:08:49 +0900 Subject: [PATCH 20/66] Refactor: Change num_output_representations default value --- torchglyph/nn/contextual.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchglyph/nn/contextual.py b/torchglyph/nn/contextual.py index fdf102c..3ee8d41 100644 --- a/torchglyph/nn/contextual.py +++ b/torchglyph/nn/contextual.py @@ -53,7 +53,7 @@ def fetch(cls, weight: str): @classmethod def from_pretrained(cls, weight: str, pack_output: bool = True, - num_output_representations: int = 2, + num_output_representations: int = 1, dropout: float = 0., freeze: bool = True) -> 'ELMoModel': options_file, weight_file = 
cls.fetch(weight=weight) return cls( From 25e51b00155e2fe12d0c0a5fa672ad5900cbcca7 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Fri, 3 Apr 2020 22:34:06 +0900 Subject: [PATCH 21/66] Refactor: Update ToDevice --- torchglyph/proc/collecting.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/torchglyph/proc/collecting.py b/torchglyph/proc/collecting.py index 5f723f8..62b2d68 100644 --- a/torchglyph/proc/collecting.py +++ b/torchglyph/proc/collecting.py @@ -31,11 +31,11 @@ def extra_repr(self) -> str: return f'{self.device}' def __call__(self, batch: Batch, vocab: Vocab, **kwargs) -> Batch: - if isinstance(batch, (int, float, str, bool)): - return batch - if isinstance(batch, (PackedSequence, Tensor)): - return batch.to(self.device) - return type(batch)([self(e, vocab=vocab) for e in batch]) + if isinstance(batch, (Tensor, PackedSequence)): + return batch.to(device=self.device) + if isinstance(batch, (list, tuple)): + return type(batch)([self(e, vocab=vocab) for e in batch]) + return batch class ToTensor(Proc): From 107425c64e1e88f59ccfe30551d0b2afa95ca772 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Fri, 10 Apr 2020 17:15:56 +0900 Subject: [PATCH 22/66] Feat: Add remove_missing flag --- .../test_datasets/test_sequential_labeling.py | 8 ++-- .../test_datasets/test_text_classification.py | 2 +- torchglyph/datasets/sequential_labeling.py | 27 ++++++++++--- torchglyph/datasets/text_classification.py | 9 ++++- torchglyph/proc/vocab.py | 25 +++++++----- torchglyph/vocab.py | 38 +++---------------- 6 files changed, 53 insertions(+), 56 deletions(-) diff --git a/tests/test_datasets/test_sequential_labeling.py b/tests/test_datasets/test_sequential_labeling.py index 94a7e6d..fab9eb7 100644 --- a/tests/test_datasets/test_sequential_labeling.py +++ b/tests/test_datasets/test_sequential_labeling.py @@ -3,27 +3,27 @@ def test_conll2000_chunking(): - train, test = CoNLL2000Chunking.new(batch_size=1, word_dim=None) + train, test = CoNLL2000Chunking.new(batch_size=1, word_dim=None, remove_missing=True) assert len(train.dataset) == 8936 assert len(test.dataset) == 2012 def test_conll2003_ner(): - train, dev, test = CoNLL2003NER.new(batch_size=1, word_dim=None) + train, dev, test = CoNLL2003NER.new(batch_size=1, word_dim=None, remove_missing=True) assert len(train.dataset) == 14987 assert len(dev.dataset) == 3466 assert len(test.dataset) == 3684 def test_semeval2010_catalan(): - train, dev, test = SemEval2010T1NERCatalan.new(batch_size=1, word_dim=None) + train, dev, test = SemEval2010T1NERCatalan.new(batch_size=1, word_dim=None, remove_missing=True) assert len(train.dataset) == 8709 assert len(dev.dataset) == 1445 assert len(test.dataset) == 1698 def test_semeval2010_spanish(): - train, dev, test = SemEval2010T1NERSpanish.new(batch_size=1, word_dim=None) + train, dev, test = SemEval2010T1NERSpanish.new(batch_size=1, word_dim=None, remove_missing=True) assert len(train.dataset) == 9022 assert len(dev.dataset) == 1419 assert len(test.dataset) == 1705 diff --git a/tests/test_datasets/test_text_classification.py b/tests/test_datasets/test_text_classification.py index 587e0c1..f130e80 100644 --- a/tests/test_datasets/test_text_classification.py +++ b/tests/test_datasets/test_text_classification.py @@ -2,6 +2,6 @@ def test_agnews(): - train, test = AgNews.new(batch_size=1, word_dim=None) + train, test = AgNews.new(batch_size=1, word_dim=None, remove_missing=True) assert len(train) == 120000 assert len(test) == 7600 diff --git a/torchglyph/datasets/sequential_labeling.py 
b/torchglyph/datasets/sequential_labeling.py index b5f2405..369437c 100644 --- a/torchglyph/datasets/sequential_labeling.py +++ b/torchglyph/datasets/sequential_labeling.py @@ -34,10 +34,15 @@ def dump(self, fp: TextIO, batch: NamedTuple, prediction: List[List[int]], *args conllx.dump(zip(raw_word, raw_pos, raw_chunk, pred_chunk), fp, sep=' ') @classmethod - def new(cls, batch_size: int, word_dim: Optional[int], device: int = -1) -> Tuple[DataLoader, ...]: + def new(cls, batch_size: int, word_dim: Optional[int], + remove_missing: bool, device: int = -1) -> Tuple[DataLoader, ...]: + if word_dim is not None: + vectors = LoadGlove(name='6B', dim=word_dim, remove_missing=remove_missing) + else: + vectors = Identity() word = PackedTokSeqPipe(device=device, unk_token='').with_( pre=ReplaceDigits(repl_token='') + ..., - vocab=... + (Identity() if word_dim is None else LoadGlove('6B', word_dim, str.lower)), + vocab=... + vectors, ) length = SeqLengthTensorPipe(device=device) char = PackedTokBlockPipe(device=device, unk_token='') @@ -90,10 +95,15 @@ def dump(self, fp: TextIO, batch: NamedTuple, prediction: List[List[int]], *args conllx.dump(zip(raw_word, raw_pos, raw_chunk, raw_ner, pred_ner), fp, sep=' ') @classmethod - def new(cls, batch_size: int, word_dim: Optional[int], device: int = -1) -> Tuple[DataLoader, ...]: + def new(cls, batch_size: int, word_dim: Optional[int], + remove_missing: bool, device: int = -1) -> Tuple[DataLoader, ...]: + if word_dim is not None: + vectors = LoadGlove(name='6B', dim=word_dim, remove_missing=remove_missing) + else: + vectors = Identity() word = PackedTokSeqPipe(device=device, unk_token='').with_( pre=ReplaceDigits(repl_token='') + ..., - vocab=... + (Identity() if word_dim is None else LoadGlove('6B', word_dim, str.lower)), + vocab=... + vectors, ) length = SeqLengthTensorPipe(device=device) char = PackedTokBlockPipe(device=device, unk_token='') @@ -148,10 +158,15 @@ def dump(self, fp: TextIO, batch: NamedTuple, prediction: List[Any], *args, **kw conllx.dump(zip(raw_word, raw_pos, raw_ner, pred_ner), fp, sep=' ') @classmethod - def new(cls, batch_size: int, word_dim: Optional[int], device: int = -1) -> Tuple['DataLoader', ...]: + def new(cls, batch_size: int, word_dim: Optional[int], + remove_missing: bool, device: int = -1) -> Tuple['DataLoader', ...]: + if word_dim is not None: + vectors = LoadFastText(str.lower, lang=cls.lang, remove_missing=remove_missing) + else: + vectors = Identity() word = PackedTokSeqPipe(device=device, unk_token='').with_( pre=Prepend('', 1) + ReplaceDigits(repl_token='') + ..., - vocab=... + (Identity() if word_dim is None else LoadFastText(cls.lang, str.lower)), + vocab=... + vectors, ) length = SeqLengthTensorPipe(device=device).with_(pre=Prepend('', 1) + ...) 
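[Editor's note] The `remove_missing` flag threaded through these `new` constructors prunes the vocabulary down to tokens the pretrained table can actually embed, directly or through a fallback such as `str.lower`. A minimal standalone sketch of that pruning idea, using only the standard library (the helper name `prune_to_pretrained` is made up for illustration; the library's own version is `Vocab.union`, refined later in this series by PATCH 24):

```python
from collections import Counter
from typing import Callable, Set

def prune_to_pretrained(freq: Counter, pretrained: Set[str],
                        *fallback_fns: Callable[[str], str]) -> Counter:
    # keep tokens that have a vector, directly or via a fallback alias
    kept = Counter()
    for token, count in freq.items():
        if token in pretrained:
            kept[token] += count
            continue
        for fallback_fn in fallback_fns:
            alias = fallback_fn(token)
            if alias in pretrained:
                kept[alias] += count
                break
    return kept

freq = Counter({'Dog': 3, 'cat': 2, 'xyzzy': 1})
print(prune_to_pretrained(freq, {'dog', 'cat'}, str.lower))
# Counter({'dog': 3, 'cat': 2}) -- 'xyzzy' has no vector and is dropped
```
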
char = PackedTokBlockPipe(device=device, unk_token='').with_( diff --git a/torchglyph/datasets/text_classification.py b/torchglyph/datasets/text_classification.py index 013709f..169b78d 100644 --- a/torchglyph/datasets/text_classification.py +++ b/torchglyph/datasets/text_classification.py @@ -39,9 +39,14 @@ def dump(self, fp: TextIO, batch: NamedTuple, prediction: List[int], *args, **kw csv.dump((' '.join(raw_title), ' '.join(raw_text), raw_target, vocab.itos[pred]), fp) @classmethod - def new(cls, batch_size: int, word_dim: Optional[int], device: int = -1) -> Tuple['DataLoader', ...]: + def new(cls, batch_size: int, word_dim: Optional[int], + remove_missing: bool, device: int = -1) -> Tuple['DataLoader', ...]: + if word_dim is not None: + vectors = LoadGlove(name='6B', dim=word_dim, remove_missing=remove_missing) + else: + vectors = Identity() word = PackedTokSeqPipe(device=device, unk_token='').with_( - vocab=... + (Identity() if word_dim is None else LoadGlove(name='6B', dim=word_dim)), + vocab=... + vectors, ) target = TokTensorPipe(device=device, unk_token=None) diff --git a/torchglyph/proc/vocab.py b/torchglyph/proc/vocab.py index f42f2a3..b34bda2 100644 --- a/torchglyph/proc/vocab.py +++ b/torchglyph/proc/vocab.py @@ -76,21 +76,24 @@ def __call__(self, vocab: Vocab, name: str, *args, **kwargs) -> Vocab: class LoadVectors(Proc): - def __init__(self, vectors: Vectors, *fallbacks) -> None: + def __init__(self, *fallback_fns, vectors: Vectors, remove_missing: bool) -> None: super(LoadVectors, self).__init__() + self.fallback_fns = fallback_fns self.vectors = vectors - self.fallbacks = fallbacks + self.remove_missing = remove_missing def extra_repr(self) -> str: return ', '.join([ f'{self.vectors.extra_repr()}', - *[f'{f.__name__}' for f in self.fallbacks], + *[f'{f.__name__}' for f in self.fallback_fns], ]) def __call__(self, vocab: Vocab, name: str, *args, **kwargs) -> Vocab: assert vocab is not None, f"did you forget '{BuildVocab.__name__}' before '{LoadVectors.__name__}'?" 
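[Editor's note] The signature change that accompanies the flag is worth spelling out: fallback functions now come first as positionals, while `name`, `dim`, `lang`, and `remove_missing` are keyword-only. Call sites under the refactored signatures read as below; this is illustrative only (the vectors are downloaded on first use, and `dim=300` / `lang='es'` are example values, not defaults):

```python
from torchglyph.proc import LoadGlove, LoadFastText

# GloVe for the English datasets, dropping tokens without a pretrained vector
glove = LoadGlove(str.lower, name='6B', dim=300, remove_missing=True)

# fastText for the SemEval-2010 languages, same fallback and pruning behaviour
fasttext = LoadFastText(str.lower, lang='es', remove_missing=True)
```

Making everything but the fallbacks keyword-only means `name` and `dim` can no longer be swapped by position silently, which is presumably the point of the refactor.
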
- tok, occ = vocab.load_vectors(self.vectors, *self.fallbacks) + if self.remove_missing: + vocab &= self.vectors + tok, occ = vocab.load_vectors(self.vectors, *self.fallback_fns) tok = tok / max(1, len(vocab.freq.values())) * 100 occ = occ / max(1, sum(vocab.freq.values())) * 100 logging.info(f"{self.vectors} hits {tok:.1f}% tokens and {occ:.1f}% occurrences of {Vocab.__name__} '{name}'") @@ -98,16 +101,18 @@ def __call__(self, vocab: Vocab, name: str, *args, **kwargs) -> Vocab: class LoadGlove(LoadVectors): - def __init__(self, name: str, dim: int, *fallbacks) -> None: + def __init__(self, *fallback_fns, name: str, dim: int, remove_missing: bool) -> None: super(LoadGlove, self).__init__( - Glove(name=name, dim=dim), - *fallbacks, + *fallback_fns, + vectors=Glove(name=name, dim=dim), + remove_missing=remove_missing, ) class LoadFastText(LoadVectors): - def __init__(self, lang: str, *fallbacks) -> None: + def __init__(self, *fallback_fns, lang: str, remove_missing: bool) -> None: super(LoadFastText, self).__init__( - FastTest(lang=lang), - *fallbacks, + *fallback_fns, + vectors=FastTest(lang=lang), + remove_missing=remove_missing, ) diff --git a/torchglyph/vocab.py b/torchglyph/vocab.py index ca7b0f6..cb4ae8d 100644 --- a/torchglyph/vocab.py +++ b/torchglyph/vocab.py @@ -81,45 +81,15 @@ def __and__(self, rhs: Union['Counter', 'Vocab']) -> 'Vocab': if isinstance(rhs, Vocab): rhs = rhs.freq return Vocab( - counter=Counter({ - token: freq - for token, freq in self.freq.items() - if token in rhs - }), + counter=self.freq & rhs, unk_token=self.unk_token, pad_token=self.pad_token, special_tokens=self.special_tokens, max_size=self.max_size, min_freq=self.min_freq, ) - def __add__(self, rhs: Union['Counter', 'Vocab']) -> 'Vocab': - if isinstance(rhs, Vocab): - rhs = rhs.freq - return Vocab( - counter=Counter({ - token: self.freq[token] + rhs[token] - for token in {*self.freq.keys(), *rhs.keys()} - }), - unk_token=self.unk_token, - pad_token=self.pad_token, - special_tokens=self.special_tokens, - max_size=self.max_size, min_freq=self.min_freq, - ) - - def __sub__(self, rhs: Union['Counter', 'Vocab']) -> 'Vocab': - if isinstance(rhs, Vocab): - rhs = rhs.freq - return Vocab( - counter=Counter({ - token: freq - for token, freq in self.freq.items() - if token not in rhs - }), - unk_token=self.unk_token, - pad_token=self.pad_token, - special_tokens=self.special_tokens, - max_size=self.max_size, min_freq=self.min_freq, - ) + def __iand__(self, rhs: Union['Counter', 'Vocab']) -> 'Vocab': + return self.__and__(rhs=rhs) @property def pad_idx(self) -> Optional[int]: @@ -168,6 +138,8 @@ def __init__(self, urls_dest: List[Tuple[str, Path]], path: Path, self.vectors = [] self.unk_init_ = unk_init_ + print(f'path => {path}') + pt_path = path.with_suffix('.pt') if not pt_path.exists(): if not path.exists(): From f48ac4ba8242ca427c12a3a6f06c37ab7786b1fc Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Fri, 10 Apr 2020 17:28:06 +0900 Subject: [PATCH 23/66] Feat: Update remove_missing flag --- torchglyph/proc/vocab.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/torchglyph/proc/vocab.py b/torchglyph/proc/vocab.py index b34bda2..b85dfde 100644 --- a/torchglyph/proc/vocab.py +++ b/torchglyph/proc/vocab.py @@ -84,15 +84,16 @@ def __init__(self, *fallback_fns, vectors: Vectors, remove_missing: bool) -> Non def extra_repr(self) -> str: return ', '.join([ - f'{self.vectors.extra_repr()}', *[f'{f.__name__}' for f in self.fallback_fns], + f'{self.vectors.extra_repr()}', + 
f'remove_missing={self.remove_missing}', ]) def __call__(self, vocab: Vocab, name: str, *args, **kwargs) -> Vocab: assert vocab is not None, f"did you forget '{BuildVocab.__name__}' before '{LoadVectors.__name__}'?" if self.remove_missing: - vocab &= self.vectors + vocab = vocab & self.vectors tok, occ = vocab.load_vectors(self.vectors, *self.fallback_fns) tok = tok / max(1, len(vocab.freq.values())) * 100 occ = occ / max(1, sum(vocab.freq.values())) * 100 From 7b250ed73cff729e13a4d24b552f11b2c2f1cee4 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Sat, 11 Apr 2020 21:22:28 +0900 Subject: [PATCH 24/66] Fix: Bug of remove_missing flag --- torchglyph/proc/vocab.py | 4 ++-- torchglyph/vocab.py | 38 ++++++++++++++++++++++---------------- 2 files changed, 24 insertions(+), 18 deletions(-) diff --git a/torchglyph/proc/vocab.py b/torchglyph/proc/vocab.py index b85dfde..2b72b74 100644 --- a/torchglyph/proc/vocab.py +++ b/torchglyph/proc/vocab.py @@ -93,8 +93,8 @@ def __call__(self, vocab: Vocab, name: str, *args, **kwargs) -> Vocab: assert vocab is not None, f"did you forget '{BuildVocab.__name__}' before '{LoadVectors.__name__}'?" if self.remove_missing: - vocab = vocab & self.vectors - tok, occ = vocab.load_vectors(self.vectors, *self.fallback_fns) + vocab = vocab.union(self.vectors, *self.fallback_fns) + tok, occ = vocab.load_vectors(*self.fallback_fns, vectors=self.vectors) tok = tok / max(1, len(vocab.freq.values())) * 100 occ = occ / max(1, sum(vocab.freq.values())) * 100 logging.info(f"{self.vectors} hits {tok:.1f}% tokens and {occ:.1f}% occurrences of {Vocab.__name__} '{name}'") diff --git a/torchglyph/vocab.py b/torchglyph/vocab.py index cb4ae8d..6eeb0f4 100644 --- a/torchglyph/vocab.py +++ b/torchglyph/vocab.py @@ -2,7 +2,7 @@ from collections import Counter from collections import defaultdict from pathlib import Path -from typing import Union, Optional, Tuple, Callable, List +from typing import Optional, Tuple, Callable, List import torch from torch import Tensor @@ -77,20 +77,28 @@ def __len__(self) -> int: def __contains__(self, token: str) -> bool: return token in self.stoi - def __and__(self, rhs: Union['Counter', 'Vocab']) -> 'Vocab': - if isinstance(rhs, Vocab): - rhs = rhs.freq + def union(self, rhs: 'Vocab', *fallback_fns) -> 'Vocab': + counter = Counter() + + for token, freq in self.freq.items(): + if token in rhs.stoi: + counter[token] = freq + else: + for fallback_fn in fallback_fns: + new_token = fallback_fn(token) + if new_token in rhs.stoi: + counter[new_token] = freq + break + return Vocab( - counter=self.freq & rhs, + counter=counter, unk_token=self.unk_token, pad_token=self.pad_token, special_tokens=self.special_tokens, - max_size=self.max_size, min_freq=self.min_freq, + max_size=self.max_size, + min_freq=self.min_freq, ) - def __iand__(self, rhs: Union['Counter', 'Vocab']) -> 'Vocab': - return self.__and__(rhs=rhs) - @property def pad_idx(self) -> Optional[int]: if self.pad_token is None: @@ -103,12 +111,12 @@ def vec_dim(self) -> int: return 0 return self.vectors.size(1) - def load_vectors(self, vectors: 'Vectors', *fallbacks) -> Tuple[int, int]: + def load_vectors(self, *fallback_fns, vectors: 'Vectors') -> Tuple[int, int]: self.vectors = torch.empty((len(self), vectors.vec_dim), dtype=torch.float32) tok, occ = 0, 0 for token, index in self.stoi.items(): - if vectors.query_(token, self.vectors[index], *fallbacks): + if vectors.query_(token, self.vectors[index], *fallback_fns): tok += 1 occ += self.freq[token] @@ -138,8 +146,6 @@ def __init__(self, urls_dest: 
List[Tuple[str, Path]], path: Path, self.vectors = [] self.unk_init_ = unk_init_ - print(f'path => {path}') - pt_path = path.with_suffix('.pt') if not pt_path.exists(): if not path.exists(): @@ -171,12 +177,12 @@ def __init__(self, urls_dest: List[Tuple[str, Path]], path: Path, self.load(pt_path) @torch.no_grad() - def query_(self, token: str, vector: Tensor, *fallbacks) -> bool: + def query_(self, token: str, vector: Tensor, *fallback_fns) -> bool: if token in self: vector[:] = self.vectors[self.stoi[token]] return True - for fallback in fallbacks: - new_token = fallback(token) + for fallback_fn in fallback_fns: + new_token = fallback_fn(token) if new_token in self: vector[:] = self.vectors[self.stoi[new_token]] return True From 6131949266d3e514a6af6e91448d2cd52524fe41 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Tue, 28 Apr 2020 22:30:57 +0900 Subject: [PATCH 25/66] Feat: Add NLPLVectors --- torchglyph/vocab.py | 63 +++++++++++++++++++++++++++------------------ 1 file changed, 38 insertions(+), 25 deletions(-) diff --git a/torchglyph/vocab.py b/torchglyph/vocab.py index 6eeb0f4..a791dca 100644 --- a/torchglyph/vocab.py +++ b/torchglyph/vocab.py @@ -136,45 +136,46 @@ def load(self, path: Path) -> None: class Vectors(Vocab): def __init__(self, urls_dest: List[Tuple[str, Path]], path: Path, - has_head_info: bool, unk_init_: Callable[[Tensor], Tensor] = init.normal_) -> None: + heading: bool, unicode_error: str = 'replace', dtype: torch.dtype = torch.float32, + unk_init_: Callable[[Tensor], Tensor] = init.normal_) -> None: super(Vectors, self).__init__( counter=Counter(), unk_token=None, pad_token=None, special_tokens=(), max_size=None, min_freq=1, ) - self.vectors = [] + vectors = [] self.unk_init_ = unk_init_ - pt_path = path.with_suffix('.pt') - if not pt_path.exists(): + dump_path = path.with_suffix('.pt') + if not dump_path.exists(): if not path.exists(): for url, dest in urls_dest: download_and_unzip(url, dest) with path.open('rb') as fp: - vec_dim = None + vector_dim = None - iteration = tqdm(fp, desc=f'reading {path}', unit=' tokens') - for raw in iteration: # type:bytes - if has_head_info: - _, vec_dim = map(int, raw.strip().split(b' ')) - has_head_info = False + iteration = tqdm(fp, desc=f'reading {path}', unit=' lines') + for raw in iteration: # type: bytes + if heading: + _, vector_dim = map(int, raw.strip().split(b' ')) + heading = False continue token, *vs = raw.rstrip().split(b' ') - if vec_dim is None: - vec_dim = len(vs) - elif vec_dim != len(vs): - raise ValueError(f'vector dimensions are not consistent, {vec_dim} != {len(vs)}') + if vector_dim is None: + vector_dim = len(vs) + elif vector_dim != len(vs): + raise ValueError(f'vector dimensions are not consistent, {vector_dim} != {len(vs)} :: {token}') - self.add_token_(str(token, encoding='utf-8')) - self.vectors.append(torch.tensor([float(v) for v in vs], dtype=torch.float32)) + self.add_token_(str(token, encoding='utf-8', errors=unicode_error)) + vectors.append(torch.tensor([float(v) for v in vs], dtype=dtype)) - self.vectors = torch.stack(self.vectors, 0) - self.save(pt_path) + self.vectors = torch.stack(vectors, 0) + self.save(dump_path) else: - self.load(pt_path) + self.load(dump_path) @torch.no_grad() def query_(self, token: str, vector: Tensor, *fallback_fns) -> bool: @@ -192,23 +193,35 @@ def query_(self, token: str, vector: Tensor, *fallback_fns) -> bool: class Glove(Vectors): def __init__(self, name: str, dim: int) -> None: + path = data_path / f'glove.{name}' super(Glove, self).__init__( urls_dest=[( 
f'http://nlp.stanford.edu/data/glove.{name}.zip', - data_path / f'glove.{name}' / f'glove.{name}.zip' + path / f'glove.{name}.zip' )], - path=data_path / f'glove.{name}' / f'glove.{name}.{dim}d.txt', - has_head_info=False, + path=path / f'glove.{name}.{dim}d.txt', heading=False, ) class FastTest(Vectors): def __init__(self, lang: str) -> None: + path = data_path / 'fasttext' super(FastTest, self).__init__( urls_dest=[( f'https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.{lang}.vec', - data_path / 'fasttext' / f'wiki.{lang}.vec', + path / f'wiki.{lang}.vec', )], - path=data_path / 'fasttext' / f'wiki.{lang}.vec', - has_head_info=True, + path=path / f'wiki.{lang}.vec', heading=True, + ) + + +class NLPLVectors(Vectors): + def __init__(self, index: int, name: str = 'model.txt', heading: bool = False) -> None: + path = data_path / 'nlpl' / f'{index}' + super(NLPLVectors, self).__init__( + urls_dest=[( + f'http://vectors.nlpl.eu/repository/20/{index}.zip', + path / f'{index}.zip', + )], + path=path / name, heading=heading, ) From 224a00e6c6a128ae16007c24f7dad5dfbfa13fa3 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Tue, 28 Apr 2020 23:16:55 +0900 Subject: [PATCH 26/66] Chore: Ignore dimension inconsistent token --- torchglyph/vocab.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/torchglyph/vocab.py b/torchglyph/vocab.py index a791dca..a9e4095 100644 --- a/torchglyph/vocab.py +++ b/torchglyph/vocab.py @@ -156,10 +156,9 @@ def __init__(self, urls_dest: List[Tuple[str, Path]], path: Path, with path.open('rb') as fp: vector_dim = None - iteration = tqdm(fp, desc=f'reading {path}', unit=' lines') - for raw in iteration: # type: bytes + for raw in tqdm(fp, desc=f'reading {path}', unit=' lines'): # type: bytes if heading: - _, vector_dim = map(int, raw.strip().split(b' ')) + _, vector_dim = map(int, raw.rstrip().split(b' ')) heading = False continue token, *vs = raw.rstrip().split(b' ') @@ -167,7 +166,8 @@ def __init__(self, urls_dest: List[Tuple[str, Path]], path: Path, if vector_dim is None: vector_dim = len(vs) elif vector_dim != len(vs): - raise ValueError(f'vector dimensions are not consistent, {vector_dim} != {len(vs)} :: {token}') + logging.error(f'vector dimensions are not consistent, {vector_dim} != {len(vs)} :: {token}') + continue self.add_token_(str(token, encoding='utf-8', errors=unicode_error)) vectors.append(torch.tensor([float(v) for v in vs], dtype=dtype)) From c725cfcaa98aa8dea748552199ace8c9305998d6 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Fri, 1 May 2020 16:54:31 +0900 Subject: [PATCH 27/66] Fix: Resolve bug on unzipping .tar.gz files --- torchglyph/io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchglyph/io.py b/torchglyph/io.py index 99d683e..5eedf6d 100644 --- a/torchglyph/io.py +++ b/torchglyph/io.py @@ -69,7 +69,7 @@ def download_and_unzip(url: str, dest: Path) -> Path: logging.info(f'extracting {dest}') with zipfile.ZipFile(dest, "r") as fp: fp.extractall(path=dest.parent) - elif dest.suffixes[:-2] == ['.tar', '.gz']: + elif dest.suffixes[-2:] == ['.tar', '.gz']: logging.info(f'extracting {dest}') with tarfile.open(dest, 'r:gz') as fp: fp.extractall(path=dest.parent) From a0318e481f907636fc699a183ede5dc791eec702 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Fri, 1 May 2020 17:04:43 +0900 Subject: [PATCH 28/66] Feat: Support name property for Dataset --- torchglyph/dataset.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/torchglyph/dataset.py b/torchglyph/dataset.py index 
e451fbe..e7f11d3 100644 --- a/torchglyph/dataset.py +++ b/torchglyph/dataset.py @@ -2,7 +2,7 @@ import uuid from collections import namedtuple from pathlib import Path -from typing import Iterable, Any, TextIO +from typing import Iterable, Any, TextIO, Optional from typing import Union, List, Type, Tuple, NamedTuple, Dict from torch.utils import data @@ -14,6 +14,7 @@ class Dataset(data.Dataset): + name: Optional[str] urls: List[Union[Tuple[str, ...]]] def __init__(self, pipes: List[Dict[str, Pipe]], **load_kwargs) -> None: @@ -62,14 +63,16 @@ def collate_fn(self, batch: List[NamedTuple]) -> NamedTuple: @classmethod def paths(cls, root: Path = data_path) -> Tuple[Path, ...]: + root = root / getattr(cls, 'name', cls.__name__).lower() + ans = [] for url, name, *filenames in cls.urls: if len(filenames) == 0: filenames = [name] - if any(not (root / cls.__name__.lower() / n).exists() for n in filenames): - download_and_unzip(url, root / cls.__name__.lower() / name) - for n in filenames: - ans.append(root / cls.__name__.lower() / n) + if any(not (root / filename).exists() for filename in filenames): + download_and_unzip(url, root / name) + for filename in filenames: + ans.append(root / filename) return tuple(ans) From e73d7d6c897ae118c3af0029cc0fe36a00c8db86 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Fri, 1 May 2020 17:36:01 +0900 Subject: [PATCH 29/66] Chore: Separate ctx dependencies --- .github/workflows/unit-tests.yml | 2 +- setup.py | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index ef5ea86..c4e9f3f 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -17,7 +17,7 @@ jobs: run: | python -m pip install --upgrade pip python -m pip install torch - python -m pip install -e '.[dev]' + python -m pip install -e '.[dev, ctx]' - name: Test with pytest run: | python -m pytest tests diff --git a/setup.py b/setup.py index 16ae9e6..6a077a5 100644 --- a/setup.py +++ b/setup.py @@ -17,14 +17,16 @@ 'tqdm', 'numpy', 'einops', - 'transformers', - 'allennlp', - 'elmoformanylangs', ], extras_require={ 'dev': [ 'pytest', 'hypothesis', ], + 'ctx': [ + 'transformers', + 'allennlp', + 'elmoformanylangs', + ] } ) From ea37c19e4adb1fcfe477578b8fda12f3becf654e Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Fri, 1 May 2020 20:28:21 +0900 Subject: [PATCH 30/66] Refactor: Rewrite open_io function --- torchglyph/io.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/torchglyph/io.py b/torchglyph/io.py index 5eedf6d..fb9c57a 100644 --- a/torchglyph/io.py +++ b/torchglyph/io.py @@ -17,15 +17,14 @@ @contextmanager def open_io(f: IO, mode: str, encoding: str): - if isinstance(f, (str, Path)): - fp = open(f, mode=mode, encoding=encoding) - else: - fp = f try: - yield fp + if isinstance(f, (str, Path)): + with open(f, mode=mode, encoding=encoding) as fp: + yield fp + else: + yield f finally: - if isinstance(f, Path): - fp.close() + pass # copied and modified from https://github.com/pytorch/text @@ -74,9 +73,9 @@ def download_and_unzip(url: str, dest: Path) -> Path: with tarfile.open(dest, 'r:gz') as fp: fp.extractall(path=dest.parent) elif dest.suffix == '.gz': - with gzip.open(dest, mode='rb') as fsrc: - with dest.with_suffix('').open(mode='wb') as fdst: - shutil.copyfileobj(fsrc, fdst) + with gzip.open(dest, mode='rb') as fs: + with dest.with_suffix('').open(mode='wb') as fd: + shutil.copyfileobj(fs, fd) return dest From 
a23c75193125c41391cedc756102f3670330cacb Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Fri, 1 May 2020 20:36:05 +0900 Subject: [PATCH 31/66] Refactor: Use local logger --- torchglyph/datasets/sequential_labeling.py | 8 +++++--- torchglyph/datasets/text_classification.py | 4 +++- torchglyph/io.py | 7 +++++-- torchglyph/nn/contextual.py | 4 +++- torchglyph/proc/vocab.py | 9 +++++---- torchglyph/vocab.py | 8 +++++--- 6 files changed, 26 insertions(+), 14 deletions(-) diff --git a/torchglyph/datasets/sequential_labeling.py b/torchglyph/datasets/sequential_labeling.py index 369437c..85cf6eb 100644 --- a/torchglyph/datasets/sequential_labeling.py +++ b/torchglyph/datasets/sequential_labeling.py @@ -13,6 +13,8 @@ from torchglyph.pipe import PaddedTokSeqPipe, PackedTokBlockPipe from torchglyph.proc import ReplaceDigits, Identity, LoadGlove, LoadFastText, Prepend +logger = logging.Logger(__name__) + class CoNLL2000Chunking(Dataset): urls = [ @@ -61,7 +63,7 @@ def new(cls, batch_size: int, word_dim: Optional[int], test = cls(path=test, pipes=pipes) for name, pipe in train.pipes.items(): - logging.info(f'{name} => {pipe}') + logger.info(f'{name} => {pipe}') word.build_vocab(train, test, name='word') char.build_vocab(train, test, name='char') @@ -125,7 +127,7 @@ def new(cls, batch_size: int, word_dim: Optional[int], test = cls(path=test, pipes=pipes) for name, pipe in train.pipes.items(): - logging.info(f'{name} => {pipe}') + logger.info(f'{name} => {pipe}') word.build_vocab(train, dev, test, name='word') char.build_vocab(train, dev, test, name='char') @@ -192,7 +194,7 @@ def new(cls, batch_size: int, word_dim: Optional[int], test = cls(path=test, pipes=pipes) for name, pipe in train.pipes.items(): - logging.info(f'{name} => {pipe}') + logger.info(f'{name} => {pipe}') word.build_vocab(train, dev, test, name='word') char.build_vocab(train, dev, test, name='char') diff --git a/torchglyph/datasets/text_classification.py b/torchglyph/datasets/text_classification.py index 169b78d..9789580 100644 --- a/torchglyph/datasets/text_classification.py +++ b/torchglyph/datasets/text_classification.py @@ -12,6 +12,8 @@ from torchglyph.pipe import PackedTokSeqPipe, TokTensorPipe, RawPipe from torchglyph.proc import Identity, LoadGlove +logger = logging.Logger(__name__) + class AgNews(Dataset): urls = [ @@ -61,7 +63,7 @@ def new(cls, batch_size: int, word_dim: Optional[int], test = cls(path=test, target_vocab=target_vocab, pipes=pipes) for name, pipe in train.pipes.items(): - logging.info(f'{name} => {pipe}') + logger.info(f'{name} => {pipe}') word.build_vocab(train, test, name='word') target.build_vocab(train, test, name='target') diff --git a/torchglyph/io.py b/torchglyph/io.py index fb9c57a..cf1cf15 100644 --- a/torchglyph/io.py +++ b/torchglyph/io.py @@ -12,6 +12,8 @@ from tqdm import tqdm +logger = logging.Logger(__name__) + IO = Union[str, Path, TextIO] @@ -65,14 +67,15 @@ def download_and_unzip(url: str, dest: Path) -> Path: raise err if dest.suffix == '.zip': - logging.info(f'extracting {dest}') + logger.info(f'extracting {dest}') with zipfile.ZipFile(dest, "r") as fp: fp.extractall(path=dest.parent) elif dest.suffixes[-2:] == ['.tar', '.gz']: - logging.info(f'extracting {dest}') + logger.info(f'extracting {dest}') with tarfile.open(dest, 'r:gz') as fp: fp.extractall(path=dest.parent) elif dest.suffix == '.gz': + logger.info(f'extracting {dest}') with gzip.open(dest, mode='rb') as fs: with dest.with_suffix('').open(mode='wb') as fd: shutil.copyfileobj(fs, fd) diff --git a/torchglyph/nn/contextual.py 
b/torchglyph/nn/contextual.py index 3ee8d41..af42d5b 100644 --- a/torchglyph/nn/contextual.py +++ b/torchglyph/nn/contextual.py @@ -18,6 +18,8 @@ toggle_loggers('allennlp', False) toggle_loggers('elmoformanylangs', False) +logger = logging.Logger(__name__) + class ELMoModel(AllenELMo): root = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/' @@ -29,7 +31,7 @@ class ELMoModel(AllenELMo): } def __init__(self, *, options_file: str, weight_file: str, pack_output, **kwargs) -> None: - logging.info(f'loading pretrained {self.__class__.__name__} from {weight_file}') + logger.info(f'loading pretrained {self.__class__.__name__} from {weight_file}') super(ELMoModel, self).__init__( options_file=options_file, weight_file=weight_file, **kwargs, diff --git a/torchglyph/proc/vocab.py b/torchglyph/proc/vocab.py index 2b72b74..1e08056 100644 --- a/torchglyph/proc/vocab.py +++ b/torchglyph/proc/vocab.py @@ -5,6 +5,7 @@ from torchglyph.proc import Proc from torchglyph.vocab import Vocab, Vectors, Glove, FastTest +logger = logging.Logger(__name__) class UpdateCounter(Proc): def __call__(self, data: Union[str, List[str]], counter: Counter, *args, **kwargs) -> Union[str, List[str]]: @@ -61,14 +62,14 @@ def __call__(self, vocab: Vocab, name: str, *args, **kwargs) -> Vocab: occ_avg = sum(vocab.freq.values()) / max(1, tok_cnt) name = f"{vocab.__class__.__name__} '{name}'" - logging.info(f"{name} has {tok_cnt} token(s) => " + logger.info(f"{name} has {tok_cnt} token(s) => " f"{occ_avg:.1f} occurrence(s)/token [" f"{occ_max} :: '{tok_max}', " f"{occ_min} :: '{tok_min}']") if tok_cnt <= self.threshold: - logging.info(f'{name} => [{", ".join(vocab.itos)}]') + logger.info(f'{name} => [{", ".join(vocab.itos)}]') else: - logging.info(f'{name} => [' + logger.info(f'{name} => [' f'{", ".join(vocab.itos[:self.threshold // 2])}, ..., ' f'{", ".join(vocab.itos[-self.threshold // 2:])}]') @@ -97,7 +98,7 @@ def __call__(self, vocab: Vocab, name: str, *args, **kwargs) -> Vocab: tok, occ = vocab.load_vectors(*self.fallback_fns, vectors=self.vectors) tok = tok / max(1, len(vocab.freq.values())) * 100 occ = occ / max(1, sum(vocab.freq.values())) * 100 - logging.info(f"{self.vectors} hits {tok:.1f}% tokens and {occ:.1f}% occurrences of {Vocab.__name__} '{name}'") + logger.info(f"{self.vectors} hits {tok:.1f}% tokens and {occ:.1f}% occurrences of {Vocab.__name__} '{name}'") return vocab diff --git a/torchglyph/vocab.py b/torchglyph/vocab.py index a9e4095..a1959e8 100644 --- a/torchglyph/vocab.py +++ b/torchglyph/vocab.py @@ -12,6 +12,8 @@ from torchglyph import data_path from torchglyph.io import download_and_unzip +logger = logging.Logger(__name__) + class Vocab(object): def __init__(self, counter: Counter, @@ -126,11 +128,11 @@ def load_vectors(self, *fallback_fns, vectors: 'Vectors') -> Tuple[int, int]: return tok, occ def save(self, path: Path) -> None: - logging.info(f'saving {self.__class__.__name__} to {path}') + logger.info(f'saving {self.__class__.__name__} to {path}') torch.save((self.stoi, self.itos, self.vectors), path) def load(self, path: Path) -> None: - logging.info(f'loading {self.__class__.__name__} from {path}') + logger.info(f'loading {self.__class__.__name__} from {path}') self.stoi, self.itos, self.vectors = torch.load(path) @@ -166,7 +168,7 @@ def __init__(self, urls_dest: List[Tuple[str, Path]], path: Path, if vector_dim is None: vector_dim = len(vs) elif vector_dim != len(vs): - logging.error(f'vector dimensions are not consistent, {vector_dim} != {len(vs)} :: {token}') + logger.error(f'vector 
dimensions are not consistent, {vector_dim} != {len(vs)} :: {token}') continue self.add_token_(str(token, encoding='utf-8', errors=unicode_error)) From dff85fd8abd926623b8900604ce2fb14674ada9b Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Sun, 3 May 2020 18:04:38 +0900 Subject: [PATCH 32/66] Chore: Filter packages --- setup.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/setup.py b/setup.py index 6a077a5..d8a455c 100644 --- a/setup.py +++ b/setup.py @@ -1,18 +1,16 @@ from setuptools import setup, find_packages -with open('README.md', 'r', encoding='utf-8') as fp: - long_description = fp.read() +name = 'torchglyph' setup( - name='torchglyph', + name=name, version='0.1.0', - packages=find_packages(), - url='https://github.com/speedcell4/torchglyph', + packages=[package for package in find_packages() if package.startswith(name)], + url=f'https://github.com/speedcell4/{name}', license='MIT', author='speedcell4', author_email='speedcell4@gmail.com', description='Data Processor Combinators for Natural Language Processing', - long_description=long_description, install_requires=[ 'tqdm', 'numpy', From aacce26af79311ce79dbfe45933bae423477324a Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Sun, 3 May 2020 18:06:55 +0900 Subject: [PATCH 33/66] Refactor: Separate PadELMo --- torchglyph/pipe/ctx.py | 2 +- torchglyph/proc/collecting.py | 12 ------------ torchglyph/proc/ctx.py | 17 +++++++++++++++++ torchglyph/proc/vocab.py | 11 ++++++----- 4 files changed, 24 insertions(+), 18 deletions(-) create mode 100644 torchglyph/proc/ctx.py diff --git a/torchglyph/pipe/ctx.py b/torchglyph/pipe/ctx.py index 1adc83f..58ae589 100644 --- a/torchglyph/pipe/ctx.py +++ b/torchglyph/pipe/ctx.py @@ -3,8 +3,8 @@ import torch from torchglyph.pipe import Pipe -from torchglyph.proc import PadELMo from torchglyph.proc import ToDevice +from torchglyph.proc.ctx import PadELMo from torchglyph.proc.tokenizer import ELMoTokenizer diff --git a/torchglyph/proc/collecting.py b/torchglyph/proc/collecting.py index 62b2d68..2783c6f 100644 --- a/torchglyph/proc/collecting.py +++ b/torchglyph/proc/collecting.py @@ -2,17 +2,12 @@ import numpy as np import torch -from allennlp.data import Instance as AllenInstance, Vocabulary as AllenVocabulary -from allennlp.data.dataset import Batch as AllenBatch from torch import Tensor from torch.nn.utils.rnn import pad_sequence, PackedSequence, pack_sequence, pad_packed_sequence -from torchglyph.io import toggle_loggers from torchglyph.proc import Proc, Chain, stoi from torchglyph.vocab import Vocab -toggle_loggers('allennlp', False) - class ToDevice(Proc): Item = Union[int, float, bool, Tensor, PackedSequence] @@ -105,13 +100,6 @@ def __call__(self, data: List[Tensor], vocab: Vocab, **kwargs) -> Tensor: ) -class PadELMo(Proc): - def __call__(self, data: List[AllenInstance], *args, **kwargs) -> Tensor: - batch = AllenBatch(data) - batch.index_instances(AllenVocabulary()) - return batch.as_tensor_dict()['elmo']['character_ids'] - - class PackSeq(Proc): def __init__(self, enforce_sorted: bool) -> None: super(PackSeq, self).__init__() diff --git a/torchglyph/proc/ctx.py b/torchglyph/proc/ctx.py new file mode 100644 index 0000000..0e2b786 --- /dev/null +++ b/torchglyph/proc/ctx.py @@ -0,0 +1,17 @@ +from typing import List + +from allennlp.data import Instance as AllenInstance, Vocabulary as AllenVocabulary +from allennlp.data.dataset import Batch as AllenBatch +from torch import Tensor + +from torchglyph.io import toggle_loggers +from torchglyph.proc import Proc + 
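[Editor's note] `toggle_loggers` itself is imported from `torchglyph.io` but never shown in this series. A plausible shape for it, offered purely as an assumption about its behaviour (silencing or re-enabling every logger registered under a name prefix), would be:

```python
import logging

def toggle_loggers(prefix: str, enabled: bool) -> None:
    # assumed behaviour: disable (or re-enable) all loggers whose name
    # starts with the given prefix, e.g. 'allennlp' or 'transformers'
    for name in list(logging.root.manager.loggerDict):
        if name.startswith(prefix):
            logging.getLogger(name).disabled = not enabled
```
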
+toggle_loggers('allennlp', False) + + +class PadELMo(Proc): + def __call__(self, data: List[AllenInstance], *args, **kwargs) -> Tensor: + batch = AllenBatch(data) + batch.index_instances(AllenVocabulary()) + return batch.as_tensor_dict()['elmo']['character_ids'] diff --git a/torchglyph/proc/vocab.py b/torchglyph/proc/vocab.py index 1e08056..e3cf351 100644 --- a/torchglyph/proc/vocab.py +++ b/torchglyph/proc/vocab.py @@ -7,6 +7,7 @@ logger = logging.Logger(__name__) + class UpdateCounter(Proc): def __call__(self, data: Union[str, List[str]], counter: Counter, *args, **kwargs) -> Union[str, List[str]]: if isinstance(data, str): @@ -63,15 +64,15 @@ def __call__(self, vocab: Vocab, name: str, *args, **kwargs) -> Vocab: name = f"{vocab.__class__.__name__} '{name}'" logger.info(f"{name} has {tok_cnt} token(s) => " - f"{occ_avg:.1f} occurrence(s)/token [" - f"{occ_max} :: '{tok_max}', " - f"{occ_min} :: '{tok_min}']") + f"{occ_avg:.1f} occurrence(s)/token [" + f"{occ_max} :: '{tok_max}', " + f"{occ_min} :: '{tok_min}']") if tok_cnt <= self.threshold: logger.info(f'{name} => [{", ".join(vocab.itos)}]') else: logger.info(f'{name} => [' - f'{", ".join(vocab.itos[:self.threshold // 2])}, ..., ' - f'{", ".join(vocab.itos[-self.threshold // 2:])}]') + f'{", ".join(vocab.itos[:self.threshold // 2])}, ..., ' + f'{", ".join(vocab.itos[-self.threshold // 2:])}]') return vocab From 603f0674a34f6209f945ffefd27c329eeca53239 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Sun, 3 May 2020 18:07:51 +0900 Subject: [PATCH 34/66] Style: PEP8 them all --- torchglyph/nn/embedding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchglyph/nn/embedding.py b/torchglyph/nn/embedding.py index b227825..0870190 100644 --- a/torchglyph/nn/embedding.py +++ b/torchglyph/nn/embedding.py @@ -39,7 +39,7 @@ def _padded_forward(self, sub: Tensor, tok_lengths: Tensor) -> Tensor: rearrange(tok_lengths.clamp_min(1), 'a b -> (a b)'), batch_first=self.rnn.batch_first, enforce_sorted=False, ) - + embedding = pack._replace(data=self.dropout(self.embedding(pack.data))) _, (encoding, _) = self.rnn(embedding) From 2395ad167a8ff16fef75572a7da2f8a0f300815a Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Sun, 3 May 2020 18:14:37 +0900 Subject: [PATCH 35/66] Fix: Update SupportPack --- torchglyph/functional.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torchglyph/functional.py b/torchglyph/functional.py index ce2ea73..484699a 100644 --- a/torchglyph/functional.py +++ b/torchglyph/functional.py @@ -1,5 +1,5 @@ import functools -from typing import Union, Tuple, Dict, Any +from typing import Any, Union, Tuple, Dict import torch from torch import Tensor @@ -19,7 +19,7 @@ def wrap(x: Union[Tensor, PackedSequence], *args, **kwargs) -> Union[Tensor, Pac class SupportPack(type): def __new__(cls, name: str, bases: Tuple[type, ...], attrs: Dict[str, Any]): - forward_fn = bases[0].forward + forward_fn = attrs.get('forward', bases[0].forward) @functools.wraps(forward_fn) def forward(self, x: Union[Tensor, PackedSequence], *args, **kwargs) -> Union[Tensor, PackedSequence]: From 151ea7610b5ca5a79665cc4bc3f9833c4b10bcf0 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Sun, 3 May 2020 18:22:55 +0900 Subject: [PATCH 36/66] Feat: Add repository flag for NLPLVectors --- torchglyph/vocab.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torchglyph/vocab.py b/torchglyph/vocab.py index a1959e8..baa9ad5 100644 --- a/torchglyph/vocab.py +++ b/torchglyph/vocab.py @@ -218,11 +218,11 @@ def 
__init__(self, lang: str) -> None: class NLPLVectors(Vectors): - def __init__(self, index: int, name: str = 'model.txt', heading: bool = False) -> None: + def __init__(self, index: int, repository: str = '20', name: str = 'model.txt', heading: bool = False) -> None: path = data_path / 'nlpl' / f'{index}' super(NLPLVectors, self).__init__( urls_dest=[( - f'http://vectors.nlpl.eu/repository/20/{index}.zip', + f'http://vectors.nlpl.eu/repository/{repository}/{index}.zip', path / f'{index}.zip', )], path=path / name, heading=heading, From 98b71344ba1b68276e81ab268f5d9b6cf7b9f233 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Sun, 3 May 2020 18:25:22 +0900 Subject: [PATCH 37/66] Fix: Resolve toggle_loggers import path --- torchglyph/proc/tokenizer.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/torchglyph/proc/tokenizer.py b/torchglyph/proc/tokenizer.py index e2fe321..8d244dc 100644 --- a/torchglyph/proc/tokenizer.py +++ b/torchglyph/proc/tokenizer.py @@ -5,14 +5,15 @@ from allennlp.data.fields import TextField as AllenTextField from allennlp.data.token_indexers import ELMoTokenCharactersIndexer -from torchglyph.proc import Proc, toggle_loggers +from torchglyph.io import toggle_loggers +from torchglyph.proc import Proc toggle_loggers('allennlp', False) toggle_loggers('transformers', False) class ELMoTokenizer(Proc): - def __init__(self): + def __init__(self) -> None: super(ELMoTokenizer, self).__init__() self.tokenizer = ELMoTokenCharactersIndexer() @@ -36,66 +37,66 @@ def __call__(self, data: Union[str, List[str]], **kwargs) -> List[int]: class BertTokenizer(TransformerTokenizerProc): - def __init__(self, weight: str = 'bert-base-uncased'): + def __init__(self, weight: str = 'bert-base-uncased') -> None: super(BertTokenizer, self).__init__(weight=weight) self.tokenizer = transformers.BertTokenizer.from_pretrained(weight) class OpenAIGPTTokenizer(TransformerTokenizerProc): - def __init__(self, weight: str = 'openai-gpt'): + def __init__(self, weight: str = 'openai-gpt') -> None: super(OpenAIGPTTokenizer, self).__init__(weight=weight) self.tokenizer = transformers.OpenAIGPTTokenizer.from_pretrained(weight) class GPT2Tokenizer(TransformerTokenizerProc): - def __init__(self, weight: str = 'gpt2'): + def __init__(self, weight: str = 'gpt2') -> None: super(GPT2Tokenizer, self).__init__(weight=weight) self.tokenizer = transformers.GPT2Tokenizer.from_pretrained(weight) class CTRLTokenizer(TransformerTokenizerProc): - def __init__(self, weight: str = 'ctrl'): + def __init__(self, weight: str = 'ctrl') -> None: super(CTRLTokenizer, self).__init__(weight=weight) self.tokenizer = transformers.CTRLTokenizer.from_pretrained(weight) class TransfoXLTokenizer(TransformerTokenizerProc): - def __init__(self, weight: str = 'transfo-xl-wt103'): + def __init__(self, weight: str = 'transfo-xl-wt103') -> None: super(TransfoXLTokenizer, self).__init__(weight=weight) self.tokenizer = transformers.TransfoXLTokenizer.from_pretrained(weight) class XLNetTokenizer(TransformerTokenizerProc): - def __init__(self, weight: str = 'xlnet-base-cased'): + def __init__(self, weight: str = 'xlnet-base-cased') -> None: super(XLNetTokenizer, self).__init__(weight=weight) self.tokenizer = transformers.XLNetTokenizer.from_pretrained(weight) class XLMTokenizer(TransformerTokenizerProc): - def __init__(self, weight: str = 'xlm-mlm-enfr-1024'): + def __init__(self, weight: str = 'xlm-mlm-enfr-1024') -> None: super(XLMTokenizer, self).__init__(weight=weight) self.tokenizer = 
transformers.XLMTokenizer.from_pretrained(weight) class DistilBertTokenizer(TransformerTokenizerProc): - def __init__(self, weight: str = 'distilbert-base-cased'): + def __init__(self, weight: str = 'distilbert-base-cased') -> None: super(DistilBertTokenizer, self).__init__(weight=weight) self.tokenizer = transformers.DistilBertTokenizer.from_pretrained(weight) class RobertaTokenizer(TransformerTokenizerProc): - def __init__(self, weight: str = 'roberta-base'): + def __init__(self, weight: str = 'roberta-base') -> None: super(RobertaTokenizer, self).__init__(weight=weight) self.tokenizer = transformers.RobertaTokenizer.from_pretrained(weight) class XLMRobertaTokenizer(TransformerTokenizerProc): - def __init__(self, weight: str = 'xlm-roberta-base'): + def __init__(self, weight: str = 'xlm-roberta-base') -> None: super(XLMRobertaTokenizer, self).__init__(weight=weight) self.tokenizer = transformers.XLMRobertaTokenizer.from_pretrained(weight) class BartTokenizer(TransformerTokenizerProc): - def __init__(self, weight: str = 'bart-large'): + def __init__(self, weight: str = 'bart-large') -> None: super(BartTokenizer, self).__init__(weight=weight) self.tokenizer = transformers.BartTokenizer.from_pretrained(weight) From f15e9b5628079bb80072c429dda1702495f62d15 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Sun, 3 May 2020 18:34:19 +0900 Subject: [PATCH 38/66] Chore: Run github action on both Python 3.7 and 3.8 --- .github/workflows/unit-tests.yml | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index c4e9f3f..5920a61 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -7,12 +7,16 @@ jobs: runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.7", "3.8"] + steps: - - uses: actions/checkout@v1 - - name: Set up Python 3.7 + - uses: actions/checkout@v2 + - name: Set up Python "${{ matrix.python-version }}" uses: actions/setup-python@v1 with: - python-version: 3.7 + python-version: "${{ matrix.python-version }}" - name: Install dependencies run: | python -m pip install --upgrade pip From 6369729850fa23575b83312afe35058b3a852270 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Sun, 3 May 2020 18:38:23 +0900 Subject: [PATCH 39/66] Style: Update unit-tests.yml style --- .github/workflows/unit-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 5920a61..92535fd 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -13,7 +13,7 @@ jobs: steps: - uses: actions/checkout@v2 - - name: Set up Python "${{ matrix.python-version }}" + - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v1 with: python-version: "${{ matrix.python-version }}" From 08a27dca8feeac490f9c2435fe7f4fdbad238968 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Mon, 4 May 2020 02:25:03 +0900 Subject: [PATCH 40/66] Docs: Init document --- .gitignore | 1 + docs/docs/index.md | 46 ++++++++++++++++++++++++++++++++++++++++++++++ docs/mkdocs.yml | 4 ++++ setup.py | 4 ++++ 4 files changed, 55 insertions(+) create mode 100644 docs/docs/index.md create mode 100644 docs/mkdocs.yml diff --git a/.gitignore b/.gitignore index a4b0cc6..d7d336d 100644 --- a/.gitignore +++ b/.gitignore @@ -640,3 +640,4 @@ GitHub.sublime-settings *.ptx *.cubin *.fatbin +!/docs/site/ diff --git a/docs/docs/index.md b/docs/docs/index.md new file mode 100644 index 0000000..7a995d3 
--- /dev/null +++ b/docs/docs/index.md @@ -0,0 +1,46 @@ +# Welcome to TorchGlyph + +[![Actions Status](https://github.com/speedcell4/torchglyph/workflows/unit-tests/badge.svg)](https://github.com/speedcell4/torchglyph/actions) + +Data Processor Combinators for Natural Language Processing + +## Installation + +Simply run this command in your terminal: + +```bash +pip install torchglyph +``` + +## Quickstart + +The minimal data processor of TorchGlyph is called `Proc`. Compose operator `+` is provided to produce a more complex processor by composing two simple `Proc`s. + +```python +ToLower() + ReplaceDigits(repl_token='') +``` + +Composed `Proc`s act like data `Pipe`lines, in which raw input textual data is processed incrementally. According to the stages, they are roughly categories into four-groups: + ++ `pre` for processing *before* building vocabulary; ++ `vocab` for building and updating *vocabulary*; ++ `post` for precessing *after* building vocabulary; ++ `batch` for collating examples to build *batches*. + +Defining the `Pipe`s of your dataset you can build it from scratch, or you can simply manipulate existing `Pipe`s by calling `.with_` method. + +```python +class PackedTokSeqPipe(PackedIdxSeqPipe): + def __init__(self, device, unk_token, special_tokens = (), + threshold = THRESHOLD, dtype = torch.long) -> None: + super(PackedTokSeqPipe, self).__init__(device=device, dtype=dtype) + self.with_( + pre=UpdateCounter(), + vocab=[ + BuildVocab(unk_token=unk_token, pad_token=None, + special_tokens=special_tokens), + StatsVocab(threshold=threshold), + ], + post=Numbering() + ..., + ) +``` \ No newline at end of file diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml new file mode 100644 index 0000000..07f06a4 --- /dev/null +++ b/docs/mkdocs.yml @@ -0,0 +1,4 @@ +site_name: TorchGlyph +nav: + - Home: index.md +theme: alabaster \ No newline at end of file diff --git a/setup.py b/setup.py index d8a455c..886efde 100644 --- a/setup.py +++ b/setup.py @@ -25,6 +25,10 @@ 'transformers', 'allennlp', 'elmoformanylangs', + ], + 'docs': [ + 'mkdocs', + 'mkdocs-alabaster', ] } ) From 223a41d8137fc865a0d08f0dd3e1e266bd33b435 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Mon, 4 May 2020 02:38:46 +0900 Subject: [PATCH 41/66] Docs: Move to top-level directory --- .gitignore | 2 +- docs/{docs => }/index.md | 0 docs/mkdocs.yml => mkdocs.yml | 0 3 files changed, 1 insertion(+), 1 deletion(-) rename docs/{docs => }/index.md (100%) rename docs/mkdocs.yml => mkdocs.yml (100%) diff --git a/.gitignore b/.gitignore index d7d336d..5149fdb 100644 --- a/.gitignore +++ b/.gitignore @@ -640,4 +640,4 @@ GitHub.sublime-settings *.ptx *.cubin *.fatbin -!/docs/site/ +!/site/ diff --git a/docs/docs/index.md b/docs/index.md similarity index 100% rename from docs/docs/index.md rename to docs/index.md diff --git a/docs/mkdocs.yml b/mkdocs.yml similarity index 100% rename from docs/mkdocs.yml rename to mkdocs.yml From b752cf0c0f7361e7e5cbdfee1d302a52e0301990 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Mon, 4 May 2020 02:53:18 +0900 Subject: [PATCH 42/66] Chore: Add mkdocs.yml --- .github/workflows/mkdocs.yml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 .github/workflows/mkdocs.yml diff --git a/.github/workflows/mkdocs.yml b/.github/workflows/mkdocs.yml new file mode 100644 index 0000000..c88bdea --- /dev/null +++ b/.github/workflows/mkdocs.yml @@ -0,0 +1,14 @@ +name: mkdocs +on: + push: + branches: + - develop + +jobs: + build: + name: Deploy docs + runs-on: ubuntu-latest + steps: + - name: Deploy 
MkDocs + uses: mhausenblas/mkdocs-deploy-gh-pages@1.11 + From 2265e5e1ba26f66aab626c284a7d1fa702e60b2c Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Mon, 4 May 2020 02:54:43 +0900 Subject: [PATCH 43/66] Fix: Update mkdocs.yml --- .github/workflows/mkdocs.yml | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/.github/workflows/mkdocs.yml b/.github/workflows/mkdocs.yml index c88bdea..74633f8 100644 --- a/.github/workflows/mkdocs.yml +++ b/.github/workflows/mkdocs.yml @@ -9,6 +9,11 @@ jobs: name: Deploy docs runs-on: ubuntu-latest steps: - - name: Deploy MkDocs - uses: mhausenblas/mkdocs-deploy-gh-pages@1.11 + - name: Checkout develop + uses: actions/checkout@v1 + - name: Deploy docs + uses: mhausenblas/mkdocs-deploy-gh-pages@master + env: + PERSONAL_TOKEN: ${{ secrets.PERSONAL_TOKEN }} + CUSTOM_DOMAIN: optionaldomain.com \ No newline at end of file From c5895e69914e5d48201d6d866d8b0595ee81709e Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Mon, 4 May 2020 02:57:41 +0900 Subject: [PATCH 44/66] Fix: Install Python and dependencies --- .github/workflows/mkdocs.yml | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/mkdocs.yml b/.github/workflows/mkdocs.yml index 74633f8..c6b9fcb 100644 --- a/.github/workflows/mkdocs.yml +++ b/.github/workflows/mkdocs.yml @@ -11,9 +11,13 @@ jobs: steps: - name: Checkout develop uses: actions/checkout@v1 - + - name: Set up Python 3.7 + uses: actions/setup-python@v1 + with: + python-version: 3.7 + - name: Install dependencies + run: python -m pip install -e '.[docs]' - name: Deploy docs uses: mhausenblas/mkdocs-deploy-gh-pages@master env: - PERSONAL_TOKEN: ${{ secrets.PERSONAL_TOKEN }} - CUSTOM_DOMAIN: optionaldomain.com \ No newline at end of file + PERSONAL_TOKEN: ${{ secrets.PERSONAL_TOKEN }} \ No newline at end of file From 7af526e625587684c17f81a612ad76ec96b3b8ee Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Mon, 4 May 2020 02:58:29 +0900 Subject: [PATCH 45/66] Fix: Resolve indent --- .github/workflows/mkdocs.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/mkdocs.yml b/.github/workflows/mkdocs.yml index c6b9fcb..2fcdaa6 100644 --- a/.github/workflows/mkdocs.yml +++ b/.github/workflows/mkdocs.yml @@ -12,9 +12,9 @@ jobs: - name: Checkout develop uses: actions/checkout@v1 - name: Set up Python 3.7 - uses: actions/setup-python@v1 - with: - python-version: 3.7 + uses: actions/setup-python@v1 + with: + python-version: 3.7 - name: Install dependencies run: python -m pip install -e '.[docs]' - name: Deploy docs From 02c7e3b54819e478e42f47105517fafbbb25d9d2 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Mon, 4 May 2020 02:58:57 +0900 Subject: [PATCH 46/66] Fix: Resolve indent again --- .github/workflows/mkdocs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/mkdocs.yml b/.github/workflows/mkdocs.yml index 2fcdaa6..770eeff 100644 --- a/.github/workflows/mkdocs.yml +++ b/.github/workflows/mkdocs.yml @@ -16,7 +16,7 @@ jobs: with: python-version: 3.7 - name: Install dependencies - run: python -m pip install -e '.[docs]' + run: python -m pip install -e '.[docs]' - name: Deploy docs uses: mhausenblas/mkdocs-deploy-gh-pages@master env: From b1b65fc5bbf9c9635b10c1b18c62b961147b3b50 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Mon, 4 May 2020 03:03:04 +0900 Subject: [PATCH 47/66] Fix: Update Deploy --- .github/workflows/mkdocs.yml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git 
a/.github/workflows/mkdocs.yml b/.github/workflows/mkdocs.yml index 770eeff..d1d089f 100644 --- a/.github/workflows/mkdocs.yml +++ b/.github/workflows/mkdocs.yml @@ -17,7 +17,8 @@ jobs: python-version: 3.7 - name: Install dependencies run: python -m pip install -e '.[docs]' - - name: Deploy docs - uses: mhausenblas/mkdocs-deploy-gh-pages@master - env: - PERSONAL_TOKEN: ${{ secrets.PERSONAL_TOKEN }} \ No newline at end of file + - name: Deploy + uses: peaceiris/actions-gh-pages@v3 + with: + github_token: ${{ secrets.PERSONAL_TOKEN }} + publish_dir: ./site \ No newline at end of file From dc6159e4c5f24a06de1bb2039a310dcdc451a3bf Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Mon, 4 May 2020 03:05:16 +0900 Subject: [PATCH 48/66] Fix: Update Build --- .github/workflows/mkdocs.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/mkdocs.yml b/.github/workflows/mkdocs.yml index d1d089f..f3634bc 100644 --- a/.github/workflows/mkdocs.yml +++ b/.github/workflows/mkdocs.yml @@ -17,6 +17,8 @@ jobs: python-version: 3.7 - name: Install dependencies run: python -m pip install -e '.[docs]' + - name: Build + run: mkdocs build - name: Deploy uses: peaceiris/actions-gh-pages@v3 with: From 2f44bda3431e41bb8993dc970f780ea37e90d154 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Mon, 4 May 2020 03:06:52 +0900 Subject: [PATCH 49/66] Docs: Update comma --- docs/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/index.md b/docs/index.md index 7a995d3..4e8b91e 100644 --- a/docs/index.md +++ b/docs/index.md @@ -6,7 +6,7 @@ Data Processor Combinators for Natural Language Processing ## Installation -Simply run this command in your terminal: +Simply run this command in your terminal, ```bash pip install torchglyph From 274d7c69c2c696fb7c7f9027a2d33009f6f21bbb Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Mon, 4 May 2020 03:09:49 +0900 Subject: [PATCH 50/66] Docs: Add PackedIdxSeqPipe --- docs/index.md | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/docs/index.md b/docs/index.md index 4e8b91e..e90c554 100644 --- a/docs/index.md +++ b/docs/index.md @@ -27,7 +27,20 @@ Composed `Proc`s act like data `Pipe`lines, in which raw input textual data is p + `post` for precessing *after* building vocabulary; + `batch` for collating examples to build *batches*. -Defining the `Pipe`s of your dataset you can build it from scratch, or you can simply manipulate existing `Pipe`s by calling `.with_` method. +Defining the `Pipe`s of your dataset you can build it from scratch, + +```python +class PackedIdxSeqPipe(Pipe): + def __init__(self, device, dtype = torch.long) -> None: + super(PackedIdxSeqPipe, self).__init__( + pre=None, + vocab=None, + post=ToTensor(dtype=dtype), + batch=PackSeq(enforce_sorted=False) + ToDevice(device=device), + ) +``` + +or you can simply manipulate existing `Pipe`s by calling `.with_` method. 
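[Editor's note] As a companion to the Quickstart being written in these docs patches, the `+` composition it leans on can be pictured with a toy re-implementation. This is hypothetical scaffolding, not the library code; the real `Proc` and `Chain` live in `torchglyph.proc`:

```python
from typing import Any, List


class Proc(object):
    def __call__(self, x: Any, **kwargs) -> Any:
        raise NotImplementedError

    def __add__(self, other: 'Proc') -> 'Chain':
        # `a + b` builds a pipeline that applies a first, then b
        return Chain([self, other])


class Chain(Proc):
    def __init__(self, procs: List[Proc]) -> None:
        super(Chain, self).__init__()
        self.procs = procs

    def __call__(self, x: Any, **kwargs) -> Any:
        for proc in self.procs:
            x = proc(x, **kwargs)
        return x


class ToLower(Proc):
    def __call__(self, x: str, **kwargs) -> str:
        return x.lower()


class Strip(Proc):
    def __call__(self, x: str, **kwargs) -> str:
        return x.strip()


print((ToLower() + Strip())('  Hello '))  # 'hello'
```
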
```python class PackedTokSeqPipe(PackedIdxSeqPipe): From 7847157f939f7ac63c764ec9e46cf4fe6a1775b7 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Mon, 4 May 2020 03:14:56 +0900 Subject: [PATCH 51/66] Docs: Fix some typos --- docs/index.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/index.md b/docs/index.md index e90c554..9357b0e 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,9 +1,9 @@ # Welcome to TorchGlyph -[![Actions Status](https://github.com/speedcell4/torchglyph/workflows/unit-tests/badge.svg)](https://github.com/speedcell4/torchglyph/actions) - Data Processor Combinators for Natural Language Processing +[![Actions Status](https://github.com/speedcell4/torchglyph/workflows/unit-tests/badge.svg)](https://github.com/speedcell4/torchglyph/actions) + ## Installation Simply run this command in your terminal, @@ -14,20 +14,20 @@ pip install torchglyph ## Quickstart -The minimal data processor of TorchGlyph is called `Proc`. Compose operator `+` is provided to produce a more complex processor by composing two simple `Proc`s. +The atomic data processor of TorchGlyph is called `Proc`. Compose operator `+` is provided to produce complex `Proc` by composing two simple `Proc`s. ```python ToLower() + ReplaceDigits(repl_token='') ``` -Composed `Proc`s act like data `Pipe`lines, in which raw input textual data is processed incrementally. According to the stages, they are roughly categories into four-groups: +Composed `Proc`s act like data `Pipe`lines, where raw textual data is processed incrementally. According to the stages, they are roughly categorized into four-groups: + `pre` for processing *before* building vocabulary; + `vocab` for building and updating *vocabulary*; + `post` for precessing *after* building vocabulary; + `batch` for collating examples to build *batches*. -Defining the `Pipe`s of your dataset you can build it from scratch, +Defining the `Pipe`s of your dataset is the first step to build a dataset, you can build it from scratch, ```python class PackedIdxSeqPipe(Pipe): From acc0c9c87b6da652429471ae38ffc2d89131095f Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Mon, 4 May 2020 03:22:19 +0900 Subject: [PATCH 52/66] Docs: PEP8 them all --- docs/index.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/index.md b/docs/index.md index 9357b0e..f1a0311 100644 --- a/docs/index.md +++ b/docs/index.md @@ -31,7 +31,7 @@ Defining the `Pipe`s of your dataset is the first step to build a dataset, you c ```python class PackedIdxSeqPipe(Pipe): - def __init__(self, device, dtype = torch.long) -> None: + def __init__(self, device, dtype=torch.long) -> None: super(PackedIdxSeqPipe, self).__init__( pre=None, vocab=None, @@ -44,13 +44,13 @@ or you can simply manipulate existing `Pipe`s by calling `.with_` method. 
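[Editor's note] One idiom worth spelling out before the `.with_` example below: throughout this series `...` (Ellipsis) appears inside `.with_` arguments, e.g. `vocab=... + vectors` or `pre=ReplaceDigits(repl_token='') + ...`. As the diffs use it, `...` is a placeholder for the procs already installed at that stage, so composition extends a pipeline instead of restating it. A toy model of the splicing, under that assumption (not the library's actual implementation):

```python
def splice(new, old):
    # substitute the Ellipsis placeholder with the procs already in place
    if new is ...:
        return list(old)                 # stage left untouched
    if isinstance(new, list):
        out = []
        for proc in new:
            out.extend(old if proc is ... else [proc])
        return out
    return [new]                         # plain proc: overwrite the stage


existing = ['UpdateCounter']
print(splice(['ReplaceDigits', ...], existing))
# ['ReplaceDigits', 'UpdateCounter'] -- the new proc is prepended
```
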
```python class PackedTokSeqPipe(PackedIdxSeqPipe): - def __init__(self, device, unk_token, special_tokens = (), - threshold = THRESHOLD, dtype = torch.long) -> None: + def __init__(self, device, unk_token, special_tokens=(), + threshold=THRESHOLD, dtype=torch.long) -> None: super(PackedTokSeqPipe, self).__init__(device=device, dtype=dtype) self.with_( pre=UpdateCounter(), vocab=[ - BuildVocab(unk_token=unk_token, pad_token=None, + BuildVocab(unk_token=unk_token, pad_token=None, special_tokens=special_tokens), StatsVocab(threshold=threshold), ], From dcb28fe11f4bfe0ea8cfd8c6d723e5c856953a12 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Mon, 4 May 2020 03:50:09 +0900 Subject: [PATCH 53/66] Chore: Remove Python 3.8 unit test --- .github/workflows/unit-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index 92535fd..0915eec 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -9,7 +9,7 @@ jobs: strategy: matrix: - python-version: ["3.7", "3.8"] + python-version: ["3.7"] steps: - uses: actions/checkout@v2 From 050f32e77e5094777dcfb6397b920c26918d64a9 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Mon, 4 May 2020 03:51:28 +0900 Subject: [PATCH 54/66] Chore: Update homepage --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 886efde..414abe1 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,7 @@ name=name, version='0.1.0', packages=[package for package in find_packages() if package.startswith(name)], - url=f'https://github.com/speedcell4/{name}', + url=f'https://speedcell4.github.io/torchglyph', license='MIT', author='speedcell4', author_email='speedcell4@gmail.com', From e9bf982fa78993c574d39f9566f78a05699dd9ad Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Mon, 4 May 2020 18:08:02 +0900 Subject: [PATCH 55/66] Feat: Add Itos --- torchglyph/proc/infer.py | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 torchglyph/proc/infer.py diff --git a/torchglyph/proc/infer.py b/torchglyph/proc/infer.py new file mode 100644 index 0000000..d83faf1 --- /dev/null +++ b/torchglyph/proc/infer.py @@ -0,0 +1,9 @@ +from typing import List + +from torchglyph.proc import Proc +from torchglyph.vocab import Vocab + + +class RevVocab(Proc): + def __call__(self, xs: List[int], vocab: Vocab, **kwargs) -> List[str]: + return [vocab.itos[x] for x in xs] From d021730b8e26d75713440d7be623bb9b301753d6 Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Fri, 8 May 2020 15:24:04 +0900 Subject: [PATCH 56/66] Fix: Resolve vocabulary issue of PackedSeqPtrSeqPipe --- torchglyph/pipe/seq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchglyph/pipe/seq.py b/torchglyph/pipe/seq.py index a5b9ff2..f0f7c6e 100644 --- a/torchglyph/pipe/seq.py +++ b/torchglyph/pipe/seq.py @@ -90,7 +90,7 @@ class PackedSeqPtrSeqPipe(PackedIdxSeqPipe): def __init__(self, device: Union[int, torch.device], dtype: torch.dtype = torch.long) -> None: super(PackedSeqPtrSeqPipe, self).__init__(device=device, dtype=dtype) self.with_( - pre=GetMask(token=0), + post=GetMask(token=0) + ..., batch=Scan(fn=cum_seq, init=0) + ..., ) From 8d658b4d409658fa06a01cf46373a5c478bf33fc Mon Sep 17 00:00:00 2001 From: speedcell4 Date: Tue, 12 May 2020 02:17:31 +0900 Subject: [PATCH 57/66] Feat: Add PackedContiguousSubPipe and PackedContiguousSubPtrPipe --- torchglyph/pipe/contiguous.py | 33 ++++++++++++++++++++ torchglyph/proc/contiguous.py | 59 
From 8d658b4d409658fa06a01cf46373a5c478bf33fc Mon Sep 17 00:00:00 2001
From: speedcell4
Date: Tue, 12 May 2020 02:17:31 +0900
Subject: [PATCH 57/66] Feat: Add PackedContiguousSubPipe and PackedContiguousSubPtrPipe

---
 torchglyph/pipe/contiguous.py | 33 ++++++++++++++++++++
 torchglyph/proc/contiguous.py | 59 +++++++++++++++++++++++++++++++++++
 2 files changed, 92 insertions(+)
 create mode 100644 torchglyph/pipe/contiguous.py
 create mode 100644 torchglyph/proc/contiguous.py

diff --git a/torchglyph/pipe/contiguous.py b/torchglyph/pipe/contiguous.py
new file mode 100644
index 0000000..25e0ae8
--- /dev/null
+++ b/torchglyph/pipe/contiguous.py
@@ -0,0 +1,33 @@
+from typing import Union, Optional, Tuple
+
+import torch
+
+from torchglyph.pipe import PackedTokSeqPipe
+from torchglyph.pipe import Pipe
+from torchglyph.pipe import THRESHOLD
+from torchglyph.proc import GetLength, Lift, ToTensor
+from torchglyph.proc.collecting import ToDevice
+from torchglyph.proc.contiguous import BuildContiguousSub, BuildContiguousSubPtr, PackContiguousSubPtr
+
+
+class PackedContiguousSubPipe(PackedTokSeqPipe):
+    def __init__(self, device: Union[int, torch.device], unk_token: Optional[str],
+                 seq_token: str, special_tokens: Tuple[Optional[str], ...] = (),
+                 threshold: int = THRESHOLD, dtype: torch.dtype = torch.long) -> None:
+        super(PackedContiguousSubPipe, self).__init__(
+            device=device, unk_token=unk_token, special_tokens=special_tokens,
+            threshold=threshold, dtype=dtype,
+        )
+        self.with_(
+            pre=BuildContiguousSub(seq_token=seq_token) + ...,
+        )
+
+
+class PackedContiguousSubPtrPipe(Pipe):
+    def __init__(self, device: Union[int, torch.device], dtype: torch.dtype = torch.long) -> None:
+        super(PackedContiguousSubPtrPipe, self).__init__(
+            pre=Lift(GetLength()) + BuildContiguousSubPtr() + Lift(ToTensor(dtype=dtype)),
+            vocab=None,
+            post=None,
+            batch=PackContiguousSubPtr(enforce_sorted=False) + ToDevice(device=device),
+        )
diff --git a/torchglyph/proc/contiguous.py b/torchglyph/proc/contiguous.py
new file mode 100644
index 0000000..6b73c66
--- /dev/null
+++ b/torchglyph/proc/contiguous.py
@@ -0,0 +1,59 @@
+from typing import List, Tuple
+
+import torch
+from torch import Tensor
+from torch.nn.utils.rnn import PackedSequence
+from torch.nn.utils.rnn import pack_sequence
+from torch.nn.utils.rnn import pad_packed_sequence
+
+from torchglyph.proc.abc import Proc
+
+
+class BuildContiguousSub(Proc):
+    def __init__(self, seq_token: str) -> None:
+        super(BuildContiguousSub, self).__init__()
+        self.seq_token = seq_token
+
+    def extra_repr(self) -> str:
+        return repr(self.seq_token)
+
+    def __call__(self, tokens: List[str], **kwargs) -> List[str]:
+        zs = []
+        for token in tokens:
+            zs.extend(list(token))
+            zs.append(self.seq_token)
+        return zs[:-1]
+
+
+class BuildContiguousSubPtr(Proc):
+    def __call__(self, lengths: List[int], **kwargs) -> Tuple[List[int], List[int]]:
+        indices = [0]
+        for length in lengths:
+            indices.append(indices[-1] + length + 1)
+        return [index - 2 for index in indices[1:]], indices[:-1]
+
+
+class PackContiguousSubPtr(Proc):
+    def __init__(self, enforce_sorted: bool) -> None:
+        super(PackContiguousSubPtr, self).__init__()
+        self.enforce_sorted = enforce_sorted
+
+    def extra_repr(self) -> str:
+        return f'enforce_sorted={self.enforce_sorted}'
+
+    def __call__(self, data: List[Tuple[Tensor, Tensor]], **kwargs) -> Tuple[PackedSequence, PackedSequence]:
+        fs, bs = zip(*data)
+
+        pack = pack_sequence([
+            torch.empty((f.max().item() + 1,), dtype=torch.long) for f in fs
+        ], enforce_sorted=self.enforce_sorted)
+        indices = pack._replace(data=torch.arange(pack.data.size(0), device=pack.data.device))
+        indices, _ = pad_packed_sequence(indices, batch_first=True)
+
+        fs = pack_sequence([
+            indices[i, f] for i, f in enumerate(fs)
+        ], enforce_sorted=self.enforce_sorted)
+        bs = pack_sequence([
+            indices[i, b] for i, b in enumerate(bs)
+        ], enforce_sorted=self.enforce_sorted)
+        return fs, bs
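The pointer arithmetic in `BuildContiguousSubPtr` is easier to follow on a concrete input. The following plain-Python sketch replays what the two procs above compute for a two-token sentence (the `<sep>` token is an arbitrary choice):

```python
# Worked example for PATCH 57 (a sketch of the same logic, not the procs themselves).
tokens = ['cat', 'sat']

# BuildContiguousSub flattens the tokens into one character stream with separators:
chars = []
for token in tokens:
    chars.extend(list(token))
    chars.append('<sep>')
chars = chars[:-1]
assert chars == ['c', 'a', 't', '<sep>', 's', 'a', 't']

# BuildContiguousSubPtr turns the token lengths into pointers into that stream:
lengths = [len(token) for token in tokens]  # [3, 3]
indices = [0]
for length in lengths:
    indices.append(indices[-1] + length + 1)  # the +1 skips the separator
fidx = [index - 2 for index in indices[1:]]   # last character of each token
bidx = indices[:-1]                           # first character of each token
assert (fidx, bidx) == ([2, 6], [0, 4])
```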
From de8a5fcb6dff2c6ad84de513d96864c4ace2bc55 Mon Sep 17 00:00:00 2001
From: speedcell4
Date: Tue, 12 May 2020 03:03:50 +0900
Subject: [PATCH 58/66] Feat: Add ContiguousSubLstmEmbedding

---
 torchglyph/nn/embedding.py | 32 +++++++++++++++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/torchglyph/nn/embedding.py b/torchglyph/nn/embedding.py
index 0870190..de93ce7 100644
--- a/torchglyph/nn/embedding.py
+++ b/torchglyph/nn/embedding.py
@@ -1,4 +1,4 @@
-from typing import Union
+from typing import Union, Tuple
 
 import torch
 from einops import rearrange
@@ -57,3 +57,33 @@ def forward(self, sub: Union[Tensor, PackedSequence], *args) -> Union[Tensor, PackedSequence]:
             return self._padded_forward(sub, *args)
         else:
             return self._packed_forward(sub, *args)
+
+
+class ContiguousSubLstmEmbedding(nn.Module):
+    def __init__(self, num_embeddings: int, embedding_dim: int,
+                 hidden_dim: int, dropout: float, num_layers: int = 1,
+                 bias: bool = True, batch_first: bool = True,
+                 bidirectional: bool = True, padding_idx: int = None) -> None:
+        super(ContiguousSubLstmEmbedding, self).__init__()
+
+        self.embedding = nn.Embedding(
+            num_embeddings=num_embeddings,
+            embedding_dim=embedding_dim,
+            padding_idx=padding_idx,
+        )
+        self.dropout = nn.Dropout(dropout)
+        self.rnn = nn.LSTM(
+            input_size=self.embedding.embedding_dim,
+            hidden_size=hidden_dim, num_layers=num_layers, bias=bias,
+            batch_first=batch_first, bidirectional=bidirectional,
+        )
+
+        self.embedding_dim = self.rnn.hidden_size * (2 if self.rnn.bidirectional else 1)
+
+    def forward(self, sub: PackedSequence, indices: Tuple[PackedSequence, PackedSequence]) -> PackedSequence:
+        embedding = sub._replace(data=self.dropout(self.embedding(sub.data)))
+        encoding, _ = self.rnn(embedding)  # type: (PackedSequence, _)
+
+        fidx, bidx = indices
+        fenc, benc = encoding.data.chunk(2, dim=-1)
+        return fidx._replace(data=torch.cat([fenc[fidx.data], benc[bidx.data]], dim=-1))

From ba5029df6af43b8801abf227d510fe3be1e5bee6 Mon Sep 17 00:00:00 2001
From: speedcell4
Date: Tue, 12 May 2020 15:21:35 +0900
Subject: [PATCH 59/66] Refactor: Rename PackContiguousSubPtr

---
 torchglyph/proc/contiguous.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/torchglyph/proc/contiguous.py b/torchglyph/proc/contiguous.py
index 6b73c66..8cd5b11 100644
--- a/torchglyph/proc/contiguous.py
+++ b/torchglyph/proc/contiguous.py
@@ -41,19 +41,19 @@ def __init__(self, enforce_sorted: bool) -> None:
     def extra_repr(self) -> str:
         return f'enforce_sorted={self.enforce_sorted}'
 
-    def __call__(self, data: List[Tuple[Tensor, Tensor]], **kwargs) -> Tuple[PackedSequence, PackedSequence]:
-        fs, bs = zip(*data)
+    def __call__(self, indices: List[Tuple[Tensor, Tensor]], **kwargs) -> Tuple[PackedSequence, PackedSequence]:
+        fidx, bidx = zip(*indices)
 
         pack = pack_sequence([
-            torch.empty((f.max().item() + 1,), dtype=torch.long) for f in fs
+            torch.empty((f.max().item() + 1,), dtype=torch.long) for f in fidx
         ], enforce_sorted=self.enforce_sorted)
         indices = pack._replace(data=torch.arange(pack.data.size(0), device=pack.data.device))
         indices, _ = pad_packed_sequence(indices, batch_first=True)
 
-        fs = pack_sequence([
-            indices[i, f] for i, f in enumerate(fs)
+        fidx = pack_sequence([
+            indices[i, f] for i, f in enumerate(fidx)
         ], enforce_sorted=self.enforce_sorted)
-        bs = pack_sequence([
-            indices[i, b] for i, b in enumerate(bs)
+        bidx = pack_sequence([
+            indices[i, b] for i, b in enumerate(bidx)
         ], enforce_sorted=self.enforce_sorted)
-        return fs, bs
+        return fidx, bidx
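Given those pointers, `ContiguousSubLstmEmbedding.forward` reads the forward LSTM direction at token-final characters (`fidx`) and the backward direction at token-initial characters (`bidx`). A small tensor sketch of that gather, using random placeholders for the character encodings:

```python
import torch

# 7 characters ('c a t <sep> s a t'), BiLSTM output with hidden size 5 per direction
encoding = torch.randn(7, 2 * 5)
fenc, benc = encoding.chunk(2, dim=-1)  # split into forward/backward directions

# pointers from the worked example above
fidx, bidx = torch.tensor([2, 6]), torch.tensor([0, 4])

token_repr = torch.cat([fenc[fidx], benc[bidx]], dim=-1)
assert token_repr.shape == (2, 10)  # one 10-dimensional vector per original token
```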
From 11a347d0ff9de4a455e40d63810042c40007cff7 Mon Sep 17 00:00:00 2001
From: speedcell4
Date: Tue, 12 May 2020 15:50:30 +0900
Subject: [PATCH 60/66] Feat: Add head_pack and prepend_pack

---
 torchglyph/functional.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/torchglyph/functional.py b/torchglyph/functional.py
index 484699a..e963971 100644
--- a/torchglyph/functional.py
+++ b/torchglyph/functional.py
@@ -1,5 +1,6 @@
 import functools
-from typing import Any, Union, Tuple, Dict
+from typing import Any
+from typing import Union, Tuple, Dict
 
 import torch
 from torch import Tensor
@@ -29,3 +30,16 @@ def forward(self, x: Union[Tensor, PackedSequence], *args, **kwargs) -> Union[Tensor, PackedSequence]:
             return x._replace(data=forward_fn(self, x.data, *args, **kwargs))
 
         return type(name, bases, {**attrs, 'forward': forward})
+
+
+def head_pack(pack: PackedSequence) -> Tensor:
+    return pack.data[:pack.batch_sizes[0].item()]
+
+
+def prepend_pack(pack: PackedSequence, value: Union[int, bool, float, Tensor]) -> PackedSequence:
+    if not torch.is_tensor(value):
+        value = torch.full_like(head_pack(pack), fill_value=value)
+    return pack._replace(
+        data=torch.cat([value, pack.data], dim=0),
+        batch_sizes=torch.cat([pack.batch_sizes[:1], pack.batch_sizes], dim=0),
+    )

From ba8670dba5f60231fc5cd79098ea4ab1ae3d3d9b Mon Sep 17 00:00:00 2001
From: speedcell4
Date: Tue, 12 May 2020 15:56:24 +0900
Subject: [PATCH 61/66] Feat: Add SupportPack and rename SupportPackMeta

---
 torchglyph/functional.py   | 15 ++++++++++++++-
 torchglyph/nn/embedding.py |  4 ++--
 2 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/torchglyph/functional.py b/torchglyph/functional.py
index e963971..e1ed678 100644
--- a/torchglyph/functional.py
+++ b/torchglyph/functional.py
@@ -4,6 +4,7 @@
 
 import torch
 from torch import Tensor
+from torch import nn
 from torch.nn.utils.rnn import PackedSequence
 
 
@@ -18,7 +19,19 @@ def wrap(x: Union[Tensor, PackedSequence], *args, **kwargs) -> Union[Tensor, PackedSequence]:
     return wrap
 
 
-class SupportPack(type):
+class SupportPack(nn.Module):
+    def __init__(self, module: nn.Module) -> None:
+        super(SupportPack, self).__init__()
+        self.module = module
+
+    def __repr__(self) -> str:
+        return f'Packed{self.module.__repr__()}'
+
+    def forward(self, x: Union[Tensor, PackedSequence], *args, **kwargs) -> Union[Tensor, PackedSequence]:
+        return support_pack(self.module)(x)
+
+
+class SupportPackMeta(type):
     def __new__(cls, name: str, bases: Tuple[type, ...], attrs: Dict[str, Any]):
         forward_fn = attrs.get('forward', bases[0].forward)
 
diff --git a/torchglyph/nn/embedding.py b/torchglyph/nn/embedding.py
index de93ce7..f97096c 100644
--- a/torchglyph/nn/embedding.py
+++ b/torchglyph/nn/embedding.py
@@ -5,10 +5,10 @@
 from torch import nn, Tensor
 from torch.nn.utils.rnn import pack_padded_sequence, PackedSequence
 
-from torchglyph.functional import SupportPack
+from torchglyph.functional import SupportPackMeta
 
 
-class TokEmbedding(nn.Embedding, metaclass=SupportPack):
+class TokEmbedding(nn.Embedding, metaclass=SupportPackMeta):
     pass
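The new `SupportPack` wrapper lifts an element-wise module so that it maps over `PackedSequence.data` while leaving the pack layout untouched. A hypothetical usage sketch (it assumes this revision of `torchglyph.functional` is importable):

```python
import torch
from torch import nn
from torch.nn.utils.rnn import pack_sequence

from torchglyph.functional import SupportPack  # the class added in PATCH 61

wrapped = SupportPack(nn.Linear(8, 4))
pack = pack_sequence([torch.randn(3, 8), torch.randn(2, 8)], enforce_sorted=True)

out = wrapped(pack)
assert out.data.shape == (5, 4)  # 3 + 2 time steps, each projected to 4 dims
assert torch.equal(out.batch_sizes, pack.batch_sizes)  # pack structure preserved
```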
From 90a3b142864b706ed527615f770fcb1911c87a04 Mon Sep 17 00:00:00 2001
From: speedcell4
Date: Mon, 18 May 2020 12:57:03 +0900
Subject: [PATCH 62/66] Feat: Add tail_pack and append_pack

---
 torchglyph/functional.py | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/torchglyph/functional.py b/torchglyph/functional.py
index e1ed678..5462117 100644
--- a/torchglyph/functional.py
+++ b/torchglyph/functional.py
@@ -5,7 +5,8 @@
 import torch
 from torch import Tensor
 from torch import nn
-from torch.nn.utils.rnn import PackedSequence
+from torch.nn.utils.rnn import PackedSequence, pad_packed_sequence
+from torch.nn.utils.rnn import pack_padded_sequence
 
 
 def support_pack(fn):
@@ -56,3 +57,20 @@ def prepend_pack(pack: PackedSequence, value: Union[int, bool, float, Tensor]) -> PackedSequence:
         data=torch.cat([value, pack.data], dim=0),
         batch_sizes=torch.cat([pack.batch_sizes[:1], pack.batch_sizes], dim=0),
     )
+
+
+def tail_pack(pack: PackedSequence) -> Tensor:
+    data, lengths = pad_packed_sequence(pack, batch_first=True)  # type: (Tensor, Tensor)
+    indices = torch.arange(lengths.size(0), dtype=torch.long, device=data.device)
+    return data[indices, lengths - 1]
+
+
+def append_pack(pack: PackedSequence, value: Union[int, bool, float, Tensor]) -> PackedSequence:
+    if not torch.is_tensor(value):
+        value = torch.full_like(head_pack(pack), fill_value=value)
+    data, lengths = pad_packed_sequence(pack, batch_first=True)  # type: (Tensor, Tensor)
+    indices = torch.arange(lengths.size(0), dtype=torch.long, device=data.device)
+    return pack_padded_sequence(
+        torch.cat([data, value[:, None]], dim=1).index_put((indices, lengths), value),
+        lengths + 1, batch_first=True, enforce_sorted=False,
+    )
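Together with `head_pack` and `prepend_pack` from PATCH 60, these helpers make it cheap to add BOS/EOS-style markers to packed batches without unpacking them by hand. A hypothetical usage sketch (again assuming this revision of `torchglyph.functional` is importable):

```python
import torch
from torch.nn.utils.rnn import pack_sequence, pad_packed_sequence

from torchglyph.functional import append_pack, prepend_pack, tail_pack

pack = pack_sequence([torch.tensor([1, 2, 3]), torch.tensor([4, 5])], enforce_sorted=True)
with_bos = prepend_pack(pack, value=0)  # every sequence now starts with 0
with_eos = append_pack(pack, value=9)   # every sequence now ends with 9

data, lengths = pad_packed_sequence(with_bos, batch_first=True)
assert data[:, 0].eq(0).all() and lengths.tolist() == [4, 3]
assert tail_pack(with_eos).eq(9).all()
```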
From 0570d4d4b5b6fec9852530dc6daf5f10a1a896e1 Mon Sep 17 00:00:00 2001
From: speedcell4
Date: Mon, 25 May 2020 17:15:29 +0900
Subject: [PATCH 63/66] Feat: Add unk_idx for TokEmbedding

---
 torchglyph/nn/embedding.py | 28 ++++++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/torchglyph/nn/embedding.py b/torchglyph/nn/embedding.py
index f97096c..0d99723 100644
--- a/torchglyph/nn/embedding.py
+++ b/torchglyph/nn/embedding.py
@@ -2,21 +2,34 @@
 
 import torch
 from einops import rearrange
-from torch import nn, Tensor
-from torch.nn.utils.rnn import pack_padded_sequence, PackedSequence
+from torch import Tensor
+from torch import nn
+from torch.nn.utils.rnn import PackedSequence, pack_sequence, pack_padded_sequence
 
 from torchglyph.functional import SupportPackMeta
 
 
 class TokEmbedding(nn.Embedding, metaclass=SupportPackMeta):
-    pass
+    def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int = None, unk_idx: int = None,
+                 max_norm: float = None, norm_type: float = 2., scale_grad_by_freq: bool = False,
+                 sparse: bool = False, _weight: Tensor = None):
+        super(TokEmbedding, self).__init__(
+            num_embeddings=num_embeddings, embedding_dim=embedding_dim,
+            padding_idx=padding_idx, max_norm=max_norm, norm_type=norm_type,
+            scale_grad_by_freq=scale_grad_by_freq, sparse=sparse, _weight=_weight,
+        )
+        self._unk_idx = unk_idx
+
+    @property
+    def unk(self) -> Tensor:
+        return self.weight[self._unk_idx]
 
 
 class SubLstmEmbedding(nn.Module):
     def __init__(self, num_embeddings: int, embedding_dim: int,
                  hidden_dim: int, dropout: float, num_layers: int = 1,
                  bias: bool = True, batch_first: bool = True,
-                 bidirectional: bool = True, padding_idx: int = None) -> None:
+                 bidirectional: bool = True, padding_idx: int = None, unk_idx: int = None) -> None:
         super(SubLstmEmbedding, self).__init__()
 
         self.embedding = nn.Embedding(
@@ -32,6 +45,13 @@ def __init__(self, num_embeddings: int, embedding_dim: int,
         )
 
         self.embedding_dim = self.rnn.hidden_size * (2 if self.rnn.bidirectional else 1)
+        self._unk_idx = unk_idx
+
+    @property
+    def unk(self) -> Tensor:
+        embedding = self.embedding.weight[None, self._unk_idx]
+        _, (encoding, _) = self.rnn(pack_sequence([embedding], enforce_sorted=True))
+        return rearrange(encoding, '(l d) a h -> l a (d h)', l=self.rnn.num_layers)[0, 0, :]
 
     def _padded_forward(self, sub: Tensor, tok_lengths: Tensor) -> Tensor:
         pack = pack_padded_sequence(

From 3ffe20539ffa273accd6109336896da199a5d245 Mon Sep 17 00:00:00 2001
From: speedcell4
Date: Mon, 25 May 2020 17:26:18 +0900
Subject: [PATCH 64/66] Feat: Add unk_idx for SubLstmEmbedding

---
 torchglyph/nn/embedding.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/torchglyph/nn/embedding.py b/torchglyph/nn/embedding.py
index 0d99723..87fd302 100644
--- a/torchglyph/nn/embedding.py
+++ b/torchglyph/nn/embedding.py
@@ -18,11 +18,11 @@ def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int = None, unk_idx: int = None,
             padding_idx=padding_idx, max_norm=max_norm, norm_type=norm_type,
             scale_grad_by_freq=scale_grad_by_freq, sparse=sparse, _weight=_weight,
         )
-        self._unk_idx = unk_idx
+        self.unk_idx = unk_idx
 
     @property
     def unk(self) -> Tensor:
-        return self.weight[self._unk_idx]
+        return self.weight[self.unk_idx]
 
 
 class SubLstmEmbedding(nn.Module):
@@ -45,11 +45,11 @@ def __init__(self, num_embeddings: int, embedding_dim: int,
         )
 
         self.embedding_dim = self.rnn.hidden_size * (2 if self.rnn.bidirectional else 1)
-        self._unk_idx = unk_idx
+        self.unk_idx = unk_idx
 
     @property
     def unk(self) -> Tensor:
-        embedding = self.embedding.weight[None, self._unk_idx]
+        embedding = self.embedding.weight[None, self.unk_idx]
         _, (encoding, _) = self.rnn(pack_sequence([embedding], enforce_sorted=True))
         return rearrange(encoding, '(l d) a h -> l a (d h)', l=self.rnn.num_layers)[0, 0, :]
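The `unk` property added by these two patches exposes the unknown-token representation as a differentiable view, which is handy as a fallback vector for out-of-vocabulary lookups. A hypothetical usage sketch (assuming this revision of `torchglyph.nn.embedding` is importable):

```python
import torch

from torchglyph.nn.embedding import TokEmbedding  # as defined in PATCH 63/64

emb = TokEmbedding(num_embeddings=100, embedding_dim=16, unk_idx=0)

fallback = emb.unk  # the <unk> row of the embedding matrix
assert fallback.shape == (16,)
assert torch.equal(fallback, emb.weight[0])
```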
From 5a855b4e060c1e3d739b6b25d255e453e2fca8b6 Mon Sep 17 00:00:00 2001
From: speedcell4
Date: Thu, 4 Jun 2020 23:00:26 +0900
Subject: [PATCH 65/66] Fix: Use getLogger

---
 torchglyph/datasets/sequential_labeling.py | 2 +-
 torchglyph/datasets/text_classification.py | 2 +-
 torchglyph/io.py                           | 2 +-
 torchglyph/nn/contextual.py                | 2 +-
 torchglyph/proc/vocab.py                   | 2 +-
 torchglyph/vocab.py                        | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/torchglyph/datasets/sequential_labeling.py b/torchglyph/datasets/sequential_labeling.py
index 85cf6eb..edaa85a 100644
--- a/torchglyph/datasets/sequential_labeling.py
+++ b/torchglyph/datasets/sequential_labeling.py
@@ -13,7 +13,7 @@
 from torchglyph.pipe import PaddedTokSeqPipe, PackedTokBlockPipe
 from torchglyph.proc import ReplaceDigits, Identity, LoadGlove, LoadFastText, Prepend
 
-logger = logging.Logger(__name__)
+logger = logging.getLogger(__name__)
 
 
 class CoNLL2000Chunking(Dataset):
diff --git a/torchglyph/datasets/text_classification.py b/torchglyph/datasets/text_classification.py
index 9789580..a228479 100644
--- a/torchglyph/datasets/text_classification.py
+++ b/torchglyph/datasets/text_classification.py
@@ -12,7 +12,7 @@
 from torchglyph.pipe import PackedTokSeqPipe, TokTensorPipe, RawPipe
 from torchglyph.proc import Identity, LoadGlove
 
-logger = logging.Logger(__name__)
+logger = logging.getLogger(__name__)
 
 
 class AgNews(Dataset):
diff --git a/torchglyph/io.py b/torchglyph/io.py
index cf1cf15..e2b7025 100644
--- a/torchglyph/io.py
+++ b/torchglyph/io.py
@@ -12,7 +12,7 @@
 
 from tqdm import tqdm
 
-logger = logging.Logger(__name__)
+logger = logging.getLogger(__name__)
 
 IO = Union[str, Path, TextIO]
diff --git a/torchglyph/nn/contextual.py b/torchglyph/nn/contextual.py
index af42d5b..9ec87c4 100644
--- a/torchglyph/nn/contextual.py
+++ b/torchglyph/nn/contextual.py
@@ -18,7 +18,7 @@
 toggle_loggers('allennlp', False)
 toggle_loggers('elmoformanylangs', False)
 
-logger = logging.Logger(__name__)
+logger = logging.getLogger(__name__)
 
 
 class ELMoModel(AllenELMo):
diff --git a/torchglyph/proc/vocab.py b/torchglyph/proc/vocab.py
index e3cf351..ad47e9c 100644
--- a/torchglyph/proc/vocab.py
+++ b/torchglyph/proc/vocab.py
@@ -5,7 +5,7 @@
 from torchglyph.proc import Proc
 from torchglyph.vocab import Vocab, Vectors, Glove, FastTest
 
-logger = logging.Logger(__name__)
+logger = logging.getLogger(__name__)
 
 
 class UpdateCounter(Proc):
diff --git a/torchglyph/vocab.py b/torchglyph/vocab.py
index baa9ad5..428dea8 100644
--- a/torchglyph/vocab.py
+++ b/torchglyph/vocab.py
@@ -12,7 +12,7 @@
 
 from torchglyph import data_path
 from torchglyph.io import download_and_unzip
 
-logger = logging.Logger(__name__)
+logger = logging.getLogger(__name__)
 
 
 class Vocab(object):
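The one-line change above matters because `logging.Logger(__name__)` constructs a logger that bypasses the logging manager: it is never returned by later `getLogger` calls and ignores configuration applied to the logging hierarchy, whereas `logging.getLogger(__name__)` registers the logger once and reuses it. A small sketch of the difference:

```python
import logging

detached = logging.Logger('torchglyph.demo')    # not registered with the manager
managed = logging.getLogger('torchglyph.demo')  # registered and reused

assert detached is not logging.getLogger('torchglyph.demo')
assert managed is logging.getLogger('torchglyph.demo')
```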
From 843837a2e03e1f21015b2238c93becce85f09413 Mon Sep 17 00:00:00 2001
From: speedcell4
Date: Mon, 27 Jul 2020 22:15:16 +0900
Subject: [PATCH 66/66] Feat: Update Github Action settings

---
 .github/workflows/mkdocs.yml         |  8 +++----
 .github/workflows/python-publish.yml | 31 ++++++++++++++++++++++++++++
 .github/workflows/unit-tests.yml     |  4 ++--
 README.md                            |  3 ++-
 setup.py                             |  2 +-
 5 files changed, 40 insertions(+), 8 deletions(-)
 create mode 100644 .github/workflows/python-publish.yml

diff --git a/.github/workflows/mkdocs.yml b/.github/workflows/mkdocs.yml
index f3634bc..5e041c6 100644
--- a/.github/workflows/mkdocs.yml
+++ b/.github/workflows/mkdocs.yml
@@ -1,4 +1,4 @@
-name: mkdocs
+name: Build Document by Mkdocs
 on:
   push:
     branches:
@@ -10,9 +10,9 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout develop
-        uses: actions/checkout@v1
+        uses: actions/checkout@v2
       - name: Set up Python 3.7
-        uses: actions/setup-python@v1
+        uses: actions/setup-python@v2
        with:
           python-version: 3.7
       - name: Install dependencies
@@ -22,5 +22,5 @@ jobs:
       - name: Deploy
         uses: peaceiris/actions-gh-pages@v3
         with:
-          github_token: ${{ secrets.PERSONAL_TOKEN }}
+          github_token: ${{ secrets.GITHUB_TOKEN }}
           publish_dir: ./site
\ No newline at end of file
diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
new file mode 100644
index 0000000..7ba9c7b
--- /dev/null
+++ b/.github/workflows/python-publish.yml
@@ -0,0 +1,31 @@
+# This workflow will upload a Python Package using Twine when a release is created
+# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
+
+name: Upload Python Package
+
+on:
+  release:
+    types: [created]
+
+jobs:
+  deploy:
+
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: '3.7'
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install setuptools wheel twine
+      - name: Build and publish
+        env:
+          TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
+          TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
+        run: |
+          python setup.py sdist bdist_wheel
+          twine upload dist/*
diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
index 0915eec..9f4cd89 100644
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@@ -1,4 +1,4 @@
-name: unit-tests
+name: Unit Tests
 
 on: [push]
 
@@ -14,7 +14,7 @@ jobs:
     steps:
       - uses: actions/checkout@v2
       - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v1
+        uses: actions/setup-python@v2
         with:
           python-version: "${{ matrix.python-version }}"
       - name: Install dependencies
diff --git a/README.md b/README.md
index 5365e39..eb43acc 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,7 @@
 # TorchGlyph
 
-[![Actions Status](https://github.com/speedcell4/torchglyph/workflows/unit-tests/badge.svg)](https://github.com/speedcell4/torchglyph/actions)
+![Unit Tests](https://github.com/speedcell4/torchglyph/workflows/Unit%20Tests/badge.svg)
+![Upload Python Package](https://github.com/speedcell4/torchglyph/workflows/Upload%20Python%20Package/badge.svg)
 
 ## Requirements
diff --git a/setup.py b/setup.py
index 414abe1..eddd429 100644
--- a/setup.py
+++ b/setup.py
@@ -4,7 +4,7 @@
 setup(
     name=name,
-    version='0.1.0',
+    version='0.1.1',
     packages=[package for package in find_packages() if package.startswith(name)],
     url=f'https://speedcell4.github.io/torchglyph',
     license='MIT',