diff --git a/.github/workflows/mkdocs.yml b/.github/workflows/mkdocs.yml
new file mode 100644
index 0000000..5e041c6
--- /dev/null
+++ b/.github/workflows/mkdocs.yml
@@ -0,0 +1,26 @@
+name: Build Docs with MkDocs
+on:
+  push:
+    branches:
+      - develop
+
+jobs:
+  build:
+    name: Deploy docs
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout develop
+        uses: actions/checkout@v2
+      - name: Set up Python 3.7
+        uses: actions/setup-python@v2
+        with:
+          python-version: 3.7
+      - name: Install dependencies
+        run: python -m pip install -e '.[docs]'
+      - name: Build
+        run: mkdocs build
+      - name: Deploy
+        uses: peaceiris/actions-gh-pages@v3
+        with:
+          github_token: ${{ secrets.GITHUB_TOKEN }}
+          publish_dir: ./site
\ No newline at end of file
diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
new file mode 100644
index 0000000..7ba9c7b
--- /dev/null
+++ b/.github/workflows/python-publish.yml
@@ -0,0 +1,31 @@
+# This workflow will upload a Python Package using Twine when a release is created
+# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
+
+name: Upload Python Package
+
+on:
+  release:
+    types: [created]
+
+jobs:
+  deploy:
+
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: '3.7'
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install setuptools wheel twine
+      - name: Build and publish
+        env:
+          TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
+          TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
+        run: |
+          python setup.py sdist bdist_wheel
+          twine upload dist/*
diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
index ef5ea86..9f4cd89 100644
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@@ -1,4 +1,4 @@
-name: unit-tests
+name: Unit Tests
 
 on: [push]
 
@@ -7,17 +7,21 @@ jobs:
 
     runs-on: ubuntu-latest
 
+    strategy:
+      matrix:
+        python-version: ["3.7"]
+
     steps:
-      - uses: actions/checkout@v1
-      - name: Set up Python 3.7
-        uses: actions/setup-python@v1
+      - uses: actions/checkout@v2
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v2
         with:
-          python-version: 3.7
+          python-version: "${{ matrix.python-version }}"
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
           python -m pip install torch
-          python -m pip install -e '.[dev]'
+          python -m pip install -e '.[dev, ctx]'
      - name: Test with pytest
        run: |
          python -m pytest tests
diff --git a/.gitignore b/.gitignore
index a4b0cc6..5149fdb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -640,3 +640,4 @@ GitHub.sublime-settings
 *.ptx
 *.cubin
 *.fatbin
+!/site/
diff --git a/README.md b/README.md
index 5365e39..eb43acc 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,7 @@
 # TorchGlyph
 
-[![Actions Status](https://github.com/speedcell4/torchglyph/workflows/unit-tests/badge.svg)](https://github.com/speedcell4/torchglyph/actions)
+![Unit Tests](https://github.com/speedcell4/torchglyph/workflows/Unit%20Tests/badge.svg)
+![Upload Python Package](https://github.com/speedcell4/torchglyph/workflows/Upload%20Python%20Package/badge.svg)
 
 ## Requirements
diff --git a/docs/index.md b/docs/index.md
new file mode 100644
index 0000000..f1a0311
--- /dev/null
+++ b/docs/index.md
@@ -0,0 +1,59 @@
+# Welcome to TorchGlyph
+
+Data Processor Combinators for Natural Language Processing
+
+[![Actions Status](https://github.com/speedcell4/torchglyph/workflows/Unit%20Tests/badge.svg)](https://github.com/speedcell4/torchglyph/actions)
+
+## Installation
+
+Simply run this command in your terminal,
+
+```bash
+pip install torchglyph
+```
+
+## Quickstart
+
+The atomic data processor of TorchGlyph is called a `Proc`. The composition operator `+` is provided to build a complex `Proc` out of two simpler `Proc`s.
+
+```python
+ToLower() + ReplaceDigits(repl_token='<digits>')
+```
+
+Composed `Proc`s act like data `Pipe`lines, where raw textual data is processed incrementally. According to their stages, they are roughly categorized into four groups:
+
++ `pre` for processing *before* building the vocabulary;
++ `vocab` for building and updating the *vocabulary*;
++ `post` for processing *after* building the vocabulary;
++ `batch` for collating examples into *batches*.
+
+Defining the `Pipe`s of your dataset is the first step in building it. You can build a `Pipe` from scratch,
+
+```python
+class PackedIdxSeqPipe(Pipe):
+    def __init__(self, device, dtype=torch.long) -> None:
+        super(PackedIdxSeqPipe, self).__init__(
+            pre=None,
+            vocab=None,
+            post=ToTensor(dtype=dtype),
+            batch=PackSeq(enforce_sorted=False) + ToDevice(device=device),
+        )
+```
+
+or you can simply manipulate existing `Pipe`s by calling the `.with_` method.
+
+```python
+class PackedTokSeqPipe(PackedIdxSeqPipe):
+    def __init__(self, device, unk_token, special_tokens=(),
+                 threshold=THRESHOLD, dtype=torch.long) -> None:
+        super(PackedTokSeqPipe, self).__init__(device=device, dtype=dtype)
+        self.with_(
+            pre=UpdateCounter(),
+            vocab=[
+                BuildVocab(unk_token=unk_token, pad_token=None,
+                           special_tokens=special_tokens),
+                StatsVocab(threshold=threshold),
+            ],
+            post=Numbering() + ...,
+        )
+```
\ No newline at end of file
diff --git a/mkdocs.yml b/mkdocs.yml
new file mode 100644
index 0000000..07f06a4
--- /dev/null
+++ b/mkdocs.yml
@@ -0,0 +1,4 @@
+site_name: TorchGlyph
+nav:
+  - Home: index.md
+theme: alabaster
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 736ca61..eddd429 100644
--- a/setup.py
+++ b/setup.py
@@ -1,18 +1,16 @@
 from setuptools import setup, find_packages
 
-with open('README.md', 'r', encoding='utf-8') as fp:
-    long_description = fp.read()
+name = 'torchglyph'
 
 setup(
-    name='torchglyph',
-    version='0.1.0',
-    packages=find_packages(),
-    url='https://github.com/speedcell4/torchglyph',
+    name=name,
+    version='0.1.1',
+    packages=[package for package in find_packages() if package.startswith(name)],
+    url='https://speedcell4.github.io/torchglyph',
     license='MIT',
     author='speedcell4',
     author_email='speedcell4@gmail.com',
     description='Data Processor Combinators for Natural Language Processing',
-    long_description=long_description,
     install_requires=[
         'tqdm',
         'numpy',
@@ -23,5 +21,14 @@
         'pytest',
         'hypothesis',
     ],
+        'ctx': [
+            'transformers',
+            'allennlp',
+            'elmoformanylangs',
+        ],
+        'docs': [
+            'mkdocs',
+            'mkdocs-alabaster',
+        ]
     }
 )
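The Quickstart above composes `Proc`s with `+` and splices them into an existing `Pipe` via `.with_`. As a minimal sketch (not part of the patch, assuming the pipe defaults used by the datasets in this diff, e.g. `device=-1` and the `'<unk>'`/`'<digits>'` tokens):

```python
# Sketch: composing Procs as described in docs/index.md above.
from torchglyph.pipe import PackedTokSeqPipe
from torchglyph.proc import ReplaceDigits

word = PackedTokSeqPipe(device=-1, unk_token='<unk>').with_(
    # '+' composes Procs; '...' splices in the Procs already registered on the pipe
    pre=ReplaceDigits(repl_token='<digits>') + ...,
)
```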
diff --git a/tests/test_datasets/test_sequential_labeling.py b/tests/test_datasets/test_sequential_labeling.py
index d76f233..fab9eb7 100644
--- a/tests/test_datasets/test_sequential_labeling.py
+++ b/tests/test_datasets/test_sequential_labeling.py
@@ -1,14 +1,29 @@
-from torchglyph.datasets.sequential_labeling import CoNLL2000Chunking, CoNLL2003NER
+from torchglyph.datasets import CoNLL2000Chunking, CoNLL2003NER
+from torchglyph.datasets import SemEval2010T1NERCatalan, SemEval2010T1NERSpanish
 
 
-def test_conll2000_chunking() -> None:
-    train, test = CoNLL2000Chunking.new(batch_size=1, word_dim=None)
-    assert len(train) == 8936
-    assert len(test) == 2012
+def test_conll2000_chunking():
+    train, test = CoNLL2000Chunking.new(batch_size=1, word_dim=None, remove_missing=True)
+    assert len(train.dataset) == 8936
+    assert len(test.dataset) == 2012
 
 
-def test_conll2003_ner() -> None:
-    train, dev, test = CoNLL2003NER.new(batch_size=1, word_dim=None)
-    assert len(train) == 14987
-    assert len(dev) == 3466
-    assert len(test) == 3684
+def test_conll2003_ner():
+    train, dev, test = CoNLL2003NER.new(batch_size=1, word_dim=None, remove_missing=True)
+    assert len(train.dataset) == 14987
+    assert len(dev.dataset) == 3466
+    assert len(test.dataset) == 3684
+
+
+def test_semeval2010_catalan():
+    train, dev, test = SemEval2010T1NERCatalan.new(batch_size=1, word_dim=None, remove_missing=True)
+    assert len(train.dataset) == 8709
+    assert len(dev.dataset) == 1445
+    assert len(test.dataset) == 1698
+
+
+def test_semeval2010_spanish():
+    train, dev, test = SemEval2010T1NERSpanish.new(batch_size=1, word_dim=None, remove_missing=True)
+    assert len(train.dataset) == 9022
+    assert len(dev.dataset) == 1419
+    assert len(test.dataset) == 1705
diff --git a/tests/test_datasets/test_text_classification.py b/tests/test_datasets/test_text_classification.py
index 587e0c1..f130e80 100644
--- a/tests/test_datasets/test_text_classification.py
+++ b/tests/test_datasets/test_text_classification.py
@@ -2,6 +2,6 @@
 
 
 def test_agnews():
-    train, test = AgNews.new(batch_size=1, word_dim=None)
+    train, test = AgNews.new(batch_size=1, word_dim=None, remove_missing=True)
     assert len(train) == 120000
     assert len(test) == 7600
diff --git a/tests/test_nn/test_connection.py b/tests/test_nn/test_connection.py
new file mode 100644
index 0000000..05550a3
--- /dev/null
+++ b/tests/test_nn/test_connection.py
@@ -0,0 +1,44 @@
+import torch
+from hypothesis import given, strategies as st
+from torch import nn
+
+from torchglyph.nn.connection import ResNorm, DenseNorm, ReZero
+
+
+@given(
+    batch_sizes=st.lists(st.integers(1, 10), min_size=0, max_size=4),
+    input_dim=st.integers(1, 20),
+)
+def test_resnorm_shape_grad(batch_sizes, input_dim):
+    layer = ResNorm(input_dim=input_dim, sub_layer=nn.Linear(input_dim, input_dim))
+    x = torch.rand((*batch_sizes, input_dim), requires_grad=True)
+    y = layer(x)
+
+    assert y.size() == (*batch_sizes, layer.output_dim)
+    assert y.requires_grad
+
+
+@given(
+    batch_sizes=st.lists(st.integers(1, 10), min_size=0, max_size=4),
+    input_dim=st.integers(1, 20),
+)
+def test_densenorm_shape_grad(batch_sizes, input_dim):
+    layer = DenseNorm(input_dim=input_dim, sub_layer=nn.Linear(input_dim, input_dim))
+    x = torch.rand((*batch_sizes, input_dim), requires_grad=True)
+    y = layer(x)
+
+    assert y.size() == (*batch_sizes, layer.output_dim)
+    assert y.requires_grad
+
+
+@given(
+    batch_sizes=st.lists(st.integers(1, 10), min_size=0, max_size=4),
+    input_dim=st.integers(1, 20),
+)
+def test_rezero_shape_grad(batch_sizes, input_dim):
+    layer = ReZero(input_dim=input_dim, sub_layer=nn.Linear(input_dim, input_dim))
+    x = torch.rand((*batch_sizes, input_dim), requires_grad=True)
+    y = layer(x)
+
+    assert y.size() == (*batch_sizes, layer.output_dim)
+    assert y.requires_grad
diff --git a/torchglyph/__init__.py b/torchglyph/__init__.py
index f1f93f6..3c126bf 100644
--- a/torchglyph/__init__.py
+++ b/torchglyph/__init__.py
@@ -4,7 +4,7 @@
 import torch
 from torch.nn.utils.rnn import PackedSequence
 
-data_path = Path.home() / '.torchglyph'
+data_path = (Path.home() / '.torchglyph').expanduser().absolute()
 if not data_path.exists():
     data_path.mkdir(parents=True, exist_ok=True)
diff --git a/torchglyph/dataset.py b/torchglyph/dataset.py
index e451fbe..e7f11d3 100644
--- a/torchglyph/dataset.py
+++ b/torchglyph/dataset.py
@@ -2,7 +2,7 @@
 import uuid
 from collections import namedtuple
 from pathlib import Path
-from typing import Iterable, Any, TextIO
+from typing import Iterable, Any, TextIO, Optional
 from typing import Union, List, Type, Tuple, NamedTuple, Dict
 
 from torch.utils import data
@@ -14,6 +14,7 @@
 
 
 class Dataset(data.Dataset):
+    name: Optional[str]
     urls: List[Union[Tuple[str, ...]]]
 
     def __init__(self, pipes: List[Dict[str, Pipe]], **load_kwargs) -> None:
@@ -62,14 +63,16 @@ def collate_fn(self, batch: List[NamedTuple]) -> NamedTuple:
 
     @classmethod
     def paths(cls, root: Path = data_path) -> Tuple[Path, ...]:
+        root = root / getattr(cls, 'name', cls.__name__).lower()
+
         ans = []
         for url, name, *filenames in cls.urls:
             if len(filenames) == 0:
                 filenames = [name]
-            if any(not (root / cls.__name__.lower() / n).exists() for n in filenames):
-                download_and_unzip(url, root / cls.__name__.lower() / name)
-            for n in filenames:
-                ans.append(root / cls.__name__.lower() / n)
+            if any(not (root / filename).exists() for filename in filenames):
+                download_and_unzip(url, root / name)
+            for filename in filenames:
+                ans.append(root / filename)
 
         return tuple(ans)
diff --git a/torchglyph/datasets/__init__.py b/torchglyph/datasets/__init__.py
index b5b186c..0c2ab4f 100644
--- a/torchglyph/datasets/__init__.py
+++ b/torchglyph/datasets/__init__.py
@@ -1,2 +1,3 @@
 from torchglyph.datasets.sequential_labeling import CoNLL2000Chunking, CoNLL2003NER
+from torchglyph.datasets.sequential_labeling import SemEval2010T1NERCatalan, SemEval2010T1NERSpanish
 from torchglyph.datasets.text_classification import AgNews
diff --git a/torchglyph/datasets/sequential_labeling.py b/torchglyph/datasets/sequential_labeling.py
index f91ed5c..edaa85a 100644
--- a/torchglyph/datasets/sequential_labeling.py
+++ b/torchglyph/datasets/sequential_labeling.py
@@ -1,14 +1,19 @@
 import logging
 from pathlib import Path
-from typing import Iterable, List, Any, Tuple, Optional, NamedTuple, TextIO
+from typing import Iterable, Any
+from typing import Optional, List, Tuple, NamedTuple
+from typing import TextIO
 
 from tqdm import tqdm
 
 from torchglyph.dataset import Dataset, DataLoader
 from torchglyph.formats import conllx
-from torchglyph.pipe import PackedTokSeqPipe, SeqLengthTensorPipe, RawPipe, PackedTokPtrSeqPipe
+from torchglyph.pipe import PackedTokSeqPipe, SeqLengthTensorPipe, RawPipe, PackedTokPtrSeqPipe, PackedPtrSeqPipe, \
+    ToSubList, UpdateCounter, Lift
 from torchglyph.pipe import PaddedTokSeqPipe, PackedTokBlockPipe
-from torchglyph.proc import ToLower, ReplaceDigits, Identity, LoadGlove
+from torchglyph.proc import ReplaceDigits, Identity, LoadGlove, LoadFastText, Prepend
+
+logger = logging.getLogger(__name__)
 
 
 class CoNLL2000Chunking(Dataset):
@@ -20,7 +25,7 @@ class CoNLL2000Chunking(Dataset):
     @classmethod
     def load(cls, path: Path) -> Iterable[List[Any]]:
         for sent in tqdm(conllx.load(path, sep=' '), desc=f'reading {path}'):
-            word, pos, chunk = list(zip(*sent))
+            word, pos, chunk = map(list, zip(*sent))
             yield [word, pos, chunk]
 
     def dump(self, fp: TextIO, batch: NamedTuple, prediction: List[List[int]], *args, **kwargs) -> None:
@@ -31,10 +36,15 @@ def dump(self, fp: TextIO, batch: NamedTuple, prediction: List[List[int]], *args
         conllx.dump(zip(raw_word, raw_pos, raw_chunk, pred_chunk), fp, sep=' ')
 
     @classmethod
-    def new(cls, batch_size: int, word_dim: Optional[int], device: int = -1) -> Tuple[DataLoader, ...]:
+    def new(cls, batch_size: int, word_dim: Optional[int],
+            remove_missing: bool, device: int = -1) -> Tuple[DataLoader, ...]:
+        if word_dim is not None:
+            vectors = LoadGlove(name='6B', dim=word_dim, remove_missing=remove_missing)
+        else:
+            vectors = Identity()
         word = PackedTokSeqPipe(device=device, unk_token='<unk>').with_(
-            pre=ToLower() + ReplaceDigits(repl_token='<digits>') + ...,
-            vocab=... + (Identity() if word_dim is None else LoadGlove('6B', word_dim, str.lower)),
+            pre=ReplaceDigits(repl_token='<digits>') + ...,
+            vocab=... + vectors,
         )
         length = SeqLengthTensorPipe(device=device)
         char = PackedTokBlockPipe(device=device, unk_token='<unk>')
@@ -53,7 +63,7 @@ def new(cls, batch_size: int, word_dim: Optional[int], device: int = -1) -> Tupl
         test = cls(path=test, pipes=pipes)
 
         for name, pipe in train.pipes.items():
-            logging.info(f'{name} => {pipe}')
+            logger.info(f'{name} => {pipe}')
 
         word.build_vocab(train, test, name='word')
         char.build_vocab(train, test, name='char')
@@ -76,7 +86,7 @@ class CoNLL2003NER(Dataset):
     @classmethod
     def load(cls, path: Path) -> Iterable[List[Any]]:
         for sent in tqdm(conllx.load(path, sep=' '), desc=f'reading {path}', unit=' sents'):
-            word, pos, chunk, ner = list(zip(*sent))
+            word, pos, chunk, ner = map(list, zip(*sent))
             yield [word, pos, chunk, ner]
 
     def dump(self, fp: TextIO, batch: NamedTuple, prediction: List[List[int]], *args, **kwargs) -> None:
@@ -87,10 +97,15 @@ def dump(self, fp: TextIO, batch: NamedTuple, prediction: List[List[int]], *args
         conllx.dump(zip(raw_word, raw_pos, raw_chunk, raw_ner, pred_ner), fp, sep=' ')
 
     @classmethod
-    def new(cls, batch_size: int, word_dim: Optional[int], device: int = -1) -> Tuple[DataLoader, ...]:
+    def new(cls, batch_size: int, word_dim: Optional[int],
+            remove_missing: bool, device: int = -1) -> Tuple[DataLoader, ...]:
+        if word_dim is not None:
+            vectors = LoadGlove(name='6B', dim=word_dim, remove_missing=remove_missing)
+        else:
+            vectors = Identity()
         word = PackedTokSeqPipe(device=device, unk_token='<unk>').with_(
-            pre=ToLower() + ReplaceDigits(repl_token='<digits>') + ...,
-            vocab=... + (Identity() if word_dim is None else LoadGlove(name='6B', dim=word_dim)),
+            pre=ReplaceDigits(repl_token='<digits>') + ...,
+            vocab=... + vectors,
        )
        length = SeqLengthTensorPipe(device=device)
        char = PackedTokBlockPipe(device=device, unk_token='<unk>')
@@ -112,7 +127,7 @@ def new(cls, batch_size: int, word_dim: Optional[int], device: int = -1) -> Tupl
         test = cls(path=test, pipes=pipes)
 
         for name, pipe in train.pipes.items():
-            logging.info(f'{name} => {pipe}')
+            logger.info(f'{name} => {pipe}')
 
         word.build_vocab(train, dev, test, name='word')
         char.build_vocab(train, dev, test, name='char')
@@ -124,3 +139,88 @@
         (train, dev, test),
         batch_size=batch_size, shuffle=True,
     )
+
+
+class SemEval2010T1NER(Dataset):
+    lang: str
+
+    @classmethod
+    def load(cls, path: Path, **kwargs) -> Iterable[Any]:
+        for sent in tqdm(conllx.load(path, sep='\t'), desc=f'reading {path}', unit=' sentences'):
+            _, word, _, pos, _, _, head, drel, _, _, ner = map(list, zip(*sent))
+            yield [word, pos, [int(h) for h in head], drel, ner]
+
+    def dump(self, fp: TextIO, batch: NamedTuple, prediction: List[Any], *args, **kwargs) -> None:
+        ner_vocab = self.pipes['ner'].vocab.itos
+        for raw_word, raw_pos, raw_ner, pred in \
+                zip(batch.raw_word, batch.raw_pos, batch.raw_ner, prediction):
+            assert len(raw_word) == len(raw_pos) == len(raw_ner) == len(pred)
+
+            pred_ner = [ner_vocab[p] for p in pred]
+            conllx.dump(zip(raw_word, raw_pos, raw_ner, pred_ner), fp, sep=' ')
+
+    @classmethod
+    def new(cls, batch_size: int, word_dim: Optional[int],
+            remove_missing: bool, device: int = -1) -> Tuple['DataLoader', ...]:
+        if word_dim is not None:
+            vectors = LoadFastText(str.lower, lang=cls.lang, remove_missing=remove_missing)
+        else:
+            vectors = Identity()
+        word = PackedTokSeqPipe(device=device, unk_token='<unk>').with_(
+            pre=Prepend('<root>', 1) + ReplaceDigits(repl_token='<digits>') + ...,
+            vocab=... + vectors,
+        )
+        length = SeqLengthTensorPipe(device=device).with_(pre=Prepend('<root>', 1) + ...)
+        char = PackedTokBlockPipe(device=device, unk_token='<unk>').with_(
+            pre=ToSubList() + Lift(Prepend('<root>', 1)) + Lift(UpdateCounter()),
+        )
+        word_ptr = PackedTokPtrSeqPipe(device=device, reverse=False).with_(pre=Prepend(0, 1) + ...)
+        pos = PackedTokSeqPipe(device=device, unk_token='<unk>').with_(pre=Prepend('<root>', 1) + ...)
+        head = PackedPtrSeqPipe(device=device).with_(pre=Prepend(0, 1) + ...)
+        drel = PackedTokSeqPipe(device=device, unk_token='root').with_(pre=Prepend('<root>', 1) + ...)
+        ner = PaddedTokSeqPipe(device=device, unk_token='O', pad_token='O')
+
+        pipes = [
+            dict(word=word, length=length, char=char, word_ptr=word_ptr, raw_word=RawPipe()),
+            dict(pos=pos, raw_pos=RawPipe()),
+            dict(head=head),
+            dict(drel=drel, raw_drel=RawPipe()),
+            dict(ner=ner, raw_ner=RawPipe()),
+        ]
+
+        train, dev, test = cls.paths()
+        train = cls(path=train, pipes=pipes)
+        dev = cls(path=dev, pipes=pipes)
+        test = cls(path=test, pipes=pipes)
+
+        for name, pipe in train.pipes.items():
+            logger.info(f'{name} => {pipe}')
+
+        word.build_vocab(train, dev, test, name='word')
+        char.build_vocab(train, dev, test, name='char')
+        pos.build_vocab(train, name='pos')
+        drel.build_vocab(train, name='drel')
+        ner.build_vocab(train, name='ner')
+
+        return DataLoader.new(
+            (train, dev, test),
+            batch_size=batch_size, shuffle=True,
+        )
+
+
+class SemEval2010T1NERCatalan(SemEval2010T1NER):
+    urls = [
+        ('https://www.dropbox.com/s/nqedh3zmk5k80n7/train.sd.conllx?dl=1', 'train.sd.conllx'),
+        ('https://www.dropbox.com/s/027umbuks3njwry/dev.sd.conllx?dl=1', 'dev.sd.conllx'),
+        ('https://www.dropbox.com/s/ldwn6z1xl5vki4y/test.sd.conllx?dl=1', 'test.sd.conllx'),
+    ]
+    lang = 'ca'
+
+
+class SemEval2010T1NERSpanish(SemEval2010T1NER):
+    urls = [
+        ('https://www.dropbox.com/s/lyxgvc161ai20v0/train.sd.conllx?dl=1', 'train.sd.conllx'),
+        ('https://www.dropbox.com/s/8tmbi7ki6ctasez/dev.sd.conllx?dl=1', 'dev.sd.conllx'),
+        ('https://www.dropbox.com/s/nnj94hdmlq3jjm8/test.sd.conllx?dl=1', 'test.sd.conllx'),
+    ]
+    lang = 'es'
diff --git a/torchglyph/datasets/text_classification.py b/torchglyph/datasets/text_classification.py
index 013709f..a228479 100644
--- a/torchglyph/datasets/text_classification.py
+++ b/torchglyph/datasets/text_classification.py
@@ -12,6 +12,8 @@
 from torchglyph.pipe import PackedTokSeqPipe, TokTensorPipe, RawPipe
 from torchglyph.proc import Identity, LoadGlove
 
+logger = logging.getLogger(__name__)
+
 
 class AgNews(Dataset):
     urls = [
@@ -39,9 +41,14 @@ def dump(self, fp: TextIO, batch: NamedTuple, prediction: List[int], *args, **kw
         csv.dump((' '.join(raw_title), ' '.join(raw_text), raw_target, vocab.itos[pred]), fp)
 
     @classmethod
-    def new(cls, batch_size: int, word_dim: Optional[int], device: int = -1) -> Tuple['DataLoader', ...]:
+    def new(cls, batch_size: int, word_dim: Optional[int],
+            remove_missing: bool, device: int = -1) -> Tuple['DataLoader', ...]:
+        if word_dim is not None:
+            vectors = LoadGlove(name='6B', dim=word_dim, remove_missing=remove_missing)
+        else:
+            vectors = Identity()
         word = PackedTokSeqPipe(device=device, unk_token='<unk>').with_(
-            vocab=... + (Identity() if word_dim is None else LoadGlove(name='6B', dim=word_dim)),
+            vocab=... + vectors,
        )
        target = TokTensorPipe(device=device, unk_token=None)
@@ -56,7 +63,7 @@ def new(cls, batch_size: int, word_dim: Optional[int], device: int = -1) -> Tupl
         test = cls(path=test, target_vocab=target_vocab, pipes=pipes)
 
         for name, pipe in train.pipes.items():
-            logging.info(f'{name} => {pipe}')
+            logger.info(f'{name} => {pipe}')
 
         word.build_vocab(train, test, name='word')
         target.build_vocab(train, test, name='target')
diff --git a/torchglyph/functional.py b/torchglyph/functional.py
index ce2ea73..5462117 100644
--- a/torchglyph/functional.py
+++ b/torchglyph/functional.py
@@ -1,9 +1,12 @@
 import functools
-from typing import Union, Tuple, Dict, Any
+from typing import Any
+from typing import Union, Tuple, Dict
 
 import torch
 from torch import Tensor
-from torch.nn.utils.rnn import PackedSequence
+from torch import nn
+from torch.nn.utils.rnn import PackedSequence, pad_packed_sequence
+from torch.nn.utils.rnn import pack_padded_sequence
 
 
 def support_pack(fn):
@@ -17,9 +20,21 @@ def wrap(x: Union[Tensor, PackedSequence], *args, **kwargs) -> Union[Tensor, Pac
     return wrap
 
 
-class SupportPack(type):
+class SupportPack(nn.Module):
+    def __init__(self, module: nn.Module) -> None:
+        super(SupportPack, self).__init__()
+        self.module = module
+
+    def __repr__(self) -> str:
+        return f'Packed{self.module.__repr__()}'
+
+    def forward(self, x: Union[Tensor, PackedSequence], *args, **kwargs) -> Union[Tensor, PackedSequence]:
+        return support_pack(self.module)(x, *args, **kwargs)
+
+
+class SupportPackMeta(type):
     def __new__(cls, name: str, bases: Tuple[type, ...], attrs: Dict[str, Any]):
-        forward_fn = bases[0].forward
+        forward_fn = attrs.get('forward', bases[0].forward)
 
         @functools.wraps(forward_fn)
         def forward(self, x: Union[Tensor, PackedSequence], *args, **kwargs) -> Union[Tensor, PackedSequence]:
@@ -29,3 +44,33 @@ def forward(self, x: Union[Tensor, PackedSequence], *args, **kwargs) -> Union[Te
             return x._replace(data=forward_fn(self, x.data, *args, **kwargs))
 
         return type(name, bases, {**attrs, 'forward': forward})
+
+
+def head_pack(pack: PackedSequence) -> Tensor:
+    return pack.data[:pack.batch_sizes[0].item()]
+
+
+def prepend_pack(pack: PackedSequence, value: Union[int, bool, float, Tensor]) -> PackedSequence:
+    if not torch.is_tensor(value):
+        value = torch.full_like(head_pack(pack), fill_value=value)
+    return pack._replace(
+        data=torch.cat([value, pack.data], dim=0),
+        batch_sizes=torch.cat([pack.batch_sizes[:1], pack.batch_sizes], dim=0),
+    )
+
+
+def tail_pack(pack: PackedSequence) -> Tensor:
+    data, lengths = pad_packed_sequence(pack, batch_first=True)  # type: (Tensor, Tensor)
+    indices = torch.arange(lengths.size(0), dtype=torch.long, device=data.device)
+    return data[indices, lengths - 1]
+
+
+def append_pack(pack: PackedSequence, value: Union[int, bool, float, Tensor]) -> PackedSequence:
+    if not torch.is_tensor(value):
+        value = torch.full_like(head_pack(pack), fill_value=value)
+    data, lengths = pad_packed_sequence(pack, batch_first=True)  # type: (Tensor, Tensor)
+    indices = torch.arange(lengths.size(0), dtype=torch.long, device=data.device)
+    return pack_padded_sequence(
+        torch.cat([data, value[:, None]], dim=1).index_put((indices, lengths), value),
+        lengths + 1, batch_first=True, enforce_sorted=False,
+    )
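The new `PackedSequence` helpers are easiest to see on a tiny batch. A quick sanity check (not part of the patch):

```python
# head_pack reads the first timestep of every sequence; prepend_pack adds one.
import torch
from torch.nn.utils.rnn import pack_sequence

from torchglyph.functional import head_pack, prepend_pack, tail_pack

pack = pack_sequence([torch.tensor([1, 2, 3]), torch.tensor([4, 5])], enforce_sorted=False)
pack = prepend_pack(pack, 0)  # sequences become [0, 1, 2, 3] and [0, 4, 5]
assert head_pack(pack).tolist() == [0, 0]
assert tail_pack(pack).tolist() == [3, 5]
```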
diff --git a/torchglyph/io.py b/torchglyph/io.py
index fc018d1..e2b7025 100644
--- a/torchglyph/io.py
+++ b/torchglyph/io.py
@@ -1,30 +1,32 @@
 import gzip
 import logging
 import os
+import re
 import shutil
 import tarfile
 import zipfile
 from contextlib import contextmanager
 from pathlib import Path
-from typing import Union, TextIO
+from typing import Union, TextIO, Pattern
 from urllib.request import urlretrieve
 
 from tqdm import tqdm
 
+logger = logging.getLogger(__name__)
+
 IO = Union[str, Path, TextIO]
 
 
 @contextmanager
 def open_io(f: IO, mode: str, encoding: str):
-    if isinstance(f, (str, Path)):
-        fp = open(f, mode=mode, encoding=encoding)
-    else:
-        fp = f
     try:
-        yield fp
+        if isinstance(f, (str, Path)):
+            with open(f, mode=mode, encoding=encoding) as fp:
+                yield fp
+        else:
+            yield f
     finally:
-        if isinstance(f, Path):
-            fp.close()
+        pass
 
 
 # copied and modified from https://github.com/pytorch/text
@@ -50,7 +52,10 @@ def inner(b=1, bsize=1, tsize=None) -> None:
 
 
 # copied and modified from https://github.com/pytorch/text
-def download_and_unzip(url: str, dest: Path) -> None:
+def download_and_unzip(url: str, dest: Path) -> Path:
+    if dest.exists():
+        return dest
+
     if not dest.parent.exists():
         dest.parent.mkdir(parents=True, exist_ok=True)
 
@@ -62,14 +67,23 @@ def download_and_unzip(url: str, dest: Path) -> None:
         raise err
 
     if dest.suffix == '.zip':
-        logging.info(f'extracting {dest}')
+        logger.info(f'extracting {dest}')
         with zipfile.ZipFile(dest, "r") as fp:
             fp.extractall(path=dest.parent)
-    elif dest.suffixes[:-2] == ['.tar', '.gz']:
-        logging.info(f'extracting {dest}')
+    elif dest.suffixes[-2:] == ['.tar', '.gz']:
+        logger.info(f'extracting {dest}')
         with tarfile.open(dest, 'r:gz') as fp:
             fp.extractall(path=dest.parent)
     elif dest.suffix == '.gz':
-        with gzip.open(dest, mode='rb') as fsrc:
-            with dest.with_suffix('').open(mode='wb') as fdst:
-                shutil.copyfileobj(fsrc, fdst)
+        logger.info(f'extracting {dest}')
+        with gzip.open(dest, mode='rb') as fs:
+            with dest.with_suffix('').open(mode='wb') as fd:
+                shutil.copyfileobj(fs, fd)
+
+    return dest
+
+
+def toggle_loggers(pattern: Union[str, Pattern], enable: bool) -> None:
+    for name in logging.root.manager.loggerDict:  # type: str
+        if re.match(pattern, name) is not None:
+            logging.getLogger(name).disabled = not enable
diff --git a/torchglyph/nn/__init__.py b/torchglyph/nn/__init__.py
index a879623..249b291 100644
--- a/torchglyph/nn/__init__.py
+++ b/torchglyph/nn/__init__.py
@@ -1,2 +1,2 @@
-from torchglyph.nn.embedding import *
-from torchglyph.nn.rnn import *
+from torchglyph.nn.embedding import TokEmbedding, SubLstmEmbedding
+from torchglyph.nn.rnn import ContextualLSTM
diff --git a/torchglyph/nn/connection.py b/torchglyph/nn/connection.py
new file mode 100644
index 0000000..81b2df2
--- /dev/null
+++ b/torchglyph/nn/connection.py
@@ -0,0 +1,81 @@
+from typing import Union
+
+import torch
+from torch import Tensor
+from torch import nn
+from torch.nn.utils.rnn import PackedSequence
+
+
+class ResNorm(nn.Module):
+    """
+    https://www.cv-foundation.org/openaccess/content_cvpr_2016/papers/He_Deep_Residual_Learning_CVPR_2016_paper.pdf
+    """
+
+    def __init__(self, input_dim: int, *, sub_layer: nn.Module) -> None:
+        super(ResNorm, self).__init__()
+        self.input_dim = input_dim
+        self.output_dim = input_dim
+
+        self.sub_layer = sub_layer
+        self.layer_norm = nn.LayerNorm(input_dim)
+
+    def forward(self, x: Union[Tensor, PackedSequence], *args, **kwargs) -> Union[Tensor, PackedSequence]:
+        z = self.sub_layer(x, *args, **kwargs)
+        if torch.is_tensor(z):
+            return self.layer_norm(x + z)
+        elif isinstance(z, PackedSequence):
+            return z._replace(data=self.layer_norm(x.data + z.data))
+        else:
+            raise NotImplementedError
+
+
+class DenseNorm(nn.Module):
+    """
+    http://openaccess.thecvf.com/content_cvpr_2017/papers/Huang_Densely_Connected_Convolutional_CVPR_2017_paper.pdf
+    """
+
+    def __init__(self, input_dim: int, *, sub_layer: nn.Module) -> None:
+        super(DenseNorm, self).__init__()
+        self.input_dim = input_dim
+        self.output_dim = input_dim * 2
+
+        self.sub_layer = sub_layer
+        self.layer_norm = nn.LayerNorm(input_dim * 2)
+
+    def forward(self, x: Union[Tensor, PackedSequence], *args, **kwargs) -> Union[Tensor, PackedSequence]:
+        z = self.sub_layer(x, *args, **kwargs)
+        if torch.is_tensor(z):
+            return self.layer_norm(torch.cat([x, z], dim=-1))
+        elif isinstance(z, PackedSequence):
+            return z._replace(data=self.layer_norm(torch.cat([x.data, z.data], dim=-1)))
+        else:
+            raise NotImplementedError
+
+
+class ReZero(nn.Module):
+    """
+    https://arxiv.org/pdf/2003.04887.pdf
+    """
+
+    def __init__(self, input_dim: int, *, sub_layer: nn.Module) -> None:
+        super(ReZero, self).__init__()
+        self.input_dim = input_dim
+        self.output_dim = input_dim
+
+        self.sub_layer = sub_layer
+        self.scale = nn.Parameter(
+            torch.tensor([0.], dtype=torch.float32),
+            requires_grad=True,
+        )
+
+    def extra_repr(self) -> str:
+        return f'(scale): Parameter({self.scale.data})'
+
+    def forward(self, x: Union[Tensor, PackedSequence], *args, **kwargs) -> Union[Tensor, PackedSequence]:
+        z = self.sub_layer(x, *args, **kwargs)
+        if torch.is_tensor(z):
+            return x + z * self.scale
+        elif isinstance(z, PackedSequence):
+            return z._replace(data=x.data + z.data * self.scale)
+        else:
+            raise NotImplementedError
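Usage sketch for the new connection wrappers (not part of the patch): because `ReZero` initializes its residual scale to zero, a freshly constructed block is the identity map.

```python
import torch
from torch import nn

from torchglyph.nn.connection import ReZero

layer = ReZero(input_dim=8, sub_layer=nn.Linear(8, 8))
x = torch.rand(4, 8)
assert torch.equal(layer(x), x)  # scale == 0 at initialization
assert layer.output_dim == 8     # ResNorm/ReZero keep the width; DenseNorm doubles it
```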
diff --git a/torchglyph/nn/contextual.py b/torchglyph/nn/contextual.py
new file mode 100644
index 0000000..9ec87c4
--- /dev/null
+++ b/torchglyph/nn/contextual.py
@@ -0,0 +1,222 @@
+import json
+import logging
+from pathlib import Path
+from typing import List
+from typing import Union
+
+from allennlp.modules import Elmo as AllenELMo
+from elmoformanylangs.elmo import read_list, create_batches, recover
+from elmoformanylangs.frontend import Model
+from elmoformanylangs.modules.embedding_layer import EmbeddingLayer
+from torch import Tensor
+from torch.nn.utils.rnn import PackedSequence, pack_padded_sequence
+from torch.nn.utils.rnn import pack_sequence
+
+from torchglyph import data_path
+from torchglyph.io import download_and_unzip, toggle_loggers
+
+toggle_loggers('allennlp', False)
+toggle_loggers('elmoformanylangs', False)
+
+logger = logging.getLogger(__name__)
+
+
+class ELMoModel(AllenELMo):
+    root = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/'
+    name = {
+        'small': '2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_',
+        'medium': '2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_',
+        'original': '2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_',
+        '5.5B': '2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_',
+    }
+
+    def __init__(self, *, options_file: str, weight_file: str, pack_output, **kwargs) -> None:
+        logger.info(f'loading pretrained {self.__class__.__name__} from {weight_file}')
+
+        super(ELMoModel, self).__init__(
+            options_file=options_file, weight_file=weight_file, **kwargs,
+        )
+
+        self.pack_output = pack_output
+        self.encoding_dim = self.get_output_dim()
+
+    @classmethod
+    def fetch(cls, weight: str):
+        elmo_path = data_path / cls.__name__.lower()
+        options_file = download_and_unzip(
+            url=cls.root + (cls.name[weight] + 'options.json'),
+            dest=elmo_path / (cls.name[weight] + 'options.json'),
+        )
+        weight_file = download_and_unzip(
+            url=cls.root + (cls.name[weight] + 'weights.hdf5'),
+            dest=elmo_path / (cls.name[weight] + 'weights.hdf5'),
+        )
+        return options_file, weight_file
+
+    @classmethod
+    def from_pretrained(cls, weight: str, pack_output: bool = True,
+                        num_output_representations: int = 1,
+                        dropout: float = 0., freeze: bool = True) -> 'ELMoModel':
+        options_file, weight_file = cls.fetch(weight=weight)
+        return cls(
+            options_file=str(options_file), weight_file=str(weight_file),
+            num_output_representations=num_output_representations,
+            requires_grad=not freeze, dropout=dropout, pack_output=pack_output,
+        )
+
+    def extra_repr(self) -> str:
+        args = [
+            f'encoding_dim={self.encoding_dim}',
+            f'num_layers={self._elmo_lstm.num_layers}',
+            f'dropout={self._dropout.p}',
+        ]
+        if not self._elmo_lstm._requires_grad:
+            args.append('frozen')
+        return ', '.join(args)
+
+    def __repr__(self) -> str:
+        return f'{self.__class__.__name__}({self.extra_repr()})'
+
+    def forward(self, batch: Tensor, word_inputs: Tensor = None) -> Union[Tensor, PackedSequence]:
+        outputs = super(ELMoModel, self).forward(batch, word_inputs=word_inputs)
+        elmo_representations, *_ = outputs['elmo_representations']
+        if not self.pack_output:
+            return elmo_representations
+        else:
+            lengths = outputs['mask'].long().sum(dim=-1)
+            return pack_padded_sequence(
+                elmo_representations, lengths,
+                batch_first=True, enforce_sorted=False,
+            )
+
+
+class ELMoForManyLanguages(Model):
+    root = 'http://vectors.nlpl.eu/repository/11/'
+    configs = [
+        'https://raw.githubusercontent.com/HIT-SCIR/ELMoForManyLangs/master/configs/cnn_0_100_512_4096_sample.json',
+        'https://raw.githubusercontent.com/HIT-SCIR/ELMoForManyLangs/master/configs/cnn_50_100_512_4096_sample.json',
+    ]
+    names = {
+        'ca': '138',
+        'es': '145',
+        'zh': '179',
+    }
+
+    def __init__(self, *, options_file: Path, weight_file: Path, pack_output: bool, requires_grad: bool) -> None:
+        with options_file.open('r', encoding='utf-8') as fp:
+            config = json.load(fp)
+
+        if config['token_embedder']['char_dim'] > 0:
+            char_lexicon = {}
+            with (weight_file / 'char.dic').open('r', encoding='utf-8') as fp:
+                for raw in fp:
+                    tokens = raw.strip().split('\t')
+                    if len(tokens) == 1:
+                        tokens.insert(0, '\u3000')
+                    token, index = tokens
+                    char_lexicon[token] = int(index)
+            char_emb_layer = EmbeddingLayer(
+                config['token_embedder']['char_dim'], char_lexicon,
+                fix_emb=False, embs=None,
+            )
+        else:
+            char_lexicon = None
+            char_emb_layer = None
+        if config['token_embedder']['word_dim'] > 0:
+            word_lexicon = {}
+            with (weight_file / 'word.dic').open('r', encoding='utf-8') as fp:
+                for raw in fp:
+                    tokens = raw.strip().split('\t')
+                    if len(tokens) == 1:
+                        tokens.insert(0, '\u3000')
+                    token, index = tokens
+                    word_lexicon[token] = int(index)
+            word_emb_layer = EmbeddingLayer(
+                config['token_embedder']['word_dim'], word_lexicon,
+                fix_emb=False, embs=None,
+            )
+        else:
+            word_lexicon = None
+            word_emb_layer = None
+
+        super(ELMoForManyLanguages, self).__init__(
+            config=config, word_emb_layer=word_emb_layer,
+            char_emb_layer=char_emb_layer, use_cuda=False,
+        )
+        self.load_model(path=weight_file)
+        self.char_lexicon = char_lexicon
+        self.word_lexicon = word_lexicon
+
+        self.lang = weight_file.name
+        self.requires_grad = requires_grad
+        self.pack_output = pack_output
+        self.encoding_dim = self.output_dim * 2
+
+    @classmethod
+    def fetch(cls, lang: str):
+        download_and_unzip(
+            url=cls.configs[0],
+            dest=data_path / cls.__name__.lower() / 'configs' / Path(cls.configs[0]).name,
+        )
+        download_and_unzip(
+            url=cls.configs[1],
+            dest=data_path / cls.__name__.lower() / 'configs' / Path(cls.configs[1]).name,
+        )
+        return download_and_unzip(
+            url=cls.root + f'{cls.names[lang]}.zip',
+            dest=data_path / cls.__name__.lower() / lang / f'{lang}.zip',
+        ).parent
+
+    @classmethod
+    def from_pretrained(cls, lang: str, pack_output: bool = True, freeze: bool = True) -> 'ELMoForManyLanguages':
+        path = cls.fetch(lang=lang)
+
+        with (path / 'config.json').open('r', encoding='utf-8') as fp:
+            args = json.load(fp)
+        return cls(
+            options_file=path / args['config_path'], requires_grad=not freeze,
+            weight_file=path, pack_output=pack_output,
+        )
+
+    def extra_repr(self) -> str:
+        args = [
+            f'lang={self.lang}', f'encoding_dim={self.encoding_dim}',
+            f'word_vocab={len(self.word_lexicon) if self.word_lexicon is not None else None}',
+            f'char_vocab={len(self.char_lexicon) if self.char_lexicon is not None else None}',
+        ]
+        if not self.requires_grad:
+            args.append('frozen')
+        return ', '.join(args)
+
+    def __repr__(self) -> str:
+        return f'{self.__class__.__name__}({self.extra_repr()})'
+
+    def forward(self, batch: List[List[str]], output_layer: int = -1) -> Union[Tensor, PackedSequence]:
+        if self.config['token_embedder']['name'].lower() == 'cnn':
+            pad, text = read_list(batch, self.config['token_embedder']['max_characters_per_token'])
+        else:
+            pad, text = read_list(batch)
+
+        pad_w, pad_c, pad_ln, pad_mask, pad_text, recover_idx = create_batches(
+            pad, len(text), self.word_lexicon, self.char_lexicon, self.config, text=text)
+
+        ans = []
+        for word, char, length, mask, pads in zip(pad_w, pad_c, pad_ln, pad_mask, pad_text):
+            output = super(ELMoForManyLanguages, self).forward(word, char, mask)
+            for index, text in enumerate(pads):
+                if self.config['encoder']['name'].lower() == 'lstm':
+                    data = output[index, 1:length[index] - 1, :]
+                elif self.config['encoder']['name'].lower() == 'elmo':
+                    data = output[:, index, 1:length[index] - 1, :]
+
+                if output_layer == -1:
+                    payload = data.mean(dim=0)
+                else:
+                    payload = data[output_layer]
+                ans.append(payload if self.requires_grad else payload.detach())
+
+        ans = recover(ans, recover_idx)
+        if self.pack_output:
+            ans = pack_sequence(ans, enforce_sorted=False)
+        return ans
diff --git a/torchglyph/nn/embedding.py b/torchglyph/nn/embedding.py
index b227825..87fd302 100644
--- a/torchglyph/nn/embedding.py
+++ b/torchglyph/nn/embedding.py
@@ -1,22 +1,35 @@
-from typing import Union
+from typing import Union, Tuple
 
 import torch
 from einops import rearrange
-from torch import nn, Tensor
-from torch.nn.utils.rnn import pack_padded_sequence, PackedSequence
+from torch import Tensor
+from torch import nn
+from torch.nn.utils.rnn import PackedSequence, pack_sequence, pack_padded_sequence
 
-from torchglyph.functional import SupportPack
+from torchglyph.functional import SupportPackMeta
 
 
-class TokEmbedding(nn.Embedding, metaclass=SupportPack):
-    pass
+class TokEmbedding(nn.Embedding, metaclass=SupportPackMeta):
+    def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int = None, unk_idx: int = None,
+                 max_norm: float = None, norm_type: float = 2., scale_grad_by_freq: bool = False,
+                 sparse: bool = False, _weight: Tensor = None):
+        super(TokEmbedding, self).__init__(
+            num_embeddings=num_embeddings, embedding_dim=embedding_dim,
+            padding_idx=padding_idx, max_norm=max_norm, norm_type=norm_type,
+            scale_grad_by_freq=scale_grad_by_freq, sparse=sparse, _weight=_weight,
+        )
+        self.unk_idx = unk_idx
+
+    @property
+    def unk(self) -> Tensor:
+        return self.weight[self.unk_idx]
 
 
 class SubLstmEmbedding(nn.Module):
     def __init__(self, num_embeddings: int, embedding_dim: int,
                  hidden_dim: int, dropout: float, num_layers: int = 1,
                  bias: bool = True, batch_first: bool = True,
-                 bidirectional: bool = True, padding_idx: int = None) -> None:
+                 bidirectional: bool = True, padding_idx: int = None, unk_idx: int = None) -> None:
         super(SubLstmEmbedding, self).__init__()
 
         self.embedding = nn.Embedding(
@@ -32,6 +45,13 @@ def __init__(self, num_embeddings: int, embedding_dim: int,
         )
 
         self.embedding_dim = self.rnn.hidden_size * (2 if self.rnn.bidirectional else 1)
+        self.unk_idx = unk_idx
+
+    @property
+    def unk(self) -> Tensor:
+        embedding = self.embedding.weight[None, self.unk_idx]
+        _, (encoding, _) = self.rnn(pack_sequence([embedding], enforce_sorted=True))
+        return rearrange(encoding, '(l d) a h -> l a (d h)', l=self.rnn.num_layers)[0, 0, :]
 
     def _padded_forward(self, sub: Tensor, tok_lengths: Tensor) -> Tensor:
         pack = pack_padded_sequence(
@@ -39,7 +59,7 @@ def _padded_forward(self, sub: Tensor, tok_lengths: Tensor) -> Tensor:
             rearrange(tok_lengths.clamp_min(1), 'a b -> (a b)'),
             batch_first=self.rnn.batch_first, enforce_sorted=False,
         )
-
+
         embedding = pack._replace(data=self.dropout(self.embedding(pack.data)))
         _, (encoding, _) = self.rnn(embedding)
 
@@ -57,3 +77,33 @@ def forward(self, sub: Union[Tensor, PackedSequence], *args) -> Union[Tensor, Pa
             return self._padded_forward(sub, *args)
         else:
             return self._packed_forward(sub, *args)
+
+
+class ContiguousSubLstmEmbedding(nn.Module):
+    def __init__(self, num_embeddings: int, embedding_dim: int,
+                 hidden_dim: int, dropout: float, num_layers: int = 1,
+                 bias: bool = True, batch_first: bool = True,
+                 bidirectional: bool = True, padding_idx: int = None) -> None:
+        super(ContiguousSubLstmEmbedding, self).__init__()
+
+        self.embedding = nn.Embedding(
+            num_embeddings=num_embeddings,
+            embedding_dim=embedding_dim,
+            padding_idx=padding_idx,
+        )
+        self.dropout = nn.Dropout(dropout)
+        self.rnn = nn.LSTM(
+            input_size=self.embedding.embedding_dim,
+            hidden_size=hidden_dim, num_layers=num_layers, bias=bias,
+            batch_first=batch_first, bidirectional=bidirectional,
+        )
+
+        self.embedding_dim = self.rnn.hidden_size * (2 if self.rnn.bidirectional else 1)
+
+    def forward(self, sub: PackedSequence, indices: Tuple[PackedSequence, PackedSequence]) -> PackedSequence:
+        embedding = sub._replace(data=self.dropout(self.embedding(sub.data)))
+        encoding, _ = self.rnn(embedding)  # type: (PackedSequence, _)
+
+        fidx, bidx = indices
+        fenc, benc = encoding.data.chunk(2, dim=-1)
+        return fidx._replace(data=torch.cat([fenc[fidx.data], benc[bidx.data]], dim=-1))
diff --git a/torchglyph/pipe/contiguous.py b/torchglyph/pipe/contiguous.py
new file mode 100644
index 0000000..25e0ae8
--- /dev/null
+++ b/torchglyph/pipe/contiguous.py
@@ -0,0 +1,33 @@
+from typing import Union, Optional, Tuple
+
+import torch
+
+from torchglyph.pipe import PackedTokSeqPipe
+from torchglyph.pipe import Pipe
+from torchglyph.pipe import THRESHOLD
+from torchglyph.proc import GetLength, Lift, ToTensor
+from torchglyph.proc.collecting import ToDevice
+from torchglyph.proc.contiguous import BuildContiguousSub, BuildContiguousSubPtr, PackContiguousSubPtr
+
+
+class PackedContiguousSubPipe(PackedTokSeqPipe):
+    def __init__(self, device: Union[int, torch.device], unk_token: Optional[str],
+                 seq_token: str, special_tokens: Tuple[Optional[str], ...] = (),
+                 threshold: int = THRESHOLD, dtype: torch.dtype = torch.long) -> None:
+        super(PackedContiguousSubPipe, self).__init__(
+            device=device, unk_token=unk_token, special_tokens=special_tokens,
+            threshold=threshold, dtype=dtype,
+        )
+        self.with_(
+            pre=BuildContiguousSub(seq_token=seq_token) + ...,
+        )
+
+
+class PackedContiguousSubPtrPipe(Pipe):
+    def __init__(self, device: Union[int, torch.device], dtype: torch.dtype = torch.long) -> None:
+        super(PackedContiguousSubPtrPipe, self).__init__(
+            pre=Lift(GetLength()) + BuildContiguousSubPtr() + Lift(ToTensor(dtype=dtype)),
+            vocab=None,
+            post=None,
+            batch=PackContiguousSubPtr(enforce_sorted=False) + ToDevice(device=device),
+        )
diff --git a/torchglyph/pipe/ctx.py b/torchglyph/pipe/ctx.py
new file mode 100644
index 0000000..58ae589
--- /dev/null
+++ b/torchglyph/pipe/ctx.py
@@ -0,0 +1,18 @@
+from typing import Union
+
+import torch
+
+from torchglyph.pipe import Pipe
+from torchglyph.proc import ToDevice
+from torchglyph.proc.ctx import PadELMo
+from torchglyph.proc.tokenizer import ELMoTokenizer
+
+
+class ELMoPipe(Pipe):
+    def __init__(self, device: Union[int, torch.device]):
+        super(ELMoPipe, self).__init__(
+            pre=ELMoTokenizer(),
+            vocab=None,
+            post=None,
+            batch=PadELMo() + ToDevice(device=device),
+        )
diff --git a/torchglyph/pipe/seq.py b/torchglyph/pipe/seq.py
index a5b9ff2..f0f7c6e 100644
--- a/torchglyph/pipe/seq.py
+++ b/torchglyph/pipe/seq.py
@@ -90,7 +90,7 @@ class PackedSeqPtrSeqPipe(PackedIdxSeqPipe):
     def __init__(self, device: Union[int, torch.device], dtype: torch.dtype = torch.long) -> None:
         super(PackedSeqPtrSeqPipe, self).__init__(device=device, dtype=dtype)
         self.with_(
-            pre=GetMask(token=0),
+            post=GetMask(token=0) + ...,
             batch=Scan(fn=cum_seq, init=0) + ...,
         )
diff --git a/torchglyph/proc/collecting.py b/torchglyph/proc/collecting.py
index 1ce3fc5..2783c6f 100644
--- a/torchglyph/proc/collecting.py
+++ b/torchglyph/proc/collecting.py
@@ -1,5 +1,6 @@
 from typing import Any, Union, List, Tuple
 
+import numpy as np
 import torch
 from torch import Tensor
 from torch.nn.utils.rnn import pad_sequence, PackedSequence, pack_sequence, pad_packed_sequence
@@ -9,7 +10,8 @@
 
 
 class ToDevice(Proc):
-    Batch = Union[Tensor, PackedSequence, Tuple[Union[Tensor, PackedSequence], ...]]
+    Item = Union[int, float, bool, Tensor, PackedSequence]
+    Batch = Union[Item, Tuple[Item, ...]]
 
     def __init__(self, device: Union[int, torch.device]) -> None:
         super(ToDevice, self).__init__()
@@ -24,9 +26,11 @@ def extra_repr(self) -> str:
         return f'{self.device}'
 
     def __call__(self, batch: Batch, vocab: Vocab, **kwargs) -> Batch:
-        if isinstance(batch, (PackedSequence, Tensor)):
-            return batch.to(self.device)
-        return type(batch)([self(e, vocab=vocab) for e in batch])
+        if isinstance(batch, (Tensor, PackedSequence)):
+            return batch.to(device=self.device)
+        if isinstance(batch, (list, tuple)):
+            return type(batch)([self(e, vocab=vocab) for e in batch])
+        return batch
 
 
 class ToTensor(Proc):
@@ -39,7 +43,9 @@ def extra_repr(self) -> str:
 
     def __call__(self, data: Any, **kwargs) -> Tensor:
         try:
-            return torch.tensor(data, dtype=self.dtype, requires_grad=False)
+            if isinstance(data, np.ndarray):
+                return torch.from_numpy(data).to(dtype=self.dtype).requires_grad_(False)
+            return torch.tensor(data, dtype=self.dtype).requires_grad_(False)
         except ValueError as err:
             if err.args[0] == "too many dimensions 'str'":
                 raise ValueError(f"'{data}' can not be converted to {Tensor.__name__}")
diff --git a/torchglyph/proc/contiguous.py b/torchglyph/proc/contiguous.py
new file mode 100644
index 0000000..8cd5b11
--- /dev/null
+++ b/torchglyph/proc/contiguous.py
@@ -0,0 +1,59 @@
+from typing import List, Tuple
+
+import torch
+from torch import Tensor
+from torch.nn.utils.rnn import PackedSequence
+from torch.nn.utils.rnn import pack_sequence
+from torch.nn.utils.rnn import pad_packed_sequence
+
+from torchglyph.proc.abc import Proc
+
+
+class BuildContiguousSub(Proc):
+    def __init__(self, seq_token: str) -> None:
+        super(BuildContiguousSub, self).__init__()
+        self.seq_token = seq_token
+
+    def extra_repr(self) -> str:
+        return repr(self.seq_token)
+
+    def __call__(self, tokens: List[str], **kwargs) -> List[str]:
+        zs = []
+        for token in tokens:
+            zs.extend(list(token))
+            zs.append(self.seq_token)
+        return zs[:-1]
+
+
+class BuildContiguousSubPtr(Proc):
+    def __call__(self, lengths: List[int], **kwargs) -> Tuple[List[int], List[int]]:
+        indices = [0]
+        for length in lengths:
+            indices.append(indices[-1] + length + 1)
+        return [index - 2 for index in indices[1:]], indices[:-1]
+
+
+class PackContiguousSubPtr(Proc):
+    def __init__(self, enforce_sorted: bool) -> None:
+        super(PackContiguousSubPtr, self).__init__()
+        self.enforce_sorted = enforce_sorted
+
+    def extra_repr(self) -> str:
+        return f'enforce_sorted={self.enforce_sorted}'
+
+    def __call__(self, indices: List[Tuple[Tensor, Tensor]], **kwargs) -> Tuple[PackedSequence, PackedSequence]:
+        fidx, bidx = zip(*indices)
+
+        pack = pack_sequence([
+            torch.empty((f.max().item() + 1,), dtype=torch.long) for f in fidx
+        ], enforce_sorted=self.enforce_sorted)
+        indices = pack._replace(data=torch.arange(pack.data.size(0), device=pack.data.device))
+        indices, _ = pad_packed_sequence(indices, batch_first=True)
+
+        fidx = pack_sequence([
+            indices[i, f] for i, f in enumerate(fidx)
+        ], enforce_sorted=self.enforce_sorted)
+        bidx = pack_sequence([
+            indices[i, b] for i, b in enumerate(bidx)
+        ], enforce_sorted=self.enforce_sorted)
+        return fidx, bidx
diff --git a/torchglyph/proc/ctx.py b/torchglyph/proc/ctx.py
new file mode 100644
index 0000000..0e2b786
--- /dev/null
+++ b/torchglyph/proc/ctx.py
@@ -0,0 +1,17 @@
+from typing import List
+
+from allennlp.data import Instance as AllenInstance, Vocabulary as AllenVocabulary
+from allennlp.data.dataset import Batch as AllenBatch
+from torch import Tensor
+
+from torchglyph.io import toggle_loggers
+from torchglyph.proc import Proc
+
+toggle_loggers('allennlp', False)
+
+
+class PadELMo(Proc):
+    def __call__(self, data: List[AllenInstance], *args, **kwargs) -> Tensor:
+        batch = AllenBatch(data)
+        batch.index_instances(AllenVocabulary())
+        return batch.as_tensor_dict()['elmo']['character_ids']
diff --git a/torchglyph/proc/infer.py b/torchglyph/proc/infer.py
new file mode 100644
index 0000000..d83faf1
--- /dev/null
+++ b/torchglyph/proc/infer.py
@@ -0,0 +1,9 @@
+from typing import List
+
+from torchglyph.proc import Proc
+from torchglyph.vocab import Vocab
+
+
+class RevVocab(Proc):
+    def __call__(self, xs: List[int], vocab: Vocab, **kwargs) -> List[str]:
+        return [vocab.itos[x] for x in xs]
diff --git a/torchglyph/proc/tokenizer.py b/torchglyph/proc/tokenizer.py
new file mode 100644
index 0000000..8d244dc
--- /dev/null
+++ b/torchglyph/proc/tokenizer.py
@@ -0,0 +1,102 @@
+from typing import Union, List
+
+import transformers
+from allennlp.data import Token as AllenToken, Instance as AllenInstance
+from allennlp.data.fields import TextField as AllenTextField
+from allennlp.data.token_indexers import ELMoTokenCharactersIndexer
+
+from torchglyph.io import toggle_loggers
+from torchglyph.proc import Proc
+
+toggle_loggers('allennlp', False)
+toggle_loggers('transformers', False)
+
+
+class ELMoTokenizer(Proc):
+    def __init__(self) -> None:
+        super(ELMoTokenizer, self).__init__()
+        self.tokenizer = ELMoTokenCharactersIndexer()
+
+    def __call__(self, data: List[str], *args, **kwargs):
+        data = [AllenToken(token) for token in data]
+        return AllenInstance({"elmo": AllenTextField(data, {'character_ids': self.tokenizer})})
+
+
+class TransformerTokenizerProc(Proc):
+    def __init__(self, weight: str) -> None:
+        super(TransformerTokenizerProc, self).__init__()
+        self.weight = weight
+
+    def extra_repr(self) -> str:
+        return f'weight={self.weight}'
+
+    def __call__(self, data: Union[str, List[str]], **kwargs) -> List[int]:
+        if not isinstance(data, str):
+            data = ' '.join(data)
+        return self.tokenizer.encode(data)
+
+
+class BertTokenizer(TransformerTokenizerProc):
+    def __init__(self, weight: str = 'bert-base-uncased') -> None:
+        super(BertTokenizer, self).__init__(weight=weight)
+        self.tokenizer = transformers.BertTokenizer.from_pretrained(weight)
+
+
+class OpenAIGPTTokenizer(TransformerTokenizerProc):
+    def __init__(self, weight: str = 'openai-gpt') -> None:
+        super(OpenAIGPTTokenizer, self).__init__(weight=weight)
+        self.tokenizer = transformers.OpenAIGPTTokenizer.from_pretrained(weight)
+
+
+class GPT2Tokenizer(TransformerTokenizerProc):
+    def __init__(self, weight: str = 'gpt2') -> None:
+        super(GPT2Tokenizer, self).__init__(weight=weight)
+        self.tokenizer = transformers.GPT2Tokenizer.from_pretrained(weight)
+
+
+class CTRLTokenizer(TransformerTokenizerProc):
+    def __init__(self, weight: str = 'ctrl') -> None:
+        super(CTRLTokenizer, self).__init__(weight=weight)
+        self.tokenizer = transformers.CTRLTokenizer.from_pretrained(weight)
+
+
+class TransfoXLTokenizer(TransformerTokenizerProc):
+    def __init__(self, weight: str = 'transfo-xl-wt103') -> None:
+        super(TransfoXLTokenizer, self).__init__(weight=weight)
+        self.tokenizer = transformers.TransfoXLTokenizer.from_pretrained(weight)
+
+
+class XLNetTokenizer(TransformerTokenizerProc):
+    def __init__(self, weight: str = 'xlnet-base-cased') -> None:
+        super(XLNetTokenizer, self).__init__(weight=weight)
+        self.tokenizer = transformers.XLNetTokenizer.from_pretrained(weight)
+
+
+class XLMTokenizer(TransformerTokenizerProc):
+    def __init__(self, weight: str = 'xlm-mlm-enfr-1024') -> None:
+        super(XLMTokenizer, self).__init__(weight=weight)
+        self.tokenizer = transformers.XLMTokenizer.from_pretrained(weight)
+
+
+class DistilBertTokenizer(TransformerTokenizerProc):
+    def __init__(self, weight: str = 'distilbert-base-cased') -> None:
+        super(DistilBertTokenizer, self).__init__(weight=weight)
+        self.tokenizer = transformers.DistilBertTokenizer.from_pretrained(weight)
+
+
+class RobertaTokenizer(TransformerTokenizerProc):
+    def __init__(self, weight: str = 'roberta-base') -> None:
+        super(RobertaTokenizer, self).__init__(weight=weight)
+        self.tokenizer = transformers.RobertaTokenizer.from_pretrained(weight)
+
+
+class XLMRobertaTokenizer(TransformerTokenizerProc):
+    def __init__(self, weight: str = 'xlm-roberta-base') -> None:
+        super(XLMRobertaTokenizer, self).__init__(weight=weight)
+        self.tokenizer = transformers.XLMRobertaTokenizer.from_pretrained(weight)
+
+
+class BartTokenizer(TransformerTokenizerProc):
+    def __init__(self, weight: str = 'bart-large') -> None:
+        super(BartTokenizer, self).__init__(weight=weight)
+        self.tokenizer = transformers.BartTokenizer.from_pretrained(weight)
diff --git a/torchglyph/proc/vocab.py b/torchglyph/proc/vocab.py
index f42f2a3..ad47e9c 100644
--- a/torchglyph/proc/vocab.py
+++ b/torchglyph/proc/vocab.py
@@ -5,6 +5,8 @@
 from torchglyph.proc import Proc
 from torchglyph.vocab import Vocab, Vectors, Glove, FastTest
 
+logger = logging.getLogger(__name__)
+
 
 class UpdateCounter(Proc):
     def __call__(self, data: Union[str, List[str]], counter: Counter, *args, **kwargs) -> Union[str, List[str]]:
@@ -61,53 +63,59 @@ def __call__(self, vocab: Vocab, name: str, *args, **kwargs) -> Vocab:
         occ_avg = sum(vocab.freq.values()) / max(1, tok_cnt)
 
         name = f"{vocab.__class__.__name__} '{name}'"
-        logging.info(f"{name} has {tok_cnt} token(s) => "
-                     f"{occ_avg:.1f} occurrence(s)/token ["
-                     f"{occ_max} :: '{tok_max}', "
-                     f"{occ_min} :: '{tok_min}']")
+        logger.info(f"{name} has {tok_cnt} token(s) => "
+                    f"{occ_avg:.1f} occurrence(s)/token ["
+                    f"{occ_max} :: '{tok_max}', "
+                    f"{occ_min} :: '{tok_min}']")
 
         if tok_cnt <= self.threshold:
-            logging.info(f'{name} => [{", ".join(vocab.itos)}]')
+            logger.info(f'{name} => [{", ".join(vocab.itos)}]')
         else:
-            logging.info(f'{name} => ['
-                         f'{", ".join(vocab.itos[:self.threshold // 2])}, ..., '
-                         f'{", ".join(vocab.itos[-self.threshold // 2:])}]')
+            logger.info(f'{name} => ['
+                        f'{", ".join(vocab.itos[:self.threshold // 2])}, ..., '
+                        f'{", ".join(vocab.itos[-self.threshold // 2:])}]')
 
         return vocab
 
 
 class LoadVectors(Proc):
-    def __init__(self, vectors: Vectors, *fallbacks) -> None:
+    def __init__(self, *fallback_fns, vectors: Vectors, remove_missing: bool) -> None:
         super(LoadVectors, self).__init__()
+        self.fallback_fns = fallback_fns
         self.vectors = vectors
-        self.fallbacks = fallbacks
+        self.remove_missing = remove_missing
 
     def extra_repr(self) -> str:
         return ', '.join([
+            *[f'{f.__name__}' for f in self.fallback_fns],
             f'{self.vectors.extra_repr()}',
-            *[f'{f.__name__}' for f in self.fallbacks],
+            f'remove_missing={self.remove_missing}',
         ])
 
     def __call__(self, vocab: Vocab, name: str, *args, **kwargs) -> Vocab:
         assert vocab is not None, f"did you forget '{BuildVocab.__name__}' before '{LoadVectors.__name__}'?"
-        tok, occ = vocab.load_vectors(self.vectors, *self.fallbacks)
+        if self.remove_missing:
+            vocab = vocab.union(self.vectors, *self.fallback_fns)
+        tok, occ = vocab.load_vectors(*self.fallback_fns, vectors=self.vectors)
         tok = tok / max(1, len(vocab.freq.values())) * 100
         occ = occ / max(1, sum(vocab.freq.values())) * 100
-        logging.info(f"{self.vectors} hits {tok:.1f}% tokens and {occ:.1f}% occurrences of {Vocab.__name__} '{name}'")
+        logger.info(f"{self.vectors} hits {tok:.1f}% tokens and {occ:.1f}% occurrences of {Vocab.__name__} '{name}'")
         return vocab
 
 
 class LoadGlove(LoadVectors):
-    def __init__(self, name: str, dim: int, *fallbacks) -> None:
+    def __init__(self, *fallback_fns, name: str, dim: int, remove_missing: bool) -> None:
         super(LoadGlove, self).__init__(
-            Glove(name=name, dim=dim),
-            *fallbacks,
+            *fallback_fns,
+            vectors=Glove(name=name, dim=dim),
+            remove_missing=remove_missing,
         )
 
 
 class LoadFastText(LoadVectors):
-    def __init__(self, lang: str, *fallbacks) -> None:
+    def __init__(self, *fallback_fns, lang: str, remove_missing: bool) -> None:
         super(LoadFastText, self).__init__(
-            FastTest(lang=lang),
-            *fallbacks,
+            *fallback_fns,
+            vectors=FastTest(lang=lang),
+            remove_missing=remove_missing,
         )
diff --git a/torchglyph/vocab.py b/torchglyph/vocab.py
index ca7b0f6..428dea8 100644
--- a/torchglyph/vocab.py
+++ b/torchglyph/vocab.py
@@ -2,7 +2,7 @@
 from collections import Counter
 from collections import defaultdict
 from pathlib import Path
-from typing import Union, Optional, Tuple, Callable, List
+from typing import Optional, Tuple, Callable, List
 
 import torch
 from torch import Tensor
@@ -12,6 +12,8 @@
 from torchglyph import data_path
 from torchglyph.io import download_and_unzip
 
+logger = logging.getLogger(__name__)
+
 
 class Vocab(object):
     def __init__(self, counter: Counter,
@@ -77,48 +79,26 @@ def __len__(self) -> int:
     def __contains__(self, token: str) -> bool:
         return token in self.stoi
 
-    def __and__(self, rhs: Union['Counter', 'Vocab']) -> 'Vocab':
-        if isinstance(rhs, Vocab):
-            rhs = rhs.freq
-        return Vocab(
-            counter=Counter({
-                token: freq
-                for token, freq in self.freq.items()
-                if token in rhs
-            }),
-            unk_token=self.unk_token,
-            pad_token=self.pad_token,
-            special_tokens=self.special_tokens,
-            max_size=self.max_size, min_freq=self.min_freq,
-        )
+    def union(self, rhs: 'Vocab', *fallback_fns) -> 'Vocab':
+        counter = Counter()
 
-    def __add__(self, rhs: Union['Counter', 'Vocab']) -> 'Vocab':
-        if isinstance(rhs, Vocab):
-            rhs = rhs.freq
-        return Vocab(
-            counter=Counter({
-                token: self.freq[token] + rhs[token]
-                for token in {*self.freq.keys(), *rhs.keys()}
-            }),
-            unk_token=self.unk_token,
-            pad_token=self.pad_token,
-            special_tokens=self.special_tokens,
-            max_size=self.max_size, min_freq=self.min_freq,
-        )
+        for token, freq in self.freq.items():
+            if token in rhs.stoi:
+                counter[token] = freq
+            else:
+                for fallback_fn in fallback_fns:
+                    new_token = fallback_fn(token)
+                    if new_token in rhs.stoi:
+                        counter[new_token] = freq
+                        break
 
-    def __sub__(self, rhs: Union['Counter', 'Vocab']) -> 'Vocab':
-        if isinstance(rhs, Vocab):
-            rhs = rhs.freq
         return Vocab(
-            counter=Counter({
-                token: freq
-                for token, freq in self.freq.items()
-                if token not in rhs
-            }),
+            counter=counter,
             unk_token=self.unk_token,
             pad_token=self.pad_token,
             special_tokens=self.special_tokens,
-            max_size=self.max_size, min_freq=self.min_freq,
+            max_size=self.max_size,
+            min_freq=self.min_freq,
         )
 
     @property
@@ -133,12 +113,12 @@ def vec_dim(self) -> int:
             return 0
         return self.vectors.size(1)
 
-    def load_vectors(self, vectors: 'Vectors', *fallbacks) -> Tuple[int, int]:
+    def load_vectors(self, *fallback_fns, vectors: 'Vectors') -> Tuple[int, int]:
         self.vectors = torch.empty((len(self), vectors.vec_dim), dtype=torch.float32)
 
         tok, occ = 0, 0
         for token, index in self.stoi.items():
-            if vectors.query_(token, self.vectors[index], *fallbacks):
+            if vectors.query_(token, self.vectors[index], *fallback_fns):
                 tok += 1
                 occ += self.freq[token]
 
@@ -148,63 +128,64 @@ def load_vectors(self, vectors: 'Vectors', *fallbacks) -> Tuple[int, int]:
         return tok, occ
 
     def save(self, path: Path) -> None:
-        logging.info(f'saving {self.__class__.__name__} to {path}')
+        logger.info(f'saving {self.__class__.__name__} to {path}')
         torch.save((self.stoi, self.itos, self.vectors), path)
 
     def load(self, path: Path) -> None:
-        logging.info(f'loading {self.__class__.__name__} from {path}')
+        logger.info(f'loading {self.__class__.__name__} from {path}')
         self.stoi, self.itos, self.vectors = torch.load(path)
 
 
 class Vectors(Vocab):
     def __init__(self, urls_dest: List[Tuple[str, Path]], path: Path,
-                 has_head_info: bool, unk_init_: Callable[[Tensor], Tensor] = init.normal_) -> None:
+                 heading: bool, unicode_error: str = 'replace', dtype: torch.dtype = torch.float32,
+                 unk_init_: Callable[[Tensor], Tensor] = init.normal_) -> None:
         super(Vectors, self).__init__(
             counter=Counter(), unk_token=None, pad_token=None,
             special_tokens=(), max_size=None, min_freq=1,
         )
 
-        self.vectors = []
+        vectors = []
         self.unk_init_ = unk_init_
 
-        pt_path = path.with_suffix('.pt')
-        if not pt_path.exists():
+        dump_path = path.with_suffix('.pt')
+        if not dump_path.exists():
             if not path.exists():
                 for url, dest in urls_dest:
                     download_and_unzip(url, dest)
 
             with path.open('rb') as fp:
-                vec_dim = None
+                vector_dim = None
 
-                iteration = tqdm(fp, desc=f'reading {path}', unit=' tokens')
-                for raw in iteration:  # type:bytes
-                    if has_head_info:
-                        _, vec_dim = map(int, raw.strip().split(b' '))
-                        has_head_info = False
+                for raw in tqdm(fp, desc=f'reading {path}', unit=' lines'):  # type: bytes
+                    if heading:
+                        _, vector_dim = map(int, raw.rstrip().split(b' '))
+                        heading = False
                         continue
 
                     token, *vs = raw.rstrip().split(b' ')
-                    if vec_dim is None:
-                        vec_dim = len(vs)
-                    elif vec_dim != len(vs):
-                        raise ValueError(f'vector dimensions are not consistent, {vec_dim} != {len(vs)}')
+                    if vector_dim is None:
+                        vector_dim = len(vs)
+                    elif vector_dim != len(vs):
+                        logger.error(f'vector dimensions are not consistent, {vector_dim} != {len(vs)} :: {token}')
+                        continue
 
-                    self.add_token_(str(token, encoding='utf-8'))
-                    self.vectors.append(torch.tensor([float(v) for v in vs], dtype=torch.float32))
+                    self.add_token_(str(token, encoding='utf-8', errors=unicode_error))
+                    vectors.append(torch.tensor([float(v) for v in vs], dtype=dtype))
 
-            self.vectors = torch.stack(self.vectors, 0)
-            self.save(pt_path)
+            self.vectors = torch.stack(vectors, 0)
+            self.save(dump_path)
         else:
-            self.load(pt_path)
+            self.load(dump_path)
 
     @torch.no_grad()
-    def query_(self, token: str, vector: Tensor, *fallbacks) -> bool:
+    def query_(self, token: str, vector: Tensor, *fallback_fns) -> bool:
         if token in self:
             vector[:] = self.vectors[self.stoi[token]]
             return True
 
-        for fallback in fallbacks:
-            new_token = fallback(token)
+        for fallback_fn in fallback_fns:
+            new_token = fallback_fn(token)
             if new_token in self:
                 vector[:] = self.vectors[self.stoi[new_token]]
                 return True
@@ -214,23 +195,35 @@ def query_(self, token: str, vector: Tensor, *fallbacks) -> bool:
 
 
 class Glove(Vectors):
     def __init__(self, name: str, dim: int) -> None:
+        path = data_path / f'glove.{name}'
         super(Glove, self).__init__(
             urls_dest=[(
                 f'http://nlp.stanford.edu/data/glove.{name}.zip',
-                data_path / f'glove.{name}' / f'glove.{name}.zip'
+                path / f'glove.{name}.zip'
             )],
-            path=data_path / f'glove.{name}' / f'glove.{name}.{dim}d.txt',
-            has_head_info=False,
+            path=path / f'glove.{name}.{dim}d.txt', heading=False,
         )
 
 
 class FastTest(Vectors):
     def __init__(self, lang: str) -> None:
+        path = data_path / 'fasttext'
         super(FastTest, self).__init__(
             urls_dest=[(
                 f'https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.{lang}.vec',
-                data_path / 'fasttext' / f'wiki.{lang}.vec',
+                path / f'wiki.{lang}.vec',
+            )],
+            path=path / f'wiki.{lang}.vec', heading=True,
+        )
+
+
+class NLPLVectors(Vectors):
+    def __init__(self, index: int, repository: str = '20', name: str = 'model.txt', heading: bool = False) -> None:
+        path = data_path / 'nlpl' / f'{index}'
+        super(NLPLVectors, self).__init__(
+            urls_dest=[(
+                f'http://vectors.nlpl.eu/repository/{repository}/{index}.zip',
+                path / f'{index}.zip',
             )],
-            path=data_path / 'fasttext' / f'wiki.{lang}.vec',
-            has_head_info=True,
+            path=path / name, heading=heading,
         )
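Taken together, the reworked vectors flow reads as follows. A minimal sketch (not part of the patch), reusing the `'<unk>'` token and `device=-1` defaults from the datasets above: with `remove_missing=True`, `LoadVectors` first intersects the vocabulary with the pretrained vectors via `Vocab.union` (using `str.lower` as a fallback match) and only then copies the embeddings in with `load_vectors`.

```python
from torchglyph.pipe import PackedTokSeqPipe
from torchglyph.proc import LoadGlove

# tokens missing from GloVe (even after lowercasing) are dropped from the vocab
word = PackedTokSeqPipe(device=-1, unk_token='<unk>').with_(
    vocab=... + LoadGlove(str.lower, name='6B', dim=100, remove_missing=True),
)
```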