Merge branch 'release/v0.1.1'
speedcell4 committed Jul 27, 2020
2 parents 40df6c2 + 843837a commit b9830e1
Showing 32 changed files with 1,130 additions and 170 deletions.
26 changes: 26 additions & 0 deletions .github/workflows/mkdocs.yml
@@ -0,0 +1,26 @@
name: Build Document by Mkdocs
on:
  push:
    branches:
      - develop

jobs:
  build:
    name: Deploy docs
    runs-on: ubuntu-latest
    steps:
      - name: Checkout develop
        uses: actions/checkout@v2
      - name: Set up Python 3.7
        uses: actions/setup-python@v2
        with:
          python-version: 3.7
      - name: Install dependencies
        run: python -m pip install -e '.[docs]'
      - name: Build
        run: mkdocs build
      - name: Deploy
        uses: peaceiris/actions-gh-pages@v3
        with:
          github_token: ${{ secrets.GITHUB_TOKEN }}
          publish_dir: ./site
31 changes: 31 additions & 0 deletions .github/workflows/python-publish.yml
@@ -0,0 +1,31 @@
# This workflow will upload a Python Package using Twine when a release is created
# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries

name: Upload Python Package

on:
  release:
    types: [created]

jobs:
  deploy:

    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v2
      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          python-version: '3.7'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install setuptools wheel twine
      - name: Build and publish
        env:
          TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
          TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
        run: |
          python setup.py sdist bdist_wheel
          twine upload dist/*
16 changes: 10 additions & 6 deletions .github/workflows/unit-tests.yml
@@ -1,4 +1,4 @@
-name: unit-tests
+name: Unit Tests

on: [push]

@@ -7,17 +7,21 @@ jobs:

    runs-on: ubuntu-latest

+    strategy:
+      matrix:
+        python-version: ["3.7"]
+
    steps:
-      - uses: actions/checkout@v1
-      - name: Set up Python 3.7
-        uses: actions/setup-python@v1
+      - uses: actions/checkout@v2
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v2
        with:
-          python-version: 3.7
+          python-version: "${{ matrix.python-version }}"
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install torch
-          python -m pip install -e '.[dev]'
+          python -m pip install -e '.[dev, ctx]'
      - name: Test with pytest
        run: |
          python -m pytest tests
1 change: 1 addition & 0 deletions .gitignore
@@ -640,3 +640,4 @@ GitHub.sublime-settings
*.ptx
*.cubin
*.fatbin
+!/site/
3 changes: 2 additions & 1 deletion README.md
@@ -1,6 +1,7 @@
# TorchGlyph

-[![Actions Status](https://github.com/speedcell4/torchglyph/workflows/unit-tests/badge.svg)](https://github.com/speedcell4/torchglyph/actions)
+![Unit Tests](https://github.com/speedcell4/torchglyph/workflows/Unit%20Tests/badge.svg)
+![Upload Python Package](https://github.com/speedcell4/torchglyph/workflows/Upload%20Python%20Package/badge.svg)

## Requirements

59 changes: 59 additions & 0 deletions docs/index.md
@@ -0,0 +1,59 @@
# Welcome to TorchGlyph

Data Processor Combinators for Natural Language Processing

[![Actions Status](https://github.com/speedcell4/torchglyph/workflows/unit-tests/badge.svg)](https://github.com/speedcell4/torchglyph/actions)

## Installation

Simply run this command in your terminal,

```bash
pip install torchglyph
```

## Quickstart

The atomic data processor of TorchGlyph is called a `Proc`. The compose operator `+` is provided to produce a complex `Proc` by composing two simpler `Proc`s.

```python
ToLower() + ReplaceDigits(repl_token='<digits>')
```
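
For intuition, here is a minimal, runnable sketch of how such a compose operator could behave. `Proc`, `Composed`, `ToLower`, and `ReplaceDigits` below are simplified stand-ins invented for this example, not TorchGlyph's actual implementations:

```python
import re


class Proc(object):
    """Simplified stand-in for TorchGlyph's atomic data processor."""

    def __call__(self, data):
        raise NotImplementedError

    def __add__(self, other: 'Proc') -> 'Proc':
        # composing two Procs yields a Proc that applies them in order
        return Composed(self, other)


class Composed(Proc):
    """Apply `first`, then feed its output into `second`."""

    def __init__(self, first: Proc, second: Proc) -> None:
        self.first, self.second = first, second

    def __call__(self, data):
        return self.second(self.first(data))


class ToLower(Proc):
    def __call__(self, data: str) -> str:
        return data.lower()


class ReplaceDigits(Proc):
    def __init__(self, repl_token: str) -> None:
        self.repl_token = repl_token

    def __call__(self, data: str) -> str:
        return re.sub(r'\d+', self.repl_token, data)


proc = ToLower() + ReplaceDigits(repl_token='<digits>')
assert proc('Chapter 42') == 'chapter <digits>'
```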

Composed `Proc`s act like data `Pipe`lines, where raw textual data is processed incrementally. According to their stages, they fall roughly into four groups:

+ `pre` for processing *before* building vocabulary;
+ `vocab` for building and updating *vocabulary*;
+ `post` for processing *after* building vocabulary;
+ `batch` for collating examples to build *batches*.

Defining the `Pipe`s of your dataset is the first step in building it. You can define a `Pipe` from scratch,

```python
class PackedIdxSeqPipe(Pipe):
    def __init__(self, device, dtype=torch.long) -> None:
        super(PackedIdxSeqPipe, self).__init__(
            pre=None,
            vocab=None,
            post=ToTensor(dtype=dtype),
            batch=PackSeq(enforce_sorted=False) + ToDevice(device=device),
        )

or you can simply manipulate an existing `Pipe` by calling its `.with_` method.

```python
class PackedTokSeqPipe(PackedIdxSeqPipe):
    def __init__(self, device, unk_token, special_tokens=(),
                 threshold=THRESHOLD, dtype=torch.long) -> None:
        super(PackedTokSeqPipe, self).__init__(device=device, dtype=dtype)
        self.with_(
            pre=UpdateCounter(),
            vocab=[
                BuildVocab(unk_token=unk_token, pad_token=None,
                           special_tokens=special_tokens),
                StatsVocab(threshold=threshold),
            ],
            post=Numbering() + ...,
        )
```
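
Assuming the definitions above, such a `Pipe` would then be instantiated once per field of a dataset; the call below is an illustrative sketch, not a documented API:

```python
import torch

# hypothetical usage: one Pipe instance per field of a dataset
word_pipe = PackedTokSeqPipe(device=torch.device('cpu'), unk_token='<unk>')
```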
4 changes: 4 additions & 0 deletions mkdocs.yml
@@ -0,0 +1,4 @@
site_name: TorchGlyph
nav:
  - Home: index.md
theme: alabaster
21 changes: 14 additions & 7 deletions setup.py
@@ -1,18 +1,16 @@
from setuptools import setup, find_packages

-with open('README.md', 'r', encoding='utf-8') as fp:
-    long_description = fp.read()
+name = 'torchglyph'

setup(
-    name='torchglyph',
-    version='0.1.0',
-    packages=find_packages(),
-    url='https://github.com/speedcell4/torchglyph',
+    name=name,
+    version='0.1.1',
+    packages=[package for package in find_packages() if package.startswith(name)],
+    url=f'https://speedcell4.github.io/torchglyph',
    license='MIT',
    author='speedcell4',
    author_email='[email protected]',
    description='Data Processor Combinators for Natural Language Processing',
-    long_description=long_description,
    install_requires=[
        'tqdm',
        'numpy',
@@ -23,5 +21,14 @@
            'pytest',
            'hypothesis',
        ],
+        'ctx': [
+            'transformers',
+            'allennlp',
+            'elmoformanylangs',
+        ],
+        'docs': [
+            'mkdocs',
+            'mkdocs-alabaster',
+        ]
    }
)
35 changes: 25 additions & 10 deletions tests/test_datasets/test_sequential_labeling.py
@@ -1,14 +1,29 @@
-from torchglyph.datasets.sequential_labeling import CoNLL2000Chunking, CoNLL2003NER
+from torchglyph.datasets import CoNLL2000Chunking, CoNLL2003NER
+from torchglyph.datasets import SemEval2010T1NERCatalan, SemEval2010T1NERSpanish


-def test_conll2000_chunking() -> None:
-    train, test = CoNLL2000Chunking.new(batch_size=1, word_dim=None)
-    assert len(train) == 8936
-    assert len(test) == 2012
+def test_conll2000_chunking():
+    train, test = CoNLL2000Chunking.new(batch_size=1, word_dim=None, remove_missing=True)
+    assert len(train.dataset) == 8936
+    assert len(test.dataset) == 2012


-def test_conll2003_ner() -> None:
-    train, dev, test = CoNLL2003NER.new(batch_size=1, word_dim=None)
-    assert len(train) == 14987
-    assert len(dev) == 3466
-    assert len(test) == 3684
+def test_conll2003_ner():
+    train, dev, test = CoNLL2003NER.new(batch_size=1, word_dim=None, remove_missing=True)
+    assert len(train.dataset) == 14987
+    assert len(dev.dataset) == 3466
+    assert len(test.dataset) == 3684
+
+
+def test_semeval2010_catalan():
+    train, dev, test = SemEval2010T1NERCatalan.new(batch_size=1, word_dim=None, remove_missing=True)
+    assert len(train.dataset) == 8709
+    assert len(dev.dataset) == 1445
+    assert len(test.dataset) == 1698
+
+
+def test_semeval2010_spanish():
+    train, dev, test = SemEval2010T1NERSpanish.new(batch_size=1, word_dim=None, remove_missing=True)
+    assert len(train.dataset) == 9022
+    assert len(dev.dataset) == 1419
+    assert len(test.dataset) == 1705
2 changes: 1 addition & 1 deletion tests/test_datasets/test_text_classification.py
@@ -2,6 +2,6 @@


def test_agnews():
-    train, test = AgNews.new(batch_size=1, word_dim=None)
+    train, test = AgNews.new(batch_size=1, word_dim=None, remove_missing=True)
    assert len(train) == 120000
    assert len(test) == 7600
44 changes: 44 additions & 0 deletions tests/test_nn/test_connection.py
@@ -0,0 +1,44 @@
import torch
from hypothesis import given, strategies as st
from torch import nn

from torchglyph.nn.connection import ResNorm, DenseNorm, ReZero


@given(
    batch_sizes=st.lists(st.integers(1, 10), min_size=0, max_size=4),
    input_dim=st.integers(1, 20),
)
def test_resnorm_shape_grad(batch_sizes, input_dim):
    layer = ResNorm(input_dim=input_dim, sub_layer=nn.Linear(input_dim, input_dim))
    x = torch.rand((*batch_sizes, input_dim), requires_grad=True)
    y = layer(x)

    assert y.size() == (*batch_sizes, layer.output_dim)
    assert y.requires_grad


@given(
    batch_sizes=st.lists(st.integers(1, 10), min_size=0, max_size=4),
    input_dim=st.integers(1, 20),
)
def test_densenorm_shape_grad(batch_sizes, input_dim):
    layer = DenseNorm(input_dim=input_dim, sub_layer=nn.Linear(input_dim, input_dim))
    x = torch.rand((*batch_sizes, input_dim), requires_grad=True)
    y = layer(x)

    assert y.size() == (*batch_sizes, layer.output_dim)
    assert y.requires_grad


@given(
    batch_sizes=st.lists(st.integers(1, 10), min_size=0, max_size=4),
    input_dim=st.integers(1, 20),
)
def test_rezero_shape_grad(batch_sizes, input_dim):
    layer = ReZero(input_dim=input_dim, sub_layer=nn.Linear(input_dim, input_dim))
    x = torch.rand((*batch_sizes, input_dim), requires_grad=True)
    y = layer(x)

    assert y.size() == (*batch_sizes, layer.output_dim)
    assert y.requires_grad
2 changes: 1 addition & 1 deletion torchglyph/__init__.py
@@ -4,7 +4,7 @@
import torch
from torch.nn.utils.rnn import PackedSequence

-data_path = Path.home() / '.torchglyph'
+data_path = (Path.home() / '.torchglyph').expanduser().absolute()
if not data_path.exists():
    data_path.mkdir(parents=True, exist_ok=True)

13 changes: 8 additions & 5 deletions torchglyph/dataset.py
@@ -2,7 +2,7 @@
import uuid
from collections import namedtuple
from pathlib import Path
-from typing import Iterable, Any, TextIO
+from typing import Iterable, Any, TextIO, Optional
from typing import Union, List, Type, Tuple, NamedTuple, Dict

from torch.utils import data
@@ -14,6 +14,7 @@


class Dataset(data.Dataset):
+    name: Optional[str]
    urls: List[Union[Tuple[str, ...]]]

    def __init__(self, pipes: List[Dict[str, Pipe]], **load_kwargs) -> None:
@@ -62,14 +63,16 @@ def collate_fn(self, batch: List[NamedTuple]) -> NamedTuple:

    @classmethod
    def paths(cls, root: Path = data_path) -> Tuple[Path, ...]:
+        root = root / getattr(cls, 'name', cls.__name__).lower()
+
        ans = []
        for url, name, *filenames in cls.urls:
            if len(filenames) == 0:
                filenames = [name]
-            if any(not (root / cls.__name__.lower() / n).exists() for n in filenames):
-                download_and_unzip(url, root / cls.__name__.lower() / name)
-            for n in filenames:
-                ans.append(root / cls.__name__.lower() / n)
+            if any(not (root / filename).exists() for filename in filenames):
+                download_and_unzip(url, root / name)
+            for filename in filenames:
+                ans.append(root / filename)

        return tuple(ans)

1 change: 1 addition & 0 deletions torchglyph/datasets/__init__.py
@@ -1,2 +1,3 @@
from torchglyph.datasets.sequential_labeling import CoNLL2000Chunking, CoNLL2003NER
+from torchglyph.datasets.sequential_labeling import SemEval2010T1NERCatalan, SemEval2010T1NERSpanish
from torchglyph.datasets.text_classification import AgNews
