Commit bd5bede

Commit message: update tools
1 parent b14d76c commit bd5bede

File tree: 8 files changed (+3613, -9 lines)

tests/data/tokenizer/bpe_token.json
Lines changed: 1448 additions & 0 deletions (large diff not rendered by default)

tests/data/tokenizer/uni_token.json
Lines changed: 1184 additions & 0 deletions (large diff not rendered by default)

tests/data/tokenizer/wp_token.json
Lines changed: 810 additions & 0 deletions (large diff not rendered by default)

New test file (path not shown)
Lines changed: 161 additions & 0 deletions
@@ -0,0 +1,161 @@
import pandas as pd
import pytest
import os
import copy
from dlk.data.subprocessors.fast_tokenizer import FastTokenizer, FastTokenizerConfig
from dlk.utils.get_root import get_root
import json

path = os.path.dirname((os.path.realpath('__file__')))
path_o = (os.path.realpath('__file__'))


@pytest.fixture
def default_single_config(request):
    # Base config for the fast_tokenizer subprocessor in `single` (one-sentence) mode.
    return {
        "_name": "fast_tokenizer",
        "config": {
            "train": {
                "data_set": {
                    "train": ["train", "valid", 'test', 'predict'],
                    "predict": ["predict"],
                    "online": ["online"]
                },
                "config_path": "",
                "truncation": {
                    "max_length": 10,
                    "strategy": "longest_first"
                },
                "normalizer": 'default',
                "pre_tokenizer": "default",
                "post_processor": "default",
                "output_map": {
                    "tokens": "tokens",
                    "ids": "input_ids",
                    "attention_mask": "attention_mask",
                    "type_ids": "type_ids",
                    "special_tokens_mask": "special_tokens_mask",
                    "offsets": "offsets",
                    "word_ids": "word_ids",
                    "overflowing": "overflowing",
                    "sequence_ids": "sequence_ids",
                },
                "input_map": {
                    "sentence": "sentence",
                },
                "deliver": "tokenizer",
                "process_data": {"is_pretokenized": False},
                "data_type": "single",
            },
            "predict": ["train", {"deliver": None}],
            "online": ["train", {"deliver": None}],
        }
    }


class TestFastTokenizer(object):
    def test_default_tokenizer(self, default_single_config):
        # Default normalizer / pre_tokenizer / post_processor: no special tokens are added.
        default_single_config = copy.deepcopy(default_single_config)
        tokenizer_path = os.path.join(get_root(), 'tests/data/tokenizer/vocab_tokenizer.json')
        default_single_config['config']['train']['config_path'] = tokenizer_path
        tokenizer_config = FastTokenizerConfig(stage='train', config=default_single_config)
        tokenizer = FastTokenizer(stage='train', config=tokenizer_config)
        data = {
            "data": {
                "train": pd.DataFrame(data={'sentence': ["I have an apple."]})
            }
        }
        result = tokenizer.process(data)
        result = result['data']['train'].iloc[0]
        assert result['sentence'] == "I have an apple."
        assert result['tokens'] == ['I', 'have', 'an', 'app', '##le', '.']
        assert result['input_ids'] == [8, 9, 10, 6, 7, 11]
        assert result['attention_mask'] == [1, 1, 1, 1, 1, 1]
        assert result['type_ids'] == [0, 0, 0, 0, 0, 0]
        assert result['special_tokens_mask'] == [0, 0, 0, 0, 0, 0]
        assert result['offsets'] == [(0, 1), (2, 6), (7, 9), (10, 13), (13, 15), (15, 16)]
        assert result['word_ids'] == [0, 1, 2, 3, 3, 4]
        assert result['sequence_ids'] == [0, 0, 0, 0, 0, 0]

    def test_post_bert_prosess_tokenizer(self, default_single_config):
        # With the `bert` post-processor, [CLS]/[SEP] are added and flagged in special_tokens_mask.
        default_single_config = copy.deepcopy(default_single_config)
        tokenizer_path = os.path.join(get_root(), 'tests/data/tokenizer/vocab_tokenizer.json')
        default_single_config['config']['train']['config_path'] = tokenizer_path
        # default_single_config['config']['train']['pre_tokenizer'] = ['bert']
        default_single_config['config']['train']['post_processor'] = 'bert'
        tokenizer_config = FastTokenizerConfig(stage='train', config=default_single_config)
        tokenizer = FastTokenizer(stage='train', config=tokenizer_config)
        data = {
            "data": {
                "train": pd.DataFrame(data={'sentence': ["I have an apple."]})
            }
        }
        result = tokenizer.process(data)
        result = result['data']['train'].iloc[0]
        assert result['sentence'] == "I have an apple."
        assert result['tokens'] == ['[CLS]', 'I', 'have', 'an', 'app', '##le', '.', '[SEP]']
        assert result['input_ids'] == [0, 8, 9, 10, 6, 7, 11, 1]
        assert result['attention_mask'] == [1, 1, 1, 1, 1, 1, 1, 1]
        assert result['type_ids'] == [0, 0, 0, 0, 0, 0, 0, 0]
        assert result['special_tokens_mask'] == [1, 0, 0, 0, 0, 0, 0, 1]
        assert result['offsets'] == [(0, 0), (0, 1), (2, 6), (7, 9), (10, 13), (13, 15), (15, 16), (0, 0)]
        assert result['word_ids'] == [None, 0, 1, 2, 3, 3, 4, None]
        assert result['sequence_ids'] == [None, 0, 0, 0, 0, 0, 0, None]

    def test_pre_tokenized_tokenizer(self, default_single_config):
        # Pre-tokenized input: the sentence is already a list of words.
        default_single_config = copy.deepcopy(default_single_config)
        tokenizer_path = os.path.join(get_root(), 'tests/data/tokenizer/vocab_tokenizer.json')
        default_single_config['config']['train']['config_path'] = tokenizer_path
        default_single_config['config']['train']['process_data']['is_pretokenized'] = True
        # default_single_config['config']['train']['post_processor'] = 'bert'
        tokenizer_config = FastTokenizerConfig(stage='train', config=default_single_config)
        tokenizer = FastTokenizer(stage='train', config=tokenizer_config)
        data = {
            "data": {
                "train": pd.DataFrame(data={'sentence': [["我", "来自", '山东', '济宁', '.']]})
            }
        }
        result = tokenizer.process(data)
        result = result['data']['train'].iloc[0]
        assert result['sentence'] == ["我", "来自", '山东', '济宁', '.']
        assert result['tokens'] == ['我', '[UNK]', '山', '##东', '济宁', '.']
        assert result['input_ids'] == [12, 4, 15, 17, 18, 11]
        assert result['attention_mask'] == [1, 1, 1, 1, 1, 1]
        assert result['type_ids'] == [0, 0, 0, 0, 0, 0]
        assert result['special_tokens_mask'] == [0, 0, 0, 0, 0, 0]
        assert result['offsets'] == [(0, 1), (0, 2), (0, 1), (1, 2), (0, 2), (0, 1)]
        assert result['word_ids'] == [0, 1, 2, 2, 3, 4]
        assert result['sequence_ids'] == [0, 0, 0, 0, 0, 0]

    @pytest.mark.cur
    def test_pair_tokenizer(self, default_single_config):
        # Sentence-pair mode; assertions are not filled in yet, the result is only printed.
        default_pair_config = copy.deepcopy(default_single_config)
        tokenizer_path = os.path.join(get_root(), 'tests/data/tokenizer/vocab_tokenizer.json')
        default_pair_config['config']['train']['config_path'] = tokenizer_path
        default_pair_config['config']['train']['data_type'] = 'pair'
        default_pair_config['config']['train']['input_map'] = {
            "sentence_a": "sentence_a",
            "sentence_b": "sentence_b",
        }
        default_pair_config['config']['train']['truncation'] = {
            "max_length": 20,
            "strategy": "longest_first"
        }

        tokenizer_config = FastTokenizerConfig(stage='train', config=default_pair_config)
        tokenizer = FastTokenizer(stage='train', config=tokenizer_config)
        data = {
            "data": {
                "train": pd.DataFrame(data={'sentence_a': ["I have an apple."], 'sentence_b': ['an apple.']})
            }
        }
        result = tokenizer.process(data)
        result = result['data']['train']
        print(result)
        # assert result['sentence'] == "I have an apple."
        # assert result['tokens'] == ['I', 'have', 'an', 'app', '##le', '.']
        # assert result['input_ids'] == [8, 9, 10, 6, 7, 11]
        # assert result['attention_mask'] == [1, 1, 1, 1, 1, 1]
        # assert result['type_ids'] == [0, 0, 0, 0, 0, 0]
        # assert result['special_tokens_mask'] == [0, 0, 0, 0, 0, 0]
        # assert result['offsets'] == [(0, 1), (2, 6), (7, 9), (10, 13), (13, 15), (15, 16)]
        # assert result['word_ids'] == [0, 1, 2, 3, 3, 4]
        # assert result['sequence_ids'] == [0, 0, 0, 0, 0, 0]
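
The columns asserted above (tokens, input_ids, attention_mask, type_ids, special_tokens_mask, offsets, word_ids, sequence_ids) appear to come straight from a HuggingFace `tokenizers` Encoding, renamed through output_map. For reference, a minimal standalone sketch with the same fixture file; that FastTokenizer wraps this exact call chain is an assumption, only the Encoding attribute names are standard API:

from tokenizers import Tokenizer

# Hedged sketch, not the dlk code path: load the fixture tokenizer directly and
# inspect the Encoding fields the tests assert on.
tok = Tokenizer.from_file("tests/data/tokenizer/vocab_tokenizer.json")
enc = tok.encode("I have an apple.")
print(enc.tokens)          # e.g. ['I', 'have', 'an', 'app', '##le', '.']
print(enc.ids)             # vocabulary ids, asserted as input_ids above
print(enc.offsets)         # (start, end) character span of each token
print(enc.word_ids)        # source-word index per token (None for special tokens)
print(enc.sequence_ids)    # 0 for sentence_a, 1 for sentence_b, None for specials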

Changed config file (path not shown)
Lines changed: 3 additions & 3 deletions
@@ -1,5 +1,5 @@
 {
-    "type": "distil_bert", //roberta, bert, vocab, embedding
-    "config_dir": "test", //transformers vocab files
-    "output": "./tokenizer.json"
+    "type": "bert", //roberta, bert, vocab, embedding
+    "config_dir": "data", //transformers vocab files
+    "output": "./data/convert_tokenizer.json"
 }
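
This config is not strict JSON: it carries // comments (and the companion config further down uses single-quoted strings), so it presumably passes through an hjson-style parser before convert.py consumes it. A minimal sketch of that assumption; both the hjson package and the file name are hypothetical, since the real path is not shown in this diff:

import hjson  # assumption: an hjson-compatible parser that tolerates // comments

# Hypothetical file name; the real path of this config is not shown above.
with open("tools/convert_tokenizer/convert_config.hjson") as f:
    cfg = hjson.load(f)
print(cfg["type"], cfg["config_dir"], cfg["output"])

Since --config is the only argument visible in the convert.py hunk below, the tool is presumably invoked as python tools/convert_tokenizer/convert.py --config <this file>.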

tools/convert_tokenizer/convert.py
Lines changed: 1 addition & 1 deletion
@@ -30,7 +30,7 @@
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
+    parser = argparse.ArgumentParser(description="Convert the original vocab.txt and other format tokenizer config for Transformers to tokenizer.json for `tokenizer`")
     parser.add_argument(
         "--config",
         type=str,

Changed config file (path not shown)
Lines changed: 3 additions & 3 deletions
@@ -1,9 +1,9 @@
 {
     "unk": "[UNK]",
-    "vocab": "./test/embedding_vocab.txt",
+    "vocab": "./data/vocab.txt",
     "do_norm": false,
     "do_lowercase": false,
     "do_bert_postprocess": false,
-    "pre_tokenizer": 'whitespacesplit',
-    "output": './vocab_tokenizer.json',
+    "pre_tokenizer": 'whitespace',
+    "output": './data/vocab_tokenizer.json',
 }
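
This config turns a plain vocab file plus an [UNK] token and a whitespace pre-tokenizer into a tokenizer.json. For orientation only, roughly the same artifact could be produced directly with the `tokenizers` API; this is an illustrative sketch (WordPiece is an assumption based on the '##' continuation tokens in the tests above), not what vocab2tokenizer.py itself does, which is to fill a JSON template as the next diff shows:

from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.pre_tokenizers import Whitespace

# Illustrative equivalent of the config above; the paths come from the config
# and are assumed to exist relative to tools/convert_tokenizer/.
tok = Tokenizer(WordPiece.from_file("./data/vocab.txt", unk_token="[UNK]"))
tok.pre_tokenizer = Whitespace()  # matches "pre_tokenizer": 'whitespace'
tok.save("./data/vocab_tokenizer.json")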

tools/convert_tokenizer/vocab2tokenizer.py
Lines changed: 3 additions & 2 deletions
@@ -18,7 +18,8 @@
 
 pre_tokenizers = {
     "bert": {"type": "BertPreTokenizer"},
-    "whitespacesplit": {"type": "WhitespaceSplit"}
+    "whitespacesplit": {"type": "WhitespaceSplit"},
+    "whitespace": {"type": "Whitespace"}
 }
 
 template = {
@@ -137,7 +138,7 @@
 }
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
+    parser = argparse.ArgumentParser(description="Convert the file in --vocab to tokenizer.json for `tokenizer`")
     parser.add_argument(
         "--config",
         type=str,
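
The new "whitespace" entry exposes the `tokenizers` Whitespace pre-tokenizer alongside the existing WhitespaceSplit one. The difference matters for sentences ending in punctuation: WhitespaceSplit only splits on whitespace, while Whitespace splits on the pattern \w+|[^\w\s]+ and therefore separates punctuation from words. A small sketch of the difference:

from tokenizers.pre_tokenizers import Whitespace, WhitespaceSplit

sentence = "I have an apple."

# Splits on whitespace only, so the trailing '.' stays attached to "apple."
print(WhitespaceSplit().pre_tokenize_str(sentence))
# -> [('I', (0, 1)), ('have', (2, 6)), ('an', (7, 9)), ('apple.', (10, 16))]

# Splits word runs and punctuation runs separately.
print(Whitespace().pre_tokenize_str(sentence))
# -> [('I', (0, 1)), ('have', (2, 6)), ('an', (7, 9)), ('apple', (10, 15)), ('.', (15, 16))]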
