Commit bd5bede

Commit message: update tools
1 parent b14d76c commit bd5bede

File tree: 8 files changed (+3613, -9 lines)

tests/data/tokenizer/bpe_token.json
Lines changed: 1448 additions & 0 deletions (large diff not rendered by default)

tests/data/tokenizer/uni_token.json
Lines changed: 1184 additions & 0 deletions (large diff not rendered by default)

tests/data/tokenizer/wp_token.json
Lines changed: 810 additions & 0 deletions (large diff not rendered by default)

New test file (path not shown)
Lines changed: 161 additions & 0 deletions
@@ -0,0 +1,161 @@
import pandas as pd
import pytest
import os
import copy
from dlk.data.subprocessors.fast_tokenizer import FastTokenizer, FastTokenizerConfig
from dlk.utils.get_root import get_root
import json

path = os.path.dirname((os.path.realpath('__file__')))
path_o = (os.path.realpath('__file__'))


@pytest.fixture
def default_single_config(request):
    # Base config for the fast_tokenizer subprocessor in `single` (one-sentence) mode.
    return {
        "_name": "fast_tokenizer",
        "config": {
            "train": {
                "data_set": {
                    "train": ["train", "valid", 'test', 'predict'],
                    "predict": ["predict"],
                    "online": ["online"]
                },
                "config_path": "",
                "truncation": {
                    "max_length": 10,
                    "strategy": "longest_first"
                },
                "normalizer": 'default',
                "pre_tokenizer": "default",
                "post_processor": "default",
                "output_map": {
                    "tokens": "tokens",
                    "ids": "input_ids",
                    "attention_mask": "attention_mask",
                    "type_ids": "type_ids",
                    "special_tokens_mask": "special_tokens_mask",
                    "offsets": "offsets",
                    "word_ids": "word_ids",
                    "overflowing": "overflowing",
                    "sequence_ids": "sequence_ids",
                },
                "input_map": {
                    "sentence": "sentence",
                },
                "deliver": "tokenizer",
                "process_data": {"is_pretokenized": False},
                "data_type": "single",
            },
            "predict": ["train", {"deliver": None}],
            "online": ["train", {"deliver": None}],
        }
    }


class TestFastTokenizer(object):
    def test_default_tokenizer(self, default_single_config):
        # Default normalizer / pre_tokenizer / post_processor: no special tokens are added.
        default_single_config = copy.deepcopy(default_single_config)
        tokenizer_path = os.path.join(get_root(), 'tests/data/tokenizer/vocab_tokenizer.json')
        default_single_config['config']['train']['config_path'] = tokenizer_path
        tokenizer_config = FastTokenizerConfig(stage='train', config=default_single_config)
        tokenizer = FastTokenizer(stage='train', config=tokenizer_config)
        data = {
            "data": {
                "train": pd.DataFrame(data={'sentence': ["I have an apple."]})
            }
        }
        result = tokenizer.process(data)
        result = result['data']['train'].iloc[0]
        assert result['sentence'] == "I have an apple."
        assert result['tokens'] == ['I', 'have', 'an', 'app', '##le', '.']
        assert result['input_ids'] == [8, 9, 10, 6, 7, 11]
        assert result['attention_mask'] == [1, 1, 1, 1, 1, 1]
        assert result['type_ids'] == [0, 0, 0, 0, 0, 0]
        assert result['special_tokens_mask'] == [0, 0, 0, 0, 0, 0]
        assert result['offsets'] == [(0, 1), (2, 6), (7, 9), (10, 13), (13, 15), (15, 16)]
        assert result['word_ids'] == [0, 1, 2, 3, 3, 4]
        assert result['sequence_ids'] == [0, 0, 0, 0, 0, 0]

    def test_post_bert_prosess_tokenizer(self, default_single_config):
        # With the `bert` post-processor, [CLS]/[SEP] are added and flagged in special_tokens_mask.
        default_single_config = copy.deepcopy(default_single_config)
        tokenizer_path = os.path.join(get_root(), 'tests/data/tokenizer/vocab_tokenizer.json')
        default_single_config['config']['train']['config_path'] = tokenizer_path
        # default_single_config['config']['train']['pre_tokenizer'] = ['bert']
        default_single_config['config']['train']['post_processor'] = 'bert'
        tokenizer_config = FastTokenizerConfig(stage='train', config=default_single_config)
        tokenizer = FastTokenizer(stage='train', config=tokenizer_config)
        data = {
            "data": {
                "train": pd.DataFrame(data={'sentence': ["I have an apple."]})
            }
        }
        result = tokenizer.process(data)
        result = result['data']['train'].iloc[0]
        assert result['sentence'] == "I have an apple."
        assert result['tokens'] == ['[CLS]', 'I', 'have', 'an', 'app', '##le', '.', '[SEP]']
        assert result['input_ids'] == [0, 8, 9, 10, 6, 7, 11, 1]
        assert result['attention_mask'] == [1, 1, 1, 1, 1, 1, 1, 1]
        assert result['type_ids'] == [0, 0, 0, 0, 0, 0, 0, 0]
        assert result['special_tokens_mask'] == [1, 0, 0, 0, 0, 0, 0, 1]
        assert result['offsets'] == [(0, 0), (0, 1), (2, 6), (7, 9), (10, 13), (13, 15), (15, 16), (0, 0)]
        assert result['word_ids'] == [None, 0, 1, 2, 3, 3, 4, None]
        assert result['sequence_ids'] == [None, 0, 0, 0, 0, 0, 0, None]

    def test_pre_tokenized_tokenizer(self, default_single_config):
        # Pre-tokenized input: the sentence is already a list of words.
        default_single_config = copy.deepcopy(default_single_config)
        tokenizer_path = os.path.join(get_root(), 'tests/data/tokenizer/vocab_tokenizer.json')
        default_single_config['config']['train']['config_path'] = tokenizer_path
        default_single_config['config']['train']['process_data']['is_pretokenized'] = True
        # default_single_config['config']['train']['post_processor'] = 'bert'
        tokenizer_config = FastTokenizerConfig(stage='train', config=default_single_config)
        tokenizer = FastTokenizer(stage='train', config=tokenizer_config)
        data = {
            "data": {
                "train": pd.DataFrame(data={'sentence': [["我", "来自", '山东', '济宁', '.']]})
            }
        }
        result = tokenizer.process(data)
        result = result['data']['train'].iloc[0]
        assert result['sentence'] == ["我", "来自", '山东', '济宁', '.']
        assert result['tokens'] == ['我', '[UNK]', '山', '##东', '济宁', '.']
        assert result['input_ids'] == [12, 4, 15, 17, 18, 11]
        assert result['attention_mask'] == [1, 1, 1, 1, 1, 1]
        assert result['type_ids'] == [0, 0, 0, 0, 0, 0]
        assert result['special_tokens_mask'] == [0, 0, 0, 0, 0, 0]
        assert result['offsets'] == [(0, 1), (0, 2), (0, 1), (1, 2), (0, 2), (0, 1)]
        assert result['word_ids'] == [0, 1, 2, 2, 3, 4]
        assert result['sequence_ids'] == [0, 0, 0, 0, 0, 0]

    @pytest.mark.cur
    def test_pair_tokenizer(self, default_single_config):
        # Sentence-pair mode; assertions are not filled in yet, the result is only printed.
        default_pair_config = copy.deepcopy(default_single_config)
        tokenizer_path = os.path.join(get_root(), 'tests/data/tokenizer/vocab_tokenizer.json')
        default_pair_config['config']['train']['config_path'] = tokenizer_path
        default_pair_config['config']['train']['data_type'] = 'pair'
        default_pair_config['config']['train']['input_map'] = {
            "sentence_a": "sentence_a",
            "sentence_b": "sentence_b",
        }
        default_pair_config['config']['train']['truncation'] = {
            "max_length": 20,
            "strategy": "longest_first"
        }

        tokenizer_config = FastTokenizerConfig(stage='train', config=default_pair_config)
        tokenizer = FastTokenizer(stage='train', config=tokenizer_config)
        data = {
            "data": {
                "train": pd.DataFrame(data={'sentence_a': ["I have an apple."], 'sentence_b': ['an apple.']})
            }
        }
        result = tokenizer.process(data)
        result = result['data']['train']
        print(result)
        # assert result['sentence'] == "I have an apple."
        # assert result['tokens'] == ['I', 'have', 'an', 'app', '##le', '.']
        # assert result['input_ids'] == [8, 9, 10, 6, 7, 11]
        # assert result['attention_mask'] == [1, 1, 1, 1, 1, 1]
        # assert result['type_ids'] == [0, 0, 0, 0, 0, 0]
        # assert result['special_tokens_mask'] == [0, 0, 0, 0, 0, 0]
        # assert result['offsets'] == [(0, 1), (2, 6), (7, 9), (10, 13), (13, 15), (15, 16)]
        # assert result['word_ids'] == [0, 1, 2, 3, 3, 4]
        # assert result['sequence_ids'] == [0, 0, 0, 0, 0, 0]
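
The columns asserted above (tokens, input_ids, attention_mask, type_ids, special_tokens_mask, offsets, word_ids, sequence_ids) appear to come straight from a HuggingFace `tokenizers` Encoding, renamed through output_map. For reference, a minimal standalone sketch with the same fixture file; that FastTokenizer wraps this exact call chain is an assumption, only the Encoding attribute names are standard API:

from tokenizers import Tokenizer

# Hedged sketch, not the dlk code path: load the fixture tokenizer directly and
# inspect the Encoding fields the tests assert on.
tok = Tokenizer.from_file("tests/data/tokenizer/vocab_tokenizer.json")
enc = tok.encode("I have an apple.")
print(enc.tokens)          # e.g. ['I', 'have', 'an', 'app', '##le', '.']
print(enc.ids)             # vocabulary ids, asserted as input_ids above
print(enc.offsets)         # (start, end) character span of each token
print(enc.word_ids)        # source-word index per token (None for special tokens)
print(enc.sequence_ids)    # 0 for sentence_a, 1 for sentence_b, None for specials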

Changed config file (path not shown)
Lines changed: 3 additions & 3 deletions
@@ -1,5 +1,5 @@
 {
-    "type": "distil_bert", //roberta, bert, vocab, embedding
-    "config_dir": "test", //transformers vocab files
-    "output": "./tokenizer.json"
+    "type": "bert", //roberta, bert, vocab, embedding
+    "config_dir": "data", //transformers vocab files
+    "output": "./data/convert_tokenizer.json"
 }
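
This config is not strict JSON: it carries // comments (and the companion config further down uses single-quoted strings), so it presumably passes through an hjson-style parser before convert.py consumes it. A minimal sketch of that assumption; both the hjson package and the file name are hypothetical, since the real path is not shown in this diff:

import hjson  # assumption: an hjson-compatible parser that tolerates // comments

# Hypothetical file name; the real path of this config is not shown above.
with open("tools/convert_tokenizer/convert_config.hjson") as f:
    cfg = hjson.load(f)
print(cfg["type"], cfg["config_dir"], cfg["output"])

Since --config is the only argument visible in the convert.py hunk below, the tool is presumably invoked as python tools/convert_tokenizer/convert.py --config <this file>.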

tools/convert_tokenizer/convert.py
Lines changed: 1 addition & 1 deletion
@@ -30,7 +30,7 @@
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
+    parser = argparse.ArgumentParser(description="Convert the original vocab.txt and other format tokenizer config for Transformers to tokenizer.json for `tokenizer`")
     parser.add_argument(
         "--config",
         type=str,

Changed config file (path not shown)
Lines changed: 3 additions & 3 deletions
@@ -1,9 +1,9 @@
 {
     "unk": "[UNK]",
-    "vocab": "./test/embedding_vocab.txt",
+    "vocab": "./data/vocab.txt",
     "do_norm": false,
     "do_lowercase": false,
     "do_bert_postprocess": false,
-    "pre_tokenizer": 'whitespacesplit',
-    "output": './vocab_tokenizer.json',
+    "pre_tokenizer": 'whitespace',
+    "output": './data/vocab_tokenizer.json',
 }
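
This config turns a plain vocab file plus an [UNK] token and a whitespace pre-tokenizer into a tokenizer.json. For orientation only, roughly the same artifact could be produced directly with the `tokenizers` API; this is an illustrative sketch (WordPiece is an assumption based on the '##' continuation tokens in the tests above), not what vocab2tokenizer.py itself does, which is to fill a JSON template as the next diff shows:

from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.pre_tokenizers import Whitespace

# Illustrative equivalent of the config above; the paths come from the config
# and are assumed to exist relative to tools/convert_tokenizer/.
tok = Tokenizer(WordPiece.from_file("./data/vocab.txt", unk_token="[UNK]"))
tok.pre_tokenizer = Whitespace()  # matches "pre_tokenizer": 'whitespace'
tok.save("./data/vocab_tokenizer.json")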

tools/convert_tokenizer/vocab2tokenizer.py
Lines changed: 3 additions & 2 deletions
@@ -18,7 +18,8 @@
 
 pre_tokenizers = {
     "bert": {"type": "BertPreTokenizer"},
-    "whitespacesplit": {"type": "WhitespaceSplit"}
+    "whitespacesplit": {"type": "WhitespaceSplit"},
+    "whitespace": {"type": "Whitespace"}
 }
 
 template = {
@@ -137,7 +138,7 @@
 }
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
+    parser = argparse.ArgumentParser(description="Convert the file in --vocab to tokenizer.json for `tokenizer`")
     parser.add_argument(
         "--config",
         type=str,
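
The new "whitespace" entry exposes the `tokenizers` Whitespace pre-tokenizer alongside the existing WhitespaceSplit one. The difference matters for sentences ending in punctuation: WhitespaceSplit only splits on whitespace, while Whitespace splits on the pattern \w+|[^\w\s]+ and therefore separates punctuation from words. A small sketch of the difference:

from tokenizers.pre_tokenizers import Whitespace, WhitespaceSplit

sentence = "I have an apple."

# Splits on whitespace only, so the trailing '.' stays attached to "apple."
print(WhitespaceSplit().pre_tokenize_str(sentence))
# -> [('I', (0, 1)), ('have', (2, 6)), ('an', (7, 9)), ('apple.', (10, 16))]

# Splits word runs and punctuation runs separately.
print(Whitespace().pre_tokenize_str(sentence))
# -> [('I', (0, 1)), ('have', (2, 6)), ('an', (7, 9)), ('apple', (10, 15)), ('.', (15, 16))]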
