src/dom_tokenizers/pre_tokenizers
1 file changed: +17 -4 lines
@@ -1,3 +1,5 @@
+import os
+
 from collections import defaultdict
 from dataclasses import make_dataclass
 from xml.dom import Node
@@ -133,9 +135,20 @@ def get(
         if tokens is not None:
             return tokens
         text = self._strings[string_index]
-        tokens = [
-            NormalizedString(token)
-            for token in self._splitter.split(text, split_flags)
-        ]
+        tokens = list(self._splitter.split(text, split_flags))
+        for token in tokens:
+            if "l0gh7uis0hpxahwelsqtpiqs2yzobl" not in token.lower():
+                continue
+            filename = "tests/resources/base64-misses/1655961866939.json"
+            for retry in range(5):
+                if not os.path.exists(filename):
+                    break
+                filename += "~"
+            else:
+                raise AssertionError(filename)
+            with open(filename, "w") as fp:
+                json.dump({"text": text}, fp)
+            raise ValueError(token)
+        tokens = list(map(NormalizedString, tokens))
         cache[string_index] = tokens
         return tokens
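The inserted lines act as a temporary debugging trap: whenever a split token contains the marker string, the full source text is written out as a JSON test resource under tests/resources/base64-misses/, falling back to "~"-suffixed filenames if the target already exists, and the run is then aborted so the miss gets noticed. A minimal standalone sketch of that capture pattern follows; the function name, default paths, and the makedirs call are illustrative assumptions, not part of the diff.

import json
import os


def capture_miss(text, directory="tests/resources/base64-misses",
                 basename="miss.json", max_attempts=5):
    """Write `text` out as a JSON test resource.

    Mirrors the retry scheme in the diff above: if the target file
    already exists, append "~" and try again, giving up after
    `max_attempts` candidate names.  Names and defaults here are
    illustrative, not taken from the repository.
    """
    os.makedirs(directory, exist_ok=True)  # assumption: directory may not exist
    filename = os.path.join(directory, basename)
    for _ in range(max_attempts):
        if not os.path.exists(filename):
            break  # found an unused filename
        filename += "~"
    else:
        # Every candidate name was taken; fail loudly, as the diff does.
        raise AssertionError(filename)
    with open(filename, "w") as fp:
        json.dump({"text": text}, fp)
    return filename

Calling capture_miss(text) at the point where the marker is detected reproduces the same capture behaviour without hard-coding a single output filename.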
0 commit comments