Skip to content

Commit ff35615

Browse files
committed
*** crowbar and write out for light test
1 parent f77bbec commit ff35615

File tree

1 file changed

+17
-4
lines changed

1 file changed

+17
-4
lines changed

src/dom_tokenizers/pre_tokenizers/dom_snapshot.py

+17 -4
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,5 @@
1+
import os
2+
1 3
from collections import defaultdict
2 4
from dataclasses import make_dataclass
3 5
from xml.dom import Node
@@ -133,9 +135,20 @@ def get(
133 135
if tokens is not None:
134 136
return tokens
135 137
text = self._strings[string_index]
136-
tokens = [
137-
NormalizedString(token)
138-
for token in self._splitter.split(text, split_flags)
139-
]
138+
tokens = list(self._splitter.split(text, split_flags))
139+
for token in tokens:
140+
if "l0gh7uis0hpxahwelsqtpiqs2yzobl" not in token.lower():
141+
continue
142+
filename = "tests/resources/base64-misses/1655961866939.json"
143+
for retry in range(5):
144+
if not os.path.exists(filename):
145+
break
146+
filename += "~"
147+
else:
148+
raise AssertionError(filename)
149+
with open(filename, "w") as fp:
150+
json.dump({"text": text}, fp)
151+
raise ValueError(token)
152+
tokens = list(map(NormalizedString, tokens))
140 153
cache[string_index] = tokens
141 154
return tokens

0 commit comments

Comments
 (0)