
Commit 796e0cd

Substitute long tokens in debug output
1 parent 5fcc9dc commit 796e0cd

File tree

1 file changed: +10 -8 lines changed


src/dom_tokenizers/pre_tokenizers/splitter.py

+10 -8
@@ -164,14 +164,16 @@ def split(self, text: str, flags: Flags = Flags.FULL) -> Iterable[str]:
 
             curr = splits[cursor]
             if VERBOSE:  # pragma: no cover
-                if len(splits) < 32:
-                    debug(" ".join(
-                        f"""\x1B[{'48;5;15;1;31'
-                                  if index == cursor
-                                  else '48;5;248;30'}m{split}\x1B[0m"""
-                        for index, split in enumerate(splits)))
-                else:
-                    debug("curr: %s", repr(curr))
+                start = max(cursor - 16, 0)
+                limit = min(cursor + 17, len(splits))
+                debug(" ".join(
+                    f"""\x1B[{'48;5;15;1;31'
+                              if index + start == cursor
+                              else '48;5;248;30'}m{
+                        split
+                        if split is SPLIT or len(split) < 256
+                        else '[long]'}\x1B[0m"""
+                    for index, split in enumerate(splits[start:limit])))
 
             # Pop empty strings and whitespace
             cursor, is_changed = _pop_unless_nonempty(curr, cursor, splits)
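For reference, the revised branch prints a window of up to 33 splits (16 either side of the cursor), highlights the split under the cursor with ANSI colours, and swaps any token of 256 characters or more for a "[long]" placeholder. Below is a minimal standalone sketch of the same idea; the SPLIT sentinel stub, the format_window helper name and the print() call are illustrative assumptions, not the module's actual API.

    # Minimal sketch of the debug-output change, not the module's actual code.
    # SPLIT stands in for the splitter's boundary sentinel; the 16-token radius
    # and 256-character cutoff mirror the values used in the diff above.
    SPLIT = object()

    def format_window(splits, cursor, radius=16, max_len=256):
        """Render the splits around `cursor`, highlighting the current one
        and substituting a placeholder for overly long tokens."""
        start = max(cursor - radius, 0)
        limit = min(cursor + radius + 1, len(splits))
        parts = []
        for index, split in enumerate(splits[start:limit]):
            # Bright highlight for the split under the cursor, grey otherwise.
            color = "48;5;15;1;31" if index + start == cursor else "48;5;248;30"
            # Check `is SPLIT` first: the sentinel is not a string, so len()
            # would raise on it; any token at or over the cutoff becomes "[long]".
            shown = split if split is SPLIT or len(split) < max_len else "[long]"
            parts.append(f"\x1B[{color}m{shown}\x1B[0m")
        return " ".join(parts)

    # Example: the third token is far over the cutoff, so it renders as [long].
    splits = ["hello", "wonderful", "x" * 1000, "world"]
    print(format_window(splits, cursor=1))

Both limits keep a single debug line bounded even when the splitter is working through very long tokens or a very long split list.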
