Substitute long tokens in debug output
gbenson committed Jun 20, 2024
1 parent 5fcc9dc commit 796e0cd
Showing 1 changed file with 10 additions and 8 deletions.
src/dom_tokenizers/pre_tokenizers/splitter.py
@@ -164,14 +164,16 @@ def split(self, text: str, flags: Flags = Flags.FULL) -> Iterable[str]:
 
             curr = splits[cursor]
             if VERBOSE:  # pragma: no cover
-                if len(splits) < 32:
-                    debug(" ".join(
-                        f"""\x1B[{'48;5;15;1;31'
-                        if index == cursor
-                        else '48;5;248;30'}m{split}\x1B[0m"""
-                        for index, split in enumerate(splits)))
-                else:
-                    debug("curr: %s", repr(curr))
+                start = max(cursor - 16, 0)
+                limit = min(cursor + 17, len(splits))
+                debug(" ".join(
+                    f"""\x1B[{'48;5;15;1;31'
+                    if index + start == cursor
+                    else '48;5;248;30'}m{
+                        split
+                        if split is SPLIT or len(split) < 256
+                        else '[long]'}\x1B[0m"""
+                    for index, split in enumerate(splits[start:limit])))
 
             # Pop empty strings and whitespace
             cursor, is_changed = _pop_unless_nonempty(curr, cursor, splits)
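
To make the change concrete, here is a minimal, self-contained sketch (not code from the commit) of the technique the new debug line uses: render a window of at most 33 splits around the cursor, highlight the current split with ANSI background colours, and substitute "[long]" for any non-sentinel token of 256 or more characters. debug_window is a hypothetical helper, and SPLIT here is a plain-string stand-in for the module's own sentinel object.

    # Hedged sketch: debug_window is a hypothetical helper; SPLIT here is a
    # plain-string stand-in for the splitter module's sentinel object.
    SPLIT = "<split>"

    def debug_window(splits, cursor, width=16, max_len=256):
        # Clamp a window of at most 2 * width + 1 entries around the cursor.
        start = max(cursor - width, 0)
        limit = min(cursor + width + 1, len(splits))
        parts = []
        for index, split in enumerate(splits[start:limit]):
            # Stand in "[long]" for oversized tokens (e.g. base64 blobs)
            # so one huge string cannot swamp the debug line.
            text = split if split is SPLIT or len(split) < max_len else "[long]"
            if index + start == cursor:
                parts.append(f"\x1B[48;5;15;1;31m{text}\x1B[0m")  # bold red on white
            else:
                parts.append(f"\x1B[48;5;248;30m{text}\x1B[0m")  # black on grey
        return " ".join(parts)

    # A 300-character token at the cursor renders as a highlighted "[long]".
    print(debug_window(["foo", SPLIT, "x" * 300, "bar"], cursor=2))

Clamping the window to the cursor's neighbourhood also removes the old two-branch behaviour, where lists of 32 or more splits fell back to printing only the current split.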
