Skip to content

Commit

Permalink
Correctly handle escapes embedded in Base64
Browse files Browse the repository at this point in the history
  • Loading branch information
gbenson committed Jun 20, 2024
1 parent 796e0cd commit fd6df64
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 1 deletion.
9 changes: 8 additions & 1 deletion src/dom_tokenizers/pre_tokenizers/splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,14 @@ def split(self, text: str, flags: Flags = Flags.FULL) -> Iterable[str]:
if words:
if VERBOSE: # pragma: no cover
debug("it's some words")
splits[cursor:cursor+1] = words + [SPLIT]
cursor_limit = cursor + 1

if cursor_limit < len(splits):
next = splits[cursor_limit]
if next and next is not SPLIT and next[0] not in r"%\&":
words.append(SPLIT)

splits[cursor:cursor_limit] = words
continue

if True: # pragma: no cover
Expand Down
13 changes: 13 additions & 0 deletions tests/test_splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,19 @@ def test_sub_js_escape_crasher():
assert splits == [SPLIT, ""]


@pytest.mark.parametrize(
    "text,expect_tokens",
    [
        # Regular string literal: the \u00f6 escapes are decoded by Python,
        # so the input contains real U+00F6 characters between the two
        # Base64-looking runs.
        (
            "kNEu9lE8g2RGVVvZ6clo/g\u00f6\u00f6d+m\u00f6rning/kNEu9lE8g2RGVVvZ6clo",
            ["[BASE64]", "good", "morning", "[BASE64]"],
        ),
        # Raw string literal: the input contains the literal six-character
        # sequences backslash-u-0-0-f-6, i.e. escapes embedded in the text.
        (
            r"kNEu9lE8g2RGVVvZ6clo/g\u00f6\u00f6d+m\u00f6rning/kNEu9lE8g2RGVVvZ6clo",
            ["[BASE64]", "good", "morning", "[BASE64]"],
        ),
    ],
)
def test_nonascii_in_base64(text, expect_tokens):
    """Check splitting of non-ASCII text sandwiched inside obvious Base64.

    Both the decoded-character form and the escaped form of the input
    are expected to produce the same token sequence.
    """
    actual_tokens = list(TextSplitter().split(text))
    assert actual_tokens == expect_tokens


@pytest.mark.parametrize(
"text,expect_tokens",
(("That\u2019s all we know.",
Expand Down

0 comments on commit fd6df64

Please sign in to comment.