From 322a5049707219f63a2227f332b7d4b9b68f9d95 Mon Sep 17 00:00:00 2001
From: Gary Benson
Date: Tue, 4 Jun 2024 09:20:50 +0100
Subject: [PATCH] Recognize prefixed hex

---
 src/dom_tokenizers/pre_tokenizers/splitter.py | 12 ++++++++-
 tests/test_splitter.py                        | 27 ++++++++++++++++---
 2 files changed, 35 insertions(+), 4 deletions(-)

diff --git a/src/dom_tokenizers/pre_tokenizers/splitter.py b/src/dom_tokenizers/pre_tokenizers/splitter.py
index ef2bca2..32ebe82 100644
--- a/src/dom_tokenizers/pre_tokenizers/splitter.py
+++ b/src/dom_tokenizers/pre_tokenizers/splitter.py
@@ -63,11 +63,12 @@ def special_tokens(self) -> Iterable[str]:
     JS_CHAR_ESCAPE_RE = re.compile(f"(?:x|u{_TWOHEX}){_TWOHEX}")
     ENTITY_STARTS = {"&", "&#"}
     ESCAPE_START_RE = re.compile(r".([&%\\])")
+    PREFIXED_HEX_RE = re.compile(r"^(0x)([0-9a-f]+)([+/=]*)$", re.I)
 
     # XXX older bits
     MAXWORDLEN = 32
     WORD_RE = re.compile(r"(?:\w+['’]?)+")
-    HEX_RE = re.compile(r"^(?:0x|[0-9a-f]{2})[0-9a-f]{6,}$")
+    HEX_RE = re.compile(r"^(?:0x|[0-9a-f]{2})[0-9a-f]{6,}$", re.I)
     DIGIT_RE = re.compile(r"\d")
     LONGEST_URLISH = 1024  # XXX?
     URLISH_LOOKBACK = 5
@@ -159,6 +160,15 @@ def split(self, text: str) -> Iterable[str]:
                 splits[cursor:cursor+1] = new_splits
                 continue
 
+            # Are we looking at some prefixed hex?
+            if (match := self.PREFIXED_HEX_RE.match(curr)):
+                if VERBOSE:  # pragma: no cover
+                    debug("prefixed hex")
+                new_splits = [s for s in match.groups() if s]
+                splits[cursor:cursor+1] = new_splits
+                cursor += len(new_splits)
+                continue
+
             # Are we looking at something that might be base64?
             if self.BASE64_RE.match(curr):
                 cursor = self._sub_base64(splits, cursor)
diff --git a/tests/test_splitter.py b/tests/test_splitter.py
index 11a96f4..e980fe5 100644
--- a/tests/test_splitter.py
+++ b/tests/test_splitter.py
@@ -4,7 +4,7 @@
 
 
 @pytest.mark.parametrize(
-    ("text,expect_splits"),
+    "text,expect_splits",
     (("hello world", ["hello", " ", "world"]),
      ("$hello world", ["", "$", "hello", " ", "world"]),
      ("hello-world", ["hello", "-", "world"]),
@@ -55,7 +55,7 @@ def test_first_split_re(text, expect_splits):
 
 
 @pytest.mark.parametrize(
-    ("text,expect_tokens"),
+    "text,expect_tokens",
     (("hello world", ["hello", "world"]),
      ("hello-world", ["hello", "world"]),
      ("hello_world", ["hello", "world"]),
@@ -181,7 +181,28 @@ def test_decoding(text, expect_tokens):
 
 
 @pytest.mark.parametrize(
-    ("text,expect_tokens"),
+    "text,expect_tokens",
+    (("0x0", ["0x", "0"]),
+     ("0x1234", ["0x", "1234"]),
+     ("0x71c765", ["0x", "71c765"]),
+     ("0xdeadbeef",
+      ["0x", "[LONG]", "hex", "digits"]),
+     ("0xdeadbeefL", ["0xdeadbeefL"]),
+     ("0x4AAAAAAAAjq6WYeRDKmebM",
+      ["0x4AAAAAAAAjq6WYeRDKmebM"]),
+     ("0XPmYE28fJingEYE1hThk7F4SZFf1EVe2PxVNsmv",
+      ["[BASE64]"]),
+     ("0XBEA020C3BD417F30DE4D6BD05B0ED310AC586CC0",
+      ["0X", "[LONG]", "hex", "digits"]),
+     ))
+def test_prefixed_hex(text, expect_tokens):
+    """Ensure prefixed hex constants are recognized and split.
+    """
+    assert list(TextSplitter().split(text)) == expect_tokens
+
+
+@pytest.mark.parametrize(
+    "text,expect_tokens",
     (("That\u2019s all we know.", ["That's", "all", "we", "know"]),
      ("Page=Login&Action=Login\';\n\t\t\treturn",
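
Reviewer note: the following is a minimal standalone sketch of what the new
PREFIXED_HEX_RE branch does, not part of the patch. The helper name
split_prefixed_hex is invented for the demo; the real code operates in place
on the splitter's `splits` list rather than returning a new list.

    import re

    # The pattern the patch adds: a "0x" prefix, a run of hex digits, and an
    # optional trailing run of base64 filler characters, case-insensitively.
    PREFIXED_HEX_RE = re.compile(r"^(0x)([0-9a-f]+)([+/=]*)$", re.I)

    def split_prefixed_hex(token):
        # Hypothetical helper, for illustration only.
        match = PREFIXED_HEX_RE.match(token)
        if match is None:
            return None  # not prefixed hex; other checks may still apply
        # Same group-filtering step as the patch: drop empty groups.
        return [s for s in match.groups() if s]

    assert split_prefixed_hex("0x1234") == ["0x", "1234"]
    assert split_prefixed_hex("0x71c765") == ["0x", "71c765"]
    assert split_prefixed_hex("0xdeadbeefL") is None  # trailing "L" is not hex

After the split, the digits run through the splitter's later passes, which is
presumably why the tests expect "0xdeadbeef" to become ["0x", "[LONG]", "hex",
"digits"]: "deadbeef" is long enough to match the existing HEX_RE, whereas the
shorter "71c765" survives intact.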