Skip to content

Commit 5d83ddb

Browse files
committed
Recognize prefixed hex
1 parent 2c24a89 commit 5d83ddb

File tree

2 files changed

+36
-4
lines changed

2 files changed

+36
-4
lines changed

src/dom_tokenizers/pre_tokenizers/splitter.py

+12-1
Original file line numberDiff line numberDiff line change
@@ -63,11 +63,12 @@ def special_tokens(self) -> Iterable[str]:
6363
JS_CHAR_ESCAPE_RE = re.compile(f"(?:x|u{_TWOHEX}){_TWOHEX}")
6464
ENTITY_STARTS = {"&", "&#"}
6565
ESCAPE_START_RE = re.compile(r".([&%\\])")
66+
PREFIXED_HEX_RE = re.compile(r"^(0x)([0-9a-f]+)([+/=]*)$", re.I)
6667

6768
# XXX older bits
6869
MAXWORDLEN = 32
6970
WORD_RE = re.compile(r"(?:\w+['’]?)+")
70-
HEX_RE = re.compile(r"^(?:0x|[0-9a-f]{2})[0-9a-f]{6,}$")
71+
HEX_RE = re.compile(r"^(?:0x|[0-9a-f]{2})[0-9a-f]{6,}$", re.I)
7172
DIGIT_RE = re.compile(r"\d")
7273
LONGEST_URLISH = 1024 # XXX?
7374
URLISH_LOOKBACK = 5
@@ -159,6 +160,16 @@ def split(self, text: str) -> Iterable[str]:
159160
splits[cursor:cursor+1] = new_splits
160161
continue
161162

163+
# Are we looking at some prefixed hex?
164+
if (match := self.PREFIXED_HEX_RE.match(curr)):
165+
if VERBOSE: # pragma: no cover
166+
debug("prefixed hex")
167+
new_splits = [s for s in match.groups() if s]
168+
print("new_splits:", new_splits)
169+
splits[cursor:cursor+1] = new_splits
170+
cursor += len(new_splits)
171+
continue
172+
162173
# Are we looking at something that might be base64?
163174
if self.BASE64_RE.match(curr):
164175
cursor = self._sub_base64(splits, cursor)

tests/test_splitter.py

+24-3
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55

66
@pytest.mark.parametrize(
7-
("text,expect_splits"),
7+
"text,expect_splits",
88
(("hello world", ["hello", " ", "world"]),
99
("$hello world", ["", "$", "hello", " ", "world"]),
1010
("hello-world", ["hello", "-", "world"]),
@@ -55,7 +55,7 @@ def test_first_split_re(text, expect_splits):
5555

5656

5757
@pytest.mark.parametrize(
58-
("text,expect_tokens"),
58+
"text,expect_tokens",
5959
(("hello world", ["hello", "world"]),
6060
("hello-world", ["hello", "world"]),
6161
("hello_world", ["hello", "world"]),
@@ -181,7 +181,28 @@ def test_decoding(text, expect_tokens):
181181

182182

183183
@pytest.mark.parametrize(
184-
("text,expect_tokens"),
184+
"text,expect_tokens",
185+
(("0x0", ["0x", "0"]),
186+
("0x1234", ["0x", "1234"]),
187+
("0x71c765", ["0x", "71c765"]),
188+
("0xdeadbeef",
189+
["0x", "[LONG]", "hex", "digits"]),
190+
("0xdeadbeefL", ["0xdeadbeefL"]),
191+
("0x4AAAAAAAAjq6WYeRDKmebM",
192+
["0x4AAAAAAAAjq6WYeRDKmebM"]),
193+
("0XPmYE28fJingEYE1hThk7F4SZFf1EVe2PxVNsmv",
194+
["[BASE64]"]),
195+
("0XBEA020C3BD417F30DE4D6BD05B0ED310AC586CC0",
196+
["0X", "[LONG]", "hex", "digits"]),
197+
))
198+
def test_prefixed_hex(text, expect_tokens):
199+
"""Ensure prefixed hex constants are recognized and split.
200+
"""
201+
assert list(TextSplitter().split(text)) == expect_tokens
202+
203+
204+
@pytest.mark.parametrize(
205+
"text,expect_tokens",
185206
(("That\u2019s all we know.",
186207
["That's", "all", "we", "know"]),
187208
("Page=Login&Action=Login\';\n\t\t\treturn",

0 commit comments

Comments (0)