
Commit afed6fe (1 parent: a4b9072)

Commit message: *** what's next?

File tree: 2 files changed, +60 -9 lines changed


src/dom_tokenizers/pre_tokenizers/splitter.py

+23 -9
@@ -62,12 +62,14 @@ def special_tokens(self) -> Iterable[str]:
     FIRST_SPLIT_RE = re.compile(rf"([^\w'’{BASE64_NONWORD}]+)")
     BASE64_NONWORD_RE = re.compile("[+/=]+")
 
-    _TWOHEX = "[0-9a-fA-F]{2}"
+    _HEX = "[0-9a-fA-F]"
+    ALL_HEX_RE = re.compile(f"^{_HEX}+$")
+    _TWOHEX = f"{_HEX}{{2}}"
     TWOHEX_RE = re.compile(_TWOHEX)
     JS_CHAR_ESCAPE_RE = re.compile(f"(?:x|u{_TWOHEX}){_TWOHEX}")
     ENTITY_STARTS = {"&", "&#"}
     ESCAPE_START_RE = re.compile(r".([&%\\])")
-    PREFIXED_HEX_RE = re.compile(r"^(0x)([0-9a-f]+)([+/=]*)$", re.I)
+    PREFIXED_HEX_RE = re.compile(rf"^(0[xX])({_HEX}+)([+/=]*)$", re.I)
 
     # XXX older bits
     MAXWORDLEN = 32
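
As a quick sanity check on the refactored constants (this snippet is not part of the commit; it just exercises the new ALL_HEX_RE and the reworked PREFIXED_HEX_RE in isolation):

import re

_HEX = "[0-9a-fA-F]"
ALL_HEX_RE = re.compile(f"^{_HEX}+$")
PREFIXED_HEX_RE = re.compile(rf"^(0[xX])({_HEX}+)([+/=]*)$", re.I)

assert ALL_HEX_RE.match("000cf00eb582")       # every character is a hex digit
assert not ALL_HEX_RE.match("0x1f")           # the "0x" prefix itself is not hex
assert PREFIXED_HEX_RE.match("0xDEADbeef==").groups() == ("0x", "DEADbeef", "==")

(Note that under re.I the explicit [xX] and A-F ranges are redundant, but harmless.)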
@@ -461,20 +463,32 @@ def _split_base64_utf8(self, text, encoded):
             return [self.base64_token, "json"]
         except json.JSONDecodeError:
             pass
-        if self.oracle.first_is_better(encoded, text):
-            return None  # encoded is better
+        #with open("base64.matches", "a") as fp:
+        #    print("text", encoded, file=fp)
+        #if self.oracle.first_is_better(encoded, text):
+        #    return None  # encoded is better
         return [self.base64_token, "text"]
 
     def _split_base64_binary(self, data, encoded):
+        if len(encoded) < 9:  # XXX review
+            return None
         filetype = sniff_bytes(data)
-        if not filetype:
-            if self.oracle.is_texty(encoded):
-                return None
-            return [self.base64_token, "data"]
-        return [self.base64_token, filetype.name.lower()]
+        if filetype:
+            return [self.base64_token, filetype.name.lower()]
+        #with open("unsniffed.matches", "a") as fp:
+        #    print(len(encoded), encoded, file=fp)
+        #if self.oracle.is_texty(encoded):
+        #    return None
+        return [self.base64_token, "data"]
+        raise NotImplementedError(encoded)
 
     # XXX junk?
 
+    #all uppercase
+    #all lowercase
+    #all hex digits
+    #CamelCase
+
     def _sub_base64(self, splits, cursor):
         curr = splits[cursor]
         try:
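
For readers skimming the diff: after this change _split_base64_binary rejects encodings shorter than 9 characters, labels anything sniff_bytes recognises with the file type's name, and otherwise falls back to a generic "data" token (the oracle check is commented out and the trailing raise is unreachable). A minimal standalone sketch of that flow, with a stubbed sniff_bytes and a placeholder token name, neither of which is the real implementation:

BASE64_TOKEN = "[BASE64]"  # placeholder; the splitter supplies its own base64_token

def split_base64_binary_sketch(data: bytes, encoded: str, sniff_bytes=lambda _data: None):
    # New guard: very short encodings are not worth labelling at all.
    if len(encoded) < 9:
        return None
    filetype = sniff_bytes(data)  # assumed to return an enum-like value with .name, or None
    if filetype:
        return [BASE64_TOKEN, filetype.name.lower()]
    # The oracle-based "is this texty?" check is commented out in this commit,
    # so every unrecognised payload collapses to a generic "data" token.
    return [BASE64_TOKEN, "data"]

With the null sniffer above this returns ["[BASE64]", "data"] for any payload whose encoding is at least 9 characters long, and None otherwise.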

tests/test_splitter.py

+37 -0
@@ -1,3 +1,5 @@
+from base64 import b64decode
+
 import pytest
 
 from dom_tokenizers.pre_tokenizers.splitter import TextSplitter
@@ -201,6 +203,41 @@ def test_prefixed_hex(text, expect_tokens):
     assert list(TextSplitter().split(text)) == expect_tokens
 
 
+def test_split_base64_binary():
+    ts = TextSplitter()
+    for filename in ("6-short-matches", "6-unsniffed-matches"):
+        with open(filename) as fp:
+            for line in fp.readlines():
+                check, encoded = line.split(maxsplit=1)
+                encoded = encoded.rstrip()
+                assert len(encoded) == int(check)
+                data = b64decode(encoded)
+                x = ts._split_base64_binary(data, encoded)
+                print(encoded, x)
+                assert x is not None
+
+#@pytest.mark.parametrize(
+#    "encoded,expect_tokens",
+#    [(encoded, None) for encoded in (
+#        "ajax=",
+#        "00000002",
+#        textures/spgm/contrib/overlib410/overlib
+#        ShouldAllowProductItemOutlineBorderStyle
+#        ShowAutomaticDiscountDataOnProductWidget
+#        UseQueryBySessionStartForContactBookings
+#
+#        # too short
+#        "evid=", "MA6Y=", "uapv=", "1gsQ==", "null==", "data===",
+#        "00000005", "000000de", "000001px", "16H162a2", "1/8/3/20",
+#        "1PAPISID", "20137566", "20150315", "//action", "activate",
+#        "Activate", "activeEl", "+apple/i", "appName=", "+g+k+l+h",
+#12 000000000000
+#12 000cf00eb582
+#
+#8 00000002
+#def test_split_base64_binary_encoded_(self, data, encoded):
+
+
 @pytest.mark.parametrize(
     "text,expect_tokens",
     (("That\u2019s all we know.",
