Skip to content

Commit

Permalink
blah
Browse files: browse the repository at this point in the history
  • Loading branch information
gbenson committed Jun 4, 2024
1 parent afed6fe commit 7f8ef60
Showing 1 changed file with 17 additions and 10 deletions.
27 changes: 17 additions & 10 deletions src/dom_tokenizers/pre_tokenizers/splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -471,15 +471,23 @@ def _split_base64_utf8(self, text, encoded):

def _split_base64_binary(self, data, encoded):
if len(encoded) < 9: # XXX review
return None
return None # too short
filetype = sniff_bytes(data)
if filetype:
return [self.base64_token, filetype.name.lower()]
if len(encoded) > 100: # XXX review
return [self.base64_token, "data"] # too long
hist = defaultdict(int)
for c in encoded:
hist[c] += 1
xoxo = list(sorted((count, c) for c, count in hist.items()))
print(encoded)
print(len(encoded), len(hist), xoxo[-2:])
#print(xoxo)
#with open("unsniffed.matches", "a") as fp:
# print(len(encoded), encoded, file=fp)
#if self.oracle.is_texty(encoded):
# return None
return [self.base64_token, "data"]
raise NotImplementedError(encoded)

# XXX junk?
Expand Down Expand Up @@ -598,16 +606,15 @@ def _postprocess(self, tokens: Iterable[str]) -> Iterable[str]:
# terminal-quotes.
token = token.rstrip("'")

if self.HEX_RE.match(token):
yield self.long_token
try:
_ = int(token)
except ValueError:
yield "hex"
yield "digits"
if len(token) <= 6:
yield token # 6 hex == CSS color
continue

if self.ALL_HEX_RE.match(token):
yield "[COOKIE]" # hex _or_ decimal
continue

if len(token) <= self.MAXWORDLEN:
if len(token) <= 100: # self.MAXWORDLEN:
yield token
continue

Expand Down

0 comments on commit 7f8ef60

Please sign in to comment.