Review notes (free text ahead of the first "diff --git" header; git-apply
and patch skip it):

* The patch had been flattened onto a single line. Its line structure is
  restored below. Indentation and blank context lines were reconstructed
  so that they satisfy the hunk line counts (15/23 and 16/15); verify them
  against splitter.py before applying.
* Hunk 1 removes the fallback `return [self.base64_token, "data"]`.
  Input with no sniffed filetype and an encoded length of 9..100 now
  builds a character histogram, calls print() twice, and then reaches
  `raise NotImplementedError(encoded)`. NOTE(review): presumably temporary
  debugging -- confirm before merging.
* Hunk 2 yields every token of six or fewer characters unchanged before
  the hex check, emits the literal "[COOKIE]" for ALL_HEX_RE matches, and
  replaces self.MAXWORDLEN with the literal 100. NOTE(review): confirm the
  literals are intended rather than named attributes.

diff --git a/src/dom_tokenizers/pre_tokenizers/splitter.py b/src/dom_tokenizers/pre_tokenizers/splitter.py
index f9c821b..0de000a 100644
--- a/src/dom_tokenizers/pre_tokenizers/splitter.py
+++ b/src/dom_tokenizers/pre_tokenizers/splitter.py
@@ -471,15 +471,23 @@ def _split_base64_utf8(self, text, encoded):
 
     def _split_base64_binary(self, data, encoded):
         if len(encoded) < 9:  # XXX review
-            return None
+            return None  # too short
         filetype = sniff_bytes(data)
         if filetype:
             return [self.base64_token, filetype.name.lower()]
+        if len(encoded) > 100:  # XXX review
+            return [self.base64_token, "data"]  # too long
+        hist = defaultdict(int)
+        for c in encoded:
+            hist[c] += 1
+        xoxo = list(sorted((count, c) for c, count in hist.items()))
+        print(encoded)
+        print(len(encoded), len(hist), xoxo[-2:])
+        #print(xoxo)
         #with open("unsniffed.matches", "a") as fp:
         #    print(len(encoded), encoded, file=fp)
         #if self.oracle.is_texty(encoded):
         #    return None
-        return [self.base64_token, "data"]
         raise NotImplementedError(encoded)  # XXX junk?
 
 
@@ -598,16 +606,15 @@ def _postprocess(self, tokens: Iterable[str]) -> Iterable[str]:
             # terminal-quotes.
             token = token.rstrip("'")
 
-            if self.HEX_RE.match(token):
-                yield self.long_token
-                try:
-                    _ = int(token)
-                except ValueError:
-                    yield "hex"
-                yield "digits"
+            if len(token) <= 6:
+                yield token  # 6 hex == CSS color
+                continue
+
+            if self.ALL_HEX_RE.match(token):
+                yield "[COOKIE]"  # hex _or_ decimal
                 continue
 
-            if len(token) <= self.MAXWORDLEN:
+            if len(token) <= 100:  # self.MAXWORDLEN:
                 yield token
                 continue
 