From 4ffaa4c602ec2263392c3df53fc6ea798f325efe Mon Sep 17 00:00:00 2001 From: Gary Benson Date: Thu, 6 Jun 2024 01:29:16 +0100 Subject: [PATCH] *** script to dump raw tokens --- src/dom_tokenizers/pre_tokenizers/splitter.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/dom_tokenizers/pre_tokenizers/splitter.py b/src/dom_tokenizers/pre_tokenizers/splitter.py index 1af33ff..62b33aa 100644 --- a/src/dom_tokenizers/pre_tokenizers/splitter.py +++ b/src/dom_tokenizers/pre_tokenizers/splitter.py @@ -1,3 +1,4 @@ +import atexit import logging import re @@ -424,7 +425,7 @@ def _sub_base64(self, splits, cursor): raise FalseBase64Error("part of a URL") # It's not obviously part of a URL, time to pull out the big guns - splits[cursor:cursor + 1] = self._enter_base64(curr) + _ = self._enter_base64(curr) # XXX if logger.isEnabledFor(logging.DEBUG): # pragma: no cover if splits[cursor] == self.base64_token: debug("it's base64?") @@ -540,6 +541,10 @@ def _enter_base64_binary(self, data, encoded): raise FalseBase64Error("text") return [self.base64_token] + _seen_tokens = set() + _tokens_file = open("unique-tokens-4-100", "w") + atexit.register(_tokens_file.close) + def _postprocess(self, tokens: Iterable[str]) -> Iterable[str]: for token in tokens: if token is SPLIT: @@ -553,6 +558,12 @@ def _postprocess(self, tokens: Iterable[str]) -> Iterable[str]: # terminal-quotes. token = token.rstrip("'") + if len(token) >= 4: + truncated_token = token[:100] + if truncated_token not in self._seen_tokens: + print(truncated_token, file=self._tokens_file) + self._seen_tokens.add(truncated_token) + if self.HEX_RE.match(token): yield self.long_token try: