script to dump raw tokens

gbenson committed Jun 7, 2024
1 parent: c2640bd · commit: 4ffaa4c
Showing 1 changed file with 12 additions and 1 deletion.
src/dom_tokenizers/pre_tokenizers/splitter.py: 12 additions & 1 deletion
@@ -1,3 +1,4 @@
+import atexit
 import logging
 import re

@@ -424,7 +425,7 @@ def _sub_base64(self, splits, cursor):
             raise FalseBase64Error("part of a URL")
 
         # It's not obviously part of a URL, time to pull out the big guns
-        splits[cursor:cursor + 1] = self._enter_base64(curr)
+        _ = self._enter_base64(curr)  # XXX
         if logger.isEnabledFor(logging.DEBUG):  # pragma: no cover
             if splits[cursor] == self.base64_token:
                 debug("it's base64?")
@@ -540,6 +541,10 @@ def _enter_base64_binary(self, data, encoded):
             raise FalseBase64Error("text")
         return [self.base64_token]
 
+    _seen_tokens = set()
+    _tokens_file = open("unique-tokens-4-100", "w")
+    atexit.register(_tokens_file.close)
+
     def _postprocess(self, tokens: Iterable[str]) -> Iterable[str]:
         for token in tokens:
             if token is SPLIT:
@@ -553,6 +558,12 @@ def _postprocess(self, tokens: Iterable[str]) -> Iterable[str]:
                 # terminal-quotes.
                 token = token.rstrip("'")
 
+            if len(token) >= 4:
+                truncated_token = token[:100]
+                if truncated_token not in self._seen_tokens:
+                    print(truncated_token, file=self._tokens_file)
+                    self._seen_tokens.add(truncated_token)
+
             if self.HEX_RE.match(token):
                 yield self.long_token
                 try:
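For reference, below is a minimal, self-contained sketch of the debugging pattern this commit wires into the splitter: tokens shorter than 4 characters are ignored, longer ones are truncated to 100 characters (hence the "unique-tokens-4-100" filename), each distinct truncated token is written to a dump file exactly once, and the file is closed via an atexit hook when the process ends. The TokenDumper class and the sample tokens are illustrative only, not code from this repository.

import atexit


class TokenDumper:
    """Write each distinct (truncated) token to a file, once."""

    def __init__(self, filename, min_length=4, max_length=100):
        self._seen = set()
        self._file = open(filename, "w")
        atexit.register(self._file.close)  # flushed and closed at exit
        self.min_length = min_length
        self.max_length = max_length

    def observe(self, token: str) -> None:
        if len(token) < self.min_length:
            return  # too short to be interesting
        truncated = token[:self.max_length]
        if truncated not in self._seen:
            print(truncated, file=self._file)
            self._seen.add(truncated)


dumper = TokenDumper("unique-tokens-4-100")
for token in ("div", "onclick", "onclick", "aGVsbG8gd29ybGQ="):
    dumper.observe(token)  # "div" is skipped; "onclick" is dumped once

In the commit itself, the seen-set, the file handle and the atexit hook are class attributes on the splitter rather than instance state, so the deduplicated dump is shared across every splitter instance for the lifetime of the process.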