Skip to content

Commit

Permalink
*** debug code
Browse files (browse the repository at this point in the history)
  • Loading branch information
gbenson committed Jun 4, 2024
1 parent a4b9072 commit 61b1fd0
Showing 1 changed file with 8 additions and 4 deletions.
12 changes: 8 additions & 4 deletions src/dom_tokenizers/pre_tokenizers/splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -461,15 +461,19 @@ def _split_base64_utf8(self, text, encoded):
return [self.base64_token, "json"]
except json.JSONDecodeError:
pass
if self.oracle.first_is_better(encoded, text):
return None # encoded is better
with open("base64.matches", "a") as fp:
print("text", encoded, file=fp)
#if self.oracle.first_is_better(encoded, text):
# return None # encoded is better
return [self.base64_token, "text"]

def _split_base64_binary(self, data, encoded):
filetype = sniff_bytes(data)
if not filetype:
if self.oracle.is_texty(encoded):
return None
with open("base64.matches", "a") as fp:
print("data", encoded, file=fp)
#if self.oracle.is_texty(encoded):
# return None
return [self.base64_token, "data"]
return [self.base64_token, filetype.name.lower()]

Expand Down

0 comments on commit 61b1fd0

Please sign in to comment.