Skip to content

Commit 61b1fd0

Browse files
committed
*** debug code
1 parent a4b9072 commit 61b1fd0

File tree

1 file changed

+8
-4
lines changed

1 file changed

+8
-4
lines changed

src/dom_tokenizers/pre_tokenizers/splitter.py

+8 -4
Original file line number | Diff line number | Diff line change
@@ -461,15 +461,19 @@ def _split_base64_utf8(self, text, encoded):
461 461
return [self.base64_token, "json"]
462 462
except json.JSONDecodeError:
463 463
pass
464-
if self.oracle.first_is_better(encoded, text):
465-
return None # encoded is better
464+
with open("base64.matches", "a") as fp:
465+
print("text", encoded, file=fp)
466+
#if self.oracle.first_is_better(encoded, text):
467+
# return None # encoded is better
466 468
return [self.base64_token, "text"]
467 469

468 470
def _split_base64_binary(self, data, encoded):
469 471
filetype = sniff_bytes(data)
470 472
if not filetype:
471-
if self.oracle.is_texty(encoded):
472-
return None
473+
with open("base64.matches", "a") as fp:
474+
print("data", encoded, file=fp)
475+
#if self.oracle.is_texty(encoded):
476+
# return None
473 477
return [self.base64_token, "data"]
474 478
return [self.base64_token, filetype.name.lower()]
475 479

0 commit comments

Comments
 (0)