Skip to content

Commit

Permalink
XXX next bits
Browse files Browse the repository at this point in the history
  • Loading branch information
gbenson committed Jun 4, 2024
1 parent 61b1fd0 commit 6c214d8
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 5 deletions.
17 changes: 12 additions & 5 deletions src/dom_tokenizers/pre_tokenizers/splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from ..internal import json
from .oracle import Oracle
from .shared_oracle import SharedOracle
from .sniffer import sniff_bytes
from .sniffer import sniff_bytes, MIN_BASE64_FOR_SNIFF

logger = logging.getLogger(__name__)
debug = logger.debug
Expand Down Expand Up @@ -461,13 +461,20 @@ def _split_base64_utf8(self, text, encoded):
return [self.base64_token, "json"]
except json.JSONDecodeError:
pass
with open("base64.matches", "a") as fp:
print("text", encoded, file=fp)
#if self.oracle.first_is_better(encoded, text):
# return None # encoded is better
#with open("base64.matches", "a") as fp:
# print("text", encoded, file=fp)
if self.oracle.first_is_better(encoded, text):
return None # encoded is better
return [self.base64_token, "text"]

def _split_base64_binary(self, data, encoded):
if len(encoded) < 24
#if len(curr) < self.MIN_BASE64_FOR_SNIFF:
# The smallest JSON (i.e. NOT binary) payload seen was 24 bytes encoded:
#   "eyJleHAiOjE3MTQxNDE1MDN9"
#   => '{"exp":1714141503}'
# though obviously something smaller could still be valid,
# AND THIS ISN'T TEXT
filetype = sniff_bytes(data)
if not filetype:
with open("base64.matches", "a") as fp:
Expand Down
15 changes: 15 additions & 0 deletions tests/test_splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,21 @@ def test_prefixed_hex(text, expect_tokens):
assert list(TextSplitter().split(text)) == expect_tokens


@pytest.mark.parametrize(
"text,expect_tokens",
"encoded,expect_tokens",
[(encoded, None) for encoded in (
# too short
"evid=", "MA6Y=", "uapv=", "1gsQ==", "null==", "data===",
"00000005", "000000de", "000001px", "16H162a2", "1/8/3/20",
"1PAPISID", "20137566", "20150315", "//action", "activate",
"Activate", "activeEl", "+apple/i", "appName=", "+g+k+l+h",
12 000000000000
12 000cf00eb582

8 00000002
def test_split_base64_binary_encoded_(self, data, encoded):

@pytest.mark.parametrize(
"text,expect_tokens",
(("That\u2019s all we know.",
Expand Down

0 comments on commit 6c214d8

Please sign in to comment.