diff --git a/pyproject.toml b/pyproject.toml index 366628e..3ecd591 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ dependencies = [ "python-magic", # XXX review "tokenizers", "unidecode", # XXX review - "vec64>0.0.2", + "vec64>0.0.5", ] [project.urls] diff --git a/src/dom_tokenizers/internal/base64.py b/src/dom_tokenizers/internal/base64.py index 14301f1..32929a9 100644 --- a/src/dom_tokenizers/internal/base64.py +++ b/src/dom_tokenizers/internal/base64.py @@ -1,7 +1,5 @@ from base64 import b64decode as _b64decode, _bytes_from_decode_data, binascii -from vec64 import base64_symbol_indexes as _base64_symbol_indexes - def b64decode(s, *args, **kwargs) -> bytes: fix_padding = kwargs.pop("fix_padding", False) @@ -16,13 +14,3 @@ def b64decode(s, *args, **kwargs) -> bytes: n = len(t) & 3 t += b"AA=="[n:] return _b64decode(t, *args, **kwargs) - - -def base64_symbol_indexes(text: str) -> bytes: - try: - return _base64_symbol_indexes(text) - except UnicodeEncodeError: - return _base64_symbol_indexes(text.encode(errors="replace")) - - -base64_symbol_indexes.__doc__ = _base64_symbol_indexes.__doc__ diff --git a/src/dom_tokenizers/pre_tokenizers/splitter.py b/src/dom_tokenizers/pre_tokenizers/splitter.py index d01db9a..1c61742 100644 --- a/src/dom_tokenizers/pre_tokenizers/splitter.py +++ b/src/dom_tokenizers/pre_tokenizers/splitter.py @@ -10,9 +10,10 @@ import magic from unidecode import unidecode +from vec64 import base64_symbol_indexes from ..internal import json -from ..internal.base64 import b64decode, base64_symbol_indexes +from ..internal.base64 import b64decode from .base64 import base64_probability logger = logging.getLogger(__name__)