From 5a6757883d6eb4976935f7fb6dccedff644c01af Mon Sep 17 00:00:00 2001 From: Gary Benson Date: Wed, 3 Jul 2024 21:57:23 +0100 Subject: [PATCH] Migrate base64_symbol_indexes wrapper to vec64 --- pyproject.toml | 2 +- src/dom_tokenizers/internal/base64.py | 12 ------------ src/dom_tokenizers/pre_tokenizers/splitter.py | 3 ++- 3 files changed, 3 insertions(+), 14 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 366628e..3ecd591 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ dependencies = [ "python-magic", # XXX review "tokenizers", "unidecode", # XXX review - "vec64>0.0.2", + "vec64>0.0.5", ] [project.urls] diff --git a/src/dom_tokenizers/internal/base64.py b/src/dom_tokenizers/internal/base64.py index 14301f1..32929a9 100644 --- a/src/dom_tokenizers/internal/base64.py +++ b/src/dom_tokenizers/internal/base64.py @@ -1,7 +1,5 @@ from base64 import b64decode as _b64decode, _bytes_from_decode_data, binascii -from vec64 import base64_symbol_indexes as _base64_symbol_indexes - def b64decode(s, *args, **kwargs) -> bytes: fix_padding = kwargs.pop("fix_padding", False) @@ -16,13 +14,3 @@ def b64decode(s, *args, **kwargs) -> bytes: n = len(t) & 3 t += b"AA=="[n:] return _b64decode(t, *args, **kwargs) - - -def base64_symbol_indexes(text: str) -> bytes: - try: - return _base64_symbol_indexes(text) - except UnicodeEncodeError: - return _base64_symbol_indexes(text.encode(errors="replace")) - - -base64_symbol_indexes.__doc__ = _base64_symbol_indexes.__doc__ diff --git a/src/dom_tokenizers/pre_tokenizers/splitter.py b/src/dom_tokenizers/pre_tokenizers/splitter.py index d01db9a..1c61742 100644 --- a/src/dom_tokenizers/pre_tokenizers/splitter.py +++ b/src/dom_tokenizers/pre_tokenizers/splitter.py @@ -10,9 +10,10 @@ import magic from unidecode import unidecode +from vec64 import base64_symbol_indexes from ..internal import json -from ..internal.base64 import b64decode, base64_symbol_indexes +from ..internal.base64 import b64decode from .base64 import base64_probability logger = logging.getLogger(__name__)