From dbca5fca51a911f0b10dd341c50374240eeb4ae8 Mon Sep 17 00:00:00 2001
From: Gary Benson
Date: Thu, 30 May 2024 21:02:57 +0100
Subject: [PATCH] Hide per-split normalizer lookups in a closure

---
 .../pre_tokenizers/pre_tokenizer.py           | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/src/dom_tokenizers/pre_tokenizers/pre_tokenizer.py b/src/dom_tokenizers/pre_tokenizers/pre_tokenizer.py
index 99bdac3..1401b9d 100644
--- a/src/dom_tokenizers/pre_tokenizers/pre_tokenizer.py
+++ b/src/dom_tokenizers/pre_tokenizers/pre_tokenizer.py
@@ -2,7 +2,6 @@
 import weakref
 from abc import ABC, abstractmethod
-from functools import cached_property

 from tokenizers import NormalizedString, PreTokenizedString
 from tokenizers.pre_tokenizers import PreTokenizer as _PreTokenizer

@@ -54,14 +53,21 @@ def _backend_tokenizer(self):
     def _normalizer(self):
         return self._backend_tokenizer.normalizer

-    @cached_property
+    @property
     def special_tokens(self) -> set[str]:
         return set(self._splitter.special_tokens)

-    def _normalize_nonspecial(self, split: NormalizedString):
-        if split.original in self.special_tokens:
-            return
-        self._normalizer.normalize(split)
+    @property
+    def _normalize_nonspecial(self):
+        special_tokens = self.special_tokens
+        normalize = self._normalizer.normalize
+
+        def func(split: NormalizedString):
+            if split.original in special_tokens:
+                return
+            normalize(split)
+
+        return func

     # Entry point
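
The idea behind the patch: CPython resolves attribute lookups dynamically on
every access, so a method that touches self.special_tokens and
self._normalizer.normalize pays that cost on every split. Building the
callable once in a closure snapshots both as local variables, which the
returned function then reads cheaply per call. Below is a minimal,
self-contained sketch of the same pattern; the class and names (Example,
normalize_nonspecial_method) are invented for illustration and are not part
of dom_tokenizers, and strings stand in for NormalizedString splits.

    # Hypothetical sketch of the closure pattern; not dom_tokenizers code.
    from typing import Callable


    class Example:
        """Skips special tokens, delegates the rest to a normalize callable."""

        def __init__(self, special_tokens: set[str],
                     normalize: Callable[[str], None]):
            self.special_tokens = special_tokens
            self._normalize = normalize

        # Before the patch: a bound method; every call re-resolves
        # self.special_tokens and self._normalize on the instance.
        def normalize_nonspecial_method(self, split: str) -> None:
            if split in self.special_tokens:
                return
            self._normalize(split)

        # After the patch: a property that does the lookups once and
        # captures the results; the returned function touches only locals.
        @property
        def normalize_nonspecial(self) -> Callable[[str], None]:
            special_tokens = self.special_tokens
            normalize = self._normalize

            def func(split: str) -> None:
                if split in special_tokens:
                    return
                normalize(split)

            return func


    if __name__ == "__main__":
        normalized = []
        example = Example({"[CLS]", "[SEP]"}, normalized.append)

        func = example.normalize_nonspecial  # one lookup, one closure build
        for split in ["[CLS]", "hello", "world", "[SEP]"]:
            func(split)  # per-split calls hit only local variables

        print(normalized)  # ['hello', 'world']

Note that the closure is rebuilt on each property access, so the win only
materializes when the caller resolves _normalize_nonspecial once and the
resulting function is then invoked once per split, presumably via the
PreTokenizedString machinery the module imports from tokenizers. That also
explains why special_tokens could drop cached_property in the same patch:
the closure now snapshots the set at build time, so caching it on the
instance no longer buys anything on the hot path.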