Skip to content

Commit

Permalink
Hide per-split normalizer lookups in a closure
Browse files Browse the repository at this point in the history
  • Loading branch information
gbenson committed May 30, 2024
1 parent 53e90cd commit dbca5fc
Showing 1 changed file with 12 additions and 6 deletions.
18 changes: 12 additions & 6 deletions src/dom_tokenizers/pre_tokenizers/pre_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import weakref

from abc import ABC, abstractmethod
from functools import cached_property

from tokenizers import NormalizedString, PreTokenizedString
from tokenizers.pre_tokenizers import PreTokenizer as _PreTokenizer
Expand Down Expand Up @@ -54,14 +53,21 @@ def _backend_tokenizer(self):
def _normalizer(self):
    """The normalizer of the backend tokenizer.

    NOTE(review): the decorator is outside this hunk — presumably
    ``@property``; confirm against the full file.
    """
    return self._backend_tokenizer.normalizer

@cached_property
@property
def special_tokens(self) -> set[str]:
    """The splitter's special tokens, as a set for O(1) membership tests."""
    return {token for token in self._splitter.special_tokens}

def _normalize_nonspecial(self, split: NormalizedString):
    """Normalize ``split`` in place, unless it is a special token.

    Special tokens (e.g. markers emitted by the splitter) must pass
    through the pre-tokenizer untouched, so they are skipped here.
    """
    if split.original in self.special_tokens:
        return
    self._normalizer.normalize(split)
@property
def _normalize_nonspecial(self):
    """Return a callback that normalizes a split unless it is special.

    The special-token set and the normalizer's bound ``normalize``
    method are resolved once, up front, so the returned per-split
    callback performs no repeated attribute/property lookups.
    """
    specials = self.special_tokens
    do_normalize = self._normalizer.normalize

    def normalize_nonspecial(split: NormalizedString):
        # Special tokens must pass through the pre-tokenizer untouched.
        if split.original not in specials:
            do_normalize(split)

    return normalize_nonspecial

# Entry point

Expand Down

0 comments on commit dbca5fc

Please sign in to comment.