diff --git a/src/dom_tokenizers/pre_tokenizers/dom_snapshot.py b/src/dom_tokenizers/pre_tokenizers/dom_snapshot.py index 28ceffc..a092bc5 100644 --- a/src/dom_tokenizers/pre_tokenizers/dom_snapshot.py +++ b/src/dom_tokenizers/pre_tokenizers/dom_snapshot.py @@ -132,13 +132,13 @@ def get( tokens = cache.get(string_index) if tokens is not None: return tokens + text = self._strings[string_index] + if lowercase: + text = text.lower() tokens = [ NormalizedString(token) - for token in self._splitter.split(self._strings[string_index]) + for token in self._splitter.split(text) ] - if lowercase: - for token in tokens: - token.lowercase() cache[string_index] = tokens return tokens