From 9da6b9a6bbb5697288bb457a35420ffb6830889d Mon Sep 17 00:00:00 2001 From: Gary Benson Date: Thu, 30 May 2024 22:09:44 +0100 Subject: [PATCH] Lowercase tag names before splitting --- src/dom_tokenizers/pre_tokenizers/dom_snapshot.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/dom_tokenizers/pre_tokenizers/dom_snapshot.py b/src/dom_tokenizers/pre_tokenizers/dom_snapshot.py index 28ceffc..a092bc5 100644 --- a/src/dom_tokenizers/pre_tokenizers/dom_snapshot.py +++ b/src/dom_tokenizers/pre_tokenizers/dom_snapshot.py @@ -132,13 +132,13 @@ def get( tokens = cache.get(string_index) if tokens is not None: return tokens + text = self._strings[string_index] + if lowercase: + text = text.lower() tokens = [ NormalizedString(token) - for token in self._splitter.split(self._strings[string_index]) + for token in self._splitter.split(text) ] - if lowercase: - for token in tokens: - token.lowercase() cache[string_index] = tokens return tokens