From 583ac4660993e9f64ffb52872687c59aca5c2ee7 Mon Sep 17 00:00:00 2001 From: mozharovsky Date: Fri, 17 Sep 2021 04:32:25 +0300 Subject: [PATCH] feat(core): make nmt normalization false by default --- git_t5/core/tokenizer_model.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/git_t5/core/tokenizer_model.py b/git_t5/core/tokenizer_model.py index 13184e7..63224b5 100644 --- a/git_t5/core/tokenizer_model.py +++ b/git_t5/core/tokenizer_model.py @@ -62,6 +62,7 @@ class SentencePieceTokenizerConfig(TokenizerConfig): trim_offsets: bool = False min_frequency: int = 2 lowercase: bool = False + nmt_normalizer: bool = False remove_extra_spaces: bool = True unicode_normalizer: Optional[str] = "nfkc" unk_token: str = "" @@ -127,7 +128,9 @@ def get_tokenizer(self) -> Tokenizer: def get_normalizer(self) -> normalizers.Normalizer: normalizer_list: List[normalizers.Normalizer] = [] - normalizer_list.append(normalizers.Nmt()) + + if self.config.nmt_normalizer: + normalizer_list.append(normalizers.Nmt()) if self.config.unicode_normalizer is not None: normalizer_list.append(unicode_normalizer(self.config.unicode_normalizer))