Skip to content

Commit

Permalink
feat(core): make nmt normalization false by default
Browse files (browse the repository at this point in the history)
  • Loading branch information
mozharovsky committed Sep 17, 2021
1 parent f2857d9 commit 583ac46
Showing 1 changed file with 4 additions and 1 deletion.
5 changes: 4 additions & 1 deletion git_t5/core/tokenizer_model.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -62,6 +62,7 @@ class SentencePieceTokenizerConfig(TokenizerConfig):
trim_offsets: bool = False
min_frequency: int = 2
lowercase: bool = False
nmt_normalizer: bool = False
remove_extra_spaces: bool = True
unicode_normalizer: Optional[str] = "nfkc"
unk_token: str = "<unk>"
Expand Down Expand Up @@ -127,7 +128,9 @@ def get_tokenizer(self) -> Tokenizer:

def get_normalizer(self) -> normalizers.Normalizer:
normalizer_list: List[normalizers.Normalizer] = []
normalizer_list.append(normalizers.Nmt())

if self.config.nmt_normalizer:
normalizer_list.append(normalizers.Nmt())

if self.config.unicode_normalizer is not None:
normalizer_list.append(unicode_normalizer(self.config.unicode_normalizer))
Expand Down

0 comments on commit 583ac46

Please sign in to comment.