From c9da27c66e4105a36a0686c352b5dd3c60c69684 Mon Sep 17 00:00:00 2001 From: Gary Benson Date: Fri, 24 May 2024 23:21:37 +0100 Subject: [PATCH] Switch back to uncased base model --- src/dom_tokenizers/train.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/dom_tokenizers/train.py b/src/dom_tokenizers/train.py index 05d8aee..3163f0f 100644 --- a/src/dom_tokenizers/train.py +++ b/src/dom_tokenizers/train.py @@ -11,7 +11,7 @@ from .internal.transformers import AutoTokenizer from .pre_tokenizers import DOMSnapshotPreTokenizer -DEFAULT_BASE_TOKENIZER = "bert-base-cased" +DEFAULT_BASE_TOKENIZER = "bert-base-uncased" DEFAULT_SPLIT = "train" DEFAULT_VOCAB_SIZE = 1024 SEND_BUGS_TO = "https://github.com/gbenson/dom-tokenizers/issues" @@ -149,9 +149,10 @@ def main(): base_tokenizer=args.base_tokenizer, vocab_size=args.vocab_size, corpus_size=args.corpus_size) - print(f'\n{tokenizer.tokenize("training complete")}') + tokenizer.backend_tokenizer.pre_tokenizer = WhitespaceSplit() + print(f'\n{tokenizer.tokenize("Training complete")}') tokenizer.save_pretrained(save_directory) - print(tokenizer.tokenize("tokenizer state saved")) - print(tokenizer.tokenize("see you soon") + ["!!"]) + print(tokenizer.tokenize("Tokenizer state saved")) + print(tokenizer.tokenize("See you soon!!"))