Skip to content

Commit

Permalink
Switch back to uncased base model
Browse files Browse the repository at this point in the history
  • Loading branch information
gbenson committed May 24, 2024
1 parent 7d84762 commit c9da27c
Showing 1 changed file with 5 additions and 4 deletions.
9 changes: 5 additions & 4 deletions src/dom_tokenizers/train.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -11,7 +11,7 @@
from .internal.transformers import AutoTokenizer
from .pre_tokenizers import DOMSnapshotPreTokenizer

-DEFAULT_BASE_TOKENIZER = "bert-base-cased"
+DEFAULT_BASE_TOKENIZER = "bert-base-uncased"
DEFAULT_SPLIT = "train"
DEFAULT_VOCAB_SIZE = 1024
SEND_BUGS_TO = "https://github.com/gbenson/dom-tokenizers/issues"
Expand Down Expand Up @@ -149,9 +149,10 @@ def main():
base_tokenizer=args.base_tokenizer,
vocab_size=args.vocab_size,
corpus_size=args.corpus_size)
-print(f'\n{tokenizer.tokenize("training complete")}')
+tokenizer.backend_tokenizer.pre_tokenizer = WhitespaceSplit()
+print(f'\n{tokenizer.tokenize("Training complete")}')

tokenizer.save_pretrained(save_directory)

-print(tokenizer.tokenize("tokenizer state saved"))
-print(tokenizer.tokenize("see you soon") + ["!!"])
+print(tokenizer.tokenize("Tokenizer state saved"))
+print(tokenizer.tokenize("See you soon!!"))

0 comments on commit c9da27c

Please sign in to comment.