diff --git a/README.md b/README.md index 52c4aeb..d33e784 100644 --- a/README.md +++ b/README.md @@ -9,8 +9,7 @@ # DOM tokenizers -DOM-aware tokenizers for 🤗 [Hugging Face](https://huggingface.co/) -language models. +DOM-aware tokenizers for Hugging Face language models. ## Installation @@ -31,7 +30,9 @@ pip install --upgrade pip pip install -e .[dev,train] ``` -## Train a tokenizer +## Load a pretrained tokenizer from the Hub + +## Train your own ### On the command line diff --git a/pyproject.toml b/pyproject.toml index 6b73303..509da9a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,10 +1,10 @@ [project] name = "dom-tokenizers" -version = "0.0.2" +version = "0.0.3" authors = [{ name = "Gary Benson" }] -description = "DOM-aware tokenizers for Hugging Face language models" +description = "DOM-aware tokenizers for 🤗 Hugging Face language models" readme = "README.md" -license = { text = "Apache Software License (Apache-2.0)" } +license = { text = "Apache-2.0" } requires-python = ">=3.10" # match..case classifiers = [ "Development Status :: 4 - Beta", diff --git a/src/dom_tokenizers/train.py b/src/dom_tokenizers/train.py index 5cf0579..8724f2f 100644 --- a/src/dom_tokenizers/train.py +++ b/src/dom_tokenizers/train.py @@ -69,6 +69,8 @@ def get_training_corpus(): corpus_size = len(training_dataset) except TypeError: pass + cs = f"{corpus_size:,}" if corpus_size else "an unknown number of" + print(f"Generating {vocab_size:,} tokens from {cs} examples:") # Train the new tokenizer. new_tokenizer = base_tokenizer.train_new_from_iterator( @@ -78,10 +80,18 @@ def get_training_corpus(): length=corpus_size, show_progress=True, ) + new_tokenizer.name_or_path = _pretty_name(new_tokenizer) return new_tokenizer +def _pretty_name(tokenizer=None, *, vocab_size=None, prefix="dom-tokenizer-"): + if vocab_size is None: + vocab_size = tokenizer.vocab_size + pretty_size = _round_and_prefix(vocab_size) + return f"{prefix}{pretty_size}" + + def _round_and_prefix(value): """314159 -> '314k'.""" whole, frac = divmod(log10(value), 1) @@ -123,8 +133,7 @@ def main(): save_directory = args.save_directory if save_directory is None: - pretty_size = _round_and_prefix(args.vocab_size) - save_directory = f"dom-tokenizer-{pretty_size}" + save_directory = _pretty_name(vocab_size=args.vocab_size) print(f"Output directory: {save_directory}\n") warnings.filterwarnings("ignore", message=r".*resume_download.*")