Skip to content

Commit

Permalink
Minor updates
Browse files Browse the repository at this point in the history
  • Loading branch information
gbenson committed May 16, 2024
1 parent 241a522 commit 50b9863
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 8 deletions.
7 changes: 4 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,7 @@

# DOM tokenizers

DOM-aware tokenizers for 🤗 [Hugging Face](https://huggingface.co/)
language models.
DOM-aware tokenizers for Hugging Face language models.

## Installation

Expand All @@ -31,7 +30,9 @@ pip install --upgrade pip
pip install -e .[dev,train]
```

## Train a tokenizer
## Load a pretrained tokenizer from the Hub

## Train your own

### On the command line

Expand Down
6 changes: 3 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
[project]
name = "dom-tokenizers"
version = "0.0.2"
version = "0.0.3"
authors = [{ name = "Gary Benson" }]
description = "DOM-aware tokenizers for Hugging Face language models"
description = "DOM-aware tokenizers for 🤗 Hugging Face language models"
readme = "README.md"
license = { text = "Apache Software License (Apache-2.0)" }
license = { text = "Apache-2.0" }
requires-python = ">=3.10" # match..case
classifiers = [
"Development Status :: 4 - Beta",
Expand Down
13 changes: 11 additions & 2 deletions src/dom_tokenizers/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ def get_training_corpus():
corpus_size = len(training_dataset)
except TypeError:
pass
cs = f"{corpus_size:,}" if corpus_size else "an unknown number of"
print(f"Generating {vocab_size:,} tokens from {cs} examples:")

# Train the new tokenizer.
new_tokenizer = base_tokenizer.train_new_from_iterator(
Expand All @@ -78,10 +80,18 @@ def get_training_corpus():
length=corpus_size,
show_progress=True,
)
new_tokenizer.name_or_path = _pretty_name(new_tokenizer)

return new_tokenizer


def _pretty_name(tokenizer=None, *, vocab_size=None, prefix="dom-tokenizer-"):
    """Build a human-friendly tokenizer name, e.g. ``dom-tokenizer-4k``.

    :param tokenizer: tokenizer whose ``vocab_size`` supplies the size
        when *vocab_size* is not given explicitly.
    :param vocab_size: explicit vocabulary size; takes precedence over
        *tokenizer* when provided.
    :param prefix: string prepended to the abbreviated size.
    :returns: *prefix* followed by the SI-abbreviated vocabulary size
        (as produced by ``_round_and_prefix``).
    """
    size = vocab_size if vocab_size is not None else tokenizer.vocab_size
    return prefix + _round_and_prefix(size)


def _round_and_prefix(value):
"""314159 -> '314k'."""
whole, frac = divmod(log10(value), 1)
Expand Down Expand Up @@ -123,8 +133,7 @@ def main():

save_directory = args.save_directory
if save_directory is None:
pretty_size = _round_and_prefix(args.vocab_size)
save_directory = f"dom-tokenizer-{pretty_size}"
save_directory = _pretty_name(vocab_size=args.vocab_size)
print(f"Output directory: {save_directory}\n")

warnings.filterwarnings("ignore", message=r".*resume_download.*")
Expand Down

0 comments on commit 50b9863

Please sign in to comment.