Skip to content

Commit

Permalink
Minor updates
Browse files Browse the repository at this point in the history
  • Loading branch information
gbenson committed May 16, 2024
1 parent 241a522 commit 50b9863
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 8 deletions.
7 changes: 4 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,7 @@

# DOM tokenizers

DOM-aware tokenizers for 🤗 [Hugging Face](https://huggingface.co/)
language models.
DOM-aware tokenizers for Hugging Face language models.

## Installation

Expand All @@ -31,7 +30,9 @@ pip install --upgrade pip
pip install -e .[dev,train]
```

## Train a tokenizer
## Load a pretrained tokenizer from the Hub

## Train your own

### On the command line

Expand Down
6 changes: 3 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
[project]
name = "dom-tokenizers"
version = "0.0.2"
version = "0.0.3"
authors = [{ name = "Gary Benson" }]
description = "DOM-aware tokenizers for Hugging Face language models"
description = "DOM-aware tokenizers for 🤗 Hugging Face language models"
readme = "README.md"
license = { text = "Apache Software License (Apache-2.0)" }
license = { text = "Apache-2.0" }
requires-python = ">=3.10" # match..case
classifiers = [
"Development Status :: 4 - Beta",
Expand Down
13 changes: 11 additions & 2 deletions src/dom_tokenizers/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ def get_training_corpus():
corpus_size = len(training_dataset)
except TypeError:
pass
cs = f"{corpus_size:,}" if corpus_size else "an unknown number of"
print(f"Generating {vocab_size:,} tokens from {cs} examples:")

# Train the new tokenizer.
new_tokenizer = base_tokenizer.train_new_from_iterator(
Expand All @@ -78,10 +80,18 @@ def get_training_corpus():
length=corpus_size,
show_progress=True,
)
new_tokenizer.name_or_path = _pretty_name(new_tokenizer)

return new_tokenizer


def _pretty_name(tokenizer=None, *, vocab_size=None, prefix="dom-tokenizer-"):
    """Build a human-friendly tokenizer name, e.g. ``dom-tokenizer-4k``.

    :param tokenizer: tokenizer whose ``vocab_size`` supplies the size
        when *vocab_size* is not given explicitly.
    :param vocab_size: explicit vocabulary size; takes precedence over
        *tokenizer* when provided.
    :param prefix: string prepended to the abbreviated size.
    :returns: *prefix* followed by the SI-abbreviated vocabulary size
        (as produced by ``_round_and_prefix``).
    """
    size = vocab_size if vocab_size is not None else tokenizer.vocab_size
    return prefix + _round_and_prefix(size)


def _round_and_prefix(value):
"""314159 -> '314k'."""
whole, frac = divmod(log10(value), 1)
Expand Down Expand Up @@ -123,8 +133,7 @@ def main():

save_directory = args.save_directory
if save_directory is None:
pretty_size = _round_and_prefix(args.vocab_size)
save_directory = f"dom-tokenizer-{pretty_size}"
save_directory = _pretty_name(vocab_size=args.vocab_size)
print(f"Output directory: {save_directory}\n")

warnings.filterwarnings("ignore", message=r".*resume_download.*")
Expand Down

0 comments on commit 50b9863

Please sign in to comment.