From 4734b680d7c9480b662f3cca1e49c6e1dc78d660 Mon Sep 17 00:00:00 2001 From: Gary Benson Date: Thu, 16 May 2024 01:02:24 +0100 Subject: [PATCH] *** next steps --- README.md | 6 ++++-- pyproject.toml | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index b809b48..da316cd 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ # DOM tokenizers -DOM-aware tokenizers for [🤗 Hugging Face](https://huggingface.co/) +DOM-aware tokenizers for [Hugging Face](https://huggingface.co/) language models. ## Installation @@ -30,7 +30,9 @@ pip install --upgrade pip pip install -e .[dev,train] ``` -## Train a tokenizer +## Load a pretrained tokenizer from the Hub + +## Train your own ```sh train-tokenizer gbenson/interesting-dom-snapshots -n 10000 ``` diff --git a/pyproject.toml b/pyproject.toml index 6b73303..ccc9d7e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,9 +2,9 @@ name = "dom-tokenizers" version = "0.0.2" authors = [{ name = "Gary Benson" }] -description = "DOM-aware tokenizers for Hugging Face language models" +description = "DOM-aware tokenizers for 🤗 Hugging Face language models" readme = "README.md" -license = { text = "Apache Software License (Apache-2.0)" } +license = { text = "Apache-2.0" } requires-python = ">=3.10" # match..case classifiers = [ "Development Status :: 4 - Beta",