Deprecate DOMSnapshotTokenizer.from_pretrained()
gbenson committed May 21, 2024
1 parent ea11e33 commit ea6f26a
Showing 4 changed files with 18 additions and 9 deletions.
pyproject.toml (9 changes: 5 additions & 4 deletions)
@@ -2,7 +2,7 @@
 name = "dom-tokenizers"
 version = "0.0.10"
 authors = [{ name = "Gary Benson", email = "[email protected]" }]
-description = "DOM-aware tokenizers for 🤗 Hugging Face language models"
+description = "DOM-aware tokenization for 🤗 Hugging Face language models"
 readme = "README.md"
 license = { text = "Apache Software License" }
 requires-python = ">=3.10"  # match..case
@@ -24,10 +24,11 @@ classifiers = [
     "Topic :: Text Processing :: Markup :: HTML",
 ]
 dependencies = [
-    "python-magic",
+    "python-magic",  # XXX review
     "tokenizers",
-    "transformers",
-    "unidecode",
+    "transformers",  # move to dev, train
+    "typing-extensions",
+    "unidecode",  # XXX review
 ]

 [project.urls]
src/dom_tokenizers/tokenizers.py (4 changes: 4 additions & 0 deletions)
@@ -1,9 +1,13 @@
+from typing_extensions import deprecated
+
 from .internal.transformers import AutoTokenizer
 from .pre_tokenizers import DOMSnapshotPreTokenizer


+@deprecated("use `DOMSnapshotPreTokenizer` instead")
 class DOMSnapshotTokenizer:
     @classmethod
+    @deprecated("use `DOMSnapshotPreTokenizer.adapt()` instead")
     def from_pretrained(cls, *args, **kwargs):
         tokenizer = AutoTokenizer.from_pretrained(*args, **kwargs)
         DOMSnapshotPreTokenizer.hook_into(tokenizer)
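Taken together, the two decorators keep the old entry point working while steering callers toward the pre-tokenizer API: typing_extensions.deprecated (PEP 702, hence the new typing-extensions dependency above) flags the API for type checkers and emits a DeprecationWarning at runtime. A rough before/after sketch for downstream code; it follows the hook_into() pattern this commit applies in train.py and conftest.py (the adapt() method named in the deprecation message is not part of this commit), and it uses transformers' public AutoTokenizer rather than the repo's internal wrapper:

    # Before: the old one-liner, which now emits a DeprecationWarning.
    from dom_tokenizers import DOMSnapshotTokenizer

    tokenizer = DOMSnapshotTokenizer.from_pretrained("bert-base-cased")

    # After: build a stock fast tokenizer, then attach the DOM-aware
    # pre-tokenizer to it, as train.py and conftest.py now do.
    from transformers import AutoTokenizer

    from dom_tokenizers import DOMSnapshotPreTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    DOMSnapshotPreTokenizer.hook_into(tokenizer)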
src/dom_tokenizers/train.py (7 changes: 4 additions & 3 deletions)
@@ -8,7 +8,8 @@
 from datasets import load_dataset
 from tokenizers.pre_tokenizers import WhitespaceSplit

-from .tokenizers import DOMSnapshotTokenizer
+from .internal.transformers import AutoTokenizer
+from .pre_tokenizers import DOMSnapshotPreTokenizer

 DEFAULT_BASE_TOKENIZER = "bert-base-cased"
 DEFAULT_SPLIT = "train"
@@ -24,8 +25,8 @@ def train_tokenizer(

     # Create the base tokenizer we'll train our new tokenizer from.
     if isinstance(base_tokenizer, str):
-        base_tokenizer = DOMSnapshotTokenizer.from_pretrained(
-            base_tokenizer)
+        base_tokenizer = AutoTokenizer.from_pretrained(base_tokenizer)
+        DOMSnapshotPreTokenizer.hook_into(base_tokenizer)

     # It's not possible to train using a custom pre-tokenizer, the Rust
     # code raises "Exception: Custom PreTokenizer cannot be serialized"
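The comment at the bottom of this hunk is why the swap matters for training: the Rust layer refuses to serialize a custom Python pre-tokenizer, so training has to substitute a serializable built-in such as the WhitespaceSplit imported above. A minimal sketch of that swap-train-restore shape; the helper name is hypothetical, and how this module actually prepares the corpus is not shown in this diff:

    from tokenizers.pre_tokenizers import WhitespaceSplit

    def train_with_builtin_pre_tokenizer(tokenizer, corpus, trainer):
        # Hypothetical helper: temporarily replace the custom (Python)
        # pre-tokenizer with a serializable built-in while training runs.
        backend = tokenizer.backend_tokenizer
        custom = backend.pre_tokenizer
        backend.pre_tokenizer = WhitespaceSplit()
        try:
            backend.train_from_iterator(corpus, trainer=trainer)
        finally:
            backend.pre_tokenizer = custom  # reinstate the DOM-aware hook
        return tokenizer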
tests/conftest.py (7 changes: 5 additions & 2 deletions)
@@ -1,6 +1,7 @@
 import pytest

-from dom_tokenizers import DOMSnapshotTokenizer
+from dom_tokenizers import DOMSnapshotPreTokenizer
+from dom_tokenizers.internal.transformers import AutoTokenizer
 from dom_tokenizers.train import DEFAULT_BASE_TOKENIZER


@@ -9,7 +10,9 @@ def dom_snapshot_tokenizer():
     """An instance of a tokenizer that consumes JSON-serialized
     DOM snapshots.
     """
-    return DOMSnapshotTokenizer.from_pretrained(DEFAULT_BASE_TOKENIZER)
+    tokenizer = AutoTokenizer.from_pretrained(DEFAULT_BASE_TOKENIZER)
+    DOMSnapshotPreTokenizer.hook_into(tokenizer)
+    return tokenizer


 @pytest.fixture
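With the fixture rewritten to the new pattern, a downstream test could still pin down the deprecation itself, since pytest's deprecated_call() catches the warning the decorators now emit. A sketch of such a test (not part of this commit; it assumes the base tokenizer can be fetched):

    import pytest

    from dom_tokenizers import DOMSnapshotTokenizer
    from dom_tokenizers.train import DEFAULT_BASE_TOKENIZER

    def test_from_pretrained_is_deprecated():
        # The old classmethod should now emit a DeprecationWarning.
        with pytest.deprecated_call():
            DOMSnapshotTokenizer.from_pretrained(DEFAULT_BASE_TOKENIZER)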
