Skip to content

Commit

Permalink
Log here too
Browse files Browse the repository at this point in the history
  • Loading branch information
gbenson committed Jun 20, 2024
1 parent e243845 commit 196d354
Showing 1 changed file with 8 additions and 0 deletions.
8 changes: 8 additions & 0 deletions src/dom_tokenizers/train.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import logging
import os
import warnings

Expand All @@ -19,6 +20,8 @@
SEND_BUGS_TO,
)

logger = logging.getLogger(__name__)


def train_tokenizer(
training_dataset,
Expand Down Expand Up @@ -55,6 +58,11 @@ def futz_input(real_input):

# Generator producing the training corpus: walks `training_dataset` one
# row at a time and yields one value per row.
# NOTE(review): leading indentation was lost in this rendering of the
# diff. `training_dataset` and `futz_input` are not defined in this block,
# so it presumably sits nested inside `train_tokenizer` -- confirm against
# the real file.
def get_training_corpus():
for row in training_dataset:
# Log which row is about to be consumed. A row with no "source_index"
# key is reported as -1; "displayed_url" is looked up with no explicit
# default.
# NOTE(review): "%d" presumes source_index is numeric -- verify against
# the dataset schema.
logger.info(
"source %d: %s",
row.get("source_index", -1),
row.get("displayed_url"),
)
# Serialise the row's "dom_snapshot" to JSON text, then pass it through
# futz_input (defined outside this view) before yielding it.
yield futz_input(json.dumps(row["dom_snapshot"]))

# Try and get a dataset length, for the progress tracker.
Expand Down

0 comments on commit 196d354

Please sign in to comment.