From 196d354e25489a7f961f9b36f3de2b734c4ed7a6 Mon Sep 17 00:00:00 2001 From: Gary Benson Date: Fri, 21 Jun 2024 00:03:11 +0100 Subject: [PATCH] Log here too --- src/dom_tokenizers/train.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/dom_tokenizers/train.py b/src/dom_tokenizers/train.py index 7e6f1b7..ad39f3e 100644 --- a/src/dom_tokenizers/train.py +++ b/src/dom_tokenizers/train.py @@ -1,3 +1,4 @@ +import logging import os import warnings @@ -19,6 +20,8 @@ SEND_BUGS_TO, ) +logger = logging.getLogger(__name__) + def train_tokenizer( training_dataset, @@ -55,6 +58,11 @@ def futz_input(real_input): def get_training_corpus(): for row in training_dataset: + logger.info( + "source %d: %s", + row.get("source_index", -1), + row.get("displayed_url"), + ) yield futz_input(json.dumps(row["dom_snapshot"])) # Try and get a dataset length, for the progress tracker.