Commit: dump json snapshots
gbenson committed Jun 5, 2024
1 parent 4cd2ad2 commit 07e2cd7
Showing 1 changed file with 7 additions and 2 deletions.
src/dom_tokenizers/train.py: 7 additions & 2 deletions
@@ -41,7 +41,7 @@ def train_tokenizer(
     base_tokenizer.backend_tokenizer.pre_tokenizer = WhitespaceSplit()
     base_pretokenizer = base_tokenizer.backend_tokenizer.pre_tokenizer
 
-    def futz_input(real_input):
+    def futz_input(real_input, source_index):
         pretokenized = new_pretokenizer.pre_tokenize_str(real_input)
         want_tokens = list(chain.from_iterable(
             token.split() for token, offsets in pretokenized
@@ -54,7 +54,12 @@ def futz_input(real_input):
 
     def get_training_corpus():
         for row in training_dataset:
-            yield futz_input(json.dumps(row["dom_snapshot"]))
+            source_index = row["source_index"]
+            filename = os.path.expanduser(f"~/json/{source_index}.json")
+            serialized = json.dumps(row["dom_snapshot"])
+            with open(filename, "w") as fp:
+                fp.write(serialized)
+            yield futz_input(serialized, source_index)
 
     # Try and get a dataset length, for the progress tracker.
     if corpus_size is None:
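In short, the commit threads a new source_index argument through futz_input() and has get_training_corpus() write each serialized dom_snapshot to ~/json/<source_index>.json before yielding it for tokenizer training, so individual snapshots end up on disk where they can be inspected afterwards. Note that open(filename, "w") will fail with FileNotFoundError unless the ~/json directory already exists, so presumably it did on the author's machine.

Below is a minimal sketch, not part of the commit, of how one of the dumped snapshots could be read back for inspection. It assumes only the ~/json directory and the <source_index>.json naming visible in the diff above; load_snapshot is a hypothetical helper, not something from the repository.

    import json
    import os

    def load_snapshot(source_index):
        """Load one DOM snapshot dumped by get_training_corpus()."""
        # Assumes the ~/json/<source_index>.json layout used in the commit.
        filename = os.path.expanduser(f"~/json/{source_index}.json")
        with open(filename) as fp:
            return json.load(fp)

    # Example: reload snapshot 0 and, assuming it deserializes to a dict,
    # list its top-level keys.
    snapshot = load_snapshot(0)
    print(sorted(snapshot))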
