Skip to content

Commit

Permalink
refactor: Introduce jldump
Browse files Browse the repository at this point in the history
Dump a list to a file in jsonl format.

Signed-off-by: Costa Shulyupin <[email protected]>
  • Loading branch information
makelinux committed Nov 19, 2024
1 parent 8bb8e3b commit 870cefe
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 12 deletions.
16 changes: 4 additions & 12 deletions src/instructlab/sdg/generate_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
PipelineContext,
)
from instructlab.sdg.utils import GenerateException, models
from instructlab.sdg.utils.json import jldump
from instructlab.sdg.utils.taxonomy import (
leaf_node_to_samples,
read_taxonomy_leaf_nodes,
Expand Down Expand Up @@ -112,15 +113,9 @@ def _gen_train_data(
}
messages_data.append(_convert_to_messages(sample))

with open(output_file_train, "w", encoding="utf-8") as outfile:
for entry in train_data:
json.dump(entry, outfile, ensure_ascii=False)
outfile.write("\n")
jldump(train_data, output_file_train)

with open(output_file_messages, "w", encoding="utf-8") as outfile:
for entry in messages_data:
json.dump(entry, outfile, ensure_ascii=False)
outfile.write("\n")
jldump(messages_data, output_file_messages)


def _knowledge_seed_example_to_test_data(seed_example, system_prompt):
Expand Down Expand Up @@ -170,10 +165,7 @@ def _gen_test_data(
}
)

with open(output_file_test, "w", encoding="utf-8") as outfile:
for entry in test_data:
json.dump(entry, outfile, ensure_ascii=False)
outfile.write("\n")
jldump(test_data, output_file_test)


def _check_pipeline_dir(pipeline):
Expand Down
14 changes: 14 additions & 0 deletions src/instructlab/sdg/utils/json.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0

# Standard
from typing import Any, Iterable
import io
import json
import os
Expand Down Expand Up @@ -46,3 +47,16 @@ def jload(f, mode="r"):
"""Load a .json file into a dictionary."""
with _make_r_io_base(f, mode) as f_:
return json.load(f_)


def jldump(data: Iterable[Any], out: str | io.IOBase) -> None:
    """Dump an iterable of entries to a file in JSON Lines (jsonl) format.

    Each entry is serialized as one JSON document followed by a newline.
    Non-ASCII characters are emitted as-is rather than as ``\\uXXXX``
    escapes (``ensure_ascii=False``).

    Args:
        data: The entries to write, one per output line.
        out: An io.IOBase object or a file path; it is handed to
            ``_make_w_io_base`` to obtain the writable stream.
    """
    with _make_w_io_base(out, "w") as outfile:
        for entry in data:
            # Serialize the whole entry before touching the stream so an
            # encoding failure never leaves a truncated line behind.
            outfile.write(json.dumps(entry, ensure_ascii=False) + "\n")

0 comments on commit 870cefe

Please sign in to comment.