Skip to content

Commit

Permalink
Merge pull request #366 from aakankshaduggal/data-mix-fix
Browse files Browse the repository at this point in the history
Data mix fix
  • Loading branch information
mergify[bot] authored Nov 12, 2024
2 parents eaaccca + 3040657 commit b6f07a8
Showing 1 changed file with 14 additions and 5 deletions.
19 changes: 14 additions & 5 deletions src/instructlab/sdg/datamixing.py
Original file line number Diff line number Diff line change
Expand Up @@ -445,7 +445,9 @@ def __create_auxiliary_ds(rec):


def _create_phase10_ds(
generated_dataset: Dataset, auxiliary_inst: Optional[Dict[str, List[str]]]
generated_dataset: Dataset,
auxiliary_inst: Optional[Dict[str, List[str]]],
use_legacy_pretraining_format: bool,
):
"""
Create a dataset for Phase 1.0 of downstream training.
Expand All @@ -457,13 +459,20 @@ def _create_phase10_ds(
knowledge_ds = _generate_knowledge_qa_dataset(
generated_dataset, keep_context_separate=True
)
knowledge_ds = _add_extra_contexts_to_samples(knowledge_ds, p=0.4)
raft_knowledge_ds = _add_extra_contexts_to_samples(knowledge_ds, p=0.4)
# Include phase07
pretraining_knowledge_ds = _generate_knowledge_qa_dataset(
generated_dataset, keep_context_separate=False
).map(lambda rec: _conv_pretrain(rec, use_legacy_pretraining_format))

auxiliary_dataset = _create_auxiliary_dataset(generated_dataset, auxiliary_inst)

if auxiliary_dataset is not None:
phase10 = concatenate_datasets([knowledge_ds, auxiliary_dataset])
phase10 = concatenate_datasets(
[raft_knowledge_ds, pretraining_knowledge_ds, auxiliary_dataset]
)
else:
phase10 = knowledge_ds
phase10 = concatenate_datasets([raft_knowledge_ds, pretraining_knowledge_ds])
return phase10


Expand Down Expand Up @@ -601,7 +610,7 @@ def collect(
)

skills_phase_data = _create_phase10_ds(
new_generated_data, self.auxiliary_inst
new_generated_data, self.auxiliary_inst, use_legacy_pretraining_format
)
output_file_leaf_skills = (
f"node_datasets_{self.date_suffix}/{leaf_node_path}_p10.jsonl"
Expand Down

0 comments on commit b6f07a8

Please sign in to comment.