diff --git a/src/instructlab/sdg/datamixing.py b/src/instructlab/sdg/datamixing.py index 2a4f3923..5172fdfb 100644 --- a/src/instructlab/sdg/datamixing.py +++ b/src/instructlab/sdg/datamixing.py @@ -445,7 +445,9 @@ def __create_auxiliary_ds(rec): def _create_phase10_ds( - generated_dataset: Dataset, auxiliary_inst: Optional[Dict[str, List[str]]] + generated_dataset: Dataset, + auxiliary_inst: Optional[Dict[str, List[str]]], + use_legacy_pretraining_format: bool, ): """ Create a dataset for Phase 1.0 of downstream training. @@ -457,13 +459,20 @@ def _create_phase10_ds( knowledge_ds = _generate_knowledge_qa_dataset( generated_dataset, keep_context_separate=True ) - knowledge_ds = _add_extra_contexts_to_samples(knowledge_ds, p=0.4) + raft_knowledge_ds = _add_extra_contexts_to_samples(knowledge_ds, p=0.4) + # Include phase07 + pretraining_knowledge_ds = _generate_knowledge_qa_dataset( + generated_dataset, keep_context_separate=False + ).map(lambda rec: _conv_pretrain(rec, use_legacy_pretraining_format)) auxiliary_dataset = _create_auxiliary_dataset(generated_dataset, auxiliary_inst) + if auxiliary_dataset is not None: - phase10 = concatenate_datasets([knowledge_ds, auxiliary_dataset]) + phase10 = concatenate_datasets( + [raft_knowledge_ds, pretraining_knowledge_ds, auxiliary_dataset] + ) else: - phase10 = knowledge_ds + phase10 = concatenate_datasets([raft_knowledge_ds, pretraining_knowledge_ds]) return phase10 @@ -601,7 +610,7 @@ def collect( ) skills_phase_data = _create_phase10_ds( - new_generated_data, self.auxiliary_inst + new_generated_data, self.auxiliary_inst, use_legacy_pretraining_format ) output_file_leaf_skills = ( f"node_datasets_{self.date_suffix}/{leaf_node_path}_p10.jsonl"