google-research · shayne-longpre · May 16, 2023
diff --git a/flan/v2/p3_ready_data/adversarial_qa_dbert_answer_the_following_q_train.tsv b/flan/v2/p3_ready_data/adversarial_qa_dbert_answer_the_following_q_train.tsv
diff --git a/flan/v2/p3_ready_data/adversarial_qa_dbert_based_on_train.tsv b/flan/v2/p3_ready_data/adversarial_qa_dbert_based_on_train.tsv
diff --git a/flan/v2/p3_ready_data/adversarial_qa_dbert_generate_question_train.tsv b/flan/v2/p3_ready_data/adversarial_qa_dbert_generate_question_train.tsv
diff --git a/flan/v2/p3_ready_data/adversarial_qa_dbert_question_context_answer_train.tsv b/flan/v2/p3_ready_data/adversarial_qa_dbert_question_context_answer_train.tsv
diff --git a/flan/v2/p3_ready_data/adversarial_qa_dbert_tell_what_it_is_train.tsv b/flan/v2/p3_ready_data/adversarial_qa_dbert_tell_what_it_is_train.tsv
diff --git a/flan/v2/p3_ready_data/adversarial_qa_dbidaf_answer_the_following_q_train.tsv b/flan/v2/p3_ready_data/adversarial_qa_dbidaf_answer_the_following_q_train.tsv
diff --git a/flan/v2/p3_ready_data/adversarial_qa_dbidaf_based_on_train.tsv b/flan/v2/p3_ready_data/adversarial_qa_dbidaf_based_on_train.tsv
diff --git a/flan/v2/p3_ready_data/adversarial_qa_dbidaf_generate_question_train.tsv b/flan/v2/p3_ready_data/adversarial_qa_dbidaf_generate_question_train.tsv
diff --git a/flan/v2/p3_ready_data/adversarial_qa_dbidaf_question_context_answer_train.tsv b/flan/v2/p3_ready_data/adversarial_qa_dbidaf_question_context_answer_train.tsv
diff --git a/flan/v2/p3_ready_data/adversarial_qa_dbidaf_tell_what_it_is_train.tsv b/flan/v2/p3_ready_data/adversarial_qa_dbidaf_tell_what_it_is_train.tsv
diff --git a/flan/v2/p3_ready_data/adversarial_qa_droberta_answer_the_following_q_train.tsv b/flan/v2/p3_ready_data/adversarial_qa_droberta_answer_the_following_q_train.tsv
diff --git a/flan/v2/p3_ready_data/adversarial_qa_droberta_based_on_train.tsv b/flan/v2/p3_ready_data/adversarial_qa_droberta_based_on_train.tsv
diff --git a/flan/v2/p3_ready_data/adversarial_qa_droberta_generate_question_train.tsv b/flan/v2/p3_ready_data/adversarial_qa_droberta_generate_question_train.tsv
diff --git a/flan/v2/p3_ready_data/adversarial_qa_droberta_question_context_answer_train.tsv b/flan/v2/p3_ready_data/adversarial_qa_droberta_question_context_answer_train.tsv
diff --git a/flan/v2/p3_ready_data/adversarial_qa_droberta_tell_what_it_is_train.tsv b/flan/v2/p3_ready_data/adversarial_qa_droberta_tell_what_it_is_train.tsv
diff --git a/flan/v2/preprocessors.py b/flan/v2/preprocessors.py
@@ -938,6 +938,18 @@ def simple_cot_tsv(example):
       "chain_of_thought": chain_of_thought,
   }
 
+@seqio.map_over_dataset
+def simple_p3_tsv(example):
+  """Processes a P3 tsv line: column 0 is the input, column 1 the target."""
+  question = tf.strings.split(example, sep="\t")[0]
+  question = tf.strings.regex_replace(question, r"\\n", "\n")  # literal backslash-n -> real newline
+  answer = tf.strings.split(example, sep="\t")[1]
+  answer = tf.strings.regex_replace(answer, r"\\n", "\n")
+  return {
+      "inputs_pretokenized": question,
+      "targets_pretokenized": answer,
+  }
+
 
 @seqio.map_over_dataset
 def simple_tsv(example):

diff --git a/flan/v2/run_example.py b/flan/v2/run_example.py
@@ -77,7 +77,8 @@
 ##### See 3 Examples of Mixtures or Submixtures you can try
 ##############################################################
 # 1. Example use cases to use just the chain-of-thought zero-shot data:
-selected_mixture = seqio.get_mixture_or_task('cot_zsopt')
+# selected_mixture = seqio.get_mixture_or_task('cot_zsopt')
+selected_mixture = seqio.get_mixture_or_task('t0_zsopt')
 
 # 2. Example use cases to use just all chain-of-thought templates together:
 # selected_mixture = seqio.get_mixture_or_task('cot_submix')

diff --git a/flan/v2/task_configs.py b/flan/v2/task_configs.py
@@ -184,15 +184,35 @@ def _process_lambada(example):
 
   t0_metadata_prep = functools.partial(prep.add_source_info,
     task_name=subtask_id, task_source="P3")
-  T0_TASK_CONFIGS[task_name] = TaskConfig(
-      source=seqio.TfdsDataSource(
-          tfds_name=f"huggingface:bigscience__p3/{subtask_id}",
-        #   tfds_name=f"bigscience__p3/{subtask_id}",
-          splits=["train"]),
-      preprocessors=preprocessors + [t0_metadata_prep],
+
+  if "adversarial_qa_" in subtask_id:  # these subtasks are read from local TSVs under p3_ready_data
+    P3_DATA_PATH = os.path.join(os.path.dirname(__file__), "p3_ready_data")
+    preprocessors = [
+        prep.simple_p3_tsv, 
+        functools.partial(prep.t0, multiple_choice=False),
+        t0_metadata_prep,
+    ]
+    fpath = os.path.join(P3_DATA_PATH, f"{subtask_id}_train.tsv")
+    print(fpath)  # NOTE(review): debug output emitted while task configs are built; consider removing
+    T0_TASK_CONFIGS[task_name] = TaskConfig(
+      source=seqio.TextLineDataSource(
+          {"train": fpath},
+          num_input_examples={"train": 10000}),  # NOTE(review): hard-coded count; confirm each TSV has 10000 rows
+      preprocessors=preprocessors,
       postprocess_fn=postprocessors,
       metric_fns=[t5_metrics.accuracy],
-  )
+    )
+  else:
+    continue  # all other P3 subtasks are skipped: the TFDS-backed config below is commented out
+    # T0_TASK_CONFIGS[task_name] = TaskConfig(
+    #     source=seqio.TfdsDataSource(
+    #         tfds_name=f"huggingface:bigscience__p3/{subtask_id}",
+    #         #   tfds_name=f"bigscience__p3/{subtask_id}",
+    #         splits=["train"]),
+    #     preprocessors=preprocessors + [t0_metadata_prep],
+    #     postprocess_fn=postprocessors,
+    #     metric_fns=[t5_metrics.accuracy],
+    # )
 
 # ====================== Natural Instructions v2.5 ======================
 # Prepare lookup table for positive example info