Retrain before retagging #16

Open · wants to merge 106 commits into base: main
Changes from all commits (106 commits)
4e42226
Prototype for retagging using spacy
Sep 8, 2023
b056178
Prototype for retagging using spacy
Sep 8, 2023
b6c1444
Prototype for retagging using spacy
Sep 8, 2023
d847882
Prototype for retagging using spacy
Sep 8, 2023
348ad3b
Prototype for retagging using spacy
Sep 8, 2023
b9b1e9f
Prototype for retagging using spacy
Sep 8, 2023
06840fd
Prototype for retagging using spacy
Sep 8, 2023
b442043
Prototype for retagging using spacy
Sep 8, 2023
66156a8
Prototype for retagging using spacy
Sep 8, 2023
f3f8c23
Changes to cNN
Sep 8, 2023
f6c770e
Changes to cNN
Sep 8, 2023
ec2aeb6
Adds cpu and gpu
Sep 8, 2023
4289f4f
Adds large model
Sep 8, 2023
96356ae
Adds large model
Sep 8, 2023
e57e891
Adds sparknlp
Sep 8, 2023
e38c668
Adds sparknlp
Sep 8, 2023
da10221
Different config of sparknlp
Sep 8, 2023
848fd7a
Fixes pipeline
Sep 8, 2023
bf06cc8
Prototypes retagging after training
Sep 8, 2023
8db1322
Prototypes retagging after training
Sep 8, 2023
5440a99
Prototypes retagging after training
Sep 8, 2023
98cc4e3
Saves corrections
Sep 8, 2023
35652e0
Saves corrections
Sep 8, 2023
7a2b0ed
Saves corrections
Sep 8, 2023
2a64c86
Saves corrections
Sep 8, 2023
47690b0
Saves corrections
Sep 8, 2023
d7dd1cb
Saves corrections
Sep 8, 2023
2951428
500 rows for better accuracy
Sep 8, 2023
42b3b82
Adds batching and refactors prediction
Sep 9, 2023
c94a5a2
Adds batching and refactors prediction
Sep 9, 2023
430cc8a
Adds batching and refactors prediction
Sep 9, 2023
c93b945
Adds batching and refactors prediction
Sep 9, 2023
d8e318b
Adds batching and refactors prediction
Sep 9, 2023
0af5a85
Adds batching and refactors prediction
Sep 9, 2023
e936285
Adds batching and refactors prediction
Sep 9, 2023
44a790b
Adds some logging, refactors
Sep 9, 2023
d948f3a
Adds pure spark
Sep 10, 2023
5850b22
Adds pure spark
Sep 11, 2023
f884a25
Adds spark.repartition
Sep 11, 2023
7ebabba
Adds spark.repartition
Sep 11, 2023
e86a3fe
Adds parquet optimization
Sep 11, 2023
6083b22
Adds parquet optimization
Sep 11, 2023
4c3933d
Adds parquet optimization
Sep 11, 2023
06b5446
Adds spark context config
josejuanmartinez Sep 11, 2023
629e6b4
Spark config
Sep 11, 2023
127c8b4
Spark config
Sep 11, 2023
2dd3243
Spark config
Sep 11, 2023
f486608
Spark config
Sep 11, 2023
6be482b
Spark config
Sep 11, 2023
dca895b
Spark config
Sep 11, 2023
c8584bd
Spark config
Sep 11, 2023
13dbf67
Spark config
Sep 11, 2023
c5667be
Spark config
Sep 11, 2023
d71acf2
Remove duplicates and add reference number
nsorros Sep 12, 2023
f7d70b0
Switch to save in excel format
nsorros Sep 12, 2023
5cb92d4
Run pre-commit
Sep 12, 2023
902eda0
Adds documentation, configurable tags, years, memory
Sep 12, 2023
4501342
Add openpyxl to support excel
Sep 12, 2023
7b78843
Run dvc
Sep 12, 2023
22fad69
Total refactor: XLinear
Sep 12, 2023
2bc3e21
Total refactor finished for retagging: XLinear
Sep 13, 2023
f4b1f32
Black
Sep 13, 2023
4ecec1c
Adds retagging tests and updates documentation
Sep 13, 2023
8bc7b4d
Adds retagging tests and updates documentation
Sep 13, 2023
58f39fa
Black
Sep 13, 2023
ad84583
Ruff
Sep 13, 2023
5d41bfe
Ruff and black
Sep 13, 2023
96c88ce
black
Sep 13, 2023
c70bdad
Removes spacy
Sep 13, 2023
75a952c
Merge pull request #16 from MantisAI/fix-wellcome-active-sample
nsorros Sep 14, 2023
487f8cf
Adds tests for augment
Sep 14, 2023
efacba7
Black
Sep 14, 2023
6e01ee0
Better error management. Black
josejuanmartinez Sep 14, 2023
3e91ec6
"Better error management. Black"
Sep 14, 2023
10c4468
Ruff
Sep 14, 2023
5882aa0
Merge branch 'main' into retagging
Sep 14, 2023
7206885
Feedback from PR comments. Merging latest main into this branch.
Sep 14, 2023
867fddb
Black
Sep 14, 2023
b1f1b40
Adds retagging dvc data
Sep 14, 2023
88fdd2a
Adds retagging dvc data
Sep 14, 2023
f98e598
Adds retagging dvc data
Sep 14, 2023
71eebbf
Changes reference fail from original to retagged
Sep 14, 2023
2660b7a
Changes reference fail from original to retagged
Sep 14, 2023
a8f86a6
Adds mkdir to yaml
josejuanmartinez Sep 14, 2023
0e8aa1c
Adds two steps to the dvc pipeline
Sep 14, 2023
d79d80b
Adds two steps to the dvc pipeline
Sep 14, 2023
0adb58e
Adds two steps to the dvc pipeline
Sep 14, 2023
43046ed
Adds two steps to the dvc pipeline
Sep 14, 2023
d8a30b9
Adds two steps to the dvc pipeline
Sep 14, 2023
441ee98
Adds two steps to the dvc pipeline
Sep 14, 2023
b1736da
Adds two steps to the dvc pipeline
Sep 14, 2023
5859bbe
Adds two steps to the dvc pipeline
Sep 14, 2023
25aa0e6
Adds two steps to the dvc pipeline
Sep 14, 2023
cf64bb1
Adds two steps to the dvc pipeline
Sep 14, 2023
f5c8da3
Default wandb
Sep 14, 2023
a45d982
Documentation about wandb and dvc repro
Sep 14, 2023
823bc64
12 epochs
Sep 14, 2023
ac928f1
Fixes bug with the batch calculation
Sep 15, 2023
e41dcbd
black
josejuanmartinez Sep 15, 2023
b1dc41f
Merge pull request #17 from MantisAI/retagging
josejuanmartinez Sep 15, 2023
c021da7
Adds last trained model
josejuanmartinez Sep 15, 2023
ae92b78
Adds last metrics bertmesh
josejuanmartinez Sep 16, 2023
f823fd4
tag all active grants
nsorros Sep 19, 2023
97da452
Merge pull request #18 from MantisAI/active-grants
nsorros Oct 17, 2023
f66cbbb
:wrench: change to steps saving / eval strategy
agombert Oct 24, 2023
58ef63e
retrained evaluation
agombert Nov 1, 2023
2 changes: 2 additions & 0 deletions .gitignore
@@ -163,3 +163,5 @@ cython_debug/
# Folder where training outputs are stored
bertmesh_outs/
wandb/
/bertmesh_before_retagging
/preprocessed_results
223 changes: 201 additions & 22 deletions README.md

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions bertmesh_before_retagging.dvc
@@ -0,0 +1,5 @@
outs:
- md5: 4964c2e8f83f071bcb7c467a859726a6.dir
size: 2593104471
nfiles: 5
path: bertmesh_before_retagging
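
This new `.dvc` pointer tracks the pre-retagging BERTMesh model directory (roughly 2.6 GB across 5 files) through DVC instead of committing it to git. Assuming a DVC remote is configured for this repository, the directory can be fetched before reproducing the pipeline; a minimal sketch:

```python
# Minimal sketch, assuming a DVC remote is configured for this repository.
# Fetches the directory tracked by bertmesh_before_retagging.dvc.
import subprocess

subprocess.run(["dvc", "pull", "bertmesh_before_retagging.dvc"], check=True)
```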
1 change: 1 addition & 0 deletions data/grants_comparison/.gitignore
@@ -1,2 +1,3 @@
/meshterms_list.txt
/comparison.csv
/comparison.xlsx
1 change: 1 addition & 0 deletions data/raw/.gitignore
100644 → 100755
@@ -3,3 +3,4 @@
/desc2021.xml
/disease_tags_validation_grants.xlsx
/active_grants_last_5_years.csv
/retagging
5 changes: 5 additions & 0 deletions data/raw/retagging.dvc
@@ -0,0 +1,5 @@
outs:
- md5: 1a64ed7c09ef3bc49b1bfcc17f5d7e1f.dir
size: 5546175163
nfiles: 6
path: retagging
8 changes: 5 additions & 3 deletions examples/augment.sh
@@ -1,3 +1,5 @@
grants-tagger augment mesh [FOLDER_AFTER_PREPROCESSING] [SET_YOUR_OUTPUT_FOLDER_HERE] \
--min-examples 25 \
--concurrent-calls 25
# Augments data using a file with 1 label per line and years
grants-tagger augment mesh [FOLDER_AFTER_PREPROCESSING] [SET_YOUR_OUTPUT_FILE] \
--tags "Mathematics" \
--examples 25 \
--concurrent-calls 1
5 changes: 0 additions & 5 deletions examples/augment_specific_tags.sh

This file was deleted.

3 changes: 2 additions & 1 deletion examples/preprocess_and_train_by_epochs.sh
@@ -1,7 +1,8 @@
# Run on g5.12xlarge instance

# Without saving (on-the-fly)
SOURCE="data/raw/allMeSH_2021.jsonl"
#SOURCE="data/raw/allMeSH_2021.jsonl"
SOURCE="data/raw/retagging/allMeSH_2021.2016-2021.jsonl"

grants-tagger train bertmesh \
"" \
3 changes: 2 additions & 1 deletion examples/preprocess_and_train_by_steps.sh
@@ -1,7 +1,8 @@
# Run on g5.12xlarge instance

# Without saving (on-the-fly)
SOURCE="data/raw/allMeSH_2021.jsonl"
# SOURCE="data/raw/allMeSH_2021.jsonl"
SOURCE="data/raw/retagging/allMeSH_2021.2016-2021.jsonl"

grants-tagger train bertmesh \
"" \
4 changes: 2 additions & 2 deletions examples/preprocess_splitting_by_fract.sh
@@ -1,2 +1,2 @@
grants-tagger preprocess mesh data/raw/allMeSH_2021.jsonl [SET_YOUR_OUTPUT_FOLDER_HERE] '' \
--test-size 0.05
grants-tagger preprocess mesh data/raw/retagging/allMeSH_2021.2016-2021.jsonl [SET_YOUR_OUTPUT_FOLDER_HERE] '' \
--test-size 0.05
4 changes: 2 additions & 2 deletions examples/preprocess_splitting_by_rows.sh
@@ -1,2 +1,2 @@
grants-tagger preprocess mesh data/raw/allMeSH_2021.jsonl [SET_YOUR_OUTPUT_FOLDER_HERE] '' \
--test-size 25000
grants-tagger preprocess mesh data/raw/retagging/allMeSH_2021.2016-2021.jsonl [SET_YOUR_OUTPUT_FOLDER_HERE] '' \
--test-size 25000
4 changes: 2 additions & 2 deletions examples/preprocess_splitting_by_years.sh
@@ -1,4 +1,4 @@
grants-tagger preprocess mesh data/raw/allMeSH_2021.jsonl [SET_YOUR_OUTPUT_FOLDER_HERE] '' \
grants-tagger preprocess mesh data/raw/retagging/allMeSH_2021.2016-2021.jsonl [SET_YOUR_OUTPUT_FOLDER_HERE] '' \
--test-size 25000 \
--train-years 2016,2017,2018,2019 \
--test-years 2020,2021
--test-years 2020,2021
2 changes: 1 addition & 1 deletion examples/resume_train_by_epoch.sh
@@ -34,4 +34,4 @@ grants-tagger train bertmesh \
--save_strategy epoch \
--wandb_project wellcome-mesh \
--wandb_name test-train-all \
--wandb_api_key ${WANDB_API_KEY}
--wandb_api_key ${WANDB_API_KEY}
2 changes: 1 addition & 1 deletion examples/resume_train_by_steps.sh
@@ -36,4 +36,4 @@ grants-tagger train bertmesh \
--save_steps 10000 \
--wandb_project wellcome-mesh \
--wandb_name test-train-all \
--wandb_api_key ${WANDB_API_KEY}
--wandb_api_key ${WANDB_API_KEY}
6 changes: 6 additions & 0 deletions examples/retag.sh
@@ -0,0 +1,6 @@
grants-tagger retag mesh data/raw/allMeSH_2021.jsonl [SET_YOUR_OUTPUT_FILE_HERE] \
--tags "Artificial Intelligence,HIV,Data Collection,Mathematics,Geography" \
--years 2016,2017,2018,2019,2020,2021 \
--train-examples 100 \
--batch-size 10000 \
--supervised
50 changes: 40 additions & 10 deletions grants_tagger_light/augmentation/augment.py
@@ -14,6 +14,7 @@
from grants_tagger_light.augmentation.parallel_augment_openai import (
ParallelAugmentOpenAI,
)
from grants_tagger_light.utils.years_tags_parser import parse_tags

augment_app = typer.Typer()

@@ -50,6 +51,7 @@ def augment(
prompt_template: str = "grants_tagger_light/augmentation/prompt.template",
concurrent_calls: int = os.cpu_count() * 2,
temperature: float = 1.5,
tags: list = None,
tags_file_path: str = None,
):
if model_key.strip().lower() not in ["gpt-3.5-turbo", "text-davinci", "gpt-4"]:
@@ -60,6 +62,7 @@
dset = load_from_disk(os.path.join(data_path, "dataset"))
if "train" in dset:
dset = dset["train"]

logger.info("Obtaining count values from the labels...")
pool = multiprocessing.Pool(processes=num_proc)
element_counts_list = pool.map(_count_elements_in_sublist, dset["meshMajor"])
@@ -71,16 +74,22 @@
merged_element_counts.items(), key=lambda x: x[1], reverse=True
)
sorted_merged_element_counts_dict = dict(sorted_merged_element_counts)

print(f"Tags: {tags}")
if tags is None:
tags = []
if tags_file_path is not None:
with open(tags_file_path, "r") as f:
tags = f.read().split("\n")
tags.extend([x.strip() for x in f.readlines()])
logger.info(
f"Tags file path found. Filtering {len(tags)} tags "
f"(examples found: {tags[:15]}...)"
)
sorted_merged_element_counts_dict = {
k: v for k, v in sorted_merged_element_counts_dict.items() if k in tags
}
if len(tags) > 0:
sorted_merged_element_counts_dict = {
k: v for k, v in sorted_merged_element_counts_dict.items() if k in tags
}
logger.info(f"Tags count dictionary: {sorted_merged_element_counts_dict}")

if min_examples is not None:
sorted_merged_element_counts_dict = {
@@ -89,11 +98,28 @@
if v < min_examples
}

if len(sorted_merged_element_counts_dict.keys()) < 1:
logger.error(
"I did not find any examples for your tags "
"in your preprocessed folder. Try:\n"
"- Other train/set split in `preprocess`;\n"
"- Other years;\n"
"- Other tags;"
)
exit(-1)

with open(f"{save_to_path}.count", "w") as f:
f.write(json.dumps(sorted_merged_element_counts_dict, indent=2))

tags_to_augment = list(sorted_merged_element_counts_dict.keys())

if len(tags_to_augment) < concurrent_calls:
logger.error(
"Found less tags than concurrent calls to OpenAI."
f" Overwritting `concurrent-calls` to {len(tags_to_augment)}"
)
concurrent_calls = len(tags_to_augment)

biggest_tags_to_augment = [
f"{k}({sorted_merged_element_counts_dict[k]})" for k in tags_to_augment[:5]
]
@@ -156,10 +182,8 @@ def augment(

@augment_app.command()
def augment_cli(
data_path: str = typer.Argument(..., help="Path to mesh.jsonl"),
save_to_path: str = typer.Argument(
..., help="Path to save the serialized PyArrow dataset after preprocessing"
),
data_path: str = typer.Argument(..., help="Path to folder after `preprocess`"),
save_to_path: str = typer.Argument(..., help="Path to save the new jsonl data"),
model_key: str = typer.Option(
"gpt-3.5-turbo",
help="LLM to use data augmentation. By now, only `openai` is supported",
@@ -193,6 +217,7 @@ def augment_cli(
max=2,
help="A value between 0 and 2. The bigger - the more creative.",
),
tags: str = typer.Option(None, help="Comma separated list of tags to retag"),
tags_file_path: str = typer.Option(
None,
help="Text file containing one line per tag to be considered. "
@@ -206,13 +231,17 @@
)
exit(-1)

if tags_file_path is None and min_examples is None:
if tags_file_path is None and tags is None and min_examples is None:
logger.error(
"To understand which tags need to be augmented, "
"set either --min-examples or --tags-file-path"
"set either --min-examples or --tags-file-path or --tags"
)
exit(-1)

if tags_file_path is not None and not os.path.isfile(tags_file_path):
logger.error(f"{tags_file_path} not found")
exit(-1)

if float(temperature) > 2.0 or float(temperature) < -2.0:
logger.error("Temperature should be in the range [-2, 2]")
exit(-1)
@@ -228,5 +257,6 @@
prompt_template=prompt_template,
concurrent_calls=concurrent_calls,
temperature=temperature,
tags=parse_tags(tags),
tags_file_path=tags_file_path,
)
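
The new `--tags` option reaches `augment()` through a `parse_tags` helper imported from `grants_tagger_light.utils.years_tags_parser`; its implementation is not shown in this diff. A minimal sketch consistent with how it is called here (a comma-separated string or `None` in, a list or `None` out) might look like this:

```python
# Hypothetical sketch of a parse_tags-style helper; the real
# grants_tagger_light/utils/years_tags_parser.py is not part of this diff.
from typing import Optional


def parse_tags(tags: Optional[str]) -> Optional[list]:
    """Split a comma-separated --tags value into a clean list of tag names."""
    if tags is None:
        return None
    # e.g. "Artificial Intelligence, HIV" -> ["Artificial Intelligence", "HIV"]
    return [tag.strip() for tag in tags.split(",") if tag.strip()]
```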
2 changes: 1 addition & 1 deletion grants_tagger_light/augmentation/prompt.template
@@ -9,4 +9,4 @@ ABSTRACT:
{ABSTRACT}

TOPIC:
{TOPIC}
{TOPIC}
2 changes: 2 additions & 0 deletions grants_tagger_light/cli.py
@@ -5,6 +5,7 @@
from grants_tagger_light.augmentation import augment_app
from grants_tagger_light.download_epmc import download_epmc_cli
from grants_tagger_light.evaluation import evaluate_app
from grants_tagger_light.retagging import retag_app
from grants_tagger_light.predict import predict_cli
from grants_tagger_light.preprocessing import preprocess_app
from grants_tagger_light.tune_threshold import tune_threshold_cli
@@ -18,6 +19,7 @@
app.add_typer(preprocess_app, name="preprocess")
app.add_typer(augment_app, name="augment")
app.add_typer(evaluate_app, name="evaluate")
app.add_typer(retag_app, name="retag")


app.command("predict")(predict_cli)
21 changes: 15 additions & 6 deletions grants_tagger_light/evaluation/evaluate_model.py
@@ -6,6 +6,7 @@
from typing import Optional
from transformers import pipeline
from transformers.pipelines import PIPELINE_REGISTRY
from tqdm.auto import tqdm

import scipy.sparse as sp
import typer
@@ -33,13 +34,20 @@ def evaluate_model(
model = BertMesh.from_pretrained(model_path)

label_binarizer = MultiLabelBinarizer()
label_binarizer.fit([list(model.id2label.values())])
id2labels = [0 for i in range(model.config.num_labels)]
for k, v in model.id2label.items():
id2labels[k] = v
label_binarizer.fit([id2labels])

pipe = pipeline(
"grants-tagging",
model=model,
tokenizer="Wellcome/WellcomeBertMesh",
device=0,
)
def data():
for x in X_test:
yield x

if split_data:
print(
@@ -48,13 +56,14 @@
)
_, X_test, _, Y_test = load_train_test_data(data_path, label_binarizer)
else:
X_test, Y_test, _ = load_data(data_path, label_binarizer)

Y_pred_proba = pipe(X_test, return_labels=False)

X_test, Y_test, _ = load_data(data_path, label_binarizer, model_id2labels=model.id2label)

Y_pred_proba = []
for out in tqdm(pipe(data(), return_labels=False)):
Y_pred_proba.append(out)
Y_pred_proba = torch.vstack(Y_pred_proba)

Y_pred_proba = sp.csr_matrix(Y_pred_proba)
#Y_pred_proba = sp.csr_matrix(Y_pred_proba)

if not isinstance(threshold, list):
threshold = [threshold]
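
The evaluation path now feeds texts to the Hugging Face pipeline through a generator and stacks the per-example probability tensors with `torch.vstack`, rather than passing the full list at once. A self-contained sketch of that generator-based pattern, shown with a stock text-classification pipeline instead of the custom `grants-tagging` task registered by this project:

```python
# Sketch of the generator-based pipeline pattern used above, illustrated with a
# stock text-classification pipeline rather than the custom "grants-tagging" task.
import torch
from tqdm.auto import tqdm
from transformers import pipeline

texts = [
    "Malaria vaccine efficacy in young children",
    "Deep learning methods for protein structure prediction",
]

pipe = pipeline(
    "text-classification",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    device=-1,  # CPU; use device=0 for the first GPU as in the diff above
)


def data():
    # Yielding items lets the pipeline consume inputs lazily, which avoids
    # materialising predictions for a large X_test up front.
    for text in texts:
        yield text


scores = []
for out in tqdm(pipe(data()), total=len(texts)):
    # For this stock task each `out` is a dict like {"label": ..., "score": ...};
    # the PR's custom task yields probability tensors that are then vstack-ed.
    scores.append(out["score"])

print(torch.tensor(scores))  # one score per input text
```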
13 changes: 12 additions & 1 deletion grants_tagger_light/models/xlinear/model.py
@@ -68,6 +68,13 @@ def __init__(
# Those are MeshXLinear params
self.threshold = threshold

self.model_path = None
self.xlinear_model_ = None
self.vectorizer_ = None

self.label_binarizer_path = label_binarizer_path
self.label_binarizer_ = None

if label_binarizer_path is not None:
self.load_label_binarizer(label_binarizer_path)

@@ -167,7 +174,6 @@ def predict_tags(
"""
X: list or numpy array of texts
model_path: path to trained model
label_binarizer_path: path to trained label_binarizer
probabilities: bool, default False. When true probabilities
are returned along with tags
threshold: float, default 0.5. Probability threshold to be used to assign tags.
@@ -217,6 +223,9 @@ def load(self, model_path, is_predict_only=True):
with open(params_path, "r") as f:
self.__dict__.update(json.load(f))

self.load_label_binarizer(self.label_binarizer_path)
self.model_path = model_path

if self.vectorizer_library == "sklearn":
self.vectorizer_ = load_pickle(vectorizer_path)
else:
@@ -229,6 +238,8 @@
model_path, is_predict_only=is_predict_only
)

return self

def load_label_binarizer(self, label_binarizer_path):
with open(label_binarizer_path, "rb") as f:
self.label_binarizer_ = pickle.loads(f.read())
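
Since `load()` now restores the label binarizer from the saved params and returns `self`, loading can be chained directly onto construction. A hedged usage sketch; the import path follows the file location, and everything else (model directory, default constructor arguments) is assumed:

```python
# Hypothetical usage of the updated load(); the model path and the default
# constructor arguments are assumptions, not taken from this diff.
from grants_tagger_light.models.xlinear.model import MeshXLinear

model = MeshXLinear().load("path/to/trained/xlinear_model", is_predict_only=True)

# load() now also calls load_label_binarizer(), so the binarizer is ready to use.
assert model.label_binarizer_ is not None
```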
3 changes: 1 addition & 2 deletions grants_tagger_light/preprocessing/preprocess_mesh.py
@@ -118,7 +118,6 @@ def preprocess_mesh(
num_proc=num_proc,
desc="Tokenizing",
fn_kwargs={"tokenizer": tokenizer, "x_col": "abstractText"},
load_from_cache_file=False,
)
logger.info("Time taken to tokenize: {}".format(time.time() - t1))

@@ -261,7 +260,7 @@ def preprocess_mesh_cli(
if not data_path.endswith("jsonl"):
logger.error(
"It seems your input MeSH data is not in `jsonl` format. "
"Please, run first `scripts/mesh_json_to_jsonlpy.`"
"Please, run first `scripts/mesh_json_to_jsonl.py.`"
)
exit(-1)

8 changes: 8 additions & 0 deletions grants_tagger_light/retagging/__init__.py
@@ -0,0 +1,8 @@
import typer
from .retagging import retag_cli

retag_app = typer.Typer()
retag_app.command(
"mesh",
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)(retag_cli)
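
The new Typer sub-app exposes `retag_cli` as the `mesh` command (wired into the main CLI as `grants-tagger retag mesh` in cli.py above). A hedged sketch of exercising it with Typer's test runner, using the option names from examples/retag.sh; the input and output paths are placeholders:

```python
# Hedged sketch: invoking the new "retag mesh" command through Typer's test
# runner. Option names come from examples/retag.sh; paths are placeholders.
from typer.testing import CliRunner

from grants_tagger_light.retagging import retag_app

runner = CliRunner()
result = runner.invoke(
    retag_app,
    [
        "mesh",
        "data/raw/allMeSH_2021.jsonl",
        "output/retagged",
        "--tags", "Artificial Intelligence,HIV",
        "--years", "2016,2017,2018,2019,2020,2021",
        "--train-examples", "100",
        "--batch-size", "10000",
        "--supervised",
    ],
)
print(result.exit_code)
```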