Merge pull request #6 from jonasrenault/update-v0.4.0
Update v0.4.0
jonasrenault authored Jun 13, 2024
2 parents 6f00dc4 + 92c8a48 commit 9b4e8c4
Showing 31 changed files with 3,156 additions and 116,263 deletions.
1 change: 0 additions & 1 deletion .pre-commit-config.yaml
@@ -17,7 +17,6 @@ repos:
rev: '1.8.2'
hooks:
- id: poetry-check
- id: poetry-lock

# black - formatting
- repo: https://github.com/psf/black
8 changes: 8 additions & 0 deletions .streamlit/config.toml
@@ -0,0 +1,8 @@
[server]
# Max size, in megabytes, for files uploaded with the file_uploader.
# Default: 200
maxUploadSize = 10

[theme]
# Background color used for the sidebar and most interactive widgets.
secondaryBackgroundColor = "#f0f2f699"
2 changes: 1 addition & 1 deletion LICENSE
@@ -1,4 +1,4 @@
Copyright (c) 2024 Jonas Renault
Copyright (c) 2024 Inria

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
4 changes: 2 additions & 2 deletions README.md
@@ -3,7 +3,7 @@
[![License](https://img.shields.io/badge/License-MIT-yellow)](LICENSE)
![python_version](https://img.shields.io/badge/Python-%3E=3.11-blue)

CPREx is an end to end tool for Named Entity Recognition (NER) and Relation Extraction (RE) specifically designed for chemical compounds and their properties. The goal of the tool is to identify, extract and link chemical compounds and their properties from scientific literature. For ease of use, CPREx provides a custom [spacy](https://spacy.io/) pipeline to perform NER and RE.
CPREx is an end to end tool for Named Entity Recognition (NER) and Relation Extraction (RE) specifically designed for chemical compounds and their properties. The goal of the tool is to identify, extract and link chemical compounds and their properties from scientific literature. For ease of use, CPREx provides a custom [spaCy](https://spacy.io/) pipeline to perform NER and RE.

The pipeline performs the following steps

@@ -97,7 +97,7 @@ cprex install-models

This will install a [PubmedBert model](https://ftp.ncbi.nlm.nih.gov/pub/lu/BC7-NLM-Chem-track/) finetuned on the NLM-CHEM corpus for extraction of chemical named entities. This model was finetuned by the [BioCreative VII track](https://biocreative.bioinformatics.udel.edu/tasks/biocreative-vii/track-2/).

It will also install a [RE model](https://github.com/jonasrenault/cprex/releases/tag/v0.3.0) pre-trained on our own annotated dataset.
It will also install a [RE model](https://github.com/jonasrenault/cprex/releases/tag/v0.4.0) pre-trained on our own annotated dataset.

#### Installing a base spacy model

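The README changes above describe CPREx's custom spaCy pipeline for NER and RE and bump the RE model link to v0.4.0. As a rough usage sketch, not part of this diff: the snippet below assumes the models were installed with `cprex install-models` into a local `model/` directory, and the `get_pipeline("model")` call is an assumption based on how the pre-0.4.0 corpus code built the pipeline. `parse_and_filter_pdf` is the helper introduced in `cprex/corpus/corpus.py` further down in this diff.

```python
# Hedged usage sketch (not from the diff): the model directory, the
# get_pipeline() argument, and the PDF file name are assumptions.
from pathlib import Path

from cprex.corpus.corpus import parse_and_filter_pdf
from cprex.pipeline import get_pipeline

# Build the spaCy pipeline from locally installed models (assumed ./model dir).
nlp = get_pipeline("model")

# parse_and_filter_pdf (new in this release) accepts a path or BytesIO and,
# with filter=True (the default), keeps only docs containing both property
# and quantity entities.
docs = parse_and_filter_pdf(Path("paper.pdf"), nlp, segment_sentences=False)
for doc in docs:
    print([(ent.text, ent.label_) for ent in doc.ents])
```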
2 changes: 1 addition & 1 deletion cprex/__init__.py
@@ -1,5 +1,5 @@
"""Chemical Properties Relation EXtractor"""

__version__ = "0.3.0"
__version__ = "0.4.0"

from .pipeline import get_pipeline as get_pipeline
6 changes: 3 additions & 3 deletions cprex/commands.py
@@ -15,7 +15,7 @@
from cprex.rel.parse_data import parse_label_studio_annotations

PUBMED_BERT_MODEL_URL = "https://ftp.ncbi.nlm.nih.gov/pub/lu/BC7-NLM-Chem-track/model_PubMedBERT_NLMChemBC5CDRBC7Silver.tar.gz"
REL_MODEL_URL = "https://github.com/jonasrenault/cprex/releases/download/v0.3.0/cprex-rel-model-0.3.0.tar.gz"
REL_MODEL_URL = "https://github.com/jonasrenault/cprex/releases/download/v0.4.0/cprex-rel-model-0.4.0.tar.gz"
GROBID_URL = "https://github.com/kermitt2/grobid/archive/"
GROBID_MASTER_URL = "https://github.com/kermitt2/grobid/zipball/master"

@@ -122,7 +122,7 @@ def install_models(models_directory: str) -> None:
else:
click.echo(f"Downloading REL model to {relmodel_dir}")
click.echo("This can take a while as model file is 1.2G ...")
zipped_file = Path() / "cprex-rel-model-0.3.0.tar.gz"
zipped_file = Path() / "cprex-rel-model-0.4.0.tar.gz"
download_and_extract_archive(REL_MODEL_URL, zipped_file, relmodel_dir)
click.echo(f"Downloaded REL model to {relmodel_dir}")

@@ -166,7 +166,7 @@ def install_grobid(grobid_directory: str, version: str = "0.8.0") -> None:
"--single-branch",
"--branch",
"chemical-units",
"git@github.com:jonasrenault/grobid-quantities.git",
"https://github.com/jonasrenault/grobid-quantities.git",
],
cwd=grobid_directory,
)
163 changes: 80 additions & 83 deletions cprex/corpus/corpus.py
@@ -1,5 +1,8 @@
import json
import logging
import traceback
from dataclasses import dataclass
from io import BytesIO
from pathlib import Path
from typing import Any

@@ -13,39 +16,18 @@
parse_article_metadata,
)
from cprex.ner.chem_ner import ner_article
from cprex.ner.quantities import PROPERTY_TO_UNITS
from cprex.parser.pdf_parser import parse_pdf_to_dict
from cprex.pipeline import get_pipeline

INTERESTING_UNITS = [
"TEMPERATURE",
"DENSITY",
"ENTHALPY",
"SOLUBILITY",
"MOLAR VOLUME",
"ABSORPTIVITY",
"ENERGY",
"VELOCITY",
"PRESSURE",
"HEAT CAPACITY",
"DYNAMIC VISCOSITY",
"THERMAL CONDUCTIVITY",
]

PROPERTY_TO_UNITS: dict[str, list[str]] = {
"enthalpy": ["ENTHALPY"],
"energy": ["ENERGY", "ENTHALPY"],
"absorptivity": ["ABSORPTIVITY"],
"heat capacity": ["HEAT CAPACITY"],
"temperature": ["TEMPERATURE"],
"pressure": ["PRESSURE"],
"density": ["SOLUBILITY", "DENSITY"],
"viscosity": ["DYNAMIC VISCOSITY"],
"velocity": ["VELOCITY"],
"toxicity": [],
"thermal": ["TIME", "TEMPERATURE"],
"formula weight": [],
"sensibility": [],
}

logger = logging.getLogger(__name__)


@dataclass
class ParsedPaper:
title: str
doi: str
id: str
docs: list[Doc]


def prop_matches_quantity(doc: Doc) -> bool:
@@ -66,7 +48,8 @@ def prop_matches_quantity(doc: Doc) -> bool:

for property, units in PROPERTY_TO_UNITS.items():
if property in prop_types and (
len(units) == 0 or any([unit in quantity_types for unit in units])
(len(units) == 0 and len(quantity_types) > 0)
or any([unit in quantity_types for unit in units])
):
return True

@@ -87,7 +70,9 @@ def filter_doc(doc: Doc) -> bool:
return prop_matches_quantity(doc)


def parse_and_filter(pdf: Path, nlp: Language, segment_sentences=False) -> list[Doc]:
def parse_and_filter_pdf(
pdf: str | Path | BytesIO, nlp: Language, segment_sentences=False, filter: bool = True
) -> list[Doc]:
"""
Parse the pdf file with the nlp pipeline and filter the resulting docs
to only keep those interesting (with property and quantity entities).
@@ -97,17 +82,19 @@ def parse_and_filter(pdf: Path, nlp: Language, segment_sentences=False) -> list[
nlp (Language): the spacy nlp pipeline
segment_sentences (bool, optional): whether to segment sentences
during parsing. Defaults to False.
filter (bool, optional): whether to filter docs. Defaults to True.
Returns:
list[Doc]: the filtered docs
"""
article = parse_pdf_to_dict(str(pdf), segment_sentences=segment_sentences)
article = parse_pdf_to_dict(pdf, segment_sentences=segment_sentences)
docs = ner_article(article, nlp)
filtered_docs = [doc for doc in docs if filter_doc(doc)]
return filtered_docs
if filter:
docs = [doc for doc in docs if filter_doc(doc)]
return docs


def save_docs(docs: list[Doc], save_file: Path):
def save_docs(docs: list[Doc], save_file: Path, save_trf_data: bool = False):
"""
Save a list of docs to disk.
@@ -117,24 +104,33 @@ def save_docs(docs: list[Doc], save_file: Path):
"""
doc_bin = DocBin(store_user_data=True)
for doc in docs:
if not save_trf_data and Doc.has_extension("trf_data"):
doc._.trf_data = None
doc_bin.add(doc)

doc_bin.to_disk(save_file)


def load_docs(save_file: Path, nlp: Language) -> list[Doc]:
def load_docs(save_file: Path, nlp: Language, set_doi: bool = False) -> list[Doc]:
"""
Load a list of docs from disk.
Args:
save_file (Path): the file to load from
nlp (Language): the nlp pipeline used to create the docs
set_doi (bool, optional): update doc doi from filename. Defaults to False.
Returns:
list[Doc]: the list of docs loaded from the file
"""
doc_bin = DocBin().from_disk(save_file)
docs = list(doc_bin.get_docs(nlp.vocab))

# set doi for docs if not present
if set_doi:
doi = save_file.stem.replace("_", "/")
for doc in docs:
doc._.doi = doi
return docs


@@ -152,76 +148,77 @@ def crawl_chemrxiv_papers(dump_file: Path, query: str):
dump = []
count = 0

print("Starting to crawl chemRxiv API.")
logger.info("Starting to crawl chemRxiv API.")
for paper in tqdm(api.query_generator(f"items?term={query}")):
dump.append(parse_article_metadata(paper["item"]))
count += 1

print(f"Crawl finished. Dumping results to {dump_file.name}")
logger.info(f"Crawl finished. Dumping results to {dump_file.name}")
with open(dump_file, "w") as f:
for paper in dump:
f.write(json.dumps(paper) + "\n")


def create_corpus_from_metadata_file(
def parse_papers(
metadata_file: Path,
download_dir: Path,
nlp: Language,
limit: int = 1000,
force: bool = False,
) -> list[Doc]:
save_parsed_docs: bool = False,
) -> list[ParsedPaper]:
"""
Create a list of filtered docs from a metadata_file containing
a list of article metadata crawled from chemrxiv.
Given a metadata_file containing a list of paper metadata (title, doi, pdf_url),
download the PDFs and process them with the given nlp pipeline.
Args:
metadata_file (Path): the path to the metadata file
download_dir (Path): the dir to save the pdf files to
limit (int, optional): the maximum number of articles to parse. Defaults to 1000.
force (bool, optional): if True, start over from the start of the metadata
file. Defaults to False.
metadata_file (Path): metadata_file with list of papers to process.
download_dir (Path): directory where PDF files are saved.
nlp (Language): the spacy pipeline used to process papers
limit (int, optional): limit of papers to process. Defaults to 1000.
force (bool, optional): if true, process all papers, otherwise process
only new papers. Defaults to False.
save_parsed_docs (bool, optional): if true, save parsed docs to disk.
Defaults to False.
Returns:
list[Doc]: the filtered docs
list[ParsedPaper]: list of ParsedPaper
"""
# get list of articles from metadata file
print("Reading paper metadata")
# get list of papers from metadata file
logger.info("Reading paper metadata ...")
with open(metadata_file, "r") as f:
papers = [json.loads(line) for line in tqdm(f.readlines())]
papers = [json.loads(line) for line in f.readlines()]
if force:
for paper in papers:
if "processed" in paper:
del paper["processed"]

# create ner pipeline
model_dir = Path() / "model"
nlp = get_pipeline(str(model_dir))

# process articles
count = 0
corpus_docs = []
try:
for paper in tqdm(papers):
if "pdf" in paper and "processed" not in paper:
pdf_file = download_dir / f"{paper['doi'].replace('/', '_')}.pdf"
if not pdf_file.exists():
download_pdf_for_paper(paper["pdf"], pdf_file)

print(f"Processing {pdf_file.name}")
docs = parse_and_filter(pdf_file, nlp, segment_sentences=False)
corpus_docs.extend(docs)
paper["processed"] = True
count += 1
if count > limit:
break
except Exception as e:
print(e)
traceback.print_exc()
finally:
print("Done processing. Writing output.")
with open(metadata_file, "w") as f:
for paper in papers:
f.write(json.dumps(paper) + "\n")
return corpus_docs
output: list[ParsedPaper] = []
logger.info(f"Processing papers (max {limit}) ...")
for paper in tqdm([p for p in papers if "pdf" in p and "processed" not in p][:limit]):
try:
pdf_file = download_dir / f"{paper['doi'].replace('/', '_')}.pdf"
if not pdf_file.exists():
download_pdf_for_paper(paper["pdf"], pdf_file)

docs = parse_and_filter_pdf(pdf_file, nlp, segment_sentences=False)
output.append(ParsedPaper(paper["title"], paper["doi"], paper["id"], docs))

if save_parsed_docs and docs:
save_docs(docs, download_dir / f"{paper['doi'].replace('/', '_')}.spacy")
except Exception as e:
logger.error(e)
traceback.print_exc()
finally:
paper["processed"] = True

logger.info("Done processing. Writing output.")
with open(metadata_file, "w") as f:
for paper in papers:
f.write(json.dumps(paper) + "\n")

return output


def export_doc_to_label_studio(doc: Doc) -> dict[str, Any]:
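The corpus.py changes above replace `create_corpus_from_metadata_file` with `parse_papers`, which takes the spaCy pipeline as an argument and returns `ParsedPaper` objects (title, doi, id, docs) instead of bare docs, and add `save_trf_data` / `set_doi` options to `save_docs` / `load_docs`. Below is a hedged end-to-end sketch of the new workflow; the file names, query string, and model directory are hypothetical, not taken from the diff.

```python
# Illustrative sketch of the revised corpus workflow; file names, the query
# string, and the model directory are hypothetical placeholders.
from pathlib import Path

from cprex.corpus.corpus import crawl_chemrxiv_papers, load_docs, parse_papers
from cprex.pipeline import get_pipeline

metadata_file = Path("chemrxiv_papers.jsonl")  # hypothetical dump file
download_dir = Path("pdfs")                    # hypothetical PDF directory
download_dir.mkdir(exist_ok=True)

# 1. Crawl chemRxiv metadata matching a query into a JSON-lines dump file.
crawl_chemrxiv_papers(metadata_file, "enthalpy")

# 2. Download and process papers with the spaCy pipeline, which is now
#    passed in explicitly instead of being built inside the function.
nlp = get_pipeline("model")  # model directory is an assumption
papers = parse_papers(metadata_file, download_dir, nlp, limit=10,
                      save_parsed_docs=True)
for paper in papers:
    print(paper.doi, paper.title, len(paper.docs))

# 3. Later, reload saved docs; set_doi=True restores the DOI from the
#    file name (underscores converted back to slashes).
saved = download_dir / "10.26434_chemrxiv-XXXXX.spacy"  # hypothetical name
docs = load_docs(saved, nlp, set_doi=True)
```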
