Merge pull request #6 from jonasrenault/update-v0.4.0
Update v0.4.0
jonasrenault authored Jun 13, 2024
2 parents 6f00dc4 + 92c8a48 commit 9b4e8c4
Showing 31 changed files with 3,156 additions and 116,263 deletions.
1 change: 0 additions & 1 deletion .pre-commit-config.yaml
@@ -17,7 +17,6 @@ repos:
rev: '1.8.2'
hooks:
- id: poetry-check
- id: poetry-lock

# black - formatting
- repo: https://github.com/psf/black
8 changes: 8 additions & 0 deletions .streamlit/config.toml
@@ -0,0 +1,8 @@
[server]
# Max size, in megabytes, for files uploaded with the file_uploader.
# Default: 200
maxUploadSize = 10

[theme]
# Background color used for the sidebar and most interactive widgets.
secondaryBackgroundColor = "#f0f2f699"
2 changes: 1 addition & 1 deletion LICENSE
@@ -1,4 +1,4 @@
Copyright (c) 2024 Jonas Renault
Copyright (c) 2024 Inria

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
4 changes: 2 additions & 2 deletions README.md
@@ -3,7 +3,7 @@
[![License](https://img.shields.io/badge/License-MIT-yellow)](LICENSE)
![python_version](https://img.shields.io/badge/Python-%3E=3.11-blue)

CPREx is an end to end tool for Named Entity Recognition (NER) and Relation Extraction (RE) specifically designed for chemical compounds and their properties. The goal of the tool is to identify, extract and link chemical compounds and their properties from scientific literature. For ease of use, CPREx provides a custom [spacy](https://spacy.io/) pipeline to perform NER and RE.
CPREx is an end to end tool for Named Entity Recognition (NER) and Relation Extraction (RE) specifically designed for chemical compounds and their properties. The goal of the tool is to identify, extract and link chemical compounds and their properties from scientific literature. For ease of use, CPREx provides a custom [spaCy](https://spacy.io/) pipeline to perform NER and RE.

The pipeline performs the following steps

@@ -97,7 +97,7 @@ cprex install-models

This will install a [PubmedBert model](https://ftp.ncbi.nlm.nih.gov/pub/lu/BC7-NLM-Chem-track/) finetuned on the NLM-CHEM corpus for extraction of chemical named entities. This model was finetuned by the [BioCreative VII track](https://biocreative.bioinformatics.udel.edu/tasks/biocreative-vii/track-2/).

It will also install a [RE model](https://github.com/jonasrenault/cprex/releases/tag/v0.3.0) pre-trained on our own annotated dataset.
It will also install a [RE model](https://github.com/jonasrenault/cprex/releases/tag/v0.4.0) pre-trained on our own annotated dataset.

#### Installing a base spacy model

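The README changes above describe CPREx's custom spaCy pipeline for NER and RE and bump the RE model link to v0.4.0. As a rough usage sketch, not part of this diff: the snippet below assumes the models were installed with `cprex install-models` into a local `model/` directory, and the `get_pipeline("model")` call is an assumption based on how the pre-0.4.0 corpus code built the pipeline. `parse_and_filter_pdf` is the helper introduced in `cprex/corpus/corpus.py` further down in this diff.

```python
# Hedged usage sketch (not from the diff): the model directory, the
# get_pipeline() argument, and the PDF file name are assumptions.
from pathlib import Path

from cprex.corpus.corpus import parse_and_filter_pdf
from cprex.pipeline import get_pipeline

# Build the spaCy pipeline from locally installed models (assumed ./model dir).
nlp = get_pipeline("model")

# parse_and_filter_pdf (new in this release) accepts a path or BytesIO and,
# with filter=True (the default), keeps only docs containing both property
# and quantity entities.
docs = parse_and_filter_pdf(Path("paper.pdf"), nlp, segment_sentences=False)
for doc in docs:
    print([(ent.text, ent.label_) for ent in doc.ents])
```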
2 changes: 1 addition & 1 deletion cprex/__init__.py
@@ -1,5 +1,5 @@
"""Chemical Properties Relation EXtractor"""

__version__ = "0.3.0"
__version__ = "0.4.0"

from .pipeline import get_pipeline as get_pipeline
6 changes: 3 additions & 3 deletions cprex/commands.py
@@ -15,7 +15,7 @@
from cprex.rel.parse_data import parse_label_studio_annotations

PUBMED_BERT_MODEL_URL = "https://ftp.ncbi.nlm.nih.gov/pub/lu/BC7-NLM-Chem-track/model_PubMedBERT_NLMChemBC5CDRBC7Silver.tar.gz"
REL_MODEL_URL = "https://github.com/jonasrenault/cprex/releases/download/v0.3.0/cprex-rel-model-0.3.0.tar.gz"
REL_MODEL_URL = "https://github.com/jonasrenault/cprex/releases/download/v0.4.0/cprex-rel-model-0.4.0.tar.gz"
GROBID_URL = "https://github.com/kermitt2/grobid/archive/"
GROBID_MASTER_URL = "https://github.com/kermitt2/grobid/zipball/master"

@@ -122,7 +122,7 @@ def install_models(models_directory: str) -> None:
else:
click.echo(f"Downloading REL model to {relmodel_dir}")
click.echo("This can take a while as model file is 1.2G ...")
zipped_file = Path() / "cprex-rel-model-0.3.0.tar.gz"
zipped_file = Path() / "cprex-rel-model-0.4.0.tar.gz"
download_and_extract_archive(REL_MODEL_URL, zipped_file, relmodel_dir)
click.echo(f"Downloaded REL model to {relmodel_dir}")

@@ -166,7 +166,7 @@ def install_grobid(grobid_directory: str, version: str = "0.8.0") -> None:
"--single-branch",
"--branch",
"chemical-units",
"git@github.com:jonasrenault/grobid-quantities.git",
"https://github.com/jonasrenault/grobid-quantities.git",
],
cwd=grobid_directory,
)
163 changes: 80 additions & 83 deletions cprex/corpus/corpus.py
@@ -1,5 +1,8 @@
import json
import logging
import traceback
from dataclasses import dataclass
from io import BytesIO
from pathlib import Path
from typing import Any

@@ -13,39 +16,18 @@
parse_article_metadata,
)
from cprex.ner.chem_ner import ner_article
from cprex.ner.quantities import PROPERTY_TO_UNITS
from cprex.parser.pdf_parser import parse_pdf_to_dict
from cprex.pipeline import get_pipeline

INTERESTING_UNITS = [
"TEMPERATURE",
"DENSITY",
"ENTHALPY",
"SOLUBILITY",
"MOLAR VOLUME",
"ABSORPTIVITY",
"ENERGY",
"VELOCITY",
"PRESSURE",
"HEAT CAPACITY",
"DYNAMIC VISCOSITY",
"THERMAL CONDUCTIVITY",
]

PROPERTY_TO_UNITS: dict[str, list[str]] = {
"enthalpy": ["ENTHALPY"],
"energy": ["ENERGY", "ENTHALPY"],
"absorptivity": ["ABSORPTIVITY"],
"heat capacity": ["HEAT CAPACITY"],
"temperature": ["TEMPERATURE"],
"pressure": ["PRESSURE"],
"density": ["SOLUBILITY", "DENSITY"],
"viscosity": ["DYNAMIC VISCOSITY"],
"velocity": ["VELOCITY"],
"toxicity": [],
"thermal": ["TIME", "TEMPERATURE"],
"formula weight": [],
"sensibility": [],
}

logger = logging.getLogger(__name__)


@dataclass
class ParsedPaper:
title: str
doi: str
id: str
docs: list[Doc]


def prop_matches_quantity(doc: Doc) -> bool:
@@ -66,7 +48,8 @@ def prop_matches_quantity(doc: Doc) -> bool:

for property, units in PROPERTY_TO_UNITS.items():
if property in prop_types and (
len(units) == 0 or any([unit in quantity_types for unit in units])
(len(units) == 0 and len(quantity_types) > 0)
or any([unit in quantity_types for unit in units])
):
return True

@@ -87,7 +70,9 @@ def filter_doc(doc: Doc) -> bool:
return prop_matches_quantity(doc)


def parse_and_filter(pdf: Path, nlp: Language, segment_sentences=False) -> list[Doc]:
def parse_and_filter_pdf(
pdf: str | Path | BytesIO, nlp: Language, segment_sentences=False, filter: bool = True
) -> list[Doc]:
"""
Parse the pdf file with the nlp pipeline and filter the resulting docs
to only keep those interesting (with property and quantity entities).
@@ -97,17 +82,19 @@ def parse_and_filter(pdf: Path, nlp: Language, segment_sentences=False) -> list[
nlp (Language): the spacy nlp pipeline
segment_sentences (bool, optional): whether to segment sentences
during parsing. Defaults to False.
filter (bool, optional): whether to filter docs. Defaults to True.
Returns:
list[Doc]: the filtered docs
"""
article = parse_pdf_to_dict(str(pdf), segment_sentences=segment_sentences)
article = parse_pdf_to_dict(pdf, segment_sentences=segment_sentences)
docs = ner_article(article, nlp)
filtered_docs = [doc for doc in docs if filter_doc(doc)]
return filtered_docs
if filter:
docs = [doc for doc in docs if filter_doc(doc)]
return docs


def save_docs(docs: list[Doc], save_file: Path):
def save_docs(docs: list[Doc], save_file: Path, save_trf_data: bool = False):
"""
Save a list of docs to disk.
@@ -117,24 +104,33 @@ def save_docs(docs: list[Doc], save_file: Path):
"""
doc_bin = DocBin(store_user_data=True)
for doc in docs:
if not save_trf_data and Doc.has_extension("trf_data"):
doc._.trf_data = None
doc_bin.add(doc)

doc_bin.to_disk(save_file)


def load_docs(save_file: Path, nlp: Language) -> list[Doc]:
def load_docs(save_file: Path, nlp: Language, set_doi: bool = False) -> list[Doc]:
"""
Load a list of docs from disk.
Args:
save_file (Path): the file to load from
nlp (Language): the nlp pipeline used to create the docs
set_doi (bool, optional): update doc doi from filename. Defaults to False.
Returns:
list[Doc]: the list of docs loaded from the file
"""
doc_bin = DocBin().from_disk(save_file)
docs = list(doc_bin.get_docs(nlp.vocab))

# set doi for docs if not present
if set_doi:
doi = save_file.stem.replace("_", "/")
for doc in docs:
doc._.doi = doi
return docs


@@ -152,76 +148,77 @@ def crawl_chemrxiv_papers(dump_file: Path, query: str):
dump = []
count = 0

print("Starting to crawl chemRxiv API.")
logger.info("Starting to crawl chemRxiv API.")
for paper in tqdm(api.query_generator(f"items?term={query}")):
dump.append(parse_article_metadata(paper["item"]))
count += 1

print(f"Crawl finished. Dumping results to {dump_file.name}")
logger.info(f"Crawl finished. Dumping results to {dump_file.name}")
with open(dump_file, "w") as f:
for paper in dump:
f.write(json.dumps(paper) + "\n")


def create_corpus_from_metadata_file(
def parse_papers(
metadata_file: Path,
download_dir: Path,
nlp: Language,
limit: int = 1000,
force: bool = False,
) -> list[Doc]:
save_parsed_docs: bool = False,
) -> list[ParsedPaper]:
"""
Create a list of filtered docs from a metadata_file containing
a list of article metadata crawled from chemrxiv.
Given a metadata_file containing a list of paper metadata (title, doi, pdf_url),
download the PDFs and process them with the given nlp pipeline.
Args:
metadata_file (Path): the path to the metadata file
download_dir (Path): the dir to save the pdf files to
limit (int, optional): the maximum number of articles to parse. Defaults to 1000.
force (bool, optional): if True, start over from the start of the metadata
file. Defaults to False.
metadata_file (Path): metadata_file with list of papers to process.
download_dir (Path): directory where PDF files are saved.
nlp (Language): the spacy pipeline used to process papers
limit (int, optional): limit of papers to process. Defaults to 1000.
force (bool, optional): if true, process all papers, otherwise process
only new papers. Defaults to False.
save_parsed_docs (bool, optional): if true, save parsed docs to disk.
Defaults to False.
Returns:
list[Doc]: the filtered docs
list[ParsedPaper]: list of ParsedPaper
"""
# get list of articles from metadata file
print("Reading paper metadata")
# get list of papers from metadata file
logger.info("Reading paper metadata ...")
with open(metadata_file, "r") as f:
papers = [json.loads(line) for line in tqdm(f.readlines())]
papers = [json.loads(line) for line in f.readlines()]
if force:
for paper in papers:
if "processed" in paper:
del paper["processed"]

# create ner pipeline
model_dir = Path() / "model"
nlp = get_pipeline(str(model_dir))

# process articles
count = 0
corpus_docs = []
try:
for paper in tqdm(papers):
if "pdf" in paper and "processed" not in paper:
pdf_file = download_dir / f"{paper['doi'].replace('/', '_')}.pdf"
if not pdf_file.exists():
download_pdf_for_paper(paper["pdf"], pdf_file)

print(f"Processing {pdf_file.name}")
docs = parse_and_filter(pdf_file, nlp, segment_sentences=False)
corpus_docs.extend(docs)
paper["processed"] = True
count += 1
if count > limit:
break
except Exception as e:
print(e)
traceback.print_exc()
finally:
print("Done processing. Writing output.")
with open(metadata_file, "w") as f:
for paper in papers:
f.write(json.dumps(paper) + "\n")
return corpus_docs
output: list[ParsedPaper] = []
logger.info(f"Processing papers (max {limit}) ...")
for paper in tqdm([p for p in papers if "pdf" in p and "processed" not in p][:limit]):
try:
pdf_file = download_dir / f"{paper['doi'].replace('/', '_')}.pdf"
if not pdf_file.exists():
download_pdf_for_paper(paper["pdf"], pdf_file)

docs = parse_and_filter_pdf(pdf_file, nlp, segment_sentences=False)
output.append(ParsedPaper(paper["title"], paper["doi"], paper["id"], docs))

if save_parsed_docs and docs:
save_docs(docs, download_dir / f"{paper['doi'].replace('/', '_')}.spacy")
except Exception as e:
logger.error(e)
traceback.print_exc()
finally:
paper["processed"] = True

logger.info("Done processing. Writing output.")
with open(metadata_file, "w") as f:
for paper in papers:
f.write(json.dumps(paper) + "\n")

return output


def export_doc_to_label_studio(doc: Doc) -> dict[str, Any]:
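The corpus.py changes above replace `create_corpus_from_metadata_file` with `parse_papers`, which takes the spaCy pipeline as an argument and returns `ParsedPaper` objects (title, doi, id, docs) instead of bare docs, and add `save_trf_data` / `set_doi` options to `save_docs` / `load_docs`. Below is a hedged end-to-end sketch of the new workflow; the file names, query string, and model directory are hypothetical, not taken from the diff.

```python
# Illustrative sketch of the revised corpus workflow; file names, the query
# string, and the model directory are hypothetical placeholders.
from pathlib import Path

from cprex.corpus.corpus import crawl_chemrxiv_papers, load_docs, parse_papers
from cprex.pipeline import get_pipeline

metadata_file = Path("chemrxiv_papers.jsonl")  # hypothetical dump file
download_dir = Path("pdfs")                    # hypothetical PDF directory
download_dir.mkdir(exist_ok=True)

# 1. Crawl chemRxiv metadata matching a query into a JSON-lines dump file.
crawl_chemrxiv_papers(metadata_file, "enthalpy")

# 2. Download and process papers with the spaCy pipeline, which is now
#    passed in explicitly instead of being built inside the function.
nlp = get_pipeline("model")  # model directory is an assumption
papers = parse_papers(metadata_file, download_dir, nlp, limit=10,
                      save_parsed_docs=True)
for paper in papers:
    print(paper.doi, paper.title, len(paper.docs))

# 3. Later, reload saved docs; set_doi=True restores the DOI from the
#    file name (underscores converted back to slashes).
saved = download_dir / "10.26434_chemrxiv-XXXXX.spacy"  # hypothetical name
docs = load_docs(saved, nlp, set_doi=True)
```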
