diff --git a/README.md b/README.md index fbcea2e..bf418db 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,6 @@ # PubMed ID (PMID) Cite [![Tweet](https://img.shields.io/twitter/url/http/shields.io.svg?style=social)](https://twitter.com/intent/tweet?text=Python%20library%20to%20download%20pubmed%20citation%20counts%20and%20data,%20given%20a%20PMID&url=https://github.com/dvklopfenstein/pmidcite&via=dvklopfenstein&hashtags=pubmed,pmid,citations,pubmed2cite,writingtips,scientificwriting) -[![build](https://github.com/dvklopfenstein/pmidcite/actions/workflows/build.yml/badge.svg)](https://github.com/dvklopfenstein/pmidcite/actions/workflows/build.yml) [![CodeQL](https://github.com/dvklopfenstein/pmidcite/actions/workflows/codeql-analysis.yml/badge.svg)](https://github.com/dvklopfenstein/pmidcite/actions/workflows/codeql-analysis.yml) [![Latest PyPI version](https://img.shields.io/pypi/v/pmidcite.svg)](https://pypi.org/project/pmidcite/) [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.5172712.svg)](https://doi.org/10.5281/zenodo.5172712) diff --git a/makefile b/makefile index adb5590..585acf7 100644 --- a/makefile +++ b/makefile @@ -18,11 +18,6 @@ p: d: find src -regextype posix-extended -regex "[a-z./]*" -type d -g: - git status -uno - git remote -v - git branch - cli: find src/pmidcite/cli -name \*.py @@ -129,4 +124,4 @@ clobber_tmp: rm -rf ./src/tests/icite clobber: - make -f makefile clobber_tmp clean_build + make -f makefile clobber_tmp clean_build pyc diff --git a/setup.py b/setup.py index 395dba4..3721bf3 100755 --- a/setup.py +++ b/setup.py @@ -39,6 +39,10 @@ def get_long_description(): with open(join(dir_cur, 'README.md'), 'rb') as ifstrm: return ifstrm.read().decode("UTF-8") +CONSOLE_SCRIPTS = [ + 'icite=pmidcite.scripts.icite:main', + 'sumpaps=pmidcite.scripts.summarize_papers:main', +] setup( name=NAME, @@ -49,10 +53,7 @@ def get_long_description(): package_dir=PACKAGE_DIRS, scripts=glob('src/bin/*.py'), entry_points={ - 'console_scripts':[ - 'icite=pmidcite.scripts.icite:main', - 'sumpaps=pmidcite.scripts.summarize_papers:main', - ], + 'console_scripts': CONSOLE_SCRIPTS, }, # https://pypi.org/classifiers/ classifiers=[ diff --git a/src/pmidcite/cfg.py b/src/pmidcite/cfg.py index d25ef73..8a44200 100644 --- a/src/pmidcite/cfg.py +++ b/src/pmidcite/cfg.py @@ -185,13 +185,14 @@ def _chk_apikey(self, loaded): """Check to see that user has added a NCBI API key""" try: int(loaded['apikey'], 16) - except ValueError: - msg = ('SET API KEY IN {CFG}\n' + except ValueError as exc: + msg = (f'SET API KEY IN {self.cfgfile}\n' 'Get an NCBI API key to run the E-utilities:\n' 'https://ncbiinsights.ncbi.nlm.nih.gov/2017/11/02/' 'new-api-keys-for-the-e-utilities\n' - 'To ensure your API key is not made public, add {CFG} to the .gitignore') - raise RuntimeError(msg.format(CFG=self.cfgfile)) + 'To ensure your API key is not made public, ' + f'add {self.cfgfile} to the .gitignore') + raise RuntimeError(msg) from exc def _err_notfound(self): """Report the config file was not found""" diff --git a/src/pmidcite/eutils/cmds/base.py b/src/pmidcite/eutils/cmds/base.py index 65d75b4..ac487f7 100755 --- a/src/pmidcite/eutils/cmds/base.py +++ b/src/pmidcite/eutils/cmds/base.py @@ -211,39 +211,16 @@ def epost(self, database, ids, num_ids_p_epost=10): ret['querykey'] = rsp['querykey'] return ret - @staticmethod - def _return_einforesult(record): - """Return EInfo result""" - einforesult = record['einforesult'] - cmdtype = record['header']['type'] - if 'dblist' in einforesult: - return einforesult['dblist'] - if cmdtype == 'einfo' and 'dbinfo' in einforesult: - assert len(record['einforesult']['dbinfo']) == 1 - ## print('RRRRRRRRRRRRRRR', record.keys()) - ## print('RRRRRRRRRRRRRRR', len(record['einforesult']['dbinfo'])) - ## print('RRRRRRRRRRRRRRR', record) - return record['einforesult']['dbinfo'][0] - raise RuntimeError('IMPLEMENT _return_einforesult') - - @staticmethod - def _return_linksets(record): - """Return ELink result""" - links_all = [] - for dct0 in record['linksets']: - ## print('DCT', dct0) - if 'linksetdbs' in dct0: - for dct1 in dct0['linksetdbs']: - links_all.extend(dct1['links']) - print('{N} LINKED ITEMS'.format(N=len(links_all))) - return links_all - # ------------------------------------------------------------------------------------ def run_eutilscmd(self, cmd, **params): # params=None, post=None, ecitmatch=False): """Run NCBI E-Utilities command""" # params example: db retstart retmax rettype retmode webenv query_key + # print('RUN NCBI EUTILS CMD', cmd) rsp_dct = self.run_req(cmd, **params) # post=None, ecitmatch=False): - ## print('RRRRRRRRRRRRRRRRRRRRRRR', rsp_dct) + # print('RRRRRRRRRRRRRRRRRRRRRRR', rsp_dct.keys()) + # dict_keys(['code', 'msg', 'url', 'headers', 'data']) + # print('RRRRRRRRRRRRRRRRRRRRRRR', rsp_dct['data']) + # print('RRRRRRRRRRRRRRRRRRRRRRR', rsp_dct) if rsp_dct is not None: return self._extract_rsp(rsp_dct['data'], params.get('retmode')) return None @@ -251,8 +228,8 @@ def run_eutilscmd(self, cmd, **params): # params=None, post=None, ecitmatch=Fal def _mk_cgi(self, cmd, **params): """Get Fast Common Gateway Interface (fcgi) string, given E-utils command/parameters""" cgi = self.cgifmt.format(ECMD=cmd) + ##print('PARAMS', params) params = self._construct_params(params) - ## print('PARAMS', params) options = self._encode_options(params) cgi += '?' + options return cgi @@ -333,29 +310,19 @@ def _extract_rsp(self, record, retmode): """Extract the data from a response from running a Entrez Utilities command""" if retmode == 'json': try: - dct = json.loads(record) - if 'esearchresult' in dct: - return dct['esearchresult'] - if 'einforesult' in dct: - return self._return_einforesult(dct) - if 'linksets' in dct: - return self._return_linksets(dct) - print('KEYS:', dct.keys()) - print('DCT:', dct) - raise RuntimeError('UNKNOWN RESULT in _run_req') + return json.loads(record) except json.decoder.JSONDecodeError as errobj: print('JSONDecodeError = {ERR}'.format(ERR=str(errobj))) traceback.print_exc() print('\n**FATAL JSONDecodeError:\n{RECORD}'.format(RECORD=record.decode('utf-8'))) - if retmode == 'text': + if retmode in {'text', 'asn.1'}: ## print('RECORD:', str(record)) return record.decode('utf-8') ## print('RETMODE', retmode) ## print('RECORD', record) - ## print(record) # # # Parse XML root = ElementTree.fromstring(record) - ## print('root.tag', root.tag) - assert root.tag in 'ePostResult', root.tag + #print(f'ElementTree.fromstring(record).root:\n{root}') + return root + # TODO + #print('root.tag', root.tag) + assert root.tag in 'ePostResult', f'ElementTree.fromstring(record).tag: {root.tag}' dct = {r.tag.lower():r.text for r in root} if 'querykey' in dct: dct['querykey'] = int(dct['querykey']) diff --git a/src/pmidcite/eutils/cmds/cmdbase.py b/src/pmidcite/eutils/cmds/cmdbase.py deleted file mode 100644 index 41ad9a6..0000000 --- a/src/pmidcite/eutils/cmds/cmdbase.py +++ /dev/null @@ -1,26 +0,0 @@ -"""Fetch items and write""" - -__author__ = 'DV Klopfenstein, PhD' -__copyright__ = "Copyright (C) 2016-present DV Klopfenstein, PhD. All rights reserved." -__license__ = "GPL" - -from pmidcite.cfg import Cfg -from pmidcite.eutils.cmds.base import EntrezUtilities - - -#### class EntrezCommands(EntrezUtilities): -class CommandBase(EntrezUtilities): - """Fetch and write text""" - - def __init__(self, retmax=10000, rettype='medline', retmode='text', batch_size=100, **kws): - kws_base = {k:v for k, v in kws.items() if k in EntrezUtilities.exp_kws} - cfg = Cfg() - super(CommandBase, self).__init__( - cfg.get_email(), cfg.get_apikey(), cfg.get_tool(), **kws_base) - self.batch_size = batch_size - self.retmax = retmax - self.rettype = rettype - self.retmode = retmode - - -# Copyright (C) 2016-present DV Klopfenstein, PhD. All rights reserved. diff --git a/src/pmidcite/eutils/cmds/efetch.py b/src/pmidcite/eutils/cmds/efetch.py deleted file mode 100644 index e895785..0000000 --- a/src/pmidcite/eutils/cmds/efetch.py +++ /dev/null @@ -1,76 +0,0 @@ -"""Fetch items and write""" -# https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch - -__author__ = 'DV Klopfenstein, PhD' -__copyright__ = "Copyright (C) 2016-present DV Klopfenstein, PhD. All rights reserved." -__license__ = "GNU AGPLv3" - -import sys -import re -from pmidcite.eutils.cmds.cmdbase import CommandBase - - -class EFetch(CommandBase): - """Fetch and write text""" - - # pylint: disable=too-many-arguments - #### def __init__(self, retmax=10000, rettype='medline', retmode='text', batch_size=100, **kws): - def __init__(self, rettype='medline', retmode='text', batch_size=100, **kws): - kws_base = {k:v for k, v in kws.items() if k in CommandBase.exp_kws} - ##print('FFFFFFFFFFFFFFFFFFFF', kws_base) - super(EFetch, self).__init__(**kws_base) - - def efetch_and_write(self, ostrm, database, webenv, querykey, num_fetches): - """EFetch records found for PMIIDs, page by page""" - ## QueryKey( 1) EFetching(database=pubmed) up to 10 records, starting at 0; ABSTRACT - ## QueryKey( 1) EFetching(database=pubmed) up to 10 records, starting at 10; ABSTRACT - ## msg_fmt = (' QueryKey({:>6}) EFetching(database={}) up to {:5} records, ' - ## 'starting at {}; {}\n') - for start in range(0, num_fetches, self.batch_size): - ## msg = msg_fmt.format(querykey, database, self.batch_size, start, self.desc) - ## sys.stdout.write(msg) - ## print('SSSSSSSSSSSSSSSSSSSSSSSTART:', start) - txt = self.efetch_txt(start, self.batch_size, database, webenv, querykey) - - if txt is not None: - try: - # Read the downloaded data from the socket handle - mtch = re.search(r'(ERROR.*\n)', txt) - if mtch: - sys.stdout.write(mtch.group(1)) - ostrm.write(txt) - ostrm.flush() - # pylint: disable=broad-except - except Exception as err: - sys.stdout.write("*FATAL: BAD READ SOCKET HANDLE: {}\n".format(str(err))) - else: - sys.stdout.write("*FATAL: NO SOCKET HANDLE TO READ FROM\n") - - def efetch_txt(self, start, retmax, database, webenv, querykey): - """Fetch database text""" - try: - # pylint: disable=bad-whitespace - txt = self.run_eutilscmd( - 'efetch', - db = database, - retstart = start, # dflt: 1 - retmax = retmax, # max: 10,000 - rettype = self.rettype, # Ex: medline - retmode = self.retmode, # Ex: text - webenv = webenv, - query_key = querykey) - #print('FETCH:', dct) - return txt - except IOError as err: - msg = "\n*FATAL: EFetching FAILED: {}".format(err) - sys.stdout.write(msg) - sys.stdout.write(" database: {}\n".format(database)) - sys.stdout.write(" retstart: {}\n".format(start)) - sys.stdout.write(" batch_size: {}\n".format(self.batch_size)) - sys.stdout.write(" rettype: {}\n".format(self.rettype)) - sys.stdout.write(" retmode: {}\n".format(self.retmode)) - sys.stdout.write(" webenv: {}\n".format(webenv)) - sys.stdout.write(" querykey: {}\n".format(querykey)) - - -# Copyright (C) 2016-present DV Klopfenstein, PhD. All rights reserved. diff --git a/src/pmidcite/eutils/cmds/elink.py b/src/pmidcite/eutils/cmds/elink.py index 213a535..7fa136e 100644 --- a/src/pmidcite/eutils/cmds/elink.py +++ b/src/pmidcite/eutils/cmds/elink.py @@ -1,4 +1,4 @@ -"""Fetch items and write""" +"""ELink""" __author__ = 'DV Klopfenstein, PhD' __copyright__ = "Copyright (C) 2016-present DV Klopfenstein, PhD. All rights reserved." @@ -6,17 +6,16 @@ import sys import re -from pmidcite.eutils.cmds.cmdbase import CommandBase +from pmidcite.eutils.cmds.base import EntrezUtilities -# TBD: -class ELink(CommandBase): - """Fetch and write text""" +class ELink(EntrezUtilities): + """ELink""" # pylint: disable=too-many-arguments - def __init__(self, retmax=10000, rettype='medline', retmode='text', batch_size=100, **kws): - kws_base = {k:v for k, v in kws.items() if k in CommandBase.exp_kws} - super(ELink, self).__init__(**kws_base) + def __init__(self, email, apikey, tool, batch_size=100): + super().__init__(email, apikey, tool) + self.batch_size = batch_size def elink(self, database_from, linkname, webenv, querykey, num_fetches): """EFetch records found for PMIDs, page by page""" @@ -29,7 +28,6 @@ def elink(self, database_from, linkname, webenv, querykey, num_fetches): ## sys.stdout.write(msg) record = None try: - # pylint: disable=bad-whitespace record = self.run_eutilscmd( 'elink', db = database_from, @@ -41,15 +39,15 @@ def elink(self, database_from, linkname, webenv, querykey, num_fetches): query_key = querykey) print('ELINK:', linkname, record) except IOError as err: - msg = "\n*FATAL: EFetching FAILED: {}".format(err) + msg = f"\n*FATAL: EFetching FAILED: {err}" sys.stdout.write(msg) - sys.stdout.write(" database: {}\n".format(database_from)) - sys.stdout.write(" retstart: {}\n".format(start)) - # sys.stdout.write(" retmax: {}\n".format(retmax)) - sys.stdout.write(" batch_size: {}\n".format(self.batch_size)) - sys.stdout.write(" linkname: {}\n".format(linkname)) - sys.stdout.write(" webenv: {}\n".format(webenv)) - sys.stdout.write(" querykey: {}\n".format(querykey)) + sys.stdout.write(f" database: {database_from}\n") + sys.stdout.write(f" retstart: {start}\n") + # sys.stdout.write(f" retmax: {retmax}\n") + sys.stdout.write(f" batch_size: {self.batch_size}\n") + sys.stdout.write(f" linkname: {linkname}\n") + sys.stdout.write(f" webenv: {webenv}\n") + sys.stdout.write(f" querykey: {querykey}\n") if record is not None: try: @@ -61,7 +59,7 @@ def elink(self, database_from, linkname, webenv, querykey, num_fetches): # ostrm.flush() # pylint: disable=broad-except except Exception as err: - sys.stdout.write("*FATAL: BAD READ SOCKET HANDLE: {}\n".format(str(err))) + sys.stdout.write(f"*FATAL: BAD READ SOCKET HANDLE: {str(err)}\n") else: sys.stdout.write("*FATAL: NO SOCKET HANDLE TO READ FROM\n") diff --git a/src/pmidcite/eutils/cmds/query_ids.py b/src/pmidcite/eutils/cmds/query_ids.py index 272c871..8180306 100644 --- a/src/pmidcite/eutils/cmds/query_ids.py +++ b/src/pmidcite/eutils/cmds/query_ids.py @@ -36,29 +36,28 @@ class QueryIDs(EntrezUtilities): } def __init__(self, email, apikey, tool, prt=sys.stdout): - super(QueryIDs, self).__init__(email, apikey, tool, prt) + super().__init__(email, apikey, tool, prt) def dnld_query_ids(self, query, database, num_ids_p_epost=10): """Searches a NCBI database for a user query, writes resulting entries into one file.""" rsp_dct = self.get_query_rsp(query, database, num_ids_p_epost) - return self.get_ids(rsp_dct, query, database, num_ids_p_epost) + return self._get_ids(rsp_dct, query, database, num_ids_p_epost) def get_query_rsp(self, query, database, num_ids_p_epost=10): """Searches a NCBI database for a user query, writes resulting entries into one file.""" # 1) Query PubMed/Protein, PhD/etc. Get first N (num_ids_p_epost) of the total PMIDs - rsp_dct = self.query(database, query, retmax=num_ids_p_epost) + rsp_dct = self.get_ids_esearch(database, query, retmax=num_ids_p_epost) if rsp_dct is None: if self.log: - self.log.write('No {DB} entries found: {Q}\n'.format(DB=database, Q=query)) + self.log.write(f'No {database} entries found: {query}\n') self.log.flush() return [] if rsp_dct and self.log: - self.log.write('{N:6,} IDs FOR {DB} QUERY({Q})\n'.format( - DB=database, N=rsp_dct['count'], Q=query)) + self.log.write(f'{rsp_dct["count"]:6,} IDs FOR {database} QUERY({query})\n') self.log.flush() return rsp_dct - def get_ids(self, rsp_dct, query, database, num_ids_p_epost=10): + def _get_ids(self, rsp_dct, query, database, num_ids_p_epost=10): """Download PMIDs, N (num_ids_p_epost) at a time""" ##print('WWWWWWWWWWWWWWWWWWWWW pmidcite/eutils/cmds/query_ids.py', rsp_dct) if not rsp_dct: @@ -73,7 +72,8 @@ def get_ids(self, rsp_dct, query, database, num_ids_p_epost=10): ##print('WWWWWWWWWWWWWWWWWWWWWWWW', kws_p) for retnum in range(1, self._get_num_querykeys(num_ids_p_epost, tot_ids)): ##print('WWWWWWWWWWWWWWWWWWWWWWWW retnum', retnum) - rsp_dct = self.query(database, query, retstart=num_ids_p_epost*retnum, **kws_p) + # pylint: disable=line-too-long + rsp_dct = self.get_ids_esearch(database, query, retstart=num_ids_p_epost*retnum, **kws_p) if rsp_dct: ##print('WWWWWWWWWWWWWWWWWWWWWWWW idlist', rsp_dct['idlist']) ids.extend(rsp_dct['idlist']) @@ -90,10 +90,10 @@ def _get_num_querykeys(num_ids_p_epost, num_pmids): ## print(f'num_querykeys({num_querykeys})') return num_querykeys - def query(self, database, query, **esearch): - """Text query finds database UIDs for later use in ESummary, EFetch or ELink""" + def get_ids_esearch(self, database, query, **kws): + """Esearch for json uilist finds database UIDs for later use in ESummary, EFetch or ELink""" kws_exp = self.exp_params.difference({'db', 'term', 'rettype', 'usehistory', 'retmode'}) - kws_act = {k:v for k, v in esearch.items() if k in kws_exp} + kws_act = {k:v for k, v in kws.items() if k in kws_exp} # Returns: # count # retmax @@ -113,12 +113,22 @@ def query(self, database, query, **esearch): usehistory="y", # NCBI prefers we use history(QueryKey, WebEnv) for next acess retmode='json', **kws_act) - if dct is not None and 'idlist' in dct and dct['idlist']: - if database in {'pubmed',}: - dct['idlist'] = [int(n) for n in dct['idlist']] - for fldname in {'count', 'retmax'}: - dct[fldname] = int(dct[fldname]) - return dct + ## print(f'run_eutilscmd rsp {dct.keys()}') + esearchresult = self._get_esearchresult(dct) + ## print(f'run_eutilscmd rsp {esearchresult}') + if esearchresult is not None and 'idlist' in esearchresult and esearchresult['idlist']: + if database in {'pubmed','gene'}: + esearchresult['idlist'] = [int(n) for n in esearchresult['idlist']] + for fldname in ['count', 'retmax']: + esearchresult[fldname] = int(esearchresult[fldname]) + return esearchresult + return None + + @staticmethod + def _get_esearchresult(dct): + if dct is not None: + if 'esearchresult' in dct: + return dct['esearchresult'] return None