Skip to content

Commit

Permalink
add tool to retrieve whole molecule cat, add check for known
Browse files Browse the repository at this point in the history
malformed catalogs, improve LUT, refactor and significantly harden
CDMS data table handling.  Also, update cached metadata files.

add more regression tests

whitespace

whitespace
  • Loading branch information
keflavich committed Jan 16, 2025
1 parent ca584f6 commit 97c8bb1
Show file tree
Hide file tree
Showing 8 changed files with 2,827 additions and 1,215 deletions.
6 changes: 6 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,12 @@ New Tools and Services
Service fixes and enhancements
------------------------------

linelists.cdms
^^^^^^^^^^^^^^

- Add whole catalog retrieval, improve error messaging for unparseable lines,
and improve metadata catalog [#3173]


Infrastructure, Utility and Other Changes and Additions
-------------------------------------------------------
Expand Down
8 changes: 8 additions & 0 deletions astroquery/linelists/cdms/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,14 @@ class Conf(_config.ConfigNamespace):
'https://cdms.astro.uni-koeln.de/classic/entries/partition_function.html',
'CDMS partition function table listing all available molecules.')

catfile_url2 = _config.ConfigItem(
'https://cdms.astro.uni-koeln.de/classic/predictions/catalog/catdir.html',
'CDMS catalog table listing all available molecules (with different names from partition function).')

classic_server = _config.ConfigItem(
'https://cdms.astro.uni-koeln.de/classic',
'CDMS Classic Molecule List server.')

timeout = _config.ConfigItem(
60,
'Time limit for connecting to the CDMS server.')
Expand Down
186 changes: 176 additions & 10 deletions astroquery/linelists/cdms/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from bs4 import BeautifulSoup
import astropy.units as u
from astropy import table
from astropy.io import ascii
from astroquery.query import BaseQuery
from astroquery.utils import async_to_sync
Expand All @@ -27,7 +28,9 @@ def data_path(filename):
class CDMSClass(BaseQuery):
# use the Configuration Items imported from __init__.py
URL = conf.server
CLASSIC_URL = conf.classic_server
TIMEOUT = conf.timeout
MALFORMATTED_MOLECULE_LIST = ['017506 NH3-wHFS', '028582 H2NC', '058501 H2C2S', '064527 HC3HCN']

def query_lines_async(self, min_frequency, max_frequency, *,
min_strength=-500, molecule='All',
Expand Down Expand Up @@ -143,8 +146,6 @@ def query_lines_async(self, min_frequency, max_frequency, *,
else:
payload['Molecules'] = molecule

payload = list(payload.items())

if get_query_payload:
return payload
# BaseQuery classes come with a _request method that includes a
Expand All @@ -170,6 +171,13 @@ def query_lines_async(self, min_frequency, max_frequency, *,
response2 = self._request(method='GET', url=fullurl,
timeout=self.TIMEOUT, cache=cache)

# accounts for three formats, e.g.: '058501' or 'H2C2S' or '058501 H2C2S'
badlist = (self.MALFORMATTED_MOLECULE_LIST + # noqa
[y for x in self.MALFORMATTED_MOLECULE_LIST for y in x.split()])
if payload['Molecules'] in badlist:
raise ValueError(f"Molecule {payload['Molecules']} is known not to comply with standard CDMS format. "
f"Try get_molecule({payload['Molecules']}) instead.")

return response2

def _parse_result(self, response, *, verbose=False):
Expand Down Expand Up @@ -278,8 +286,9 @@ def _parse_result(self, response, *, verbose=False):

return result

def get_species_table(self, *, catfile='catdir.cat', use_cached=True,
catfile_url=conf.catfile_url):
def get_species_table(self, *, catfile='partfunc.cat', use_cached=True,
catfile_url=conf.catfile_url,
catfile2='catdir.cat', catfile_url2=conf.catfile_url2):
"""
A directory of the catalog is found in a file called 'catdir.cat.'
Expand All @@ -303,8 +312,24 @@ def get_species_table(self, *, catfile='catdir.cat', use_cached=True,

if use_cached:
result = ascii.read(data_path(catfile), format='fixed_width', delimiter='|')
result2 = ascii.read(data_path(catfile2), format='fixed_width', delimiter='|')
else:
result = retrieve_catfile(catfile_url)
result2 = retrieve_catfile2(catfile_url2)
result.write(data_path(catfile), format='ascii.fixed_width', delimiter='|', overwrite=True)
result2.write(data_path(catfile2), format='ascii.fixed_width', delimiter='|', overwrite=True)

merged = table.join(result, result2, keys=['tag'])
if not all(merged['#lines'] == merged['# lines']):
raise ValueError("Inconsistent table of molecules from CDMS.")
del merged['# lines']

# reorder columns
result = merged[['tag', 'molecule', 'Name', '#lines', 'lg(Q(1000))',
'lg(Q(500))', 'lg(Q(300))', 'lg(Q(225))', 'lg(Q(150))', 'lg(Q(75))',
'lg(Q(37.5))', 'lg(Q(18.75))', 'lg(Q(9.375))', 'lg(Q(5.000))',
'lg(Q(2.725))',
'Ver.', 'Documentation', 'Date of entry', 'Entry in cm–1']]

meta = {'lg(Q(1000))': 1000.0,
'lg(Q(500))': 500.0,
Expand All @@ -331,6 +356,96 @@ def tryfloat(x):
result.meta = {'Temperature (K)': [1000., 500., 300., 225., 150., 75.,
37.5, 18.75, 9.375, 5., 2.725]}

result.add_index('tag')

return result

def get_molecule(self, molecule_id, *, cache=True):
    """
    Retrieve the whole molecule table for a given molecule id.

    Parameters
    ----------
    molecule_id : str
        String of exactly six ASCII digits, e.g. ``'058501'``.  It is
        inserted into the catalog file name (``c<molecule_id>.cat``) and
        converted with ``int`` to select the species-table row.
    cache : bool
        Passed through to ``self._request``.

    Returns
    -------
    result
        The table produced by ``self._parse_cat`` for the response, with
        its ``meta`` set to the matching row of
        ``self.get_species_table()`` converted to a ``dict``.

    Raises
    ------
    ValueError
        If ``molecule_id`` is not a string of exactly six ASCII digits.
    """
    # Validate the digits as well as the length *before* any request is
    # issued: otherwise inputs such as 'abcdef' or '-12345' reach the
    # server and only fail later in ``int()`` or the table lookup with an
    # unrelated error.
    is_six_digits = (isinstance(molecule_id, str)
                     and len(molecule_id) == 6
                     and all(char in '0123456789' for char in molecule_id))
    if not is_six_digits:
        raise ValueError("molecule_id should be a length-6 string of numbers")

    url = f'{self.CLASSIC_URL}/entries/c{molecule_id}.cat'
    response = self._request(method='GET', url=url,
                             timeout=self.TIMEOUT, cache=cache)
    result = self._parse_cat(response)

    # attach the species-table row for this tag as the table metadata
    species_table = self.get_species_table()
    result.meta = dict(species_table.loc[int(molecule_id)])

    return result

def _parse_cat(self, response, *, verbose=False):
    """
    Convert the text of a catalog (``.cat``) response into a table.

    This mirrors ``_parse_result`` but uses the column layout of the
    catalog files.

    Parameters
    ----------
    response
        Object whose ``text`` attribute holds the catalog contents.
    verbose : bool
        Accepted but not used by this method.

    Returns
    -------
    result
        Table read with the fixed-width reader, with columns ``FREQ``,
        ``ERR``, ``LGINT``, ``DR``, ``ELO``, ``GUP``, ``TAG``, ``QNFMT``
        and ``Q1`` .. ``Q14``, plus the derived ``MOLWT`` and ``Lab``
        columns.

    Raises
    ------
    EmptyResponseError
        If the response text contains 'Zero lines were found'.
    """
    text = response.text
    if 'Zero lines were found' in text:
        raise EmptyResponseError(f"Response was empty; message was '{text}'.")

    # notes about the format
    # [F13.4, 2F8.4, I2, F10.4, I3, I7, I4, 12I2]: FREQ, ERR, LGINT, DR, ELO, GUP, TAG, QNFMT, QN noqa
    # Each entry is (column name, start offset handed to the reader).
    layout = [('FREQ', 0), ('ERR', 14), ('LGINT', 22), ('DR', 30),
              ('ELO', 32), ('GUP', 42), ('TAG', 45), ('QNFMT', 52)]
    # Q1 .. Q14 are two characters wide, starting at offset 56
    layout.extend((f'Q{ii}', 54 + 2 * ii) for ii in range(1, 15))
    column_names = [name for name, _ in layout]
    column_starts = [start for _, start in layout]

    result = ascii.read(text, header_start=None, data_start=0,
                        comment=r'THIS|^\s{12,14}\d{4,6}.*',
                        names=column_names,
                        col_starts=column_starts,
                        format='fixed_width', fast_reader=False)

    # int truncates - which is what we want
    result['MOLWT'] = [int(tag / 1e4) for tag in result['TAG']]

    # a negative value is recorded in the 'Lab' flag; MOLWT itself is
    # stored as a magnitude
    result['Lab'] = result['MOLWT'] < 0
    result['MOLWT'] = np.abs(result['MOLWT'])

    result['FREQ'].unit = u.MHz
    result['ERR'].unit = u.MHz
    result['MOLWT'].unit = u.Da

    # NOTE(review): the previous code built this list with a loop over an
    # empty suffix string, which never executed, so only 'GUP' has ever
    # been normalised here.  Confirm whether Q1..Q14 were meant to be
    # converted as well before extending this tuple.
    for key in ('GUP',):
        column = result[key]
        if np.issubdtype(column.dtype, np.integer):
            continue
        result[key] = np.array([parse_letternumber(value) for value in column],
                               dtype=int)

    result['LGINT'].unit = u.nm**2 * u.MHz
    result['ELO'].unit = u.cm**(-1)

    return result


Expand Down Expand Up @@ -394,9 +509,18 @@ def find(self, st, flags):
def build_lookup():
    """
    Build the name -> tag lookup table from the CDMS species table.

    Every entry of the 'molecule' column and every entry of the 'Name'
    column is mapped to the 'tag' of its row.  'Name' entries are applied
    second, so they win when the same key occurs in both columns.

    Returns
    -------
    lookuptable : Lookuptable
        ``Lookuptable`` wrapping the combined mapping.
    """
    species = CDMS.get_species_table()
    tags = list(species['tag'][:])

    name_to_tag = {}
    # order matters: 'molecule' first, then 'Name' overrides duplicates
    for column in ('molecule', 'Name'):
        labels = list(species[column][:])
        name_to_tag.update(zip(labels, tags))

    return Lookuptable(name_to_tag)
Expand All @@ -408,10 +532,52 @@ def retrieve_catfile(url='https://cdms.astro.uni-koeln.de/classic/entries/partit
"""
response = requests.get(url)
response.raise_for_status()
tbl = ascii.read(response.text, header_start=None, data_start=15, data_end=-5,
names=['tag', 'molecule', '#lines', 'lg(Q(1000))', 'lg(Q(500))', 'lg(Q(300))', 'lg(Q(225))',
'lg(Q(150))', 'lg(Q(75))', 'lg(Q(37.5))', 'lg(Q(18.75))', 'lg(Q(9.375))', 'lg(Q(5.000))',
'lg(Q(2.725))'],
col_starts=(0, 7, 34, 41, 53, 66, 79, 92, 106, 117, 131, 145, 159, 173),
format='fixed_width', delimiter=' ')
lines = response.text.split("\n")

# used to convert '---' to nan
def tryfloat(x):
try:
return float(x)
except ValueError:
return np.nan

# the 'fixed width' table reader fails because there are rows that violate fixed width
tbl_rows = []
for row in lines[15:-5]:
split = row.split()
tag = int(split[0])
molecule_and_lines = row[7:41]
molecule = " ".join(molecule_and_lines.split()[:-1])
nlines = int(molecule_and_lines.split()[-1])
partfunc = map(tryfloat, row[41:].split())
partfunc_dict = dict(zip(['lg(Q(1000))', 'lg(Q(500))', 'lg(Q(300))', 'lg(Q(225))',
'lg(Q(150))', 'lg(Q(75))', 'lg(Q(37.5))', 'lg(Q(18.75))',
'lg(Q(9.375))', 'lg(Q(5.000))', 'lg(Q(2.725))'], partfunc))
tbl_rows.append({'tag': tag,
'molecule': molecule,
'#lines': nlines,
})
tbl_rows[-1].update(partfunc_dict)
tbl = table.Table(tbl_rows)
# tbl = ascii.read(response.text, header_start=None, data_start=15, data_end=-5,
# names=['tag', 'molecule', '#lines', 'lg(Q(1000))', 'lg(Q(500))', 'lg(Q(300))', 'lg(Q(225))',
# 'lg(Q(150))', 'lg(Q(75))', 'lg(Q(37.5))', 'lg(Q(18.75))', 'lg(Q(9.375))', 'lg(Q(5.000))',
# 'lg(Q(2.725))'],
# col_starts=(0, 7, 34, 41, 53, 66, 79, 92, 106, 117, 131, 145, 159, 173),
# format='fixed_width', delimiter=' ')
return tbl


def retrieve_catfile2(url='https://cdms.astro.uni-koeln.de/classic/predictions/catalog/catdir.html'):
    """
    Download the catalog directory page and return it as a table.

    Parameters
    ----------
    url : str
        Address of the HTML page to fetch.

    Returns
    -------
    tbl
        Table read from the page with the HTML reader, with the 'Catalog'
        column removed and the 'Tag' column renamed to 'tag'.

    Raises
    ------
    requests.HTTPError
        Raised by ``response.raise_for_status()`` for an error status.
    """
    # Bound the wait with the configured limit: without a timeout,
    # ``requests.get`` can block indefinitely on an unresponsive server.
    response = requests.get(url, timeout=conf.timeout)
    response.raise_for_status()
    tbl = ascii.read(response.text, format='html')
    # delete a junk column (wastes space)
    del tbl['Catalog']

    # for joining - want same capitalization
    tbl.rename_column("Tag", "tag")
    return tbl
Loading

0 comments on commit 97c8bb1

Please sign in to comment.