Skip to content

Commit

Permalink
add tool to retrieve whole molecule cat, add check for known
Browse files Browse the repository at this point in the history
malformatted catalogs, improve LUT, refactor and significantly robustify
CDMS data table handling.  Also, update cached metadata files

add more regression tests

whitespace

whitespace

setup for data file

almost there ... unicode bad

col name

recode unicode raised minus (character 96) as simple dash

try replacing the text before ascii-reading it

oops

dashes

try a different approach...

try a different approach... part 2

super minor

remove unnecessary test that I just added

allow lookuptable to skip regex searching for exact matches

no single-char variables

looks like my fixes didn't work, and of course I made a bunch of stupid errors.  I might have to give up for the night

hooray!  got a different error this time.  Now just guesswork though....

Entry column

changelog
  • Loading branch information
keflavich committed Jan 17, 2025
1 parent ca584f6 commit 5ea744f
Show file tree
Hide file tree
Showing 10 changed files with 2,871 additions and 1,224 deletions.
11 changes: 11 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,17 @@ New Tools and Services
Service fixes and enhancements
------------------------------

linelists.cdms
^^^^^^^^^^^^^^

- Add whole catalog retrieval, improve error messaging for unparseable lines,
improve metadata catalog, and improve lookuptable behavior [#3173,#2901]

jplspec
^^^^^^^

- minor improvement to lookuptable behavior [#3173,#2901]


Infrastructure, Utility and Other Changes and Additions
-------------------------------------------------------
Expand Down
15 changes: 9 additions & 6 deletions astroquery/jplspec/lookup_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

class Lookuptable(dict):

def find(self, s, flags):
def find(self, st, flags):
"""
Search dictionary keys for a regex match to string s
Expand All @@ -22,17 +22,20 @@ def find(self, s, flags):
Returns
-------
The list of values corresponding to the matches
The dictionary containing only values whose keys match the regex
"""

R = re.compile(s, flags)
if st in self:
return {st: self[st]}

Check warning on line 30 in astroquery/jplspec/lookup_table.py

View check run for this annotation

Codecov / codecov/patch

astroquery/jplspec/lookup_table.py#L30

Added line #L30 was not covered by tests

R = re.compile(st, flags)

out = {}

for k, v in self.items():
match = R.search(str(k))
for key, val in self.items():
match = R.search(str(key))
if match:
out[k] = v
out[key] = val

return out
8 changes: 8 additions & 0 deletions astroquery/linelists/cdms/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,14 @@ class Conf(_config.ConfigNamespace):
'https://cdms.astro.uni-koeln.de/classic/entries/partition_function.html',
'CDMS partition function table listing all available molecules.')

catfile_url2 = _config.ConfigItem(
'https://cdms.astro.uni-koeln.de/classic/predictions/catalog/catdir.html',
'CDMS catalog table listing all available molecules (with different names from partition function).')

classic_server = _config.ConfigItem(
'https://cdms.astro.uni-koeln.de/classic',
'CDMS Classic Molecule List server.')

timeout = _config.ConfigItem(
60,
'Time limit for connecting to the CDMS server.')
Expand Down
216 changes: 204 additions & 12 deletions astroquery/linelists/cdms/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from bs4 import BeautifulSoup
import astropy.units as u
from astropy import table
from astropy.io import ascii
from astroquery.query import BaseQuery
from astroquery.utils import async_to_sync
Expand All @@ -27,7 +28,9 @@ def data_path(filename):
class CDMSClass(BaseQuery):
# use the Configuration Items imported from __init__.py
URL = conf.server
CLASSIC_URL = conf.classic_server
TIMEOUT = conf.timeout
MALFORMATTED_MOLECULE_LIST = ['017506 NH3-wHFS', '028582 H2NC', '058501 H2C2S', '064527 HC3HCN']

def query_lines_async(self, min_frequency, max_frequency, *,
min_strength=-500, molecule='All',
Expand Down Expand Up @@ -143,8 +146,6 @@ def query_lines_async(self, min_frequency, max_frequency, *,
else:
payload['Molecules'] = molecule

payload = list(payload.items())

if get_query_payload:
return payload
# BaseQuery classes come with a _request method that includes a
Expand All @@ -170,6 +171,13 @@ def query_lines_async(self, min_frequency, max_frequency, *,
response2 = self._request(method='GET', url=fullurl,
timeout=self.TIMEOUT, cache=cache)

# accounts for three formats, e.g.: '058501' or 'H2C2S' or '058501 H2C2S'
badlist = (self.MALFORMATTED_MOLECULE_LIST + # noqa
[y for x in self.MALFORMATTED_MOLECULE_LIST for y in x.split()])
if payload['Molecules'] in badlist:
raise ValueError(f"Molecule {payload['Molecules']} is known not to comply with standard CDMS format. "

Check warning on line 178 in astroquery/linelists/cdms/core.py

View check run for this annotation

Codecov / codecov/patch

astroquery/linelists/cdms/core.py#L178

Added line #L178 was not covered by tests
f"Try get_molecule({payload['Molecules']}) instead.")

return response2

def _parse_result(self, response, *, verbose=False):
Expand Down Expand Up @@ -278,8 +286,9 @@ def _parse_result(self, response, *, verbose=False):

return result

def get_species_table(self, *, catfile='catdir.cat', use_cached=True,
catfile_url=conf.catfile_url):
def get_species_table(self, *, catfile='partfunc.cat', use_cached=True,
catfile_url=conf.catfile_url,
catfile2='catdir.cat', catfile_url2=conf.catfile_url2):
"""
A directory of the catalog is found in a file called 'catdir.cat.'
Expand All @@ -302,9 +311,35 @@ def get_species_table(self, *, catfile='catdir.cat', use_cached=True,
"""

if use_cached:
result = ascii.read(data_path(catfile), format='fixed_width', delimiter='|')
try:
result = ascii.read(data_path(catfile), format='fixed_width', delimiter='|')
result2 = ascii.read(data_path(catfile2), format='fixed_width', delimiter='|')
except UnicodeDecodeError:
with open(data_path(catfile), 'rb') as fh:
content = fh.read()
text = content.decode('ascii', errors='replace')
result = ascii.read(text, format='basic', delimiter='|')
with open(data_path(catfile2), 'rb') as fh:
content = fh.read()
text = content.decode('ascii', errors='replace')
result2 = ascii.read(text, format='basic', delimiter='|')

Check warning on line 325 in astroquery/linelists/cdms/core.py

View check run for this annotation

Codecov / codecov/patch

astroquery/linelists/cdms/core.py#L317-L325

Added lines #L317 - L325 were not covered by tests
else:
result = retrieve_catfile(catfile_url)
result2 = retrieve_catfile2(catfile_url2)
result.write(data_path(catfile), format='ascii.fixed_width', delimiter='|', overwrite=True)
result2.write(data_path(catfile2), format='ascii.fixed_width', delimiter='|', overwrite=True)

Check warning on line 330 in astroquery/linelists/cdms/core.py

View check run for this annotation

Codecov / codecov/patch

astroquery/linelists/cdms/core.py#L328-L330

Added lines #L328 - L330 were not covered by tests

merged = table.join(result, result2, keys=['tag'])
if not all(merged['#lines'] == merged['# lines']):
raise ValueError("Inconsistent table of molecules from CDMS.")

Check warning on line 334 in astroquery/linelists/cdms/core.py

View check run for this annotation

Codecov / codecov/patch

astroquery/linelists/cdms/core.py#L334

Added line #L334 was not covered by tests
del merged['# lines']

# reorder columns
result = merged[['tag', 'molecule', 'Name', '#lines', 'lg(Q(1000))',
'lg(Q(500))', 'lg(Q(300))', 'lg(Q(225))', 'lg(Q(150))', 'lg(Q(75))',
'lg(Q(37.5))', 'lg(Q(18.75))', 'lg(Q(9.375))', 'lg(Q(5.000))',
'lg(Q(2.725))',
'Ver.', 'Documentation', 'Date of entry', 'Entry']]

meta = {'lg(Q(1000))': 1000.0,
'lg(Q(500))': 500.0,
Expand All @@ -331,6 +366,96 @@ def tryfloat(x):
result.meta = {'Temperature (K)': [1000., 500., 300., 225., 150., 75.,
37.5, 18.75, 9.375, 5., 2.725]}

result.add_index('tag')

return result

def get_molecule(self, molecule_id, *, cache=True):
"""
Retrieve the whole molecule table for a given molecule id
"""
if not isinstance(molecule_id, str) or len(molecule_id) != 6:
raise ValueError("molecule_id should be a length-6 string of numbers")
url = f'{self.CLASSIC_URL}/entries/c{molecule_id}.cat'
response = self._request(method='GET', url=url,

Check warning on line 380 in astroquery/linelists/cdms/core.py

View check run for this annotation

Codecov / codecov/patch

astroquery/linelists/cdms/core.py#L377-L380

Added lines #L377 - L380 were not covered by tests
timeout=self.TIMEOUT, cache=cache)
result = self._parse_cat(response)

Check warning on line 382 in astroquery/linelists/cdms/core.py

View check run for this annotation

Codecov / codecov/patch

astroquery/linelists/cdms/core.py#L382

Added line #L382 was not covered by tests

species_table = self.get_species_table()
result.meta = dict(species_table.loc[int(molecule_id)])

Check warning on line 385 in astroquery/linelists/cdms/core.py

View check run for this annotation

Codecov / codecov/patch

astroquery/linelists/cdms/core.py#L384-L385

Added lines #L384 - L385 were not covered by tests

return result

Check warning on line 387 in astroquery/linelists/cdms/core.py

View check run for this annotation

Codecov / codecov/patch

astroquery/linelists/cdms/core.py#L387

Added line #L387 was not covered by tests

def _parse_cat(self, response, *, verbose=False):
"""
Parse a catalog response into an `~astropy.table.Table`
See details in _parse_response; this is a very similar function,
but the catalog responses have a slightly different format.
"""

if 'Zero lines were found' in response.text:
raise EmptyResponseError(f"Response was empty; message was '{response.text}'.")

Check warning on line 398 in astroquery/linelists/cdms/core.py

View check run for this annotation

Codecov / codecov/patch

astroquery/linelists/cdms/core.py#L397-L398

Added lines #L397 - L398 were not covered by tests

text = response.text

Check warning on line 400 in astroquery/linelists/cdms/core.py

View check run for this annotation

Codecov / codecov/patch

astroquery/linelists/cdms/core.py#L400

Added line #L400 was not covered by tests

# notes about the format
# [F13.4, 2F8.4, I2, F10.4, I3, I7, I4, 12I2]: FREQ, ERR, LGINT, DR, ELO, GUP, TAG, QNFMT, QN noqa
# 13 21 29 31 41 44 51 55 57 59 61 63 65 67 69 71 73 75 77 79 noqa
starts = {'FREQ': 0,

Check warning on line 405 in astroquery/linelists/cdms/core.py

View check run for this annotation

Codecov / codecov/patch

astroquery/linelists/cdms/core.py#L405

Added line #L405 was not covered by tests
'ERR': 14,
'LGINT': 22,
'DR': 30,
'ELO': 32,
'GUP': 42,
'TAG': 45,
'QNFMT': 52,
'Q1': 56,
'Q2': 58,
'Q3': 60,
'Q4': 62,
'Q5': 64,
'Q6': 66,
'Q7': 68,
'Q8': 70,
'Q9': 72,
'Q10': 74,
'Q11': 76,
'Q12': 78,
'Q13': 80,
'Q14': 82,
}

result = ascii.read(text, header_start=None, data_start=0,

Check warning on line 429 in astroquery/linelists/cdms/core.py

View check run for this annotation

Codecov / codecov/patch

astroquery/linelists/cdms/core.py#L429

Added line #L429 was not covered by tests
comment=r'THIS|^\s{12,14}\d{4,6}.*',
names=list(starts.keys()),
col_starts=list(starts.values()),
format='fixed_width', fast_reader=False)

# int truncates - which is what we want
result['MOLWT'] = [int(x/1e4) for x in result['TAG']]

Check warning on line 436 in astroquery/linelists/cdms/core.py

View check run for this annotation

Codecov / codecov/patch

astroquery/linelists/cdms/core.py#L436

Added line #L436 was not covered by tests

result['FREQ'].unit = u.MHz
result['ERR'].unit = u.MHz

Check warning on line 439 in astroquery/linelists/cdms/core.py

View check run for this annotation

Codecov / codecov/patch

astroquery/linelists/cdms/core.py#L438-L439

Added lines #L438 - L439 were not covered by tests

result['Lab'] = result['MOLWT'] < 0
result['MOLWT'] = np.abs(result['MOLWT'])
result['MOLWT'].unit = u.Da

Check warning on line 443 in astroquery/linelists/cdms/core.py

View check run for this annotation

Codecov / codecov/patch

astroquery/linelists/cdms/core.py#L441-L443

Added lines #L441 - L443 were not covered by tests

fix_keys = ['GUP']
for suf in '':
for qn in (f'Q{ii}' for ii in range(1, 15)):
qnind = qn+suf
fix_keys.append(qnind)
for key in fix_keys:
if not np.issubdtype(result[key].dtype, np.integer):
intcol = np.array(list(map(parse_letternumber, result[key])),

Check warning on line 452 in astroquery/linelists/cdms/core.py

View check run for this annotation

Codecov / codecov/patch

astroquery/linelists/cdms/core.py#L445-L452

Added lines #L445 - L452 were not covered by tests
dtype=int)
result[key] = intcol

Check warning on line 454 in astroquery/linelists/cdms/core.py

View check run for this annotation

Codecov / codecov/patch

astroquery/linelists/cdms/core.py#L454

Added line #L454 was not covered by tests

result['LGINT'].unit = u.nm**2 * u.MHz
result['ELO'].unit = u.cm**(-1)

Check warning on line 457 in astroquery/linelists/cdms/core.py

View check run for this annotation

Codecov / codecov/patch

astroquery/linelists/cdms/core.py#L456-L457

Added lines #L456 - L457 were not covered by tests

return result


Expand Down Expand Up @@ -375,10 +500,13 @@ def find(self, st, flags):
Returns
-------
The list of values corresponding to the matches
The dictionary containing only values whose keys match the regex
"""

if st in self:
return {st: self[st]}

out = {}

for kk, vv in self.items():
Expand All @@ -394,9 +522,18 @@ def find(self, st, flags):
def build_lookup():

result = CDMS.get_species_table()

# start with the 'molecule' column
keys = list(result['molecule'][:]) # convert NAME column to list
values = list(result['tag'][:]) # convert TAG column to list
dictionary = dict(zip(keys, values)) # make k,v dictionary

# repeat with the Name column
keys = list(result['Name'][:])
values = list(result['tag'][:])
dictionary2 = dict(zip(keys, values))
dictionary.update(dictionary2)

lookuptable = Lookuptable(dictionary) # apply the class above

return lookuptable
Expand All @@ -408,10 +545,65 @@ def retrieve_catfile(url='https://cdms.astro.uni-koeln.de/classic/entries/partit
"""
response = requests.get(url)
response.raise_for_status()
tbl = ascii.read(response.text, header_start=None, data_start=15, data_end=-5,
names=['tag', 'molecule', '#lines', 'lg(Q(1000))', 'lg(Q(500))', 'lg(Q(300))', 'lg(Q(225))',
'lg(Q(150))', 'lg(Q(75))', 'lg(Q(37.5))', 'lg(Q(18.75))', 'lg(Q(9.375))', 'lg(Q(5.000))',
'lg(Q(2.725))'],
col_starts=(0, 7, 34, 41, 53, 66, 79, 92, 106, 117, 131, 145, 159, 173),
format='fixed_width', delimiter=' ')
lines = response.text.split("\n")

Check warning on line 548 in astroquery/linelists/cdms/core.py

View check run for this annotation

Codecov / codecov/patch

astroquery/linelists/cdms/core.py#L548

Added line #L548 was not covered by tests

# used to convert '---' to nan
def tryfloat(x):
try:
return float(x)
except ValueError:
return np.nan

Check warning on line 555 in astroquery/linelists/cdms/core.py

View check run for this annotation

Codecov / codecov/patch

astroquery/linelists/cdms/core.py#L551-L555

Added lines #L551 - L555 were not covered by tests

# the 'fixed width' table reader fails because there are rows that violate fixed width
tbl_rows = []
for row in lines[15:-5]:
split = row.split()
tag = int(split[0])
molecule_and_lines = row[7:41]
molecule = " ".join(molecule_and_lines.split()[:-1])
nlines = int(molecule_and_lines.split()[-1])
partfunc = map(tryfloat, row[41:].split())
partfunc_dict = dict(zip(['lg(Q(1000))', 'lg(Q(500))', 'lg(Q(300))', 'lg(Q(225))',

Check warning on line 566 in astroquery/linelists/cdms/core.py

View check run for this annotation

Codecov / codecov/patch

astroquery/linelists/cdms/core.py#L558-L566

Added lines #L558 - L566 were not covered by tests
'lg(Q(150))', 'lg(Q(75))', 'lg(Q(37.5))', 'lg(Q(18.75))',
'lg(Q(9.375))', 'lg(Q(5.000))', 'lg(Q(2.725))'], partfunc))
tbl_rows.append({'tag': tag,

Check warning on line 569 in astroquery/linelists/cdms/core.py

View check run for this annotation

Codecov / codecov/patch

astroquery/linelists/cdms/core.py#L569

Added line #L569 was not covered by tests
'molecule': molecule,
'#lines': nlines,
})
tbl_rows[-1].update(partfunc_dict)
tbl = table.Table(tbl_rows)

Check warning on line 574 in astroquery/linelists/cdms/core.py

View check run for this annotation

Codecov / codecov/patch

astroquery/linelists/cdms/core.py#L573-L574

Added lines #L573 - L574 were not covered by tests
# tbl = ascii.read(response.text, header_start=None, data_start=15, data_end=-5,
# names=['tag', 'molecule', '#lines', 'lg(Q(1000))', 'lg(Q(500))', 'lg(Q(300))', 'lg(Q(225))',
# 'lg(Q(150))', 'lg(Q(75))', 'lg(Q(37.5))', 'lg(Q(18.75))', 'lg(Q(9.375))', 'lg(Q(5.000))',
# 'lg(Q(2.725))'],
# col_starts=(0, 7, 34, 41, 53, 66, 79, 92, 106, 117, 131, 145, 159, 173),
# format='fixed_width', delimiter=' ')
return tbl

Check warning on line 581 in astroquery/linelists/cdms/core.py

View check run for this annotation

Codecov / codecov/patch

astroquery/linelists/cdms/core.py#L581

Added line #L581 was not covered by tests


def retrieve_catfile2(url='https://cdms.astro.uni-koeln.de/classic/predictions/catalog/catdir.html'):
"""
Simple retrieve index function
"""
response = requests.get(url)
response.raise_for_status()
try:
tbl = ascii.read(response.text, format='html')
except UnicodeDecodeError:

Check warning on line 592 in astroquery/linelists/cdms/core.py

View check run for this annotation

Codecov / codecov/patch

astroquery/linelists/cdms/core.py#L588-L592

Added lines #L588 - L592 were not covered by tests
# based on https://github.com/astropy/astropy/issues/3826#issuecomment-256113937
text = response.content.decode('ascii', errors='replace')
tbl = ascii.read(text, format='html')

Check warning on line 595 in astroquery/linelists/cdms/core.py

View check run for this annotation

Codecov / codecov/patch

astroquery/linelists/cdms/core.py#L594-L595

Added lines #L594 - L595 were not covered by tests

# delete a junk column (wastes space)
del tbl['Catalog']

Check warning on line 598 in astroquery/linelists/cdms/core.py

View check run for this annotation

Codecov / codecov/patch

astroquery/linelists/cdms/core.py#L598

Added line #L598 was not covered by tests

# for joining - want same capitalization
tbl.rename_column("Tag", "tag")

Check warning on line 601 in astroquery/linelists/cdms/core.py

View check run for this annotation

Codecov / codecov/patch

astroquery/linelists/cdms/core.py#L601

Added line #L601 was not covered by tests

# one of these is a unicode dash, the other is a normal dash.... in theory
if 'Entry in cm–1' in tbl.colnames:
tbl.rename_column('Entry in cm–1', 'Entry')
if 'Entry in cm-1' in tbl.colnames:
tbl.rename_column('Entry in cm-1', 'Entry')

Check warning on line 607 in astroquery/linelists/cdms/core.py

View check run for this annotation

Codecov / codecov/patch

astroquery/linelists/cdms/core.py#L604-L607

Added lines #L604 - L607 were not covered by tests

return tbl
Loading

0 comments on commit 5ea744f

Please sign in to comment.