add tool to retrieve whole molecule cat, add check for known

malformatted catalogs, improve LUT, refactor and significantly robustify CDMS data table handling. Also, update cached metadata files add more regression tests whitespace whitespace
astropy · Jan 16, 2025 · 97c8bb1 · 97c8bb1
1 parent ca584f6
commit 97c8bb1
Show file tree

Hide file tree

Showing 8 changed files with 2,827 additions and 1,215 deletions.
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -8,6 +8,12 @@ New Tools and Services
 Service fixes and enhancements
 ------------------------------
 
+linelists.cdms
+^^^^^^^^^^^^^^
+
+- Add whole catalog retrieval, improve error messaging for unparseable lines,
+  and improve metadata catalog [#3173]
+
 
 Infrastructure, Utility and Other Changes and Additions
 -------------------------------------------------------

diff --git a/astroquery/linelists/cdms/__init__.py b/astroquery/linelists/cdms/__init__.py
@@ -21,6 +21,14 @@ class Conf(_config.ConfigNamespace):
         'https://cdms.astro.uni-koeln.de/classic/entries/partition_function.html',
         'CDMS partition function table listing all available molecules.')
 
+    catfile_url2 = _config.ConfigItem(
+        'https://cdms.astro.uni-koeln.de/classic/predictions/catalog/catdir.html',
+        'CDMS catalog table listing all available molecules (with different names from partition function).')
+
+    classic_server = _config.ConfigItem(
+        'https://cdms.astro.uni-koeln.de/classic',
+        'CDMS Classic Molecule List server.')
+
     timeout = _config.ConfigItem(
         60,
         'Time limit for connecting to the CDMS server.')

diff --git a/astroquery/linelists/cdms/core.py b/astroquery/linelists/cdms/core.py
@@ -5,6 +5,7 @@
 
 from bs4 import BeautifulSoup
 import astropy.units as u
+from astropy import table
 from astropy.io import ascii
 from astroquery.query import BaseQuery
 from astroquery.utils import async_to_sync
@@ -27,7 +28,9 @@ def data_path(filename):
 class CDMSClass(BaseQuery):
     # use the Configuration Items imported from __init__.py
     URL = conf.server
+    CLASSIC_URL = conf.classic_server
     TIMEOUT = conf.timeout
+    MALFORMATTED_MOLECULE_LIST = ['017506 NH3-wHFS', '028582 H2NC', '058501 H2C2S', '064527 HC3HCN']
 
     def query_lines_async(self, min_frequency, max_frequency, *,
                           min_strength=-500, molecule='All',
@@ -143,8 +146,6 @@ def query_lines_async(self, min_frequency, max_frequency, *,
             else:
                 payload['Molecules'] = molecule
 
-        payload = list(payload.items())
-
         if get_query_payload:
             return payload
         # BaseQuery classes come with a _request method that includes a
@@ -170,6 +171,13 @@ def query_lines_async(self, min_frequency, max_frequency, *,
         response2 = self._request(method='GET', url=fullurl,
                                   timeout=self.TIMEOUT, cache=cache)
 
+        # accounts for three formats, e.g.: '058501' or 'H2C2S' or '058501 H2C2S'
+        badlist = (self.MALFORMATTED_MOLECULE_LIST +  # noqa
+                   [y for x in self.MALFORMATTED_MOLECULE_LIST for y in x.split()])
+        if payload['Molecules'] in badlist:
+            raise ValueError(f"Molecule {payload['Molecules']} is known not to comply with standard CDMS format.  "
+                             f"Try get_molecule({payload['Molecules']}) instead.")
+
         return response2
 
     def _parse_result(self, response, *, verbose=False):
@@ -278,8 +286,9 @@ def _parse_result(self, response, *, verbose=False):
 
         return result
 
-    def get_species_table(self, *, catfile='catdir.cat', use_cached=True,
-                          catfile_url=conf.catfile_url):
+    def get_species_table(self, *, catfile='partfunc.cat', use_cached=True,
+                          catfile_url=conf.catfile_url,
+                          catfile2='catdir.cat', catfile_url2=conf.catfile_url2):
         """
         A directory of the catalog is found in a file called 'catdir.cat.'
 
@@ -303,8 +312,24 @@ def get_species_table(self, *, catfile='catdir.cat', use_cached=True,
 
         if use_cached:
             result = ascii.read(data_path(catfile), format='fixed_width', delimiter='|')
+            result2 = ascii.read(data_path(catfile2), format='fixed_width', delimiter='|')
         else:
             result = retrieve_catfile(catfile_url)
+            result2 = retrieve_catfile2(catfile_url2)
+            result.write(data_path(catfile), format='ascii.fixed_width', delimiter='|', overwrite=True)
+            result2.write(data_path(catfile2), format='ascii.fixed_width', delimiter='|', overwrite=True)
+
+        merged = table.join(result, result2, keys=['tag'])
+        if not all(merged['#lines'] == merged['# lines']):
+            raise ValueError("Inconsistent table of molecules from CDMS.")
+        del merged['# lines']
+
+        # reorder columns
+        result = merged[['tag', 'molecule', 'Name', '#lines', 'lg(Q(1000))',
+                         'lg(Q(500))', 'lg(Q(300))', 'lg(Q(225))', 'lg(Q(150))', 'lg(Q(75))',
+                         'lg(Q(37.5))', 'lg(Q(18.75))', 'lg(Q(9.375))', 'lg(Q(5.000))',
+                         'lg(Q(2.725))',
+                         'Ver.', 'Documentation', 'Date of entry', 'Entry in cm–1']]
 
         meta = {'lg(Q(1000))': 1000.0,
                 'lg(Q(500))': 500.0,
@@ -331,6 +356,96 @@ def tryfloat(x):
         result.meta = {'Temperature (K)': [1000., 500., 300., 225., 150., 75.,
                                            37.5, 18.75, 9.375, 5., 2.725]}
 
+        result.add_index('tag')
+
+        return result
+
+    def get_molecule(self, molecule_id, *, cache=True):
+        """
+        Retrieve the whole molecule table for a given molecule id
+        """
+        if not isinstance(molecule_id, str) or len(molecule_id) != 6:
+            raise ValueError("molecule_id should be a length-6 string of numbers")
+        url = f'{self.CLASSIC_URL}/entries/c{molecule_id}.cat'
+        response = self._request(method='GET', url=url,
+                                 timeout=self.TIMEOUT, cache=cache)
+        result = self._parse_cat(response)
+
+        species_table = self.get_species_table()
+        result.meta = dict(species_table.loc[int(molecule_id)])
+
+        return result
+
+    def _parse_cat(self, response, *, verbose=False):
+        """
+        Parse a catalog response into an `~astropy.table.Table`
+
+        See details in _parse_response; this is a very similar function,
+        but the catalog responses have a slightly different format.
+        """
+
+        if 'Zero lines were found' in response.text:
+            raise EmptyResponseError(f"Response was empty; message was '{response.text}'.")
+
+        text = response.text
+
+        # notes about the format
+        # [F13.4, 2F8.4, I2, F10.4, I3, I7, I4, 12I2]: FREQ, ERR, LGINT, DR, ELO, GUP, TAG, QNFMT, QN  noqa
+        #      13 21 29  31     41  44  51  55  57 59 61 63 65 67  69 71 73 75 77 79                   noqa
+        starts = {'FREQ': 0,
+                  'ERR': 14,
+                  'LGINT': 22,
+                  'DR': 30,
+                  'ELO': 32,
+                  'GUP': 42,
+                  'TAG': 45,
+                  'QNFMT': 52,
+                  'Q1': 56,
+                  'Q2': 58,
+                  'Q3': 60,
+                  'Q4': 62,
+                  'Q5': 64,
+                  'Q6': 66,
+                  'Q7': 68,
+                  'Q8': 70,
+                  'Q9': 72,
+                  'Q10': 74,
+                  'Q11': 76,
+                  'Q12': 78,
+                  'Q13': 80,
+                  'Q14': 82,
+                  }
+
+        result = ascii.read(text, header_start=None, data_start=0,
+                            comment=r'THIS|^\s{12,14}\d{4,6}.*',
+                            names=list(starts.keys()),
+                            col_starts=list(starts.values()),
+                            format='fixed_width', fast_reader=False)
+
+        # int truncates - which is what we want
+        result['MOLWT'] = [int(x/1e4) for x in result['TAG']]
+
+        result['FREQ'].unit = u.MHz
+        result['ERR'].unit = u.MHz
+
+        result['Lab'] = result['MOLWT'] < 0
+        result['MOLWT'] = np.abs(result['MOLWT'])
+        result['MOLWT'].unit = u.Da
+
+        fix_keys = ['GUP']
+        for suf in '':
+            for qn in (f'Q{ii}' for ii in range(1, 15)):
+                qnind = qn+suf
+                fix_keys.append(qnind)
+        for key in fix_keys:
+            if not np.issubdtype(result[key].dtype, np.integer):
+                intcol = np.array(list(map(parse_letternumber, result[key])),
+                                  dtype=int)
+                result[key] = intcol
+
+        result['LGINT'].unit = u.nm**2 * u.MHz
+        result['ELO'].unit = u.cm**(-1)
+
         return result
 
 
@@ -394,9 +509,18 @@ def find(self, st, flags):
 def build_lookup():
 
     result = CDMS.get_species_table()
+
+    # start with the 'molecule' column
     keys = list(result['molecule'][:])  # convert NAME column to list
     values = list(result['tag'][:])  # convert TAG column to list
     dictionary = dict(zip(keys, values))  # make k,v dictionary
+
+    # repeat with the Name column
+    keys = list(result['Name'][:])
+    values = list(result['tag'][:])
+    dictionary2 = dict(zip(keys, values))
+    dictionary.update(dictionary2)
+
     lookuptable = Lookuptable(dictionary)  # apply the class above
 
     return lookuptable
@@ -408,10 +532,52 @@ def retrieve_catfile(url='https://cdms.astro.uni-koeln.de/classic/entries/partit
     """
     response = requests.get(url)
     response.raise_for_status()
-    tbl = ascii.read(response.text, header_start=None, data_start=15, data_end=-5,
-                     names=['tag', 'molecule', '#lines', 'lg(Q(1000))', 'lg(Q(500))', 'lg(Q(300))', 'lg(Q(225))',
-                            'lg(Q(150))', 'lg(Q(75))', 'lg(Q(37.5))', 'lg(Q(18.75))', 'lg(Q(9.375))', 'lg(Q(5.000))',
-                            'lg(Q(2.725))'],
-                     col_starts=(0, 7, 34, 41, 53, 66, 79, 92, 106, 117, 131, 145, 159, 173),
-                     format='fixed_width', delimiter=' ')
+    lines = response.text.split("\n")
+
+    # used to convert '---' to nan
+    def tryfloat(x):
+        try:
+            return float(x)
+        except ValueError:
+            return np.nan
+
+    # the 'fixed width' table reader fails because there are rows that violate fixed width
+    tbl_rows = []
+    for row in lines[15:-5]:
+        split = row.split()
+        tag = int(split[0])
+        molecule_and_lines = row[7:41]
+        molecule = " ".join(molecule_and_lines.split()[:-1])
+        nlines = int(molecule_and_lines.split()[-1])
+        partfunc = map(tryfloat, row[41:].split())
+        partfunc_dict = dict(zip(['lg(Q(1000))', 'lg(Q(500))', 'lg(Q(300))', 'lg(Q(225))',
+                                  'lg(Q(150))', 'lg(Q(75))', 'lg(Q(37.5))', 'lg(Q(18.75))',
+                                  'lg(Q(9.375))', 'lg(Q(5.000))', 'lg(Q(2.725))'], partfunc))
+        tbl_rows.append({'tag': tag,
+                         'molecule': molecule,
+                         '#lines': nlines,
+                         })
+        tbl_rows[-1].update(partfunc_dict)
+    tbl = table.Table(tbl_rows)
+    # tbl = ascii.read(response.text, header_start=None, data_start=15, data_end=-5,
+    #                  names=['tag', 'molecule', '#lines', 'lg(Q(1000))', 'lg(Q(500))', 'lg(Q(300))', 'lg(Q(225))',
+    #                         'lg(Q(150))', 'lg(Q(75))', 'lg(Q(37.5))', 'lg(Q(18.75))', 'lg(Q(9.375))', 'lg(Q(5.000))',
+    #                         'lg(Q(2.725))'],
+    #                  col_starts=(0, 7, 34, 41, 53, 66, 79, 92, 106, 117, 131, 145, 159, 173),
+    #                  format='fixed_width', delimiter=' ')
+    return tbl
+
+
+def retrieve_catfile2(url='https://cdms.astro.uni-koeln.de/classic/predictions/catalog/catdir.html'):
+    """
+    Simple retrieve index function
+    """
+    response = requests.get(url)
+    response.raise_for_status()
+    tbl = ascii.read(response.text, format='html')
+    # delete a junk column (wastes space)
+    del tbl['Catalog']
+
+    # for joining - want same capitalization
+    tbl.rename_column("Tag", "tag")
     return tbl