Commit

Merge pull request #96 from mmorley0395/oid_try
added an elif statement to catch an edge case where the OID field isn't in the metadata
iandees authored Aug 7, 2023
2 parents 769cea6 + be75e3c commit d330a79
Showing 1 changed file with 73 additions and 37 deletions.
110 changes: 73 additions & 37 deletions esridump/dumper.py
@@ -8,6 +8,7 @@
 from esridump import esri2geojson
 from esridump.errors import EsriDownloadError
 
+
 class EsriDumper(object):
     def __init__(self, url, parent_logger=None,
                  extra_query_args=None, extra_headers=None,
@@ -49,7 +50,8 @@ def _request(self, method, url, **kwargs):
             if params:
                 url += '?' + urlencode(params)
 
-            self._logger.debug("%s %s, args %s", method, url, kwargs.get('params') or kwargs.get('data'))
+            self._logger.debug("%s %s, args %s", method, url,
+                               kwargs.get('params') or kwargs.get('data'))
             return requests.request(method, url, timeout=self._http_timeout, **kwargs)
         except requests.exceptions.SSLError:
             self._logger.warning("Retrying %s without SSL verification", url)
@@ -125,15 +127,16 @@ def can_handle_pagination(self, query_fields):
         })
         headers = self._build_headers()
         query_url = self._build_url('/query')
-        response = self._request('POST', query_url, headers=headers, data=check_args)
+        response = self._request(
+            'POST', query_url, headers=headers, data=check_args)
 
         try:
             data = response.json()
         except:
             self._logger.error("Could not parse response from pagination check %s as JSON:\n\n%s",
-                response.request.url,
-                response.text,
-            )
+                               response.request.url,
+                               response.text,
+                               )
             return False
 
         return data.get('error') and data['error']['message'] != "Failed to execute query."
@@ -144,8 +147,10 @@ def get_metadata(self):
         })
         headers = self._build_headers()
         url = self._build_url()
-        response = self._request('GET', url, params=query_args, headers=headers)
-        metadata_json = self._handle_esri_errors(response, "Could not retrieve layer metadata")
+        response = self._request(
+            'GET', url, params=query_args, headers=headers)
+        metadata_json = self._handle_esri_errors(
+            response, "Could not retrieve layer metadata")
         return metadata_json
 
     def get_feature_count(self):
@@ -156,8 +161,10 @@ def get_feature_count(self):
         })
         headers = self._build_headers()
         url = self._build_url('/query')
-        response = self._request('GET', url, params=query_args, headers=headers)
-        count_json = self._handle_esri_errors(response, "Could not retrieve row count")
+        response = self._request(
+            'GET', url, params=query_args, headers=headers)
+        count_json = self._handle_esri_errors(
+            response, "Could not retrieve row count")
         count = count_json.get('count')
         if count is None:
             raise EsriDownloadError("Server doesn't support returnCountOnly")
@@ -169,6 +176,9 @@ def _find_oid_field_name(self, metadata):
         for f in metadata['fields']:
             if f.get('type') == 'esriFieldTypeOID':
                 oid_field_name = f['name']
+            elif f['name'].lower() == 'objectid':
+                oid_field_name = f['name']
+            else:
                 break
 
         return oid_field_name
@@ -179,14 +189,18 @@ def _get_layer_min_max(self, oid_field_name):
             'f': 'json',
             'outFields': '',
             'outStatistics': json.dumps([
-                dict(statisticType='min', onStatisticField=oid_field_name, outStatisticFieldName='THE_MIN'),
-                dict(statisticType='max', onStatisticField=oid_field_name, outStatisticFieldName='THE_MAX'),
+                dict(statisticType='min', onStatisticField=oid_field_name,
+                     outStatisticFieldName='THE_MIN'),
+                dict(statisticType='max', onStatisticField=oid_field_name,
+                     outStatisticFieldName='THE_MAX'),
             ], separators=(',', ':'))
         })
         headers = self._build_headers()
         url = self._build_url('/query')
-        response = self._request('GET', url, params=query_args, headers=headers)
-        metadata = self._handle_esri_errors(response, "Could not retrieve min/max oid values")
+        response = self._request(
+            'GET', url, params=query_args, headers=headers)
+        metadata = self._handle_esri_errors(
+            response, "Could not retrieve min/max oid values")
 
         # Some servers (specifically version 10.11, it seems) will respond with SQL statements
         # for the attribute names rather than the requested field names, so pick the min and max
@@ -198,8 +212,10 @@ def _get_layer_min_max(self, oid_field_name):
             'f': 'json',
             'outFields': '*',
             'outStatistics': json.dumps([
-                dict(statisticType='min', onStatisticField=oid_field_name, outStatisticFieldName='THE_MIN'),
-                dict(statisticType='max', onStatisticField=oid_field_name, outStatisticFieldName='THE_MAX'),
+                dict(statisticType='min', onStatisticField=oid_field_name,
+                     outStatisticFieldName='THE_MIN'),
+                dict(statisticType='max', onStatisticField=oid_field_name,
+                     outStatisticFieldName='THE_MAX'),
             ], separators=(',', ':'))
         })
         query_args = self._build_query_args({
@@ -214,8 +230,10 @@ def _get_layer_min_max(self, oid_field_name):
         })
         headers = self._build_headers()
         url = self._build_url('/query')
-        response = self._request('GET', url, params=query_args, headers=headers)
-        oid_data = self._handle_esri_errors(response, "Could not check min/max values")
+        response = self._request(
+            'GET', url, params=query_args, headers=headers)
+        oid_data = self._handle_esri_errors(
+            response, "Could not check min/max values")
         if not oid_data or not oid_data.get('objectIds') or min_value not in oid_data['objectIds'] or max_value not in oid_data['objectIds']:
             raise EsriDownloadError('Server returned invalid min/max')
 
@@ -229,8 +247,10 @@ def _get_layer_oids(self):
         })
         url = self._build_url('/query')
         headers = self._build_headers()
-        response = self._request('GET', url, params=query_args, headers=headers)
-        oid_data = self._handle_esri_errors(response, "Could not retrieve object IDs")
+        response = self._request(
+            'GET', url, params=query_args, headers=headers)
+        oid_data = self._handle_esri_errors(
+            response, "Could not retrieve object IDs")
         oids = oid_data.get('objectIds')
         if not oids:
             raise EsriDownloadError("Server doesn't support returnIdsOnly")
@@ -250,8 +270,10 @@ def _fetch_bounded_features(self, envelope, outSR):
         })
         headers = self._build_headers()
         url = self._build_url('/query')
-        response = self._request('GET', url, params=query_args, headers=headers)
-        features = self._handle_esri_errors(response, "Could not retrieve a section of features")
+        response = self._request(
+            'GET', url, params=query_args, headers=headers)
+        features = self._handle_esri_errors(
+            response, "Could not retrieve a section of features")
         return features['features']
 
     def _split_envelope(self, envelope):
@@ -288,7 +310,8 @@ def _scrape_an_envelope(self, envelope, outSR, max_records):
         features = self._fetch_bounded_features(envelope, outSR)
 
         if len(features) >= max_records:
-            self._logger.info("Retrieved exactly the maximum record count. Splitting this box and retrieving the children.")
+            self._logger.info(
+                "Retrieved exactly the maximum record count. Splitting this box and retrieving the children.")
 
             envelopes = self._split_envelope(envelope)
 
@@ -302,7 +325,8 @@ def _scrape_an_envelope(self, envelope, outSR, max_records):
     def __iter__(self):
         query_fields = self._fields
         metadata = self.get_metadata()
-        page_size = min(self._max_page_size, metadata.get('maxRecordCount', 500))
+        page_size = min(self._max_page_size,
+                        metadata.get('maxRecordCount', 500))
         geometry_type = metadata.get('geometryType')
 
         row_count = None
@@ -319,15 +343,16 @@ def __iter__(self):
 
         page_args = []
 
-        if not self._paginate_oid and row_count is not None and (metadata.get('supportsPagination') or \
-                (metadata.get('advancedQueryCapabilities') and metadata['advancedQueryCapabilities']['supportsPagination'])):
+        if not self._paginate_oid and row_count is not None and (metadata.get('supportsPagination') or
+                                                                 (metadata.get('advancedQueryCapabilities') and metadata['advancedQueryCapabilities']['supportsPagination'])):
             # If the layer supports pagination, we can use resultOffset/resultRecordCount to paginate
 
             # There's a bug where some servers won't handle these queries in combination with a list of
             # fields specified. We'll make a single, 1 row query here to check if the server supports this
             # and switch to querying for all fields if specifying the fields fails.
             if query_fields and not self.can_handle_pagination(query_fields):
-                self._logger.info("Source does not support pagination with fields specified, so querying for all fields.")
+                self._logger.info(
+                    "Source does not support pagination with fields specified, so querying for all fields.")
                 query_fields = None
 
             for offset in range(self._startWith, row_count, page_size):
@@ -342,15 +367,17 @@ def __iter__(self):
                     'f': 'json',
                 })
                 page_args.append(query_args)
-            self._logger.info("Built %s requests using resultOffset method", len(page_args))
+            self._logger.info(
+                "Built %s requests using resultOffset method", len(page_args))
         else:
             # If not, we can still use the `where` argument to paginate
 
             use_oids = True
             oid_field_name = self._find_oid_field_name(metadata)
 
             if not oid_field_name:
-                raise EsriDownloadError("Could not find object ID field name for deduplication")
+                raise EsriDownloadError(
+                    "Could not find object ID field name for deduplication")
 
             if metadata.get('supportsStatistics'):
                 # If the layer supports statistics, we can request maximum and minimum object ID
@@ -374,13 +401,15 @@ def __iter__(self):
                             'f': 'json',
                         })
                         page_args.append(query_args)
-                    self._logger.info("Built {} requests using OID where clause method".format(len(page_args)))
+                    self._logger.info(
+                        "Built {} requests using OID where clause method".format(len(page_args)))
 
                     # If we reach this point we don't need to fall through to enumerating all object IDs
                     # because the statistics method worked
                     use_oids = False
                 except EsriDownloadError:
-                    self._logger.exception("Finding max/min from statistics failed. Trying OID enumeration.")
+                    self._logger.exception(
+                        "Finding max/min from statistics failed. Trying OID enumeration.")
 
             if use_oids:
                 # If the layer does not support statistics, we can request
@@ -408,7 +437,8 @@ def __iter__(self):
                             'f': 'json',
                         })
                         page_args.append(query_args)
-                    self._logger.info("Built %s requests using OID enumeration method", len(page_args))
+                    self._logger.info(
+                        "Built %s requests using OID enumeration method", len(page_args))
                 except EsriDownloadError:
                     self._logger.info("Falling back to geo queries")
                     # Use geospatial queries when none of the ID-based methods will work
@@ -439,19 +469,24 @@ def __iter__(self):
                     # pause every number of "requests_to_pause", that increase the probability for server response
                     if query_index % self._requests_to_pause == 0:
                         time.sleep(self._pause_seconds)
-                        self._logger.info("pause for %s seconds", self._pause_seconds)
-                    response = self._request('POST', query_url, headers=headers, data=query_args)
-                    data = self._handle_esri_errors(response, "Could not retrieve this chunk of objects")
+                        self._logger.info(
+                            "pause for %s seconds", self._pause_seconds)
+                    response = self._request(
+                        'POST', query_url, headers=headers, data=query_args)
+                    data = self._handle_esri_errors(
+                        response, "Could not retrieve this chunk of objects")
                     # reset the exception state.
                     download_exception = None
                     # get out of retry loop, as the request succeeded
                     break
                 except socket.timeout as e:
-                    raise EsriDownloadError("Timeout when connecting to URL", e)
+                    raise EsriDownloadError(
+                        "Timeout when connecting to URL", e)
                 except ValueError as e:
                     raise EsriDownloadError("Could not parse JSON", e)
                 except Exception as e:
-                    download_exception = EsriDownloadError("Could not connect to URL", e)
+                    download_exception = EsriDownloadError(
+                        "Could not connect to URL", e)
                     # increase the pause time every retry, to increase the probability of fetching data successfully
                     time.sleep(self._pause_seconds * (retry + 1))
                     self._logger.info("retry pause {0}".format(retry))
@@ -461,7 +496,8 @@ def __iter__(self):
 
             error = data.get('error')
             if error:
-                raise EsriDownloadError("Problem querying ESRI dataset with args {}. Server said: {}".format(query_args, error['message']))
+                raise EsriDownloadError("Problem querying ESRI dataset with args {}. Server said: {}".format(
+                    query_args, error['message']))
 
             features = data.get('features')
 
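The substantive change in this commit (the rest is line-wrapping) is the new elif branch in _find_oid_field_name. Previously the loop only matched a field whose type was esriFieldTypeOID, so on layers whose metadata omits that type the function returned nothing and the OID-based pagination path raised "Could not find object ID field name for deduplication". The new branch also accepts a field literally named objectid, case-insensitively. Below is a minimal standalone sketch of the committed loop: the sample_metadata dict is hypothetical and trimmed to the two keys the loop reads (real ArcGIS REST metadata carries many more keys per field), and the None initialization is assumed from the unchanged line just above the hunk.

# Standalone sketch of the loop committed above.
def find_oid_field_name(metadata):
    oid_field_name = None
    for f in metadata['fields']:
        if f.get('type') == 'esriFieldTypeOID':
            # Normal case: the field is explicitly typed as the OID field.
            oid_field_name = f['name']
        elif f['name'].lower() == 'objectid':
            # Edge case added in this commit: the OID field is not typed
            # esriFieldTypeOID but is named OBJECTID/objectid.
            oid_field_name = f['name']
        else:
            # Stop at the first field that matches neither test.
            break
    return oid_field_name

# Hypothetical metadata where the OID field lacks the esriFieldTypeOID type.
sample_metadata = {
    'fields': [
        {'name': 'OBJECTID', 'type': 'esriFieldTypeInteger'},
        {'name': 'STREET', 'type': 'esriFieldTypeString'},
    ]
}

print(find_oid_field_name(sample_metadata))  # prints: OBJECTID

Note the else: break — scanning stops at the first field that matches neither test, so a match is only found when the OID field sits at the head of the field list, which is where ArcGIS servers conventionally place it.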
