Commit

Merge pull request #96 from mmorley0395/oid_try
added an elif statement to catch an edge case where the OID field isn't in the metadata
iandees authored Aug 7, 2023
2 parents 769cea6 + be75e3c commit d330a79
Showing 1 changed file with 73 additions and 37 deletions.
110 changes: 73 additions & 37 deletions esridump/dumper.py
@@ -8,6 +8,7 @@
 from esridump import esri2geojson
 from esridump.errors import EsriDownloadError
 
+
 class EsriDumper(object):
     def __init__(self, url, parent_logger=None,
                  extra_query_args=None, extra_headers=None,
@@ -49,7 +50,8 @@ def _request(self, method, url, **kwargs):
             if params:
                 url += '?' + urlencode(params)
 
-            self._logger.debug("%s %s, args %s", method, url, kwargs.get('params') or kwargs.get('data'))
+            self._logger.debug("%s %s, args %s", method, url,
+                               kwargs.get('params') or kwargs.get('data'))
             return requests.request(method, url, timeout=self._http_timeout, **kwargs)
         except requests.exceptions.SSLError:
             self._logger.warning("Retrying %s without SSL verification", url)
@@ -125,15 +127,16 @@ def can_handle_pagination(self, query_fields):
         })
         headers = self._build_headers()
         query_url = self._build_url('/query')
-        response = self._request('POST', query_url, headers=headers, data=check_args)
+        response = self._request(
+            'POST', query_url, headers=headers, data=check_args)
 
         try:
             data = response.json()
         except:
             self._logger.error("Could not parse response from pagination check %s as JSON:\n\n%s",
-                response.request.url,
-                response.text,
-            )
+                               response.request.url,
+                               response.text,
+                               )
             return False
 
         return data.get('error') and data['error']['message'] != "Failed to execute query."
@@ -144,8 +147,10 @@ def get_metadata(self):
         })
         headers = self._build_headers()
         url = self._build_url()
-        response = self._request('GET', url, params=query_args, headers=headers)
-        metadata_json = self._handle_esri_errors(response, "Could not retrieve layer metadata")
+        response = self._request(
+            'GET', url, params=query_args, headers=headers)
+        metadata_json = self._handle_esri_errors(
+            response, "Could not retrieve layer metadata")
         return metadata_json
 
     def get_feature_count(self):
@@ -156,8 +161,10 @@ def get_feature_count(self):
         })
         headers = self._build_headers()
         url = self._build_url('/query')
-        response = self._request('GET', url, params=query_args, headers=headers)
-        count_json = self._handle_esri_errors(response, "Could not retrieve row count")
+        response = self._request(
+            'GET', url, params=query_args, headers=headers)
+        count_json = self._handle_esri_errors(
+            response, "Could not retrieve row count")
         count = count_json.get('count')
         if count is None:
             raise EsriDownloadError("Server doesn't support returnCountOnly")
@@ -169,6 +176,9 @@ def _find_oid_field_name(self, metadata):
         for f in metadata['fields']:
             if f.get('type') == 'esriFieldTypeOID':
                 oid_field_name = f['name']
+            elif f['name'].lower() == 'objectid':
+                oid_field_name = f['name']
+            else:
                 break
 
         return oid_field_name
@@ -179,14 +189,18 @@ def _get_layer_min_max(self, oid_field_name):
             'f': 'json',
             'outFields': '',
             'outStatistics': json.dumps([
-                dict(statisticType='min', onStatisticField=oid_field_name, outStatisticFieldName='THE_MIN'),
-                dict(statisticType='max', onStatisticField=oid_field_name, outStatisticFieldName='THE_MAX'),
+                dict(statisticType='min', onStatisticField=oid_field_name,
+                     outStatisticFieldName='THE_MIN'),
+                dict(statisticType='max', onStatisticField=oid_field_name,
+                     outStatisticFieldName='THE_MAX'),
             ], separators=(',', ':'))
         })
         headers = self._build_headers()
         url = self._build_url('/query')
-        response = self._request('GET', url, params=query_args, headers=headers)
-        metadata = self._handle_esri_errors(response, "Could not retrieve min/max oid values")
+        response = self._request(
+            'GET', url, params=query_args, headers=headers)
+        metadata = self._handle_esri_errors(
+            response, "Could not retrieve min/max oid values")
 
         # Some servers (specifically version 10.11, it seems) will respond with SQL statements
         # for the attribute names rather than the requested field names, so pick the min and max
@@ -198,8 +212,10 @@ def _get_layer_min_max(self, oid_field_name):
             'f': 'json',
             'outFields': '*',
             'outStatistics': json.dumps([
-                dict(statisticType='min', onStatisticField=oid_field_name, outStatisticFieldName='THE_MIN'),
-                dict(statisticType='max', onStatisticField=oid_field_name, outStatisticFieldName='THE_MAX'),
+                dict(statisticType='min', onStatisticField=oid_field_name,
+                     outStatisticFieldName='THE_MIN'),
+                dict(statisticType='max', onStatisticField=oid_field_name,
+                     outStatisticFieldName='THE_MAX'),
             ], separators=(',', ':'))
         })
         query_args = self._build_query_args({
@@ -214,8 +230,10 @@ def _get_layer_min_max(self, oid_field_name):
         })
         headers = self._build_headers()
         url = self._build_url('/query')
-        response = self._request('GET', url, params=query_args, headers=headers)
-        oid_data = self._handle_esri_errors(response, "Could not check min/max values")
+        response = self._request(
+            'GET', url, params=query_args, headers=headers)
+        oid_data = self._handle_esri_errors(
+            response, "Could not check min/max values")
         if not oid_data or not oid_data.get('objectIds') or min_value not in oid_data['objectIds'] or max_value not in oid_data['objectIds']:
             raise EsriDownloadError('Server returned invalid min/max')
 
@@ -229,8 +247,10 @@ def _get_layer_oids(self):
         })
         url = self._build_url('/query')
         headers = self._build_headers()
-        response = self._request('GET', url, params=query_args, headers=headers)
-        oid_data = self._handle_esri_errors(response, "Could not retrieve object IDs")
+        response = self._request(
+            'GET', url, params=query_args, headers=headers)
+        oid_data = self._handle_esri_errors(
+            response, "Could not retrieve object IDs")
         oids = oid_data.get('objectIds')
         if not oids:
             raise EsriDownloadError("Server doesn't support returnIdsOnly")
@@ -250,8 +270,10 @@ def _fetch_bounded_features(self, envelope, outSR):
         })
         headers = self._build_headers()
         url = self._build_url('/query')
-        response = self._request('GET', url, params=query_args, headers=headers)
-        features = self._handle_esri_errors(response, "Could not retrieve a section of features")
+        response = self._request(
+            'GET', url, params=query_args, headers=headers)
+        features = self._handle_esri_errors(
+            response, "Could not retrieve a section of features")
         return features['features']
 
     def _split_envelope(self, envelope):
@@ -288,7 +310,8 @@ def _scrape_an_envelope(self, envelope, outSR, max_records):
         features = self._fetch_bounded_features(envelope, outSR)
 
         if len(features) >= max_records:
-            self._logger.info("Retrieved exactly the maximum record count. Splitting this box and retrieving the children.")
+            self._logger.info(
+                "Retrieved exactly the maximum record count. Splitting this box and retrieving the children.")
 
             envelopes = self._split_envelope(envelope)
 
@@ -302,7 +325,8 @@ def _scrape_an_envelope(self, envelope, outSR, max_records):
     def __iter__(self):
         query_fields = self._fields
         metadata = self.get_metadata()
-        page_size = min(self._max_page_size, metadata.get('maxRecordCount', 500))
+        page_size = min(self._max_page_size,
+                        metadata.get('maxRecordCount', 500))
         geometry_type = metadata.get('geometryType')
 
         row_count = None
@@ -319,15 +343,16 @@ def __iter__(self):
 
         page_args = []
 
-        if not self._paginate_oid and row_count is not None and (metadata.get('supportsPagination') or \
-                (metadata.get('advancedQueryCapabilities') and metadata['advancedQueryCapabilities']['supportsPagination'])):
+        if not self._paginate_oid and row_count is not None and (metadata.get('supportsPagination') or
+                                                                 (metadata.get('advancedQueryCapabilities') and metadata['advancedQueryCapabilities']['supportsPagination'])):
             # If the layer supports pagination, we can use resultOffset/resultRecordCount to paginate
 
             # There's a bug where some servers won't handle these queries in combination with a list of
             # fields specified. We'll make a single, 1 row query here to check if the server supports this
             # and switch to querying for all fields if specifying the fields fails.
             if query_fields and not self.can_handle_pagination(query_fields):
-                self._logger.info("Source does not support pagination with fields specified, so querying for all fields.")
+                self._logger.info(
+                    "Source does not support pagination with fields specified, so querying for all fields.")
                 query_fields = None
 
             for offset in range(self._startWith, row_count, page_size):
@@ -342,15 +367,17 @@ def __iter__(self):
                     'f': 'json',
                 })
                 page_args.append(query_args)
-            self._logger.info("Built %s requests using resultOffset method", len(page_args))
+            self._logger.info(
+                "Built %s requests using resultOffset method", len(page_args))
         else:
             # If not, we can still use the `where` argument to paginate
 
             use_oids = True
             oid_field_name = self._find_oid_field_name(metadata)
 
             if not oid_field_name:
-                raise EsriDownloadError("Could not find object ID field name for deduplication")
+                raise EsriDownloadError(
+                    "Could not find object ID field name for deduplication")
 
             if metadata.get('supportsStatistics'):
                 # If the layer supports statistics, we can request maximum and minimum object ID
@@ -374,13 +401,15 @@ def __iter__(self):
                             'f': 'json',
                         })
                         page_args.append(query_args)
-                    self._logger.info("Built {} requests using OID where clause method".format(len(page_args)))
+                    self._logger.info(
+                        "Built {} requests using OID where clause method".format(len(page_args)))
 
                     # If we reach this point we don't need to fall through to enumerating all object IDs
                     # because the statistics method worked
                     use_oids = False
                 except EsriDownloadError:
-                    self._logger.exception("Finding max/min from statistics failed. Trying OID enumeration.")
+                    self._logger.exception(
+                        "Finding max/min from statistics failed. Trying OID enumeration.")
 
             if use_oids:
                 # If the layer does not support statistics, we can request
@@ -408,7 +437,8 @@ def __iter__(self):
                             'f': 'json',
                         })
                         page_args.append(query_args)
-                    self._logger.info("Built %s requests using OID enumeration method", len(page_args))
+                    self._logger.info(
+                        "Built %s requests using OID enumeration method", len(page_args))
                 except EsriDownloadError:
                     self._logger.info("Falling back to geo queries")
                     # Use geospatial queries when none of the ID-based methods will work
@@ -439,19 +469,24 @@ def __iter__(self):
                     # pause every number of "requests_to_pause", that increase the probability for server response
                     if query_index % self._requests_to_pause == 0:
                         time.sleep(self._pause_seconds)
-                        self._logger.info("pause for %s seconds", self._pause_seconds)
-                    response = self._request('POST', query_url, headers=headers, data=query_args)
-                    data = self._handle_esri_errors(response, "Could not retrieve this chunk of objects")
+                        self._logger.info(
+                            "pause for %s seconds", self._pause_seconds)
+                    response = self._request(
+                        'POST', query_url, headers=headers, data=query_args)
+                    data = self._handle_esri_errors(
+                        response, "Could not retrieve this chunk of objects")
                     # reset the exception state.
                     download_exception = None
                     # get out of retry loop, as the request succeeded
                     break
                 except socket.timeout as e:
-                    raise EsriDownloadError("Timeout when connecting to URL", e)
+                    raise EsriDownloadError(
+                        "Timeout when connecting to URL", e)
                 except ValueError as e:
                     raise EsriDownloadError("Could not parse JSON", e)
                 except Exception as e:
-                    download_exception = EsriDownloadError("Could not connect to URL", e)
+                    download_exception = EsriDownloadError(
+                        "Could not connect to URL", e)
                     # increase the pause time every retry, to increase the probability of fetching data successfully
                     time.sleep(self._pause_seconds * (retry + 1))
                     self._logger.info("retry pause {0}".format(retry))
@@ -461,7 +496,8 @@ def __iter__(self):
 
             error = data.get('error')
             if error:
-                raise EsriDownloadError("Problem querying ESRI dataset with args {}. Server said: {}".format(query_args, error['message']))
+                raise EsriDownloadError("Problem querying ESRI dataset with args {}. Server said: {}".format(
+                    query_args, error['message']))
 
             features = data.get('features')
 
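The substantive change in this commit (the rest is line-wrapping) is the new elif branch in _find_oid_field_name. Previously the loop only matched a field whose type was esriFieldTypeOID, so on layers whose metadata omits that type the function returned nothing and the OID-based pagination path raised "Could not find object ID field name for deduplication". The new branch also accepts a field literally named objectid, case-insensitively. Below is a minimal standalone sketch of the committed loop: the sample_metadata dict is hypothetical and trimmed to the two keys the loop reads (real ArcGIS REST metadata carries many more keys per field), and the None initialization is assumed from the unchanged line just above the hunk.

# Standalone sketch of the loop committed above.
def find_oid_field_name(metadata):
    oid_field_name = None
    for f in metadata['fields']:
        if f.get('type') == 'esriFieldTypeOID':
            # Normal case: the field is explicitly typed as the OID field.
            oid_field_name = f['name']
        elif f['name'].lower() == 'objectid':
            # Edge case added in this commit: the OID field is not typed
            # esriFieldTypeOID but is named OBJECTID/objectid.
            oid_field_name = f['name']
        else:
            # Stop at the first field that matches neither test.
            break
    return oid_field_name

# Hypothetical metadata where the OID field lacks the esriFieldTypeOID type.
sample_metadata = {
    'fields': [
        {'name': 'OBJECTID', 'type': 'esriFieldTypeInteger'},
        {'name': 'STREET', 'type': 'esriFieldTypeString'},
    ]
}

print(find_oid_field_name(sample_metadata))  # prints: OBJECTID

Note the else: break — scanning stops at the first field that matches neither test, so a match is only found when the OID field sits at the head of the field list, which is where ArcGIS servers conventionally place it.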
