From 2fde8021bbfa053462756e24fda17a8857bbf3b9 Mon Sep 17 00:00:00 2001 From: Anita Caron Date: Wed, 21 Aug 2024 16:21:52 +0100 Subject: [PATCH] download ontology using stream option and save in chunks for efficiency on large ontology --- util/dashboard_config.py | 26 +++++++++++++------------- util/lib.py | 31 ++++++++++++++++++++++++++++--- 2 files changed, 41 insertions(+), 16 deletions(-) diff --git a/util/dashboard_config.py b/util/dashboard_config.py index d4d337b..4ba59ec 100755 --- a/util/dashboard_config.py +++ b/util/dashboard_config.py @@ -1,17 +1,17 @@ #!/usr/bin/env python3 -import os -import yaml -import click -import logging -import urllib.request import json +import logging +import os - -from lib import DashboardConfig, runcmd, sha256sum, save_yaml, \ - load_yaml, robot_prepare_ontology, get_hours_since, get_base_prefixes, \ - compute_percentage_reused_entities, round_float, create_dashboard_score_badge, \ - create_dashboard_qc_badge +import click +import requests +import yaml +from lib import (DashboardConfig, compute_percentage_reused_entities, + create_dashboard_qc_badge, create_dashboard_score_badge, + download_file, get_base_prefixes, get_hours_since, load_yaml, + robot_prepare_ontology, round_float, runcmd, save_yaml, + sha256sum) logging.basicConfig(level=logging.INFO) @@ -190,11 +190,11 @@ def prepare_ontologies(ontologies, ontology_dir, dashboard_dir, make_parameters, continue if download: - logging.info(f"Downloading {o}...") + logging.info("Downloading %s...", o) try: - urllib.request.urlretrieve(ourl, ont_path) + download_file(ourl, ont_path) except Exception: - logging.exception(f'Failed to download {o} from {ourl}') + logging.exception("Failed to download %s from %s", o, ourl) ont_results['failure'] = 'failed_download' save_yaml(ont_results, ont_results_path) create_dashboard_qc_badge("red", "Failed to download", ont_dashboard_dir) diff --git a/util/lib.py b/util/lib.py index 3272cff..2211ab5 100644 --- a/util/lib.py +++ b/util/lib.py 
def download_file(url, dest_path, retries=3, timeout=1000000):
    """
    Download the ontology from the URL to a local path.

    The response body is streamed and written to disk in chunks so that large
    ontologies are never held in memory as a whole.

    :param url: URL of the ontology to download.
    :param dest_path: Local file path the content is written to. The file is
        truncated at the start of every attempt.
    :param retries: Total number of download attempts. Values below 1 are
        treated as 1 so that a download is always attempted.
    :param timeout: Timeout in seconds passed to ``requests.get``. The default
        is so large that it is effectively unbounded; pass a smaller value
        (or a ``(connect, read)`` tuple) to fail faster on stalled servers.
    :raises requests.exceptions.ChunkedEncodingError: if every attempt was
        interrupted while reading the body.
    :raises Exception: any other error (HTTP error status, connection failure,
        unwritable ``dest_path``, ...) is logged and re-raised immediately
        without retrying.

    On failure a partially written file may remain at ``dest_path``.
    """
    attempts = max(1, retries)
    for attempt in range(1, attempts + 1):
        try:
            # Use the response as a context manager: with stream=True the
            # connection is only released once the response is closed.
            with requests.get(url, stream=True, timeout=timeout) as response:
                response.raise_for_status()

                with open(dest_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=32768):
                        if chunk:  # filter out keep-alive new chunks
                            f.write(chunk)
        except ChunkedEncodingError as e:
            if attempt == attempts:
                # Out of attempts: surface the failure instead of returning
                # as if the download had succeeded.
                logging.error("Giving up on %s after %s attempts", url, attempts)
                raise
            logging.warning("ChunkedEncodingError encountered: %s. Retrying %s/%s...", e, attempt, attempts)
        except Exception as e:
            # Not a transient chunking problem; retrying would not help, and
            # swallowing the error here would keep the loop running forever.
            logging.exception("Failed to download %s: %s", url, e)
            raise
        else:
            logging.info("Downloaded %s to %s", url, dest_path)
            return  # Exit the function if download is successful