From ee71bc4ab0bf964588bdddc574c08c1ca4517571 Mon Sep 17 00:00:00 2001
From: Briar
Date: Tue, 10 Dec 2024 17:47:15 -0500
Subject: [PATCH] Create test dataset

---
 .../gdacs/config/hdx_dataset_static.yaml      | 14 +--
 .../gdacs/config/project_configuration.yaml   | 35 ++++++++
 src/hdx/scraper/gdacs/gdacs.py                | 86 ++++++++++++++-----
 3 files changed, 107 insertions(+), 28 deletions(-)

diff --git a/src/hdx/scraper/gdacs/config/hdx_dataset_static.yaml b/src/hdx/scraper/gdacs/config/hdx_dataset_static.yaml
index b449a0f..f70ea89 100755
--- a/src/hdx/scraper/gdacs/config/hdx_dataset_static.yaml
+++ b/src/hdx/scraper/gdacs/config/hdx_dataset_static.yaml
@@ -1,10 +1,12 @@
 license_id: cc-by
 methodology: Other
-caveats: None
-dataset_source: Proper name of source organization
+methodology_other: https://gdacs.org/Knowledge/overview.aspx
+caveats: While we try everything to ensure accuracy, this information is purely indicative and should not be used for any decision making without alternate sources of information. The JRC is not responsible for any damage or loss resulting from use of the information presented on this website.
+dataset_source: European Union
 package_creator: HDX Data Systems Team
 private: False
-maintainer: dataset maintainer HDX ID
-owner_org: dataset organization HDX ID
-data_update_frequency: update frequency
-notes: dataset notes
+maintainer: aa13de36-28c5-47a7-8d0b-6d7c754ba8c8
+owner_org: f27b8618-52b9-4827-9440-eb65a1f66d41
+data_update_frequency: 1
+notes: Disaster alerts in the past 4 days. European Union, 2024
+subnational: "1"
diff --git a/src/hdx/scraper/gdacs/config/project_configuration.yaml b/src/hdx/scraper/gdacs/config/project_configuration.yaml
index f150aa2..3e91024 100755
--- a/src/hdx/scraper/gdacs/config/project_configuration.yaml
+++ b/src/hdx/scraper/gdacs/config/project_configuration.yaml
@@ -1,3 +1,38 @@
 # Collector specific configuration
 
 base_url: "https://www.gdacs.org/XML/RSS.xml"
+
+dataset_name: "gdacs-rss-information"
+
+dataset_title: "GDACS RSS information"
+
+disaster_conversion:
+  "DR": "Drought"
+  "EQ": "Earthquake"
+  "FL": "Flood"
+  "TC": "Tropical Cyclone"
+  "VO": "Volcano"
+  "WF": "Wildfire"
+
+tags:
+  - "cyclones-hurricanes-typhoons"
+  - "earthquake-tsunami"
+  - "flooding"
+  - "natural disasters"
+
+hxl_tags:
+  "id": "#event+id"
+  "iso3": "#country+code"
+  "country": "#country+name"
+  "title": "#meta+title"
+  "summary": "#meta+summary"
+  "event_type": "#event+type"
+  "severity_unit": "#event+severity+unit"
+  "severity_value": "#event+severity+value"
+  "source": "#meta+source"
+  "from_date": "#date+start"
+  "to_date": "#date+end"
+  "link": "#meta+link"
+  "geo_lat": "#geo+lat"
+  "geo_long": "#geo+lon"
+  "gdacs_bbox": "#geo+bbox"
diff --git a/src/hdx/scraper/gdacs/gdacs.py b/src/hdx/scraper/gdacs/gdacs.py
index a755f08..ebe2f2c 100755
--- a/src/hdx/scraper/gdacs/gdacs.py
+++ b/src/hdx/scraper/gdacs/gdacs.py
@@ -7,7 +7,7 @@
 from feedparser import parse
 from hdx.api.configuration import Configuration
 from hdx.data.dataset import Dataset
-from hdx.data.hdxobject import HDXError
+from hdx.utilities.dateparse import parse_date
 from hdx.utilities.retriever import Retrieve
 
 logger = logging.getLogger(__name__)
@@ -17,22 +17,61 @@ class GDACS:
     def __init__(self, configuration: Configuration, retriever: Retrieve):
         self._configuration = configuration
         self._retriever = retriever
-        self.data = {}
+        self.data = []
+        self.dates = []
+        self.countries = set()
 
     def get_data(self) -> None:
+        """Parse the GDACS RSS feed into rows, dates and country codes."""
         text = self._retriever.download_text(self._configuration["base_url"])
-        feed = parse(text)
+        entries = parse(text).entries
+        for entry in entries:
+            iso3 = entry.gdacs_iso3
+            if iso3:
+                self.countries.add(iso3)
+            from_date = entry.gdacs_fromdate
+            to_date = entry.gdacs_todate
+            self.dates.append(parse_date(from_date))
+            self.dates.append(parse_date(to_date))
+            event_type = entry.gdacs_eventtype
+            event_type = self._configuration["disaster_conversion"].get(
+                event_type, event_type
+            )
+            self.data.append(
+                {
+                    "id": entry.id,
+                    "iso3": iso3,
+                    "country": entry.gdacs_country,
+                    "title": entry.title,
+                    "summary": entry.summary,
+                    "event_type": event_type,
+                    "severity_unit": entry.gdacs_severity["unit"],
+                    "severity_value": entry.gdacs_severity["value"],
+                    "source": entry.gdacs_description,
+                    "from_date": from_date,
+                    "to_date": to_date,
+                    "link": entry.link,
+                    "geo_lat": entry.geo_lat,
+                    "geo_long": entry.geo_long,
+                    "gdacs_bbox": entry.gdacs_bbox,
+                }
+            )
         return
 
     def generate_dataset(self) -> Optional[Dataset]:
-        # To be generated
-        dataset_name = None
-        dataset_title = None
-        dataset_time_period = None
-        dataset_tags = None
-        dataset_country_iso3 = None
-
-        # Dataset info
+        """Build the HDX dataset; return None when no entries were read."""
+        if not self.data:
+            # min()/max() and self.data[0] below fail on an empty feed
+            logger.error("No GDACS entries found, skipping dataset")
+            return None
+
+        dataset_name = self._configuration["dataset_name"]
+        dataset_title = self._configuration["dataset_title"]
+        dataset_time_start = min(self.dates)
+        dataset_time_end = max(self.dates)
+        dataset_tags = self._configuration["tags"]
+        dataset_country_iso3s = sorted(self.countries)
+
         dataset = Dataset(
             {
                 "name": dataset_name,
@@ -40,18 +79,21 @@ def generate_dataset(self) -> Optional[Dataset]:
             }
         )
 
-        dataset.set_time_period(dataset_time_period)
+        dataset.set_time_period(dataset_time_start, dataset_time_end)
         dataset.add_tags(dataset_tags)
-        # Only if needed
-        dataset.set_subnational(True)
-        try:
-            dataset.add_country_location(dataset_country_iso3)
-        except HDXError:
-            logger.error(
-                f"Couldn't find country {dataset_country_iso3}, skipping"
-            )
-            return
+        dataset.add_country_locations(dataset_country_iso3s)
 
-        # Add resources here
+        dataset.generate_resource_from_iterable(
+            headers=list(self.data[0].keys()),
+            iterable=self.data,
+            hxltags=self._configuration["hxl_tags"],
+            folder=self._retriever.temp_dir,
+            filename="gdacs_rss_information.csv",
+            resourcedata={
+                "name": "gdacs_rss_information.csv",
+                "description": " ",
+            },
+            encoding="utf-8-sig",
+        )
 
         return dataset