Skip to content

Commit

Permalink
Create test dataset
Browse files Browse the repository at this point in the history
  • Loading branch information
b-j-mills committed Dec 10, 2024
1 parent 7809594 commit ee71bc4
Show file tree
Hide file tree
Showing 3 changed files with 100 additions and 28 deletions.
14 changes: 8 additions & 6 deletions src/hdx/scraper/gdacs/config/hdx_dataset_static.yaml
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
# Static HDX dataset metadata for the GDACS collector.
# Duplicate placeholder keys from the template were removed; only the
# committed production values are kept (YAML duplicate keys are invalid
# and most parsers silently keep the last occurrence).
license_id: cc-by
methodology: Other
methodology_other: https://gdacs.org/Knowledge/overview.aspx
caveats: While we try everything to ensure accuracy, this information is purely indicative and should not be used for any decision making without alternate sources of information. The JRC is not responsible for any damage or loss resulting from use of the information presented on this website.
dataset_source: European Union
package_creator: HDX Data Systems Team
private: False
maintainer: aa13de36-28c5-47a7-8d0b-6d7c754ba8c8
owner_org: f27b8618-52b9-4827-9440-eb65a1f66d41
data_update_frequency: 1
notes: Disaster alerts in the past 4 days. European Union, 2024
subnational: "1"
35 changes: 35 additions & 0 deletions src/hdx/scraper/gdacs/config/project_configuration.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,38 @@
# Collector specific configuration

base_url: "https://www.gdacs.org/XML/RSS.xml"

dataset_name: "gdacs-rss-information"

dataset_title: "GDACS RSS information"

# Maps GDACS two-letter event codes to human-readable disaster names.
# NOTE: key renamed from "disaster_conversion" to "disaster_conversions"
# to match the lookup in gdacs.py (configuration["disaster_conversions"]),
# which otherwise raises KeyError.
disaster_conversions:
  "DR": "Drought"
  "EQ": "Earthquake"
  "FL": "Flood"
  "TC": "Tropical Cyclone"
  "VO": "Volcano"
  "WF": "Wildfire"

tags:
  - "cyclones-hurricanes-typhoons"
  - "earthquake-tsunami"
  - "flooding"
  - "natural disasters"

# HXL hashtags applied to the generated CSV resource, keyed by column name.
hxl_tags:
  "id": "#event+id"
  "iso3": "#country+code"
  "country": "#country+name"
  "title": "#meta+title"
  "summary": "#meta+summary"
  "event_type": "#event+type"
  "severity_unit": "#event+severity+unit"
  "severity_value": "#event+severity+value"
  "source": "#meta+source"
  "from_date": "#date+start"
  "to_date": "#date+end"
  "link": "#meta+link"
  "geo_lat": "#geo+lat"
  "geo_long": "#geo+lon"
  "gdacs_bbox": "#geo+bbox"
79 changes: 57 additions & 22 deletions src/hdx/scraper/gdacs/gdacs.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from feedparser import parse
from hdx.api.configuration import Configuration
from hdx.data.dataset import Dataset
from hdx.data.hdxobject import HDXError
from hdx.utilities.dateparse import parse_date
from hdx.utilities.retriever import Retrieve

logger = logging.getLogger(__name__)
Expand All @@ -17,41 +17,76 @@ class GDACS:
def __init__(self, configuration: Configuration, retriever: Retrieve):
    """Store configuration/retriever handles and initialize collection state.

    Args:
        configuration: HDX configuration holding collector settings
            (base_url, disaster_conversions, tags, hxl_tags, ...).
        retriever: Download helper used to fetch the GDACS RSS feed.
    """
    self._configuration = configuration
    self._retriever = retriever
    # Rows parsed from the RSS feed, one dict per alert entry.
    # (Removed dead `self.data = {}` that was immediately overwritten.)
    self.data = []
    # All parsed from/to dates, used to derive the dataset time period.
    self.dates = []
    # ISO3 codes of all countries mentioned in feed entries.
    self.countries = set()

def get_data(self) -> None:
    """Download the GDACS RSS feed and accumulate rows, dates and countries.

    Populates:
        self.data: one row dict per feed entry.
        self.dates: parsed from/to dates of every entry.
        self.countries: ISO3 codes seen across entries.
    """
    text = self._retriever.download_text(self._configuration["base_url"])
    # Parse the feed exactly once (a dead duplicate `parse(text)` call
    # was removed).
    entries = parse(text).entries
    for entry in entries:
        iso3 = entry.gdacs_iso3
        if iso3:  # skip entries with a blank/missing country code
            self.countries.add(iso3)
        from_date = entry.gdacs_fromdate
        to_date = entry.gdacs_todate
        self.dates.append(parse_date(from_date))
        self.dates.append(parse_date(to_date))
        # Map GDACS two-letter event codes to readable names; unknown
        # codes fall back to the raw code.
        # NOTE(review): requires the config key "disaster_conversions" —
        # confirm project_configuration.yaml matches.
        event_type = entry.gdacs_eventtype
        event_type = self._configuration["disaster_conversions"].get(
            event_type, event_type
        )
        self.data.append(
            {
                "id": entry.id,
                "iso3": iso3,
                "country": entry.gdacs_country,
                "title": entry.title,
                "summary": entry.summary,
                "event_type": event_type,
                "severity_unit": entry.gdacs_severity["unit"],
                "severity_value": entry.gdacs_severity["value"],
                "source": entry.gdacs_description,
                "from_date": from_date,
                "to_date": to_date,
                "link": entry.link,
                "geo_lat": entry.geo_lat,
                "geo_long": entry.geo_long,
                "gdacs_bbox": entry.gdacs_bbox,
            }
        )

def generate_dataset(self) -> Optional[Dataset]:
    """Build the HDX dataset with one CSV resource from collected data.

    Requires get_data() to have been called first so that self.data,
    self.dates and self.countries are populated.

    Returns:
        The populated Dataset, or None if no data was collected.

    Changes: removed dead placeholder assignments (dataset_name = None,
    etc.), a duplicate set_time_period(None) call, and a stale
    try/except block that referenced the removed HDXError import and an
    undefined variable and unconditionally returned None.
    """
    # Guard against generating an empty dataset (and IndexError on
    # self.data[0] below).
    if not self.data:
        logger.error("No data collected, cannot generate dataset")
        return None

    # Dataset info
    dataset_name = self._configuration["dataset_name"]
    dataset_title = self._configuration["dataset_title"]
    dataset_time_start = min(self.dates)
    dataset_time_end = max(self.dates)
    dataset_tags = self._configuration["tags"]
    dataset_country_iso3s = self.countries

    dataset = Dataset(
        {
            "name": dataset_name,
            "title": dataset_title,
        }
    )

    dataset.set_time_period(dataset_time_start, dataset_time_end)
    dataset.add_tags(dataset_tags)
    dataset.set_subnational(True)
    dataset.add_country_locations(dataset_country_iso3s)

    # Single CSV resource carrying all feed rows with HXL hashtags.
    dataset.generate_resource_from_iterable(
        headers=list(self.data[0].keys()),
        iterable=self.data,
        hxltags=self._configuration["hxl_tags"],
        folder=self._retriever.temp_dir,
        filename="gdacs_rss_information.csv",
        resourcedata={
            "name": "gdacs_rss_information.csv",
            "description": " ",
        },
        encoding="utf-8-sig",
    )

    return dataset

0 comments on commit ee71bc4

Please sign in to comment.