From b4aac2f7180324b276a5aa30fcf854d60c19b36f Mon Sep 17 00:00:00 2001
From: hannahker
Date: Wed, 18 Dec 2024 13:45:59 -0800
Subject: [PATCH 1/3] add admin lookup to blob

---
 exploration/admin_lookup.md | 137 ++++++++++++++++++++++++++++++++++++
 1 file changed, 137 insertions(+)
 create mode 100644 exploration/admin_lookup.md

diff --git a/exploration/admin_lookup.md b/exploration/admin_lookup.md
new file mode 100644
index 0000000..0cde2e4
--- /dev/null
+++ b/exploration/admin_lookup.md
@@ -0,0 +1,137 @@
+---
+jupyter:
+  jupytext:
+    text_representation:
+      extension: .md
+      format_name: markdown
+      format_version: '1.3'
+    jupytext_version: 1.16.3
+  kernelspec:
+    display_name: venv
+    language: python
+    name: python3
+---
+
+# Creating an admin lookup table
+
+This notebook processes our saved Fieldmaps CODs to create a lookup table matching pcodes and place names across all admin levels for which we have raster stats data. We select the **first available** name column in the original CODs; in most cases this is the English, French, or Spanish name.
+
+We want an output table with the following columns:
+
+```python
+DEFAULT_COLS = ["ISO3", "ADM0_PCODE", "ADM0_NAME", "ADM1_PCODE", "ADM1_NAME", "ADM2_PCODE", "ADM2_NAME", "NAME_LANGUAGE"]
+```
+
+```python
+import pandas as pd
+from sqlalchemy import create_engine
+import tempfile
+import geopandas as gpd
+import os
+from typing import Literal, Optional
+
+from src.utils.database_utils import db_engine_url
+from src.utils.iso3_utils import get_iso3_data, load_shp_from_azure
+from src.utils.metadata_utils import select_name_column
+from azure.storage.blob import ContainerClient, ContentSettings
+from dotenv import load_dotenv
+
+load_dotenv()
+
+PROD_BLOB_SAS = os.getenv("DSCI_AZ_SAS_PROD")
+DEV_BLOB_SAS = os.getenv("DSCI_AZ_SAS_DEV")
+
+PROJECT_PREFIX = "polygon"
+
+MODE = "dev"
+engine = create_engine(db_engine_url(MODE))
+df_iso3s = get_iso3_data(iso3_codes=None, engine=engine)
+```
+
+```python
+dfs = []
+
+with tempfile.TemporaryDirectory() as temp_dir:
+    for _, row in df_iso3s[:5].iterrows():
+        iso3 = row["iso3"]
+        max_adm_level = row["max_adm_level"]
+        load_shp_from_azure(iso3, temp_dir, MODE)
+
+        name_columns = []
+        for admin_level in range(max_adm_level + 1):
+            gdf = gpd.read_file(f"{temp_dir}/{iso3.lower()}_adm{admin_level}.shp")
+
+            # Get name column and its language code
+            name_column = select_name_column(gdf, admin_level)
+            language_code = name_column[-2:]
+            name_columns.append(name_column)
+
+            # Standardize column names and add language info
+            new_columns = [x.replace(f"_{language_code}", "_NAME") for x in name_columns]
+            gdf = gdf.rename(columns=dict(zip(name_columns, new_columns)))
+            gdf["NAME_LANGUAGE"] = language_code
+            gdf["ISO3"] = iso3
+
+            # Keep only relevant columns
+            matching_cols = [col for col in gdf.columns if col in DEFAULT_COLS]
+            dfs.append(gdf[matching_cols])
+
+df_all = pd.concat(dfs)
+```
+
+Now writing this to Azure...
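+
+First, a quick sanity check on the combined table. This is a minimal sketch, assuming `df_all` exactly as built above; the uniqueness check reflects our expectation that pcodes are unique within a country, not a guarantee from Fieldmaps:
+
+```python
+# How much did we collect, and in which name languages?
+print(df_all.shape)
+print(df_all["ISO3"].nunique(), "countries covered")
+print(df_all["NAME_LANGUAGE"].value_counts())
+
+# Among rows that have an ADM2 pcode, (ISO3, ADM2_PCODE) should be unique
+adm2 = df_all.dropna(subset=["ADM2_PCODE"])
+assert not adm2.duplicated(subset=["ISO3", "ADM2_PCODE"]).any()
+```
+
+The helper functions below wrap `azure.storage.blob` for SAS-token access to our containers: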
+
+```python
+def get_container_client(
+    container_name: str = "projects", stage: Literal["prod", "dev"] = "dev"
+):
+    sas = DEV_BLOB_SAS if stage == "dev" else PROD_BLOB_SAS
+    container_url = (
+        f"https://imb0chd0{stage}.blob.core.windows.net/"
+        f"{container_name}?{sas}"
+    )
+    return ContainerClient.from_container_url(container_url)
+
+
+def upload_parquet_to_blob(
+    blob_name,
+    df,
+    stage: Literal["prod", "dev"] = "dev",
+    container_name: str = "projects",
+    **kwargs,
+):
+    upload_blob_data(
+        blob_name,
+        df.to_parquet(**kwargs),
+        stage=stage,
+        container_name=container_name,
+    )
+
+
+def upload_blob_data(
+    blob_name,
+    data,
+    stage: Literal["prod", "dev"] = "dev",
+    container_name: str = "projects",
+    content_type: Optional[str] = None,
+):
+    container_client = get_container_client(
+        stage=stage, container_name=container_name
+    )
+
+    if content_type is None:
+        content_settings = ContentSettings(
+            content_type="application/octet-stream"
+        )
+    else:
+        content_settings = ContentSettings(content_type=content_type)
+
+    blob_client = container_client.get_blob_client(blob_name)
+    blob_client.upload_blob(
+        data, overwrite=True, content_settings=content_settings
+    )
+```
+
+```python
+upload_parquet_to_blob("admin_lookup.parquet", df_all, stage=MODE, container_name=PROJECT_PREFIX)
+```

From 58be3311c068b61b31404eb6e7b80e2714f4912d Mon Sep 17 00:00:00 2001
From: hannahker
Date: Wed, 18 Dec 2024 14:44:01 -0800
Subject: [PATCH 2/3] remove redundancies and ignore index in export

---
 exploration/admin_lookup.md | 33 +++++++++++++++------------------
 1 file changed, 15 insertions(+), 18 deletions(-)

diff --git a/exploration/admin_lookup.md b/exploration/admin_lookup.md
index 0cde2e4..62b061a 100644
--- a/exploration/admin_lookup.md
+++ b/exploration/admin_lookup.md
@@ -52,31 +52,28 @@ df_iso3s = get_iso3_data(iso3_codes=None, engine=engine)
 dfs = []
 
 with tempfile.TemporaryDirectory() as temp_dir:
-    for _, row in df_iso3s[:5].iterrows():
+    for _, row in df_iso3s.iterrows():
         iso3 = row["iso3"]
         max_adm_level = row["max_adm_level"]
         load_shp_from_azure(iso3, temp_dir, MODE)
+        gdf = gpd.read_file(f"{temp_dir}/{iso3.lower()}_adm{max_adm_level}.shp")
 
-        name_columns = []
-        for admin_level in range(max_adm_level + 1):
-            gdf = gpd.read_file(f"{temp_dir}/{iso3.lower()}_adm{admin_level}.shp")
+        # Get name column and its language code
+        name_column = select_name_column(gdf, max_adm_level)
+        language_code = name_column[-2:]
+        name_columns = [f"ADM{i}_{language_code}" for i in range(0, max_adm_level + 1)]
 
-            # Get name column and its language code
-            name_column = select_name_column(gdf, admin_level)
-            language_code = name_column[-2:]
-            name_columns.append(name_column)
+        # Standardize column names and add language info
+        new_columns = [x.replace(f"_{language_code}", "_NAME") for x in name_columns]
+        gdf = gdf.rename(columns=dict(zip(name_columns, new_columns)))
+        gdf["NAME_LANGUAGE"] = language_code
+        gdf["ISO3"] = iso3
 
-            # Standardize column names and add language info
-            new_columns = [x.replace(f"_{language_code}", "_NAME") for x in name_columns]
-            gdf = gdf.rename(columns=dict(zip(name_columns, new_columns)))
-            gdf["NAME_LANGUAGE"] = language_code
-            gdf["ISO3"] = iso3
+        # Keep only relevant columns
+        matching_cols = [col for col in gdf.columns if col in DEFAULT_COLS]
+        dfs.append(gdf[matching_cols])
 
-            # Keep only relevant columns
-            matching_cols = [col for col in gdf.columns if col in DEFAULT_COLS]
-            dfs.append(gdf[matching_cols])
-
-df_all = pd.concat(dfs)
+df_all = pd.concat(dfs, ignore_index=True)
 ```
 
 Now writing this to Azure...
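The single-read approach in PATCH 2 works because the Fieldmaps COD layers are denormalized: the shapefile at a country's highest admin level already carries the pcode and name columns of every parent level, so one read per country is enough. A minimal sketch of that assumption as an explicit check (the path and language code here are illustrative, not taken from the repo):

```python
# Sketch: confirm a max-level COD layer also carries parent-level columns
import geopandas as gpd

max_adm_level = 2
language_code = "EN"  # in the notebook this comes from select_name_column
gdf = gpd.read_file("afg_adm2.shp")  # illustrative path

expected = {f"ADM{i}_PCODE" for i in range(max_adm_level + 1)} | {
    f"ADM{i}_{language_code}" for i in range(max_adm_level + 1)
}
missing = expected - set(gdf.columns)
assert not missing, f"layer is missing parent-level columns: {missing}"
```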
From 2f2a148dab880bd0ab3628ba880f652c84867d87 Mon Sep 17 00:00:00 2001
From: hannahker
Date: Wed, 18 Dec 2024 14:56:33 -0800
Subject: [PATCH 3/3] add ADM_LEVEL column

---
 exploration/admin_lookup.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/exploration/admin_lookup.md b/exploration/admin_lookup.md
index 62b061a..0a8a961 100644
--- a/exploration/admin_lookup.md
+++ b/exploration/admin_lookup.md
@@ -7,7 +7,7 @@ jupyter:
       format_version: '1.3'
     jupytext_version: 1.16.3
   kernelspec:
-    display_name: venv
+    display_name: Python 3 (ipykernel)
     language: python
     name: python3
 ---
@@ -19,7 +19,7 @@ This notebook processes our saved Fieldmaps CODs to create a lookup table matchi
 We want an output table with the following columns:
 
 ```python
-DEFAULT_COLS = ["ISO3", "ADM0_PCODE", "ADM0_NAME", "ADM1_PCODE", "ADM1_NAME", "ADM2_PCODE", "ADM2_NAME", "NAME_LANGUAGE"]
+DEFAULT_COLS = ["ISO3", "ADM0_PCODE", "ADM0_NAME", "ADM1_PCODE", "ADM1_NAME", "ADM2_PCODE", "ADM2_NAME", "NAME_LANGUAGE", "ADM_LEVEL"]
 ```
 
 ```python
@@ -68,6 +68,7 @@ with tempfile.TemporaryDirectory() as temp_dir:
         gdf = gdf.rename(columns=dict(zip(name_columns, new_columns)))
         gdf["NAME_LANGUAGE"] = language_code
         gdf["ISO3"] = iso3
+        gdf["ADM_LEVEL"] = max_adm_level
 
         # Keep only relevant columns
         matching_cols = [col for col in gdf.columns if col in DEFAULT_COLS]
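To confirm the export landed, the table can be read straight back from blob storage. A quick sketch, assuming the `get_container_client` helper and the `MODE` and `PROJECT_PREFIX` constants defined in the notebook above:

```python
import io

import pandas as pd

# Pull the uploaded parquet back down and check the new ADM_LEVEL column survived
container_client = get_container_client(container_name=PROJECT_PREFIX, stage=MODE)
blob_client = container_client.get_blob_client("admin_lookup.parquet")
df_check = pd.read_parquet(io.BytesIO(blob_client.download_blob().readall()))
print(df_check["ADM_LEVEL"].value_counts())
```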