diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml index bfadc56..8d491f3 100644 --- a/ingest/defaults/config.yaml +++ b/ingest/defaults/config.yaml @@ -112,6 +112,8 @@ curate: 'location', 'state', 'host', + 'host_genus', + 'host_type', 'is_lab_host', #'date_submitted', #'sra_accession', diff --git a/ingest/defaults/host_hostgenus_hosttype_map.tsv b/ingest/defaults/host_hostgenus_hosttype_map.tsv index a1d6b5e..f1edcb2 100644 --- a/ingest/defaults/host_hostgenus_hosttype_map.tsv +++ b/ingest/defaults/host_hostgenus_hosttype_map.tsv @@ -1,4 +1,4 @@ -"Host" "Host_Genus" "Host_Type" +"host" "host_genus" "host_type" "Homo sapiens" "Homo" "Human" "Culex pipiens" "Culex" "Mosquito" "Culex annulirostris" "Culex" "Mosquito" diff --git a/ingest/rules/curate.smk b/ingest/rules/curate.smk index 77dc91a..da5b549 100644 --- a/ingest/rules/curate.smk +++ b/ingest/rules/curate.smk @@ -46,6 +46,7 @@ rule curate: sequences_ndjson="data/genbank.ndjson", all_geolocation_rules="data/all-geolocation-rules.tsv", annotations=config["curate"]["annotations"], + manual_mapping="defaults/host_hostgenus_hosttype_map.tsv", output: metadata="data/raw_metadata_curated.tsv", sequences="results/sequences.fasta", @@ -98,6 +99,12 @@ rule curate: | ./scripts/post_process_metadata.py \ | ./scripts/add-field-names \ --metadata-columns {params.metadata_columns} \ + | ./scripts/transform-new-fields \ + --map-tsv {input.manual_mapping} \ + --map-id host \ + --metadata-id host \ + --map-fields host_genus host_type \ + --pass-through true \ | augur curate apply-record-annotations \ --annotations {input.annotations} \ --id-field {params.annotations_id} \ diff --git a/ingest/scripts/transform-new-fields b/ingest/scripts/transform-new-fields new file mode 100755 index 0000000..a846990 --- /dev/null +++ b/ingest/scripts/transform-new-fields @@ -0,0 +1,50 @@ +#! 
#!/usr/bin/env python3

"""Annotate NDJSON metadata records with extra fields from a TSV map.

Reads newline-delimited JSON records on stdin, looks each record's
`--metadata-id` value up in the `--map-id` column of `--map-tsv`, adds the
requested `--map-fields` columns from the matching map row, and writes the
augmented records to stdout. Intended to sit in a curation shell pipeline
(see ingest/rules/curate.smk).
"""

import argparse
import csv
import json
from sys import stdin, stdout


def parse_bool(value):
    """Interpret a command-line string as a boolean.

    "true"/"1"/"yes"/"t"/"y" (any case) are True; everything else is False.
    A plain string option with `default=False` would be truthy for ANY
    supplied value — including the literal "false" — so an explicit parser
    is needed for `--pass-through false` to actually disable the behavior.
    """
    return str(value).strip().lower() in ("true", "1", "yes", "t", "y")


def parse_args():
    """Define and parse the command-line interface."""
    parser = argparse.ArgumentParser(
        description="Transform JSON data by applying a TSV mapping and adding new columns."
    )
    parser.add_argument("--map-tsv", required=True,
                        help="Path to the TSV mapping file.")
    parser.add_argument("--map-id", required=True,
                        help="Column name in the map TSV to use as the mapping key.")
    parser.add_argument("--metadata-id", required=True,
                        help="Column name in the metadata JSON to use as the mapping key.")
    parser.add_argument("--map-fields", nargs="+", required=True,
                        help="Columns to add from the mapping file.")
    parser.add_argument("--pass-through", type=parse_bool, default=False,
                        help="If true, pass through the original value when no mapping is found.")
    return parser.parse_args()


def load_mapping(map_tsv, map_id, map_fields):
    """Load the TSV map into a dict of key value -> {field: value}.

    Quoted headers/values (e.g. "host") are unquoted by csv.DictReader.
    Raises KeyError if `map_id` or any of `map_fields` is missing from a row.
    Later rows with a duplicate key overwrite earlier ones.
    """
    mapping = {}
    # newline="" is the documented way to open files for the csv module.
    with open(map_tsv, "r", encoding="utf-8", newline="") as f:
        reader = csv.DictReader(f, delimiter="\t")
        for row in reader:
            mapping[row[map_id]] = {col: row[col] for col in map_fields}
    return mapping


def main():
    """Stream NDJSON records from stdin to stdout, augmenting each one."""
    args = parse_args()
    mapping = load_mapping(args.map_tsv, args.map_id, args.map_fields)

    for line in stdin:
        record = json.loads(line)
        key = record.get(args.metadata_id, '')

        if key in mapping:
            record.update(mapping[key])
        elif args.pass_through:
            # No map entry: carry the original key value into each new column.
            for col in args.map_fields:
                record[col] = record.get(args.metadata_id, '')

        stdout.write(json.dumps(record) + '\n')


if __name__ == "__main__":
    main()