Skip to content

Commit

Permalink
Ingest: apply mapping to add host genus and host type fields
Browse files Browse the repository at this point in the history
  • Loading branch information
j23414 committed Nov 13, 2024
1 parent c54b53e commit 94ed8f3
Show file tree
Hide file tree
Showing 4 changed files with 60 additions and 1 deletion.
2 changes: 2 additions & 0 deletions ingest/defaults/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,8 @@ curate:
'location',
'state',
'host',
'host_genus',
'host_type',
'is_lab_host',
#'date_submitted',
#'sra_accession',
Expand Down
2 changes: 1 addition & 1 deletion ingest/defaults/host_hostgenus_hosttype_map.tsv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"Host" "Host_Genus" "Host_Type"
"host" "host_genus" "host_type"
"Homo sapiens" "Homo" "Human"
"Culex pipiens" "Culex" "Mosquito"
"Culex annulirostris" "Culex" "Mosquito"
Expand Down
7 changes: 7 additions & 0 deletions ingest/rules/curate.smk
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ rule curate:
sequences_ndjson="data/genbank.ndjson",
all_geolocation_rules="data/all-geolocation-rules.tsv",
annotations=config["curate"]["annotations"],
manual_mapping="defaults/host_hostgenus_hosttype_map.tsv",
output:
metadata="data/raw_metadata_curated.tsv",
sequences="results/sequences.fasta",
Expand Down Expand Up @@ -98,6 +99,12 @@ rule curate:
| ./scripts/post_process_metadata.py \
| ./scripts/add-field-names \
--metadata-columns {params.metadata_columns} \
| ./scripts/transform-new-fields \
--map-tsv {input.manual_mapping} \
--map-id host \
--metadata-id host \
--map-fields host_genus host_type \
--pass-through true \
| augur curate apply-record-annotations \
--annotations {input.annotations} \
--id-field {params.annotations_id} \
Expand Down
50 changes: 50 additions & 0 deletions ingest/scripts/transform-new-fields
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#! /usr/bin/env python3

import argparse
import json
import csv
from sys import stdin, stdout

def parse_args():
    """Parse the command-line options of the mapping transform.

    Returns:
        argparse.Namespace with the attributes ``map_tsv``, ``map_id``,
        ``metadata_id``, ``map_fields`` (list of str) and ``pass_through``
        (bool).
    """

    def _str_to_bool(value):
        # argparse hands over the raw string; without this conversion any
        # non-empty value (including "false" or "0") would be truthy and
        # silently enable pass-through.
        lowered = value.strip().lower()
        if lowered in ("true", "t", "yes", "y", "1", "on"):
            return True
        if lowered in ("false", "f", "no", "n", "0", "off", ""):
            return False
        raise argparse.ArgumentTypeError(
            f"expected a boolean value (true/false), got {value!r}"
        )

    parser = argparse.ArgumentParser(
        description="Transform JSON data by applying a TSV mapping and adding new columns."
    )
    parser.add_argument("--map-tsv", required=True,
                        help="Path to the TSV mapping file.")
    parser.add_argument("--map-id", required=True,
                        help="Column name in the map TSV to use as the mapping key.")
    parser.add_argument("--metadata-id", required=True,
                        help="Column name in the metadata JSON to use as the mapping key.")
    parser.add_argument("--map-fields", nargs="+", required=True,
                        help="Columns to add from the mapping file.")
    # nargs="?" + const=True accepts both the bare flag (`--pass-through`)
    # and the explicit form already used by callers (`--pass-through true`).
    parser.add_argument("--pass-through", nargs="?", const=True, default=False,
                        type=_str_to_bool,
                        help="Pass through the original value when no mapping is found. "
                             "Accepts an optional boolean value (true/false); "
                             "the bare flag means true.")
    return parser.parse_args()

def load_mapping(map_tsv, map_id, map_fields):
    """Load a TSV lookup table into a dictionary keyed by one of its columns.

    Args:
        map_tsv: Path to the tab-separated mapping file (first row is the header).
        map_id: Name of the column whose values become the dictionary keys.
        map_fields: Names of the columns to keep for each key.

    Returns:
        dict mapping each ``map_id`` value to a ``{field: value}`` dict holding
        the requested ``map_fields``. If a key occurs on several rows, the last
        row wins. A completely empty file yields an empty dict.

    Raises:
        KeyError: If the header lacks ``map_id`` or any of ``map_fields``.
    """
    # newline='' lets the csv module do its own newline handling, as the
    # csv documentation requires for file objects passed to a reader.
    with open(map_tsv, 'r', encoding='utf-8', newline='') as f:
        reader = csv.DictReader(f, delimiter='\t')
        header = reader.fieldnames
        if header is None:
            # No header row at all: nothing to map.
            return {}

        # Fail early with a readable message instead of a bare KeyError on
        # the first data row (or no error at all when there are no rows).
        missing = [col for col in (map_id, *map_fields) if col not in header]
        if missing:
            raise KeyError(
                f"column(s) {missing} not found in {map_tsv}; "
                f"available columns: {list(header)}"
            )

        return {
            row[map_id]: {col: row[col] for col in map_fields}
            for row in reader
        }

def main():
    """Annotate NDJSON records from stdin with fields from a TSV mapping.

    Reads one JSON object per line from stdin, looks up the record's
    ``--metadata-id`` value in the mapping loaded from ``--map-tsv`` and:

    * on a match, copies the mapped ``--map-fields`` values into the record;
    * otherwise, when ``args.pass_through`` is truthy, sets every
      ``--map-fields`` column to the record's own ``--metadata-id`` value;
    * otherwise leaves the record unchanged.

    Every record is written back to stdout as a single JSON line.
    """
    args = parse_args()
    mapping = load_mapping(args.map_tsv, args.map_id, args.map_fields)

    for line in stdin:
        # Skip blank lines (e.g. a trailing empty line in the stream);
        # json.loads would otherwise abort on them.
        if not line.strip():
            continue

        record = json.loads(line)
        key = record.get(args.metadata_id, '')

        if key in mapping:
            record.update(mapping[key])
        elif args.pass_through:
            # No mapping entry: fall back to the original value for each
            # of the new columns.
            for col in args.map_fields:
                record[col] = key

        stdout.write(json.dumps(record) + '\n')


if __name__ == "__main__":
    main()

0 comments on commit 94ed8f3

Please sign in to comment.