diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml index bfadc56..8d491f3 100644 --- a/ingest/defaults/config.yaml +++ b/ingest/defaults/config.yaml @@ -112,6 +112,8 @@ curate: 'location', 'state', 'host', + 'host_genus', + 'host_type', 'is_lab_host', #'date_submitted', #'sra_accession', diff --git a/ingest/defaults/host_hostgenus_hosttype_map.tsv b/ingest/defaults/host_hostgenus_hosttype_map.tsv new file mode 100644 index 0000000..fdffcf7 --- /dev/null +++ b/ingest/defaults/host_hostgenus_hosttype_map.tsv @@ -0,0 +1,286 @@ +host host_genus host_type +Homo sapiens Homo Human +Aedeomyia madagascarica Aedeomyia Mosquito +Aedes albopictus Aedes Mosquito +Aedes cinereus Aedes Mosquito +Aedes dalzieli Aedes Mosquito +Aedes japonicus Aedes Mosquito +Aedes rossicus Aedes Mosquito +Aedes sp. Aedes Mosquito +Aedes vexans Aedes Mosquito +Anopheles atroparvus Anopheles Mosquito +Anopheles bancroftii Anopheles Mosquito +Anopheles coustani Anopheles Mosquito +Anopheles farauti Anopheles Mosquito +Anopheles hyrcanus Anopheles Mosquito +Anopheles maculipennis Anopheles Mosquito +Anopheles messeae Anopheles Mosquito +Anopheles pauliani Anopheles Mosquito +Anopheles plumbeus Anopheles Mosquito +Anopheles sp. 
Anopheles Mosquito +Anopheles stephensi Anopheles Mosquito +Coquillettidia Coquillettidia Mosquito +Coquillettidia perturbans Coquillettidia Mosquito +Coquillettidia richiardii Coquillettidia Mosquito +Culex Culex Mosquito +Culex annulirostris Culex Mosquito +Culex antennatus Culex Mosquito +Culex bitaeniorhynchus Culex Mosquito +Culex erraticus Culex Mosquito +Culex erythrothorax Culex Mosquito +Culex gelidus Culex Mosquito +Culex interrogator Culex Mosquito +Culex modestus Culex Mosquito +Culex neavei Culex Mosquito +Culex nigripalpus Culex Mosquito +Culex perexiguus Culex Mosquito +Culex perfuscus Culex Mosquito +Culex pipiens Culex Mosquito +Culex pipiens complex Culex Mosquito +Culex pipiens pipiens Culex Mosquito +Culex pipiens sensu lato Culex Mosquito +Culex poicilipes Culex Mosquito +Culex pseudovishnui Culex Mosquito +Culex pullus Culex Mosquito +Culex quinquefasciatus Culex Mosquito +Culex restuans Culex Mosquito +Culex salinarius Culex Mosquito +Culex sitiens Culex Mosquito +Culex squamosus Culex Mosquito +Culex stigmatosoma Culex Mosquito +Culex tarsalis Culex Mosquito +Culex theileri Culex Mosquito +Culex tritaeniorhynchus Culex Mosquito +Culex univittatus Culex Mosquito +Culex vishnui Culex Mosquito +Culex whitmorei Culex Mosquito +Culicidae Culicidae Mosquito +Culiseta Culiseta Mosquito +Culiseta inornata Culiseta Mosquito +Culiseta longiareolata Culiseta Mosquito +Culiseta melanura Culiseta Mosquito +Culiseta morsitans Culiseta Mosquito +Mansonia uniformis Mansonia Mosquito +Ochlerotatus canadensis Ochlerotatus Mosquito +Ochlerotatus cantator Ochlerotatus Mosquito +Ochlerotatus caspius Ochlerotatus Mosquito +Ochlerotatus communis Ochlerotatus Mosquito +Ochlerotatus dorsalis Ochlerotatus Mosquito +Ochlerotatus flavescens Ochlerotatus Mosquito +Ochlerotatus sollicitans Ochlerotatus Mosquito +Ochlerotatus spencerii Ochlerotatus Mosquito +Ochlerotatus sticticus Ochlerotatus Mosquito +Ochlerotatus taeniorhynchus Ochlerotatus Mosquito +Ochlerotatus 
triseriatus Ochlerotatus Mosquito +Ochlerotatus trivittatus Ochlerotatus Mosquito +Psorophora Psorophora Mosquito +Psorophora ferox Psorophora Mosquito +Uranotaenia Uranotaenia Mosquito +Uranotaenia unguiculata Uranotaenia Mosquito +Accipiter Accipiter Bird +Accipiter cooperii Accipiter Bird +Accipiter gentilis Accipiter Bird +Accipiter nisus Accipiter Bird +Accipiter striatus Accipiter Bird +Accipitridae Accipitridae Bird +Acrocephalus dumetorum Acrocephalus Bird +Actitis macularius Actitis Bird +Aegithalos caudatus Aegithalos Bird +Aegypius monachus Aegypius Bird +Agelaius phoeniceus Agelaius Bird +Alopochen aegyptiaca Alopochen Bird +Anas platyrhynchos Anas Bird +Anatidae Anatidae Bird +Anser Anser Bird +Aphelocoma Aphelocoma Bird +Aphelocoma californica Aphelocoma Bird +Apus apus Apus Bird +Aquila adalberti Aquila Bird +Aquila chrysaetos Aquila Bird +Aquila fasciata Aquila Bird +Ardeidae Ardeidae Bird +Asio flammeus Asio Bird +Asio otus Asio Bird +Athene noctua Athene Bird +Aves Aves Bird +Baeolophus bicolor Baeolophus Bird +Baeolophus inornatus Baeolophus Bird +Bombycilla garrulus Bombycilla Bird +Bonasa umbellus Bonasa Bird +Branta canadensis Branta Bird +Bubo scandiacus Bubo Bird +Bubo virginianus Bubo Bird +Bubulcus ibis Bubulcus Bird +Buteo Buteo Bird +Buteo buteo Buteo Bird +Buteo jamaicensis Buteo Bird +Buteo lineatus Buteo Bird +Buteo regalis Buteo Bird +Buteo swainsoni Buteo Bird +Butorides virescens Butorides Bird +Calidris alba Calidris Bird +Calypte costae Calypte Bird +Cardinalis Cardinalis Bird +Cardinalis cardinalis Cardinalis Bird +Charadrius melodus Charadrius Bird +Chroicocephalus ridibundus Chroicocephalus Bird +Ciconiidae Ciconiidae Bird +Clanga pomarina Clanga Bird +Colaptes auratus Colaptes Bird +Coloeus monedula Coloeus Bird +Columba livia Columba Bird +Columba palumbus Columba Bird +Columbidae Columbidae Bird +Coracopsis vasa Coracopsis Bird +Coragyps atratus Coragyps Bird +Corvidae Corvidae Bird +Corvus Corvus Bird +Corvus 
brachyrhynchos Corvus Bird +Corvus corax Corvus Bird +Corvus cornix Corvus Bird +Corvus corone Corvus Bird +Corvus frugilegus Corvus Bird +Corvus ossifragus Corvus Bird +Cuculus canorus Cuculus Bird +Curruca conspicillata Curruca Bird +Cyanistes caeruleus Cyanistes Bird +Cyanocitta cristata Cyanocitta Bird +Cygnus buccinator Cygnus Bird +Cygnus olor Cygnus Bird +Dumetella Dumetella Bird +Egretta garzetta Egretta Bird +Euphagus cyanocephalus Euphagus Bird +Falco Falco Bird +Falco columbarius Falco Bird +Falco peregrinus Falco Bird +Falco punctatus Falco Bird +Falco sparverius Falco Bird +Falco tinnunculus Falco Bird +Fulica Fulica Bird +Galliformes Galliformes Bird +Gallus gallus Gallus Bird +Garrulus glandarius Garrulus Bird +Gymnogyps californianus Gymnogyps Bird +Haemorhous mexicanus Haemorhous Bird +Haliaeetus albicilla Haliaeetus Bird +Haliaeetus leucocephalus Haliaeetus Bird +Hirundinidae Hirundinidae Bird +Hirundo rustica Hirundo Bird +Hylocichla mustelina Hylocichla Bird +Ichthyaetus leucophthalmus Ichthyaetus Bird +Ictinia mississippiensis Ictinia Bird +Lanius ludovicianus Lanius Bird +Larus crassirostris Larus Bird +Larus delawarensis Larus Bird +Larus michahellis Larus Bird +Larus smithsonianus Larus Bird +Lathamus discolor Lathamus Bird +Loriini Loriini Bird +Meleagris gallopavo Meleagris Bird +Mergus squamatus Mergus Bird +Mimus Mimus Bird +Mimus polyglottos Mimus Bird +Molothrus ater Molothrus Bird +Nestor notabilis Nestor Bird +Oena capensis Oena Bird +Oriolus flavocinctus Oriolus Bird +Pandion haliaetus Pandion Bird +Parulidae Parulidae Bird +Parus major Parus Bird +Passer domesticus Passer Bird +Passer sp. 
Passer Bird +Passeridae Passeridae Bird +Pelecanus Pelecanus Bird +Pelecanus erythrorhynchos Pelecanus Bird +Pelecanus occidentalis Pelecanus Bird +Phalacrocoracidae Phalacrocoracidae Bird +Phalacrocorax auritus Phalacrocorax Bird +Phalacrocorax carbo Phalacrocorax Bird +Phasianinae Phasianinae Bird +Phasianus colchicus Phasianus Bird +Pheucticus melanocephalus Pheucticus Bird +Phoenicoparrus andinus Phoenicoparrus Bird +Phoenicopterus chilensis Phoenicopterus Bird +Phoenicopterus roseus Phoenicopterus Bird +Phoenicopterus ruber Phoenicopterus Bird +Phoenicopterus sp. Phoenicopterus Bird +Phylloscopus collybita Phylloscopus Bird +Pica hudsonia Pica Bird +Pica nuttalli Pica Bird +Pica pica Pica Bird +Pluvialis apricaria Pluvialis Bird +Podiceps cristatus Podiceps Bird +Poecile atricapillus Poecile Bird +Poecile carolinensis Poecile Bird +Prunella modularis Prunella Bird +Pyrrhocorax graculus Pyrrhocorax Bird +Quelea quelea Quelea Bird +Quiscalus Quiscalus Bird +Quiscalus major Quiscalus Bird +Quiscalus quiscula Quiscalus Bird +Rallus aquaticus Rallus Bird +Serinus canaria Serinus Bird +Spatula querquedula Spatula Bird +Spheniscus humboldti Spheniscus Bird +Spinus tristis Spinus Bird +Sternula Sternula Bird +Sternula antillarum Sternula Bird +Streptopelia capicola Streptopelia Bird +Streptopelia decaocto Streptopelia Bird +Strigidae Strigidae Bird +Strix aluco Strix Bird +Strix nebulosa Strix Bird +Strix nebulosa lapponica Strix Bird +Sturnidae Sturnidae Bird +Sturnus vulgaris Sturnus Bird +Sylvia atricapilla Sylvia Bird +Toxostoma rufum Toxostoma Bird +Trichoglossus haematodus Trichoglossus Bird +Trichoglossus moluccanus Trichoglossus Bird +Turdus merula Turdus Bird +Turdus migratorius Turdus Bird +Turdus philomelos Turdus Bird +Tyto alba Tyto Bird +Zenaida macroura Zenaida Bird +Equus caballus Equus Horse +Equus caballus x Equus asinus Equus Horse +Alectorobius capensis Alectorobius Tick +Dermacentor marginatus Dermacentor Tick +Hyalomma Hyalomma Tick +Hyalomma 
marginatum Hyalomma Tick +Hyalomma marginatum marginatum Hyalomma Tick +Hyalomma plumbeum plumbeum Hyalomma Tick +Hyalomma scupense Hyalomma Tick +Ixodoidea Ixodoidea Tick +Rhipicephalus guilhoni Rhipicephalus Tick +Rhipicephalus pulchellus Rhipicephalus Tick +Bos taurus Bos Other +Camelus bactrianus Camelus Other +Camelus dromedarius Camelus Other +Canis lupus familiaris Canis Other +Chiroptera Chiroptera Other +Cricetinae Cricetinae Other +Crocodylus moreletii Crocodylus Other +Crocodylus niloticus Crocodylus Other +Crocodylus porosus Crocodylus Other +Equus Equus Other +Equus asinus Equus Other +Equus ferus Equus Other +Giraffa giraffa Giraffa Other +Laridae Laridae Bird +Mastomys erythroleucus Mastomys Other +Mephitis mephitis Mephitis Other +Mesocricetus auratus Mesocricetus Other +Mus musculus Mus Other +Orcinus orca Orcinus Other +Ovis aries Ovis Other +Panthera leo Panthera Other +Pelophylax ridibundus Pelophylax Other +Platycercus Platycercus Bird +Rodentia Rodentia Other +Rousettus leschenaultii Rousettus Other +Sciuridae Sciuridae Other +Sciurus carolinensis Sciurus Other +Sciurus niger Sciurus Other +Syncerus caffer Syncerus Other +Vicugna pacos Vicugna Other diff --git a/ingest/rules/curate.smk b/ingest/rules/curate.smk index 77dc91a..da5b549 100644 --- a/ingest/rules/curate.smk +++ b/ingest/rules/curate.smk @@ -46,6 +46,7 @@ rule curate: sequences_ndjson="data/genbank.ndjson", all_geolocation_rules="data/all-geolocation-rules.tsv", annotations=config["curate"]["annotations"], + manual_mapping="defaults/host_hostgenus_hosttype_map.tsv", output: metadata="data/raw_metadata_curated.tsv", sequences="results/sequences.fasta", @@ -98,6 +99,12 @@ rule curate: | ./scripts/post_process_metadata.py \ | ./scripts/add-field-names \ --metadata-columns {params.metadata_columns} \ + | ./scripts/transform-new-fields \ + --map-tsv {input.manual_mapping} \ + --map-id host \ + --metadata-id host \ + --map-fields host_genus host_type \ + --pass-through true \ | augur
#! /usr/bin/env python3
"""
Add new fields to NDJSON records by looking up one record field in a TSV map.

Records are read from stdin, one JSON object per line, and written to stdout
in the same form. For each record, the value of the ``--metadata-id`` field is
looked up in the ``--map-id`` column of the mapping TSV; on a match, the
``--map-fields`` columns of that row are copied into the record.
"""

import argparse
import csv
import json
from sys import stdin, stdout

# Spellings accepted for an explicit boolean value on the command line.
TRUE_STRINGS = frozenset({"true", "t", "yes", "y", "on", "1"})
FALSE_STRINGS = frozenset({"false", "f", "no", "n", "off", "0", ""})


def str_to_bool(value):
    """
    Convert a command-line string such as ``true`` or ``false`` to a bool.

    Matching is case-insensitive and ignores surrounding whitespace. Raises
    ``argparse.ArgumentTypeError`` for unrecognised spellings so that argparse
    reports a usage error instead of silently treating the text as truthy.
    """
    if isinstance(value, bool):
        return value
    normalized = value.strip().lower()
    if normalized in TRUE_STRINGS:
        return True
    if normalized in FALSE_STRINGS:
        return False
    raise argparse.ArgumentTypeError(
        f"expected a boolean value (true/false), got {value!r}"
    )


def parse_args(argv=None):
    """
    Parse the command-line options.

    ``argv`` defaults to ``None``, in which case argparse reads ``sys.argv``;
    passing an explicit list makes the parser usable from tests.
    """
    parser = argparse.ArgumentParser(
        description="Transform JSON data by applying a TSV mapping and adding new columns."
    )
    parser.add_argument("--map-tsv", required=True,
        help="Path to the TSV mapping file.")
    parser.add_argument("--map-id", required=True,
        help="Column name in the map TSV to use as the mapping key.")
    parser.add_argument("--metadata-id", required=True,
        help="Column name in the metadata JSON to use as the mapping key.")
    parser.add_argument("--map-fields", nargs="+", required=True,
        help="Columns to add from the mapping file.")
    # Without a type, argparse keeps the raw string, so "--pass-through false"
    # was truthy. Parse the value as a real boolean; a bare "--pass-through"
    # (no value) means True, and omitting the option means False.
    parser.add_argument("--pass-through", nargs="?", const=True, default=False,
        type=str_to_bool,
        help="If true, pass through the original value when no mapping is found. "
             "Accepts an optional true/false value; the bare flag means true.")
    return parser.parse_args(argv)


def load_mapping(map_tsv, map_id, map_fields):
    """
    Load the mapping TSV into ``{key: {field: value, ...}}``.

    The key is each row's ``map_id`` column and the inner dict holds that
    row's ``map_fields`` columns. When a key occurs on several rows the last
    row wins. Raises ``KeyError`` when the header lacks ``map_id`` or any of
    ``map_fields``.
    """
    mapping = {}
    # newline='' lets the csv module do its own newline handling.
    with open(map_tsv, 'r', encoding='utf-8', newline='') as f:
        reader = csv.DictReader(f, delimiter='\t')
        header = reader.fieldnames
        # fieldnames is None for a completely empty file; that case yields an
        # empty mapping rather than an error.
        if header is not None:
            missing = [col for col in (map_id, *map_fields) if col not in header]
            if missing:
                raise KeyError(
                    f"column(s) {missing} not found in {map_tsv}; "
                    f"available columns: {list(header)}"
                )
        for row in reader:
            mapping[row[map_id]] = {col: row[col] for col in map_fields}
    return mapping


def annotate_record(record, mapping, metadata_id, map_fields, pass_through=False):
    """
    Add the mapped fields to ``record`` in place and return it.

    When the record's ``metadata_id`` value (``''`` if absent) is a key of
    ``mapping``, the mapped fields are copied in. Otherwise, if
    ``pass_through`` is true, every field in ``map_fields`` is set to that
    original value; if it is false the record is left unchanged.
    """
    key = record.get(metadata_id, '')
    if key in mapping:
        record.update(mapping[key])
    elif pass_through:
        for col in map_fields:
            record[col] = key
    return record


def main():
    """Stream NDJSON records from stdin to stdout, adding the mapped fields."""
    args = parse_args()
    mapping = load_mapping(args.map_tsv, args.map_id, args.map_fields)

    for line in stdin:
        # Skip blank lines (e.g. a trailing newline) instead of failing in json.loads.
        if not line.strip():
            continue
        record = json.loads(line)
        annotate_record(record, mapping, args.metadata_id, args.map_fields,
                        args.pass_through)
        stdout.write(json.dumps(record) + '\n')


if __name__ == "__main__":
    main()
--- a/phylogenetic/defaults/auspice_config.json +++ b/phylogenetic/defaults/auspice_config.json @@ -1,5 +1,11 @@ { "title": "Washington Focused West Nile Virus Build", + "data_provenance": [ + { + "name": "GenBank", + "url": "https://www.ncbi.nlm.nih.gov/genbank/" + } + ], "colorings": [ {"key": "gt", "title": "Genotype", "type": "categorical"}, {"key": "num_date", "title": "Sampling Date", "type": "continuous"}, @@ -10,7 +16,9 @@ {"key": "lineage", "title": "Pathoplexus lineage", "type": "categorical"}, {"key": "clade_membership", "title": "Clade", "type": "categorical"}, {"key": "author", "title": "Authors", "type": "categorical"}, - {"key": "host", "title": "Host Species", "type": "categorical"} + {"key": "host", "title": "Host Species", "type": "categorical"}, + {"key": "host_genus", "title": "Host Genus", "type": "categorical"}, + {"key": "host_type", "title": "Host Type", "type": "categorical"} ], "geo_resolutions": [ "state", @@ -29,6 +37,8 @@ "author", "clade_membership", "host", + "host_genus", + "host_type", "lineage" ], "display_defaults": { diff --git a/phylogenetic/defaults/auspice_config_global.json b/phylogenetic/defaults/auspice_config_global.json index 37ce140..b7b598b 100644 --- a/phylogenetic/defaults/auspice_config_global.json +++ b/phylogenetic/defaults/auspice_config_global.json @@ -1,5 +1,11 @@ { "title": "Global West Nile Virus Build", + "data_provenance": [ + { + "name": "GenBank", + "url": "https://www.ncbi.nlm.nih.gov/genbank/" + } + ], "colorings": [ {"key": "gt", "title": "Genotype", "type": "categorical"}, {"key": "num_date", "title": "Sampling Date", "type": "continuous"}, @@ -8,7 +14,9 @@ {"key": "lineage", "title": "Pathoplexus lineage", "type": "categorical"}, {"key": "clade_membership", "title": "Clade", "type": "categorical"}, {"key": "author", "title": "Authors", "type": "categorical"}, - {"key": "host", "title": "Host Species", "type": "categorical"} + {"key": "host", "title": "Host Species", "type": "categorical"}, + {"key": 
"host_genus", "title": "Host Genus", "type": "categorical"}, + {"key": "host_type", "title": "Host Type", "type": "categorical"} ], "geo_resolutions": [ "region", @@ -23,7 +31,9 @@ "region", "country", "author", - "host" + "host", + "host_genus", + "host_type" ], "display_defaults": { "color_by": "region", diff --git a/phylogenetic/defaults/description.md b/phylogenetic/defaults/description.md index 60e5e46..38354ea 100644 --- a/phylogenetic/defaults/description.md +++ b/phylogenetic/defaults/description.md @@ -1,3 +1,8 @@ We gratefully acknowledge the authors, originating and submitting laboratories of the genetic sequences and metadata for sharing their work. Please note that although data generators have generously shared data in an open fashion, that does not mean there should be free license to publish on this data. Data generators should be cited where possible and collaborations should be sought in some circumstances. Please try to avoid scooping someone else's work. Reach out if uncertain. Special thanks to individuals at the [Northwest Pathogen Genomics Center of Excellence](https://github.com/NW-PaGe) and [Grubaugh lab](https://grubaughlab.com/) for comments, code and suggestions. + +We curate sequence data and metadata from NCBI as a starting point for our analyses. For global lineage designations, we query [Pathoplexus](https://pathoplexus.org/) for lineage assignments and exclusively work with NCBI-sourced records at this time. Curated sequences and metadata are available as flat files at: + +* [data.nextstrain.org/files/workflows/WNV/sequences.fasta.zst](https://data.nextstrain.org/files/workflows/WNV/sequences.fasta.zst) +* [data.nextstrain.org/files/workflows/WNV/metadata.tsv.zst](https://data.nextstrain.org/files/workflows/WNV/metadata.tsv.zst)