Skip to content

Commit

Permalink
Global tree and separating out the WA-specific configs #30
Browse files Browse the repository at this point in the history
  • Loading branch information
j23414 authored Oct 25, 2024
2 parents 30b65b1 + 6b2c5dd commit ea9ec0e
Show file tree
Hide file tree
Showing 12 changed files with 666 additions and 57 deletions.
40 changes: 40 additions & 0 deletions phylogenetic/build-configs/washington-state/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# This configuration file contains the custom configuration parameters
# for the Washington State phylogenetic build with custom rules and metadata

# Use 'NY99' as the reference since it should be basal to the USA sequences
reference: "defaults/reference.gb"
# Use 'IS88' as the root strain on the phylogenetic tree to place samples within the global context
root: "AF481864"

# Pull in metadata and sequences from the ingest directory after they have been annotated with Washington State-specific metadata
input_metadata: "../ingest/results/metadata.tsv"
input_sequences: "../ingest/results/sequences.fasta"

# Each entry under `subsampling` holds the filter arguments for one subsample.
# The `force_include` entry excludes all strains by default and then forces the
# inclusion of the strains listed in its include file.
subsampling:
state: --query "state == 'WA'" --min-length '9800' --subsample-max-sequences 5000
neighboring_state: --query "state in ['CA', 'ID', 'OR', 'NV']" --group-by state year --min-length '9800' --subsample-max-sequences 5000
region: --query "state in ['AZ','NM', 'CO', 'UT', 'WY', 'MT']" --group-by state year --min-length '9800' --subsample-max-sequences 5000
country: --query "country == 'USA' and state not in ['WA', 'CA', 'ID', 'OR', 'NV','AZ','NM', 'CO', 'UT', 'WY', 'MT'] and accession != 'NC_009942'" --group-by state year --subsample-max-sequences 300 --min-length '9800'
force_include: --exclude-all --include ../nextclade/defaults/include.txt

refine:
treetime_params: --coalescent opt --clock-filter-iqd 4 --date-inference marginal --date-confidence

traits:
metadata_columns: [
'country',
'division',
'location',
'clade_membership',
'host'
]

export:
auspice_config: "defaults/auspice_config.json"

## Custom rules to run as part of the CI automated workflow
## The paths should be relative to the phylogenetic directory.
custom_rules:
- build-configs/washington-state/washington-state-rules.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
"""
These are Washington-specific rules for the phylogenetic workflow.
"""

rule create_lat_longs:
    """
    Build the averaged results/lat_longs.tsv file from
    results/metadata_filtered.tsv.

    The metadata must carry a USA state annotation, so this rule fails on
    global datasets.
    """
    input:
        metadata="results/metadata_filtered.tsv",
    output:
        lat_longs="results/lat_longs.tsv",
    benchmark:
        "benchmarks/lat_longs.txt"
    log:
        "logs/lat_longs.txt",
    shell:
        # stdout and stderr are both shown on the console and captured in {log}
        """
        python ./scripts/create_lat_longs.py {input.metadata} {output.lat_longs} 2>&1 | tee {log}
        """


rule create_colors:
    """
    Run scripts/make_colors.py on results/metadata_filtered.tsv to write
    results/colors.tsv.

    NOTE(review): the colors file is presumably a color assignment table for
    the export step; confirm against scripts/make_colors.py.
    """
    input:
        metadata="results/metadata_filtered.tsv",
    output:
        colors="results/colors.tsv",
    benchmark:
        "benchmarks/colors.txt"
    log:
        "logs/colors.txt",
    shell:
        # stdout and stderr are both shown on the console and captured in {log}
        """
        python ./scripts/make_colors.py {input.metadata} {output.colors} 2>&1 | tee {log}
        """


rule export_washington_build:
    """
    Collect the phylogenetic tree and its annotations and export them as a
    Nextstrain dataset with `augur export v2`.

    In addition to the tree, metadata, node data, colors, description and
    auspice config, this export passes the results/lat_longs.tsv annotation
    through --lat-longs.

    The metadata ID column is taken from config["strain_id_field"] so that this
    rule stays in sync with the rest of the workflow; it falls back to
    "accession", the value that was previously hard-coded here.
    """
    input:
        tree = "results/tree.nwk",
        metadata = "results/metadata_filtered.tsv",
        branch_lengths = "results/branch_lengths.json",
        traits = "results/traits.json",
        nt_muts = "results/nt_muts.json",
        aa_muts = "results/aa_muts.json",
        colors = "results/colors.tsv",
        description = config["export"]["description"],
        lat_longs = "results/lat_longs.tsv",
        auspice_config = config["export"]["auspice_config"],
    output:
        auspice = "auspice/WNV_genome.json"
    params:
        # Same default as before ("accession") when the config key is absent.
        strain_id = config.get("strain_id_field", "accession"),
    log:
        "logs/export.txt",
    benchmark:
        "benchmarks/export.txt"
    shell:
        """
        augur export v2 \
            --tree {input.tree} \
            --metadata {input.metadata} \
            --metadata-id-columns {params.strain_id:q} \
            --node-data {input.branch_lengths} {input.traits} {input.nt_muts} {input.aa_muts} \
            --colors {input.colors} \
            --lat-longs {input.lat_longs} \
            --description {input.description} \
            --auspice-config {input.auspice_config} \
            --output {output.auspice} 2>&1 | tee {log}
        """

# Resolve the ambiguity between export_washington_build and the workflow's
# `export` rule (which presumably produces the same output file) by preferring
# the Washington-specific rule.
ruleorder: export_washington_build > export
31 changes: 31 additions & 0 deletions phylogenetic/defaults/auspice_config_global.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
{
"title": "Global West Nile Virus Build",
"colorings": [
{"key": "gt", "title": "Genotype", "type": "categorical"},
{"key": "num_date", "title": "Sampling Date", "type": "continuous"},
{"key": "region", "title": "Region", "type": "categorical"},
{"key": "country", "title": "Country", "type": "categorical"},
{"key": "author", "title": "Authors", "type": "categorical"},
{"key": "host", "title": "Host Species", "type": "categorical"}
],
"geo_resolutions": [
"region",
"country"
],
"maintainers": [
{"name": "Marcela Torres", "url": "https://github.com/NW-PaGe/WNV-nextstrain"},
{"name": "NW-PaGe", "url": "https://github.com/NW-PaGe/WNV-nextstrain"},
{"name": "Nextstrain Team", "url": "https://next.nextstrain.org/"}
],
"filters": [
"region",
"country",
"author",
"host"
],
"display_defaults": {
"color_by": "region",
"map_triplicate": true,
"geo_resolution": "country"
}
}
29 changes: 12 additions & 17 deletions phylogenetic/defaults/config.yaml
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
strain_id_field: "accession"
# Use 'NY99' as the reference since it should be basal to the USA sequences
reference: "defaults/reference.gb"
# Use 'IS88' as the root strain on the phylogenetic tree
# To place samples within the global context
root: "AF481864"
# Use 'Egypt 1951' as the reference and root, following Mencattelli et al, 2023
# https://www.nature.com/articles/s41467-023-42185-7
reference: "defaults/reference_global.gb"
root: "AF260968"

# Sequences must be FASTA and metadata must be TSV
# Both files must be zstd compressed
Expand Down Expand Up @@ -66,22 +65,18 @@ input_sequences: "data/sequences.fasta"
# This command excludes all strains by default and then forces the inclusion of
# the strains selected by the subsampling logic defined above.
subsampling:
state: --query "state == 'WA'" --min-length '9800' --subsample-max-sequences 5000
neighboring_state: --query "state in ['CA', 'ID', 'OR', 'NV']" --group-by state year --min-length '9800' --subsample-max-sequences 5000
region: --query "state in ['AZ','NM', 'CO', 'UT', 'WY', 'MT']" --group-by state year --min-length '9800' --subsample-max-sequences 5000
country: --query "country == 'USA' and state not in ['WA', 'CA', 'ID', 'OR', 'NV','AZ','NM', 'CO', 'UT', 'WY', 'MT'] and accession != 'NC_009942'" --group-by state year --subsample-max-sequences 300 --min-length '9800'
force_include: --exclude-all --include ../nextclade/defaults/include.txt
#global: --query "country != 'USA'" --group-by country year --subsample-max-sequences 200
region: --query "is_lab_host != 'true'" --query-columns is_lab_host:str --min-length '9800' --group-by region year --subsample-max-sequences 3000 --exclude defaults/exclude.txt
force_include: --exclude-all --include defaults/include.txt

refine:
treetime_params: --coalescent opt --date-inference marginal --date-confidence

traits:
metadata_columns: [
'country',
'division',
'location',
'clade_membership',
'host'
'region',
'country'
]

export:
description: "defaults/description.md"
auspice_config: "defaults/auspice_config.json"
auspice_config: "defaults/auspice_config_global.json"
55 changes: 55 additions & 0 deletions phylogenetic/defaults/exclude.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
HW816192 # 11029 bp PAT 27-MAY-2015
CS543188 # 11029 bp PAT 20-APR-2007
CS568914 # 11029 bp PAT 18-MAY-2007
CS568916 # 11029 bp PAT 18-MAY-2007
CS568917 # 11029 bp PAT 18-MAY-2007
CS568918 # 11029 bp PAT 18-MAY-2007
CS568919 # 11029 bp PAT 18-MAY-2007
FV537222 # 10962 bp PAT 18-MAR-2010
FV537223 # 10962 bp PAT 18-MAR-2010
FV537224 # 10962 bp PAT 18-MAR-2010
FV537225 # 10962 bp PAT 18-MAR-2010
LQ460608 # 8839 bp PAT 06-OCT-2016
LQ564350 # 8839 bp PAT 06-OCT-2016
LY683288 # 11062 bp PAT 04-DEC-2019
MA388207 # 8839 bp PAT 30-OCT-2018
HC467807 # 11029 bp PAT 21-APR-2010
HH961658 # 10975 bp PAT 31-OCT-2010
HH961659 # 11029 bp PAT 31-OCT-2010
HV572312 # 11029 bp PAT 31-MAY-2012
OP846974 # Suspected recombinant sequences from Mencattelli et al, 2023 https://www.nature.com/articles/s41467-023-42185-7
OK239667 # Suspected recombinant sequences from Mencattelli et al, 2023 https://www.nature.com/articles/s41467-023-42185-7
OM202920 # Clusters below PAT FV537222
OM202936 # Clusters below PAT FV537222
OM202914 # Clusters below PAT FV537222
OM202933 # Clusters below PAT FV537222
OM202907 # Clusters below PAT FV537222
OK573263 # Clusters below PAT FV537222
FV537224 # Clusters below PAT FV537222
OK573278 # Clusters below PAT FV537222
OM202917 # Clusters below PAT FV537222
OM202919 # Clusters below PAT FV537222
OM202910 # Clusters below PAT FV537222
OM202911 # Clusters below PAT FV537222
OM202922 # Clusters below PAT FV537222
OK573272 # Clusters below PAT FV537222
OK573262 # Clusters below PAT FV537222
OK573279 # Clusters below PAT FV537222
OK573269 # Clusters below PAT FV537222
OM202923 # Clusters below PAT FV537222
OM202906 # Clusters below PAT FV537222
OM202909 # Clusters below PAT FV537222
OM202930 # Clusters below PAT FV537222
OM202929 # Clusters below PAT FV537222
OM202904 # Clusters below PAT FV537222
OM202913 # Clusters below PAT FV537222
OM202908 # Clusters below PAT FV537222
OM202915 # Clusters below PAT FV537222
OM202912 # Clusters below PAT FV537222
OK572999 # Clusters below PAT FV537222
OK573277 # Clusters below PAT FV537222
FV537225 # Clusters below PAT FV537222
OM202905 # Clusters below PAT FV537222
OM202932 # Clusters below PAT FV537222
FV537223 # Clusters below PAT FV537222
FV537222 # Clusters below PAT FV537222
70 changes: 70 additions & 0 deletions phylogenetic/defaults/include.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
AF260968 # Egypt 1951
NC_001563 # Lineage 2 reference
NC_009942 # Lineage 1 reference
HM051416 # Israel 1953
GQ851607 # Nigeria 1965
GQ851606 # Senegal 1979
AF481864 # pre-NY
MH166901 # NY99
MH166903 # NY99
MH166904 # NY99
KX547395 # NY99
KX547519 # NY99
KX547602 # NY99
HM488130 # NY99
HM488132 # NY99
HQ671707 # NY99
AF202541 # NY99
AF206518 # NY99
HM488127 # NY99
HM488126 # NY99
KX547410 # WN02
KJ501434 # WN02
KX547456 # WN02
KY216155 # WN02
KX547460 # WN02
MF175829 # WN02
KX547482 # WN02
MF175827 # WN02
MF175839 # WN02
KT020853 # WN02
KX547548 # WN02
MF175863 # WN02
KX547286 # WN02
MF175873 # WN02
MF175865 # WN02
MF175831 # WN02
MF175858 # WN02
KJ501117 # SW03
KJ501120 # SW03
MF175815 # SW03
MG004533 # SW03
KF704147 # SW03
KF704153 # SW03
KR348940 # SW03
KR348937 # SW03
KX547361 # SW03
JX015523 # SW03
KR348944 # SW03
KJ501124 # SW03
KX547552 # SW03
KJ145829 # SW03
KR348981 # SW03
KJ501118 # SW03
KR348938 # SW03
KR348976 # SW03
KJ501170 # SW03
KR348993 # SW03
JQ700438 # SW03
KR348977 # SW03
KR348942 # SW03
KR348941 # SW03
KJ501121 # SW03
KJ501122 # SW03
KX547375 # SW03
KM012172 # SW03
KC333375 # SW03
KJ501222 # SW03
MG004537 # SW03
MF175866 # SW03
MG004540 # SW03
Loading

0 comments on commit ea9ec0e

Please sign in to comment.