Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Global tree and separating out the WA-specific configs #30

Merged
merged 8 commits into from
Oct 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions phylogenetic/build-configs/washington-state/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# This configuration file contains the custom configuration parameters
# for the Washington State phylogenetic build, including its custom rules and metadata

# Use 'NY99' as the reference since it should be basal to the USA sequences
reference: "defaults/reference.gb"
# Use 'IS88' as the root strain on the phylogenetic tree to place samples within the global context
# NOTE(review): phylogenetic/defaults/include.txt labels AF481864 as "pre-NY" — confirm the 'IS88' strain name.
root: "AF481864"

# Pull in metadata and sequences from the ingest directory after they have been
# annotated with Washington State-specific metadata
input_metadata: "../ingest/results/metadata.tsv"
input_sequences: "../ingest/results/sequences.fasta"

# Named subsampling groups. Each value is a string of command-line filter
# options (presumably appended to `augur filter` by the workflow — confirm).
#   state             - Washington State sequences
#   neighboring_state - CA, ID, OR, NV, grouped by state and year
#   region            - AZ, NM, CO, UT, WY, MT, grouped by state and year
#   country           - remaining USA sequences (NC_009942 is left out of this group)
#   force_include     - excludes all strains by default and then forces the
#                       inclusion of the strains listed in the include file
subsampling:
  state: --query "state == 'WA'" --min-length '9800' --subsample-max-sequences 5000
  neighboring_state: --query "state in ['CA', 'ID', 'OR', 'NV']" --group-by state year --min-length '9800' --subsample-max-sequences 5000
  region: --query "state in ['AZ','NM', 'CO', 'UT', 'WY', 'MT']" --group-by state year --min-length '9800' --subsample-max-sequences 5000
  country: --query "country == 'USA' and state not in ['WA', 'CA', 'ID', 'OR', 'NV','AZ','NM', 'CO', 'UT', 'WY', 'MT'] and accession != 'NC_009942'" --group-by state year --subsample-max-sequences 300 --min-length '9800'
  force_include: --exclude-all --include ../nextclade/defaults/include.txt

# Options for the refine step (presumably forwarded to `augur refine` — confirm).
# Unlike the default config, this build also applies a clock filter (--clock-filter-iqd 4).
refine:
  treetime_params: --coalescent opt --clock-filter-iqd 4 --date-inference marginal --date-confidence

# Metadata columns used by the traits step.
traits:
  metadata_columns:
    - 'country'
    - 'division'
    - 'location'
    - 'clade_membership'
    - 'host'

# Auspice configuration used when exporting the Washington State dataset.
# NOTE(review): washington-state-rules.smk also reads config["export"]["description"],
# which is not set here; it presumably comes from defaults/config.yaml — confirm
# that the default config is loaded alongside this file.
export:
  auspice_config: "defaults/auspice_config.json"

## Extra Snakemake rule files to run as part of the CI automated workflow.
## Each path is given relative to the phylogenetic directory.
custom_rules:
  - build-configs/washington-state/washington-state-rules.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
"""
These are Washington-specific rules for the phylogenetic workflow.
"""

rule create_lat_longs:
    """
    Create an averaged lat_longs.tsv file from metadata_filtered.tsv by running
    scripts/create_lat_longs.py.

    This requires a USA state annotation in the metadata, so the rule fails on
    global datasets.
    """
    input:
        metadata = "results/metadata_filtered.tsv",
    output:
        lat_longs = "results/lat_longs.tsv",
    log:
        "logs/lat_longs.txt",
    benchmark:
        "benchmarks/lat_longs.txt"
    shell:
        """
        python ./scripts/create_lat_longs.py {input.metadata} {output.lat_longs} 2>&1 | tee {log}
        """


rule create_colors:
    """
    Create results/colors.tsv from metadata_filtered.tsv by running
    scripts/make_colors.py. The colors file is consumed by the
    export_washington_build rule.
    """
    input:
        metadata = "results/metadata_filtered.tsv",
    output:
        colors = "results/colors.tsv",
    log:
        "logs/colors.txt",
    benchmark:
        "benchmarks/colors.txt"
    shell:
        """
        python ./scripts/make_colors.py {input.metadata} {output.colors} 2>&1 | tee {log}
        """


rule export_washington_build:
    """
    Collect the phylogenetic tree and its annotations and export them as a
    Nextstrain (Auspice) dataset with `augur export v2`.

    In addition to the node data, this export incorporates the colors.tsv and
    lat_longs.tsv files produced by the create_colors and create_lat_longs rules.
    """
    input:
        tree = "results/tree.nwk",
        metadata = "results/metadata_filtered.tsv",
        branch_lengths = "results/branch_lengths.json",
        traits = "results/traits.json",
        nt_muts = "results/nt_muts.json",
        aa_muts = "results/aa_muts.json",
        colors = "results/colors.tsv",
        description = config["export"]["description"],
        lat_longs = "results/lat_longs.tsv",
        auspice_config = config["export"]["auspice_config"],
    output:
        auspice = "auspice/WNV_genome.json"
    params:
        # Read the metadata ID column from the config instead of hard-coding it;
        # falls back to "accession", the value this rule used previously.
        strain_id = config.get("strain_id_field", "accession"),
    log:
        "logs/export.txt",
    benchmark:
        "benchmarks/export.txt"
    shell:
        """
        augur export v2 \
            --tree {input.tree} \
            --metadata {input.metadata} \
            --metadata-id-columns {params.strain_id:q} \
            --node-data {input.branch_lengths} {input.traits} {input.nt_muts} {input.aa_muts} \
            --colors {input.colors} \
            --lat-longs {input.lat_longs} \
            --description {input.description} \
            --auspice-config {input.auspice_config} \
            --output {output.auspice} 2>&1 | tee {log}
        """

# Resolve ambiguity between rules that can produce the same output: prefer the
# Washington-specific export rule over the `export` rule.
# NOTE(review): the `export` rule is defined outside this file — confirm it
# writes the same output as export_washington_build.
ruleorder: export_washington_build > export
31 changes: 31 additions & 0 deletions phylogenetic/defaults/auspice_config_global.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
{
  "title": "Global West Nile Virus Build",
  "colorings": [
    {"key": "gt", "title": "Genotype", "type": "categorical"},
    {"key": "num_date", "title": "Sampling Date", "type": "continuous"},
    {"key": "region", "title": "Region", "type": "categorical"},
    {"key": "country", "title": "Country", "type": "categorical"},
    {"key": "author", "title": "Authors", "type": "categorical"},
    {"key": "host", "title": "Host Species", "type": "categorical"}
  ],
  "geo_resolutions": ["region", "country"],
  "maintainers": [
    {"name": "Marcela Torres", "url": "https://github.com/NW-PaGe/WNV-nextstrain"},
    {"name": "NW-PaGe", "url": "https://github.com/NW-PaGe/WNV-nextstrain"},
    {"name": "Nextstrain Team", "url": "https://next.nextstrain.org/"}
  ],
  "filters": ["region", "country", "author", "host"],
  "display_defaults": {
    "color_by": "region",
    "map_triplicate": true,
    "geo_resolution": "country"
  }
}
29 changes: 12 additions & 17 deletions phylogenetic/defaults/config.yaml
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
strain_id_field: "accession"
# Use 'NY99' as the reference since it should be basel to the USA sequences
reference: "defaults/reference.gb"
# Use 'IS88' as the root strain on the phylogenetic tree
# To place samples within the global context
root: "AF481864"
# Use 'Egypt 1951' as the reference and root, following Mencattelli et al, 2023
# https://www.nature.com/articles/s41467-023-42185-7
reference: "defaults/reference_global.gb"
root: "AF260968"

# Sequences must be FASTA and metadata must be TSV
# Both files must be zstd compressed
Expand Down Expand Up @@ -66,22 +65,18 @@ input_sequences: "data/sequences.fasta"
# This command excludes all strains by default and then forces the inclusion of
# the strains selected by the subsampling logic defined above.
subsampling:
state: --query "state == 'WA'" --min-length '9800' --subsample-max-sequences 5000
neighboring_state: --query "state in ['CA', 'ID', 'OR', 'NV']" --group-by state year --min-length '9800' --subsample-max-sequences 5000
region: --query "state in ['AZ','NM', 'CO', 'UT', 'WY', 'MT']" --group-by state year --min-length '9800' --subsample-max-sequences 5000
country: --query "country == 'USA' and state not in ['WA', 'CA', 'ID', 'OR', 'NV','AZ','NM', 'CO', 'UT', 'WY', 'MT'] and accession != 'NC_009942'" --group-by state year --subsample-max-sequences 300 --min-length '9800'
force_include: --exclude-all --include ../nextclade/defaults/include.txt
#global: --query "country != 'USA'" --group-by country year --subsample-max-sequences 200
region: --query "is_lab_host != 'true'" --query-columns is_lab_host:str --min-length '9800' --group-by region year --subsample-max-sequences 3000 --exclude defaults/exclude.txt
force_include: --exclude-all --include defaults/include.txt

refine:
treetime_params: --coalescent opt --date-inference marginal --date-confidence

traits:
metadata_columns: [
'country',
'division',
'location',
'clade_membership',
'host'
'region',
'country'
]

export:
description: "defaults/description.md"
auspice_config: "defaults/auspice_config.json"
auspice_config: "defaults/auspice_config_global.json"
55 changes: 55 additions & 0 deletions phylogenetic/defaults/exclude.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Strains excluded from the global build, one accession per line.
# Text after '#' on a line is a comment giving the reason for the exclusion.
# Each accession is listed once; FV537222-FV537225 are PAT records that also
# sit in the "clusters below PAT FV537222" group.
HW816192 # 11029 bp PAT 27-MAY-2015
CS543188 # 11029 bp PAT 20-APR-2007
CS568914 # 11029 bp PAT 18-MAY-2007
CS568916 # 11029 bp PAT 18-MAY-2007
CS568917 # 11029 bp PAT 18-MAY-2007
CS568918 # 11029 bp PAT 18-MAY-2007
CS568919 # 11029 bp PAT 18-MAY-2007
FV537222 # 10962 bp PAT 18-MAR-2010
FV537223 # 10962 bp PAT 18-MAR-2010
FV537224 # 10962 bp PAT 18-MAR-2010
FV537225 # 10962 bp PAT 18-MAR-2010
LQ460608 # 8839 bp PAT 06-OCT-2016
LQ564350 # 8839 bp PAT 06-OCT-2016
LY683288 # 11062 bp PAT 04-DEC-2019
MA388207 # 8839 bp PAT 30-OCT-2018
HC467807 # 11029 bp PAT 21-APR-2010
HH961658 # 10975 bp PAT 31-OCT-2010
HH961659 # 11029 bp PAT 31-OCT-2010
HV572312 # 11029 bp PAT 31-MAY-2012
OP846974 # Suspected recombinant sequences from Mencattelli et al, 2023 https://www.nature.com/articles/s41467-023-42185-7
OK239667 # Suspected recombinant sequences from Mencattelli et al, 2023 https://www.nature.com/articles/s41467-023-42185-7
OM202920 # Clusters below PAT FV537222
OM202936 # Clusters below PAT FV537222
OM202914 # Clusters below PAT FV537222
OM202933 # Clusters below PAT FV537222
OM202907 # Clusters below PAT FV537222
OK573263 # Clusters below PAT FV537222
OK573278 # Clusters below PAT FV537222
OM202917 # Clusters below PAT FV537222
OM202919 # Clusters below PAT FV537222
OM202910 # Clusters below PAT FV537222
OM202911 # Clusters below PAT FV537222
OM202922 # Clusters below PAT FV537222
OK573272 # Clusters below PAT FV537222
OK573262 # Clusters below PAT FV537222
OK573279 # Clusters below PAT FV537222
OK573269 # Clusters below PAT FV537222
OM202923 # Clusters below PAT FV537222
OM202906 # Clusters below PAT FV537222
OM202909 # Clusters below PAT FV537222
OM202930 # Clusters below PAT FV537222
OM202929 # Clusters below PAT FV537222
OM202904 # Clusters below PAT FV537222
OM202913 # Clusters below PAT FV537222
OM202908 # Clusters below PAT FV537222
OM202915 # Clusters below PAT FV537222
OM202912 # Clusters below PAT FV537222
OK572999 # Clusters below PAT FV537222
OK573277 # Clusters below PAT FV537222
OM202905 # Clusters below PAT FV537222
OM202932 # Clusters below PAT FV537222
70 changes: 70 additions & 0 deletions phylogenetic/defaults/include.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
AF260968 # Egypt 1951
NC_001563 # Lineage 2 reference
NC_009942 # Lineage 1 reference
HM051416 # Israel 1953
GQ851607 # Nigeria 1965
GQ851606 # Senegal 1979
AF481864 # pre-NY
MH166901 # NY99
MH166903 # NY99
MH166904 # NY99
KX547395 # NY99
KX547519 # NY99
KX547602 # NY99
HM488130 # NY99
HM488132 # NY99
HQ671707 # NY99
AF202541 # NY99
AF206518 # NY99
HM488127 # NY99
HM488126 # NY99
KX547410 # WN02
KJ501434 # WN02
KX547456 # WN02
KY216155 # WN02
KX547460 # WN02
MF175829 # WN02
KX547482 # WN02
MF175827 # WN02
MF175839 # WN02
KT020853 # WN02
KX547548 # WN02
MF175863 # WN02
KX547286 # WN02
MF175873 # WN02
MF175865 # WN02
MF175831 # WN02
MF175858 # WN02
KJ501117 # SW03
KJ501120 # SW03
MF175815 # SW03
MG004533 # SW03
KF704147 # SW03
KF704153 # SW03
KR348940 # SW03
KR348937 # SW03
KX547361 # SW03
JX015523 # SW03
KR348944 # SW03
KJ501124 # SW03
KX547552 # SW03
KJ145829 # SW03
KR348981 # SW03
KJ501118 # SW03
KR348938 # SW03
KR348976 # SW03
KJ501170 # SW03
KR348993 # SW03
JQ700438 # SW03
KR348977 # SW03
KR348942 # SW03
KR348941 # SW03
KJ501121 # SW03
KJ501122 # SW03
KX547375 # SW03
KM012172 # SW03
KC333375 # SW03
KJ501222 # SW03
MG004537 # SW03
MF175866 # SW03
MG004540 # SW03
Loading