-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Global tree and separating out the WA-specific configs #30
- Loading branch information
Showing
12 changed files
with
666 additions
and
57 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
# This configuration file contains the custom configurations parameters | ||
# for the Washington State phylogenetic build with custom rules and metadata | ||
|
||
# Use 'NY99' as the reference since it should be basal to the USA sequences | ||
reference: "defaults/reference.gb" | ||
# Use 'IS98' as the root strain on the phylogenetic tree to place samples within the global context | ||
root: "AF481864" | ||
|
||
# Pull in metadata and sequences from the ingest directory after they have been annotated with Washington State-specific metadata | ||
input_metadata: "../ingest/results/metadata.tsv" | ||
input_sequences: "../ingest/results/sequences.fasta" | ||
|
||
# Subsampling groups. The final `force_include` group excludes all strains by default | ||
# and then forces the inclusion of the strains listed in its include file. | ||
subsampling: | ||
state: --query "state == 'WA'" --min-length '9800' --subsample-max-sequences 5000 | ||
neighboring_state: --query "state in ['CA', 'ID', 'OR', 'NV']" --group-by state year --min-length '9800' --subsample-max-sequences 5000 | ||
region: --query "state in ['AZ','NM', 'CO', 'UT', 'WY', 'MT']" --group-by state year --min-length '9800' --subsample-max-sequences 5000 | ||
country: --query "country == 'USA' and state not in ['WA', 'CA', 'ID', 'OR', 'NV','AZ','NM', 'CO', 'UT', 'WY', 'MT'] and accession != 'NC_009942'" --group-by state year --subsample-max-sequences 300 --min-length '9800' | ||
force_include: --exclude-all --include ../nextclade/defaults/include.txt | ||
|
||
refine: | ||
treetime_params: --coalescent opt --clock-filter-iqd 4 --date-inference marginal --date-confidence | ||
|
||
traits: | ||
metadata_columns: [ | ||
'country', | ||
'division', | ||
'location', | ||
'clade_membership', | ||
'host' | ||
] | ||
|
||
export: | ||
auspice_config: "defaults/auspice_config.json" | ||
|
||
## Custom rules to run as part of the CI automated workflow | ||
## The paths should be relative to the phylogenetic directory. | ||
custom_rules: | ||
- build-configs/washington-state/washington-state-rules.smk |
77 changes: 77 additions & 0 deletions
77
phylogenetic/build-configs/washington-state/washington-state-rules.smk
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
""" | ||
These are Washington-specific rules for the phylogenetic workflow. | ||
""" | ||
|
||
rule create_lat_longs: | ||
""" | ||
This rule creates an averaged lat_longs.tsv file from the metadata_filtered.tsv file, but this requires a USA state annotation. This rule fails on global datasets. | ||
""" | ||
input: | ||
metadata = "results/metadata_filtered.tsv" | ||
output: | ||
lat_longs = "results/lat_longs.tsv" | ||
log: | ||
"logs/lat_longs.txt", | ||
benchmark: | ||
"benchmarks/lat_longs.txt" | ||
shell: | ||
""" | ||
python ./scripts/create_lat_longs.py {input.metadata} {output.lat_longs} 2>&1 | tee {log} | ||
""" | ||
|
||
|
||
rule create_colors: | ||
input: | ||
metadata = "results/metadata_filtered.tsv" | ||
output: | ||
colors = "results/colors.tsv" | ||
log: | ||
"logs/colors.txt", | ||
benchmark: | ||
"benchmarks/colors.txt" | ||
shell: | ||
""" | ||
python ./scripts/make_colors.py {input.metadata} {output.colors} 2>&1 | tee {log} | ||
""" | ||
|
||
|
||
rule export_washington_build: | ||
""" | ||
This part of the workflow collects the phylogenetic tree and annotations to | ||
export a Nextstrain dataset. | ||
This includes incorporating the lat_longs.tsv annotation. | ||
""" | ||
input: | ||
tree = "results/tree.nwk", | ||
metadata = "results/metadata_filtered.tsv", | ||
branch_lengths = "results/branch_lengths.json", | ||
traits = "results/traits.json", | ||
nt_muts = "results/nt_muts.json", | ||
aa_muts = "results/aa_muts.json", | ||
colors = "results/colors.tsv", | ||
description = config["export"]["description"], | ||
lat_longs = "results/lat_longs.tsv", | ||
auspice_config = config["export"]["auspice_config"], | ||
output: | ||
auspice = "auspice/WNV_genome.json" | ||
log: | ||
"logs/export.txt", | ||
benchmark: | ||
"benchmarks/export.txt" | ||
shell: | ||
""" | ||
augur export v2 \ | ||
--tree {input.tree} \ | ||
--metadata {input.metadata} \ | ||
--metadata-id-columns "accession" \ | ||
--node-data {input.branch_lengths} {input.traits} {input.nt_muts} {input.aa_muts} \ | ||
--colors {input.colors} \ | ||
--lat-longs {input.lat_longs} \ | ||
--description {input.description} \ | ||
--auspice-config {input.auspice_config} \ | ||
--output {output.auspice} 2>&1 | tee {log} | ||
""" | ||
|
||
# Snakemake ruleorder directive: give export_washington_build priority over the | ||
# export rule when both rules could produce the same output. | ||
ruleorder: export_washington_build > export |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
{ | ||
"title": "Global West Nile Virus Build", | ||
"colorings": [ | ||
{"key": "gt", "title": "Genotype", "type": "categorical"}, | ||
{"key": "num_date", "title": "Sampling Date", "type": "continuous"}, | ||
{"key": "region", "title": "Region", "type": "categorical"}, | ||
{"key": "country", "title": "Country", "type": "categorical"}, | ||
{"key": "author", "title": "Authors", "type": "categorical"}, | ||
{"key": "host", "title": "Host Species", "type": "categorical"} | ||
], | ||
"geo_resolutions": [ | ||
"region", | ||
"country" | ||
], | ||
"maintainers": [ | ||
{"name": "Marcela Torres", "url": "https://github.com/NW-PaGe/WNV-nextstrain"}, | ||
{"name": "NW-PaGe", "url": "https://github.com/NW-PaGe/WNV-nextstrain"}, | ||
{"name": "Nextstrain Team", "url": "https://next.nextstrain.org/"} | ||
], | ||
"filters": [ | ||
"region", | ||
"country", | ||
"author", | ||
"host" | ||
], | ||
"display_defaults": { | ||
"color_by": "region", | ||
"map_triplicate": true, | ||
"geo_resolution": "country" | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
HW816192 # 11029 bp PAT 27-MAY-2015 | ||
CS543188 # 11029 bp PAT 20-APR-2007 | ||
CS568914 # 11029 bp PAT 18-MAY-2007 | ||
CS568916 # 11029 bp PAT 18-MAY-2007 | ||
CS568917 # 11029 bp PAT 18-MAY-2007 | ||
CS568918 # 11029 bp PAT 18-MAY-2007 | ||
CS568919 # 11029 bp PAT 18-MAY-2007 | ||
FV537222 # 10962 bp PAT 18-MAR-2010 | ||
FV537223 # 10962 bp PAT 18-MAR-2010 | ||
FV537224 # 10962 bp PAT 18-MAR-2010 | ||
FV537225 # 10962 bp PAT 18-MAR-2010 | ||
LQ460608 # 8839 bp PAT 06-OCT-2016 | ||
LQ564350 # 8839 bp PAT 06-OCT-2016 | ||
LY683288 # 11062 bp PAT 04-DEC-2019 | ||
MA388207 # 8839 bp PAT 30-OCT-2018 | ||
HC467807 # 11029 bp PAT 21-APR-2010 | ||
HH961658 # 10975 bp PAT 31-OCT-2010 | ||
HH961659 # 11029 bp PAT 31-OCT-2010 | ||
HV572312 # 11029 bp PAT 31-MAY-2012 | ||
OP846974 # Suspected recombinant sequences from Mencattelli et al, 2023 https://www.nature.com/articles/s41467-023-42185-7 | ||
OK239667 # Suspected recombinant sequences from Mencattelli et al, 2023 https://www.nature.com/articles/s41467-023-42185-7 | ||
OM202920 # Clusters below PAT FV537222 | ||
OM202936 # Clusters below PAT FV537222 | ||
OM202914 # Clusters below PAT FV537222 | ||
OM202933 # Clusters below PAT FV537222 | ||
OM202907 # Clusters below PAT FV537222 | ||
OK573263 # Clusters below PAT FV537222 | ||
FV537224 # Clusters below PAT FV537222 | ||
OK573278 # Clusters below PAT FV537222 | ||
OM202917 # Clusters below PAT FV537222 | ||
OM202919 # Clusters below PAT FV537222 | ||
OM202910 # Clusters below PAT FV537222 | ||
OM202911 # Clusters below PAT FV537222 | ||
OM202922 # Clusters below PAT FV537222 | ||
OK573272 # Clusters below PAT FV537222 | ||
OK573262 # Clusters below PAT FV537222 | ||
OK573279 # Clusters below PAT FV537222 | ||
OK573269 # Clusters below PAT FV537222 | ||
OM202923 # Clusters below PAT FV537222 | ||
OM202906 # Clusters below PAT FV537222 | ||
OM202909 # Clusters below PAT FV537222 | ||
OM202930 # Clusters below PAT FV537222 | ||
OM202929 # Clusters below PAT FV537222 | ||
OM202904 # Clusters below PAT FV537222 | ||
OM202913 # Clusters below PAT FV537222 | ||
OM202908 # Clusters below PAT FV537222 | ||
OM202915 # Clusters below PAT FV537222 | ||
OM202912 # Clusters below PAT FV537222 | ||
OK572999 # Clusters below PAT FV537222 | ||
OK573277 # Clusters below PAT FV537222 | ||
FV537225 # Clusters below PAT FV537222 | ||
OM202905 # Clusters below PAT FV537222 | ||
OM202932 # Clusters below PAT FV537222 | ||
FV537223 # Clusters below PAT FV537222 | ||
FV537222 # Clusters below PAT FV537222 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
AF260968 # Egypt 1951 | ||
NC_001563 # Lineage 2 reference | ||
NC_009942 # Lineage 1 reference | ||
HM051416 # Israel 1953 | ||
GQ851607 # Nigeria 1965 | ||
GQ851606 # Senegal 1979 | ||
AF481864 # pre-NY | ||
MH166901 # NY99 | ||
MH166903 # NY99 | ||
MH166904 # NY99 | ||
KX547395 # NY99 | ||
KX547519 # NY99 | ||
KX547602 # NY99 | ||
HM488130 # NY99 | ||
HM488132 # NY99 | ||
HQ671707 # NY99 | ||
AF202541 # NY99 | ||
AF206518 # NY99 | ||
HM488127 # NY99 | ||
HM488126 # NY99 | ||
KX547410 # WN02 | ||
KJ501434 # WN02 | ||
KX547456 # WN02 | ||
KY216155 # WN02 | ||
KX547460 # WN02 | ||
MF175829 # WN02 | ||
KX547482 # WN02 | ||
MF175827 # WN02 | ||
MF175839 # WN02 | ||
KT020853 # WN02 | ||
KX547548 # WN02 | ||
MF175863 # WN02 | ||
KX547286 # WN02 | ||
MF175873 # WN02 | ||
MF175865 # WN02 | ||
MF175831 # WN02 | ||
MF175858 # WN02 | ||
KJ501117 # SW03 | ||
KJ501120 # SW03 | ||
MF175815 # SW03 | ||
MG004533 # SW03 | ||
KF704147 # SW03 | ||
KF704153 # SW03 | ||
KR348940 # SW03 | ||
KR348937 # SW03 | ||
KX547361 # SW03 | ||
JX015523 # SW03 | ||
KR348944 # SW03 | ||
KJ501124 # SW03 | ||
KX547552 # SW03 | ||
KJ145829 # SW03 | ||
KR348981 # SW03 | ||
KJ501118 # SW03 | ||
KR348938 # SW03 | ||
KR348976 # SW03 | ||
KJ501170 # SW03 | ||
KR348993 # SW03 | ||
JQ700438 # SW03 | ||
KR348977 # SW03 | ||
KR348942 # SW03 | ||
KR348941 # SW03 | ||
KJ501121 # SW03 | ||
KJ501122 # SW03 | ||
KX547375 # SW03 | ||
KM012172 # SW03 | ||
KC333375 # SW03 | ||
KJ501222 # SW03 | ||
MG004537 # SW03 | ||
MF175866 # SW03 | ||
MG004540 # SW03 |
Oops, something went wrong.