From 0e0714ce19eb8f0505855cb93552c3164b0bc4ce Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Wed, 16 Oct 2024 13:52:50 -0700 Subject: [PATCH 1/8] Copy phylogenetic/build-configs/ci to initialize a washington-state specific build configuration Subsequent commits will place the washington state specific configs and rules in this directory. --- .../build-configs/washington-state/config.yaml | 11 +++++++++++ .../washington-state/copy_example_data.smk | 17 +++++++++++++++++ 2 files changed, 28 insertions(+) create mode 100644 phylogenetic/build-configs/washington-state/config.yaml create mode 100644 phylogenetic/build-configs/washington-state/copy_example_data.smk diff --git a/phylogenetic/build-configs/washington-state/config.yaml b/phylogenetic/build-configs/washington-state/config.yaml new file mode 100644 index 0000000..2533419 --- /dev/null +++ b/phylogenetic/build-configs/washington-state/config.yaml @@ -0,0 +1,11 @@ +# This configuration file contains the custom configurations parameters +# for the CI workflow to run with the example data. + +# Pull in metadata and sequences from the example_data directory +input_metadata: "example_data/metadata.tsv" +input_sequences: "example_data/sequences.fasta" + +## Custom rules to run as part of the CI automated workflow +## The paths should be relative to the phylogenetic directory. 
+#custom_rules: +# - build-configs/ci/copy_example_data.smk diff --git a/phylogenetic/build-configs/washington-state/copy_example_data.smk b/phylogenetic/build-configs/washington-state/copy_example_data.smk new file mode 100644 index 0000000..5a32700 --- /dev/null +++ b/phylogenetic/build-configs/washington-state/copy_example_data.smk @@ -0,0 +1,17 @@ +rule copy_example_data: + input: + sequences="example_data/sequences.fasta", + metadata="example_data/metadata.tsv", + output: + sequences="data/sequences.fasta", + metadata="data/metadata.tsv", + shell: + """ + cp -f {input.sequences} {output.sequences} + cp -f {input.metadata} {output.metadata} + """ + +# Add a Snakemake ruleorder directive here if you need to resolve ambiguous rules +# that have the same output as the copy_example_data rule. + +# ruleorder: copy_example_data > ... \ No newline at end of file From 39d468d56bcf81bdf7ca08b0cc22dab8a7a453f6 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Wed, 16 Oct 2024 14:29:22 -0700 Subject: [PATCH 2/8] Initial separation of washington state specific configs and rules This separates the washington-specific rules and configs from the new global workflow. Subsequent commits will tune global rules and configs, and rooting. 
The washington-specific workflow can run via: nextstrain build phylogenetic --configfile build-configs/washington-state/config.yaml --- .../washington-state/config.yaml | 38 +++++++-- .../washington-state/copy_example_data.smk | 17 ---- .../washington-state-rules.smk | 77 +++++++++++++++++++ phylogenetic/defaults/config.yaml | 14 +--- phylogenetic/rules/export.smk | 4 - phylogenetic/rules/prepare_sequences.smk | 28 ------- 6 files changed, 112 insertions(+), 66 deletions(-) delete mode 100644 phylogenetic/build-configs/washington-state/copy_example_data.smk create mode 100644 phylogenetic/build-configs/washington-state/washington-state-rules.smk diff --git a/phylogenetic/build-configs/washington-state/config.yaml b/phylogenetic/build-configs/washington-state/config.yaml index 2533419..0b91a50 100644 --- a/phylogenetic/build-configs/washington-state/config.yaml +++ b/phylogenetic/build-configs/washington-state/config.yaml @@ -1,11 +1,37 @@ # This configuration file contains the custom configurations parameters -# for the CI workflow to run with the example data. +# for the Washington State phylogenetic build with custom rules and metadata -# Pull in metadata and sequences from the example_data directory -input_metadata: "example_data/metadata.tsv" -input_sequences: "example_data/sequences.fasta" +# Use 'NY99' as the reference since it should be basel to the USA sequences +reference: "defaults/reference.gb" +# Use 'IS88' as the root strain on the phylogenetic tree to place samples within the global context +root: "AF481864" + +# Pull in metadata and sequences from the ingest directory after it has been annotated with washington-state specific metadata +input_metadata: "../ingest/results/metadata.tsv" +input_sequences: "../ingest/results/sequences.fasta" + +# This command excludes all strains by default and then forces the inclusion of +# the strains selected by the subsampling logic defined above. 
+subsampling: + state: --query "state == 'WA'" --min-length '9800' --subsample-max-sequences 5000 + neighboring_state: --query "state in ['CA', 'ID', 'OR', 'NV']" --group-by state year --min-length '9800' --subsample-max-sequences 5000 + region: --query "state in ['AZ','NM', 'CO', 'UT', 'WY', 'MT']" --group-by state year --min-length '9800' --subsample-max-sequences 5000 + country: --query "country == 'USA' and state not in ['WA', 'CA', 'ID', 'OR', 'NV','AZ','NM', 'CO', 'UT', 'WY', 'MT'] and accession != 'NC_009942'" --group-by state year --subsample-max-sequences 300 --min-length '9800' + force_include: --exclude-all --include ../nextclade/defaults/include.txt + +traits: + metadata_columns: [ + 'country', + 'division', + 'location', + 'clade_membership', + 'host' + ] + +export: + auspice_config: "defaults/auspice_config.json" ## Custom rules to run as part of the CI automated workflow ## The paths should be relative to the phylogenetic directory. -#custom_rules: -# - build-configs/ci/copy_example_data.smk +custom_rules: + - build-configs/washington-state/washington-state-rules.smk diff --git a/phylogenetic/build-configs/washington-state/copy_example_data.smk b/phylogenetic/build-configs/washington-state/copy_example_data.smk deleted file mode 100644 index 5a32700..0000000 --- a/phylogenetic/build-configs/washington-state/copy_example_data.smk +++ /dev/null @@ -1,17 +0,0 @@ -rule copy_example_data: - input: - sequences="example_data/sequences.fasta", - metadata="example_data/metadata.tsv", - output: - sequences="data/sequences.fasta", - metadata="data/metadata.tsv", - shell: - """ - cp -f {input.sequences} {output.sequences} - cp -f {input.metadata} {output.metadata} - """ - -# Add a Snakemake ruleorder directive here if you need to resolve ambiguous rules -# that have the same output as the copy_example_data rule. - -# ruleorder: copy_example_data > ... 
\ No newline at end of file diff --git a/phylogenetic/build-configs/washington-state/washington-state-rules.smk b/phylogenetic/build-configs/washington-state/washington-state-rules.smk new file mode 100644 index 0000000..f2a1c77 --- /dev/null +++ b/phylogenetic/build-configs/washington-state/washington-state-rules.smk @@ -0,0 +1,77 @@ +""" +These are washington specific rules for the phylogenetic workflow. +""" + +rule create_lat_longs: + """ + This rule creates an averaged lat_longs.tsv file from the metadata_filtered.tsv file, but this requires a USA state annotation. This rule fails on global datasets. + """ + input: + metadata = "results/metadata_filtered.tsv" + output: + lat_longs = "results/lat_longs.tsv" + log: + "logs/lat_longs.txt", + benchmark: + "benchmarks/lat_longs.txt" + shell: + """ + python ./scripts/create_lat_longs.py {input.metadata} {output.lat_longs} 2>&1 | tee {log} + """ + + +rule create_colors: + input: + metadata = "results/metadata_filtered.tsv" + output: + colors = "results/colors.tsv" + log: + "logs/colors.txt", + benchmark: + "benchmarks/colors.txt" + shell: + """ + python ./scripts/make_colors.py {input.metadata} {output.colors} 2>&1 | tee {log} + """ + + +rule export_washington_build: + """ + This part of the workflow collects the phylogenetic tree and annotations to + export a Nextstrain dataset. + This includes incorporating the lat_long.tsv annotation. 
+ """ + input: + tree = "results/tree.nwk", + metadata = "results/metadata_filtered.tsv", + branch_lengths = "results/branch_lengths.json", + traits = "results/traits.json", + nt_muts = "results/nt_muts.json", + aa_muts = "results/aa_muts.json", + colors = "results/colors.tsv", + description = config["export"]["description"], + lat_longs = "results/lat_longs.tsv", + auspice_config = config["export"]["auspice_config"], + output: + auspice = "auspice/WNV_genome.json" + log: + "logs/export.txt", + benchmark: + "benchmarks/export.txt" + shell: + """ + augur export v2 \ + --tree {input.tree} \ + --metadata {input.metadata} \ + --metadata-id-columns "accession" \ + --node-data {input.branch_lengths} {input.traits} {input.nt_muts} {input.aa_muts} \ + --colors {input.colors} \ + --lat-longs {input.lat_longs} \ + --description {input.description} \ + --auspice-config {input.auspice_config} \ + --output {output.auspice} 2>&1 | tee {log} + """ + +# Add a Snakemake ruleorder directive here if you need to resolve ambiguous rules +# that have the same output as the copy_example_data rule. +ruleorder: export_washington_build > export \ No newline at end of file diff --git a/phylogenetic/defaults/config.yaml b/phylogenetic/defaults/config.yaml index 8330df5..a5a7b95 100644 --- a/phylogenetic/defaults/config.yaml +++ b/phylogenetic/defaults/config.yaml @@ -66,20 +66,12 @@ input_sequences: "data/sequences.fasta" # This command excludes all strains by default and then forces the inclusion of # the strains selected by the subsampling logic defined above. 
subsampling: - state: --query "state == 'WA'" --min-length '9800' --subsample-max-sequences 5000 - neighboring_state: --query "state in ['CA', 'ID', 'OR', 'NV']" --group-by state year --min-length '9800' --subsample-max-sequences 5000 - region: --query "state in ['AZ','NM', 'CO', 'UT', 'WY', 'MT']" --group-by state year --min-length '9800' --subsample-max-sequences 5000 - country: --query "country == 'USA' and state not in ['WA', 'CA', 'ID', 'OR', 'NV','AZ','NM', 'CO', 'UT', 'WY', 'MT'] and accession != 'NC_009942'" --group-by state year --subsample-max-sequences 300 --min-length '9800' - force_include: --exclude-all --include ../nextclade/defaults/include.txt - #global: --query "country != 'USA'" --group-by country year --subsample-max-sequences 200 + region: --query "is_lab_host != 'true'" --query-columns is_lab_host:str --min-length '9800' --group-by region year --subsample-max-sequences 3000 traits: metadata_columns: [ - 'country', - 'division', - 'location', - 'clade_membership', - 'host' + 'region', + 'country' ] export: diff --git a/phylogenetic/rules/export.smk b/phylogenetic/rules/export.smk index e431b90..41b89d0 100644 --- a/phylogenetic/rules/export.smk +++ b/phylogenetic/rules/export.smk @@ -34,9 +34,7 @@ rule export: traits = "results/traits.json", nt_muts = "results/nt_muts.json", aa_muts = "results/aa_muts.json", - colors = "results/colors.tsv", description = config["export"]["description"], - lat_longs = "results/lat_longs.tsv", auspice_config = config["export"]["auspice_config"], output: auspice = "auspice/WNV_genome.json" @@ -51,8 +49,6 @@ rule export: --metadata {input.metadata} \ --metadata-id-columns "accession" \ --node-data {input.branch_lengths} {input.traits} {input.nt_muts} {input.aa_muts} \ - --colors {input.colors} \ - --lat-longs {input.lat_longs} \ --description {input.description} \ --auspice-config {input.auspice_config} \ --output {output.auspice} 2>&1 | tee {log} diff --git a/phylogenetic/rules/prepare_sequences.smk 
b/phylogenetic/rules/prepare_sequences.smk index fd66b22..b38265b 100644 --- a/phylogenetic/rules/prepare_sequences.smk +++ b/phylogenetic/rules/prepare_sequences.smk @@ -51,34 +51,6 @@ rule decompress: """ -rule create_colors: - input: - metadata = "results/metadata_filtered.tsv" - output: - colors = "results/colors.tsv" - log: - "logs/colors.txt", - benchmark: - "benchmarks/colors.txt" - shell: - """ - python ./scripts/make_colors.py {input.metadata} {output.colors} 2>&1 | tee {log} - """ - -rule create_lat_longs: - input: - metadata = "results/metadata_filtered.tsv" - output: - lat_longs = "results/lat_longs.tsv" - log: - "logs/lat_longs.txt", - benchmark: - "benchmarks/lat_longs.txt" - shell: - """ - python ./scripts/create_lat_longs.py {input.metadata} {output.lat_longs} 2>&1 | tee {log} - """ - rule align: input: sequences = "results/sequences_filtered.fasta", From bc2e2910c2a215b71c5d9ffeac1c92f89a3f7ae5 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Wed, 16 Oct 2024 15:29:00 -0700 Subject: [PATCH 3/8] Root the global tree with AF260968 Use AF260968 ('Egypt 1951') as the global reference and root, following Mencattelli et al, 2023 https://www.nature.com/articles/s41467-023-42185-7 --- phylogenetic/defaults/config.yaml | 9 +- phylogenetic/defaults/reference_global.gb | 238 ++++++++++++++++++++++ 2 files changed, 242 insertions(+), 5 deletions(-) create mode 100644 phylogenetic/defaults/reference_global.gb diff --git a/phylogenetic/defaults/config.yaml b/phylogenetic/defaults/config.yaml index a5a7b95..5f7bf54 100644 --- a/phylogenetic/defaults/config.yaml +++ b/phylogenetic/defaults/config.yaml @@ -1,9 +1,8 @@ strain_id_field: "accession" -# Use 'NY99' as the reference since it should be basel to the USA sequences -reference: "defaults/reference.gb" -# Use 'IS88' as the root strain on the phylogenetic tree -# To place samples within the global context -root: "AF481864" +# Use 'Egypt 1951' as the reference and root, following Mencattelli et al, 2023 +# 
https://www.nature.com/articles/s41467-023-42185-7 +reference: "defaults/reference_global.gb" +root: "AF260968_REF" # Sequences must be FASTA and metadata must be TSV # Both files must be zstd compressed diff --git a/phylogenetic/defaults/reference_global.gb b/phylogenetic/defaults/reference_global.gb new file mode 100644 index 0000000..61356c7 --- /dev/null +++ b/phylogenetic/defaults/reference_global.gb @@ -0,0 +1,238 @@ +LOCUS AF260968_REF 11029 bp RNA linear VRL 27-AUG-2000 +DEFINITION West Nile virus strain Eg101, complete genome. +ACCESSION AF260968_REF +VERSION AF260968.1 +KEYWORDS . +SOURCE West Nile virus (WNV) + ORGANISM West Nile virus + Viruses; Riboviria; Orthornavirae; Kitrinoviricota; Flasuviricetes; + Amarillovirales; Flaviviridae; Orthoflavivirus; Orthoflavivirus + nilense. +REFERENCE 1 (bases 1 to 11029) + AUTHORS Bowen,M., Meyer,R.F., McKinney,N., Morrill,W. and Lanciotti,R. + TITLE Complete genomic sequence of West Nile virus strain Eg101 + JOURNAL Unpublished +REFERENCE 2 (bases 1 to 11029) + AUTHORS Bowen,M., Meyer,R.F., McKinney,N., Morrill,W. and Lanciotti,R. 
+ TITLE Direct Submission + JOURNAL Submitted (27-APR-2000) Arbovirus Diseases Branch, Centers for + Disease Control & Prevention, Rampart Road, Fort Collins, CO 80521, + USA +FEATURES Location/Qualifiers + source 1..11029 + /organism="West Nile virus" + /mol_type="genomic RNA" + /strain="Eg101" + /db_xref="taxon:11082" + CDS 97..10398 + /codon_start=1 + /product="polyprotein precursor" + /protein_id="AAG02039.1" + CDS 97..465 + /gene="capsid" + CDS 466..741 + /gene="prM" + CDS 742..966 + /gene="M" + CDS 967..2469 + /gene="env" + CDS 2470..3525 + /gene="NS1" + CDS 3526..4218 + /gene="NS2A" + CDS 4219..4611 + /gene="NS2B" + CDS 4612..6468 + /gene="NS3" + CDS 6469..6915 + /gene="NS4A" + CDS 6916..7680 + /gene="NS4B" + CDS 7681..10395 + /gene="NS5" +ORIGIN + 1 agtagttcgc ctgtgtgagc tgacaaactt agtagtgttt gtgaggatta acaacaatta + 61 acacggtgcg agctgtttct tagcacgaag atctcgatgt ctaagaaacc aggagggccc + 121 ggcaagagcc gggctgtcaa tatgctaaaa cgcggaatgc cccgcgtgtt gtccttgatt + 181 ggactgaaga gggcaatgtt gagcctgatc gacggcaagg gaccaatacg atttgtgttg + 241 gctctcttgg cgttcttcag gttcacagca attgctccga cccgagcagt gctggatcga + 301 tggagaggtg tgaacaaaca aacagcgatg aaacaccttc tgagttttaa gaaggaacta + 361 gggaccttga ccagtgctat caatcggcgg agctcaaaac aaaagaaaag aggaggaaag + 421 accggaattg cagtcatgat tggcttgatc gccagcgtgg gagcagttac cctctctaac + 481 ttccaaggga aggtgatgat gactgtaaat gccactgacg tcacagacgt catcacgatt + 541 ccaacagctg ctggaaagaa tctatgcatt gtcagagcaa tggacgtggg gtacatgtgt + 601 gatgatacta tcacctatga atgtccagtg ctgtcggctg gtaatgatcc agaagacatc + 661 gactgttggt gcacaaaatc agcagtctac gtcaggtatg gaagatgcac caagacacgc + 721 cactcaagac gtagccggag gtcactgaca gtgcagacac atggagaaag cactctagcg + 781 aacaagaagg gggcttggat ggacagcacc aaggctacaa ggtatttggt aaaaacagaa + 841 tcatggatct tgaggaaccc cggatatgcc ctggtggcag ccgtcattgg ttggatgctt + 901 ggaagcaaca ccatgcagcg agttgtgttc gttgtgctac tgctcttggt ggctccagcc + 961 tacagcttta actgccttgg aatgagcaac agagacttct tagagggagt gtctggagca + 1021 
acatgggtgg atttggttct cgaaggcgac agctgtgtga ccatcatgtc taaggacaag + 1081 cctaccatcg atgtgaagat gatgaatatg gaggccgcca acctggcaga ggtccgcagt + 1141 tattgctatc tggccaccgt cagcgatctc tccaccaaag ctgcgtgccc gactatggga + 1201 gaagctcaca atgacaaacg tgctgaccca gcttttgtgt gtaaacaagg agtagtggac + 1261 aggggttggg gcaacggctg tggactattt ggtaaaggaa gcattgacac atgcgccaaa + 1321 tttgcctgtt ctaccaaggc aacaggaaga accattctga aagagaacat caagtacgaa + 1381 gtggctatct ttgtccatgg accaaccact gtggagtcgc atggaaacta ccccacacag + 1441 attggggcca ctcaggcagg gagattcagc atcactcctg cggcgccttc atacacacta + 1501 aaacttggag agtatggaga ggtgacggtg gactgtgaac cacgatcagg gattgacacc + 1561 aatgcatact acgtgatgac tgtcggaaca aagacgttct tggtccatcg tgagtggttt + 1621 atggacctca acctcccctg gagcagtgcc ggaagcactg tgtggaggaa cagagagacg + 1681 ttgatggagt ttgaggaacc acacgccacg aagcagtctg tgatagcatt gggctcacaa + 1741 gagggagctc tgcatcaagc tttggctgga gccattcctg tggaattttc aagcaacact + 1801 gtcaagttga catcgggtca tttgaagtgt agagtgaaga tggaaaaatt gcagttgaag + 1861 ggaacaacct acggcgtctg ttcaaaggct ttcaagtttc ttggaactcc cgcagacaca + 1921 ggccacggca ctgtagtgtt ggaattgcag tacactggca cggatggacc ttgcaaagtt + 1981 cccatctcgt cagtggcttc attgaacgac ctaacgccag tgggcaggtt ggtcactgtc + 2041 aacccctttg tttcagtagc cacggccaat gccaaggtcc tgattgaatt ggaaccaccc + 2101 tttggagact catacatagt ggtgggcaga ggagaacaac agattaatca ccattggcac + 2161 aagtctggaa gcagcattgg caaagccttc acaaccaccc tcaaaggggc gcagagatta + 2221 gccgccctag gagatacagc ttgggacttt ggatcagttg gaggggtgtt cacctcagtg + 2281 gggaaggctg tccatcaagt gtttggtgga gcattccgct cactgttcgg aggcatgtct + 2341 tggataacgc aaggattgct gggggctctg ctgttgtgga tgggcatcaa tgctcgtgac + 2401 aggtccatag ctctcacgtt tctcgcagtt ggaggggttt tgctctttct ctccgtgaac + 2461 gtgcacgctg acactggatg tgccatagac atcagccggc aggagctgag atgtggaagt + 2521 ggagtgttca tacacaatga tgtggaggct tggatggacc ggtacaagta ctaccctgaa + 2581 acgccacaag gcctagccaa gatcattcaa aaagcccaca aagaaggagt gtgcggtcta + 2641 cggtcggttt ccagactgga 
gcaccaaatg tgggaagcgg tgaaggacga gctaaacact + 2701 cttttgaaag agaatggtgt ggacctcagt gttgtggttg agaaacagga gggaatgtac + 2761 aagtcagcac ctaaacgtct caccgctacc acggaaaaat tggaaatagg ctggaaggcc + 2821 tggggaaaga gcatcctatt cgcaccagaa ttggccaaca acacttttgt ggttgatggt + 2881 ccggagacca aggaatgccc aactcagaat cgcgcttgga acagcttgga agtagaggat + 2941 tttggatttg gtctcaccag tacccggatg ttcctgaagg tcagagagag caacacaact + 3001 gaatgtgact caaagatcat cggaacggct gtcaagaaca acttggcgat ccacagtgac + 3061 ctgtcctatt ggattgaaag caggcttaat gatacgtgga agcttgaaag ggcggtcctg + 3121 ggtgaagtta aatcatgcac ttggcctgaa acgcacactt tgtggggtga aggaatcctc + 3181 gagagtgact tgataatacc agtcacactg gcgggaccac gaagcaacca caatcggaga + 3241 cctgggtaca agacacaaaa ccagggccca tgggacgaag gccgggtaga gattgatttc + 3301 gattactgcc caggaacgac ggtcaccctg agtgagagct gcggacaccg tggacctgcc + 3361 actcgcacca ccacagagag cggaaagctg ataacggact ggtgctgcag gagctgcacc + 3421 ttaccaccat tgcgctacca gacggacagc ggttgttggt atggtatgga gattagacca + 3481 cagaggcatg atgaaaagac ccttgtgcag tcacaagtga atgcttacaa cgctgatatg + 3541 attgatcctt ttcagctggg ccttctggtc gtgttcttgg ccacccagga ggtccttcgc + 3601 aagaggtgga cagccaagat cagcatgcca gctatactga ttgctctgct agtcctggtg + 3661 tttgggggca ttacttacac tgacgtgtta cgctatgtca tcttagtggg agcagctttc + 3721 gcagaatcca attcgggagg agacgtggta cacttggcgc tcatggcgac cttcaagata + 3781 caaccagtgt ttatggtggc atcgtttctc aaagcgagat ggaccaacca ggagaatatc + 3841 ttgttgatgt tggcggctgt tttctttcaa atggcttacc atgacgctcg ccaaattctg + 3901 ctttgggaga tccctgatgt gttgaattca ttggcagtag cttggatgat actgagagcc + 3961 ataaccttta caacaacatc aaacgtggtt gttccgctgc tagctctgtt aacacccgga + 4021 ctgagatgct tgaatctgga tgtgtacagg atcctgctat tgatggtcgg aataggcagc + 4081 ttgatcagag agaagagaag cgcagctgca aaaaagaaag gagcaagtct gttatgcctg + 4141 gctctagcct caacaggact tttcaaccct atgatcctcg ccgctggact cattgcatgt + 4201 gatcccaacc gtaaacgagg atggcccgca actgaagtga tgactgctgt cggcctgatg + 4261 tttgccattg tcggagggct ggcagagctt gacattgact 
ccatggccat tccaatgacc + 4321 atcgcagggc tcatgtttgc tgccttcgtg atatctggga aatcaacaga tatgtggatc + 4381 gagaggacgg cggacatctc ctgggaaagt gatgcggaaa ttacaggctc gagcgagaga + 4441 gttgatgtgc ggcttgatga tgacggaaat ttccagctca tgaatgatcc aggagcacct + 4501 tggaagatat ggatgctcag aatggcttgc ctcgcgatta gtgcgtacac cccttgggca + 4561 atcctgccct cagtagttgg attttggata actctccaat acacaaagag aggaggtgtg + 4621 ctgtgggaca ctccctcacc aaaggagtac aaaaaagggg acacgaccac tggcgtctac + 4681 aggatcatga ctcgtgggct gctcggcagt tatcaagcag gagcgggcgt gatggttgaa + 4741 ggggttttcc acaccctttg gcatacaaca aaaggagccg ctctgatgag cggggaaggc + 4801 cgcctggacc catactgggg tagtgtcaaa gaggatcgac tttgctacgg aggaccctgg + 4861 aaattgcagc acaagtggaa tgggcaggat gaggtgcaaa tgattgtggt ggaacctggc + 4921 aagaacgtta aaaacgtcca gacgaaacca ggggtgttca aaacacctga aggagaaatt + 4981 ggggccgtga ctctggactt ccccactgga acatcaggct caccaatagt ggacaaaaac + 5041 ggtgatgtga tcgggctcta tggcaatgga gtcataatgc ccaacggctc atacataagc + 5101 gcgatagtgc agggtgaaag gatggatgag ccgatcccag ccggattcga acctgagatg + 5161 ctgaggaaaa aacagatcac agttctggac cttcatcccg gtgctggtaa aacaaggagg + 5221 atactgccac agatcatcaa agaggccata aatagaagat tgagaacggc cgtgctagca + 5281 ccaactaggg ttgtagccgc tgagatggct gaagccctga gaggactgcc catccggtat + 5341 cagacatctg cagtgcccag agaacacaat ggaaatgaga ttgttgatgt catgtgccat + 5401 gccactctca ctcacaggct gatgtctcct cacagggtgc cgaactacaa tcttttcgtg + 5461 atggatgagg ctcattttac cgacccagct agcattgcag caaggggtta tatttccaca + 5521 aaagtcgagc tgggggaggc ggcggcaata ttcatgacag ctaccccacc aggcacttca + 5581 gacccattcc cagagtccaa ttcacctatt tctgacttgc agactgagat cccagatcgg + 5641 gcctggaact ctgggtacga atggattaca gaatacattg ggaaaacggt ttggtttgtg + 5701 cccagtgtga aaatggggaa tgagattgcc ctttgtctac aacgtgccgg caaaaaagta + 5761 gtccaactga acagaaagtc gtatgagacg gagtacccaa agtgcaagaa cgatgattgg + 5821 gactttgtta tcacaacaga catatctgaa atgggggcta acttcaaggc gagcagggtg + 5881 attgacagca ggaagagtgt gaaaccaacc atcatcacgg aaggagaagg gagggtgatc + 5941 
ctgggagaac catccgctgt gacagcagct agtgcagccc aaagacgtgg acgcatcggt + 6001 aggaatccat cgcaagttgg tgatgagtac tgctatgggg ggcacacgaa tgaagacgac + 6061 tcgaacttcg cccattggac tgaggcacga atcatgctgg acaacatcaa catgccaaac + 6121 ggactgatcg ctcaattcta ccaaccagag cgtgaaaagg tatacaccat ggatggagaa + 6181 taccgactca gaggagaaga gaggaaaaac tttctggaat tattgaggac tgcagatctg + 6241 ccagtttggc tggcttacaa ggtggcagcg gctggagtgt cataccacga tcggagatgg + 6301 tgttttgatg gccctaggac aaacacaatt ctagaagaca acaacgaagt ggaagtcatt + 6361 acgaagcttg gtgaaagaaa gattctgagg ccgcgctgga ttgacgccag ggtgtactcg + 6421 gatcatcagg cattaaaggc gttcaaggac tttgcttcgg gaaagcgttc tcagataggg + 6481 ctcattgagg ttctgggaaa gatgcctgag cacttcatgg ggaagacatg ggaagcactt + 6541 gacaccatgt atgttgtggc caccgcagag aaagggggaa gagctcacag aatggccttg + 6601 gaggaactgc cagatgctct ccagacaatt gccctgattg ccttattgag tgtgatgacc + 6661 atgggagtat tcttcctcct catgcagcgg aagggcattg gaaagatagg tttgggaggc + 6721 gttgtcctgg gagtcgcaac cttcttttgt tggatggctg aagttccagg aacgaagatc + 6781 gccggaatgt tgctgctttc ccttctcttg atgattgtgc taatccctga gccagagaag + 6841 caacgttcgc agacagacaa ccagctagcc gtgttcctga tttgtgtgtt gaccctcgtg + 6901 agcgcagtgg cagccaacga aatgggttgg ctggacaaga ccaagaatga tataagcagt + 6961 ttgtttgggc aaagaattga ggccaaggag aatttcagta tgggagagtt tctcctggac + 7021 ttgagaccgg caacagcctg gtcactgtat gctgtgacca cagcggttct cactccactg + 7081 ctaaagcatc tgatcacgtc agattacatc aacacttcat tgacctcaat caatgttcaa + 7141 gcaagtgcac tattcacact cgcgcgaggc ttcccctttg tcgatgttgg agtgtcggct + 7201 ctcctgctag cagccggatg ctggggacaa gtcaccctca ccgtgacggt gacagcggca + 7261 acactcctgt tctgccacta cgcctacatg gttcccggat ggcaggctga ggcaatgcgc + 7321 tcagcccagc ggcggacagc ggctggaatc atgaaaaacg ctgtagtgga tggcatcgtg + 7381 gccacggacg tcccagaatt agagcgcacc acacccatca tgcagaagaa agttgggcaa + 7441 atcatgctga tcttggtgtc tctagctgca gtagtagtga acccgtctgt gaagacagtg + 7501 cgagaagccg gaattctgat cacggcagca gcggtgacac tctgggagaa tggagcaagc + 7561 tctgtttgga atgcaacaac 
tgccatcgga ctctgccaca tcatgcgtgg gggttggttg + 7621 tcatgcttat ccataacatg gacactcata aagaacatgg aaaaaccagg actaaaaaga + 7681 ggtggggcaa agggacgcac cttgggagag gtttggaaag aaagactcaa ccagatgaca + 7741 aaagaagagt tcactaggta ccgcaaagag gccatcatcg aagtcgatcg ctcagcagca + 7801 aaacacgcca ggaaagaagg caatgtcact ggagggcatc cagtctctag aggcacagca + 7861 aagctgagat ggctggtcga gcggaggttt ctcgaaccgg tcggaaaagt gattgacctt + 7921 ggatgtggaa gaggcggttg gtgttactac atggcaaccc aaaaaagagt ccaagaggtc + 7981 agagggtaca caaagggtgg tcccggacat gaagagcccc aactggtgca aagttatgga + 8041 tggaacattg tcaccatgaa gagcggagtg gatgtgttct acagaccttc tgagtgctgc + 8101 gataccctcc tttgtgacat cggagagtct tcatcaagtg ctgaggttga agagcatagg + 8161 acgatccggg tccttgaaat ggttgaggac tggctgcacc gagggccaaa ggaattttgt + 8221 gtgaaggtgc tctgccccta tatgccaaaa gtcatagaaa agatggagct gctccagcgc + 8281 cggtatgggg ggggactggt cagaaaccca ctctcgcgga attccacgca cgagatgtat + 8341 tgggtaagtc gagcttcggg caatgtggta cactcagtga acatgaccag ccaggtgctt + 8401 ctgggaagaa tggagaaaag gacctggaag ggaccccaat acgaggaaga tgtgaacttg + 8461 ggaagtggaa ccagggcggt gggaaaaccc ctactcaact cagacactag taaaatcaag + 8521 aacaggattg aacgactcag gcgtgagtac agttcgacgt ggcaccacga tgagaaccac + 8581 ccatatagaa cctggaacta tcacggcagt tatgatgtga aacctacagg ctccgccagc + 8641 tcgctggtca atggagtggt taggctcctc tcaaaaccat gggacaccat cacgaacgtt + 8701 accaccatgg ccatgactga cactactccc ttcggacagc agcgggtgtt taaagagaag + 8761 gtggacacga aagctcctga accgccagaa ggagtgaagt atgtgctcaa tgaaaccacc + 8821 aactggttgt gggcgtttct ggccagagaa aaacgtccca gaatgtgctc tcgagaggaa + 8881 ttcataaaaa aggtcaatag caatgcagct ctgggtgcca tgtttgaaga gcagaaccaa + 8941 tggaggagcg ccagagaagc agttgaggat ccaaaatttt gggagatggt ggatgaggag + 9001 cgcgaggcac acctgcgggg ggaatgtcac acttgcatct acaacatgat ggggaagaga + 9061 gagaagaaac ctggagagtt cggaaaggct aagggaagca gagccatatg gttcatgtgg + 9121 ctcggagctc gctttctgga gttcgaagct ctgggctttc ttaacgaaga ccactggctt + 9181 ggaagaaaga actcaggagg cggggtcgag ggcttgggcc 
tccaaaaact gggttatatt + 9241 ctgcgtgaag ttggcacccg acctggaggc aagatctatg ctgatgacac agctggctgg + 9301 gacacccgca ttacgagagc tgacctggaa aatgaagcta aggttcttga gttgctggat + 9361 ggggaacatc ggcgtcttgc tagggccatc attgagctca cctatcgtca caaagttgtg + 9421 aaagtgatgc gcccggctgc tgatggaaga accgtcatgg atgtcatctc cagagaagat + 9481 cagaggggga gtggacaagt tgtcacctac gctctaaaca ccttcaccaa cctggccgtc + 9541 cagttggtga ggatgatgga aggggaagga gtgattggcc cagatgatgt ggagaaactc + 9601 acaaagggaa aaggacctaa agtcaggacc tggctgtttg agaatgggga ggaaagactc + 9661 agccgcatgg ctgtcagcgg agatgactgt gtggtaaagc ccctagatga ccgcttcgcc + 9721 acctctctcc acttcctcaa cgccatgtca aaggttcgca aagatatcca ggagtggaaa + 9781 ccgtcaactg gatggtatga ctggcagcag gttccattct gctcgaacca tttcactgaa + 9841 ttaatcatga aagatggaag aacactggtg gttccatgcc gaggacagga cgaactggta + 9901 ggcagagctc gcatttctcc aggggccgga tggaacgtcc gtgacactgc ttgtctggct + 9961 aagtcttatg cccagatgtg gctgcttctg tacttccaca gaagagacct gcggctaatg + 10021 gccaacgcca tttgctccgc tgtccctgtg aattgggtcc ctaccggaag aaccacgtgg + 10081 tccatccatg ccggagggga gtggatgaca acagaagaca tgctggaggt ctggaaccgt + 10141 gtttggatag aggagaatga atggatggaa gacaaaaccc cagtggagaa atggagtgac + 10201 gtcccatact caggaaaacg ggaggacatc tggtgtggca gcttgattgg cacaagaacc + 10261 cgagccacgt gggcagaaaa catccaggta gccatcaacc aagtcagagc aatcattgga + 10321 gatgagaagt atgtggatta catgagttca ttaaagagat atgaagacac gactttggtt + 10381 gaggacacag tactgtaaat actttattaa ttgtaaatag acaatgtaag catgtgtaaa + 10441 agtatagttt tatagtagca tttagtgatg ttagtgtaaa tagttaagaa aattttaagg + 10501 aggaagtcag gccggaaagt ttccgccacc ggaagttgag tagacggtgc tgcctgcgac + 10561 tcaaccccag gaggactggg tgaacaaagc tgcgaagtga tccatgtaag ccctcagaac + 10621 cgtctcggaa ggaggacccc acatgttgta acttcaaagc ccaatgtcag accacgctac + 10681 ggcgtgccac tctgcggaga gtgcagtctg cgatagtgcc ccaggaggac tgggttaaca + 10741 aaggcagatc aacgccccac gcggccctag ccctggtaat ggtgttaacc agggcgaaag + 10801 gactagaggt tagaggagac cccgcggttt aaagtgcacg gcccagcctg 
actgaagctg + 10861 taggtcaggg gaaggactag aggttagtgg agaccccgtg ccacaaaaca ccacaacaaa + 10921 acagcatatt gacacctggg atagactagg agatcttctg ctctgcacaa ccagccacac + 10981 ggcacagtgc gccgacaatg gtggctggtg gtgcgagaac acaggatct +// From de1b1c5f3542949f166f43960f5ff7f4164c888b Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Wed, 16 Oct 2024 15:31:17 -0700 Subject: [PATCH 4/8] Update the global auspice config --- .../defaults/auspice_config_global.json | 31 +++++++++++++++++++ phylogenetic/defaults/config.yaml | 2 +- 2 files changed, 32 insertions(+), 1 deletion(-) create mode 100644 phylogenetic/defaults/auspice_config_global.json diff --git a/phylogenetic/defaults/auspice_config_global.json b/phylogenetic/defaults/auspice_config_global.json new file mode 100644 index 0000000..ca0eb10 --- /dev/null +++ b/phylogenetic/defaults/auspice_config_global.json @@ -0,0 +1,31 @@ +{ + "title": "Global West Nile Virus Build", + "colorings": [ + {"key": "gt", "title": "Genotype", "type": "categorical"}, + {"key": "num_date", "title": "Sampling Date", "type": "continuous"}, + {"key": "region", "title": "Region", "type": "categorical"}, + {"key": "country", "title": "Country", "type": "categorical"}, + {"key": "author", "title": "Authors", "type": "categorical"}, + {"key": "host", "title": "Host Species", "type": "categorical"} + ], + "geo_resolutions": [ + "region", + "country" + ], + "maintainers": [ + {"name": "Marcela Torres", "url": "https://github.com/NW-PaGe/WNV-nextstrain"}, + {"name": "NW-PaGe", "url": "https://github.com/NW-PaGe/WNV-nextstrain"}, + {"name": "Nextstrain Team", "url": "https://next.nextstrain.org/"} + ], + "filters": [ + "region", + "country", + "author", + "host" + ], + "display_defaults": { + "color_by": "region", + "map_triplicate": true, + "geo_resolution": "country" + } +} diff --git a/phylogenetic/defaults/config.yaml b/phylogenetic/defaults/config.yaml index 5f7bf54..742620d 100644 --- a/phylogenetic/defaults/config.yaml +++ 
b/phylogenetic/defaults/config.yaml @@ -75,4 +75,4 @@ traits: export: description: "defaults/description.md" - auspice_config: "defaults/auspice_config.json" + auspice_config: "defaults/auspice_config_global.json" From bf9550f264429335b15e88e57924155895540f84 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Wed, 16 Oct 2024 15:47:15 -0700 Subject: [PATCH 5/8] Force include key global strains that tend to be used as references in phylogenetic analyses --- phylogenetic/defaults/config.yaml | 1 + phylogenetic/defaults/include.txt | 70 +++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+) create mode 100644 phylogenetic/defaults/include.txt diff --git a/phylogenetic/defaults/config.yaml b/phylogenetic/defaults/config.yaml index 742620d..0d55f81 100644 --- a/phylogenetic/defaults/config.yaml +++ b/phylogenetic/defaults/config.yaml @@ -66,6 +66,7 @@ input_sequences: "data/sequences.fasta" # the strains selected by the subsampling logic defined above. subsampling: region: --query "is_lab_host != 'true'" --query-columns is_lab_host:str --min-length '9800' --group-by region year --subsample-max-sequences 3000 + force_include: --exclude-all --include defaults/include.txt traits: metadata_columns: [ diff --git a/phylogenetic/defaults/include.txt b/phylogenetic/defaults/include.txt new file mode 100644 index 0000000..8161a2a --- /dev/null +++ b/phylogenetic/defaults/include.txt @@ -0,0 +1,70 @@ +AF260968 # Egypt 1951 +NC_001563 # Lineage 2 reference +NC_009942 # Lineage 1 reference +HM051416 # Isreal 1953 +GQ851607 # Nigeria 1965 +GQ851606 # Senegal 1979 +AF481864 # pre-NY +MH166901 # NY99 +MH166903 # NY99 +MH166904 # NY99 +KX547395 # NY99 +KX547519 # NY99 +KX547602 # NY99 +HM488130 # NY99 +HM488132 # NY99 +HQ671707 # NY99 +AF202541 # NY99 +AF206518 # NY99 +HM488127 # NY99 +HM488126 # NY99 +KX547410 # WN02 +KJ501434 # WN02 +KX547456 # WN02 +KY216155 # WN02 +KX547460 # WN02 +MF175829 # WN02 +KX547482 # WN02 +MF175827 # WN02 +MF175839 # WN02 +KT020853 # WN02 
+KX547548 # WN02 +MF175863 # WN02 +KX547286 # WN02 +MF175873 # WN02 +MF175865 # WN02 +MF175831 # WN02 +MF175858 # WN02 +KJ501117 # SW03 +KJ501120 # SW03 +MF175815 # SW03 +MG004533 # SW03 +KF704147 # SW03 +KF704153 # SW03 +KR348940 # SW03 +KR348937 # SW03 +KX547361 # SW03 +JX015523 # SW03 +KR348944 # SW03 +KJ501124 # SW03 +KX547552 # SW03 +KJ145829 # SW03 +KR348981 # SW03 +KJ501118 # SW03 +KR348938 # SW03 +KR348976 # SW03 +KJ501170 # SW03 +KR348993 # SW03 +JQ700438 # SW03 +KR348977 # SW03 +KR348942 # SW03 +KR348941 # SW03 +KJ501121 # SW03 +KJ501122 # SW03 +KX547375 # SW03 +KM012172 # SW03 +KC333375 # SW03 +KJ501222 # SW03 +MG004537 # SW03 +MF175866 # SW03 +MG004540 # SW03 From c3395e3b5b11e4917a31d936ca302ccc37db5ecc Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Wed, 16 Oct 2024 15:49:13 -0700 Subject: [PATCH 6/8] Exclude PAT strains and outliers from phylogenetic analysis Exclude several PAT strains that did not have the is_lab_host metadata field set to True. Exclude several strains that were either putative recombinants or were below the PAT FV537222 cluster. --- phylogenetic/defaults/config.yaml | 2 +- phylogenetic/defaults/exclude.txt | 55 +++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+), 1 deletion(-) create mode 100644 phylogenetic/defaults/exclude.txt diff --git a/phylogenetic/defaults/config.yaml b/phylogenetic/defaults/config.yaml index 0d55f81..d723efb 100644 --- a/phylogenetic/defaults/config.yaml +++ b/phylogenetic/defaults/config.yaml @@ -65,7 +65,7 @@ input_sequences: "data/sequences.fasta" # This command excludes all strains by default and then forces the inclusion of # the strains selected by the subsampling logic defined above. 
subsampling: - region: --query "is_lab_host != 'true'" --query-columns is_lab_host:str --min-length '9800' --group-by region year --subsample-max-sequences 3000 + region: --query "is_lab_host != 'true'" --query-columns is_lab_host:str --min-length '9800' --group-by region year --subsample-max-sequences 3000 --exclude defaults/exclude.txt force_include: --exclude-all --include defaults/include.txt traits: diff --git a/phylogenetic/defaults/exclude.txt b/phylogenetic/defaults/exclude.txt new file mode 100644 index 0000000..49028e0 --- /dev/null +++ b/phylogenetic/defaults/exclude.txt @@ -0,0 +1,55 @@ +HW816192 # 11029 bp PAT 27-MAY-2015 +CS543188 # 11029 bp PAT 20-APR-2007 +CS568914 # 11029 bp PAT 18-MAY-2007 +CS568916 # 11029 bp PAT 18-MAY-2007 +CS568917 # 11029 bp PAT 18-MAY-2007 +CS568918 # 11029 bp PAT 18-MAY-2007 +CS568919 # 11029 bp PAT 18-MAY-2007 +FV537222 # 10962 bp PAT 18-MAR-2010 +FV537223 # 10962 bp PAT 18-MAR-2010 +FV537224 # 10962 bp PAT 18-MAR-2010 +FV537225 # 10962 bp PAT 18-MAR-2010 +LQ460608 # 8839 bp PAT 06-OCT-2016 +LQ564350 # 8839 bp PAT 06-OCT-2016 +LY683288 # 11062 bp PAT 04-DEC-2019 +MA388207 # 8839 bp PAT 30-OCT-2018 +HC467807 # 11029 bp PAT 21-APR-2010 +HH961658 # 10975 bp PAT 31-OCT-2010 +HH961659 # 11029 bp PAT 31-OCT-2010 +HV572312 # 11029 bp PAT 31-MAY-2012 +OP846974 # Suspected recombinant sequences from Mencattelli et al, 2023 https://www.nature.com/articles/s41467-023-42185-7 +OK239667 # Suspected recombinant sequences from Mencattelli et al, 2023 https://www.nature.com/articles/s41467-023-42185-7 +OM202920 # Clusters below PAT FV537222 +OM202936 # Clusters below PAT FV537222 +OM202914 # Clusters below PAT FV537222 +OM202933 # Clusters below PAT FV537222 +OM202907 # Clusters below PAT FV537222 +OK573263 # Clusters below PAT FV537222 +FV537224 # Clusters below PAT FV537222 +OK573278 # Clusters below PAT FV537222 +OM202917 # Clusters below PAT FV537222 +OM202919 # Clusters below PAT FV537222 +OM202910 # Clusters below PAT FV537222 
+OM202911 # Clusters below PAT FV537222 +OM202922 # Clusters below PAT FV537222 +OK573272 # Clusters below PAT FV537222 +OK573262 # Clusters below PAT FV537222 +OK573279 # Clusters below PAT FV537222 +OK573269 # Clusters below PAT FV537222 +OM202923 # Clusters below PAT FV537222 +OM202906 # Clusters below PAT FV537222 +OM202909 # Clusters below PAT FV537222 +OM202930 # Clusters below PAT FV537222 +OM202929 # Clusters below PAT FV537222 +OM202904 # Clusters below PAT FV537222 +OM202913 # Clusters below PAT FV537222 +OM202908 # Clusters below PAT FV537222 +OM202915 # Clusters below PAT FV537222 +OM202912 # Clusters below PAT FV537222 +OK572999 # Clusters below PAT FV537222 +OK573277 # Clusters below PAT FV537222 +FV537225 # Clusters below PAT FV537222 +OM202905 # Clusters below PAT FV537222 +OM202932 # Clusters below PAT FV537222 +FV537223 # Clusters below PAT FV537222 +FV537222 # Clusters below PAT FV537222 From 75a51e6ed7177ad3a7b9dc344c9d6d085b07773a Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Thu, 17 Oct 2024 00:45:11 -0700 Subject: [PATCH 7/8] Allow more lenient refine rules for the global refine parameters Separate washington-state specific refine parameters from a more permissive global refine parameters. 
--- phylogenetic/build-configs/washington-state/config.yaml | 3 +++ phylogenetic/defaults/config.yaml | 5 ++++- phylogenetic/rules/construct_phylogeny.smk | 9 ++------- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/phylogenetic/build-configs/washington-state/config.yaml b/phylogenetic/build-configs/washington-state/config.yaml index 0b91a50..6268f41 100644 --- a/phylogenetic/build-configs/washington-state/config.yaml +++ b/phylogenetic/build-configs/washington-state/config.yaml @@ -19,6 +19,9 @@ subsampling: country: --query "country == 'USA' and state not in ['WA', 'CA', 'ID', 'OR', 'NV','AZ','NM', 'CO', 'UT', 'WY', 'MT'] and accession != 'NC_009942'" --group-by state year --subsample-max-sequences 300 --min-length '9800' force_include: --exclude-all --include ../nextclade/defaults/include.txt +refine: + treetime_params: --coalescent opt --clock-filter-iqd 4 --date-inference marginal --date-confidence + traits: metadata_columns: [ 'country', diff --git a/phylogenetic/defaults/config.yaml b/phylogenetic/defaults/config.yaml index d723efb..a86b515 100644 --- a/phylogenetic/defaults/config.yaml +++ b/phylogenetic/defaults/config.yaml @@ -2,7 +2,7 @@ strain_id_field: "accession" # Use 'Egypt 1951' as the reference and root, following Mencattelli et al, 2023 # https://www.nature.com/articles/s41467-023-42185-7 reference: "defaults/reference_global.gb" -root: "AF260968_REF" +root: "AF260968" # Sequences must be FASTA and metadata must be TSV # Both files must be zstd compressed @@ -68,6 +68,9 @@ subsampling: region: --query "is_lab_host != 'true'" --query-columns is_lab_host:str --min-length '9800' --group-by region year --subsample-max-sequences 3000 --exclude defaults/exclude.txt force_include: --exclude-all --include defaults/include.txt +refine: + treetime_params: --coalescent opt --date-inference marginal --date-confidence + traits: metadata_columns: [ 'region', diff --git a/phylogenetic/rules/construct_phylogeny.smk 
b/phylogenetic/rules/construct_phylogeny.smk index 59fd7be..1c18cae 100644 --- a/phylogenetic/rules/construct_phylogeny.smk +++ b/phylogenetic/rules/construct_phylogeny.smk @@ -55,9 +55,7 @@ rule refine: params: metadata_id_columns = config["strain_id_field"], root = config["root"], - date_inference = "marginal", - coalescent = "opt", - clock_filter_iqd = 4, + treetime_params = config["refine"]["treetime_params"], shell: """ augur refine \ @@ -69,9 +67,6 @@ rule refine: --output-node-data {output.node_data} \ --root {params.root} \ --timetree \ - --coalescent {params.coalescent} \ - --date-confidence \ - --date-inference {params.date_inference} \ - --clock-filter-iqd {params.clock_filter_iqd} \ + {params.treetime_params} \ 2>&1 | tee {log} """ From 6b2c5dd2fd0bbaa9c6c5af7971ca30191a1e7075 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Thu, 17 Oct 2024 00:54:46 -0700 Subject: [PATCH 8/8] Add global root to example data --- phylogenetic/example_data/metadata.tsv | 3 +- phylogenetic/example_data/sequences.fasta | 139 ++++++++++++++++++++++ 2 files changed, 141 insertions(+), 1 deletion(-) diff --git a/phylogenetic/example_data/metadata.tsv b/phylogenetic/example_data/metadata.tsv index e67893b..96b6858 100644 --- a/phylogenetic/example_data/metadata.tsv +++ b/phylogenetic/example_data/metadata.tsv @@ -67,4 +67,5 @@ HM488132 2000-XX-XX North America USA Connecticut CT Culiseta melanura Armstro HQ671707 1999-XX-XX North America USA Connecticut CT Culex pipiens Henn et al. https://www.ncbi.nlm.nih.gov/nuccore/HQ671707 10607 NY99 AF202541 XXXX-XX-XX North America USA New York NY Jia et al. https://www.ncbi.nlm.nih.gov/nuccore/AF202541 10945 NY99 AF206518 XXXX-XX-XX North America USA Connecticut Greenwich-Stanford Town Line CT Culex pipiens Anderson et al. https://www.ncbi.nlm.nih.gov/nuccore/AF206518 10975 NY99 -AF481864 XXXX-XX-XX Ciconiidae Malkinson et al. 
https://www.ncbi.nlm.nih.gov/nuccore/AF481864 11029 pre-NY \ No newline at end of file +AF481864 XXXX-XX-XX Ciconiidae Malkinson et al. https://www.ncbi.nlm.nih.gov/nuccore/AF481864 11029 pre-NY +AF260968 XXXX-XX-XX Africa Egypt Bowen et al. https://www.ncbi.nlm.nih.gov/nuccore/AF260968 11029 NY99 \ No newline at end of file diff --git a/phylogenetic/example_data/sequences.fasta b/phylogenetic/example_data/sequences.fasta index 59ed8e5..6ad2c61 100644 --- a/phylogenetic/example_data/sequences.fasta +++ b/phylogenetic/example_data/sequences.fasta @@ -12443,3 +12443,142 @@ GACTAGAGGTTAGAGGAGACCCCGCGGTTTAAAGTGCACGGCCCAGCCTGACTGAAGCTG TAGGTCAGGGGAAGGACTAGAGGTTAGTGGAGACCCCGTGCCACAAAACACCACAACAAA ACAGCATATTGACACCTGGGATAGACTAGGAGATCTTCTGCTCTGCACAACCAGCCACAC GGCACAGTGCGCCGACAATGGTGGCTGGTGGTGCGAGAACACAGGATCT +>AF260968 +AGTAGTTCGCCTGTGTGAGCTGACAAACTTAGTAGTGTTTGTGAGGATTAACAACAATTAACACGGTGCGAGCTGTTTCT +TAGCACGAAGATCTCGATGTCTAAGAAACCAGGAGGGCCCGGCAAGAGCCGGGCTGTCAATATGCTAAAACGCGGAATGC +CCCGCGTGTTGTCCTTGATTGGACTGAAGAGGGCAATGTTGAGCCTGATCGACGGCAAGGGACCAATACGATTTGTGTTG +GCTCTCTTGGCGTTCTTCAGGTTCACAGCAATTGCTCCGACCCGAGCAGTGCTGGATCGATGGAGAGGTGTGAACAAACA +AACAGCGATGAAACACCTTCTGAGTTTTAAGAAGGAACTAGGGACCTTGACCAGTGCTATCAATCGGCGGAGCTCAAAAC +AAAAGAAAAGAGGAGGAAAGACCGGAATTGCAGTCATGATTGGCTTGATCGCCAGCGTGGGAGCAGTTACCCTCTCTAAC +TTCCAAGGGAAGGTGATGATGACTGTAAATGCCACTGACGTCACAGACGTCATCACGATTCCAACAGCTGCTGGAAAGAA +TCTATGCATTGTCAGAGCAATGGACGTGGGGTACATGTGTGATGATACTATCACCTATGAATGTCCAGTGCTGTCGGCTG +GTAATGATCCAGAAGACATCGACTGTTGGTGCACAAAATCAGCAGTCTACGTCAGGTATGGAAGATGCACCAAGACACGC +CACTCAAGACGTAGCCGGAGGTCACTGACAGTGCAGACACATGGAGAAAGCACTCTAGCGAACAAGAAGGGGGCTTGGAT +GGACAGCACCAAGGCTACAAGGTATTTGGTAAAAACAGAATCATGGATCTTGAGGAACCCCGGATATGCCCTGGTGGCAG +CCGTCATTGGTTGGATGCTTGGAAGCAACACCATGCAGCGAGTTGTGTTCGTTGTGCTACTGCTCTTGGTGGCTCCAGCC +TACAGCTTTAACTGCCTTGGAATGAGCAACAGAGACTTCTTAGAGGGAGTGTCTGGAGCAACATGGGTGGATTTGGTTCT +CGAAGGCGACAGCTGTGTGACCATCATGTCTAAGGACAAGCCTACCATCGATGTGAAGATGATGAATATGGAGGCCGCCA 
+ACCTGGCAGAGGTCCGCAGTTATTGCTATCTGGCCACCGTCAGCGATCTCTCCACCAAAGCTGCGTGCCCGACTATGGGA +GAAGCTCACAATGACAAACGTGCTGACCCAGCTTTTGTGTGTAAACAAGGAGTAGTGGACAGGGGTTGGGGCAACGGCTG +TGGACTATTTGGTAAAGGAAGCATTGACACATGCGCCAAATTTGCCTGTTCTACCAAGGCAACAGGAAGAACCATTCTGA +AAGAGAACATCAAGTACGAAGTGGCTATCTTTGTCCATGGACCAACCACTGTGGAGTCGCATGGAAACTACCCCACACAG +ATTGGGGCCACTCAGGCAGGGAGATTCAGCATCACTCCTGCGGCGCCTTCATACACACTAAAACTTGGAGAGTATGGAGA +GGTGACGGTGGACTGTGAACCACGATCAGGGATTGACACCAATGCATACTACGTGATGACTGTCGGAACAAAGACGTTCT +TGGTCCATCGTGAGTGGTTTATGGACCTCAACCTCCCCTGGAGCAGTGCCGGAAGCACTGTGTGGAGGAACAGAGAGACG +TTGATGGAGTTTGAGGAACCACACGCCACGAAGCAGTCTGTGATAGCATTGGGCTCACAAGAGGGAGCTCTGCATCAAGC +TTTGGCTGGAGCCATTCCTGTGGAATTTTCAAGCAACACTGTCAAGTTGACATCGGGTCATTTGAAGTGTAGAGTGAAGA +TGGAAAAATTGCAGTTGAAGGGAACAACCTACGGCGTCTGTTCAAAGGCTTTCAAGTTTCTTGGAACTCCCGCAGACACA +GGCCACGGCACTGTAGTGTTGGAATTGCAGTACACTGGCACGGATGGACCTTGCAAAGTTCCCATCTCGTCAGTGGCTTC +ATTGAACGACCTAACGCCAGTGGGCAGGTTGGTCACTGTCAACCCCTTTGTTTCAGTAGCCACGGCCAATGCCAAGGTCC +TGATTGAATTGGAACCACCCTTTGGAGACTCATACATAGTGGTGGGCAGAGGAGAACAACAGATTAATCACCATTGGCAC +AAGTCTGGAAGCAGCATTGGCAAAGCCTTCACAACCACCCTCAAAGGGGCGCAGAGATTAGCCGCCCTAGGAGATACAGC +TTGGGACTTTGGATCAGTTGGAGGGGTGTTCACCTCAGTGGGGAAGGCTGTCCATCAAGTGTTTGGTGGAGCATTCCGCT +CACTGTTCGGAGGCATGTCTTGGATAACGCAAGGATTGCTGGGGGCTCTGCTGTTGTGGATGGGCATCAATGCTCGTGAC +AGGTCCATAGCTCTCACGTTTCTCGCAGTTGGAGGGGTTTTGCTCTTTCTCTCCGTGAACGTGCACGCTGACACTGGATG +TGCCATAGACATCAGCCGGCAGGAGCTGAGATGTGGAAGTGGAGTGTTCATACACAATGATGTGGAGGCTTGGATGGACC +GGTACAAGTACTACCCTGAAACGCCACAAGGCCTAGCCAAGATCATTCAAAAAGCCCACAAAGAAGGAGTGTGCGGTCTA +CGGTCGGTTTCCAGACTGGAGCACCAAATGTGGGAAGCGGTGAAGGACGAGCTAAACACTCTTTTGAAAGAGAATGGTGT +GGACCTCAGTGTTGTGGTTGAGAAACAGGAGGGAATGTACAAGTCAGCACCTAAACGTCTCACCGCTACCACGGAAAAAT +TGGAAATAGGCTGGAAGGCCTGGGGAAAGAGCATCCTATTCGCACCAGAATTGGCCAACAACACTTTTGTGGTTGATGGT +CCGGAGACCAAGGAATGCCCAACTCAGAATCGCGCTTGGAACAGCTTGGAAGTAGAGGATTTTGGATTTGGTCTCACCAG +TACCCGGATGTTCCTGAAGGTCAGAGAGAGCAACACAACTGAATGTGACTCAAAGATCATCGGAACGGCTGTCAAGAACA 
+ACTTGGCGATCCACAGTGACCTGTCCTATTGGATTGAAAGCAGGCTTAATGATACGTGGAAGCTTGAAAGGGCGGTCCTG +GGTGAAGTTAAATCATGCACTTGGCCTGAAACGCACACTTTGTGGGGTGAAGGAATCCTCGAGAGTGACTTGATAATACC +AGTCACACTGGCGGGACCACGAAGCAACCACAATCGGAGACCTGGGTACAAGACACAAAACCAGGGCCCATGGGACGAAG +GCCGGGTAGAGATTGATTTCGATTACTGCCCAGGAACGACGGTCACCCTGAGTGAGAGCTGCGGACACCGTGGACCTGCC +ACTCGCACCACCACAGAGAGCGGAAAGCTGATAACGGACTGGTGCTGCAGGAGCTGCACCTTACCACCATTGCGCTACCA +GACGGACAGCGGTTGTTGGTATGGTATGGAGATTAGACCACAGAGGCATGATGAAAAGACCCTTGTGCAGTCACAAGTGA +ATGCTTACAACGCTGATATGATTGATCCTTTTCAGCTGGGCCTTCTGGTCGTGTTCTTGGCCACCCAGGAGGTCCTTCGC +AAGAGGTGGACAGCCAAGATCAGCATGCCAGCTATACTGATTGCTCTGCTAGTCCTGGTGTTTGGGGGCATTACTTACAC +TGACGTGTTACGCTATGTCATCTTAGTGGGAGCAGCTTTCGCAGAATCCAATTCGGGAGGAGACGTGGTACACTTGGCGC +TCATGGCGACCTTCAAGATACAACCAGTGTTTATGGTGGCATCGTTTCTCAAAGCGAGATGGACCAACCAGGAGAATATC +TTGTTGATGTTGGCGGCTGTTTTCTTTCAAATGGCTTACCATGACGCTCGCCAAATTCTGCTTTGGGAGATCCCTGATGT +GTTGAATTCATTGGCAGTAGCTTGGATGATACTGAGAGCCATAACCTTTACAACAACATCAAACGTGGTTGTTCCGCTGC +TAGCTCTGTTAACACCCGGACTGAGATGCTTGAATCTGGATGTGTACAGGATCCTGCTATTGATGGTCGGAATAGGCAGC +TTGATCAGAGAGAAGAGAAGCGCAGCTGCAAAAAAGAAAGGAGCAAGTCTGTTATGCCTGGCTCTAGCCTCAACAGGACT +TTTCAACCCTATGATCCTCGCCGCTGGACTCATTGCATGTGATCCCAACCGTAAACGAGGATGGCCCGCAACTGAAGTGA +TGACTGCTGTCGGCCTGATGTTTGCCATTGTCGGAGGGCTGGCAGAGCTTGACATTGACTCCATGGCCATTCCAATGACC +ATCGCAGGGCTCATGTTTGCTGCCTTCGTGATATCTGGGAAATCAACAGATATGTGGATCGAGAGGACGGCGGACATCTC +CTGGGAAAGTGATGCGGAAATTACAGGCTCGAGCGAGAGAGTTGATGTGCGGCTTGATGATGACGGAAATTTCCAGCTCA +TGAATGATCCAGGAGCACCTTGGAAGATATGGATGCTCAGAATGGCTTGCCTCGCGATTAGTGCGTACACCCCTTGGGCA +ATCCTGCCCTCAGTAGTTGGATTTTGGATAACTCTCCAATACACAAAGAGAGGAGGTGTGCTGTGGGACACTCCCTCACC +AAAGGAGTACAAAAAAGGGGACACGACCACTGGCGTCTACAGGATCATGACTCGTGGGCTGCTCGGCAGTTATCAAGCAG +GAGCGGGCGTGATGGTTGAAGGGGTTTTCCACACCCTTTGGCATACAACAAAAGGAGCCGCTCTGATGAGCGGGGAAGGC +CGCCTGGACCCATACTGGGGTAGTGTCAAAGAGGATCGACTTTGCTACGGAGGACCCTGGAAATTGCAGCACAAGTGGAA +TGGGCAGGATGAGGTGCAAATGATTGTGGTGGAACCTGGCAAGAACGTTAAAAACGTCCAGACGAAACCAGGGGTGTTCA 
+AAACACCTGAAGGAGAAATTGGGGCCGTGACTCTGGACTTCCCCACTGGAACATCAGGCTCACCAATAGTGGACAAAAAC +GGTGATGTGATCGGGCTCTATGGCAATGGAGTCATAATGCCCAACGGCTCATACATAAGCGCGATAGTGCAGGGTGAAAG +GATGGATGAGCCGATCCCAGCCGGATTCGAACCTGAGATGCTGAGGAAAAAACAGATCACAGTTCTGGACCTTCATCCCG +GTGCTGGTAAAACAAGGAGGATACTGCCACAGATCATCAAAGAGGCCATAAATAGAAGATTGAGAACGGCCGTGCTAGCA +CCAACTAGGGTTGTAGCCGCTGAGATGGCTGAAGCCCTGAGAGGACTGCCCATCCGGTATCAGACATCTGCAGTGCCCAG +AGAACACAATGGAAATGAGATTGTTGATGTCATGTGCCATGCCACTCTCACTCACAGGCTGATGTCTCCTCACAGGGTGC +CGAACTACAATCTTTTCGTGATGGATGAGGCTCATTTTACCGACCCAGCTAGCATTGCAGCAAGGGGTTATATTTCCACA +AAAGTCGAGCTGGGGGAGGCGGCGGCAATATTCATGACAGCTACCCCACCAGGCACTTCAGACCCATTCCCAGAGTCCAA +TTCACCTATTTCTGACTTGCAGACTGAGATCCCAGATCGGGCCTGGAACTCTGGGTACGAATGGATTACAGAATACATTG +GGAAAACGGTTTGGTTTGTGCCCAGTGTGAAAATGGGGAATGAGATTGCCCTTTGTCTACAACGTGCCGGCAAAAAAGTA +GTCCAACTGAACAGAAAGTCGTATGAGACGGAGTACCCAAAGTGCAAGAACGATGATTGGGACTTTGTTATCACAACAGA +CATATCTGAAATGGGGGCTAACTTCAAGGCGAGCAGGGTGATTGACAGCAGGAAGAGTGTGAAACCAACCATCATCACGG +AAGGAGAAGGGAGGGTGATCCTGGGAGAACCATCCGCTGTGACAGCAGCTAGTGCAGCCCAAAGACGTGGACGCATCGGT +AGGAATCCATCGCAAGTTGGTGATGAGTACTGCTATGGGGGGCACACGAATGAAGACGACTCGAACTTCGCCCATTGGAC +TGAGGCACGAATCATGCTGGACAACATCAACATGCCAAACGGACTGATCGCTCAATTCTACCAACCAGAGCGTGAAAAGG +TATACACCATGGATGGAGAATACCGACTCAGAGGAGAAGAGAGGAAAAACTTTCTGGAATTATTGAGGACTGCAGATCTG +CCAGTTTGGCTGGCTTACAAGGTGGCAGCGGCTGGAGTGTCATACCACGATCGGAGATGGTGTTTTGATGGCCCTAGGAC +AAACACAATTCTAGAAGACAACAACGAAGTGGAAGTCATTACGAAGCTTGGTGAAAGAAAGATTCTGAGGCCGCGCTGGA +TTGACGCCAGGGTGTACTCGGATCATCAGGCATTAAAGGCGTTCAAGGACTTTGCTTCGGGAAAGCGTTCTCAGATAGGG +CTCATTGAGGTTCTGGGAAAGATGCCTGAGCACTTCATGGGGAAGACATGGGAAGCACTTGACACCATGTATGTTGTGGC +CACCGCAGAGAAAGGGGGAAGAGCTCACAGAATGGCCTTGGAGGAACTGCCAGATGCTCTCCAGACAATTGCCCTGATTG +CCTTATTGAGTGTGATGACCATGGGAGTATTCTTCCTCCTCATGCAGCGGAAGGGCATTGGAAAGATAGGTTTGGGAGGC +GTTGTCCTGGGAGTCGCAACCTTCTTTTGTTGGATGGCTGAAGTTCCAGGAACGAAGATCGCCGGAATGTTGCTGCTTTC +CCTTCTCTTGATGATTGTGCTAATCCCTGAGCCAGAGAAGCAACGTTCGCAGACAGACAACCAGCTAGCCGTGTTCCTGA 
+TTTGTGTGTTGACCCTCGTGAGCGCAGTGGCAGCCAACGAAATGGGTTGGCTGGACAAGACCAAGAATGATATAAGCAGT +TTGTTTGGGCAAAGAATTGAGGCCAAGGAGAATTTCAGTATGGGAGAGTTTCTCCTGGACTTGAGACCGGCAACAGCCTG +GTCACTGTATGCTGTGACCACAGCGGTTCTCACTCCACTGCTAAAGCATCTGATCACGTCAGATTACATCAACACTTCAT +TGACCTCAATCAATGTTCAAGCAAGTGCACTATTCACACTCGCGCGAGGCTTCCCCTTTGTCGATGTTGGAGTGTCGGCT +CTCCTGCTAGCAGCCGGATGCTGGGGACAAGTCACCCTCACCGTGACGGTGACAGCGGCAACACTCCTGTTCTGCCACTA +CGCCTACATGGTTCCCGGATGGCAGGCTGAGGCAATGCGCTCAGCCCAGCGGCGGACAGCGGCTGGAATCATGAAAAACG +CTGTAGTGGATGGCATCGTGGCCACGGACGTCCCAGAATTAGAGCGCACCACACCCATCATGCAGAAGAAAGTTGGGCAA +ATCATGCTGATCTTGGTGTCTCTAGCTGCAGTAGTAGTGAACCCGTCTGTGAAGACAGTGCGAGAAGCCGGAATTCTGAT +CACGGCAGCAGCGGTGACACTCTGGGAGAATGGAGCAAGCTCTGTTTGGAATGCAACAACTGCCATCGGACTCTGCCACA +TCATGCGTGGGGGTTGGTTGTCATGCTTATCCATAACATGGACACTCATAAAGAACATGGAAAAACCAGGACTAAAAAGA +GGTGGGGCAAAGGGACGCACCTTGGGAGAGGTTTGGAAAGAAAGACTCAACCAGATGACAAAAGAAGAGTTCACTAGGTA +CCGCAAAGAGGCCATCATCGAAGTCGATCGCTCAGCAGCAAAACACGCCAGGAAAGAAGGCAATGTCACTGGAGGGCATC +CAGTCTCTAGAGGCACAGCAAAGCTGAGATGGCTGGTCGAGCGGAGGTTTCTCGAACCGGTCGGAAAAGTGATTGACCTT +GGATGTGGAAGAGGCGGTTGGTGTTACTACATGGCAACCCAAAAAAGAGTCCAAGAGGTCAGAGGGTACACAAAGGGTGG +TCCCGGACATGAAGAGCCCCAACTGGTGCAAAGTTATGGATGGAACATTGTCACCATGAAGAGCGGAGTGGATGTGTTCT +ACAGACCTTCTGAGTGCTGCGATACCCTCCTTTGTGACATCGGAGAGTCTTCATCAAGTGCTGAGGTTGAAGAGCATAGG +ACGATCCGGGTCCTTGAAATGGTTGAGGACTGGCTGCACCGAGGGCCAAAGGAATTTTGTGTGAAGGTGCTCTGCCCCTA +TATGCCAAAAGTCATAGAAAAGATGGAGCTGCTCCAGCGCCGGTATGGGGGGGGACTGGTCAGAAACCCACTCTCGCGGA +ATTCCACGCACGAGATGTATTGGGTAAGTCGAGCTTCGGGCAATGTGGTACACTCAGTGAACATGACCAGCCAGGTGCTT +CTGGGAAGAATGGAGAAAAGGACCTGGAAGGGACCCCAATACGAGGAAGATGTGAACTTGGGAAGTGGAACCAGGGCGGT +GGGAAAACCCCTACTCAACTCAGACACTAGTAAAATCAAGAACAGGATTGAACGACTCAGGCGTGAGTACAGTTCGACGT +GGCACCACGATGAGAACCACCCATATAGAACCTGGAACTATCACGGCAGTTATGATGTGAAACCTACAGGCTCCGCCAGC +TCGCTGGTCAATGGAGTGGTTAGGCTCCTCTCAAAACCATGGGACACCATCACGAACGTTACCACCATGGCCATGACTGA +CACTACTCCCTTCGGACAGCAGCGGGTGTTTAAAGAGAAGGTGGACACGAAAGCTCCTGAACCGCCAGAAGGAGTGAAGT 
+ATGTGCTCAATGAAACCACCAACTGGTTGTGGGCGTTTCTGGCCAGAGAAAAACGTCCCAGAATGTGCTCTCGAGAGGAA +TTCATAAAAAAGGTCAATAGCAATGCAGCTCTGGGTGCCATGTTTGAAGAGCAGAACCAATGGAGGAGCGCCAGAGAAGC +AGTTGAGGATCCAAAATTTTGGGAGATGGTGGATGAGGAGCGCGAGGCACACCTGCGGGGGGAATGTCACACTTGCATCT +ACAACATGATGGGGAAGAGAGAGAAGAAACCTGGAGAGTTCGGAAAGGCTAAGGGAAGCAGAGCCATATGGTTCATGTGG +CTCGGAGCTCGCTTTCTGGAGTTCGAAGCTCTGGGCTTTCTTAACGAAGACCACTGGCTTGGAAGAAAGAACTCAGGAGG +CGGGGTCGAGGGCTTGGGCCTCCAAAAACTGGGTTATATTCTGCGTGAAGTTGGCACCCGACCTGGAGGCAAGATCTATG +CTGATGACACAGCTGGCTGGGACACCCGCATTACGAGAGCTGACCTGGAAAATGAAGCTAAGGTTCTTGAGTTGCTGGAT +GGGGAACATCGGCGTCTTGCTAGGGCCATCATTGAGCTCACCTATCGTCACAAAGTTGTGAAAGTGATGCGCCCGGCTGC +TGATGGAAGAACCGTCATGGATGTCATCTCCAGAGAAGATCAGAGGGGGAGTGGACAAGTTGTCACCTACGCTCTAAACA +CCTTCACCAACCTGGCCGTCCAGTTGGTGAGGATGATGGAAGGGGAAGGAGTGATTGGCCCAGATGATGTGGAGAAACTC +ACAAAGGGAAAAGGACCTAAAGTCAGGACCTGGCTGTTTGAGAATGGGGAGGAAAGACTCAGCCGCATGGCTGTCAGCGG +AGATGACTGTGTGGTAAAGCCCCTAGATGACCGCTTCGCCACCTCTCTCCACTTCCTCAACGCCATGTCAAAGGTTCGCA +AAGATATCCAGGAGTGGAAACCGTCAACTGGATGGTATGACTGGCAGCAGGTTCCATTCTGCTCGAACCATTTCACTGAA +TTAATCATGAAAGATGGAAGAACACTGGTGGTTCCATGCCGAGGACAGGACGAACTGGTAGGCAGAGCTCGCATTTCTCC +AGGGGCCGGATGGAACGTCCGTGACACTGCTTGTCTGGCTAAGTCTTATGCCCAGATGTGGCTGCTTCTGTACTTCCACA +GAAGAGACCTGCGGCTAATGGCCAACGCCATTTGCTCCGCTGTCCCTGTGAATTGGGTCCCTACCGGAAGAACCACGTGG +TCCATCCATGCCGGAGGGGAGTGGATGACAACAGAAGACATGCTGGAGGTCTGGAACCGTGTTTGGATAGAGGAGAATGA +ATGGATGGAAGACAAAACCCCAGTGGAGAAATGGAGTGACGTCCCATACTCAGGAAAACGGGAGGACATCTGGTGTGGCA +GCTTGATTGGCACAAGAACCCGAGCCACGTGGGCAGAAAACATCCAGGTAGCCATCAACCAAGTCAGAGCAATCATTGGA +GATGAGAAGTATGTGGATTACATGAGTTCATTAAAGAGATATGAAGACACGACTTTGGTTGAGGACACAGTACTGTAAAT +ACTTTATTAATTGTAAATAGACAATGTAAGCATGTGTAAAAGTATAGTTTTATAGTAGCATTTAGTGATGTTAGTGTAAA +TAGTTAAGAAAATTTTAAGGAGGAAGTCAGGCCGGAAAGTTTCCGCCACCGGAAGTTGAGTAGACGGTGCTGCCTGCGAC +TCAACCCCAGGAGGACTGGGTGAACAAAGCTGCGAAGTGATCCATGTAAGCCCTCAGAACCGTCTCGGAAGGAGGACCCC +ACATGTTGTAACTTCAAAGCCCAATGTCAGACCACGCTACGGCGTGCCACTCTGCGGAGAGTGCAGTCTGCGATAGTGCC 
+CCAGGAGGACTGGGTTAACAAAGGCAGATCAACGCCCCACGCGGCCCTAGCCCTGGTAATGGTGTTAACCAGGGCGAAAG +GACTAGAGGTTAGAGGAGACCCCGCGGTTTAAAGTGCACGGCCCAGCCTGACTGAAGCTGTAGGTCAGGGGAAGGACTAG +AGGTTAGTGGAGACCCCGTGCCACAAAACACCACAACAAAACAGCATATTGACACCTGGGATAGACTAGGAGATCTTCTG +CTCTGCACAACCAGCCACACGGCACAGTGCGCCGACAATGGTGGCTGGTGGTGCGAGAACACAGGATCT