# genomic-surveillance.yaml
---
# Format reference: <https://docs.nextstrain.org/projects/ncov/page/reference/workflow-config-file.html>
# Define our input data, assigning a name to each dataset.
# We will reference these names in the subsampling scheme
# below, as a way to select specific strains for the analysis.
inputs:
  # Reference dataset fetched from data.nextstrain.org.
  - name: reference_data
    metadata: https://data.nextstrain.org/files/ncov/open/reference/metadata.tsv.xz
    aligned: https://data.nextstrain.org/files/ncov/open/reference/aligned.fasta.xz
  # Local Idaho dataset. This is the only input supplied as
  # "sequences" rather than "aligned".
  - name: custom_data
    metadata: data/idaho.metadata.tsv
    sequences: data/idaho.sequences.fasta
  # North America dataset fetched from data.nextstrain.org.
  - name: background_data
    metadata: https://data.nextstrain.org/files/ncov/open/north-america/metadata.tsv.xz
    aligned: https://data.nextstrain.org/files/ncov/open/north-america/aligned.fasta.xz
# GenBank data includes "Wuhan-Hu-1/2019", which we use as the
# root for this build.
refine:
  root: "Wuhan-Hu-1/2019"
# Define a single build for the state of interest, Idaho.
# The build name will be "idaho" and it will use the custom
# subsampling scheme defined below.
builds:
  idaho:
    title: "Idaho-specific genomic surveillance build"
    # Must match the name of a scheme defined under "subsampling".
    subsampling_scheme: idaho_scheme
    # Defines colorings for input data sources
    # (e.g. "background_data" is "yes" or "no").
    auspice_config: ncov-tutorial/auspice-config-custom-data.json
# Define a single subsampling scheme for the state of Idaho.
# NOTE: no collection-date limit is set in any section of the
# scheme below. If the analysis should cover a specific date
# range, add the same date limit to every section.
subsampling:
  idaho_scheme:
    # Include the Idaho data supplied through the "custom_data"
    # input. When the workflow merges metadata from multiple
    # inputs, it creates a yes/no column for each input to
    # indicate which input each record came from. A record
    # from the "custom_data" input will have a value of "yes"
    # in a column named "custom_data". The same record will
    # have a column for the "background_data" input with a
    # value of "no".
    custom_sample:
      query: --query "(custom_data == 'yes')"
      # Limit the number of Idaho records included in the
      # analysis to a reasonable but large number. Tune this
      # number along with the other "max_sequences" in the
      # sections below to keep your final build to <10,000
      # records.
      max_sequences: 50
    # To understand transmission patterns within the US that
    # led to introductions to Idaho, we select a subset of USA
    # records that did not come from the "custom_data" input,
    # with priority given to strains that are genetically
    # similar to the strains in the "custom_sample" subsampling
    # set defined above.
    # NOTE(review): the query does not filter on "division", so
    # Idaho records present in the other inputs are not
    # excluded -- confirm whether that is intended.
    usa_context:
      query: --query "(custom_data != 'yes') & (country == 'USA')"
      # This value sets a hard upper limit on how many strains
      # make it into the analysis. Tune this value based on
      # your needs for the resulting tree.
      max_sequences: 10
      # These group-by columns attempt to evenly sample across
      # US states by year and month. Sequences in each group
      # of state, year, and month are prioritized by genetic
      # proximity.
      group_by: division year month
      priorities:
        type: proximity
        focus: custom_sample
    # Select a subset of the records that did not come from
    # "custom_data" for context (the query matches every other
    # input, not only "background_data"). This example
    # prioritizes strains that are genetically related to the
    # "custom_sample" subsampling set, but you can remove the
    # "priorities" block to get a random context instead.
    global_context:
      query: --query "(custom_data != 'yes')"
      # As with the contextual data from the USA above, tune
      # this value to get a reasonable number of strains in
      # your build.
      max_sequences: 10
      priorities:
        type: proximity
        focus: custom_sample