# genomic-surveillance.yaml

# Format reference: <https://docs.nextstrain.org/projects/ncov/page/reference/workflow-config-file.html>

# Define our input data, assigning a name to each dataset.
# The subsampling scheme below references these names in its
# queries as a way to select specific strains for the analysis.
inputs:
  # Reference records, fetched from data.nextstrain.org as
  # xz-compressed metadata and already-aligned sequences.
  - name: reference_data
    metadata: https://data.nextstrain.org/files/ncov/open/reference/metadata.tsv.xz
    aligned: https://data.nextstrain.org/files/ncov/open/reference/aligned.fasta.xz
  # Local Idaho records read from the "data/" directory. This is
  # the only input that provides "sequences" instead of "aligned".
  # NOTE(review): presumably these sequences are unaligned and the
  # workflow aligns them itself -- confirm against the workflow docs.
  - name: custom_data
    metadata: data/idaho.metadata.tsv
    sequences: data/idaho.sequences.fasta
  # North America records, fetched from data.nextstrain.org as
  # xz-compressed metadata and already-aligned sequences.
  - name: background_data
    metadata: https://data.nextstrain.org/files/ncov/open/north-america/metadata.tsv.xz
    aligned: https://data.nextstrain.org/files/ncov/open/north-america/aligned.fasta.xz

# GenBank data includes "Wuhan-Hu-1/2019", which we use as the
# root for this build.
# NOTE(review): presumably this strain is supplied by one of the
# remote inputs defined in this file -- confirm it is present
# before relying on this root.
refine:
  root: "Wuhan-Hu-1/2019"

# Define a single build for the state of interest, Idaho.
# The build is named "idaho" and uses the custom subsampling
# scheme ("idaho_scheme") defined in the "subsampling" section
# of this file.
builds:
  idaho:
    title: "Idaho-specific genomic surveillance build"
    subsampling_scheme: idaho_scheme
    # Auspice config for this build. It defines colorings for
    # input data sources (e.g. "background_data" is "yes" or "no").
    auspice_config: ncov-tutorial/auspice-config-custom-data.json

# Define a single subsampling scheme for the state of Idaho.
# NOTE(review): no date filter is set in any section of the
# scheme below. If this analysis is meant to cover a specific
# date range, add the same maximum collection date to every
# section -- confirm the exact key against the workflow docs.
subsampling:
  idaho_scheme:
    # Select the Idaho data, i.e. records from the "custom_data"
    # input. When the workflow merges metadata from multiple
    # inputs, it creates a boolean column for each input to
    # indicate which input each record came from. A record
    # from the "custom_data" input will have a value of "yes"
    # in a column named "custom_data". The same record will
    # have a column for the "background_data" input with a
    # value of "no".
    custom_sample:
      query: --query "(custom_data == 'yes')"
      # Cap the number of Idaho records included in the
      # analysis. Tune this number along with the other
      # "max_sequences" values in the sections below to keep
      # your final build to <10,000 records.
      max_sequences: 50
    # To understand transmission patterns within the US that
    # led to introductions to Idaho, we select a subset of USA
    # records that did not come from the "custom_data" input,
    # with priority given to strains that are genetically
    # similar to the strains in the "custom_sample" subsampling
    # set defined above.
    usa_context:
      query: --query "(custom_data != 'yes') & (country == 'USA')"
      # This value sets a hard upper limit on how many strains
      # make it into the analysis. Tune this value based on
      # your needs for the resulting tree.
      max_sequences: 10
      # These group-by columns attempt to evenly sample across
      # US states by year and month. Sequences in each group
      # of state, year, and month are prioritized by genetic
      # proximity.
      group_by: division year month
      # Prioritize by genetic proximity to the "custom_sample"
      # set defined above.
      priorities:
        type: proximity
        focus: custom_sample
    # Select additional context from every record that did not
    # come from the "custom_data" input; unlike "usa_context",
    # this query has no country filter. This example prioritizes
    # strains that are genetically related to the
    # "custom_sample" subsampling set, but you can remove the
    # "priorities" block to get a random context instead.
    global_context:
      query: --query "(custom_data != 'yes')"
      # As with the contextual data from the USA above, tune
      # this value to get a reasonable number of strains in
      # your build.
      max_sequences: 10
      priorities:
        type: proximity
        focus: custom_sample