Skip to content

GenBank fetch and ingest #1240

GenBank fetch and ingest

GenBank fetch and ingest #1240

name: GenBank fetch and ingest

# Three ways to start this workflow: a daily schedule, a repository_dispatch
# event, or a manual run from GitHub's UI (which accepts the inputs below).
on:
  schedule:
    # Cron times are in UTC, which is 1 hour behind CET and 2 hours behind CEST
    # (i.e. the offset to Central European time depends on daylight saving time).
    #
    # Currently, we aim to trigger ingest every day at 18:07 UTC, which is 19:07 CET (as of Mar 2022).
    # Note that the actual runs might be late. As of right now, the action starts around 20 past the hour.
    # Numerous people were confused about that, including me:
    #  - https://github.community/t/scheduled-action-running-consistently-late/138025/11
    #  - https://github.com/github/docs/issues/3059
    #
    # Note, '*' is a special character in YAML, so you have to quote this string.
    #
    # Docs:
    #  - https://docs.github.com/en/actions/learn-github-actions/events-that-trigger-workflows#schedule
    #
    # Tool that deciphers this particular format of crontab string:
    #  - https://crontab.guru/
    #
    # Looks like you are about to modify this schedule? Make sure you also modify the schedule for the
    # sister GISAID job, so that we don't need to keep two schedules in our heads.
    - cron: '7 18 * * *'

  # Manually triggered using `./vendored/trigger nextstrain/ncov-ingest genbank/fetch-and-ingest`
  # (or `fetch-and-ingest`, which includes GISAID)
  repository_dispatch:
    types:
      - genbank/fetch-and-ingest
      - fetch-and-ingest

  # Manually triggered using GitHub's UI
  workflow_dispatch:
    inputs:
      trial_name:
        description: >-
          Short name for a trial run.
          If provided, files will be uploaded to s3://nextstrain-data/files/ncov/open/trial/${TRIAL_NAME}/
          and downstream ncov and forecasts-ncov workflows will not be triggered.
          WARNING: without the trial name, workflow will upload files to s3://nextstrain-data/files/ncov/open/
        required: false
      fetch_from_database:
        description: >-
          Whether the workflow should fetch from upstream database.
          If not selected (false), the workflow will start from existing data on S3.
        type: boolean
        required: false
        default: true
      image:
        description: >-
          Specific container image to use for build.
          This will override the default image (nextstrain/ncov-ingest).
        required: false
jobs:
  # Turns the trigger inputs into a single `--config key=value ...` string that
  # the fetch_and_ingest job appends to its `nextstrain build` command line.
  set_config_overrides:
    runs-on: ubuntu-latest
    steps:
      - id: config
        name: Set config overrides
        env:
          # Inputs reach the script through the environment instead of being
          # interpolated into the script text.
          TRIAL_NAME: ${{ inputs.trial_name }}
          # Any trigger other than workflow_dispatch evaluates to true; a manual
          # run uses the value of the fetch_from_database input.
          FETCH_FROM_DATABASE: ${{ github.event_name != 'workflow_dispatch' && true || inputs.fetch_from_database }}
        run: |
          # The resulting string is expanded unquoted as $CONFIG_OVERRIDES by the
          # fetch_and_ingest job, so whitespace inside the trial name would be
          # split into separate, bogus arguments. Fail early instead.
          if [[ "$TRIAL_NAME" =~ [[:space:]] ]]; then
            echo "::error::trial_name must not contain whitespace (got '$TRIAL_NAME')"
            exit 1
          fi

          config="--config"

          if [[ "$FETCH_FROM_DATABASE" == true ]]; then
            config+=" fetch_from_database=True"
          else
            config+=" fetch_from_database=False"
          fi

          # A trial run uploads to its own S3 prefix and does not trigger the
          # downstream rebuild/counts; a regular run triggers both.
          if [[ "$TRIAL_NAME" ]]; then
            config+=" trigger_rebuild=False"
            config+=" trigger_counts=False"
            config+=" s3_dst=s3://nextstrain-data/files/ncov/open/trial/${TRIAL_NAME}"
          else
            config+=" trigger_rebuild=True"
            config+=" trigger_counts=True"
          fi

          echo "config=$config" >> "$GITHUB_OUTPUT"
    outputs:
      config_overrides: ${{ steps.config.outputs.config }}

  # Runs the ingest through the shared pathogen-repo-build reusable workflow,
  # using the overrides computed by set_config_overrides.
  fetch_and_ingest:
    needs: [set_config_overrides]
    permissions:
      # NOTE(review): presumably required by the reusable workflow to obtain
      # OIDC-based credentials — confirm against pathogen-repo-build.yaml.
      id-token: write
    uses: nextstrain/.github/.github/workflows/pathogen-repo-build.yaml@master
    secrets: inherit
    with:
      runtime: aws-batch
      # `env` is handed to the reusable workflow as one multi-line string.
      # NEXTSTRAIN_DOCKER_IMAGE falls back to 'nextstrain/ncov-ingest' when no
      # image input is given. SLACK_CHANNELS uses vars.TEST_SLACK_CHANNEL for
      # trial runs and 'ncov-genbank-updates' otherwise.
      # NOTE(review): because of how `a && b || c` evaluates, a trial run with an
      # empty/unset TEST_SLACK_CHANNEL also falls back to 'ncov-genbank-updates'.
      env: |
        CONFIG_OVERRIDES: ${{ needs.set_config_overrides.outputs.config_overrides }}
        GITHUB_RUN_ID: ${{ github.run_id }}
        NEXTSTRAIN_DOCKER_IMAGE: ${{ inputs.image || 'nextstrain/ncov-ingest' }}
        SLACK_CHANNELS: ${{ inputs.trial_name && vars.TEST_SLACK_CHANNEL || 'ncov-genbank-updates' }}
      # $CONFIG_OVERRIDES is intentionally left unquoted so that it expands into
      # separate `--config key=value` arguments.
      # NOTE(review): `--detach` presumably means the job keeps running on AWS
      # Batch after this step returns — confirm in the Nextstrain CLI docs.
      run: |
        nextstrain build \
          --aws-batch \
          --detach \
          --no-download \
          --cpus 36 \
          --memory 68GiB \
          --env GITHUB_RUN_ID \
          --env SLACK_TOKEN \
          --env SLACK_CHANNELS \
          --env PAT_GITHUB_DISPATCH="$GH_TOKEN_NEXTSTRAIN_BOT_WORKFLOW_DISPATCH" \
          . \
            --stats snakemake_stats.json \
            --configfile config/genbank.yaml \
            $CONFIG_OVERRIDES