-
Notifications
You must be signed in to change notification settings - Fork 20
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'master' into clean-metadata
- Loading branch information
Showing
9 changed files
with
8,645 additions
and
1,577 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
# GitHub Actions workflow: fetches new GISAID records and runs the ingest
# script with the --fetch flag. All credentials come from repository secrets.
name: 'Fetch & Ingest 2019-nCov/SARS-CoV-2 data from GISAID for nextstrain.org/ncov'

on:
  # Manually triggered using `./bin/trigger fetch-and-ingest`
  repository_dispatch:
    types: fetch-and-ingest

jobs:
  ingest:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v1
      - name: ingest
        # Put the user-level pip bin directory on PATH first so the freshly
        # installed `pipenv` executable is found by the later commands.
        run: |
          PATH="$HOME/.local/bin:$PATH"
          python3 -m pip install --upgrade pip setuptools
          python3 -m pip install pipenv
          pipenv sync
          pipenv run ./bin/ingest-gisaid --fetch
        env:
          AWS_DEFAULT_REGION: ${{ secrets.AWS_DEFAULT_REGION }}
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          GISAID_API_ENDPOINT: ${{ secrets.GISAID_API_ENDPOINT }}
          GISAID_USERNAME_AND_PASSWORD: ${{ secrets.GISAID_USERNAME_AND_PASSWORD }}
          SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
          SLACK_CHANNELS: ncov-gisaid-updates
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,69 +1,117 @@ | ||
#!/bin/bash | ||
# usage: ingest-gisaid [--fetch] | ||
# ingest-gisaid --help | ||
# | ||
# Ingest SARS-CoV-2 metadata and sequences from GISAID. | ||
# | ||
# If the --fetch flag is given, new records are fetched from GISAID. Otherwise, | ||
# ingest from the existing GISAID NDJSON file on S3. | ||
# | ||
set -euo pipefail | ||
|
||
: "${S3_SRC:=s3://nextstrain-ncov-private}" | ||
: "${S3_DST:=$S3_SRC}" | ||
|
||
# Determine where to save data files based on if we're running as a result of a | ||
# push to master or to another branch (or locally, outside of the GitHub | ||
# workflow). Files are always compared to the default/primary paths in the | ||
# source S3 bucket. | ||
# | ||
silent= | ||
branch= | ||
main() { | ||
local fetch=0 | ||
|
||
for arg; do | ||
case "$arg" in | ||
-h|--help) | ||
print-help | ||
exit | ||
;; | ||
--fetch) | ||
fetch=1 | ||
shift | ||
break | ||
;; | ||
esac | ||
done | ||
|
||
# Determine where to save data files based on if we're running as a result of a | ||
# push to master or to another branch (or locally, outside of the GitHub | ||
# workflow). Files are always compared to the default/primary paths in the | ||
# source S3 bucket. | ||
# | ||
local silent= | ||
local branch= | ||
|
||
case "${GITHUB_REF:-}" in | ||
refs/heads/master) | ||
# Do nothing different; defaults above are good. | ||
branch=master | ||
;; | ||
refs/heads/*) | ||
# Save data files under a per-branch prefix | ||
silent=yes | ||
branch="${GITHUB_REF##refs/heads/}" | ||
S3_DST="$S3_DST/branch/$branch" | ||
;; | ||
"") | ||
# Save data files under a tmp prefix | ||
silent=yes | ||
S3_DST="$S3_DST/tmp" | ||
;; | ||
*) | ||
echo "Skipping ingest for ref $GITHUB_REF" | ||
exit 0 | ||
;; | ||
esac | ||
|
||
echo "S3_SRC is $S3_SRC" | ||
echo "S3_DST is $S3_DST" | ||
|
||
case "${GITHUB_REF:-}" in | ||
refs/heads/master) | ||
# Do nothing different; defaults above are good. | ||
branch=master | ||
;; | ||
refs/heads/*) | ||
# Save data files under a per-branch prefix | ||
silent=yes | ||
branch="${GITHUB_REF##refs/heads/}" | ||
S3_DST="$S3_DST/branch/$branch" | ||
;; | ||
"") | ||
# Save data files under a tmp prefix | ||
silent=yes | ||
S3_DST="$S3_DST/tmp" | ||
;; | ||
*) | ||
echo "Skipping ingest for ref $GITHUB_REF" | ||
exit 0 | ||
;; | ||
esac | ||
cd "$(dirname "$0")/.." | ||
|
||
echo "S3_SRC is $S3_SRC" | ||
echo "S3_DST is $S3_DST" | ||
set -x | ||
|
||
cd "$(dirname "$0")/.." | ||
if [[ "$fetch" == 1 ]]; then | ||
./bin/fetch-from-gisaid > data/gisaid.ndjson | ||
if [[ "$branch" == master ]]; then | ||
./bin/notify-on-record-change data/gisaid.ndjson "$S3_SRC/gisaid.ndjson.gz" "GISAID" | ||
fi | ||
./bin/upload-to-s3 --quiet data/gisaid.ndjson "$S3_DST/gisaid.ndjson.gz" | ||
else | ||
aws s3 cp --no-progress "$S3_DST/gisaid.ndjson.gz" - | gunzip -cfq > data/gisaid.ndjson | ||
fi | ||
|
||
set -x | ||
./bin/transform-gisaid data/gisaid.ndjson \ | ||
--output-metadata data/gisaid/metadata.tsv \ | ||
--output-fasta data/gisaid/sequences.fasta | ||
|
||
./bin/fetch-from-gisaid > data/gisaid.ndjson | ||
if [[ "$branch" == master ]]; then | ||
./bin/notify-on-record-change data/gisaid.ndjson "$S3_SRC/gisaid.ndjson.gz" "GISAID" | ||
fi | ||
./bin/upload-to-s3 --quiet data/gisaid.ndjson "$S3_DST/gisaid.ndjson.gz" | ||
./bin/flag-metadata data/gisaid/metadata.tsv > data/gisaid/flagged_metadata.txt | ||
./bin/check-locations data/gisaid/metadata.tsv \ | ||
data/gisaid/location_hierarchy.tsv \ | ||
gisaid_epi_isl | ||
|
||
./bin/transform-gisaid data/gisaid.ndjson \ | ||
--output-metadata data/gisaid/metadata.tsv \ | ||
--output-fasta data/gisaid/sequences.fasta | ||
if [[ "$branch" == master ]]; then | ||
./bin/notify-on-metadata-change data/gisaid/metadata.tsv "$S3_SRC/metadata.tsv.gz" gisaid_epi_isl | ||
./bin/notify-on-additional-info-change data/gisaid/additional_info.tsv "$S3_SRC/additional_info.tsv.gz" | ||
./bin/notify-on-flagged-metadata-change data/gisaid/flagged_metadata.txt "$S3_SRC/flagged_metadata.txt.gz" | ||
./bin/notify-on-location-hierarchy-addition data/gisaid/location_hierarchy.tsv source-data/location_hierarchy.tsv | ||
fi | ||
|
||
./bin/flag-metadata data/gisaid/metadata.tsv > data/gisaid/flagged_metadata.txt | ||
./bin/check-locations data/gisaid/metadata.tsv \ | ||
data/gisaid/location_hierarchy.tsv \ | ||
gisaid_epi_isl | ||
./bin/upload-to-s3 ${silent:+--quiet} data/gisaid/metadata.tsv "$S3_DST/metadata.tsv.gz" | ||
./bin/upload-to-s3 ${silent:+--quiet} data/gisaid/additional_info.tsv "$S3_DST/additional_info.tsv.gz" | ||
./bin/upload-to-s3 ${silent:+--quiet} data/gisaid/flagged_metadata.txt "$S3_DST/flagged_metadata.txt.gz" | ||
./bin/upload-to-s3 ${silent:+--quiet} data/gisaid/sequences.fasta "$S3_DST/sequences.fasta.gz" | ||
} | ||
|
||
if [[ "$branch" == master ]]; then | ||
./bin/notify-on-metadata-change data/gisaid/metadata.tsv "$S3_SRC/metadata.tsv.gz" gisaid_epi_isl | ||
./bin/notify-on-additional-info-change data/gisaid/additional_info.tsv "$S3_SRC/additional_info.tsv.gz" | ||
./bin/notify-on-flagged-metadata-change data/gisaid/flagged_metadata.txt "$S3_SRC/flagged_metadata.txt.gz" | ||
./bin/notify-on-location-hierarchy-addition data/gisaid/location_hierarchy.tsv source-data/location_hierarchy.tsv | ||
fi | ||
# Print the help text for this script: the block of `#` comment lines at the
# top of the file named by $0.
#
# The shebang line is skipped. Each following comment line is printed with its
# leading `#` and at most one following space removed, so a bare `#` becomes a
# blank line and deeper indentation is preserved. Printing stops at the first
# line that is not a comment.
#
# Arguments: none
# Outputs:   help text on stdout
print-help() {
  local line
  while read -r line; do
    if [[ $line == '#!'* ]]; then
      # Shebang is not part of the help text.
      continue
    elif [[ $line == '#'* ]]; then
      line="${line#\#}"   # drop the comment marker
      line="${line# }"    # drop a single separating space, if present
      # printf rather than echo: a help line that is exactly "-n" or "-e"
      # would otherwise be consumed as an echo option and never printed.
      printf '%s\n' "$line"
    else
      # First non-comment line ends the header block.
      break
    fi
  done < "$0"
}
|
||
./bin/upload-to-s3 ${silent:+--quiet} data/gisaid/metadata.tsv "$S3_DST/metadata.tsv.gz" | ||
./bin/upload-to-s3 ${silent:+--quiet} data/gisaid/additional_info.tsv "$S3_DST/additional_info.tsv.gz" | ||
./bin/upload-to-s3 ${silent:+--quiet} data/gisaid/flagged_metadata.txt "$S3_DST/flagged_metadata.txt.gz" | ||
./bin/upload-to-s3 ${silent:+--quiet} data/gisaid/sequences.fasta "$S3_DST/sequences.fasta.gz" | ||
main "$@" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.