Skip to content

Commit

Permalink
Merge pull request #1300 from izcram/feature/cleanupTarBall
Browse files Browse the repository at this point in the history
Feature/cleanup tar ball
  • Loading branch information
zfisch authored Apr 14, 2021
2 parents fd3ad52 + 0453274 commit 8234892
Show file tree
Hide file tree
Showing 7 changed files with 273 additions and 40 deletions.
2 changes: 1 addition & 1 deletion pipeline/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ prune-release-notes-output: ## postprocessing: removes some files, s.t. luigi ta
include-release-notes: prune-release-notes-output run-pipeline ## postprocessing: include updated release notes into archive

cleanup-failed: ## postprocessing: cleaning up some dangling FAILED files, which may be confusing
find $(OUT_DIR) -name 'FAILED*.tsv' -exec rm -i {} \;
find $(OUT_DIR) -name 'FAILED*' -exec rm -i {} \;

.ONESHELL:
tag-release: ## postprocessing: tags and pushes git commit used for the data release
Expand Down
1 change: 1 addition & 0 deletions pipeline/field_metadata.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -378,3 +378,4 @@ lowMESFlag Internal #SKIP "Intermediate position, sequence, or value used in In
varConsequences Nomenclature #COPY "Variant consequences, pulled from Ensembl's Variant Effect Predictor" missense_variant https://useast.ensembl.org/info/genome/variation/prediction/predicted_data.html#consequence_type_table
VR_ID Nomenclature #COPY The digest representing the variant (in GRCh38 genomic coordinates) by the GA4GH Variant Representation Standard ga4gh:VA.kVvzAwHx3AlCzQ8u6b7J9bLNuxKcSnUr
BayesDel_nsfp33a_noAF Evidence bayesdel_noaf "BayesDel score" 0.55
change_type Internal #SKIP "type of change with respect to last release" "changed_information"
6 changes: 2 additions & 4 deletions pipeline/top_level_readme.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
(Readme last updated March 05, 2021)
(Readme last updated April 06, 2021)

BRCA Exchange download data

Expand All @@ -20,8 +20,6 @@ output/
and translated into GRCh38 coordinates as needed.
output/1000G.sorted.hg38.vcf
Variants selected from 1000 Genomes, with genotype data.
output/1000G.sorted.hg38.vcffor_pipeline
Variants from 1000 Genomes, without genotype data
output/ClinVar.vcf
Variants from ClinVar at NCBI.
output/bic_brca12.sorted.hg38.vcf
Expand All @@ -43,7 +41,7 @@ output/release/
output/release/artifacts/
Subdirectory with miscellaneous intermediate results from the data aggregation pipeline
output/release/built_final.tsv
Tab-delimited file with the variants shared on BRCA Exchange, and their attribute data
Final and complete output of pipeline. Currently just a symlink to output/release/built_with_change_types.tsv
output/release/built_with_change_types.tsv
Tab-delimited file of the variants shared on BRCA Exchange, their attributes, and their update status since the
last release.
Expand Down
48 changes: 42 additions & 6 deletions pipeline/utilities/generateMD5Sums.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,64 @@
#!/usr/bin/env python
import os
import argparse
import hashlib
import logging
import os
import sys
from pathlib import Path
from typing import Set


def read_file_list(path: Path) -> Set[Path]:
    """Read a list of file paths, one per line, into a set.

    Surrounding whitespace is stripped from every line. Blank lines are
    skipped: ``Path("")`` collapses to ``Path(".")``, so an empty line would
    otherwise show up as a bogus ``.`` entry in the returned set.

    :param path: text file containing one path per line
    :return: the set of paths listed in the file
    """
    with open(path, "r") as f:
        stripped_lines = (line.strip() for line in f)
        return {Path(entry) for entry in stripped_lines if entry}


def _md5_of_file(path: Path, chunk_size: int = 1 << 20) -> str:
    """Return the hex md5 digest of the file at *path*, read in fixed-size chunks."""
    digest = hashlib.md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


def main():
    """Write md5 checksums for every keep-listed file below --inputDir.

    Walks --inputDir recursively. Files on the keep list are hashed and written
    to --outputFile as ``<md5>  <path relative to --inputDir>``. Files on the
    discard list are logged and left out. A file on neither list aborts the
    run, so that a newly created file cannot be forgotten silently. Any file
    whose name equals the base name of --outputFile is skipped.

    The output file is only written once the whole tree has been validated, so
    an aborted run does not leave a partial checksum file behind.

    Exits via ``sys.exit`` with a message if the two lists overlap or if an
    unexpected file is found.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--inputDir", required=True,
                        help="Input directory for generating md5sums")
    parser.add_argument("-o", "--outputFile", required=True,
                        help="Output file for md5sums")
    parser.add_argument("-f", "--keepListFilePath", required=True,
                        help="only consider files whose paths are given in this file. "
                             "Paths are expected to be relative to --inputDir")
    parser.add_argument("-d", "--discardListFilePath", required=True,
                        help="paths in this file are ignored, i.e. not included in final tarball. "
                             "Paths are expected to be relative to --inputDir")

    args = parser.parse_args()

    logging.getLogger().setLevel(logging.INFO)

    keep_list = read_file_list(args.keepListFilePath)
    discard_list = read_file_list(args.discardListFilePath)

    lists_intersec = keep_list.intersection(discard_list)
    if lists_intersec:
        overlapping = ', '.join(sorted(str(p) for p in lists_intersec))
        sys.exit(f"Keep list and discard list are not disjoint! Got {overlapping}")

    input_dir = Path(args.inputDir)
    output_file_name = args.outputFile
    output_base_name = Path(output_file_name).name

    # Recurses through a directory and its subdirectories and generates md5 hashes for each file in the keep list
    checksum_lines = []
    for subdir, _, files in os.walk(input_dir):
        subdir_path = Path(subdir).relative_to(input_dir)

        for file in files:
            # Don't hash the output file
            if file == output_base_name:
                continue

            filepath = subdir_path / file
            if filepath in keep_list:
                md5hash = _md5_of_file(Path(subdir) / file)
                # 2 whitespaces in order to be compatible with GNU md5sum
                checksum_lines.append(f"{md5hash}  {filepath}\n")
                continue

            logging.info(f"Won't include file {filepath} in tarball")

            if filepath not in discard_list:
                # If a file is neither in the keep list nor the discard list, fail, as it is an unexpected file.
                # This way we are making sure we don't forget to include a newly created file into the tarball
                sys.exit(f"Found file {filepath} neither in keep list nor discard list.")

    # All hashes are written to the output file specified with the -o flag.
    with open(output_file_name, 'w') as f_out:
        f_out.writelines(checksum_lines)

if __name__ == "__main__":
main()
67 changes: 38 additions & 29 deletions pipeline/workflow/CompileVCFFiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import subprocess
import tarfile
import tempfile
from pathlib import Path
from shutil import copy

import luigi
Expand Down Expand Up @@ -845,17 +846,6 @@ def run(self):


@requires(bayesdel_processing.AddBayesdelScores)
class LinkBuiltFinal(DefaultPipelineTask):
def output(self):
return luigi.LocalTarget(os.path.join(self.release_dir, "built_final.tsv"))

def run(self):
# create relative symlink to have a permanent pointer to what currently the final output of the pipeline is
relativ_input = os.path.relpath(self.input().path, os.path.dirname(self.output().path))
os.symlink(relativ_input, self.output().path)


@requires(LinkBuiltFinal)
class FindMissingReports(DefaultPipelineTask):
def output(self):
return luigi.LocalTarget(os.path.join(self.artifacts_dir, "missing_reports.log"))
Expand All @@ -872,7 +862,7 @@ def run(self):
pipeline_utils.check_file_for_contents(self.output().path)


@requires(LinkBuiltFinal)
@requires(bayesdel_processing.AddBayesdelScores)
class RunDiffAndAppendChangeTypesToOutput(DefaultPipelineTask):
def _extract_release_date(self, version_json):
with open(version_json, 'r') as f:
Expand Down Expand Up @@ -981,6 +971,18 @@ def run(self):
os.path.join(self.release_dir, "reports_with_change_types.tsv"))


@requires(RunDiffAndAppendChangeTypesToOutput)
class LinkBuiltFinal(DefaultPipelineTask):
    """Provide built_final.tsv as a symlink to the 'built_with_change_types' input."""

    def output(self):
        """Return the target for built_final.tsv inside the release directory."""
        link_path = os.path.join(self.release_dir, "built_final.tsv")
        return luigi.LocalTarget(link_path)

    def run(self):
        """Create the symlink, expressed relative to the directory holding the link."""
        link_path = self.output().path
        target_path = self.input()['built_with_change_types'].path
        # relative symlink: a permanent pointer to whatever currently is the
        # final output of the pipeline
        link_dir = os.path.dirname(link_path)
        os.symlink(os.path.relpath(target_path, link_dir), link_path)


@requires(LinkBuiltFinal)
class GenerateVariantsOutputFile(DefaultPipelineTask):
VAR_OUTPUT_FILE_KEY = 'var_output_file'
Expand Down Expand Up @@ -1064,31 +1066,38 @@ def output(self):
def run(self):
    """Run generateMD5Sums.py over the pipeline output directory.

    The keep and discard lists passed to the script are read from the
    directory containing this workflow module. After the script has run, the
    resulting md5sum file is checked via check_file_for_contents.
    """
    # the script is invoked by its bare file name, hence the directory change
    os.chdir(utilities_method_dir)

    workflow_dir = os.path.dirname(os.path.realpath(__file__))
    keep_list_file_path = os.path.join(workflow_dir, "tarball_files_keep_list.txt")
    discard_list_file_path = os.path.join(workflow_dir, "tarball_files_discard_list.txt")

    args = ["python", "generateMD5Sums.py",
            "-i", self.cfg.output_dir,
            "-o", self.output().path,
            "--keepListFilePath", keep_list_file_path,
            "--discardListFilePath", discard_list_file_path]

    pipeline_utils.run_process(args)
    pipeline_utils.check_file_for_contents(self.output().path)


@requires(GenerateMD5Sums)
class GenerateReleaseArchive(DefaultPipelineTask):
    """Pack the files listed in the md5sum file, plus that file itself, into the release tarball."""

    def output(self):
        """Return the target for the archive, placed in the parent of the output directory."""
        # Format archive filename as release-mm-dd-yy.tar.gz
        # NOTE(review): "%x" is locale dependent; mm-dd-yy presumably relies on the default C locale
        archive_name = f'release-{self.cfg.date.strftime("%x").replace("/", "-")}.tar.gz'
        # hand luigi a plain string, as the string-concatenating implementation did before
        return luigi.LocalTarget(str(Path(self.cfg.output_dir).parent / archive_name))

    def run(self):
        """Create the gzipped tarball from the entries of the md5sum file.

        Every archive member is stored below the output directory's own name.
        """
        md5sum_path = Path(self.input().path)

        # parse md5sum list: each line is "<md5 hash>  <path relative to output dir>"
        file_list = []
        with open(md5sum_path) as f:
            for line in f:
                # split at the first whitespace run only, so paths containing spaces stay intact
                fields = line.strip().split(maxsplit=1)
                if not fields:
                    continue  # skip blank lines
                file_list.append(fields[-1])

        # include md5sum in tar as well
        file_list.append(md5sum_path.name)

        output_dir = Path(self.cfg.output_dir)
        with tarfile.open(self.output().path, "w:gz") as tar:
            for file in file_list:
                tar.add(str(output_dir / file), arcname=str(Path(output_dir.name) / file))
138 changes: 138 additions & 0 deletions pipeline/workflow/tarball_files_discard_list.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
./1000G.sorted.hg38.vcffor_pipeline
./release/artifacts/1000_Genomesready.vcf
./release/artifacts/1000_Genomes.vcf
./release/artifacts/aggregated.tsv
./release/artifacts/bayesdel.vcf
./release/artifacts/BICready.vcf
./release/artifacts/BIC.vcf
./release/artifacts/built.tsv
./release/artifacts/built_with_bayesdel.tsv
./release/artifacts/built_with_ca_ids.tsv
./release/artifacts/built_with_mupit.tsv
./release/artifacts/built_with_priors_clean.tsv
./release/artifacts/built_with_priors.tsv
./release/artifacts/built_with_vr_ids.tsv
./release/artifacts/ClinVarready.vcf
./release/artifacts/ClinVar.vcf
./release/artifacts/enigma_from_clinvar.tsv
./release/artifacts/ESPready.vcf
./release/artifacts/ESP.vcf
./release/artifacts/ExACready.vcf
./release/artifacts/ExAC.vcf
./release/artifacts/exLOVDready.vcf
./release/artifacts/exLOVD.vcf
./release/artifacts/Findlay_BRCA1_Ring_Function_Scoresready.vcf
./release/artifacts/Findlay_BRCA1_Ring_Function_Scores.vcf
./release/artifacts/GnomADready.vcf
./release/artifacts/GnomAD.vcf
./release/artifacts/LOVDready.vcf
./release/artifacts/LOVD.vcf
./release/artifacts/releaseDiff.log
./release/artifacts/right1000_Genomes
./release/artifacts/rightBIC
./release/artifacts/rightClinVar
./release/artifacts/rightESP
./release/artifacts/rightExAC
./release/artifacts/rightexLOVD
./release/artifacts/rightFindlay_BRCA1_Ring_Function_Scores
./release/artifacts/rightGnomAD
./release/artifacts/rightLOVD
./release/artifacts/victor_wdir/input.vcf.gz
./release/artifacts/victor_wdir/input.vcf.gz.tbi
./release/artifacts/victor_wdir/output.10.qc.vcf.gz
./release/artifacts/victor_wdir/output.10.qc.vcf.gz.tbi
./release/artifacts/victor_wdir/output.10.vcf.gz
./release/artifacts/victor_wdir/output.10.vcf.gz.tbi
./release/artifacts/victor_wdir/output.11.qc.vcf.gz
./release/artifacts/victor_wdir/output.11.qc.vcf.gz.tbi
./release/artifacts/victor_wdir/output.11.vcf.gz
./release/artifacts/victor_wdir/output.11.vcf.gz.tbi
./release/artifacts/victor_wdir/output.12.qc.vcf.gz
./release/artifacts/victor_wdir/output.12.qc.vcf.gz.tbi
./release/artifacts/victor_wdir/output.12.vcf.gz
./release/artifacts/victor_wdir/output.12.vcf.gz.tbi
./release/artifacts/victor_wdir/output.13.qc.vcf.gz
./release/artifacts/victor_wdir/output.13.qc.vcf.gz.tbi
./release/artifacts/victor_wdir/output.13.vcf.gz
./release/artifacts/victor_wdir/output.13.vcf.gz.tbi
./release/artifacts/victor_wdir/output.14.qc.vcf.gz
./release/artifacts/victor_wdir/output.14.qc.vcf.gz.tbi
./release/artifacts/victor_wdir/output.14.vcf.gz
./release/artifacts/victor_wdir/output.14.vcf.gz.tbi
./release/artifacts/victor_wdir/output.15.qc.vcf.gz
./release/artifacts/victor_wdir/output.15.qc.vcf.gz.tbi
./release/artifacts/victor_wdir/output.15.vcf.gz
./release/artifacts/victor_wdir/output.15.vcf.gz.tbi
./release/artifacts/victor_wdir/output.16.qc.vcf.gz
./release/artifacts/victor_wdir/output.16.qc.vcf.gz.tbi
./release/artifacts/victor_wdir/output.16.vcf.gz
./release/artifacts/victor_wdir/output.16.vcf.gz.tbi
./release/artifacts/victor_wdir/output.17.qc.vcf.gz
./release/artifacts/victor_wdir/output.17.qc.vcf.gz.tbi
./release/artifacts/victor_wdir/output.17.vcf.gz
./release/artifacts/victor_wdir/output.17.vcf.gz.tbi
./release/artifacts/victor_wdir/output.18.qc.vcf.gz
./release/artifacts/victor_wdir/output.18.qc.vcf.gz.tbi
./release/artifacts/victor_wdir/output.18.vcf.gz
./release/artifacts/victor_wdir/output.18.vcf.gz.tbi
./release/artifacts/victor_wdir/output.19.qc.vcf.gz
./release/artifacts/victor_wdir/output.19.qc.vcf.gz.tbi
./release/artifacts/victor_wdir/output.19.vcf.gz
./release/artifacts/victor_wdir/output.19.vcf.gz.tbi
./release/artifacts/victor_wdir/output.1.qc.vcf.gz
./release/artifacts/victor_wdir/output.1.qc.vcf.gz.tbi
./release/artifacts/victor_wdir/output.1.vcf.gz
./release/artifacts/victor_wdir/output.1.vcf.gz.tbi
./release/artifacts/victor_wdir/output.20.qc.vcf.gz
./release/artifacts/victor_wdir/output.20.qc.vcf.gz.tbi
./release/artifacts/victor_wdir/output.20.vcf.gz
./release/artifacts/victor_wdir/output.20.vcf.gz.tbi
./release/artifacts/victor_wdir/output.21.qc.vcf.gz
./release/artifacts/victor_wdir/output.21.qc.vcf.gz.tbi
./release/artifacts/victor_wdir/output.21.vcf.gz
./release/artifacts/victor_wdir/output.21.vcf.gz.tbi
./release/artifacts/victor_wdir/output.22.qc.vcf.gz
./release/artifacts/victor_wdir/output.22.qc.vcf.gz.tbi
./release/artifacts/victor_wdir/output.22.vcf.gz
./release/artifacts/victor_wdir/output.22.vcf.gz.tbi
./release/artifacts/victor_wdir/output.2.qc.vcf.gz
./release/artifacts/victor_wdir/output.2.qc.vcf.gz.tbi
./release/artifacts/victor_wdir/output.2.vcf.gz
./release/artifacts/victor_wdir/output.2.vcf.gz.tbi
./release/artifacts/victor_wdir/output.3.qc.vcf.gz
./release/artifacts/victor_wdir/output.3.qc.vcf.gz.tbi
./release/artifacts/victor_wdir/output.3.vcf.gz
./release/artifacts/victor_wdir/output.3.vcf.gz.tbi
./release/artifacts/victor_wdir/output.4.qc.vcf.gz
./release/artifacts/victor_wdir/output.4.qc.vcf.gz.tbi
./release/artifacts/victor_wdir/output.4.vcf.gz
./release/artifacts/victor_wdir/output.4.vcf.gz.tbi
./release/artifacts/victor_wdir/output.5.qc.vcf.gz
./release/artifacts/victor_wdir/output.5.qc.vcf.gz.tbi
./release/artifacts/victor_wdir/output.5.vcf.gz
./release/artifacts/victor_wdir/output.5.vcf.gz.tbi
./release/artifacts/victor_wdir/output.6.qc.vcf.gz
./release/artifacts/victor_wdir/output.6.qc.vcf.gz.tbi
./release/artifacts/victor_wdir/output.6.vcf.gz
./release/artifacts/victor_wdir/output.6.vcf.gz.tbi
./release/artifacts/victor_wdir/output.7.qc.vcf.gz
./release/artifacts/victor_wdir/output.7.qc.vcf.gz.tbi
./release/artifacts/victor_wdir/output.7.vcf.gz
./release/artifacts/victor_wdir/output.7.vcf.gz.tbi
./release/artifacts/victor_wdir/output.8.qc.vcf.gz
./release/artifacts/victor_wdir/output.8.qc.vcf.gz.tbi
./release/artifacts/victor_wdir/output.8.vcf.gz
./release/artifacts/victor_wdir/output.8.vcf.gz.tbi
./release/artifacts/victor_wdir/output.9.qc.vcf.gz
./release/artifacts/victor_wdir/output.9.qc.vcf.gz.tbi
./release/artifacts/victor_wdir/output.9.vcf.gz
./release/artifacts/victor_wdir/output.9.vcf.gz.tbi
./release/artifacts/victor_wdir/output.for_PROV
./release/artifacts/victor_wdir/output.vqslod
./release/artifacts/victor_wdir/output.X.qc.vcf.gz
./release/artifacts/victor_wdir/output.X.qc.vcf.gz.tbi
./release/artifacts/victor_wdir/output.X.vcf.gz
./release/artifacts/victor_wdir/output.X.vcf.gz.tbi
./release/artifacts/victor_wdir/slurm.annotate.run_1.start
./release/artifacts/victor_wdir/slurm.annotate.run_1.stop
./release/artifacts/victor_wdir/slurm.annotate.run_1.version
Loading

0 comments on commit 8234892

Please sign in to comment.