diff --git a/pipeline/Makefile b/pipeline/Makefile index 87dfb5b20..e9b9e88f0 100755 --- a/pipeline/Makefile +++ b/pipeline/Makefile @@ -152,7 +152,7 @@ prune-release-notes-output: ## postprocessing: removes some files, s.t. luigi ta include-release-notes: prune-release-notes-output run-pipeline ## postprocessing: include updated release notes into archive cleanup-failed: ## postprocessing: cleaning up some dangling FAILED files, which may be confusing - find $(OUT_DIR) -name 'FAILED*.tsv' -exec rm -i {} \; + find $(OUT_DIR) -name 'FAILED*' -exec rm -i {} \; .ONESHELL: tag-release: ## postprocessing: tags and pushes git commit used for the data release diff --git a/pipeline/field_metadata.tsv b/pipeline/field_metadata.tsv index be7416968..02b349f88 100644 --- a/pipeline/field_metadata.tsv +++ b/pipeline/field_metadata.tsv @@ -378,3 +378,4 @@ lowMESFlag Internal #SKIP "Intermediate position, sequence, or value used in In varConsequences Nomenclature #COPY "Variant consequences, pulled from Ensembl's Variant Effect Predictor" missense_variant https://useast.ensembl.org/info/genome/variation/prediction/predicted_data.html#consequence_type_table VR_ID Nomenclature #COPY The digest representing the variant (in GRCh38 genomic coordinates) by the GA4GH Variant Representation Standard ga4gh:VA.kVvzAwHx3AlCzQ8u6b7J9bLNuxKcSnUr BayesDel_nsfp33a_noAF Evidence bayesdel_noaf "BayesDel score" 0.55 +change_type Internal #SKIP "type of change with respect to last release" "changed_information" diff --git a/pipeline/top_level_readme.txt b/pipeline/top_level_readme.txt index cbc62dafa..1a4146dcd 100644 --- a/pipeline/top_level_readme.txt +++ b/pipeline/top_level_readme.txt @@ -1,4 +1,4 @@ -(Readme last updated March 05, 2021) +(Readme last updated April 06, 2021) BRCA Exchange download data @@ -20,8 +20,6 @@ output/ and translated into GRCh38 coordinates as needed. output/1000G.sorted.hg38.vcf Variants selected from 1000 Genomes, with genotype data. -output/1000G.sorted.hg38.vcffor_pipeline - Variants from 1000 Genomes, without genotype data output/ClinVar.vcf Variants from ClinVar at NCBI. output/bic_brca12.sorted.hg38.vcf @@ -43,7 +41,7 @@ output/release/ output/release/artifacts/ Subdirectory with miscellaneous intermediate results from the data aggregation pipeline output/release/built_final.tsv - Tab-delimited file with the variants shared on BRCA Exchange, and their attribute data + Final and complete output of pipeline. Currently just a symlink to output/release/built_with_change_types.tsv output/release/built_with_change_types.tsv Tab-delimited file of the variants shared on BRCA Exchange, their attributes, and their update status since the last release. diff --git a/pipeline/utilities/generateMD5Sums.py b/pipeline/utilities/generateMD5Sums.py index 400eda4fa..d4c58a983 100644 --- a/pipeline/utilities/generateMD5Sums.py +++ b/pipeline/utilities/generateMD5Sums.py @@ -1,28 +1,64 @@ #!/usr/bin/env python -import os import argparse import hashlib +import logging +import os +import sys +from pathlib import Path +from typing import Set + + +def read_file_list(path: Path) -> Set[Path]: + with open(path, "r") as f: + return {Path(line.strip()) for line in f.readlines()} def main(): parser = argparse.ArgumentParser() parser.add_argument("-i", "--inputDir", help="Input directory for generating md5sums") parser.add_argument("-o", "--outputFile", help="Output file for md5sums") + parser.add_argument("-f", "--keepListFilePath", help="only consider files whose paths are given in this file." + "Paths are expected to be relative to --inputDir") + parser.add_argument("-d", "--discardListFilePath", + help="paths in this file are ignored, i.e. not included in final tarball. Paths are expected to be relative to --inputDir") args = parser.parse_args() + logging.getLogger().setLevel(logging.INFO) + + keep_list = read_file_list(args.keepListFilePath) + discard_list = read_file_list(args.discardListFilePath) + + lists_intersec = keep_list.intersection(discard_list) + if lists_intersec: + sys.exit(f"Keep list and discard list are not disjoint! Got {', '.join([str(p) for p in lists_intersec])}") + + input_dir = Path(args.inputDir) output_file_name = args.outputFile - # Recurses through a directory and it's subdirectories and generates md5 hashes for each file + + # Recurses through a directory and it's subdirectories and generates md5 hashes for each file in keeplist # All hashes are appended to an output file as specified with the -o flag. with open(output_file_name, 'w') as f_out: - for subdir, dirs, files in os.walk(args.inputDir): + for subdir, _, files in os.walk(input_dir): + subdir_path = Path(subdir).relative_to(input_dir) + for file in files: # Don't hash the output file if file == output_file_name.split('/')[-1]: continue - filename = os.path.join(subdir, file) - md5hash = hashlib.md5(open(filename, 'rb').read()).hexdigest() - f_out.write(file + ": " + md5hash + '\n') + + filepath = subdir_path / file + if filepath not in keep_list: + logging.info(f"Won't include file {filepath} in tarball") + + if filepath not in discard_list: + # If a file is neither in the keep list nor the discard,list, fail, as it is an unexpected file. + # This way we are making sure we don't forget to include a newly created file into the tarball + sys.exit(f"Found found file {filepath} neither in keep list nor discard list. ") + else: + md5hash = hashlib.md5(open(Path(subdir) / file, 'rb').read()).hexdigest() + f_out.write(f"{md5hash} {filepath}\n") # 2 whitespaces in order to be compatible with GNU md5sum + if __name__ == "__main__": main() diff --git a/pipeline/workflow/CompileVCFFiles.py b/pipeline/workflow/CompileVCFFiles.py index 39f01c102..6428de24b 100644 --- a/pipeline/workflow/CompileVCFFiles.py +++ b/pipeline/workflow/CompileVCFFiles.py @@ -5,6 +5,7 @@ import subprocess import tarfile import tempfile +from pathlib import Path from shutil import copy import luigi @@ -845,17 +846,6 @@ def run(self): @requires(bayesdel_processing.AddBayesdelScores) -class LinkBuiltFinal(DefaultPipelineTask): - def output(self): - return luigi.LocalTarget(os.path.join(self.release_dir, "built_final.tsv")) - - def run(self): - # create relative symlink to have a permanent pointer to what currently the final output of the pipeline is - relativ_input = os.path.relpath(self.input().path, os.path.dirname(self.output().path)) - os.symlink(relativ_input, self.output().path) - - -@requires(LinkBuiltFinal) class FindMissingReports(DefaultPipelineTask): def output(self): return luigi.LocalTarget(os.path.join(self.artifacts_dir, "missing_reports.log")) @@ -872,7 +862,7 @@ def run(self): pipeline_utils.check_file_for_contents(self.output().path) -@requires(LinkBuiltFinal) +@requires(bayesdel_processing.AddBayesdelScores) class RunDiffAndAppendChangeTypesToOutput(DefaultPipelineTask): def _extract_release_date(self, version_json): with open(version_json, 'r') as f: @@ -981,6 +971,18 @@ def run(self): os.path.join(self.release_dir, "reports_with_change_types.tsv")) +@requires(RunDiffAndAppendChangeTypesToOutput) +class LinkBuiltFinal(DefaultPipelineTask): + def output(self): + return luigi.LocalTarget(os.path.join(self.release_dir, "built_final.tsv")) + + def run(self): + input_path = self.input()['built_with_change_types'].path + # create relative symlink to have a permanent pointer to what currently the final output of the pipeline is + relative_input = os.path.relpath(input_path, os.path.dirname(self.output().path)) + os.symlink(relative_input, self.output().path) + + @requires(LinkBuiltFinal) class GenerateVariantsOutputFile(DefaultPipelineTask): VAR_OUTPUT_FILE_KEY = 'var_output_file' @@ -1064,8 +1066,15 @@ def output(self): def run(self): os.chdir(utilities_method_dir) - args = ["python", "generateMD5Sums.py", "-i", self.cfg.output_dir, "-o", - self.output().path] + workflow_dir = os.path.abspath(os.path.join(os.path.realpath(__file__), os.pardir)) + + keep_list_file_path = os.path.join(workflow_dir, "tarball_files_keep_list.txt") + discard_list_file_path = os.path.join(workflow_dir, "tarball_files_discard_list.txt") + args = ["python", "generateMD5Sums.py", + "-i", self.cfg.output_dir, + "-o", self.output().path, + "--keepListFilePath", keep_list_file_path, + "--discardListFilePath", discard_list_file_path] pipeline_utils.run_process(args) pipeline_utils.check_file_for_contents(self.output().path) @@ -1073,22 +1082,22 @@ def run(self): @requires(GenerateMD5Sums) class GenerateReleaseArchive(DefaultPipelineTask): - def getArchiveName(self): + def output(self): # Format archive filename as release-mm-dd-yy.tar.gz - return "release-" + self.cfg.date.strftime("%x").replace('/', - '-') + ".tar.gz" + archive_name = f'release-{self.cfg.date.strftime("%x").replace("/", "-")}.tar.gz' + return luigi.LocalTarget(Path(self.cfg.output_dir).parent / archive_name) - def getArchiveParentDirectory(self): - return os.path.dirname(self.cfg.output_dir) + "/" + def run(self): + # parse md5sum list. + with open(self.input().path) as f: + file_list = [line.strip().split(' ')[-1] for line in f.readlines()] - def output(self): - return luigi.LocalTarget( - self.getArchiveParentDirectory() + self.getArchiveName()) + # include md5sum in tar as well + file_list.append(Path(self.input().path).name) + + output_dir = Path(self.cfg.output_dir) + with tarfile.open(self.output().path, "w:gz") as tar: + for file in file_list: + file_path = output_dir / file + tar.add(file_path, arcname=Path(output_dir.name) / file) - def run(self): - os.chdir(self.getArchiveParentDirectory()) - with tarfile.open( - self.getArchiveParentDirectory() + self.getArchiveName(), - "w:gz") as tar: - tar.add(self.cfg.output_dir, - arcname=os.path.basename(self.cfg.output_dir)) diff --git a/pipeline/workflow/tarball_files_discard_list.txt b/pipeline/workflow/tarball_files_discard_list.txt new file mode 100644 index 000000000..c205915a8 --- /dev/null +++ b/pipeline/workflow/tarball_files_discard_list.txt @@ -0,0 +1,138 @@ +./1000G.sorted.hg38.vcffor_pipeline +./release/artifacts/1000_Genomesready.vcf +./release/artifacts/1000_Genomes.vcf +./release/artifacts/aggregated.tsv +./release/artifacts/bayesdel.vcf +./release/artifacts/BICready.vcf +./release/artifacts/BIC.vcf +./release/artifacts/built.tsv +./release/artifacts/built_with_bayesdel.tsv +./release/artifacts/built_with_ca_ids.tsv +./release/artifacts/built_with_mupit.tsv +./release/artifacts/built_with_priors_clean.tsv +./release/artifacts/built_with_priors.tsv +./release/artifacts/built_with_vr_ids.tsv +./release/artifacts/ClinVarready.vcf +./release/artifacts/ClinVar.vcf +./release/artifacts/enigma_from_clinvar.tsv +./release/artifacts/ESPready.vcf +./release/artifacts/ESP.vcf +./release/artifacts/ExACready.vcf +./release/artifacts/ExAC.vcf +./release/artifacts/exLOVDready.vcf +./release/artifacts/exLOVD.vcf +./release/artifacts/Findlay_BRCA1_Ring_Function_Scoresready.vcf +./release/artifacts/Findlay_BRCA1_Ring_Function_Scores.vcf +./release/artifacts/GnomADready.vcf +./release/artifacts/GnomAD.vcf +./release/artifacts/LOVDready.vcf +./release/artifacts/LOVD.vcf +./release/artifacts/releaseDiff.log +./release/artifacts/right1000_Genomes +./release/artifacts/rightBIC +./release/artifacts/rightClinVar +./release/artifacts/rightESP +./release/artifacts/rightExAC +./release/artifacts/rightexLOVD +./release/artifacts/rightFindlay_BRCA1_Ring_Function_Scores +./release/artifacts/rightGnomAD +./release/artifacts/rightLOVD +./release/artifacts/victor_wdir/input.vcf.gz +./release/artifacts/victor_wdir/input.vcf.gz.tbi +./release/artifacts/victor_wdir/output.10.qc.vcf.gz +./release/artifacts/victor_wdir/output.10.qc.vcf.gz.tbi +./release/artifacts/victor_wdir/output.10.vcf.gz +./release/artifacts/victor_wdir/output.10.vcf.gz.tbi +./release/artifacts/victor_wdir/output.11.qc.vcf.gz +./release/artifacts/victor_wdir/output.11.qc.vcf.gz.tbi +./release/artifacts/victor_wdir/output.11.vcf.gz +./release/artifacts/victor_wdir/output.11.vcf.gz.tbi +./release/artifacts/victor_wdir/output.12.qc.vcf.gz +./release/artifacts/victor_wdir/output.12.qc.vcf.gz.tbi +./release/artifacts/victor_wdir/output.12.vcf.gz +./release/artifacts/victor_wdir/output.12.vcf.gz.tbi +./release/artifacts/victor_wdir/output.13.qc.vcf.gz +./release/artifacts/victor_wdir/output.13.qc.vcf.gz.tbi +./release/artifacts/victor_wdir/output.13.vcf.gz +./release/artifacts/victor_wdir/output.13.vcf.gz.tbi +./release/artifacts/victor_wdir/output.14.qc.vcf.gz +./release/artifacts/victor_wdir/output.14.qc.vcf.gz.tbi +./release/artifacts/victor_wdir/output.14.vcf.gz +./release/artifacts/victor_wdir/output.14.vcf.gz.tbi +./release/artifacts/victor_wdir/output.15.qc.vcf.gz +./release/artifacts/victor_wdir/output.15.qc.vcf.gz.tbi +./release/artifacts/victor_wdir/output.15.vcf.gz +./release/artifacts/victor_wdir/output.15.vcf.gz.tbi +./release/artifacts/victor_wdir/output.16.qc.vcf.gz +./release/artifacts/victor_wdir/output.16.qc.vcf.gz.tbi +./release/artifacts/victor_wdir/output.16.vcf.gz +./release/artifacts/victor_wdir/output.16.vcf.gz.tbi +./release/artifacts/victor_wdir/output.17.qc.vcf.gz +./release/artifacts/victor_wdir/output.17.qc.vcf.gz.tbi +./release/artifacts/victor_wdir/output.17.vcf.gz +./release/artifacts/victor_wdir/output.17.vcf.gz.tbi +./release/artifacts/victor_wdir/output.18.qc.vcf.gz +./release/artifacts/victor_wdir/output.18.qc.vcf.gz.tbi +./release/artifacts/victor_wdir/output.18.vcf.gz +./release/artifacts/victor_wdir/output.18.vcf.gz.tbi +./release/artifacts/victor_wdir/output.19.qc.vcf.gz +./release/artifacts/victor_wdir/output.19.qc.vcf.gz.tbi +./release/artifacts/victor_wdir/output.19.vcf.gz +./release/artifacts/victor_wdir/output.19.vcf.gz.tbi +./release/artifacts/victor_wdir/output.1.qc.vcf.gz +./release/artifacts/victor_wdir/output.1.qc.vcf.gz.tbi +./release/artifacts/victor_wdir/output.1.vcf.gz +./release/artifacts/victor_wdir/output.1.vcf.gz.tbi +./release/artifacts/victor_wdir/output.20.qc.vcf.gz +./release/artifacts/victor_wdir/output.20.qc.vcf.gz.tbi +./release/artifacts/victor_wdir/output.20.vcf.gz +./release/artifacts/victor_wdir/output.20.vcf.gz.tbi +./release/artifacts/victor_wdir/output.21.qc.vcf.gz +./release/artifacts/victor_wdir/output.21.qc.vcf.gz.tbi +./release/artifacts/victor_wdir/output.21.vcf.gz +./release/artifacts/victor_wdir/output.21.vcf.gz.tbi +./release/artifacts/victor_wdir/output.22.qc.vcf.gz +./release/artifacts/victor_wdir/output.22.qc.vcf.gz.tbi +./release/artifacts/victor_wdir/output.22.vcf.gz +./release/artifacts/victor_wdir/output.22.vcf.gz.tbi +./release/artifacts/victor_wdir/output.2.qc.vcf.gz +./release/artifacts/victor_wdir/output.2.qc.vcf.gz.tbi +./release/artifacts/victor_wdir/output.2.vcf.gz +./release/artifacts/victor_wdir/output.2.vcf.gz.tbi +./release/artifacts/victor_wdir/output.3.qc.vcf.gz +./release/artifacts/victor_wdir/output.3.qc.vcf.gz.tbi +./release/artifacts/victor_wdir/output.3.vcf.gz +./release/artifacts/victor_wdir/output.3.vcf.gz.tbi +./release/artifacts/victor_wdir/output.4.qc.vcf.gz +./release/artifacts/victor_wdir/output.4.qc.vcf.gz.tbi +./release/artifacts/victor_wdir/output.4.vcf.gz +./release/artifacts/victor_wdir/output.4.vcf.gz.tbi +./release/artifacts/victor_wdir/output.5.qc.vcf.gz +./release/artifacts/victor_wdir/output.5.qc.vcf.gz.tbi +./release/artifacts/victor_wdir/output.5.vcf.gz +./release/artifacts/victor_wdir/output.5.vcf.gz.tbi +./release/artifacts/victor_wdir/output.6.qc.vcf.gz +./release/artifacts/victor_wdir/output.6.qc.vcf.gz.tbi +./release/artifacts/victor_wdir/output.6.vcf.gz +./release/artifacts/victor_wdir/output.6.vcf.gz.tbi +./release/artifacts/victor_wdir/output.7.qc.vcf.gz +./release/artifacts/victor_wdir/output.7.qc.vcf.gz.tbi +./release/artifacts/victor_wdir/output.7.vcf.gz +./release/artifacts/victor_wdir/output.7.vcf.gz.tbi +./release/artifacts/victor_wdir/output.8.qc.vcf.gz +./release/artifacts/victor_wdir/output.8.qc.vcf.gz.tbi +./release/artifacts/victor_wdir/output.8.vcf.gz +./release/artifacts/victor_wdir/output.8.vcf.gz.tbi +./release/artifacts/victor_wdir/output.9.qc.vcf.gz +./release/artifacts/victor_wdir/output.9.qc.vcf.gz.tbi +./release/artifacts/victor_wdir/output.9.vcf.gz +./release/artifacts/victor_wdir/output.9.vcf.gz.tbi +./release/artifacts/victor_wdir/output.for_PROV +./release/artifacts/victor_wdir/output.vqslod +./release/artifacts/victor_wdir/output.X.qc.vcf.gz +./release/artifacts/victor_wdir/output.X.qc.vcf.gz.tbi +./release/artifacts/victor_wdir/output.X.vcf.gz +./release/artifacts/victor_wdir/output.X.vcf.gz.tbi +./release/artifacts/victor_wdir/slurm.annotate.run_1.start +./release/artifacts/victor_wdir/slurm.annotate.run_1.stop +./release/artifacts/victor_wdir/slurm.annotate.run_1.version diff --git a/pipeline/workflow/tarball_files_keep_list.txt b/pipeline/workflow/tarball_files_keep_list.txt new file mode 100644 index 000000000..25f04e316 --- /dev/null +++ b/pipeline/workflow/tarball_files_keep_list.txt @@ -0,0 +1,51 @@ +./1000G.sorted.hg38.vcf +./bic_brca12.sorted.hg38.vcf +./ClinVar.vcf +./enigma_from_clinvar.tsv +./esp.sorted.hg38.vcf +./exac.brca12.sorted.hg38.vcf +./exLOVD_brca12.sorted.hg38.vcf +./findlay_BRCA1_ring_function_scores.clean.sorted.hg38.vcf +./gnomAD.sorted.hg38.vcf +./md5sums.txt +./README.txt +./release/artifacts/brca-pseudonym-generator.log +./release/artifacts/discarded_reports.tsv +./release/artifacts/ENIGMA_wrong_genome.txt +./release/artifacts/exLOVD_BRCA1_error_variants.txt +./release/artifacts/exLOVD_BRCA2_error_variants.txt +./release/artifacts/get_ca_id.log +./release/artifacts/LOVD_error_variants.txt +./release/artifacts/merged.tsv +./release/artifacts/missing_reports.log +./release/artifacts/reports.tsv +./release/artifacts/variant_merging.log +./release/artifacts/victor_annotation.log +./release/artifacts/wrong_genome_coors/1000_Genomes_wrong_genome_coor.vcf +./release/artifacts/wrong_genome_coors/BIC_wrong_genome_coor.vcf +./release/artifacts/wrong_genome_coors/ClinVar_wrong_genome_coor.vcf +./release/artifacts/wrong_genome_coors/ESP_wrong_genome_coor.vcf +./release/artifacts/wrong_genome_coors/ExAC_wrong_genome_coor.vcf +./release/artifacts/wrong_genome_coors/exLOVD_wrong_genome_coor.vcf +./release/artifacts/wrong_genome_coors/Findlay_BRCA1_Ring_Function_Scores_wrong_genome_coor.vcf +./release/artifacts/wrong_genome_coors/GnomAD_wrong_genome_coor.vcf +./release/artifacts/wrong_genome_coors/LOVD_wrong_genome_coor.vcf +./release/built_final.tsv +./release/built_with_change_types.tsv +./release/diff/added_data_reports.tsv +./release/diff/added_data.tsv +./release/diff/added_reports.tsv +./release/diff/added.tsv +./release/diff/diff.json +./release/diff/diff_reports.json +./release/diff/diff_reports.txt +./release/diff/diff.txt +./release/diff/README.txt +./release/diff/removed_reports.tsv +./release/diff/removed.tsv +./release/field_metadata.tsv +./release/metadata/version.json +./release/reports_with_change_types.tsv +./sharedLOVD.sorted.hg38.vcf +./variants_output_field_metadata.tsv +./variants_output.tsv