diff --git a/definitions/pipelines/germline_wgs.cwl b/definitions/pipelines/germline_wgs.cwl index a6650249..ca61187a 100644 --- a/definitions/pipelines/germline_wgs.cwl +++ b/definitions/pipelines/germline_wgs.cwl @@ -152,6 +152,11 @@ inputs: disclaimer_text: type: string? default: 'Workflow source can be found at https://github.com/genome/analysis-workflows' + annotsv_annotations: + type: + - string + - Directory + doc: "directory/path of the annotsv annotations directory" outputs: cram: type: File @@ -468,6 +473,7 @@ steps: sv_split_count: sv_filter_split_count genome_build: vep_ensembl_assembly blocklist_bedpe: blocklist_bedpe + annotsv_annotations: annotsv_annotations out: [cn_diagram, cn_scatter_plot, tumor_antitarget_coverage, tumor_target_coverage, tumor_bin_level_ratios, tumor_segmented_ratios, cnvkit_vcf, cnvnator_cn_file, cnvnator_root, cnvnator_vcf, manta_diploid_variants, manta_somatic_variants, manta_all_candidates, manta_small_candidates, manta_tumor_only_variants, smoove_output_variants, cnvkit_filtered_vcf, cnvnator_filtered_vcf, manta_filtered_vcf, smoove_filtered_vcf, survivor_merged_vcf, survivor_merged_annotated_tsv, bcftools_merged_vcf, bcftools_merged_annotated_tsv, bcftools_merged_filtered_annotated_tsv] add_disclaimer_survivor_sv_vcf: diff --git a/definitions/subworkflows/gatk_soft_filter.cwl b/definitions/subworkflows/gatk_soft_filter.cwl new file mode 100644 index 00000000..eaf6845b --- /dev/null +++ b/definitions/subworkflows/gatk_soft_filter.cwl @@ -0,0 +1,86 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.0 +class: Workflow +label: "apply soft filtering to a gatk called vcf using hard filter parameters" +requirements: + - class: SubworkflowFeatureRequirement + - class: StepInputExpressionRequirement + - class: MultipleInputFeatureRequirement +inputs: + reference: + type: + - string + - File + secondaryFiles: [.fai, ^.dict] + vcf: + type: File + secondaryFiles: [.tbi] + output_basename: + type: string? 
+ default: "soft_filtered" +outputs: + filtered_vcf: + type: File + secondaryFiles: [.tbi] + outputSource: index_merged/indexed_vcf +steps: + split_snps: + run: ../tools/select_variants.cwl + in: + reference: reference + vcf: vcf + output_vcf_basename: + default: "SNPS" + select_type: + default: "SNP" + out: + [filtered_vcf] + split_indels: + run: ../tools/select_variants.cwl + in: + reference: reference + vcf: vcf + output_vcf_basename: + default: "INDELS" + select_type: + default: "INDEL" + out: + [filtered_vcf] + filter_snps: + run: ../tools/variant_filtration.cwl + in: + reference: reference + vcf: split_snps/filtered_vcf + filters: + default: ["QD<2.0;QD2", "QUAL<30.0;QUAL30", "SOR>3.0;SOR3", "FS>60.0;FS60", "MQ<40.0;MQ40", "MQRankSum<-12.5;MQRankSum-12.5", "ReadPosRankSum<-8.0;ReadPosRankSum-8"] + output_vcf_basename: + default: "SNPS.filtered" + out: + [filtered_vcf] + filter_indels: + run: ../tools/variant_filtration.cwl + in: + reference: reference + vcf: split_indels/filtered_vcf + filters: + default: ["QD<2.0;QD2", "QUAL<30.0;QUAL30", "FS>200.0;FS200", "ReadPosRankSum<-20.0;ReadPosRankSum-20"] + output_vcf_basename: + default: "INDELS.filtered" + out: + [filtered_vcf] + merge: + run: ../tools/merge_vcf.cwl + in: + merged_vcf_basename: output_basename + vcfs: + source: [filter_snps/filtered_vcf, filter_indels/filtered_vcf] + linkMerge: merge_flattened + out: + [merged_vcf] + index_merged: + run: ../tools/index_vcf.cwl + in: + vcf: merge/merged_vcf + out: + [indexed_vcf] diff --git a/definitions/subworkflows/joint_cnvkit.cwl b/definitions/subworkflows/joint_cnvkit.cwl new file mode 100644 index 00000000..de8ff4f1 --- /dev/null +++ b/definitions/subworkflows/joint_cnvkit.cwl @@ -0,0 +1,92 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.0 +class: Workflow +label: "jointly run cnvkit for sv calls" +requirements: + - class: SubworkflowFeatureRequirement + - class: StepInputExpressionRequirement + - class: InlineJavascriptRequirement + - class: 
ScatterFeatureRequirement +inputs: + sample_names: + type: string[] + bams: + type: File[] + secondaryFiles: [^.bai] + reference_fasta: + type: + - string + - File + secondaryFiles: [.fai] + reference_cnn: + type: File? + doc: "can be a flat reference or reference based on a panel of normals" + method: + type: + - "null" + - type: enum + symbols: ["hybrid", "amplicon", "wgs"] + segment_filter: + type: + - "null" + - type: enum + symbols: ["ampdel", "ci", "cn", "sem"] +outputs: + vcfs: + type: File[] + outputSource: index_cnvkit/indexed_vcf + secondaryFiles: [.tbi] + cnr: + type: File[] + outputSource: cnvkit/tumor_bin_level_ratios + cns: + type: File[] + outputSource: cnvkit/tumor_segmented_ratios +steps: + cnvkit: + scatter: [tumor_bam, cnvkit_vcf_name] + scatterMethod: dotproduct + run: cnvkit_single_sample.cwl + in: + method: method + reference_cnn: reference_cnn + tumor_bam: bams + cnvkit_vcf_name: + source: [sample_names] + valueFrom: "$(self).cnvkit.vcf" + segment_filter: segment_filter + fasta_reference: reference_fasta + out: + [tumor_bin_level_ratios, tumor_segmented_ratios, cnvkit_vcf] + bgzip_and_index: + scatter: [vcf] + run: bgzip_and_index.cwl + in: + vcf: cnvkit/cnvkit_vcf + out: + [indexed_vcf] + sample_rename: + scatter: [input_vcf, new_sample_name] + scatterMethod: dotproduct + run: ../tools/replace_vcf_sample_name.cwl + in: + input_vcf: bgzip_and_index/indexed_vcf + new_sample_name: sample_names + sample_to_replace: + valueFrom: 'adjusted.tumor' + output_name: + valueFrom: '${ + var sample = inputs.new_sample_name; + var name = sample + ".cnvkit.vcf.gz"; + return name; + }' + out: + [renamed_vcf] + index_cnvkit: + scatter: [vcf] + run: ../tools/index_vcf.cwl + in: + vcf: sample_rename/renamed_vcf + out: + [indexed_vcf] diff --git a/definitions/subworkflows/joint_cnvnator.cwl b/definitions/subworkflows/joint_cnvnator.cwl new file mode 100644 index 00000000..377caca1 --- /dev/null +++ b/definitions/subworkflows/joint_cnvnator.cwl @@ -0,0 +1,81 @@
+#!/usr/bin/env cwl-runner + +cwlVersion: v1.0 +class: Workflow +label: "run cnvnator for multiple samples" +requirements: + - class: SubworkflowFeatureRequirement + - class: ScatterFeatureRequirement + - class: StepInputExpressionRequirement + # sample_rename uses ${...} JavaScript bodies in valueFrom, which are only evaluated when InlineJavascriptRequirement is declared + - class: InlineJavascriptRequirement +inputs: + reference: + type: + - string + - File + secondaryFiles: [.fai, ^.dict] + sample_names: + type: string[] + bams: + type: File[] + secondaryFiles: [^.bai] + bin_size: + type: int? +outputs: + vcfs: + type: File[] + outputSource: index_cnvnator/indexed_vcf + secondaryFiles: [.tbi] + root_files: + type: File[] + outputSource: cnvnator/root_file + cn_files: + type: File[] + outputSource: cnvnator/cn_file +steps: + cnvnator: + scatter: [bam, sample_name] + scatterMethod: dotproduct + run: ../tools/cnvnator.cwl + in: + bam: bams + reference: reference + sample_name: sample_names + bin_size: bin_size + out: + [vcf, root_file, cn_file] + bgzip_index: + scatter: [vcf] + run: bgzip_and_index.cwl + in: + vcf: cnvnator/vcf + out: + [indexed_vcf] + sample_rename: + scatter: [input_vcf, new_sample_name] + scatterMethod: dotproduct + run: ../tools/replace_vcf_sample_name.cwl + in: + input_vcf: bgzip_index/indexed_vcf + new_sample_name: sample_names + sample_to_replace: + valueFrom: '${ + var old_name = inputs.new_sample_name.split(".")[0]; + return old_name; + }' + output_name: + valueFrom: '${ + var sample = inputs.new_sample_name; + var name = sample + ".cnvnator.vcf.gz"; + return name; + }' + out: + [renamed_vcf] + index_cnvnator: + scatter: [vcf] + run: ../tools/index_vcf.cwl + in: + vcf: sample_rename/renamed_vcf + out: + [indexed_vcf] diff --git a/definitions/subworkflows/joint_detect_snps.cwl b/definitions/subworkflows/joint_detect_snps.cwl new file mode 100644 index 00000000..a7ec4721 --- /dev/null +++ b/definitions/subworkflows/joint_detect_snps.cwl @@ -0,0 +1,190 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.0 +class: Workflow +label: "joint germline snp variant detection" +requirements: + - class:
MultipleInputFeatureRequirement + - class: SubworkflowFeatureRequirement + - class: SchemaDefRequirement + types: + - $import: ../types/vep_custom_annotation.yml + - class: StepInputExpressionRequirement + - class: InlineJavascriptRequirement + - class: ScatterFeatureRequirement +inputs: + reference: + type: + - string + - File + secondaryFiles: [.fai, ^.dict] + bams: + type: File[] + secondaryFiles: [^.bai] + sample_names: + type: string[] + gvcf_gq_bands: + type: string[] + intervals: + type: + type: array + items: + type: array + items: string + contamination_fraction: + type: string[] + ploidy: + type: int? + vep_cache_dir: + type: + - string + - Directory + vep_ensembl_assembly: + type: string + doc: "genome assembly to use in vep. Examples: GRCh38 or GRCm38" + vep_ensembl_version: + type: string + doc: "ensembl version - Must be present in the cache directory. Example: 95" + vep_ensembl_species: + type: string + doc: "ensembl species - Must be present in the cache directory. Examples: homo_sapiens or mus_musculus" + vep_plugins: + type: string[] + default: [Frameshift, Wildtype] + synonyms_file: + type: File? + annotate_coding_only: + type: boolean? + vep_custom_annotations: + type: ../types/vep_custom_annotation.yml#vep_custom_annotation[] + doc: "custom type, check types directory for input format" + limit_variant_intervals: + type: File + variants_to_table_fields: + type: string[] + default: ['CHROM','POS','ID','REF','ALT'] + variants_to_table_genotype_fields: + type: string[] + vep_to_table_fields: + type: string[] + final_tsv_prefix: + type: string? + default: 'variants' + gnomad_max_pop_af: + type: float + default: 0.05 + min_conf_call: + type: float? 
+outputs: + sample_gvcfs: + type: File[] + outputSource: per_sample_merge_gvcfs/gvcf + raw_vcf: + type: File + outputSource: genotype/raw_vcf + secondaryFiles: [.tbi] + final_vcf: + type: File + outputSource: genotype/final_vcf + secondaryFiles: [.tbi] + filtered_vcf: + type: File + outputSource: genotype/filtered_vcf + secondaryFiles: [.tbi] + vep_summary: + type: File + outputSource: genotype/vep_summary + final_tsv: + type: File + outputSource: genotype/final_tsv + filtered_tsv: + type: File + outputSource: genotype/filtered_tsv + all_staged: + type: Directory + outputSource: stage_all/gathered_directory +steps: + per_sample_make_gvcfs: + scatter: [bam, contamination_fraction] + scatterMethod: dotproduct + run: gatk_haplotypecaller_iterator.cwl + in: + reference: reference + bam: bams + emit_reference_confidence: + default: 'GVCF' + gvcf_gq_bands: gvcf_gq_bands + intervals: intervals + contamination_fraction: contamination_fraction + ploidy: ploidy + out: + [gvcf] + per_sample_merge_gvcfs: + scatter: [gvcfs, output_file_name] + scatterMethod: dotproduct + run: ../tools/combine_gvcfs.cwl + in: + reference: reference + gvcfs: per_sample_make_gvcfs/gvcf + output_file_name: + source: [sample_names] + valueFrom: "$(self).merged.g.vcf.gz" + out: + [gvcf] + genotype: + run: joint_genotype.cwl + in: + reference: reference + gvcfs: + source: [per_sample_merge_gvcfs/gvcf] + linkMerge: merge_flattened + intervals: intervals + vep_cache_dir: vep_cache_dir + vep_ensembl_assembly: vep_ensembl_assembly + vep_ensembl_version: vep_ensembl_version + vep_ensembl_species: vep_ensembl_species + vep_plugins: vep_plugins + synonyms_file: synonyms_file + annotate_coding_only: annotate_coding_only + vep_custom_annotations: vep_custom_annotations + roi_intervals: limit_variant_intervals + variants_to_table_fields: variants_to_table_fields + variants_to_table_genotype_fields: variants_to_table_genotype_fields + vep_to_table_fields: vep_to_table_fields + final_tsv_prefix: final_tsv_prefix 
+ gnomad_max_pop_af: gnomad_max_pop_af + min_conf_call: min_conf_call + out: + [raw_vcf, annotated_vcf, final_vcf, filtered_vcf, vep_summary, final_tsv, filtered_tsv] + stage_gvcf: + run: ../tools/gather_to_sub_directory.cwl + in: + outdir: + valueFrom: "gvcfs" + files: + source: [per_sample_merge_gvcfs/gvcf] + linkMerge: merge_flattened + out: + [gathered_directory] + + stage_gatk: + run: ../tools/gather_to_sub_directory.cwl + in: + outdir: + valueFrom: "gatk" + files: + source: [genotype/raw_vcf, genotype/annotated_vcf, genotype/final_vcf, genotype/filtered_vcf, genotype/vep_summary, genotype/final_tsv, genotype/filtered_tsv] + linkMerge: merge_flattened + directory: stage_gvcf/gathered_directory + out: + [gathered_directory] + stage_all: + run: ../tools/gather_to_sub_directory_dirs.cwl + in: + outdir: + valueFrom: "SNP_pipeline" + directories: + source: [stage_gatk/gathered_directory] + linkMerge: merge_flattened + out: + [gathered_directory] diff --git a/definitions/subworkflows/joint_detect_svs.cwl b/definitions/subworkflows/joint_detect_svs.cwl new file mode 100644 index 00000000..9bbe2bce --- /dev/null +++ b/definitions/subworkflows/joint_detect_svs.cwl @@ -0,0 +1,296 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.0 +class: Workflow +label: "joint detect svs" +requirements: + - class: SubworkflowFeatureRequirement + - class: ScatterFeatureRequirement + # the staging steps set outdir through valueFrom and gather several sources with linkMerge, so both features below must be declared + - class: StepInputExpressionRequirement + - class: MultipleInputFeatureRequirement +inputs: + reference: + type: + - string + - File + secondaryFiles: [.fai, ^.dict] + bams: + type: File[] + secondaryFiles: [^.bai] + sample_names: + type: string[] + cohort_name: + type: string + exclude_regions: + type: File? + manta_call_regions: + type: File? + manta_output_contigs: + type: boolean? + cnvnator_bin_size: + type: int? + cnvkit_method: + type: + - "null" + - type: enum + symbols: ["hybrid", "amplicon", "wgs"] + cnvkit_reference_cnn: + type: File? + cnvkit_segment_filter: + type: + - "null" + - type: enum + symbols: ["ampdel", "ci", "cn", "sem"] + filter_del_depth: + type: double?
+ filter_dup_depth: + type: double? + filter_paired_count: + type: int? + filter_split_count: + type: int? + filter_alt_abundance_percentage: + type: double? + filter_depth_caller_min_size: + type: int? + survivor_estimate_sv_distance: + type: boolean + genome_build: + type: string + survivor_max_distance_to_merge: + type: int + survivor_minimum_sv_calls: + type: int + survivor_minimum_sv_size: + type: int + survivor_same_strand: + type: boolean + survivor_same_type: + type: boolean + snps_vcf: + type: File? + filter_blocklist_bedpe: + type: File? + annotsv_filter_pop_af: + type: double? + annotsv_filter_no_CDS: + type: boolean? + annotsv_annotations: + type: + - string + - Directory +outputs: + all_staged: + type: Directory + outputSource: stage_all/gathered_directory +steps: +# stage 1, variant calling + smoove: + run: ../tools/smoove.cwl + in: + bams: bams + cohort_name: cohort_name + reference: reference + exclude_regions: exclude_regions + out: + [output_vcf] + index_smoove: + run: ../tools/index_vcf.cwl + in: + vcf: smoove/output_vcf + out: + [indexed_vcf] + stage_raw_smoove: + run: ../tools/gather_to_sub_directory.cwl + in: + outdir: + valueFrom: "smoove" + files: + source: [index_smoove/indexed_vcf] + linkMerge: merge_flattened + out: + [gathered_directory] + manta: + run: ../tools/manta_germline.cwl + in: + bams: bams + reference: reference + call_regions: manta_call_regions + output_contigs: manta_output_contigs + out: + [diploid_variants, all_candidates, small_candidates, stats] + stage_raw_manta: + run: ../tools/gather_to_sub_directory.cwl + in: + outdir: + valueFrom: "manta" + files: + source: [manta/diploid_variants, manta/all_candidates, manta/small_candidates] + linkMerge: merge_flattened + directory: manta/stats + out: + [gathered_directory] + cnvnator: + run: joint_cnvnator.cwl + in: + reference: reference + sample_names: sample_names + bams: bams + bin_size: cnvnator_bin_size + out: + [vcfs, root_files, cn_files] + stage_raw_cnvnator: + run: 
../tools/gather_to_sub_directory.cwl + in: + outdir: + valueFrom: "cnvnator" + files: + source: [cnvnator/vcfs, cnvnator/root_files, cnvnator/cn_files] + linkMerge: merge_flattened + out: + [gathered_directory] + cnvkit: + run: joint_cnvkit.cwl + in: + sample_names: sample_names + bams: bams + reference_fasta: reference + reference_cnn: cnvkit_reference_cnn + method: cnvkit_method + segment_filter: cnvkit_segment_filter + out: + [vcfs, cnr, cns] + stage_raw_cnvkit: + run: ../tools/gather_to_sub_directory.cwl + in: + outdir: + valueFrom: "cnvkit" + files: + source: [cnvkit/vcfs, cnvkit/cnr, cnvkit/cns] + linkMerge: merge_flattened + out: + [gathered_directory] + stage_raw: + run: ../tools/gather_to_sub_directory_dirs.cwl + in: + outdir: + valueFrom: "raw" + directories: + source: [stage_raw_smoove/gathered_directory, stage_raw_manta/gathered_directory, stage_raw_cnvnator/gathered_directory, stage_raw_cnvkit/gathered_directory] + linkMerge: merge_flattened + out: + [gathered_directory] +# stage 2, filtering + filter_smoove: + run: sv_joint_read_caller_filter.cwl + in: + reference: reference + sample_names: sample_names + bams: bams + filter_del_depth: filter_del_depth + filter_dup_depth: filter_dup_depth + filter_paired_count: filter_paired_count + filter_split_count: filter_split_count + filter_alt_abundance_percentage: filter_alt_abundance_percentage + sv_vcf: index_smoove/indexed_vcf + vcf_source: + default: "smoove" + out: + [vcfs] + filter_manta: + run: sv_joint_read_caller_filter.cwl + in: + reference: reference + sample_names: sample_names + bams: bams + filter_del_depth: filter_del_depth + filter_dup_depth: filter_dup_depth + filter_paired_count: filter_paired_count + filter_split_count: filter_split_count + filter_alt_abundance_percentage: filter_alt_abundance_percentage + sv_vcf: manta/diploid_variants + vcf_source: + default: "manta" + out: + [vcfs] + filter_cnvnator: + run: sv_joint_depth_caller_filter.cwl + in: + reference: reference + sample_names: 
sample_names + bams: bams + filter_del_depth: filter_del_depth + filter_dup_depth: filter_dup_depth + sv_vcfs: cnvnator/vcfs + vcf_source: + default: "cnvnator" + min_sv_size: filter_depth_caller_min_size + out: + [vcfs] + filter_cnvkit: + run: sv_joint_depth_caller_filter.cwl + in: + reference: reference + sample_names: sample_names + bams: bams + filter_del_depth: filter_del_depth + filter_dup_depth: filter_dup_depth + sv_vcfs: cnvkit/vcfs + vcf_source: + default: "cnvkit" + min_sv_size: filter_depth_caller_min_size + out: + [vcfs] + stage_filtered: + run: ../tools/gather_to_sub_directory.cwl + in: + outdir: + valueFrom: "filtered" + files: + source: [filter_smoove/vcfs, filter_manta/vcfs, filter_cnvnator/vcfs, filter_cnvkit/vcfs] + linkMerge: merge_flattened + out: + [gathered_directory] +# stage3, merge+annotate+filter + merge_svs: + run: merge_svs.cwl + in: + cohort_name: cohort_name + estimate_sv_distance: survivor_estimate_sv_distance + genome_build: genome_build + max_distance_to_merge: survivor_max_distance_to_merge + minimum_sv_calls: survivor_minimum_sv_calls + minimum_sv_size: survivor_minimum_sv_size + same_strand: survivor_same_strand + same_type: survivor_same_type + snps_vcf: snps_vcf + sv_vcfs: + source: [filter_smoove/vcfs, filter_manta/vcfs, filter_cnvnator/vcfs, filter_cnvkit/vcfs] + linkMerge: merge_flattened + blocklist_bedpe: filter_blocklist_bedpe + filter_pop_af: annotsv_filter_pop_af + filter_no_CDS: annotsv_filter_no_CDS + annotsv_annotations: annotsv_annotations + out: + [bcftools_merged_sv_vcf, bcftools_merged_annotated_tsv, bcftools_merged_unannotated_tsv, bcftools_merged_filtered_annotated_tsv, survivor_merged_sv_vcf, survivor_merged_annotated_tsv, survivor_merged_unannotated_tsv, survivor_merged_filtered_annotated_tsv] + stage_merged: + run: ../tools/gather_to_sub_directory.cwl + in: + outdir: + valueFrom: "merged" + files: + source: [merge_svs/bcftools_merged_sv_vcf, merge_svs/bcftools_merged_annotated_tsv, 
merge_svs/bcftools_merged_unannotated_tsv, merge_svs/bcftools_merged_filtered_annotated_tsv, merge_svs/survivor_merged_sv_vcf, merge_svs/survivor_merged_annotated_tsv, merge_svs/survivor_merged_unannotated_tsv, merge_svs/survivor_merged_filtered_annotated_tsv] + linkMerge: merge_flattened + out: + [gathered_directory] + stage_all: + run: ../tools/gather_to_sub_directory_dirs.cwl + in: + outdir: + valueFrom: "SV_pipeline" + directories: + source: [stage_raw/gathered_directory, stage_filtered/gathered_directory, stage_merged/gathered_directory] + linkMerge: merge_flattened + out: + [gathered_directory] diff --git a/definitions/subworkflows/joint_detect_variants.cwl b/definitions/subworkflows/joint_detect_variants.cwl new file mode 100644 index 00000000..e1cf579b --- /dev/null +++ b/definitions/subworkflows/joint_detect_variants.cwl @@ -0,0 +1,204 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.0 +class: Workflow +label: "joint variant detection(snps,svs)" +requirements: + - class: MultipleInputFeatureRequirement + - class: SubworkflowFeatureRequirement + - class: SchemaDefRequirement + types: + - $import: ../types/vep_custom_annotation.yml + - class: StepInputExpressionRequirement + - class: InlineJavascriptRequirement + - class: ScatterFeatureRequirement +inputs: + reference: + type: + - string + - File + secondaryFiles: [.fai, ^.dict] + bams: + type: File[] + secondaryFiles: [^.bai] + sample_names: + type: string[] + cohort_name: + type: string + gvcf_gq_bands: + type: string[] + intervals: + type: + type: array + items: + type: array + items: string + contamination_fraction: + type: string[] + ploidy: + type: int? + vep_cache_dir: + type: + - string + - Directory + vep_ensembl_assembly: + type: string + doc: "genome assembly to use in vep. Examples: GRCh38 or GRCm38" + vep_ensembl_version: + type: string + doc: "ensembl version - Must be present in the cache directory.
Example: 95" + vep_ensembl_species: + type: string + doc: "ensembl species - Must be present in the cache directory. Examples: homo_sapiens or mus_musculus" + vep_plugins: + type: string[] + default: [Frameshift, Wildtype] + synonyms_file: + type: File? + annotate_coding_only: + type: boolean? + vep_custom_annotations: + type: ../types/vep_custom_annotation.yml#vep_custom_annotation[] + doc: "custom type, check types directory for input format" + limit_variant_intervals: + type: File + snp_to_table_fields: + type: string[] + default: ['CHROM','POS','ID','REF','ALT'] + snp_to_table_genotype_fields: + type: string[] + vep_to_table_fields: + type: string[] + snp_final_tsv_prefix: + type: string? + default: 'variants' + snp_gnomad_max_pop_af: + type: float + default: 0.05 + gatk_min_conf_call: + type: float? + + + sv_exclude_regions: + type: File? + manta_call_regions: + type: File? + manta_output_contigs: + type: boolean? + cnvnator_bin_size: + type: int? + cnvkit_method: + type: + - "null" + - type: enum + symbols: ["hybrid", "amplicon", "wgs"] + cnvkit_reference_cnn: + type: File? + cnvkit_segment_filter: + type: + - "null" + - type: enum + symbols: ["ampdel", "ci", "cn", "sem"] + sv_filter_del_depth: + type: double? + sv_filter_dup_depth: + type: double? + sv_filter_paired_count: + type: int? + sv_filter_split_count: + type: int? + sv_filter_alt_abundance_percentage: + type: double? + sv_filter_depth_caller_min_size: + type: int? + survivor_estimate_sv_distance: + type: boolean + survivor_max_distance_to_merge: + type: int + survivor_minimum_sv_calls: + type: int + survivor_minimum_sv_size: + type: int + survivor_same_strand: + type: boolean + survivor_same_type: + type: boolean + sv_filter_blocklist_bedpe: + type: File? + annotsv_filter_pop_af: + type: double? + annotsv_filter_no_CDS: + type: boolean?
+ annotsv_annotations: + type: + - string + - Directory +outputs: + snps_staged: + type: Directory + outputSource: detect_snps/all_staged + svs_staged: + type: Directory + outputSource: detect_svs/all_staged +steps: + detect_snps: + run: joint_detect_snps.cwl + in: + reference: reference + bams: bams + sample_names: sample_names + gvcf_gq_bands: gvcf_gq_bands + intervals: intervals + contamination_fraction: contamination_fraction + ploidy: ploidy + vep_cache_dir: vep_cache_dir + vep_ensembl_assembly: vep_ensembl_assembly + vep_ensembl_version: vep_ensembl_version + vep_ensembl_species: vep_ensembl_species + vep_plugins: vep_plugins + synonyms_file: synonyms_file + annotate_coding_only: annotate_coding_only + vep_custom_annotations: vep_custom_annotations + limit_variant_intervals: limit_variant_intervals + variants_to_table_fields: snp_to_table_fields + variants_to_table_genotype_fields: snp_to_table_genotype_fields + vep_to_table_fields: vep_to_table_fields + final_tsv_prefix: snp_final_tsv_prefix + gnomad_max_pop_af: snp_gnomad_max_pop_af + min_conf_call: gatk_min_conf_call + out: + [raw_vcf, all_staged] + detect_svs: + run: joint_detect_svs.cwl + in: + reference: reference + bams: bams + sample_names: sample_names + cohort_name: cohort_name + genome_build: vep_ensembl_assembly + exclude_regions: sv_exclude_regions + manta_call_regions: manta_call_regions + manta_output_contigs: manta_output_contigs + cnvnator_bin_size: cnvnator_bin_size + cnvkit_method: cnvkit_method + cnvkit_reference_cnn: cnvkit_reference_cnn + cnvkit_segment_filter: cnvkit_segment_filter + filter_del_depth: sv_filter_del_depth + filter_dup_depth: sv_filter_dup_depth + filter_paired_count: sv_filter_paired_count + filter_split_count: sv_filter_split_count + filter_alt_abundance_percentage: sv_filter_alt_abundance_percentage + filter_depth_caller_min_size: sv_filter_depth_caller_min_size + survivor_estimate_sv_distance: survivor_estimate_sv_distance + survivor_max_distance_to_merge:
survivor_max_distance_to_merge + survivor_minimum_sv_calls: survivor_minimum_sv_calls + survivor_minimum_sv_size: survivor_minimum_sv_size + survivor_same_strand: survivor_same_strand + survivor_same_type: survivor_same_type + snps_vcf: detect_snps/raw_vcf + filter_blocklist_bedpe: sv_filter_blocklist_bedpe + annotsv_filter_pop_af: annotsv_filter_pop_af + annotsv_filter_no_CDS: annotsv_filter_no_CDS + annotsv_annotations: annotsv_annotations + out: + [all_staged] diff --git a/definitions/subworkflows/joint_genotype.cwl b/definitions/subworkflows/joint_genotype.cwl index 3de95bd4..80e8bff6 100644 --- a/definitions/subworkflows/joint_genotype.cwl +++ b/definitions/subworkflows/joint_genotype.cwl @@ -64,13 +64,19 @@ inputs: final_tsv_prefix: type: string? default: 'variants' - filter_gnomAD_maximum_population_allele_frequency: + gnomad_max_pop_af: type: float default: 0.05 + min_conf_call: + type: float? outputs: raw_vcf: type: File - outputSource: merge_vcfs/merged_vcf + outputSource: normalize_index/indexed_vcf + secondaryFiles: [.tbi] + annotated_vcf: + type: File + outputSource: soft_filter/filtered_vcf secondaryFiles: [.tbi] final_vcf: type: File @@ -106,6 +112,7 @@ steps: source: [combine_gvcfs/gvcf] linkMerge: merge_flattened intervals: intervals + min_conf_call: min_conf_call out: [genotype_vcf] merge_vcfs: @@ -114,11 +121,35 @@ steps: vcfs: genotype_gvcf/genotype_vcf out: [merged_vcf] - + decompose: + run: ../tools/vt_decompose.cwl + in: + vcf: merge_vcfs/merged_vcf + out: + [decomposed_vcf] + decompose_index: + run: ../tools/index_vcf.cwl + in: + vcf: decompose/decomposed_vcf + out: + [indexed_vcf] + normalize: + run: ../tools/vt_normalize.cwl + in: + vcf: decompose_index/indexed_vcf + reference: reference + out: + [normalized_vcf] + normalize_index: + run: ../tools/index_vcf.cwl + in: + vcf: normalize/normalized_vcf + out: + [indexed_vcf] annotate_variants: run: ../tools/vep.cwl in: - vcf: merge_vcfs/merged_vcf + vcf: normalize_index/indexed_vcf cache_dir: vep_cache_dir ensembl_assembly:
vep_ensembl_assembly ensembl_version: vep_ensembl_version @@ -136,11 +167,20 @@ steps: vcf: annotate_variants/annotated_vcf out: [indexed_vcf] + soft_filter: + run: gatk_soft_filter.cwl + in: + reference: reference + vcf: bgzip_index_annotated_vcf/indexed_vcf + output_basename: + default: "annotated" + out: + [filtered_vcf] filter_vcf: run: germline_filter_vcf.cwl in: - annotated_vcf: annotate_variants/annotated_vcf - filter_gnomAD_maximum_population_allele_frequency: filter_gnomAD_maximum_population_allele_frequency + annotated_vcf: soft_filter/filtered_vcf + filter_gnomAD_maximum_population_allele_frequency: gnomad_max_pop_af gnomad_field_name: source: vep_custom_annotations valueFrom: | diff --git a/definitions/subworkflows/merge_svs.cwl b/definitions/subworkflows/merge_svs.cwl index 02373d00..97a78bd2 100644 --- a/definitions/subworkflows/merge_svs.cwl +++ b/definitions/subworkflows/merge_svs.cwl @@ -32,13 +32,26 @@ inputs: type: File[] blocklist_bedpe: type: File? + filter_pop_af: + type: double? + default: 0.05 + filter_no_CDS: + type: boolean?
+ annotsv_annotations: + type: + - string + - Directory + doc: "directory/path of the annotsv annotations directory" outputs: bcftools_merged_sv_vcf: type: File outputSource: filter_blocklist_bcftools/filtered_sv_vcf bcftools_merged_annotated_tsv: type: File - outputSource: bcftools_annotate_variants/sv_variants_tsv + outputSource: bcftools_annotate_variants/annotated_tsv + bcftools_merged_unannotated_tsv: + type: File + outputSource: bcftools_annotate_variants/unannotated_tsv bcftools_merged_filtered_annotated_tsv: type: File outputSource: bcftools_annotsv_filter/filtered_tsv @@ -47,7 +60,13 @@ outputs: outputSource: filter_blocklist_survivor/filtered_sv_vcf survivor_merged_annotated_tsv: type: File - outputSource: survivor_annotate_variants/sv_variants_tsv + outputSource: survivor_annotate_variants/annotated_tsv + survivor_merged_unannotated_tsv: + type: File + outputSource: survivor_annotate_variants/unannotated_tsv + survivor_merged_filtered_annotated_tsv: + type: File + outputSource: survivor_annotsv_filter/filtered_tsv steps: survivor_merge_sv_vcfs: run: ../tools/survivor.cwl @@ -60,7 +79,7 @@ steps: estimate_sv_distance: estimate_sv_distance minimum_sv_size: minimum_sv_size cohort_name: - default: "SURVIVOR-sv-merged.vcf" + default: "survivor-sv-merged.vcf" out: [merged_vcf] filter_blocklist_survivor: @@ -69,7 +88,7 @@ steps: input_vcf: survivor_merge_sv_vcfs/merged_vcf blocklist_bedpe: blocklist_bedpe output_vcf_basename: - default: "SURVIVOR-sv-merged" + default: "survivor-sv-merged" out: [filtered_sv_vcf] survivor_annotate_variants: @@ -77,13 +96,26 @@ steps: in: genome_build: genome_build input_vcf: filter_blocklist_survivor/filtered_sv_vcf - output_tsv_name: - default: "SURVIVOR-merged-AnnotSV.tsv" + output_base: + default: "survivor-merged-AnnotSV" snps_vcf: source: [snps_vcf] valueFrom: ${ return [ self ]; } + annotations: annotsv_annotations out: - [sv_variants_tsv] + [annotated_tsv, unannotated_tsv] + survivor_annotsv_filter: + run: 
../tools/annotsv_filter.cwl + in: + annotsv_tsv: survivor_annotate_variants/annotated_tsv + filtering_frequency: filter_pop_af + no_CDS: filter_no_CDS + survivor_merged: + default: true + output_tsv_name: + default: "survivor-merged-AnnotSV-filtered.tsv" + out: + [filtered_tsv] bcftools_merge_sv_vcfs: run: ../tools/bcftools_merge.cwl in: @@ -95,11 +127,11 @@ steps: default: "bcftools-sv-merged.vcf" vcfs: sv_vcfs out: - [merged_sv_vcf] + [merged_vcf] filter_blocklist_bcftools: run: ../tools/filter_sv_vcf_blocklist_bedpe.cwl in: - input_vcf: bcftools_merge_sv_vcfs/merged_sv_vcf + input_vcf: bcftools_merge_sv_vcfs/merged_vcf blocklist_bedpe: blocklist_bedpe output_vcf_basename: default: "bcftools-sv-merged" @@ -110,18 +142,24 @@ steps: in: genome_build: genome_build input_vcf: filter_blocklist_bcftools/filtered_sv_vcf - output_tsv_name: - default: "bcftools-merged-AnnotSV.tsv" + output_base: + default: "bcftools-merged-AnnotSV" snps_vcf: source: [snps_vcf] valueFrom: ${ return [ self ]; } + annotations: annotsv_annotations out: - [sv_variants_tsv] + [annotated_tsv, unannotated_tsv] bcftools_annotsv_filter: run: ../tools/annotsv_filter.cwl in: - annotsv_tsv: bcftools_annotate_variants/sv_variants_tsv - filtering_frequency: - default: "0.05" + annotsv_tsv: bcftools_annotate_variants/annotated_tsv + filtering_frequency: filter_pop_af + no_CDS: filter_no_CDS + survivor_merged: + default: false + output_tsv_name: + default: "bcftools-merged-AnnotSV-filtered.tsv" + out: [filtered_tsv] diff --git a/definitions/subworkflows/single_sample_sv_callers.cwl b/definitions/subworkflows/single_sample_sv_callers.cwl index 01449b20..2275c156 100644 --- a/definitions/subworkflows/single_sample_sv_callers.cwl +++ b/definitions/subworkflows/single_sample_sv_callers.cwl @@ -77,6 +77,11 @@ inputs: type: int? blocklist_bedpe: type: File? + annotsv_annotations: + type: + - string + - Directory + doc: "directory/path of the annotsv annotations directory" outputs: cn_diagram: type: File? 
@@ -300,5 +305,6 @@ steps: sv_vcfs: source: [run_cnvkit_filter/filtered_vcf, run_cnvnator_filter/filtered_vcf, run_manta_filter/filtered_vcf, run_smoove_filter/filtered_vcf] linkMerge: merge_flattened + annotsv_annotations: annotsv_annotations out: [bcftools_merged_sv_vcf, bcftools_merged_annotated_tsv, bcftools_merged_filtered_annotated_tsv, survivor_merged_sv_vcf, survivor_merged_annotated_tsv] diff --git a/definitions/subworkflows/sv_joint_depth_caller_filter.cwl b/definitions/subworkflows/sv_joint_depth_caller_filter.cwl new file mode 100644 index 00000000..4cd676ea --- /dev/null +++ b/definitions/subworkflows/sv_joint_depth_caller_filter.cwl @@ -0,0 +1,124 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.0 +class: Workflow +label: "Filter multiple sv vcfs from depth callers(cnvkit/cnvnator), returns single sample vcfs with the sample name as $SAMPLE-$CALLER" +requirements: + - class: SubworkflowFeatureRequirement + - class: StepInputExpressionRequirement + - class: InlineJavascriptRequirement + - class: ScatterFeatureRequirement +inputs: + bams: + type: File[] + secondaryFiles: [^.bai] + sample_names: + type: string[] + filter_del_depth: + type: double? + filter_dup_depth: + type: double? + min_sv_size: + type: int? + reference: + type: + - string + - File + secondaryFiles: [.fai, ^.dict] + sv_vcfs: + type: File[] + vcf_source: + type: + - type: enum + symbols: ["cnvkit", "cnvnator"] + merge_distance: + type: int? 
+outputs: + vcfs: + type: File[] + outputSource: bgzip_and_index/indexed_vcf + secondaryFiles: [.tbi] +steps: + merge_calls: + scatter: [input_vcf] + run: ../tools/custom_merge_sv_records.cwl + in: + input_vcf: sv_vcfs + distance: merge_distance + out: + [vcf] + size_filter: + scatter: [input_vcf] + run: ../tools/filter_sv_vcf_size.cwl + in: + input_vcf: merge_calls/vcf + size_method: + default: "min_len" + sv_size: min_sv_size + out: + [filtered_sv_vcf] + duphold: + scatter: [bam, sv_vcf] + scatterMethod: dotproduct + run: ../tools/duphold.cwl + in: + bam: bams + reference: reference + sv_vcf: size_filter/filtered_sv_vcf + out: + [annotated_sv_vcf] + depth_filter: + scatter: [input_vcf, output_vcf_name] + scatterMethod: dotproduct + run: ../tools/filter_sv_vcf_depth.cwl + in: + input_vcf: duphold/annotated_sv_vcf + deletion_depth: filter_del_depth + duplication_depth: filter_dup_depth + output_vcf_name: + source: [sample_names] + valueFrom: | + ${ + var sample = self; + var caller = inputs.vcf_source; + var vcf_name = sample + "-" + caller + ".vcf"; + return vcf_name; + } + vcf_source: + default: "duphold" + out: + [filtered_sv_vcf] + rename: + scatter: [input_vcf, new_sample_name, sample_to_replace, output_name] + scatterMethod: dotproduct + run: ../tools/replace_vcf_sample_name.cwl + in: + input_vcf: depth_filter/filtered_sv_vcf + sample_to_replace: sample_names + vcf_source: vcf_source + new_sample_name: + source: [sample_names] + valueFrom: | + ${ + var sample = self; + var caller = inputs.vcf_source; + var result = sample + "-" + caller; + return result; + } + output_name: + source: [sample_names] + valueFrom: | + ${ + var sample = self; + var caller = inputs.vcf_source; + var result = sample + "-" + caller + ".vcf.gz"; + return result; + } + out: + [renamed_vcf] + bgzip_and_index: + scatter: [vcf] + run: bgzip_and_index.cwl + in: + vcf: rename/renamed_vcf + out: [indexed_vcf] diff --git a/definitions/subworkflows/sv_joint_read_caller_filter.cwl 
b/definitions/subworkflows/sv_joint_read_caller_filter.cwl new file mode 100644 index 00000000..21b71515 --- /dev/null +++ b/definitions/subworkflows/sv_joint_read_caller_filter.cwl @@ -0,0 +1,157 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.0 +class: Workflow +label: "filter jointly called vcfs from read based callers" +requirements: + - class: SubworkflowFeatureRequirement + - class: StepInputExpressionRequirement + - class: InlineJavascriptRequirement + - class: ScatterFeatureRequirement +inputs: + reference: + type: + - string + - File + secondaryFiles: [.fai, ^.dict] + sample_names: + type: string[] + bams: + type: File[] + secondaryFiles: [^.bai] + filter_del_depth: + type: double? + filter_dup_depth: + type: double? + filter_paired_count: + type: int? + filter_split_count: + type: int? + filter_alt_abundance_percentage: + type: double? + sv_vcf: + type: File + secondaryFiles: [.tbi] + vcf_source: + type: + - type: enum + symbols: ["manta", "smoove"] +outputs: + vcfs: + type: File[] + outputSource: final_index/indexed_vcf + secondaryFiles: [.tbi] +steps: + read_support_filter: + run: ../tools/filter_sv_vcf_read_support.cwl + in: + abundance_percentage: filter_alt_abundance_percentage + input_vcf: sv_vcf + paired_count: filter_paired_count + split_count: filter_split_count + vcf_source: vcf_source + out: + [filtered_sv_vcf] + bgzip_index: + run: bgzip_and_index.cwl + in: + vcf: read_support_filter/filtered_sv_vcf + out: + [indexed_vcf] + split_vcf: + scatter: [sample_name] + run: ../tools/bcftools_view.cwl + in: + sample_name: sample_names + in_vcf: bgzip_index/indexed_vcf + out: + [vcf] + duphold: + scatter: [bam, sv_vcf] + scatterMethod: dotproduct + run: ../tools/duphold.cwl + in: + bam: bams + reference: reference + sv_vcf: split_vcf/vcf + out: + [annotated_sv_vcf] + bgzip_index_duphold: + scatter: [vcf] + scatterMethod: dotproduct + run: bgzip_and_index.cwl + in: + vcf: duphold/annotated_sv_vcf + out: + [indexed_vcf] + merge_vcfs: + run: 
../tools/bcftools_merge.cwl + in: + vcfs: bgzip_index_duphold/indexed_vcf + out: + [merged_vcf] + depth_filter: + run: ../tools/filter_sv_vcf_depth.cwl + in: + input_vcf: merge_vcfs/merged_vcf + deletion_depth: filter_del_depth + duplication_depth: filter_dup_depth + vcf_source: + default: "duphold" + out: + [filtered_sv_vcf] + final_split_vcf: + scatter: [sample_name, output_vcf_name] + scatterMethod: dotproduct + run: ../tools/bcftools_view.cwl + in: + sample_name: sample_names + in_vcf: depth_filter/filtered_sv_vcf + vcf_source: vcf_source + output_vcf_name: + source: [sample_names] + valueFrom: | + ${ + var sample = self; + var caller = inputs.vcf_source; + var result = sample + "-" + caller + ".vcf.gz"; + return result; + } + out: + [vcf] + rename: + scatter: [input_vcf, sample_to_replace, new_sample_name, output_name] + scatterMethod: dotproduct + run: ../tools/replace_vcf_sample_name.cwl + in: + input_vcf: final_split_vcf/vcf + sample_to_replace: sample_names + vcf_source: vcf_source + new_sample_name: + source: [sample_names] + valueFrom: | + ${ + var sample = self; + var caller = inputs.vcf_source; + var result = sample + "-" + caller; + return result; + } + output_name: + source: [sample_names] + valueFrom: | + ${ + var sample = self; + var caller = inputs.vcf_source; + var result = sample + "-" + caller + ".vcf.gz"; + return result; + } + out: + [renamed_vcf] + final_index: + scatter: [vcf] + scatterMethod: dotproduct + run: ../tools/index_vcf.cwl + in: + vcf: rename/renamed_vcf + out: + [indexed_vcf] diff --git a/definitions/tools/annotsv.cwl b/definitions/tools/annotsv.cwl index 35c6e153..63f5f4b0 100644 --- a/definitions/tools/annotsv.cwl +++ b/definitions/tools/annotsv.cwl @@ -3,12 +3,12 @@ cwlVersion: v1.0 class: CommandLineTool -arguments: ["/opt/AnnotSV_2.1/bin/AnnotSV", "-bedtools", "/usr/bin/bedtools", "-outputDir", "$(runtime.outdir)"] +arguments: ["/opt/AnnotSV_2.3/bin/AnnotSV", "-bedtools", "/usr/bin/bedtools", "-outputDir", 
"$(runtime.outdir)", "-outputFile", "$(inputs.output_base).tsv"] requirements: - class: ResourceRequirement ramMin: 8000 - class: DockerRequirement - dockerPull: "mgibio/annotsv-cwl:2.1" + dockerPull: "mgibio/annotsv-cwl:2.3" inputs: genome_build: @@ -16,29 +16,39 @@ inputs: inputBinding: position: 2 prefix: "-genomeBuild" + doc: "genome build used, GRCh37(tool default), GRCh38, mm9, or mm10" input_vcf: type: File inputBinding: position: 3 prefix: "-SVinputFile" doc: "vcf file to filter" - output_tsv_name: + output_base: type: string? - default: "AnnotSV.tsv" + default: "AnnotSV" inputBinding: - position: 4 - prefix: "-outputFile" - doc: "output file name" + doc: "base for output file name" snps_vcf: type: File[]? inputBinding: position: 5 - prefix: "-vcfFiles" + prefix: "-snvIndelFiles" itemSeparator: "," doc: "snps vcf(s) for adding hom/het snp counts found within svs" - + annotations: + type: + - string + - Directory + inputBinding: + position: 6 + prefix: "-annotationsDir" + doc: "directory/path of the annotsv annotations directory" outputs: - sv_variants_tsv: + annotated_tsv: + type: File + outputBinding: + glob: "$(inputs.output_base).tsv" + unannotated_tsv: type: File outputBinding: - glob: $(inputs.output_tsv_name) + glob: "$(inputs.output_base).unannotated.tsv" diff --git a/definitions/tools/annotsv_filter.cwl b/definitions/tools/annotsv_filter.cwl index 85a16272..e69c4f30 100644 --- a/definitions/tools/annotsv_filter.cwl +++ b/definitions/tools/annotsv_filter.cwl @@ -23,15 +23,17 @@ requirements: parser.add_argument('--input', '-i', dest="input", help='input AnnotSV tsv file', required=True, action="store") parser.add_argument('--output', '-o', dest="output", help='output tsv file name', required=True, action="store") parser.add_argument('--filtering_frequency', dest="filtering_frequency", help="frequency to filter with", action="store", type=float, default="0.05") - parser.add_argument('--all-CDS', dest="CDS", help="Do not require a positive CoDing 
Sequence overlap", action="store_true") + parser.add_argument('--no-CDS', dest="CDS", help="Do not require a positive CoDing Sequence overlap", action="store_true") parser.add_argument('--ignore-pass-filter', dest="filter", help="Do not require calls to have a PASS filter", action="store_true") + parser.add_argument('--survivor-merged', dest="survivor", help="survivor merge filtering, drop the last filter step", action="store_true") args = parser.parse_args() input_file_name = args.input output_file_name = args.output filtering_frequency = args.filtering_frequency - all_cds = args.CDS + no_cds = args.CDS ignore_pass_filter = args.filter + survivor_merged = args.survivor with open(input_file_name, 'r') as file_in, open(output_file_name, 'w') as file_out: file_in = csv.DictReader(file_in, delimiter='\t') @@ -43,23 +45,23 @@ requirements: total_sv_count += 1 if(row['AnnotSV type'] == 'split' \ and (row['FILTER'] == 'PASS' or ignore_pass_filter) \ - and (int(row['CDS length']) > 0 or all_cds) \ + and (int(row['CDS length']) > 0 or no_cds) \ and float(row['IMH_AF']) < filtering_frequency and float(row['1000g_max_AF']) < filtering_frequency and not(float(row['DGV_LOSS_Frequency']) > filtering_frequency and 'DEL' in row['SV type']) - and not(float(row['DGV_GAIN_Frequency']) < filtering_frequency and ('DUP' in row['SV type'] or 'INS' in row['SV type'])) - and not(('Manta' in row['ID'] and 'IMPRECISE' in row['INFO']) or (row['QUAL'] != '.' and 'IMPRECISE' in row['INFO'])) ): + and not(float(row['DGV_GAIN_Frequency']) > filtering_frequency and ('DUP' in row['SV type'] or 'INS' in row['SV type'])) + and (survivor_merged or not(('Manta' in row['ID'] and 'IMPRECISE' in row['INFO']) or (row['QUAL'] != '.' and 'IMPRECISE' in row['INFO'])))): file_out.writerow(row) pass_sv_count += 1 print("total sv count:",total_sv_count) print("total sv passed count:",pass_sv_count) inputs: - all_CDS: + no_CDS: type: boolean? 
inputBinding: position: 1 - prefix: "--all-CDS" + prefix: "--no-CDS" annotsv_tsv: type: File inputBinding: @@ -75,11 +77,17 @@ inputs: inputBinding: position: 4 prefix: "--ignore-pass-filter" + survivor_merged: + type: boolean + default: false + inputBinding: + position: 5 + prefix: "--survivor-merged" output_tsv_name: type: string? default: "filtered-bcftools-merged-AnnotSV.tsv" inputBinding: - position: 5 + position: 6 prefix: "--output" outputs: diff --git a/definitions/tools/bcftools_merge.cwl b/definitions/tools/bcftools_merge.cwl index 4c45df4b..17909417 100644 --- a/definitions/tools/bcftools_merge.cwl +++ b/definitions/tools/bcftools_merge.cwl @@ -43,7 +43,7 @@ inputs: inputBinding: position: 4 prefix: "--output-type" - doc: "output file format" + doc: "output file format, b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF" output_vcf_name: type: string? default: "bcftools_merged.vcf.gz" @@ -58,7 +58,7 @@ inputs: doc: "input bgzipped tabix indexed vcfs to merge" outputs: - merged_sv_vcf: + merged_vcf: type: File outputBinding: glob: $(inputs.output_vcf_name) diff --git a/definitions/tools/bcftools_view.cwl b/definitions/tools/bcftools_view.cwl new file mode 100644 index 00000000..fa2e18e6 --- /dev/null +++ b/definitions/tools/bcftools_view.cwl @@ -0,0 +1,53 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.0 +class: CommandLineTool + +baseCommand: ["/opt/bcftools/bin/bcftools", "view"] + +requirements: + - class: ResourceRequirement + ramMin: 4000 + - class: DockerRequirement + dockerPull: "mgibio/bcftools-cwl:1.12" + +inputs: + sample_name: + type: string? 
+ inputBinding: + position: 1 + prefix: "--samples" + doc: "comma-separated list of samples to include (or exclude with '^' prefix)" + output_type: + type: + type: enum + symbols: ["b", "u", "z", "v"] + default: "z" + inputBinding: + position: 4 + prefix: "--output-type" + doc: "output file format, b: compressed BCF, u: uncompressed BCF, z: compressed VCF, v: uncompressed VCF" + output_vcf_name: + type: string? + default: "bcftools_split.vcf.gz" + inputBinding: + position: 5 + prefix: "--output-file" + doc: "output vcf file name" + variant_type: + type: string? + inputBinding: + position: 6 + prefix: "--types" + doc: "select comma-separated list of variant types: snps,indels,mnps,ref,bnd,other" + in_vcf: + type: File + inputBinding: + position: 7 + doc: "input bgzipped tabix indexed vcf to view" + +outputs: + vcf: + type: File + outputBinding: + glob: $(inputs.output_vcf_name) diff --git a/definitions/tools/cnvnator.cwl b/definitions/tools/cnvnator.cwl index 0d2f6062..546ba4f2 100644 --- a/definitions/tools/cnvnator.cwl +++ b/definitions/tools/cnvnator.cwl @@ -45,10 +45,10 @@ requirements: # read depth signal partitioning cnvnator -root "$SAMPLE.root" -partition "$BIN_SIZE" -chrom $CHROMOSOMES # cnv calling - cnvnator -root "$SAMPLE.root" -call "$BIN_SIZE" -chrom $CHROMOSOMES > "$SAMPLE.CNVnator.cn" + cnvnator -root "$SAMPLE.root" -call "$BIN_SIZE" -chrom $CHROMOSOMES > "$SAMPLE.cnvnator.cn" # convert to vcf - cnvnator2VCF.pl -reference "$REFERENCE" "$SAMPLE.CNVnator.cn" FASTA_CHRS/ > "$SAMPLE.CNVnator.vcf" + cnvnator2VCF.pl -reference "$REFERENCE" "$SAMPLE.cnvnator.cn" FASTA_CHRS/ > "$SAMPLE.cnvnator.vcf" exit 0 inputs: bam: @@ -87,7 +87,7 @@ outputs: vcf: type: File outputBinding: - glob: "$(inputs.sample_name).CNVnator.vcf" + glob: "$(inputs.sample_name).cnvnator.vcf" root_file: type: File outputBinding: @@ -95,4 +95,4 @@ outputs: cn_file: type: File outputBinding: - glob: "$(inputs.sample_name).CNVnator.cn" + glob: "$(inputs.sample_name).cnvnator.cn" diff --git 
a/definitions/tools/custom_merge_sv_records.cwl b/definitions/tools/custom_merge_sv_records.cwl new file mode 100644 index 00000000..1f134761 --- /dev/null +++ b/definitions/tools/custom_merge_sv_records.cwl @@ -0,0 +1,92 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.0 +class: CommandLineTool +label: "merges nearby DEL/DUP records within a certain window distance" + +baseCommand: ["python3", "merge.py"] +requirements: + - class: ResourceRequirement + ramMin: 4000 + - class: DockerRequirement + dockerPull: "griffithlab/vatools:4.1.0" + - class: InitialWorkDirRequirement + listing: + - entryname: "merge.py" + entry: | + import argparse + import vcfpy + from collections import OrderedDict + + parser = argparse.ArgumentParser() + parser.add_argument('--input', '-i', dest="input", help='input vcf file', required=True, action="store") + parser.add_argument('--output', '-o', dest="output", help='output vcf file', required=False, default="out.vcf", action="store") + parser.add_argument('--window', '-w', dest="window", help='max merge window size', required=False, default=1000, type=int, action="store") + + args = parser.parse_args() + in_vcf_name = args.input + out_vcf_name = args.output + window_size = args.window + + reader = vcfpy.Reader.from_path(in_vcf_name) + new_header = reader.header + new_header.add_filter_line(vcfpy.OrderedDict([('ID', 'MERGED_CALL'), ('Description', 'Record merged from 2 or more individual records')])) + + writer = vcfpy.Writer.from_path(out_vcf_name, new_header) + new_record_count = 0 + merge_records = [] + for record in reader: + if((len(merge_records) == 0) or (merge_records[-1].CHROM != record.CHROM) or (merge_records[-1].INFO['SVTYPE'] != record.INFO['SVTYPE']) or (abs(merge_records[-1].INFO['END'] - record.POS) > window_size)): + if(len(merge_records) > 1): + new_record_count = new_record_count + 1 + new_record_chr = merge_records[0].CHROM + new_record_start = merge_records[0].POS + new_record_end = merge_records[-1].INFO['END'] + 
new_record_type = merge_records[0].INFO['SVTYPE'] + new_record_svlen = new_record_end - new_record_start + + info = OrderedDict({"SVTYPE": new_record_type, "END": new_record_end, "SVLEN": new_record_svlen}) + alt = vcfpy.SymbolicAllele(new_record_type) + sample_calls = [] + for sample in merge_records[0].calls: + gt = OrderedDict({"GT": "/".join(map(str, sample.gt_alleles)).replace("None",".")}) + name = sample.sample + sample_calls.append(vcfpy.Call(name, gt)) + + new_record = vcfpy.Record(new_record_chr, new_record_start, [], "N", [alt], ".", ["MERGED_CALL"], info, ["GT"], sample_calls) + writer.write_record(new_record) + merge_records = [record] + else: + merge_records = [record] + writer.write_record(record) + next + + dist = abs(merge_records[-1].INFO['END'] - record.POS) + if(dist < window_size): + merge_records.append(record) + print(f"Found {new_record_count} records that can be merged based on the input {window_size} distance") + +inputs: + input_vcf: + type: File + inputBinding: + prefix: "-i" + position: 1 + output_vcf_name: + type: string? + default: "record_merged.vcf" + inputBinding: + prefix: "-o" + position: 2 + distance: + type: int? 
+ default: 1000 + inputBinding: + prefix: "-w" + position: 3 + +outputs: + vcf: + type: File + outputBinding: + glob: "$(inputs.output_vcf_name)" diff --git a/definitions/tools/gather_to_sub_directory.cwl b/definitions/tools/gather_to_sub_directory.cwl index cffb6a9f..d4ad5fad 100644 --- a/definitions/tools/gather_to_sub_directory.cwl +++ b/definitions/tools/gather_to_sub_directory.cwl @@ -6,7 +6,7 @@ baseCommand: ["/bin/bash","directory_gatherer.sh"] requirements: - class: DockerRequirement - dockerPull: "ubuntu:xenial" + dockerPull: "ubuntu:focal" - class: ResourceRequirement ramMin: 1000 - class: InitialWorkDirRequirement @@ -16,10 +16,10 @@ requirements: set -eou pipefail outdir="$1" - files="${@:2}" - mkdir $outdir - chmod -R 777 $outdir - cp -t $outdir $files + files=("${@:2}") + mkdir "$outdir" + chmod -R 777 "$outdir" + cp --recursive --preserve --no-clobber --target-directory "$outdir" "${files[@]}" exit 0 @@ -32,6 +32,23 @@ inputs: type: File[] inputBinding: position: 2 + valueFrom: | + ${ + var results = [] + for(var i=0; i sample_update.txt - /opt/bcftools/bin/bcftools reheader -s sample_update.txt -o "$basen" "$3" + /opt/bcftools/bin/bcftools reheader -s sample_update.txt -o "$4" "$3" inputs: input_vcf: @@ -43,9 +41,14 @@ inputs: inputBinding: position: 2 doc: "Sample name to replace the other" - + output_name: + type: string? + inputBinding: + position: 4 + default: "renamed.$(inputs.input_vcf.basename)" + doc: "output filename for vcf" outputs: renamed_vcf: type: File outputBinding: - glob: $("renamed." 
+ inputs.input_vcf.basename) + glob: "$(inputs.output_name)" diff --git a/definitions/tools/variant_filtration.cwl b/definitions/tools/variant_filtration.cwl new file mode 100644 index 00000000..1901bf13 --- /dev/null +++ b/definitions/tools/variant_filtration.cwl @@ -0,0 +1,55 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.0 +class: CommandLineTool +label: "VariantFiltration (GATK 4.1.8.1)" +baseCommand: ["/gatk/gatk", "--java-options", "-Xmx4g", "VariantFiltration"] +requirements: + - class: ResourceRequirement + ramMin: 6000 + tmpdirMin: 25000 + - class: DockerRequirement + dockerPull: "broadinstitute/gatk:4.1.8.1" +arguments: + ["-O", { valueFrom: $(runtime.outdir)/$(inputs.output_vcf_basename).vcf.gz }] +inputs: + reference: + type: + - string + - File + secondaryFiles: [.fai, ^.dict] + inputBinding: + prefix: "-R" + position: 1 + vcf: + type: File + inputBinding: + prefix: "--variant" + position: 2 + secondaryFiles: [.tbi] + filters: + type: string[] + inputBinding: + position: 3 + valueFrom: | + ${ + var results = [] + for(var i=0; i3.0;SOR3'" + output_vcf_basename: + type: string? 
+ default: select_variants +outputs: + filtered_vcf: + type: File + secondaryFiles: [.tbi] + outputBinding: + glob: $(inputs.output_vcf_basename).vcf.gz diff --git a/definitions/tools/vt_normalize.cwl b/definitions/tools/vt_normalize.cwl new file mode 100644 index 00000000..0e19086b --- /dev/null +++ b/definitions/tools/vt_normalize.cwl @@ -0,0 +1,32 @@ +#!/usr/bin/env cwl-runner + +cwlVersion: v1.0 +class: CommandLineTool +label: "run vt normalize" +baseCommand: ["vt", "normalize"] +requirements: + - class: DockerRequirement + dockerPull: quay.io/biocontainers/vt:0.57721--hf74b74d_1 + - class: ResourceRequirement + ramMin: 4000 +arguments: + ["-o", { valueFrom: $(runtime.outdir)/normalized.vcf.gz }] +inputs: + vcf: + type: File + inputBinding: + position: 1 + secondaryFiles: [".tbi"] + reference: + type: + - string + - File + secondaryFiles: [".fai"] + inputBinding: + prefix: "-r" + position: 2 +outputs: + normalized_vcf: + type: File + outputBinding: + glob: "normalized.vcf.gz"