From 84e9679931e03017f8dcb7e83c50b1faac867198 Mon Sep 17 00:00:00 2001 From: Marcus Kinsella Date: Wed, 15 May 2019 11:26:50 -0400 Subject: [PATCH] Fix AlignmentSummaryMetrics for unpaired reads (#65) For unpaired reads, picard only writes one row of results, and crimson then returns a dict instead of a list of dicts. This change handles that case and includes a test of outputs from running picard on unpaired SS2 data. --- src/sctools/groups.py | 5 +- ...RR6258488_qc.alignment_summary_metrics.txt | 10 ++ .../SRR6258488_qc.duplicate_metrics.txt | 10 ++ .../SRR6258488_qc.gc_bias.summary_metrics.txt | 10 ++ .../SRR6258488_qc.rna_metrics.txt | 113 ++++++++++++++++++ src/sctools/test/test_groups.py | 62 ++++++++++ 6 files changed, 209 insertions(+), 1 deletion(-) create mode 100644 src/sctools/test/data/group_metrics_unpaired_ss2/SRR6258488_qc.alignment_summary_metrics.txt create mode 100644 src/sctools/test/data/group_metrics_unpaired_ss2/SRR6258488_qc.duplicate_metrics.txt create mode 100644 src/sctools/test/data/group_metrics_unpaired_ss2/SRR6258488_qc.gc_bias.summary_metrics.txt create mode 100644 src/sctools/test/data/group_metrics_unpaired_ss2/SRR6258488_qc.rna_metrics.txt diff --git a/src/sctools/groups.py b/src/sctools/groups.py index f4f757a..d019945 100644 --- a/src/sctools/groups.py +++ b/src/sctools/groups.py @@ -34,7 +34,10 @@ def write_aggregated_picard_metrics_by_row(file_names, output_name): # but only output PAIRED-READS/third line contents = parsed['metrics']['contents'] if class_name == "AlignmentSummaryMetrics": - # parse out PE, R1 and R2 + # parse out PE, R1 and R2. If the reads are unpaired, the contents + # will be a single dict rather than a list of dicts. + if isinstance(contents, dict): + contents = [contents] rows = {} for m in contents: cat = m['CATEGORY'] diff --git a/src/sctools/test/data/group_metrics_unpaired_ss2/SRR6258488_qc.alignment_summary_metrics.txt b/src/sctools/test/data/group_metrics_unpaired_ss2/SRR6258488_qc.alignment_summary_metrics.txt new file mode 100644 index 0000000..1559f3e --- /dev/null +++ b/src/sctools/test/data/group_metrics_unpaired_ss2/SRR6258488_qc.alignment_summary_metrics.txt @@ -0,0 +1,10 @@ +## htsjdk.samtools.metrics.StringHeader +# CollectMultipleMetrics INPUT=/cromwell-executions/SmartSeq2SingleCell/a47f1348-ecf5-463e-afee-c3bed51d479d/call-CollectMultipleMetrics/inputs/-1585165421/SRR6258488_qc.bam ASSUME_SORTED=true OUTPUT=SRR6258488_qc METRIC_ACCUMULATION_LEVEL=[ALL_READS] FILE_EXTENSION=.txt PROGRAM=[CollectAlignmentSummaryMetrics, CollectInsertSizeMetrics, CollectGcBiasMetrics, CollectBaseDistributionByCycle, QualityScoreDistribution, MeanQualityByCycle, CollectSequencingArtifactMetrics, CollectQualityYieldMetrics] VALIDATION_STRINGENCY=SILENT REFERENCE_SEQUENCE=/cromwell-executions/SmartSeq2SingleCell/a47f1348-ecf5-463e-afee-c3bed51d479d/call-CollectMultipleMetrics/inputs/-852851197/GRCh38.primary_assembly.genome.fa STOP_AFTER=0 INCLUDE_UNPAIRED=false VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false +## htsjdk.samtools.metrics.StringHeader +# Started on: Tue May 14 15:45:18 UTC 2019 + +## METRICS CLASS picard.analysis.AlignmentSummaryMetrics +CATEGORY TOTAL_READS PF_READS PCT_PF_READS PF_NOISE_READS PF_READS_ALIGNED PCT_PF_READS_ALIGNED PF_ALIGNED_BASES PF_HQ_ALIGNED_READS PF_HQ_ALIGNED_BASES PF_HQ_ALIGNED_Q20_BASES PF_HQ_MEDIAN_MISMATCHES PF_MISMATCH_RATE PF_HQ_ERROR_RATE PF_INDEL_RATE MEAN_READ_LENGTH READS_ALIGNED_IN_PAIRS PCT_READS_ALIGNED_IN_PAIRS PF_READS_IMPROPER_PAIRS PCT_PF_READS_IMPROPER_PAIRS BAD_CYCLES STRAND_BALANCE PCT_CHIMERAS PCT_ADAPTER SAMPLE LIBRARY READ_GROUP +UNPAIRED 1086652 1086652 1 0 770963 0.709485 38213614 697232 34613985 34073804 0 0.002624 0.002357 0.000149 50 0 0 0 0 0 0.501303 0 0.000027 + + diff --git a/src/sctools/test/data/group_metrics_unpaired_ss2/SRR6258488_qc.duplicate_metrics.txt b/src/sctools/test/data/group_metrics_unpaired_ss2/SRR6258488_qc.duplicate_metrics.txt new file mode 100644 index 0000000..661fa79 --- /dev/null +++ b/src/sctools/test/data/group_metrics_unpaired_ss2/SRR6258488_qc.duplicate_metrics.txt @@ -0,0 +1,10 @@ +## htsjdk.samtools.metrics.StringHeader +# MarkDuplicates INPUT=[/cromwell-executions/SmartSeq2SingleCell/a47f1348-ecf5-463e-afee-c3bed51d479d/call-CollectDuplicationMetrics/inputs/-1585165421/SRR6258488_qc.bam] OUTPUT=SRR6258488_qc.MarkDuplicated.bam METRICS_FILE=SRR6258488_qc.duplicate_metrics.txt REMOVE_DUPLICATES=false ASSUME_SORTED=true VALIDATION_STRINGENCY=SILENT MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP=50000 MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=8000 SORTING_COLLECTION_SIZE_RATIO=0.25 TAG_DUPLICATE_SET_MEMBERS=false REMOVE_SEQUENCING_DUPLICATES=false TAGGING_POLICY=DontTag DUPLICATE_SCORING_STRATEGY=SUM_OF_BASE_QUALITIES PROGRAM_RECORD_ID=MarkDuplicates PROGRAM_GROUP_NAME=MarkDuplicates READ_NAME_REGEX= OPTICAL_DUPLICATE_PIXEL_DISTANCE=100 MAX_OPTICAL_DUPLICATE_SET_SIZE=300000 VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false +## htsjdk.samtools.metrics.StringHeader +# Started on: Tue May 14 15:45:17 UTC 2019 + +## METRICS CLASS picard.sam.DuplicationMetrics +LIBRARY UNPAIRED_READS_EXAMINED READ_PAIRS_EXAMINED SECONDARY_OR_SUPPLEMENTARY_RDS UNMAPPED_READS UNPAIRED_READ_DUPLICATES READ_PAIR_DUPLICATES READ_PAIR_OPTICAL_DUPLICATES PERCENT_DUPLICATION ESTIMATED_LIBRARY_SIZE +SRR6258488 770963 0 473100 315689 345396 0 0 0.448006 + + diff --git a/src/sctools/test/data/group_metrics_unpaired_ss2/SRR6258488_qc.gc_bias.summary_metrics.txt b/src/sctools/test/data/group_metrics_unpaired_ss2/SRR6258488_qc.gc_bias.summary_metrics.txt new file mode 100644 index 0000000..26669a7 --- /dev/null +++ b/src/sctools/test/data/group_metrics_unpaired_ss2/SRR6258488_qc.gc_bias.summary_metrics.txt @@ -0,0 +1,10 @@ +## htsjdk.samtools.metrics.StringHeader +# CollectMultipleMetrics INPUT=/cromwell-executions/SmartSeq2SingleCell/a47f1348-ecf5-463e-afee-c3bed51d479d/call-CollectMultipleMetrics/inputs/-1585165421/SRR6258488_qc.bam ASSUME_SORTED=true OUTPUT=SRR6258488_qc METRIC_ACCUMULATION_LEVEL=[ALL_READS] FILE_EXTENSION=.txt PROGRAM=[CollectAlignmentSummaryMetrics, CollectInsertSizeMetrics, CollectGcBiasMetrics, CollectBaseDistributionByCycle, QualityScoreDistribution, MeanQualityByCycle, CollectSequencingArtifactMetrics, CollectQualityYieldMetrics] VALIDATION_STRINGENCY=SILENT REFERENCE_SEQUENCE=/cromwell-executions/SmartSeq2SingleCell/a47f1348-ecf5-463e-afee-c3bed51d479d/call-CollectMultipleMetrics/inputs/-852851197/GRCh38.primary_assembly.genome.fa STOP_AFTER=0 INCLUDE_UNPAIRED=false VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false +## htsjdk.samtools.metrics.StringHeader +# Started on: Tue May 14 15:45:18 UTC 2019 + +## METRICS CLASS picard.analysis.GcBiasSummaryMetrics +ACCUMULATION_LEVEL READS_USED WINDOW_SIZE TOTAL_CLUSTERS ALIGNED_READS AT_DROPOUT GC_DROPOUT GC_NC_0_19 GC_NC_20_39 GC_NC_40_59 GC_NC_60_79 GC_NC_80_100 SAMPLE LIBRARY READ_GROUP +All Reads ALL 100 1559752 1244063 13.760859 1.1878 0.219754 0.753171 1.281724 0.883386 0.021428 + + diff --git a/src/sctools/test/data/group_metrics_unpaired_ss2/SRR6258488_qc.rna_metrics.txt b/src/sctools/test/data/group_metrics_unpaired_ss2/SRR6258488_qc.rna_metrics.txt new file mode 100644 index 0000000..4383106 --- /dev/null +++ b/src/sctools/test/data/group_metrics_unpaired_ss2/SRR6258488_qc.rna_metrics.txt @@ -0,0 +1,113 @@ +## htsjdk.samtools.metrics.StringHeader +# CollectRnaSeqMetrics REF_FLAT=/cromwell-executions/SmartSeq2SingleCell/a47f1348-ecf5-463e-afee-c3bed51d479d/call-CollectRnaMetrics/inputs/-852851197/GRCh38_gencode.v27.refFlat.txt RIBOSOMAL_INTERVALS=/cromwell-executions/SmartSeq2SingleCell/a47f1348-ecf5-463e-afee-c3bed51d479d/call-CollectRnaMetrics/inputs/-852851197/gencode.v27.rRNA.interval_list STRAND_SPECIFICITY=NONE CHART_OUTPUT=SRR6258488_qc.rna.coverage.pdf METRIC_ACCUMULATION_LEVEL=[ALL_READS] INPUT=/cromwell-executions/SmartSeq2SingleCell/a47f1348-ecf5-463e-afee-c3bed51d479d/call-CollectRnaMetrics/inputs/-1585165421/SRR6258488_qc.bam OUTPUT=SRR6258488_qc.rna_metrics.txt VALIDATION_STRINGENCY=SILENT MINIMUM_LENGTH=500 RRNA_FRAGMENT_PERCENTAGE=0.8 ASSUME_SORTED=true STOP_AFTER=0 VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false +## htsjdk.samtools.metrics.StringHeader +# Started on: Tue May 14 15:45:18 UTC 2019 + +## METRICS CLASS picard.analysis.RnaSeqMetrics +PF_BASES PF_ALIGNED_BASES RIBOSOMAL_BASES CODING_BASES UTR_BASES INTRONIC_BASES INTERGENIC_BASES IGNORED_READS CORRECT_STRAND_READS INCORRECT_STRAND_READS NUM_R1_TRANSCRIPT_STRAND_READS NUM_R2_TRANSCRIPT_STRAND_READS NUM_UNEXPLAINED_READS PCT_R1_TRANSCRIPT_STRAND_READS PCT_R2_TRANSCRIPT_STRAND_READS PCT_RIBOSOMAL_BASES PCT_CODING_BASES PCT_UTR_BASES PCT_INTRONIC_BASES PCT_INTERGENIC_BASES PCT_MRNA_BASES PCT_USABLE_BASES PCT_CORRECT_STRAND_READS MEDIAN_CV_COVERAGE MEDIAN_5PRIME_BIAS MEDIAN_3PRIME_BIAS MEDIAN_5PRIME_TO_3PRIME_BIAS SAMPLE LIBRARY READ_GROUP +54332600 38213614 0 371628 1152265 18630585 18059136 0 0 0 12352 12891 538 0.489324 0.510676 0 0.009725 0.030153 0.487538 0.472584 0.039878 0.028047 0 2.183917 0 0 0 + +## HISTOGRAM java.lang.Integer +normalized_position All_Reads.normalized_coverage +0 1.252653 +1 1.146108 +2 1.065068 +3 1.122433 +4 1.234516 +5 1.247113 +6 1.2191 +7 1.08917 +8 1.101883 +9 1.130302 +10 1.082888 +11 1.146879 +12 1.173149 +13 1.084206 +14 1.035169 +15 1.169359 +16 1.278125 +17 1.298059 +18 1.418038 +19 1.468055 +20 1.306559 +21 1.210198 +22 0.953958 +23 0.806139 +24 0.815513 +25 0.887045 +26 0.763414 +27 0.737914 +28 0.702678 +29 0.689913 +30 0.633512 +31 0.665368 +32 0.682949 +33 0.848599 +34 0.941722 +35 1.082228 +36 1.113449 +37 1.049003 +38 0.97788 +39 0.989931 +40 0.92986 +41 0.874432 +42 0.87788 +43 0.868871 +44 0.92942 +45 1.015775 +46 1.070114 +47 1.023889 +48 1.023103 +49 0.988576 +50 0.931694 +51 0.794716 +52 0.765784 +53 0.721218 +54 0.723223 +55 0.711507 +56 0.704034 +57 0.694139 +58 0.741844 +59 0.831505 +60 0.806244 +61 0.869419 +62 0.987354 +63 0.954176 +64 0.925553 +65 0.951851 +66 0.906269 +67 0.85666 +68 0.985052 +69 0.947861 +70 0.98528 +71 0.873541 +72 0.87925 +73 0.956294 +74 1.137028 +75 1.206313 +76 1.148145 +77 1.159051 +78 1.207689 +79 1.170334 +80 1.199969 +81 1.391121 +82 1.243649 +83 1.235795 +84 1.227105 +85 1.278662 +86 1.298065 +87 1.201038 +88 1.2361 +89 1.098932 +90 1.042881 +91 1.037875 +92 0.95545 +93 0.969215 +94 1.059149 +95 0.857316 +96 0.792585 +97 0.817511 +98 0.880909 +99 0.786114 +100 0.548663 + diff --git a/src/sctools/test/test_groups.py b/src/sctools/test/test_groups.py index afb858a..104f3e6 100644 --- a/src/sctools/test/test_groups.py +++ b/src/sctools/test/test_groups.py @@ -1,10 +1,12 @@ import os import csv +import itertools from collections import OrderedDict from sctools import platform data_dir = os.path.split(__file__)[0] + '/data/group_metrics/' +unpaired_data_dir = os.path.split(__file__)[0] + '/data/group_metrics_unpaired_ss2/' def check_parsed_metrics_csv(file_name, cell_id, class_name, expected_metrics): @@ -274,3 +276,63 @@ def test_write_aggregated_qc_metrics(): # The output file should contain all of the column headers from the input files plus the "joined column" containing row headers assert len(output_headers) == len(expected_headers) + 1 os.remove('output_QCs.csv') + + +def test_unpaired_ss2_write_aggregated_picard_metrics_by_row(): + + sources = [ + unpaired_data_dir + 'SRR6258488_qc.alignment_summary_metrics.txt', + unpaired_data_dir + 'SRR6258488_qc.duplicate_metrics.txt', + unpaired_data_dir + 'SRR6258488_qc.gc_bias.summary_metrics.txt', + unpaired_data_dir + 'SRR6258488_qc.rna_metrics.txt', + ] + + args = ['-f', *sources, '-t', 'Picard', '-o', 'output_picard_group_unpaired'] + return_code = platform.GenericPlatform.group_qc_outputs(args) + assert return_code == 0 + + expected_metrics = {} + + for source in sources: + with open(source) as f: + for line in f: + if line.startswith("## METRICS CLASS"): + class_ = line.strip().split('\t')[1].split('.')[-1] + break + labels = f.readline().strip().split("\t") + values = f.readline().strip().split("\t") + + for label, value in itertools.zip_longest(labels, values, fillvalue=""): + if label in ("LIBRARY", "SAMPLE", "READ_GROUP", "CATEGORY"): + continue + if class_ == "AlignmentSummaryMetrics": + label += ".UNPAIRED" + try: + value = str(float(value)) + except ValueError: + pass + expected_metrics[(class_, label)] = value + expected_metrics[("Class", "")] = "SRR6258488" + + with open('output_picard_group_unpaired.csv') as f: + labels = f.readline().strip().split(',') + classes = f.readline().strip().split(',') + values = f.readline().strip().split(',') + assert len(labels) == len(expected_metrics) + + for class_, label in expected_metrics: + if class_ not in classes or label not in labels: + print("!", class_, label) + + for class_, label, value in zip(classes, labels, values): + assert (class_, label) in expected_metrics + try: + value = str(float(value)) + except ValueError: + value = value + try: + expected_value = str(float(expected_metrics[(class_, label)])) + except ValueError: + expected_value = expected_metrics[(class_, label)] + assert value == expected_value + os.remove('output_picard_group_unpaired.csv')