Skip to content

Commit

Permalink
Fix AlignmentSummaryMetrics for unpaired reads (#65)
Browse files Browse the repository at this point in the history
For unpaired reads, picard only writes one row of results, and crimson
then returns a dict instead of a list of dicts. This change handles that
case and includes a test of outputs from running picard on unpaired
SS2 data.
  • Loading branch information
mckinsel authored May 15, 2019
1 parent 6bf7fe5 commit 84e9679
Show file tree
Hide file tree
Showing 6 changed files with 209 additions and 1 deletion.
5 changes: 4 additions & 1 deletion src/sctools/groups.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,10 @@ def write_aggregated_picard_metrics_by_row(file_names, output_name):
# but only output PAIRED-READS/third line
contents = parsed['metrics']['contents']
if class_name == "AlignmentSummaryMetrics":
# parse out PE, R1 and R2
# parse out PE, R1 and R2. If the reads are unpaired, the contents
# will be a single dict rather than a list of dicts.
if isinstance(contents, dict):
contents = [contents]
rows = {}
for m in contents:
cat = m['CATEGORY']
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
## htsjdk.samtools.metrics.StringHeader
# CollectMultipleMetrics INPUT=/cromwell-executions/SmartSeq2SingleCell/a47f1348-ecf5-463e-afee-c3bed51d479d/call-CollectMultipleMetrics/inputs/-1585165421/SRR6258488_qc.bam ASSUME_SORTED=true OUTPUT=SRR6258488_qc METRIC_ACCUMULATION_LEVEL=[ALL_READS] FILE_EXTENSION=.txt PROGRAM=[CollectAlignmentSummaryMetrics, CollectInsertSizeMetrics, CollectGcBiasMetrics, CollectBaseDistributionByCycle, QualityScoreDistribution, MeanQualityByCycle, CollectSequencingArtifactMetrics, CollectQualityYieldMetrics] VALIDATION_STRINGENCY=SILENT REFERENCE_SEQUENCE=/cromwell-executions/SmartSeq2SingleCell/a47f1348-ecf5-463e-afee-c3bed51d479d/call-CollectMultipleMetrics/inputs/-852851197/GRCh38.primary_assembly.genome.fa STOP_AFTER=0 INCLUDE_UNPAIRED=false VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false
## htsjdk.samtools.metrics.StringHeader
# Started on: Tue May 14 15:45:18 UTC 2019

## METRICS CLASS picard.analysis.AlignmentSummaryMetrics
CATEGORY TOTAL_READS PF_READS PCT_PF_READS PF_NOISE_READS PF_READS_ALIGNED PCT_PF_READS_ALIGNED PF_ALIGNED_BASES PF_HQ_ALIGNED_READS PF_HQ_ALIGNED_BASES PF_HQ_ALIGNED_Q20_BASES PF_HQ_MEDIAN_MISMATCHES PF_MISMATCH_RATE PF_HQ_ERROR_RATE PF_INDEL_RATE MEAN_READ_LENGTH READS_ALIGNED_IN_PAIRS PCT_READS_ALIGNED_IN_PAIRS PF_READS_IMPROPER_PAIRS PCT_PF_READS_IMPROPER_PAIRS BAD_CYCLES STRAND_BALANCE PCT_CHIMERAS PCT_ADAPTER SAMPLE LIBRARY READ_GROUP
UNPAIRED 1086652 1086652 1 0 770963 0.709485 38213614 697232 34613985 34073804 0 0.002624 0.002357 0.000149 50 0 0 0 0 0 0.501303 0 0.000027


Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
## htsjdk.samtools.metrics.StringHeader
# MarkDuplicates INPUT=[/cromwell-executions/SmartSeq2SingleCell/a47f1348-ecf5-463e-afee-c3bed51d479d/call-CollectDuplicationMetrics/inputs/-1585165421/SRR6258488_qc.bam] OUTPUT=SRR6258488_qc.MarkDuplicated.bam METRICS_FILE=SRR6258488_qc.duplicate_metrics.txt REMOVE_DUPLICATES=false ASSUME_SORTED=true VALIDATION_STRINGENCY=SILENT MAX_SEQUENCES_FOR_DISK_READ_ENDS_MAP=50000 MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=8000 SORTING_COLLECTION_SIZE_RATIO=0.25 TAG_DUPLICATE_SET_MEMBERS=false REMOVE_SEQUENCING_DUPLICATES=false TAGGING_POLICY=DontTag DUPLICATE_SCORING_STRATEGY=SUM_OF_BASE_QUALITIES PROGRAM_RECORD_ID=MarkDuplicates PROGRAM_GROUP_NAME=MarkDuplicates READ_NAME_REGEX=<optimized capture of last three ':' separated fields as numeric values> OPTICAL_DUPLICATE_PIXEL_DISTANCE=100 MAX_OPTICAL_DUPLICATE_SET_SIZE=300000 VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false
## htsjdk.samtools.metrics.StringHeader
# Started on: Tue May 14 15:45:17 UTC 2019

## METRICS CLASS picard.sam.DuplicationMetrics
LIBRARY UNPAIRED_READS_EXAMINED READ_PAIRS_EXAMINED SECONDARY_OR_SUPPLEMENTARY_RDS UNMAPPED_READS UNPAIRED_READ_DUPLICATES READ_PAIR_DUPLICATES READ_PAIR_OPTICAL_DUPLICATES PERCENT_DUPLICATION ESTIMATED_LIBRARY_SIZE
SRR6258488 770963 0 473100 315689 345396 0 0 0.448006


Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
## htsjdk.samtools.metrics.StringHeader
# CollectMultipleMetrics INPUT=/cromwell-executions/SmartSeq2SingleCell/a47f1348-ecf5-463e-afee-c3bed51d479d/call-CollectMultipleMetrics/inputs/-1585165421/SRR6258488_qc.bam ASSUME_SORTED=true OUTPUT=SRR6258488_qc METRIC_ACCUMULATION_LEVEL=[ALL_READS] FILE_EXTENSION=.txt PROGRAM=[CollectAlignmentSummaryMetrics, CollectInsertSizeMetrics, CollectGcBiasMetrics, CollectBaseDistributionByCycle, QualityScoreDistribution, MeanQualityByCycle, CollectSequencingArtifactMetrics, CollectQualityYieldMetrics] VALIDATION_STRINGENCY=SILENT REFERENCE_SEQUENCE=/cromwell-executions/SmartSeq2SingleCell/a47f1348-ecf5-463e-afee-c3bed51d479d/call-CollectMultipleMetrics/inputs/-852851197/GRCh38.primary_assembly.genome.fa STOP_AFTER=0 INCLUDE_UNPAIRED=false VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false
## htsjdk.samtools.metrics.StringHeader
# Started on: Tue May 14 15:45:18 UTC 2019

## METRICS CLASS picard.analysis.GcBiasSummaryMetrics
ACCUMULATION_LEVEL READS_USED WINDOW_SIZE TOTAL_CLUSTERS ALIGNED_READS AT_DROPOUT GC_DROPOUT GC_NC_0_19 GC_NC_20_39 GC_NC_40_59 GC_NC_60_79 GC_NC_80_100 SAMPLE LIBRARY READ_GROUP
All Reads ALL 100 1559752 1244063 13.760859 1.1878 0.219754 0.753171 1.281724 0.883386 0.021428


Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
## htsjdk.samtools.metrics.StringHeader
# CollectRnaSeqMetrics REF_FLAT=/cromwell-executions/SmartSeq2SingleCell/a47f1348-ecf5-463e-afee-c3bed51d479d/call-CollectRnaMetrics/inputs/-852851197/GRCh38_gencode.v27.refFlat.txt RIBOSOMAL_INTERVALS=/cromwell-executions/SmartSeq2SingleCell/a47f1348-ecf5-463e-afee-c3bed51d479d/call-CollectRnaMetrics/inputs/-852851197/gencode.v27.rRNA.interval_list STRAND_SPECIFICITY=NONE CHART_OUTPUT=SRR6258488_qc.rna.coverage.pdf METRIC_ACCUMULATION_LEVEL=[ALL_READS] INPUT=/cromwell-executions/SmartSeq2SingleCell/a47f1348-ecf5-463e-afee-c3bed51d479d/call-CollectRnaMetrics/inputs/-1585165421/SRR6258488_qc.bam OUTPUT=SRR6258488_qc.rna_metrics.txt VALIDATION_STRINGENCY=SILENT MINIMUM_LENGTH=500 RRNA_FRAGMENT_PERCENTAGE=0.8 ASSUME_SORTED=true STOP_AFTER=0 VERBOSITY=INFO QUIET=false COMPRESSION_LEVEL=5 MAX_RECORDS_IN_RAM=500000 CREATE_INDEX=false CREATE_MD5_FILE=false GA4GH_CLIENT_SECRETS=client_secrets.json USE_JDK_DEFLATER=false USE_JDK_INFLATER=false
## htsjdk.samtools.metrics.StringHeader
# Started on: Tue May 14 15:45:18 UTC 2019

## METRICS CLASS picard.analysis.RnaSeqMetrics
PF_BASES PF_ALIGNED_BASES RIBOSOMAL_BASES CODING_BASES UTR_BASES INTRONIC_BASES INTERGENIC_BASES IGNORED_READS CORRECT_STRAND_READS INCORRECT_STRAND_READS NUM_R1_TRANSCRIPT_STRAND_READS NUM_R2_TRANSCRIPT_STRAND_READS NUM_UNEXPLAINED_READS PCT_R1_TRANSCRIPT_STRAND_READS PCT_R2_TRANSCRIPT_STRAND_READS PCT_RIBOSOMAL_BASES PCT_CODING_BASES PCT_UTR_BASES PCT_INTRONIC_BASES PCT_INTERGENIC_BASES PCT_MRNA_BASES PCT_USABLE_BASES PCT_CORRECT_STRAND_READS MEDIAN_CV_COVERAGE MEDIAN_5PRIME_BIAS MEDIAN_3PRIME_BIAS MEDIAN_5PRIME_TO_3PRIME_BIAS SAMPLE LIBRARY READ_GROUP
54332600 38213614 0 371628 1152265 18630585 18059136 0 0 0 12352 12891 538 0.489324 0.510676 0 0.009725 0.030153 0.487538 0.472584 0.039878 0.028047 0 2.183917 0 0 0

## HISTOGRAM java.lang.Integer
normalized_position All_Reads.normalized_coverage
0 1.252653
1 1.146108
2 1.065068
3 1.122433
4 1.234516
5 1.247113
6 1.2191
7 1.08917
8 1.101883
9 1.130302
10 1.082888
11 1.146879
12 1.173149
13 1.084206
14 1.035169
15 1.169359
16 1.278125
17 1.298059
18 1.418038
19 1.468055
20 1.306559
21 1.210198
22 0.953958
23 0.806139
24 0.815513
25 0.887045
26 0.763414
27 0.737914
28 0.702678
29 0.689913
30 0.633512
31 0.665368
32 0.682949
33 0.848599
34 0.941722
35 1.082228
36 1.113449
37 1.049003
38 0.97788
39 0.989931
40 0.92986
41 0.874432
42 0.87788
43 0.868871
44 0.92942
45 1.015775
46 1.070114
47 1.023889
48 1.023103
49 0.988576
50 0.931694
51 0.794716
52 0.765784
53 0.721218
54 0.723223
55 0.711507
56 0.704034
57 0.694139
58 0.741844
59 0.831505
60 0.806244
61 0.869419
62 0.987354
63 0.954176
64 0.925553
65 0.951851
66 0.906269
67 0.85666
68 0.985052
69 0.947861
70 0.98528
71 0.873541
72 0.87925
73 0.956294
74 1.137028
75 1.206313
76 1.148145
77 1.159051
78 1.207689
79 1.170334
80 1.199969
81 1.391121
82 1.243649
83 1.235795
84 1.227105
85 1.278662
86 1.298065
87 1.201038
88 1.2361
89 1.098932
90 1.042881
91 1.037875
92 0.95545
93 0.969215
94 1.059149
95 0.857316
96 0.792585
97 0.817511
98 0.880909
99 0.786114
100 0.548663

62 changes: 62 additions & 0 deletions src/sctools/test/test_groups.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import os
import csv
import itertools
from collections import OrderedDict
from sctools import platform


data_dir = os.path.split(__file__)[0] + '/data/group_metrics/'
unpaired_data_dir = os.path.split(__file__)[0] + '/data/group_metrics_unpaired_ss2/'


def check_parsed_metrics_csv(file_name, cell_id, class_name, expected_metrics):
Expand Down Expand Up @@ -274,3 +276,63 @@ def test_write_aggregated_qc_metrics():
# The output file should contain all of the column headers from the input files plus the "joined column" containing row headers
assert len(output_headers) == len(expected_headers) + 1
os.remove('output_QCs.csv')


def test_unpaired_ss2_write_aggregated_picard_metrics_by_row():

sources = [
unpaired_data_dir + 'SRR6258488_qc.alignment_summary_metrics.txt',
unpaired_data_dir + 'SRR6258488_qc.duplicate_metrics.txt',
unpaired_data_dir + 'SRR6258488_qc.gc_bias.summary_metrics.txt',
unpaired_data_dir + 'SRR6258488_qc.rna_metrics.txt',
]

args = ['-f', *sources, '-t', 'Picard', '-o', 'output_picard_group_unpaired']
return_code = platform.GenericPlatform.group_qc_outputs(args)
assert return_code == 0

expected_metrics = {}

for source in sources:
with open(source) as f:
for line in f:
if line.startswith("## METRICS CLASS"):
class_ = line.strip().split('\t')[1].split('.')[-1]
break
labels = f.readline().strip().split("\t")
values = f.readline().strip().split("\t")

for label, value in itertools.zip_longest(labels, values, fillvalue=""):
if label in ("LIBRARY", "SAMPLE", "READ_GROUP", "CATEGORY"):
continue
if class_ == "AlignmentSummaryMetrics":
label += ".UNPAIRED"
try:
value = str(float(value))
except ValueError:
pass
expected_metrics[(class_, label)] = value
expected_metrics[("Class", "")] = "SRR6258488"

with open('output_picard_group_unpaired.csv') as f:
labels = f.readline().strip().split(',')
classes = f.readline().strip().split(',')
values = f.readline().strip().split(',')
assert len(labels) == len(expected_metrics)

for class_, label in expected_metrics:
if class_ not in classes or label not in labels:
print("!", class_, label)

for class_, label, value in zip(classes, labels, values):
assert (class_, label) in expected_metrics
try:
value = str(float(value))
except ValueError:
value = value
try:
expected_value = str(float(expected_metrics[(class_, label)]))
except ValueError:
expected_value = expected_metrics[(class_, label)]
assert value == expected_value
os.remove('output_picard_group_unpaired.csv')

0 comments on commit 84e9679

Please sign in to comment.