diff --git a/src/sctools/metrics/aggregator.py b/src/sctools/metrics/aggregator.py index b8551a4..1e7ad9c 100644 --- a/src/sctools/metrics/aggregator.py +++ b/src/sctools/metrics/aggregator.py @@ -455,8 +455,11 @@ def parse_extra_fields(self, tags: Sequence[str], record: pysam.AlignedSegment) self._quality_above_threshold( 30, self._quality_string_to_numeric(record.get_tag(consts.QUALITY_CELL_BARCODE_TAG_KEY)))) - self.perfect_cell_barcodes += ( - record.get_tag(consts.RAW_CELL_BARCODE_TAG_KEY) == record.get_tag(consts.CELL_BARCODE_TAG_KEY)) + # Exclude reads that do not have a CB tag from the perfect_cell_barcodes count + if record.has_tag(consts.CELL_BARCODE_TAG_KEY): + raw_cell_barcode_tag = record.get_tag(consts.RAW_CELL_BARCODE_TAG_KEY) + cell_barcode_tag = record.get_tag(consts.CELL_BARCODE_TAG_KEY) + self.perfect_cell_barcodes += (raw_cell_barcode_tag == cell_barcode_tag) try: alignment_location = record.get_tag(consts.ALIGNMENT_LOCATION_TAG_KEY) diff --git a/src/sctools/test/data/cell-sorted-missing-cb.bam b/src/sctools/test/data/cell-sorted-missing-cb.bam new file mode 100644 index 0000000..88d2b05 Binary files /dev/null and b/src/sctools/test/data/cell-sorted-missing-cb.bam differ diff --git a/src/sctools/test/data/cell_metrics_missing_cb.csv.gz b/src/sctools/test/data/cell_metrics_missing_cb.csv.gz new file mode 100644 index 0000000..20a433d Binary files /dev/null and b/src/sctools/test/data/cell_metrics_missing_cb.csv.gz differ diff --git a/src/sctools/test/test_metrics.py b/src/sctools/test/test_metrics.py index f9d9469..ae9ca61 100644 --- a/src/sctools/test/test_metrics.py +++ b/src/sctools/test/test_metrics.py @@ -31,12 +31,14 @@ # samtools view | less # set the input files -_gene_sorted_bam = _data_dir + '/small-gene-sorted.bam' -_cell_sorted_bam = _data_dir + '/small-cell-sorted.bam' +_gene_sorted_bam = os.path.join(_data_dir, 'small-gene-sorted.bam') +_cell_sorted_bam = os.path.join(_data_dir, 'small-cell-sorted.bam') +_cell_sorted_bam_missing_cell_barcodes = os.path.join(_data_dir, 'cell-sorted-missing-cb.bam') # specify filenames for temporary metrics outputs that are used in the following tests -_gene_metric_output_file = _test_dir + '/gene_metrics.csv.gz' -_cell_metric_output_file = _test_dir + '/cell_metrics.csv.gz' +_gene_metric_output_file = os.path.join(_test_dir, 'gene_metrics.csv.gz') +_cell_metric_output_file = os.path.join(_test_dir, 'cell_metrics.csv.gz') +_cell_metric_output_file_missing_cell_barcodes = os.path.join(_test_dir, 'cell_metrics_missing_cb.csv.gz') # run the gene metrics suite gene_gatherer = GatherGeneMetrics(_gene_sorted_bam, _gene_metric_output_file) @@ -48,18 +50,25 @@ cell_gatherer.extract_metrics() _cell_metrics = pd.read_csv(_cell_metric_output_file, index_col=0) +# run the cell metrics suite +cell_gatherer_missing_cbs = GatherCellMetrics(_cell_sorted_bam_missing_cell_barcodes, _cell_metric_output_file_missing_cell_barcodes) +cell_gatherer_missing_cbs.extract_metrics() +_cell_metrics_missing_cbs = pd.read_csv(_cell_metric_output_file_missing_cell_barcodes, index_col=0) + def test_calculate_cell_metrics_cli(): """test the sctools cell metrics CLI invocation""" + cell_metrics_csv = os.path.join(_test_dir, 'cell_metrics.csv') return_call = TenXV2.calculate_cell_metrics( - args=['-i', _cell_sorted_bam, '-o', _test_dir + '/gene_metrics.csv']) + args=['-i', _cell_sorted_bam, '-o', cell_metrics_csv]) assert return_call == 0 def test_calculate_gene_metrics_cli(): """test the sctools gene metrics CLI invocation""" + gene_metrics_csv = os.path.join(_test_dir, 'gene_metrics.csv') return_call = TenXV2.calculate_gene_metrics( - args=['-i', _gene_sorted_bam, '-o', _test_dir + '/gene_metrics.csv']) + args=['-i', _gene_sorted_bam, '-o', gene_metrics_csv]) assert return_call == 0 @@ -144,12 +153,22 @@ def test_metrics_highest_read_count(metrics, expected_value): (_gene_metrics, 300), # todo this is 100%, we should mangle a few in the testing data (_cell_metrics, 655), ]) -def test_metrics_number_perfect_barcodes(metrics, expected_value): - """Test that each metric correctly identifies the number of perfect barcodes where CB == CR""" +def test_metrics_number_perfect_molecule_barcodes(metrics, expected_value): + """Test that each metric correctly identifies the number of perfect molecule barcodes where UB == UR""" observed_perfect_barcodes = metrics['perfect_molecule_barcodes'].sum() assert observed_perfect_barcodes == expected_value +@pytest.mark.parametrize('metrics, expected_value', [ + (_cell_metrics, 650), + (_cell_metrics_missing_cbs, 12861) +]) +def test_metrics_number_perfect_cell_barcodes(metrics, expected_value): + """Test that each metric correctly identifies the number of perfect cell barcodes where CB == CR""" + observed_perfect_cell_barcodes = metrics['perfect_cell_barcodes'].sum() + assert observed_perfect_cell_barcodes == expected_value + + @pytest.mark.parametrize('metrics, expected_value', [ (_gene_metrics, 300), # todo this is 100%, should get some intronic or other reads (_cell_metrics, 609),