Skip to content

Commit

Permalink
Handle KeyError caused by missing "CB" tags when calculating cell met…
Browse files Browse the repository at this point in the history
…rics (#56)

* Handle missing CB KeyError

If a SAM record does not have a 'CB'
tag, do not include it in the count of
perfect_cell_barcodes.

* Add test for perfect_cell_barcodes metric

* Refactor perfect_cell_barcodes error handling

* Simplify file path syntax in test
  • Loading branch information
samanehsan authored Jan 3, 2019
1 parent 941ae54 commit 3b1199d
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 10 deletions.
7 changes: 5 additions & 2 deletions src/sctools/metrics/aggregator.py
Original file line number Diff line number Diff line change
Expand Up @@ -455,8 +455,11 @@ def parse_extra_fields(self, tags: Sequence[str], record: pysam.AlignedSegment)
self._quality_above_threshold(
30, self._quality_string_to_numeric(record.get_tag(consts.QUALITY_CELL_BARCODE_TAG_KEY))))

self.perfect_cell_barcodes += (
record.get_tag(consts.RAW_CELL_BARCODE_TAG_KEY) == record.get_tag(consts.CELL_BARCODE_TAG_KEY))
# Exclude reads that do not have a CB tag from the perfect_cell_barcodes count
if record.has_tag(consts.CELL_BARCODE_TAG_KEY):
raw_cell_barcode_tag = record.get_tag(consts.RAW_CELL_BARCODE_TAG_KEY)
cell_barcode_tag = record.get_tag(consts.CELL_BARCODE_TAG_KEY)
self.perfect_cell_barcodes += (raw_cell_barcode_tag == cell_barcode_tag)

try:
alignment_location = record.get_tag(consts.ALIGNMENT_LOCATION_TAG_KEY)
Expand Down
Binary file added src/sctools/test/data/cell-sorted-missing-cb.bam
Binary file not shown.
Binary file added src/sctools/test/data/cell_metrics_missing_cb.csv.gz
Binary file not shown.
35 changes: 27 additions & 8 deletions src/sctools/test/test_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,14 @@
# samtools view <filename> | less

# set the input files
_gene_sorted_bam = _data_dir + '/small-gene-sorted.bam'
_cell_sorted_bam = _data_dir + '/small-cell-sorted.bam'
_gene_sorted_bam = os.path.join(_data_dir, 'small-gene-sorted.bam')
_cell_sorted_bam = os.path.join(_data_dir, 'small-cell-sorted.bam')
_cell_sorted_bam_missing_cell_barcodes = os.path.join(_data_dir, 'cell-sorted-missing-cb.bam')

# specify filenames for temporary metrics outputs that are used in the following tests
_gene_metric_output_file = _test_dir + '/gene_metrics.csv.gz'
_cell_metric_output_file = _test_dir + '/cell_metrics.csv.gz'
_gene_metric_output_file = os.path.join(_test_dir, 'gene_metrics.csv.gz')
_cell_metric_output_file = os.path.join(_test_dir, 'cell_metrics.csv.gz')
_cell_metric_output_file_missing_cell_barcodes = os.path.join(_test_dir, 'cell_metrics_missing_cb.csv.gz')

# run the gene metrics suite
gene_gatherer = GatherGeneMetrics(_gene_sorted_bam, _gene_metric_output_file)
Expand All @@ -48,18 +50,25 @@
cell_gatherer.extract_metrics()
_cell_metrics = pd.read_csv(_cell_metric_output_file, index_col=0)

# run the cell metrics suite
cell_gatherer_missing_cbs = GatherCellMetrics(_cell_sorted_bam_missing_cell_barcodes, _cell_metric_output_file_missing_cell_barcodes)
cell_gatherer_missing_cbs.extract_metrics()
_cell_metrics_missing_cbs = pd.read_csv(_cell_metric_output_file_missing_cell_barcodes, index_col=0)


def test_calculate_cell_metrics_cli():
"""test the sctools cell metrics CLI invocation"""
cell_metrics_csv = os.path.join(_test_dir, 'cell_metrics.csv')
return_call = TenXV2.calculate_cell_metrics(
args=['-i', _cell_sorted_bam, '-o', _test_dir + '/gene_metrics.csv'])
args=['-i', _cell_sorted_bam, '-o', cell_metrics_csv])
assert return_call == 0


def test_calculate_gene_metrics_cli():
"""test the sctools gene metrics CLI invocation"""
gene_metrics_csv = os.path.join(_test_dir, 'gene_metrics.csv')
return_call = TenXV2.calculate_gene_metrics(
args=['-i', _gene_sorted_bam, '-o', _test_dir + '/gene_metrics.csv'])
args=['-i', _gene_sorted_bam, '-o', gene_metrics_csv])
assert return_call == 0


Expand Down Expand Up @@ -144,12 +153,22 @@ def test_metrics_highest_read_count(metrics, expected_value):
(_gene_metrics, 300), # todo this is 100%, we should mangle a few in the testing data
(_cell_metrics, 655),
])
def test_metrics_number_perfect_barcodes(metrics, expected_value):
"""Test that each metric correctly identifies the number of perfect barcodes where CB == CR"""
def test_metrics_number_perfect_molecule_barcodes(metrics, expected_value):
"""Test that each metric correctly identifies the number of perfect molecule barcodes where UB == UR"""
observed_perfect_barcodes = metrics['perfect_molecule_barcodes'].sum()
assert observed_perfect_barcodes == expected_value


@pytest.mark.parametrize('metrics, expected_value', [
(_cell_metrics, 650),
(_cell_metrics_missing_cbs, 12861)
])
def test_metrics_number_perfect_cell_barcodes(metrics, expected_value):
"""Test that each metric correctly identifies the number of perfect cell barcodes where CB == CR"""
observed_perfect_cell_barcodes = metrics['perfect_cell_barcodes'].sum()
assert observed_perfect_cell_barcodes == expected_value


@pytest.mark.parametrize('metrics, expected_value', [
(_gene_metrics, 300), # todo this is 100%, should get some intronic or other reads
(_cell_metrics, 609),
Expand Down

0 comments on commit 3b1199d

Please sign in to comment.