From aaed0b997957b282e9351d51c7c84e2218f942ff Mon Sep 17 00:00:00 2001 From: Nikolas Barkas Date: Wed, 18 Mar 2020 12:56:43 -0400 Subject: [PATCH] ignore multi gene alignments (#74) * ignore multi gene alignments * (a) fixed the gene_name_tag to gene_id_tag in platform.py and (b) added the logic for handling alignments that overlap multiple genes * fix linting Co-authored-by: Kishori Konwar --- src/sctools/count.py | 8 +++++++- src/sctools/platform.py | 2 +- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/sctools/count.py b/src/sctools/count.py index 7df748b..da2842d 100644 --- a/src/sctools/count.py +++ b/src/sctools/count.py @@ -290,13 +290,19 @@ def from_sorted_tagged_bam( primary_alignment = alignments[0] if primary_alignment.has_tag(gene_name_tag): gene_name = primary_alignment.get_tag(gene_name_tag) + # overlaps multiple genes, drop query, and unfortunately there only one + # one alignment for this query + if len(gene_name.split(',')) != 1: + continue else: continue # drop query else: # multi-map implicated_gene_names: Set[str] = set() for alignment in alignments: if alignment.has_tag(gene_name_tag): - implicated_gene_names.add(alignment.get_tag(gene_name_tag)) + # consider its gene name only if it has only gene name + if len(gene_name.split(',')) == 1: + implicated_gene_names.add(alignment.get_tag(gene_name_tag)) if len(implicated_gene_names) == 1: # only one gene gene_name = implicated_gene_names.__iter__().__next__() else: diff --git a/src/sctools/platform.py b/src/sctools/platform.py index 2d746ad..b9c31db 100644 --- a/src/sctools/platform.py +++ b/src/sctools/platform.py @@ -450,7 +450,7 @@ def bam_to_count_matrix(cls, args: Iterable[str] = None) -> int: chromosomes_gene_locations_extended=gene_locations, cell_barcode_tag=args.cell_barcode_tag, molecule_barcode_tag=args.molecule_barcode_tag, - gene_name_tag=args.gene_name_tag, + gene_name_tag=args.gene_id_tag, open_mode=open_mode, ) matrix.save(args.output_prefix)