diff --git a/src/sctools/count.py b/src/sctools/count.py index f63576b..74753a1 100644 --- a/src/sctools/count.py +++ b/src/sctools/count.py @@ -14,6 +14,7 @@ gene_name_tag: str=consts.GENE_NAME_TAG_KEY, open_mode: str='rb') from_mtx(matrix_mtx: str, row_index_file: str, col_index_file: str) + Notes ----- Memory usage of this module can be roughly approximated by the chunk_size parameter in Optimus. @@ -251,36 +252,7 @@ def from_sorted_tagged_bam( ) in grouped_records_generator: # modify alignments to include the gene name to the alignments to INTRONIC regions - if chromosomes_gene_locations_extended: - alignments = [] - for alignment in input_alignments: - if alignment.has_tag("XF"): - aln_type = alignment.get_tag("XF") - if ( - alignment.reference_name - and aln_type == "INTRONIC" - and alignment.reference_name - in chromosomes_gene_locations_extended - ): - gene_name = cls.binary_overlap( - chromosomes_gene_locations_extended[ - alignment.reference_name - ], - 0, - len( - chromosomes_gene_locations_extended[ - alignment.reference_name - ] - ) - - 1, - alignment.reference_start, - ) - - if gene_name: - alignment.set_tag("GE", gene_name) - alignments.append(alignment) - else: - alignments = input_alignments + alignments = input_alignments # only keep queries w/ well-formed UMIs gene_name = None @@ -289,8 +261,11 @@ def from_sorted_tagged_bam( if len(alignments) == 1: primary_alignment = alignments[0] - if primary_alignment.has_tag(gene_name_tag) and primary_alignment.has_tag('XF') \ - and primary_alignment.get_tag('XF')!='INTERGENIC': + if ( + primary_alignment.has_tag(gene_name_tag) + and primary_alignment.has_tag('XF') + and primary_alignment.get_tag('XF') != 'INTERGENIC' + ): gene_name = primary_alignment.get_tag(gene_name_tag) # overlaps multiple genes, drop query, and unfortunately there only one # one alignment for this query @@ -301,11 +276,14 @@ def from_sorted_tagged_bam( else: # multi-map implicated_gene_names: Set[str] = set() for alignment in alignments: - if alignment.has_tag(gene_name_tag) and alignment.has_tag('XF') \ - and alignment.get_tag('XF')!='INTERGENIC': + if ( + alignment.has_tag(gene_name_tag) + and alignment.has_tag('XF') + and alignment.get_tag('XF') != 'INTERGENIC' + ): # consider its gene name only if it has only gene name gene_name = alignment.get_tag(gene_name_tag) - if len(gene_name.split(',')) == 1: + if len(gene_name.split(',')) == 1: implicated_gene_names.add(alignment.get_tag(gene_name_tag)) if len(implicated_gene_names) == 1: # only one gene @@ -313,8 +291,8 @@ def from_sorted_tagged_bam( else: continue # drop query - if gene_name==None: - continue + if gene_name is None: + continue if ( cell_barcode, diff --git a/src/sctools/metrics/gatherer.py b/src/sctools/metrics/gatherer.py index 789a185..b207ccc 100644 --- a/src/sctools/metrics/gatherer.py +++ b/src/sctools/metrics/gatherer.py @@ -198,9 +198,9 @@ def extract_metrics(self, mode: str = 'rb') -> None: for gene_iterator, gene_tag in iter_genes(bam_iterator=bam_iterator): metric_aggregator = GeneMetrics() - # in case of multi-genes ignore + # in case of multi-genes ignore as in the counting stage if gene_tag and len(gene_tag.split(',')) > 1: - continue + continue # break up gene ids by cell barcodes for cell_iterator, cell_tag in iter_cell_barcodes( diff --git a/src/sctools/test/test_count.py b/src/sctools/test/test_count.py index bfd3cbe..81ec151 100644 --- a/src/sctools/test/test_count.py +++ b/src/sctools/test/test_count.py @@ -894,17 +894,6 @@ def extract_gene_non_exons( [bam.QueryNameSortOrder(), CellMoleculeGeneQueryNameSortOrder()], ids=["query_name_sort_order", "cell_molecule_gene_query_name_sort_order"], ) -def test_count_matrix_with_introns( - alignment_sort_order: bam.AlignmentSortOrder, gene_name_to_index -): - _count_matrix_with_introns( - alignment_sort_order, gene_name_to_index, consts.SINGLE_CELL_COUNT_MATRIX - ) - _count_matrix_with_introns( - alignment_sort_order, gene_name_to_index, consts.SINGLE_NUCLEI_COUNT_MATRIX - ) - - def _count_matrix_with_introns( alignment_sort_order: bam.AlignmentSortOrder, gene_name_to_index, test_index ):