diff --git a/.dockerignore b/.dockerignore index fceaba4..a86bba6 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,2 +1 @@ #files ignored when building docker image -*/*/test diff --git a/.gitignore b/.gitignore index 538134b..147772b 100644 --- a/.gitignore +++ b/.gitignore @@ -26,6 +26,7 @@ wheels/ *.egg-info/ .installed.cfg *.egg +test/data/bam_with_tags_test.bam # PyInstaller # Usually these files are written by a python script from a template diff --git a/Dockerfile b/Dockerfile index a1e41cd..25a0db6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,6 +7,12 @@ LABEL maintainer="Ambrose J. Carr " \ COPY requirements.txt . RUN pip3 install -r requirements.txt +RUN mkdir /sctools/ + +COPY . /sctools + +RUN pip3 install /sctools + WORKDIR usr/local/bin/sctools -COPY src/sctools . \ No newline at end of file + diff --git a/requirements.txt b/requirements.txt index 6eafde0..f7e5542 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,4 +14,4 @@ requests==2.20.0 scipy==1.3.1 setuptools==40.4.3 setuptools_scm==3.1.0 -tables==3.4.2 \ No newline at end of file +tables==3.4.2 diff --git a/src/sctools/count.py b/src/sctools/count.py index da2842d..74753a1 100644 --- a/src/sctools/count.py +++ b/src/sctools/count.py @@ -14,6 +14,7 @@ gene_name_tag: str=consts.GENE_NAME_TAG_KEY, open_mode: str='rb') from_mtx(matrix_mtx: str, row_index_file: str, col_index_file: str) + Notes ----- Memory usage of this module can be roughly approximated by the chunk_size parameter in Optimus. @@ -251,44 +252,20 @@ def from_sorted_tagged_bam( ) in grouped_records_generator: # modify alignments to include the gene name to the alignments to INTRONIC regions - if chromosomes_gene_locations_extended: - alignments = [] - for alignment in input_alignments: - if alignment.has_tag("XF"): - aln_type = alignment.get_tag("XF") - if ( - alignment.reference_name - and aln_type == "INTRONIC" - and alignment.reference_name - in chromosomes_gene_locations_extended - ): - gene_name = cls.binary_overlap( - chromosomes_gene_locations_extended[ - alignment.reference_name - ], - 0, - len( - chromosomes_gene_locations_extended[ - alignment.reference_name - ] - ) - - 1, - alignment.reference_start, - ) - - if gene_name: - alignment.set_tag("GE", gene_name) - alignments.append(alignment) - else: - alignments = input_alignments + alignments = input_alignments # only keep queries w/ well-formed UMIs + gene_name = None if cell_barcode is None or molecule_barcode is None: continue if len(alignments) == 1: primary_alignment = alignments[0] - if primary_alignment.has_tag(gene_name_tag): + if ( + primary_alignment.has_tag(gene_name_tag) + and primary_alignment.has_tag('XF') + and primary_alignment.get_tag('XF') != 'INTERGENIC' + ): gene_name = primary_alignment.get_tag(gene_name_tag) # overlaps multiple genes, drop query, and unfortunately there only one # one alignment for this query @@ -299,15 +276,24 @@ def from_sorted_tagged_bam( else: # multi-map implicated_gene_names: Set[str] = set() for alignment in alignments: - if alignment.has_tag(gene_name_tag): + if ( + alignment.has_tag(gene_name_tag) + and alignment.has_tag('XF') + and alignment.get_tag('XF') != 'INTERGENIC' + ): # consider its gene name only if it has only gene name + gene_name = alignment.get_tag(gene_name_tag) if len(gene_name.split(',')) == 1: implicated_gene_names.add(alignment.get_tag(gene_name_tag)) + if len(implicated_gene_names) == 1: # only one gene gene_name = implicated_gene_names.__iter__().__next__() else: continue # drop query + if gene_name is None: + continue + if ( cell_barcode, molecule_barcode, diff --git a/src/sctools/metrics/gatherer.py b/src/sctools/metrics/gatherer.py index 1876f5f..b207ccc 100644 --- a/src/sctools/metrics/gatherer.py +++ b/src/sctools/metrics/gatherer.py @@ -198,6 +198,10 @@ def extract_metrics(self, mode: str = 'rb') -> None: for gene_iterator, gene_tag in iter_genes(bam_iterator=bam_iterator): metric_aggregator = GeneMetrics() + # in case of multi-genes ignore as in the counting stage + if gene_tag and len(gene_tag.split(',')) > 1: + continue + # break up gene ids by cell barcodes for cell_iterator, cell_tag in iter_cell_barcodes( bam_iterator=gene_iterator diff --git a/src/sctools/test/test_count.py b/src/sctools/test/test_count.py index bfd3cbe..81ec151 100644 --- a/src/sctools/test/test_count.py +++ b/src/sctools/test/test_count.py @@ -894,17 +894,6 @@ def extract_gene_non_exons( [bam.QueryNameSortOrder(), CellMoleculeGeneQueryNameSortOrder()], ids=["query_name_sort_order", "cell_molecule_gene_query_name_sort_order"], ) -def test_count_matrix_with_introns( - alignment_sort_order: bam.AlignmentSortOrder, gene_name_to_index -): - _count_matrix_with_introns( - alignment_sort_order, gene_name_to_index, consts.SINGLE_CELL_COUNT_MATRIX - ) - _count_matrix_with_introns( - alignment_sort_order, gene_name_to_index, consts.SINGLE_NUCLEI_COUNT_MATRIX - ) - - def _count_matrix_with_introns( alignment_sort_order: bam.AlignmentSortOrder, gene_name_to_index, test_index ):