Skip to content

Commit dc07854

Browse files
authored
Merge pull request #75 from HumanCellAtlas/kmk-snrna-seq-multi-gene
Kmk snrna seq multi gene
2 parents aaed0b9 + 109da40 commit dc07854

File tree

7 files changed

+31
-46
lines changed

7 files changed

+31
-46
lines changed

.dockerignore

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1 @@
11
#files ignored when building docker image
2-
*/*/test

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ wheels/
2626
*.egg-info/
2727
.installed.cfg
2828
*.egg
29+
test/data/bam_with_tags_test.bam
2930

3031
# PyInstaller
3132
# Usually these files are written by a python script from a template

Dockerfile

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,12 @@ LABEL maintainer="Ambrose J. Carr <[email protected]>" \
77
COPY requirements.txt .
88
RUN pip3 install -r requirements.txt
99

10+
RUN mkdir /sctools/
11+
12+
COPY . /sctools
13+
14+
RUN pip3 install /sctools
15+
1016
WORKDIR usr/local/bin/sctools
1117

12-
COPY src/sctools .
18+

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,4 +14,4 @@ requests==2.20.0
1414
scipy==1.3.1
1515
setuptools==40.4.3
1616
setuptools_scm==3.1.0
17-
tables==3.4.2
17+
tables==3.4.2

src/sctools/count.py

Lines changed: 18 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
gene_name_tag: str=consts.GENE_NAME_TAG_KEY, open_mode: str='rb')
1515
from_mtx(matrix_mtx: str, row_index_file: str, col_index_file: str)
1616
17+
1718
Notes
1819
-----
1920
Memory usage of this module can be roughly approximated by the chunk_size parameter in Optimus.
@@ -251,44 +252,20 @@ def from_sorted_tagged_bam(
251252
) in grouped_records_generator:
252253

253254
# modify alignments to add the gene name to alignments in INTRONIC regions
254-
if chromosomes_gene_locations_extended:
255-
alignments = []
256-
for alignment in input_alignments:
257-
if alignment.has_tag("XF"):
258-
aln_type = alignment.get_tag("XF")
259-
if (
260-
alignment.reference_name
261-
and aln_type == "INTRONIC"
262-
and alignment.reference_name
263-
in chromosomes_gene_locations_extended
264-
):
265-
gene_name = cls.binary_overlap(
266-
chromosomes_gene_locations_extended[
267-
alignment.reference_name
268-
],
269-
0,
270-
len(
271-
chromosomes_gene_locations_extended[
272-
alignment.reference_name
273-
]
274-
)
275-
- 1,
276-
alignment.reference_start,
277-
)
278-
279-
if gene_name:
280-
alignment.set_tag("GE", gene_name)
281-
alignments.append(alignment)
282-
else:
283-
alignments = input_alignments
255+
alignments = input_alignments
284256

285257
# only keep queries w/ well-formed UMIs
258+
gene_name = None
286259
if cell_barcode is None or molecule_barcode is None:
287260
continue
288261

289262
if len(alignments) == 1:
290263
primary_alignment = alignments[0]
291-
if primary_alignment.has_tag(gene_name_tag):
264+
if (
265+
primary_alignment.has_tag(gene_name_tag)
266+
and primary_alignment.has_tag('XF')
267+
and primary_alignment.get_tag('XF') != 'INTERGENIC'
268+
):
292269
gene_name = primary_alignment.get_tag(gene_name_tag)
293270
# overlaps multiple genes, drop query, and unfortunately there is only
294271
# one alignment for this query
@@ -299,15 +276,24 @@ def from_sorted_tagged_bam(
299276
else: # multi-map
300277
implicated_gene_names: Set[str] = set()
301278
for alignment in alignments:
302-
if alignment.has_tag(gene_name_tag):
279+
if (
280+
alignment.has_tag(gene_name_tag)
281+
and alignment.has_tag('XF')
282+
and alignment.get_tag('XF') != 'INTERGENIC'
283+
):
303284
# consider its gene name only if it has exactly one gene name
285+
gene_name = alignment.get_tag(gene_name_tag)
304286
if len(gene_name.split(',')) == 1:
305287
implicated_gene_names.add(alignment.get_tag(gene_name_tag))
288+
306289
if len(implicated_gene_names) == 1: # only one gene
307290
gene_name = implicated_gene_names.__iter__().__next__()
308291
else:
309292
continue # drop query
310293

294+
if gene_name is None:
295+
continue
296+
311297
if (
312298
cell_barcode,
313299
molecule_barcode,

src/sctools/metrics/gatherer.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,10 @@ def extract_metrics(self, mode: str = 'rb') -> None:
198198
for gene_iterator, gene_tag in iter_genes(bam_iterator=bam_iterator):
199199
metric_aggregator = GeneMetrics()
200200

201+
# skip tags that name multiple genes, as is done in the counting stage
202+
if gene_tag and len(gene_tag.split(',')) > 1:
203+
continue
204+
201205
# break up gene ids by cell barcodes
202206
for cell_iterator, cell_tag in iter_cell_barcodes(
203207
bam_iterator=gene_iterator

src/sctools/test/test_count.py

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -894,17 +894,6 @@ def extract_gene_non_exons(
894894
[bam.QueryNameSortOrder(), CellMoleculeGeneQueryNameSortOrder()],
895895
ids=["query_name_sort_order", "cell_molecule_gene_query_name_sort_order"],
896896
)
897-
def test_count_matrix_with_introns(
898-
alignment_sort_order: bam.AlignmentSortOrder, gene_name_to_index
899-
):
900-
_count_matrix_with_introns(
901-
alignment_sort_order, gene_name_to_index, consts.SINGLE_CELL_COUNT_MATRIX
902-
)
903-
_count_matrix_with_introns(
904-
alignment_sort_order, gene_name_to_index, consts.SINGLE_NUCLEI_COUNT_MATRIX
905-
)
906-
907-
908897
def _count_matrix_with_introns(
909898
alignment_sort_order: bam.AlignmentSortOrder, gene_name_to_index, test_index
910899
):

0 commit comments

Comments
 (0)