-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathproject_alevin.py
80 lines (65 loc) · 2.62 KB
/
project_alevin.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
"""Salmon alevin"""
import re
import pandas as pd
from ruffus import *
from cgatcore import pipeline as P
import sys
import glob
import os
params = P.get_parameters("project_alevin.yml") # define params here
samples = pd.read_csv("cellranger_samples.csv")
samples.set_index('name', inplace=True)
print(samples)
def get_gex_fastq(dir):
'''Docstring'''
fastq1_pattern = params["pattern"]["fastq1"]
fastq1_glob = f"{dir}/*{fastq1_pattern}*"
fastq1 = glob.glob(fastq1_glob)
if len(fastq1) == 0:
raise OSError(f"No file matched pattern: {fastq1_glob}")
fastq2 = [file.replace(params["pattern"]["fastq1"], params["pattern"]["fastq2"]) for file in fastq1]
for file in fastq2:
if not os.path.exists(file):
raise OSError(f"Paired file not found: {file}")
return {'fastq1' : fastq1, 'fastq2' : fastq2 }
@follows(mkdir("alevin"))
@transform('data/*/.sample', regex(r'data/(.+)/.sample'), r'alevin/\1/alevin/quants_mat.gz')
def salmon_alevin(infile, outfile):
# python module looking for regular expression, group(1) is equiv to '\1'
outdir = outfile.replace("/alevin/quants_mat.gz", "")
sampleid = re.search('data/(.+)/.sample', infile).group(1)
print(sampleid)
fastqs = samples['fastqs'][sampleid]
cellnumber = samples['cells'][sampleid]
chemistry = samples['chemistry'][sampleid]
fastq_dict = get_gex_fastq(fastqs)
print(fastq_dict)
infiles_fastq1 = ' '.join(fastq_dict['fastq1'])
infiles_fastq2 = ' '.join(fastq_dict['fastq2'])
chemistry = samples['chemistry'][sampleid]
if chemistry == "SC3Pv2":
chemistry_option = "--chromium"
elif chemistry == "SC3Pv3":
chemistry_option = "--chromiumV3"
else:
raise NameError('Invalid chemistry.')
statement = '''rm -r %(outdir)s &&
salmon alevin
-l ISR
-1 %(infiles_fastq1)s
-2 %(infiles_fastq2)s
%(chemistry_option)s
-i %(index_file)s
-p %(threads)s
-o %(outdir)s
--tgMap %(tgMap_file)s
--expectCells %(cellnumber)s
> %(sampleid)s_standardout.log
2> %(sampleid)s_standarderror.log
''' # command line string statement, all options need to sit within these quotes, which p.run will send to the cluster as a job
P.run(statement, job_queue='all.q',
job_threads=params["threads"],
job_memory=params["job_memory"],
job_condaenv='obds-py3')
if __name__ == "__main__":
sys.exit(P.main(sys.argv))