-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgtf_pipeline.py
58 lines (49 loc) · 1.6 KB
/
gtf_pipeline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
''' Script to split-transform-merge for pipeline '''
#Write a split-transform-merge pipeline
#Split file by chromosome
#Count the number of transcripts for each chromosome
#Read all count files, calculate the average and write to file
#@
#input fastq file
import gzip
from ruffus import *
from cgatcore import pipeline as P
import sys
@split('genes.gtf.gz', 'chr*.gtf.gz')
def split_chrom(infile, outfiles):
with gzip.open(infile, 'rt') as inf:
last_chrom = ''
for line in inf:
chrom = line.split()[0]
if chrom == last_chrom:
outf.write(line)
else:
if last_chrom != '':
outf.close()
outfile = chrom+ '.gtf.gz'
outf = gzip.open(outfile, 'wt')
outf.write(line)
last_chrom = chrom
print(chrom)
#split_chrom('test.gtf.gz', "")
@transform(split_chrom, suffix('.gtf.gz'), '.count')
def count_genes(infile, outfile):
statement = "wc -l %(infile)s > %(outfile)s"
P.run(statement, job_queue='all.q', job_threads=1, job_memory='2G', job_condaenv='obds-py3')
@merge(count_genes, 'average')
def average(infiles, outfile):
sum = 0
average = 0
for infile in infiles:
with open(infile) as inf:
readfile = inf.read()
print(readfile)
count = int(readfile.split()[0])
sum += count
print(sum)
average = sum/len(infiles)
print(average)
with open(outfile, 'wt') as outf:
outf.write(f'{average}')
if __name__ == "__main__":
sys.exit(P.main(sys.argv))