-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSnakefile
144 lines (125 loc) · 5.11 KB
/
Snakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# Created by Michal Bukowski ([email protected]) under GPL-3.0 license
# Differential gene expression pipeline utilising Salmon and Bioconductor DESeq2
# for Illumina RNA-Seq squencing results for reversed stranded libraries.
# If you find it useful, please cite:
# Bukowski M et al. (2022) Staphylococcal saoABC operon codes for
# a DNA-binding protein SaoC implicated in the response to nutrient deficit.
# [awaiting publication]
# FASTQ files with paired reads are named according to a pattern:
# {strain}_{group}_{setting}_{replica}_{reads}.fastq,
# eg. rn_wt_lg_1_R2.fastq for RN4220, wild type, logarithmic growth phase,
# replica 1 of 3, 1 of 2 files with reads. R1 and R2 are known, the remianing
# are inferred from the file names in input/reads. The number of replicas must
# be the same across all experiments (strain + group + setting). The pipeline
# is adapted to process read for different strains, however, in the original
# case there was only one (Staphylococcus aureus RN4220, denoted as rn).
strains, groups, settings, replicas = glob_wildcards(
'input/reads/{strain}_{group}_{setting}_{replica}_R1.fastq')
# Convert a repetetive list for a list of unique values for group and replica
groups = list( set(groups) )
replicas = list( set(replicas) )
# Set the reference gropu to wt (wild type).
ref_group = 'wt'
# Library types used with Salmon (see Salom manual), the library for original
# experiment was reversed stranded with Illumina inward paired reads (ISR),
# ISF is done for antisense RNA expression analysis.
libtypes = ['ISR', 'ISF']
# Final DGE output together with raw counts and TPM values are summarised
# for each strain, setting and library type in TSV files in output/summary.
rule all:
input:
expand( expand(
'output/summary/{libtype}/summary_{strain}_{setting}.tsv',
zip, strain=strains, setting=settings, allow_missing=True
), libtype=libtypes)
# Prepare index based on a FASTA file with reference transcript sequences
# for each strain with Salmon index
rule index:
params:
kmer = 15
input:
'input/refs/{strain}.fna'
output:
directory('output/index/{strain}')
log:
'log/index/{strain}.log'
shell:
'''salmon index -k {params.kmer} \
-t {input} \
-i {output} \
> {log} 2>&1
'''
# Run salmon quant for each pair of read files (R1 and R2) for each strain,
# group, replica and library type, using index for the particular strain
rule quant:
params:
libtype = '{libtype}'
threads:
1
input:
index = rules.index.output,
reads_1 = 'input/reads/{strain}_{group}_{setting}_{replica}_R1.fastq',
reads_2 = 'input/reads/{strain}_{group}_{setting}_{replica}_R2.fastq'
output:
directory('output/quant/{libtype}/{strain}_{group}_{setting}_{replica}')
log:
'log/quant/{libtype}/{strain}_{group}_{setting}_{replica}.log'
shell:
'''salmon quant --validateMappings \
-p {threads} \
-l {params.libtype} \
-i {input.index} \
-1 {input.reads_1} \
-2 {input.reads_2} \
-o {output} \
> {log} 2>&1
'''
# Run the Python script scripts/collect.py to prepare metadata files that
# point to files with raw counts for all groups from the same experiment
# for a particular strain. These will be used by DESeq2.
rule collect:
input:
expand(rules.quant.output, libtype='{libtype}', strain='{strain}',
group=groups, setting='{setting}', replica=replicas)
output:
'output/collect/{libtype}/quant_{strain}_{setting}.tsv'
log:
'log/collect/{libtype}/quant_{strain}_{setting}.log'
shell:
'''scripts/collect.py --input {input} \
--output {output} \
> {log} 2>&1
'''
# Run the R script scripts/dge.r to carry out differential gene expression
# analysis with DESeq2.
rule dge:
params:
ref = ref_group
input:
rules.collect.output
output:
'output/dge/{libtype}/dge_{strain}_{setting}.tsv'
log:
'log/dge/{libtype}/dge_{strain}_{setting}.log'
shell:
'''scripts/dge.r --meta {input} \
--ref {params.ref} \
--output {output} \
> {log} 2>&1
'''
# Collect DGE output together with raw counts and TPM values. The final stage
# of the whole pipeline.
rule summary:
input:
meta = rules.collect.output,
dge = rules.dge.output
output:
'output/summary/{libtype}/summary_{strain}_{setting}.tsv'
log:
'log/summary/{libtype}/summary_{strain}_{setting}.log'
shell:
'''scripts/summary.py --meta {input.meta} \
--dge {input.dge} \
--output {output} \
> {log} 2>&1
'''