Skip to content

Commit

Permalink
Adding targets and adding new adding polyA matrix generation rules
Browse files Browse the repository at this point in the history
  • Loading branch information
skchronicles committed Oct 3, 2023
1 parent ef11237 commit 56c79b0
Show file tree
Hide file tree
Showing 2 changed files with 74 additions and 0 deletions.
8 changes: 8 additions & 0 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,14 @@ rule all:
join(workpath, "{name}", "polyA", "{name}.nanopolish.transcripts.polyA.tsv"),
name=samples,
),
# Filters nanopolish polyA tail estimates
# and gets average length for each transcript
expand(
join(workpath, "{name}", "polyA", "{name}.nanopolish.transcripts.polyA.average.tsv"),
name=samples,
),
# Matrix of filtered/averaged polyA tail transcript lengths
join(workpath, "project", "polyA", "nanopolish.transcripts.average_polyA_length.tsv"),
# Differential analyses results,
# @imported from `rule flair_diffexp` in rules/diffexp.smk
# Subset counts matrix for each
Expand Down
66 changes: 66 additions & 0 deletions workflow/rules/polya.smk
Original file line number Diff line number Diff line change
Expand Up @@ -71,4 +71,70 @@ rule nanopolish_polya:
--bam={input.bam} \\
--genome={input.ref} \\
> {output.polya}
"""


rule nanopolish_polya_filter:
"""
Data-processing step to filter and average the polyA
tail length for each transcript. The estimated polyA
tail lengths need to be filtered on the qc_tag, only
keep estimates with the tag 'PASS'.
@Input:
Nanopolish polyA tail length estimates (scatter)
@Output:
Filtered/averaged nanopolish polyA tail length estimates
"""
input:
polya = join(workpath, "{name}", "polyA", "{name}.nanopolish.transcripts.polyA.tsv"),
output:
polya = join(workpath, "{name}", "polyA", "{name}.nanopolish.transcripts.polyA.average.tsv"),
params:
rname = "filterpolya",
conda: depending(join(workpath, config['conda']['dinopore']), use_conda)
container: depending(config['images']['dinopore'], use_singularity)
threads: int(allocated("threads", "nanopolish_polya_filter", cluster))
shell: """
# Filter and average polyA tail length
# of each transcript
echo -e 'id\\tavg_polya_length' > {output.polya}
awk -F '\\t' -v OFS='\\t' 'NR>1 && $NF=="PASS" {{print}}' {input.polya} \\
| cut -f2,9 \\
| awk -F '\\t' -v OFS='\\t' \\
'{{seen[$1]+=$2; count[$1]++}} END {{for (x in seen) print x, seen[x]/count[x]}}' \\
>> {output.polya}
"""


rule nanopolish_polya_matrix:
"""
Data-processing step to create a matrix containing
the average polyA tail length for each transcript for
each sample.
@Input:
Filtered/averaged nanopolish polyA tail length estimates (gather)
@Output:
Matrix containing average polyA tail lengths,
for each sample for each transcript (TSV)
"""
input:
polyas = expand(join(workpath, "{name}", "polyA", "{name}.nanopolish.transcripts.polyA.average.tsv"), name=samples),
output:
matrix = join(workpath, "project", "polyA", "nanopolish.transcripts.average_polyA_length.tsv"),
params:
rname = "matrixpolya",
script = join("workflow", "scripts", "create_matrix.py"),
conda: depending(join(workpath, config['conda']['modr']), use_conda)
container: depending(config['images']['modr'], use_singularity)
threads: int(allocated("threads", "nanopolish_polya_matrix", cluster))
shell: """
# Create counts matrix each transcripts
# average polyA tail length
{params.script} \\
--input {input.polyas} \\
--output {output.matrix} \\
--join-on id \\
--extract avg_polya_length \\
--clean-suffix '.nanopolish.transcripts.polyA.average.tsv' \\
--nan-values 0.00
"""

0 comments on commit 56c79b0

Please sign in to comment.