-
Notifications
You must be signed in to change notification settings - Fork 1
/
targetscan_count_8mers.pl
executable file
·119 lines (93 loc) · 3.03 KB
/
targetscan_count_8mers.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#! /usr/bin/env perl
# Find locations of TargetScan seeds in a sequence file
# Also print file of sequence lengths
if (! $ARGV[1])
{
print STDERR "\nFind defined sites from a set of UTRs\n";
print STDERR "USAGE: $0 miRNA_seeds_file UTRs > Offset-6mer_sites\n";
print STDERR "Note: Include last argument of '-list' to print list of all positions too\n\n";
exit;
}
### Example miR-1 : seed region = GGAAUGU
### To find and count all 6mer-1a (offset 6mer) sites,
### miRNA family should be listed as uGGAAU
### (initial T + positions 2-6 of the mature miRNA)
### Then script will look for the reverse complement
$miRNASeedsFile = $ARGV[0];
$sequenceFile = $ARGV[1];
if ($sequenceFile =~ /(\S+)\.txt$/)
{
$sequenceLengthsOutfile = "$1.lengths.txt";
}
else
{
$sequenceLengthsOutfile = "$sequenceFile.lengths.txt";
}
# Create output file to hold sequence lengths
open (SEQUENCE_LENGTHS_OUT, ">$sequenceLengthsOutfile") || die "Cannot open $sequenceLengthsOutfile for writing: $!";
open (DEFINED_SITES, "$miRNASeedsFile") || die "Cannot open $miRNASeedsFile for reading: $!";
while (<DEFINED_SITES>)
{
# GGAAUGU GGAAUGU 10090 # miR-1
# AAAGCCA AAAGCCA 10090
# AAAGCCA AAAGCCA 10116
chomp;
($familyID, $familyID, $species) = split /\t/, $_;
$seedRegionRevcomp = reverse_complement($familyID);
# Convert any Ts to Us
$seedRegionRevcomp =~ s/T/U/g;
$seedRegionRevcomp =~ s/t/u/g;
$siteToFind = $seedRegionRevcomp . "A";
$familyID2site{$familyID} = $siteToFind;
# Reverse-complement to get the actual site we need
push @sitesToFind, $siteToFind;
}
# Make these unique, since multiple families can have the same 6mer-1a site
@sitesToFind = do { my %seen; grep { !$seen{$_}++ } @sitesToFind };
open (SEQUENCES, "$sequenceFile") || die "Cannot open $sequenceFile for reading: $!";
while (<SEQUENCES>)
{
# ENST00000000233 9713 -CAACCAGGGGCCG------G-CCCCTGCTGC
chomp;
($sequenceID, $species, $sequence) = split /\t/, $_;
# Remove all gaps
$sequence =~ s/-//g;
$sequence =~ s/\.//g;
# Convert any Ts to Us
$sequence =~ s/T/U/g;
$sequence =~ s/t/u/g;
$sequenceLength = length $sequence;
print SEQUENCE_LENGTHS_OUT "$sequenceID\t$species\t$sequenceLength\n";
%siteToPositions = ();
foreach $siteToFind (@sitesToFind)
{
$numThisSite = 0;
# Make sure this is case-insensitive (i)
while ($sequence =~ /$siteToFind/gi)
{
$start = $-[0];
$end = $+[0];
$numThisSite++;
}
$siteToCount{$sequenceID}{$species}{$siteToFind} = $numThisSite;
}
foreach $familyID (sort keys %familyID2site)
{
$siteToFind = $familyID2site{$familyID};
if ($siteToCount{$sequenceID}{$species}{$siteToFind})
{
print "$sequenceID\t$species\t$familyID\t$siteToCount{$sequenceID}{$species}{$siteToFind}\n";
}
}
}
print STDERR "\nAll done -- Also see $sequenceLengthsOutfile for sequence lengths.\n\n";
##########################################
sub reverse_complement
{
my $dna = shift;
# reverse the DNA sequence
my $revcomp = reverse($dna);
# complement the reversed DNA sequence
$revcomp =~ tr/ACGTUacgtu/TGCAAtgcaa/;
return $revcomp;
}