-
Notifications
You must be signed in to change notification settings - Fork 1
/
tssdedupe.pl
50 lines (41 loc) · 1.65 KB
/
tssdedupe.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
#!/usr/bin/perl
# script to deduplicate, filter and merge TSS from the raw TSS database
# (C) Yuri Kravatsky, [email protected]
use strict;
use warnings;

# Usage: tssdedupe TSSfile.txt
#
# Reads a tab-separated TSS table whose columns are, in order:
#   chromosome, coordinate, strand ("chain"), gene name
# e.g.  NC_027893SS <TAB> 77407 <TAB> - <TAB> NOC2L_1
#
# Output goes to "<input without its extension>.dedupe":
#   * comment lines (optional leading whitespace, then '#') are copied first,
#     in the order they were read;
#   * records sharing the same chromosome/coordinate/strand are collapsed into
#     one record whose gene name is the '-'-joined list of the merged names;
#   * records whose upper-cased chromosome name matches the filter regex below
#     are dropped.
# Each chromosome name is also echoed to STDOUT as a progress indicator.

my $ARGC = scalar @ARGV;
if ($ARGC != 1) { die "Usage: tssdedupe TSSfile.txt\n"; }
my $infile = $ARGV[0];

# Strip only a trailing ".ext" of the last path component. A name without any
# extension is kept whole, and dots inside directory names (or a leading dot
# of a hidden file) are not treated as the extension separator.
(my $basename = $infile) =~ s{(?<=[^/\\])\.[^./\\]*\z}{};
my $outfile = $basename . ".dedupe";

# Opening the output truncates it, so refuse to run when it is the input.
if ($outfile eq $infile) {
    die "Refusing to overwrite input file \"$infile\" with its own output\n";
}

# Three-argument open with lexical handles: the file name can never be
# misread as an open mode or a pipe.
open(my $in_fh,  '<', $infile)  or die "Can't open \"$infile\" for reading: $!";
open(my $out_fh, '>', $outfile) or die "Can't create \"$outfile\": $!";

# $TSScoord{chromosome}{coordinate}{strand} = merged gene name
my %TSScoord;

LINE:
while (my $line = <$in_fh>) {
    $line =~ s/\r?\n\z//;                 # accept both LF and CRLF line endings
    next LINE if length($line) < 5;       # blank / too short to be a record

    # commented strings written in the same order
    if ($line =~ m/^(\s+)?#/) {
        print {$out_fh} "$line\n";
        next LINE;
    }

    my ($chr, $coord, $chain, $gene) = split(/\t/, $line);
    if (!defined $chain) {
        warn "Skipping malformed line $. of \"$infile\": "
           . "expected at least 3 tab-separated fields\n";
        next LINE;
    }
    $gene = '' unless defined $gene;      # a missing name is kept as empty

    # Chromosome filter is applied case-insensitively via the upper-cased copy.
    my $currchr = uc($chr);
    next LINE if $currchr =~ m/_|-|CHRG|CHRH|CHRM|CHRKI/;

    # $gene =~ s/\.\d+//g;

    # Merge with whatever name is already stored for this exact position.
    # NOTE(review): a repeated identical name yields "A-A"; confirm whether
    # exact duplicates should be collapsed instead.
    my $genename = $TSScoord{$chr}{$coord}{$chain};
    if (defined($genename) && $genename ne '') {
        $gene = $genename . "-" . $gene;
    }
    $TSScoord{$chr}{$coord}{$chain} = $gene;
}
close($in_fh);

# Comparator for coordinates: numeric first, string comparison as tie-break so
# non-numeric keys still get a stable, warning-free position.
sub by_coordinate {
    no warnings 'numeric';
    return ($a <=> $b) || ($a cmp $b);
}

# Sorted traversal makes the output reproducible between runs (plain hash
# iteration order is randomised by perl).
foreach my $chr (sort keys %TSScoord) {
    print "$chr:\n";
    foreach my $coord (sort by_coordinate keys %{ $TSScoord{$chr} }) {
        # print "\t$coord\n";
        foreach my $chain (sort keys %{ $TSScoord{$chr}{$coord} }) {
            my $genename = $TSScoord{$chr}{$coord}{$chain};
            print {$out_fh} "$chr\t$coord\t$chain\t$genename\n";
        }
    }
}

# Buffered write errors (e.g. disk full) only surface when the handle is closed.
close($out_fh) or die "Can't finish writing \"$outfile\": $!";