-
Notifications
You must be signed in to change notification settings - Fork 1
/
filter.pl
executable file
·128 lines (115 loc) · 2.41 KB
/
filter.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#!/usr/bin/perl
use strict;
use warnings;
# Two positional arguments are required: the directory that holds the fusion
# tables and the sample name. They are consumed further down as $ARGV[0] and
# $ARGV[1], so a single argument is not enough to run.
die "Usage: perl filter.pl DIR SAMPLE\n" if (@ARGV < 2);
# Directory containing the filter lists read below.
my $database_dir="/diskmnt/Projects/Users/dcui/Projects/Fusion_hg38/FilterDatabase";
# Lookup tables. All but %large are used as sets (only key existence matters):
#   %black        entries of the "blacklist" file, checked against each fusion partner
#   %paralog      "A--B" pairs built from paralog_clusters.dat
#   %manual       entries of blacklist.manual
#   %noncancer    entries of noncancer_cell.txt
#   %tcga_normal  entries of tcga.normal.id.txt
#   %known        entries of tcga.published (minus two hard-coded exclusions)
#   %gtex         declared and checked by the filter, but never filled in this file
#   %large        fusion name -> largest value seen in column 7 of the input table
#   %tmp          fusion names whose best-scoring row has already been written
my (%black, %paralog, %manual, %noncancer, %tcga_normal, %known, %gtex, %large, %tmp);
# Load the blacklist: one entry per line, stored as keys of %black.
# An unreadable list aborts the run, because carrying on would silently
# disable this filter and let blacklisted entries through.
my $blacklist_path = "$database_dir/blacklist";
open(my $black_fh, '<', $blacklist_path) or die "Cannot open $blacklist_path: $!\n";
while (my $entry = <$black_fh>)
{
    chomp $entry;
    $black{$entry}="";
}
close($black_fh);
# Load paralog clusters: each line is a tab-separated list of names. Every
# ordered pair of distinct positions on a line is stored in %paralog as
# "A--B", so a pair is found whichever member is listed first in a fusion
# identifier.
my $paralog_path = "$database_dir/paralog_clusters.dat";
open(my $paralog_fh, '<', $paralog_path) or die "Cannot open $paralog_path: $!\n";
while (my $cluster = <$paralog_fh>)
{
    chomp $cluster;
    my @members = split(/\t/, $cluster);
    # A line with fewer than two fields yields no pairs.
    next if scalar @members < 2;
    for my $i (0 .. $#members)
    {
        for my $j (0 .. $#members)
        {
            next if $i == $j;
            my $pair = join("--", $members[$i], $members[$j]);
            $paralog{$pair} = "";
        }
    }
}
close($paralog_fh);
# Load the three exclusion lists that are matched against the whole fusion
# identifier (column 1 of the input table). Each file is read line by line
# and every line becomes a key of the corresponding hash. An unreadable list
# aborts the run instead of silently turning its filter off.
for my $list (
    [ "blacklist.manual",   \%manual ],
    [ "noncancer_cell.txt", \%noncancer ],
    [ "tcga.normal.id.txt", \%tcga_normal ],
)
{
    my ($file, $set) = @$list;
    my $path = "$database_dir/$file";
    open(my $list_fh, '<', $path) or die "Cannot open $path: $!\n";
    while (my $entry = <$list_fh>)
    {
        chomp $entry;
        $set->{$entry} = "";
    }
    close($list_fh);
}
#fusions should stay
# Load tcga.published into %known, one entry per line, skipping the two
# pairs excluded by name below.
# NOTE(review): %known is filled here but the filtering loop in this file
# never consults it, so these fusions are not actually protected from the
# filters. Confirm whether a "keep if known" check is missing.
my $known_path = "$database_dir/tcga.published";
open(my $known_fh, '<', $known_path) or die "Cannot open $known_path: $!\n";
while (my $fusion = <$known_fh>)
{
    chomp $fusion;
    next if $fusion eq "BMPR1B--PDLIM5";
    next if $fusion eq "ZC3H7A--BCAR4";
    $known{$fusion}="";
}
close($known_fh);
# First pass over the input table: record, for every fusion name (column 1),
# the largest value found in column 7. Rows whose column 7 is "NA" do not
# take part. The output file is also opened here and stays open on the
# bareword handle OUT for the writing stage that follows.
my $dir = $ARGV[0];
my $sample = $ARGV[1];
my $total_path = "$dir/Total_Fusions_in_$sample.tsv";
open(my $scan_fh, '<', $total_path) or die "Cannot open $total_path: $!\n";
# Discard the header line.
scalar <$scan_fh>;
my $filtered_path = "$dir/Filtered_Fusions_in_$sample.tsv";
# Three-argument open so that characters in $dir or $sample cannot be
# interpreted as part of the open mode.
open(OUT, '>', $filtered_path) or die "Cannot write $filtered_path: $!\n";
while (my $row = <$scan_fh>)
{
    chomp $row;
    my @line = split(/\t/, $row);
    my ($fusion, $score) = @line[0, 6];
    # Rows too short to carry column 7 contribute nothing to the maximum;
    # skipping them only avoids "uninitialized value" warnings.
    next if !defined $score;
    next if $score eq "NA";
    if (!exists $large{$fusion} || $large{$fusion} < $score)
    {
        $large{$fusion} = $score;
    }
}
close($scan_fh);
# Second pass over the input table: apply the filters and write the surviving
# rows to OUT (opened earlier in the script). Only columns 1-8 and 10 are
# written.
my $input_path = "$dir/Total_Fusions_in_$sample.tsv";
open(my $in_fh, '<', $input_path) or die "Cannot open $input_path: $!\n";
my $header = <$in_fh>;
defined $header or die "$input_path is empty (no header line)\n";
# Strip the line ending before splitting and add it back explicitly.
# Otherwise the header row is only terminated when column 10 happens to be
# the last column, and runs into the first data row in any other layout.
chomp $header;
my @h = split(/\t/, $header);
print OUT join("\t", @h[0..7], $h[9])."\n";
while (my $row = <$in_fh>)
{
    chomp $row;
    my @line = split(/\t/, $row);
    my $fusion = $line[0];
    my @genes = split(/\-\-/, $fusion);
    #1.fusion from same gene
    next if $genes[0] eq $genes[1];
    #2.blacklist (either partner listed)
    next if (exists $black{$genes[0]} || exists $black{$genes[1]});
    #3.fusion from paralog genes
    next if exists $paralog{$fusion};
    #4.non-cancer fusion
    next if exists $noncancer{$fusion};
    #5.gtex fusion
    # NOTE(review): %gtex is never populated in this file, so this check
    # cannot currently reject anything. Confirm whether a loader is missing.
    next if exists $gtex{$fusion};
    #6.tcga normal
    next if exists $tcga_normal{$fusion};
    #7.blacklist pairs
    next if exists $manual{$fusion};
    # NOTE(review): %known ("fusions should stay") is loaded earlier but is
    # not consulted by any of the filters above. Verify the intent.

    #8. only report breakpoint with the highest score
    # Rows without a score ("NA" in column 7) are always written.
    if ($line[6] eq "NA")
    {
        print OUT join("\t", @line[0..7], $line[9])."\n";
        next;
    }
    # Scored rows: keep only rows that reach the per-fusion maximum found in
    # the first pass, and write just the first such row for each fusion.
    next if $large{$fusion} != $line[6];
    if (!exists $tmp{$fusion})
    {
        print OUT join("\t", @line[0..7], $line[9])."\n";
    }
    $tmp{$fusion} = "";
}
close($in_fh);
# Buffered write errors (e.g. a full disk) only surface when the handle is closed.
close(OUT) or die "Error while writing $dir/Filtered_Fusions_in_$sample.tsv: $!\n";