-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathLengthDistribution.pl
109 lines (103 loc) · 3.34 KB
/
LengthDistribution.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/usr/bin/perl
use strict;
use File::Basename;
my $usage = "Usage:\n\tperl genome_statistic.pl genomeFasta\n\n";
if (@ARGV==0) {die $usage}
foreach ( @ARGV ) {
my $name = basename( $_ );
print "\n==>>$name<<==\n\n";
my ($genome_size, $identifier, %scaffold_length, %scaffold_seq, $N_num, $G_num, $C_num, $T_num, $A_num);
open GENOME, '<',"$_" or die "Can't open file $_!($!)";
while ( <GENOME> ) {
chomp;
if ( m/^>(\S+)/ ) { $identifier = $1 }
else {
my $length = length $_;
$genome_size += $length;
$scaffold_length{$identifier} += $length;
$scaffold_seq{$identifier} .= $_;
$N_num += ($_ =~ tr/Nn/Nn/);
$G_num += ($_ =~ tr/Gg/Gg/);
$C_num += ($_ =~ tr/Cc/Cc/);
$T_num += ($_ =~ tr/Tt/Tt/);
$A_num += ($_ =~ tr/Aa/Aa/);
}
}
close GENOME;
my @contig_length;
foreach (keys %scaffold_seq) {
my $scaffold_seq = $scaffold_seq{$_};
my @contigs = split /N{10,}/, $scaffold_seq;
foreach (@contigs) {
my $length = length $_;
push @contig_length, $length
}
}
@contig_length = sort { $b <=> $a } @contig_length;
my @scaffold_length = values %scaffold_length;
@scaffold_length = sort { $b <=> $a } @scaffold_length;
my $longest_fragment = $scaffold_length[0];
my $shortest_fragment = $scaffold_length[$#scaffold_length];
my $sequence_number = @scaffold_length;
print "the genome scaffolds number is $sequence_number\nthe longest length is $longest_fragment\nthe shortest length is $shortest_fragment\n";
my $rate_of_N = $N_num / $genome_size;
my $rate_of_GC = ( $G_num + $C_num ) / ($G_num + $C_num + $T_num + $A_num);
print "the genome size is $genome_size\n";
print "the rate of N is $rate_of_N\n";
print "the rate of GC is $rate_of_GC\n";
my $genome_size50 = $genome_size * 0.5;
my $genome_size90 = $genome_size * 0.9;
my $aclength = 0;
foreach (@scaffold_length) {
$aclength += $_;
if ($aclength >= $genome_size50) {
print "the scaffold N50 is $_\n";
last
}
}
$aclength = 0;
foreach (@contig_length) {
$aclength += $_;
if ($aclength >= $genome_size50) {
print "the contig N50 is $_\n";
last
}
}
$aclength = 0;
foreach (@scaffold_length) {
$aclength += $_;
if ($aclength >= $genome_size90) {
print "the scaffold N90 is $_\n";
last
}
}
$aclength = 0;
foreach (@contig_length) {
$aclength += $_;
if ($aclength >= $genome_size90) {
print "the contig N90 is $_\n";
last
}
}
my ($length1000Num,$length2000Num,$length3000Num,$total_length3000,$total_length2000_3000,$total_length1000_2000,$total_length2000,$total_length1000,$length2000_3000Num,$length1000_2000Num);
foreach (@scaffold_length) {
if ($_ >= 3000) {
$length3000Num ++;
$total_length3000 += $_
}elsif ($_ >= 2000) {
$length2000_3000Num ++;
$total_length2000_3000 += $_
}elsif ($_ >= 1000) {
$length1000_2000Num ++;
$total_length1000_2000 += $_
}
$length2000Num = $length3000Num + $length2000_3000Num;
$length1000Num = $length2000Num + $length1000_2000Num;
$total_length2000 = $total_length3000 + $total_length2000_3000;
$total_length1000 = $total_length2000 + $total_length1000_2000;
}
print "the number of sequences >= 1kb is $length1000Num\ttotal length is $total_length1000\n";
print "the number of sequences >= 2kb is $length2000Num\ttotal length is $total_length2000\n";
print "the number of sequences >= 3kb is $length3000Num\ttotal length is $total_length3000\n";
print "\n";
}