-
Notifications
You must be signed in to change notification settings - Fork 0
/
split_fasta_n_sequences_per_file.pl
132 lines (107 loc) · 3.48 KB
/
split_fasta_n_sequences_per_file.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#!/usr/bin/env perl
# Splits fasta file with multiple sequences up into multiple files, with a set number of
# fasta sequences per file.
# Usage:
# perl split_fasta_n_sequences_per_file.pl [fasta file path] [number sequences per file]
# New files are created at filepath of old file with "_1.fasta", "_2.fasta", etc. appended
# to the end. Files already at those paths will be overwritten.
use strict;
use warnings;
# Command-line arguments: the path of the fasta file to split, followed by the number of
# sequences to place in each output file.
my ($fasta_file, $number_sequences_per_file) = @ARGV;

# Overwrite switch: 1 lets existing output files be replaced; set to 0 to make the script
# stop rather than overwrite.
my $OVERWRITE = 1;

# The input path must be provided, must exist, and must not point at an empty file.
# Every branch terminates the script, so at most one of these messages is ever printed.
if(!$fasta_file)
{
	print STDERR "Error: no input fasta file provided. Exiting.\n";
	die;
}
elsif(!-e $fasta_file)
{
	print STDERR "Error: input fasta file does not exist:\n\t", $fasta_file, "\nExiting.\n";
	die;
}
elsif(-z $fasta_file)
{
	print STDERR "Error: input fasta file is empty:\n\t", $fasta_file, "\nExiting.\n";
	die;
}

# The requested number of sequences per file must be present and at least 1.
if(!$number_sequences_per_file || $number_sequences_per_file < 1)
{
	print STDERR "Error: Fewer than 1 sequences per file requested. Exiting.\n";
	die;
}
# First pass: reads the start of the input fasta file and counts header lines (lines
# beginning with ">") to verify that there are enough sequences to make splitting
# worthwhile.
my $number_sequences = 0;

# Three-argument open with a lexical handle, checked with low-precedence "or".
# (Writing `open ..., "<$file" || die ...` attaches the die to the filename string, which
# is always true, so a failed open would go unnoticed.)
open(my $count_fh, '<', $fasta_file)
	or die "Could not open $fasta_file to read: $!; terminating =(\n";
while(my $line = <$count_fh>) # for each line in the file
{
	next if $line !~ /^>/; # only header lines are counted

	$number_sequences++;

	# to avoid reading large files twice, stops reading once we have verified that
	# we have enough sequences
	last if($number_sequences >= 2 && $number_sequences >= $number_sequences_per_file);
}
close $count_fh;

if($number_sequences < 2)
{
	print STDERR "Fewer than 2 sequences in input file. My services are not needed here.\n";
	die;
}

# sanity check: the whole input fits within a single output file
# NOTE(review): because counting stops as soon as the count reaches
# $number_sequences_per_file, an input holding exactly that many sequences passes this
# check even though its single output file would contain every sequence of the input --
# confirm whether that is intended.
if($number_sequences < $number_sequences_per_file)
{
	print STDERR "Output would be identical to input. My services are not needed here.\n";
	die;
}
# Second pass: splits the sequences in the fasta file into a number of smaller files.
# Output files are named by appending "_<number>.fasta" to the input path, numbered from 1.
# A new output file is started at the first header line and again whenever the current
# output file already holds $number_sequences_per_file sequences.
my $current_output_file_number = 0; # the number added to the end of the filepath of the current output file
my $sequences_in_current_output_file = 0; # number of sequences we have printed to the current output file
my $current_output_file; # path of the output file currently being written (used in error messages)
my $out_fh; # handle of the output file currently being written; undef until the first header line

# Three-argument open with lexical handles, checked with low-precedence "or" so that a
# failed open actually stops the script.
open(my $in_fh, '<', $fasta_file)
	or die "Could not open $fasta_file to read: $!; terminating =(\n";
while(my $line = <$in_fh>) # for each line in the file
{
	chomp $line;
	if($line =~ /^>/) # header line
	{
		if(!$current_output_file_number or $sequences_in_current_output_file >= $number_sequences_per_file)
		{
			$current_output_file_number++;
			$sequences_in_current_output_file = 0;

			# closes current output file, if there is one; buffered write errors
			# surface at close, so the result is checked
			if(defined $out_fh)
			{
				close($out_fh)
					or die "Could not finish writing $current_output_file: $!; terminating =(\n";
				$out_fh = undef;
			}

			# opens the next output file
			$current_output_file = $fasta_file."_".$current_output_file_number.".fasta";
			if(-e $current_output_file)
			{
				print STDERR "Warning: output file already exists. Overwriting:\n\t".$current_output_file."\n";
				die_if_overwrite_not_allowed();
			}
			open($out_fh, '>', $current_output_file)
				or die "Could not open $current_output_file to write: $!; terminating =(\n";
		}
		$sequences_in_current_output_file++;
	}

	# lines that appear before the first header belong to no sequence and no output
	# file is open for them yet, so they are skipped
	next if !defined $out_fh;

	print {$out_fh} $line, "\n";
}
close $in_fh;

# closes the last output file, checking that everything was written
if(defined $out_fh)
{
	close($out_fh)
		or die "Could not finish writing $current_output_file: $!; terminating =(\n";
}
# Stops the script with an error message when overwriting is not allowed (that is, when
# $OVERWRITE is 0 or otherwise false). Does nothing when overwriting is allowed.
sub die_if_overwrite_not_allowed
{
	# overwriting permitted: nothing to do
	return if $OVERWRITE;

	print STDERR "Error: exiting to avoid overwriting. Set \$OVERWRITE to 1 to allow ",
		"overwriting.\n";
	die;
}
# May 27, 2020
# July 12, 2021