-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsplit_fasta_into_n_files.pl
120 lines (96 loc) · 3.21 KB
/
split_fasta_into_n_files.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/usr/bin/env perl
# Splits fasta file with multiple sequences up into a number of smaller files, each with
# about the same number of sequences.
# Usage:
# perl split_fasta_into_n_files.pl [fasta file path] [number output files to generate]
# New files are created at filepath of old file with "_1_of_[n].fasta", "_2_of_[n].fasta",
# etc. appended to the end. Files already at those paths will be overwritten.
use strict;
use warnings;
# Command-line arguments: path of the fasta file to split, and the number of output
# files to generate from it.
my $fasta_file = $ARGV[0];
my $number_files = $ARGV[1];
my $OVERWRITE = 1; # set to 0 to prevent overwriting (stop script rather than overwrite)

# verifies that an input fasta file was provided, exists, and is not empty
# (tested with defined/empty-string rather than truthiness so that a file literally
# named "0" is still accepted)
if(!defined $fasta_file or $fasta_file eq "")
{
    print STDERR "Error: no input fasta file provided. Exiting.\n";
    die;
}
if(!-e $fasta_file)
{
    print STDERR "Error: input fasta file does not exist:\n\t".$fasta_file."\nExiting.\n";
    die;
}
if(-z $fasta_file)
{
    print STDERR "Error: input fasta file is empty:\n\t".$fasta_file."\nExiting.\n";
    die;
}

# sanity check input number of files
# A value that is not a plain whole number (e.g. "abc" or "2.5") is rejected up front:
# it would otherwise trigger a numeric-comparison warning or end up verbatim in the
# "_of_[n]" part of the output file names.
if(defined $number_files and $number_files ne "" and $number_files !~ /\A[0-9]+\z/)
{
    print STDERR "Error: number of output files must be a whole number:\n\t".$number_files
        ."\nExiting.\n";
    die;
}
if(!$number_files or $number_files < 2)
{
    print STDERR "Fewer than 2 output files requested. My services are not needed here.\n";
    die;
}
$number_files += 0; # normalizes input such as "03" to 3 before it is used in file names
# reads in input fasta file and counts total number of sequences
# (a sequence is counted for every line that starts with ">", i.e. every header line)
my $number_sequences = 0;

# "or" rather than "||": "||" would bind to the file name argument instead of to the
# result of open, so a failed open would never reach the die.
open(my $count_fh, '<', $fasta_file)
    or die "Could not open $fasta_file to read ($!); terminating =(\n";
while(my $line = <$count_fh>) # for each line in the file
{
    if($line =~ /^>/) # header line
    {
        $number_sequences++;
    }
}
close $count_fh;

# with fewer than 2 sequences there is nothing to split
if($number_sequences < 2)
{
    print STDERR "Fewer than 2 sequences in input file. My services are not needed here.\n";
    die;
}
# splits sequences in fasta file into a number of smaller files
#
# The sequence with 0-based position k in the input is written to output file number
#     int(k * files / sequences) + 1
# where "files" is the requested number of files, capped at the number of sequences.
# For a whole-number $number_files this spreads the sequences as evenly as possible and
# uses every file number from 1 up to that cap, with no gaps. (Comparing a per-file
# count against the fractional value sequences/files could instead produce fewer files
# than requested, e.g. 5 sequences into 4 files gave only 3 files.)
my $files_to_write = $number_files < $number_sequences ? $number_files : $number_sequences;
my $current_output_file_number = 0; # the number added to the end of the filepath of the current output file
my $sequences_seen = 0; # number of header lines read so far
my $current_output_file; # path of the output file currently being written
my $out_fh; # handle of the output file currently being written; undef until the first header
my $warned_about_leading_lines = 0;

# "or" rather than "||": "||" would bind to the file name argument instead of to the
# result of open, so a failed open would never reach the die.
open(my $in_fh, '<', $fasta_file)
    or die "Could not open $fasta_file to read ($!); terminating =(\n";
while(my $line = <$in_fh>) # for each line in the file
{
    chomp $line;
    if($line =~ /^>/) # header line
    {
        my $target_file_number = int($sequences_seen * $files_to_write / $number_sequences) + 1;
        $sequences_seen++;

        if($target_file_number != $current_output_file_number)
        {
            # closes current output file, if one has been opened; buffered write
            # errors only surface at close, so the result is checked
            if(defined $out_fh)
            {
                close($out_fh)
                    or die "Could not finish writing $current_output_file ($!); terminating =(\n";
            }

            # opens the next output file
            $current_output_file_number = $target_file_number;
            $current_output_file = $fasta_file."_".$current_output_file_number."_of_".$number_files.".fasta";
            if(-e $current_output_file)
            {
                print STDERR "Warning: output file already exists. Overwriting:\n\t".$current_output_file."\n";
                die_if_overwrite_not_allowed(); # stops the script here if $OVERWRITE is 0
            }
            open(my $next_fh, '>', $current_output_file)
                or die "Could not open $current_output_file to write ($!); terminating =(\n";
            $out_fh = $next_fh;
        }
    }

    # lines that appear before the first header belong to no sequence and there is no
    # output file open for them yet; they are skipped with a single warning
    if(!defined $out_fh)
    {
        if(!$warned_about_leading_lines)
        {
            print STDERR "Warning: skipping lines found before the first header line in:\n\t"
                .$fasta_file."\n";
            $warned_about_leading_lines = 1;
        }
        next;
    }

    print $out_fh $line;
    print $out_fh "\n";
}
close $in_fh;
if(defined $out_fh)
{
    close($out_fh)
        or die "Could not finish writing $current_output_file ($!); terminating =(\n";
}
# Guard called just before an existing output file would be overwritten.
# Does nothing when $OVERWRITE is set to a true value; when $OVERWRITE is 0, prints an
# error to STDERR and stops the script.
sub die_if_overwrite_not_allowed
{
    # overwriting is permitted: nothing to do
    return if $OVERWRITE;

    print STDERR "Error: exiting to avoid overwriting. Set \$OVERWRITE to 1 to allow overwriting.\n";
    die;
}
# May 27, 2020
# July 12, 2021