-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathclean_pairs_indexed.pl
executable file
·111 lines (92 loc) · 2.97 KB
/
clean_pairs_indexed.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#!/usr/bin/perl -w
use strict;
use File::Spec;
use Getopt::Long;
use lib '/home_stajichlab/robb/bin/';
use Bio::Index::Fastq;
##takes a sam file and splits it into separate files based on the
##targets sequences.
##$file_1 and _2 should be mate fq files
my $file_1;
my $file_2;
GetOptions(
'1|file_1:s' => \$file_1,
'2|file_2:s' => \$file_2,
'h|help' => \&getHelp,
);
sub getHelp () {
print "
This script takes 2 paired fastq files and matches mates and prints unPaired reads to STDOUT.
Please name the fastq files with .fq extention.
usage:
./cleanUp_Pairs.pl [-1 fastq file 1] [-2 fastq file 2][-h]
options:
-1 STR sam file 1 [required, no default]
-2 STR sam file 2 [required, no default]
-h this message
";
exit 1;
}
if ( !defined $file_1 and !defined $file_2 ) {
print "\n\nMust provide a two files containing paired reads\n\n";
&getHelp;
exit 1;
}
my $file_1_path = File::Spec->rel2abs($file_1);
my $file_2_path = File::Spec->rel2abs($file_2);
my $fq_1_inx = Bio::Index::Fastq->new('-filename' => "index.1", '-write_flag' => 1);
my $fq_2_inx = Bio::Index::Fastq->new('-filename' => "index.2", '-write_flag' => 1);
$fq_1_inx->make_index($file_1_path);
$fq_2_inx->make_index($file_2_path);
my $sample;
##sample_1.fq
##sampe_pair1.fq
if ($file_1 =~ /(\S+?)_?(\S+)\.(fq|fastq)/){
$sample = $1;
}
my %pairs;
my $out_seqIO_1 = Bio::SeqIO->new('-format' => 'Fastq','-filename' => "$file_1.matched");
my $out_seqIO_2 = Bio::SeqIO->new('-format' => 'Fastq','-filename' => "$file_2.matched");
my $out_seqIO_unMatched = Bio::SeqIO->new('-format' => 'Fastq','-fh' => \*STDOUT);
my @ids = $fq_1_inx->get_all_primary_ids;
print "len = scalar @ids\n";
foreach my $header ( @ids){
print "**$header\n";
my $header_to_store;
my $seqObj = $fq_1_inx->get_Seq_by_id($header);
if ($header =~ /(\S+)(\.r|\.f)/){
$header_to_store = $1;
}elsif($header =~ /(\S+)(\/1|\/2)/){
$header_to_store = $1;
}else {
$header_to_store = $header;
}
$pairs{$header_to_store} = $seqObj;
}
foreach my $header ( $fq_2_inx->get_all_ids){
my $header_to_store;
if ($header =~ /(\S+)(\.r|\.f)/){
$header_to_store = $1;
}elsif($header =~ /(\S+)(\/1|\/2)/){
$header_to_store = $1;
}else {
$header_to_store = $header;
}
my $seqObj_2 = $fq_2_inx->get_Seq_by_id($header_to_store);
my $seqObj_1 = $pairs{$header_to_store};
if ( exists $pairs{$header_to_store} ) {
$out_seqIO_1->write_seq($seqObj_1);
$out_seqIO_2->write_seq($seqObj_2);
#remove the found keys, so that only the unparied
#from the first file are in hash at end of loop
delete $pairs{$header_to_store};
}
else { #print any seq in file2 that was not in file one
$out_seqIO_unMatched->write_seq($seqObj_2);
}
}
#will print any remaing and unpair seqs from the first file into NOMATCH
foreach my $header ( sort keys %pairs ) {
my $seqObj = $pairs{$header};
$out_seqIO_unMatched->write_seq($seqObj);
}