forked from ucscCancer/fpfilter-tool
-
Notifications
You must be signed in to change notification settings - Fork 0
/
fpfilter.pl
713 lines (575 loc) · 23.9 KB
/
fpfilter.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
#!/usr/bin/env perl
use warnings;
use strict;
use Getopt::Long;
use IO::File;
use File::Temp qw( tempdir );
use File::Spec;
use File::Basename qw( fileparse dirname);
use Cwd qw( abs_path cwd);
use Carp;
## Define filtering parameters ##
my $min_read_pos = 0.10;
my $min_var_freq = 0.05;
my $min_var_count = 4;
my $min_strandedness = 0.01;
my $max_strandedness = 1 - $min_strandedness;
my $max_mm_qualsum_diff = 50;
my $max_mapqual_diff = 30;
my $max_readlen_diff = 25;
my $min_var_dist_3 = 0.20;
my $max_var_mm_qualsum = 100;
## Parse arguments ##
my $output_basename;
my $vcf_file;
my $output_file;
my $bam_file;
my $bam_index;
my $sample;
my $bam_readcount_path = 'bam-readcount';
my $samtools_path = 'samtools';
my $ref_fasta;
my $help;
my $opt_result;
my @params = @ARGV;
$opt_result = GetOptions(
'vcf-file=s' => \$vcf_file,
'bam-file=s' => \$bam_file,
'bam-index=s' => \$bam_index,
'sample=s' => \$sample,
'bam-readcount=s' => \$bam_readcount_path,
'samtools=s' => \$samtools_path,
'reference=s' => \$ref_fasta,
'output=s' => \$output_file,
'min-read-pos=f' => \$min_read_pos,
'min-var-freq=f' => \$min_var_freq,
'min-var-count=f' => \$min_var_count,
'min-strandedness=f' => \$min_strandedness,
'max-mm-qualsum-diff=f' => \$max_mm_qualsum_diff,
'max-mapqual-diff=f' => \$max_mapqual_diff,
'max-readlen-diff=f' => \$max_readlen_diff,
'min-var-dist-3=f' => \$min_var_dist_3,
'max_var_mm_qualsum=f' => \$max_var_mm_qualsum,
'help' => \$help,
);
unless($opt_result) {
die help_text();
}
if($help) {
print STDOUT help_text();
exit 0;
}
unless($vcf_file) {
warn "You must provide a file to be filtered\n";
die help_text();
}
unless(-s $vcf_file) {
die "Can not find VCF file: $vcf_file\n";
}
unless($ref_fasta) {
warn "You must provide a reference fasta for generating readcounts to use for filtering\n";
die help_text();
}
unless(-s $ref_fasta) {
die "Can not find valid reference fasta: $ref_fasta\n";
}
unless($bam_file) {
die "You must provide a BAM file for generating readcounts\n";
die help_text();
}
unless(-s $bam_file) {
die "Can not find valid BAM file: $bam_file\n";
}
if($bam_index && !-s $bam_index) {
die "Can not find valid BAM index file: $bam_index\n";
}
unless($output_file) {
warn "You must provide an output file name: $output_file\n";
die help_text();
}
else {
$output_file = abs_path($output_file); #make sure we have full path as manipulating cwd below
}
unless($sample) {
warn "You must provide a sample name\n";
die help_text();
}
my %filters;
$filters{'position'} = [sprintf("PB%0.f",$min_read_pos*100), "Average position on read less than " . $min_read_pos . " fraction of the relative distance from the read start or read end"];
$filters{'strand_bias'} = [sprintf("SB%0.f",$min_strandedness*100), "Reads supporting the variant have less than " . $min_strandedness . " fraction of the reads on one strand, but reference supporting reads are not similarly biased"];
$filters{'min_var_count'} = [ "MVC".$min_var_count, "Less than " . $min_var_count . " high quality reads support the variant"];
$filters{'min_var_freq'} = [ sprintf("MVF%0.f",$min_var_freq*100), "Variant allele frequency is less than " . $min_var_freq];
$filters{'mmqs_diff'} = [ sprintf("MMQSD%d",$max_mm_qualsum_diff), "Difference in average mismatch quality sum between variant and reference supporting reads is greater than " . $max_mm_qualsum_diff];
$filters{'mq_diff'} = [ sprintf("MQD%d",$max_mapqual_diff), "Difference in average mapping quality sum between variant and reference supporting reads is greater than " . $max_mapqual_diff];
$filters{'read_length'} = [ sprintf("RLD%d",$max_readlen_diff), "Difference in average clipped read length between variant and reference supporting reads is greater than " . $max_readlen_diff];
$filters{'dist3'} = [ sprintf("DETP%0.f",$min_var_dist_3*100), "Average distance of the variant base to the effective 3' end is less than " . $min_var_dist_3];
$filters{'var_mmqs'} = [ sprintf("MMQS%d",$max_var_mm_qualsum), "The average mismatch quality sum of reads supporting the variant is greater than " . $max_var_mm_qualsum] if($max_var_mm_qualsum);
$filters{'no_var_readcount'} = [ "NRC", "Unable to grab readcounts for variant allele"];
$filters{'incomplete_readcount'} = [ "IRC", "Unable to grab any sort of readcount for either the reference or the variant allele"];
my $vcf_header;
my @vcf_lines;
my %rc_for_snp; # store info on snp positions and VCF line
my %rc_for_indel; #store info on indel positions and VCF line
my ($workdir, $working_fasta, $working_bam) = setup_workdir($ref_fasta, $bam_file, $bam_index);
my $starting_dir = cwd();
my $input = open_vcf_file($vcf_file) or die "Unable to open input file $vcf_file: $!\n";
$vcf_header = parse_vcf_header($input);
add_filters_to_vcf_header($vcf_header, values %filters);
add_process_log_to_header($vcf_header, $vcf_file, @params);
my $header_line = $vcf_header->[-1];
chomp $header_line;
my @header_fields = split "\t", $header_line;
while(my $entry = $input->getline) {
push @vcf_lines, $entry;
chomp $entry;
my %fields;
@fields{@header_fields} = split("\t", $entry);
my $filter_sample = $fields{$sample};
unless($filter_sample) {
die "Unable to find field for $sample\n";
}
my @sample_fields = split /:/, $filter_sample;
unless(@sample_fields) {
die "Unable to parse field for $sample\n";
}
my $index = 0;
my %format_keys = map { $_ => $sample_fields[$index++] } split /:/, $fields{FORMAT};
#these are in order ACGT
my @alleles = ($fields{REF}, split /,/, $fields{ALT});
my %gt_alleles = map {$_ => 1} grep { $_ > 0 } split /\//, $format_keys{GT};
my @used_alleles;
for my $allele_index (keys %gt_alleles) {
push @used_alleles, $alleles[$allele_index];
}
my ($var) = sort @used_alleles; #follow existing convention of fp filter using alphabetical order to choose a single base on triallelic sites
$var = q{} unless defined $var; #in the case where there is no variant allele, set this to the empty string. Later it will be filtered as NRC or IRC
$var = uc($var);
my $ref = uc($fields{REF});
my $chrom = $fields{'#CHROM'};
my $pos = $fields{POS};
if(length($ref) > 1 || length($var) > 1) {
#it's an indel or mnp
if( (length($ref) == length($var)) || (length($ref) > 1 && length($var) > 1) ) {
print STDERR 'Complex variant or MNP will be skipped: '. $chrom ."\t". $pos ."\t". $ref ."\t". $var ."\n";
next;
}
elsif(length($ref) > length($var)) {
#it's a deletion
$pos += 1;
($ref, $var) = ($var, $ref);
$ref = substr($var, 1, 1);
$var = "-" . substr($var, 1);
}
else {
#it's an insertion
substr($var, 0, 1, "+");
}
$rc_for_indel{$chrom}{$pos}{$ref}{$var} = \$vcf_lines[-1];
}
else {
#it's a SNP
$rc_for_snp{$chrom}{$pos}{$ref}{$var} = \$vcf_lines[-1];
}
}
if(%rc_for_snp) {
filter_sites_in_hash(\%rc_for_snp, $bam_readcount_path, $working_bam, $working_fasta, $workdir);
}
else {
print STDERR "No SNP sites identified\n";
}
if(%rc_for_indel) {
filter_sites_in_hash(\%rc_for_indel, $bam_readcount_path, $working_bam, $working_fasta, $workdir, '-i');
}
else {
print STDERR "No Indel sites identified\n";
}
## Open the output files ##
my $filtered_vcf = IO::File->new("$output_file","w") or die "Can't open output file $output_file: $!\n";
print $filtered_vcf @$vcf_header;
print $filtered_vcf @vcf_lines;
chdir $starting_dir or die "Unable to go back to starting dir\n";
exit(0);
################################################################################
=head3 read_counts_by_allele
Retrieve relevant read counts for a certain allele
=cut
sub read_counts_by_allele {
(my $line, my $allele) = @_;
my @lineContents = split(/\t/, $line);
my $numContents = @lineContents;
for(my $colCounter = 5; $colCounter < $numContents; $colCounter++) {
my $this_allele = $lineContents[$colCounter];
my @alleleContents = split(/\:/, $this_allele);
if($alleleContents[0] eq $allele) {
my $numAlleleContents = @alleleContents;
return("") if($numAlleleContents < 8);
my $return_string = "";
my $return_sum = 0;
for(my $printCounter = 1; $printCounter < $numAlleleContents; $printCounter++) {
$return_sum += $alleleContents[$printCounter];
$return_string .= "\t" if($return_string);
$return_string .= $alleleContents[$printCounter];
}
return($return_string);
}
}
return("");
}
sub help_text {
return <<HELP;
fpfilter - Filtering for Illumina Sequencing
SYNOPSIS
fpfilter [options] [file ...]
OPTIONS
--vcf-file the input VCF file. Must have a GT field.
--bam-file the BAM file of the sample you are filtering on. Typically the tumor.
--sample the sample name of the sample you want to filter on in the VCF file.
--reference a fasta containing the reference sequence the BAM file was aligned to.
--output the filename of the output VCF file
--min-read-pos minimum average relative distance from start/end of read
--min-var-freq minimum variant allele frequency
--min-var-count minimum number of variant-supporting reads
--min-strandedness minimum representation of variant allele on each strand
--max-mm-qualsum-diff maximum difference of mismatch quality sum between variant and reference reads (paralog filter)
--max_var_mm_qualsum maximum mismatch quality sum of reference-supporting reads
--max-mapqual-diff maximum difference of mapping quality between variant and reference reads
--max-readlen-diff maximum difference of average supporting read length between variant and reference reads (paralog filter)
--min-var-dist-3 minimum average distance to effective 3prime end of read (real end or Q2) for variant-supporting reads
--help this message
DESCRIPTION
This program will filter a VCF with a variety of filters as detailed in the VarScan2 paper (http://www.ncbi.nlm.nih.gov/pubmed/22300766). It requires the bam-readcount utility (https://github.com/genome/bam-readcount).
This filter was calibrated on 100bp PE Illumina reads. It is likely to be overly stringent for longer reads and may be less effective on shorter reads.
AUTHORS
Dan Koboldt Original code
Dave Larson Modifications for VCF and exportation.
HELP
}
### methods copied from elsewhere begin here...
sub parse_vcf_header {
my $input_fh = shift;
my @header;
my $header_end = 0;
while (!$header_end) {
my $line = $input_fh->getline;
if ($line =~ m/^##/) {
push @header, $line;
} elsif ($line =~ m/^#/) {
push @header, $line;
$header_end = 1;
} else {
die "Missed the final header line with the sample list? Last line examined: $line Header so far: " . join("\n", @header) . "\n";
}
}
return \@header;
}
sub generate_region_list {
my ($hash, $region_fh) = @_; #input_fh should be a filehandle to the VCF
print STDERR "Printing variants to temporary region_list file...\n";
for my $chr (keys %$hash) {
for my $pos (sort { $a <=> $b } keys %{$hash->{$chr}}) {
print $region_fh "$chr\t$pos\t$pos\n";
}
}
}
sub _simplify_indel_allele {
my ($ref, $var) = @_;
#these could be padded e.g. G, GT for a T insertion or GCC G for a 2bp deletion
#they could also be complex e.g. GCCCGT, GCGT for a 2bp deletion
#they could also represent an insertion deletion event e.g. GCCCGT GCGGGGT; these cannot be represented in genome bed. Throw an error or warn.
#
#I think the algorithm should be trim end (no updating of coords required)
#trim beginning and return number of bases trimmed
my @ref_array = map { uc } split //, $ref;
my @var_array = map { uc } split //, $var;
while(@ref_array and @var_array and $ref_array[-1] eq $var_array[-1]) {
pop @ref_array;
pop @var_array;
}
my $right_shift = 0;
while(@ref_array and @var_array and $ref_array[0] eq $var_array[0]) {
shift @ref_array;
shift @var_array;
$right_shift++;
}
return (join("",@ref_array), join("",@var_array), $right_shift);
}
sub filter_site {
my ($ref_result, $var_result) = @_;
#this will return a list of filter names
my @filter_names;
if($ref_result && $var_result) {
## Parse out the bam-readcounts details for each allele. The fields should be: ##
#num_reads : avg_mapqual : avg_basequal : avg_semq : reads_plus : reads_minus : avg_clip_read_pos : avg_mmqs : reads_q2 : avg_dist_to_q2 : avgRLclipped : avg_eff_3'_dist
my ($ref_count, $ref_map_qual, $ref_base_qual, $ref_semq, $ref_plus, $ref_minus, $ref_pos, $ref_subs, $ref_mmqs, $ref_q2_reads, $ref_q2_dist, $ref_avg_rl, $ref_dist_3) = split(/\t/, $ref_result);
my ($var_count, $var_map_qual, $var_base_qual, $var_semq, $var_plus, $var_minus, $var_pos, $var_subs, $var_mmqs, $var_q2_reads, $var_q2_dist, $var_avg_rl, $var_dist_3) = split(/\t/, $var_result);
my $ref_strandedness = my $var_strandedness = 0.50;
$ref_dist_3 = 0.5 if(!$ref_dist_3);
## Use conservative defaults if we can't get mismatch quality sums ##
$ref_mmqs = 50 if(!$ref_mmqs);
$var_mmqs = 0 if(!$var_mmqs);
my $mismatch_qualsum_diff = $var_mmqs - $ref_mmqs;
## Determine map qual diff ##
my $mapqual_diff = $ref_map_qual - $var_map_qual;
## Determine difference in average supporting read length ##
my $readlen_diff = $ref_avg_rl - $var_avg_rl;
## Determine ref strandedness ##
if(($ref_plus + $ref_minus) > 0) {
$ref_strandedness = $ref_plus / ($ref_plus + $ref_minus);
$ref_strandedness = sprintf("%.2f", $ref_strandedness);
}
## Determine var strandedness ##
if(($var_plus + $var_minus) > 0) {
$var_strandedness = $var_plus / ($var_plus + $var_minus);
$var_strandedness = sprintf("%.2f", $var_strandedness);
}
if($var_count && ($var_plus + $var_minus)) {
## We must obtain variant read counts to proceed ##
my $var_freq = $var_count / ($ref_count + $var_count);
## FAILURE 1: READ POSITION ##
if($var_pos < $min_read_pos) {
#$stats{'num_fail_pos'}++;
push @filter_names, $filters{'position'}->[0];
}
## FAILURE 2: Variant is strand-specific but reference is NOT strand-specific ##
if(($var_strandedness < $min_strandedness || $var_strandedness > $max_strandedness) && ($ref_strandedness >= $min_strandedness && $ref_strandedness <= $max_strandedness)) {
#$stats{'num_fail_strand'}++;
push @filter_names, $filters{'strand_bias'}->[0];
}
## FAILURE : Variant allele count does not meet minimum ##
if($var_count < $min_var_count) {
#$stats{'num_fail_varcount'}++;
push @filter_names, $filters{'min_var_count'}->[0];
}
## FAILURE : Variant allele frequency does not meet minimum ##
if($var_freq < $min_var_freq) {
#$stats{'num_fail_varfreq'}++;
push @filter_names, $filters{'min_var_freq'}->[0];
}
## FAILURE 3: Paralog filter for sites where variant allele mismatch-quality-sum is significantly higher than reference allele mmqs
if($mismatch_qualsum_diff> $max_mm_qualsum_diff) {
#$stats{'num_fail_mmqs'}++;
push @filter_names, $filters{'mmqs_diff'}->[0];
}
## FAILURE 4: Mapping quality difference exceeds allowable maximum ##
if($mapqual_diff > $max_mapqual_diff) {
#$stats{'num_fail_mapqual'}++;
push @filter_names, $filters{'mq_diff'}->[0];
}
## FAILURE 5: Read length difference exceeds allowable maximum ##
if($readlen_diff > $max_readlen_diff) {
#$stats{'num_fail_readlen'}++;
push @filter_names, $filters{'read_length'}->[0];
}
## FAILURE 5: Read length difference exceeds allowable maximum ##
if($var_dist_3 < $min_var_dist_3) {
#$stats{'num_fail_dist3'}++;
push @filter_names, $filters{'dist3'}->[0];
}
if($max_var_mm_qualsum && $var_mmqs > $max_var_mm_qualsum) {
#$stats{'num_fail_var_mmqs'}++;
push @filter_names, $filters{'var_mmqs'}->[0];
}
## SUCCESS: Pass Filter ##
if(@filter_names == 0) {
#$stats{'num_pass_filter'}++;
## Print output, and append strandedness information ##
@filter_names = ('PASS');
}
}
else {
push @filter_names, $filters{'no_var_readcount'}->[0];
}
}
else {
#$stats{'num_no_readcounts'}++;
#print $fail_fh "$line\tno_readcounts\n";
push @filter_names, $filters{'incomplete_readcount'}->[0];
}
return @filter_names;
}
sub add_filters_to_vcf_header {
my ($parsed_header, @filter_refs) = @_;
my $column_header = pop @$parsed_header;
for my $filter_ref (@filter_refs) {
my ($filter_name, $filter_description) = @$filter_ref;
my $filter_line = qq{##FILTER=<ID=$filter_name,Description="$filter_description">\n};
push @$parsed_header, $filter_line;
}
push @$parsed_header, $column_header;
}
sub add_process_log_to_header {
my ($parsed_header, $input, @params) = @_;
my $column_header = pop @$parsed_header;
my $param_string = join(" ", @params);
push @$parsed_header, qq{##vcfProcessLog=<InputVCF=<$input>, InputVCFSource=<fpfilter>, InputVCFVer=<6.0>, InputVCFParam=<"$param_string"> InputVCFgeneAnno=<.>>\n}, $column_header;
}
sub filter_sites_in_hash {
my ($hash, $bam_readcount_path, $bam_file, $ref_fasta, $working_dir, $optional_param) = @_;
#done parsing vcf
$optional_param ||= '';
my $list_name = File::Spec->catfile($working_dir, "regions.txt");
my $list_fh = IO::File->new($list_name,"w") or die "Unable to open file for coordinates\n";
generate_region_list($hash, $list_fh);
$list_fh->close();
## run bam-readcount
my $bam_readcount_cmd = "$bam_readcount_path -f $ref_fasta -l $list_name -w 0 -b 20 $optional_param $bam_file|";
my $rc_results = IO::File->new($bam_readcount_cmd) or die "Unable to open pipe to bam-readcount cmd: $bam_readcount_cmd\n";
while(my $rc_line = $rc_results->getline) {
chomp $rc_line;
my ($chrom, $position) = split(/\t/, $rc_line);
if($hash->{$chrom}{$position}) {
for my $ref (keys %{$hash->{$chrom}{$position}}) {
for my $var (keys %{$hash->{$chrom}{$position}{$ref}}) {
my $ref_result = read_counts_by_allele($rc_line, $ref);
my $var_result = read_counts_by_allele($rc_line, $var);
my @filters = filter_site($ref_result, $var_result);
my $vcf_line_ref = $hash->{$chrom}{$position}{$ref}{$var};
my @fields = split "\t", $$vcf_line_ref;
if($fields[6] eq '.' || $fields[6] eq 'PASS') {
$fields[6] = join(";", @filters);
}
else {
$fields[6] = join(";", $fields[6], @filters) if($filters[0] ne 'PASS');
}
$$vcf_line_ref = join("\t", @fields);
}
}
}
else {
die "Unknown site for rc\n";
}
}
unless($rc_results->close) {
die "Error running bam-readcount\n";
}
}
sub setup_workdir {
my ($reference, $bam_file, $bam_index) = @_;
$reference = abs_path($reference);
$bam_file = abs_path($bam_file);
$bam_index = abs_path($bam_index) if $bam_index;
my $dir = File::Temp->newdir('fpfilterXXXXX', TMPDIR => 1, CLEANUP => 1, DIR => './') or
die "Unable to create working directory\n";
#symlink in necessary files to run
my $working_reference = File::Spec->catfile($dir, "reference.fa");
symlink $reference, $working_reference;
my $fa_index = $reference . ".fai";
unless(-e $fa_index) {
index_fasta($working_reference);
}
else {
symlink $fa_index, File::Spec->catfile($dir, "reference.fa.fai");
}
my $working_bam = File::Spec->catfile($dir, "tumor.bam");
my $working_bam_index = File::Spec->catfile($dir, "tumor.bam.bai");
symlink $bam_file, $working_bam;
unless ($bam_index) {
my ($bam_filename,$bam_path,$bam_suffix) = File::Basename::fileparse($bam_file,qr/\.bam/);
$bam_index = File::Spec->catfile($bam_path,$bam_filename .'.bai');
unless (-e $bam_index) {
$bam_index = $bam_file .'.bai';
}
}
if (-e $bam_index) {
symlink $bam_index, $working_bam_index;
}
else {
index_bam($working_bam);
}
return ($dir, $working_reference, $working_bam);
}
sub index_fasta {
my ($fasta) = @_;
print STDERR "Indexing fasta...\n";
my @args = ($samtools_path, "faidx", $fasta);
system(@args) == 0
or die "Unable to index $fasta: $?\n";
}
sub index_bam {
my ($bam) = @_;
print STDERR "Indexing BAM...\n";
my @args = ($samtools_path, "index", $bam);
system(@args) == 0
or die "Unable to index $bam: $?\n";
}
sub validate_file_for_reading {
my $file = shift;
unless ( defined $file ) {
Carp::croak("Can't validate_file_for_reading: No file given");
}
if ($file eq '-') {
return 1;
}
unless (-e $file ) {
Carp::croak("File ($file) does not exist");
}
unless (-f $file) {
Carp::croak("File ($file) exists but is not a plain file");
}
unless ( -r $file ) {
Carp::croak("Do not have READ access to file ($file)");
}
return 1;
}
# Follows a symlink chain to reach the final file, accounting for relative symlinks along the way
sub follow_symlink {
my $file = shift;
# Follow the chain of symlinks
while (-l $file) {
my $original_file = $file;
$file = readlink($file);
# If the symlink was relative, repair that
unless (File::Spec->file_name_is_absolute($file)) {
my $path = dirname($original_file);
$file = join ("/", ($path, $file));
}
validate_file_for_reading($file);
}
return $file;
}
sub file_type {
my $file = shift;
validate_file_for_reading($file);
$file = follow_symlink($file);
my $result = `file -b $file`;
my @answer = split /\s+/, $result;
return $answer[0];
}
sub file_is_gzipped {
my $filename = shift;
my $file_type = file_type($filename);
#NOTE: debian bug #522441 - `file` can report gzip files as any of these....
if ($file_type eq "gzip" or $file_type eq "Sun" or $file_type eq "Minix" or $file_type eq 'GRand') {
return 1;
} else {
return 0;
}
}
sub open_gzip_file_for_reading {
my $file = shift;
validate_file_for_reading($file)
or return;
unless (file_is_gzipped($file)) {
Carp::croak("File ($file) is not a gzip file");
}
my $pipe = "zcat ".$file." |";
# _open_file throws its own exception if it doesn't work
return IO::File->new($pipe);
}
sub open_file_for_reading {
my $file = shift;
validate_file_for_reading($file)
or return;
# _open_file throws its own exception if it doesn't work
return IO::File->new($file, 'r');
}
sub open_vcf_file {
my $filename = shift;
my $fh;
if ( file_is_gzipped($filename) ) {
$fh = open_gzip_file_for_reading($filename);
} else {
$fh = open_file_for_reading($filename);
}
return $fh;
}