-
Notifications
You must be signed in to change notification settings - Fork 0
/
select_one_replicate.pl
150 lines (126 loc) · 4.44 KB
/
select_one_replicate.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#!/usr/bin/env perl
# Given a table with multiple replicates from the same source, selects one replicate per
# source: the replicate with the smallest or largest numerical value (see option below) in
# the selected comparison column. In the event of a tie, selects the first appearing replicate.
# Usage:
# perl select_one_replicate.pl [tab-separated table]
# "[title of column containing source of each replicate (same value for every replicate from the same source)]"
# "[title of column to use to select replicate]"
# [0 to select replicate with smallest numerical value, 1 to select replicate with largest numerical value]
# Prints to console. To print to file, use
# perl select_one_replicate.pl [tab-separated table]
# "[title of column containing source of each replicate (same value for every replicate from the same source)]"
# "[title of column to use to select replicate]"
# [0 to select replicate with smallest numerical value, 1 to select replicate with largest numerical value]
# > [annotated output table path]
use strict;
use warnings;
# command-line arguments (see usage notes at top of file)
my $table = $ARGV[0]; # path to tab-separated input table
my $source_column_title = $ARGV[1]; # title of column naming the source of each replicate
my $comparison_column_title = $ARGV[2]; # title of column whose values decide which replicate is selected
my $option = $ARGV[3]; # 0 to select replicate with smallest numerical value, 1 to select replicate with largest numerical value

my $NEWLINE = "\n";
my $DELIMITER = "\t";
my $NO_DATA = ""; # not referenced in the code visible here; kept in case it is used elsewhere

# verifies that input file exists and is not empty
# (tests defined/length rather than truthiness so that a missing argument does not trigger
# an "uninitialized value" warning and a file literally named "0" is still accepted)
if(!defined $table or !length $table or !-e $table or -z $table)
{
    print STDERR "Error: table not provided, does not exist, or empty:\n\t"
        .(defined $table ? $table : "")."\nExiting.\n";
    die;
}

# verifies that both column titles were provided; without this check a missing title only
# shows up later as "uninitialized value" warnings while the header row is being scanned
if(!defined $source_column_title or !defined $comparison_column_title)
{
    print STDERR "Error: column titles not provided. Exiting.\n";
    die;
}

# verifies that option is exactly 0 or 1
# (a plain numeric comparison would treat a missing or non-numeric option as 0 and
# silently select the smallest value)
if(!defined $option or $option !~ /\A[01]\z/)
{
    print STDERR "Error: option not 0 or 1. Exiting.\n";
    die;
}
# First pass: reads in input table, recording the winning (smallest or largest, depending
# on $option) numerical comparison value seen for each source
my $first_line = 1; # 1 until the column-title row has been processed
my $source_column = -1; # index of column holding the source name; -1 until found
my $comparison_column = -1; # index of column holding the comparison value; -1 until found
my %source_to_winning_comparison_value = (); # key: source name -> value: winning value attached to replicate to print for this source

# three-argument open with low-precedence "or": with "||" the die was attached to the
# (always true) filename string, so a failed open was never reported
open(my $first_pass_table, "<", $table) or die "Could not open $table to read; terminating =(\n";
while(my $line = <$first_pass_table>) # for each row in the file
{
    chomp $line;
    next if $line !~ /\S/; # skips rows that are empty or only whitespace

    my @items_in_line = split($DELIMITER, $line, -1); # limit -1 keeps trailing empty fields
    if($first_line) # column titles
    {
        # identifies column to compare by and column containing source id
        # (two independent tests rather than if/elsif, so the same column may serve as
        # both source and comparison column; if a title is repeated, the last match wins)
        foreach my $column(0..$#items_in_line)
        {
            my $column_title = $items_in_line[$column];
            if($column_title eq $source_column_title)
            {
                $source_column = $column;
            }
            if($column_title eq $comparison_column_title)
            {
                $comparison_column = $column;
            }
        }

        # verifies that all columns have been found
        if($source_column == -1 or $comparison_column == -1)
        {
            print STDERR "Error: expected column titles not found. Exiting.\n";
            die;
        }

        # next line is not column titles
        $first_line = 0;
    }
    else # column values
    {
        # retrieves name of source and value to compare replicates by
        my $source = $items_in_line[$source_column];
        my $comparison_value = $items_in_line[$comparison_column];
        my $current_winner = $source_to_winning_comparison_value{$source};

        # saves comparison value if it is the first seen for this source or if it wins;
        # strict < and > mean a tie never replaces the value recorded earlier
        if(!defined $current_winner
            or ($option == 0 and $comparison_value < $current_winner)
            or ($option == 1 and $comparison_value > $current_winner))
        {
            $source_to_winning_comparison_value{$source} = $comparison_value;
        }
    }
}
close $first_pass_table;
# Second pass: reads in input table again, printing the column-title row followed by the
# row of only one selected replicate from each source
$first_line = 1;
my %source_printed = (); # key: source name -> value: 1 if a row has been printed for this source

# three-argument open with low-precedence "or": with "||" the die was attached to the
# (always true) filename string, so a failed open was never reported
open(my $second_pass_table, "<", $table) or die "Could not open $table to read; terminating =(\n";
while(my $line = <$second_pass_table>) # for each row in the file
{
    chomp $line;
    next if $line !~ /\S/; # skips rows that are empty or only whitespace

    if($first_line) # column titles
    {
        # prints column titles as they are
        print $line.$NEWLINE;

        # next line is not column titles
        $first_line = 0;
        next;
    }

    # column values: retrieves name of source and value to compare replicates by
    my @items_in_line = split($DELIMITER, $line, -1); # limit -1 keeps trailing empty fields
    my $source = $items_in_line[$source_column];
    my $comparison_value = $items_in_line[$comparison_column];

    # prints this row if it contains the winning comparison value for this source
    # and a replicate from this source has not been printed in a previous row
    # (so that, in the event of a tie, only the first appearing replicate is printed)
    if($comparison_value == $source_to_winning_comparison_value{$source}
        and !$source_printed{$source})
    {
        print $line.$NEWLINE;
        $source_printed{$source} = 1;
    }
}
close $second_pass_table;
# September 28, 2021