"""Evaluating collected algorithms predictions with respect to the
ground truth labels."""
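
# Example invocation (a sketch; the env-var values and placeholder paths below
# are illustrative, not files shipped with the repository):
#
#   DATASET_TAGS_PATH=/path/to/dataset_tags.tsv PROTEOMES_DIR=/path/to/proteomes \
#       python evaluate.py <output_dir> <data_dir> --results_dir results/
#
# <output_dir> holds the per-algorithm prediction CSV files, <data_dir> holds
# `labels.csv` with the ground-truth sequences.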
import argparse
import os
import re
from functools import partial

import numpy as np
import pandas as pd
import plotly.graph_objects as go
from pyteomics import mgf, fasta
from pyteomics.mass.unimod import Unimod
from sklearn.metrics import auc
from tqdm import tqdm

from ground_truth_mapper import format_sequence as format_sequence_GT
from metrics import aa_match_metrics, aa_match_batch
from token_masses import AA_MASSES

DATASET_TAGS_PATH = os.environ['DATASET_TAGS_PATH']
PROTEOMES_DIR = os.environ['PROTEOMES_DIR']

UNIMOD_DB = Unimod()
ptm_masses = {}

def _transform_match_ptm(match: re.Match) -> str:
    """
    Replace a matched `[UNIMOD:X]` annotation with the PTM's signed
    monoisotopic delta mass, fetched from Unimod and cached in `ptm_masses`.
    """
    ptm_idx = int(match.group(1))
    if ptm_idx not in ptm_masses:
        # Fetch and cache the monoisotopic delta mass on first use of each PTM
        ptm_masses[ptm_idx] = UNIMOD_DB.get(ptm_idx).monoisotopic_mass
        print(ptm_masses)
    ptm_mass = str(ptm_masses[ptm_idx])
    if not ptm_mass.startswith("-"):
        ptm_mass = "+" + ptm_mass
    return f"[{ptm_mass}]"

def ptms_to_delta_mass(sequence):
    """Rewrite `[UNIMOD:X]` PTM annotations as delta-mass annotations."""
    PTM_PATTERN = r"\[UNIMOD:([0-9]+)\]"
    sequence = re.sub(PTM_PATTERN, _transform_match_ptm, sequence)
    return sequence
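
# Illustrative example (assumes Unimod accession 21, phosphorylation,
# monoisotopic mass ~+79.966 Da):
#   ptms_to_delta_mass("AAS[UNIMOD:21]K") -> "AAS[+79.966331]K"
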
def parse_scores(aa_scores: str) -> list[float]:
    """
    Parse per-amino-acid confidence scores from a comma-separated string.

    Assumes that AA confidence scores always come as a string of
    float numbers separated by commas.
    """
    aa_scores = aa_scores.split(",")
    aa_scores = list(map(float, aa_scores))
    return aa_scores
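
# Example: per-AA confidence scores arrive as a comma-separated string, e.g.
#   parse_scores("0.12,0.94,0.88") -> [0.12, 0.94, 0.88]
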
def remove_ptms(sequence, ptm_pattern="[^A-Z]"):
    """Strip all non-amino-acid characters (PTM annotations) from a sequence."""
    return re.sub(ptm_pattern, "", sequence)


def isoleucine_to_leucine(sequence):
    """Map isoleucine to leucine, since the two are indistinguishable by mass."""
    return sequence.replace("I", "L")

def read_fasta(filename):
    """Read a FASTA file into a {description: sequence} dict, with I mapped to L."""
    sequences = {}
    with fasta.read(filename) as fasta_file:
        for record in fasta_file:
            # Each entry is a (description, sequence) tuple
            description, sequence = record
            sequence = isoleucine_to_leucine(sequence)
            sequences[description] = sequence
    return sequences

def find_match_in_proteome(peptide_seq, proteome):
    """Return True if the peptide is a substring of any protein in the proteome dict."""
    for ref_id, ref_seq in proteome.items():
        if peptide_seq in ref_seq:
            return True
    return False

parser = argparse.ArgumentParser()
parser.add_argument(
    "output_dir",
    help="""
    The path to the directory containing algorithm predictions
    stored in `algorithm_outputs.csv` files.
    """,
)
parser.add_argument(
    "data_dir", help="The path to the input data with ground truth labels."
)
parser.add_argument(
    "--results_dir",
    default="results/",
    help="The path to save evaluation results (default: 'results/').",
)
args = parser.parse_args()
# Define dataset name and path to store evaluation results
dataset_name = os.path.basename(os.path.normpath(args.output_dir))
print(f"Evaluating results for {dataset_name}.")
labels_path = os.path.join(args.data_dir, "labels.csv")
sequences_true = pd.read_csv(labels_path)
sequences_true["seq"] = sequences_true["seq"].apply(format_sequence_GT)
# Get database_path from the dataset tags (proteome column, keyed by dataset_name)
tags_df = pd.read_csv(DATASET_TAGS_PATH, sep='\t')
tags_df = tags_df.set_index("dataset")
database_path = tags_df.loc[dataset_name, "proteome"]
proteome = read_fasta(os.path.join(PROTEOMES_DIR, database_path))
print(f"Reference proteome length: {len(proteome)} proteins.")
proteome = "|".join(list(proteome.values()))
# Load each algorithm's predictions and match them to the ground truth by spectrum_id
PLOT_N_POINTS = 10000
PLOT_HEIGHT = 400
PLOT_WIDTH = int(PLOT_HEIGHT * 1.2)
layout = go.Layout(
    height=PLOT_HEIGHT,
    width=PLOT_WIDTH,
    title_x=0.5,
    margin_t=50,
    xaxis_title="Coverage",
    yaxis_title="Precision",
    xaxis_range=[0, 1],
    yaxis_range=[0, 1],
    legend=dict(
        y=0.01,
        x=0.01,
    ),
)
prot_match_fig = go.Figure(layout=layout)
prot_match_fig.update_layout(
    title_text="<b>Number of proteome matches vs. score</b>",
    xaxis_title="Score threshold",
    yaxis_title="Number of matches",
    yaxis_range=None,
)
pep_fig = go.Figure(layout=layout)
pep_fig.update_layout(title_text="<b>Peptide precision & coverage</b>")
aa_fig = go.Figure(layout=layout)
aa_fig.update_layout(title_text="<b>AA precision & coverage</b>")
output_metrics = {}
for output_file in os.listdir(args.output_dir):
    algo_name = output_file.split("_")[0]
    output_path = os.path.join(args.output_dir, output_file)
    output_data = pd.read_csv(output_path)

    use_cols = ["sequence", "score", "aa_scores", "spectrum_id"]
    output_data = pd.merge(
        sequences_true,
        output_data[use_cols],
        on="spectrum_id",
        how="outer",
    )
    output_data = output_data.rename({"seq": "sequence_true"}, axis=1)
    # Calculate metrics
    output_data = output_data.sort_values("score", ascending=False)
    sequenced_idx = output_data["sequence"].notnull()  # TODO: indicate number of not sequenced peptides?
    labeled_idx = output_data["sequence_true"].notnull()

    output_data.loc[sequenced_idx, "sequence"] = output_data.loc[sequenced_idx, "sequence"].apply(
        ptms_to_delta_mass
    )
    output_data["sequence_no_ptm"] = np.nan
    output_data.loc[sequenced_idx, "sequence_no_ptm"] = output_data.loc[sequenced_idx, "sequence"].apply(
        partial(remove_ptms, ptm_pattern='[^A-Z]')
    )
    matches = [
        (isinstance(seq, str)) and (isoleucine_to_leucine(seq) in proteome)  # TODO: move to one place with remove_ptms?
        for seq in tqdm(output_data["sequence_no_ptm"].tolist())
    ]
    output_data["proteome_match"] = matches
    aa_matches_batch, n_aa1, n_aa2 = aa_match_batch(
        output_data["sequence"][sequenced_idx & labeled_idx],
        output_data["sequence_true"][sequenced_idx & labeled_idx],
        AA_MASSES,
    )

    # Collect metrics
    aa_precision, aa_recall, pep_precision = aa_match_metrics(
        aa_matches_batch, n_aa1, n_aa2
    )
    output_metrics[algo_name] = {
        "N sequences": sequenced_idx.size,
        "N predicted": sequenced_idx.sum(),
        "AA precision": aa_precision,
        "AA recall": aa_recall,
        "Pep precision": pep_precision,
        # Fraction of predicted peptides found in the reference proteome
        "Proteome match fraction": output_data["proteome_match"][sequenced_idx].mean(),
    }
    # Plot the number of proteome matches vs. score threshold
    prot_matches = output_data["proteome_match"][sequenced_idx].values
    # Predictions are sorted by score (descending), so the cumulative sum gives
    # the number of proteome matches among all predictions at or above each score
    n_matches = np.cumsum(prot_matches)
    pep_score = output_data["score"][sequenced_idx].values
    plot_idxs = np.linspace(0, len(pep_score) - 1, PLOT_N_POINTS).astype(
        np.int64
    )
    prot_match_fig.add_trace(
        go.Scatter(
            x=pep_score[plot_idxs],
            y=n_matches[plot_idxs],
            mode="lines",
            name=f"{algo_name}",
        )
    )
# Plot the peptide precision–coverage curve
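    # Predictions are already sorted by score (descending), so the running mean
    # of peptide matches at rank i is the precision when keeping only the top i
    # predictions; coverage is the fraction of evaluated predictions retained.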
    pep_matches = np.array([aa_match[1] for aa_match in aa_matches_batch])
    precision = np.cumsum(pep_matches) / np.arange(1, len(pep_matches) + 1)
    coverage = np.arange(1, len(pep_matches) + 1) / len(pep_matches)
    plot_idxs = np.linspace(0, len(coverage) - 1, PLOT_N_POINTS).astype(
        np.int64
    )
    pep_fig.add_trace(
        go.Scatter(
            x=coverage[plot_idxs],
            y=precision[plot_idxs],
            mode="lines",
            name=f"{algo_name} AUC = {auc(coverage, precision):.3f}",
        )
    )
# Plot the amino acid precision–coverage curve
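    # Flatten the per-AA match indicators across all peptides, order them by the
    # per-AA confidence scores (descending), and build the running precision and
    # coverage over individual amino acids.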
    aa_scores = np.concatenate(
        list(
            map(
                parse_scores,
                output_data["aa_scores"][sequenced_idx & labeled_idx].values.tolist(),
            )
        )
    )
    sort_idx = np.argsort(aa_scores)[::-1]
    aa_matches_pred = np.concatenate(
        [aa_match[2][0] for aa_match in aa_matches_batch]
    )
    precision = np.cumsum(aa_matches_pred[sort_idx]) / np.arange(
        1, len(aa_matches_pred) + 1
    )
    coverage = np.arange(1, len(aa_matches_pred) + 1) / len(aa_matches_pred)
    plot_idxs = np.linspace(0, len(coverage) - 1, PLOT_N_POINTS).astype(
        np.int64
    )
    aa_fig.add_trace(
        go.Scatter(
            x=coverage[plot_idxs],
            y=precision[plot_idxs],
            mode="lines",
            name=f"{algo_name} AUC = {auc(coverage, precision):.3f}",
        )
    )

# Save results
dataset_results_dir = os.path.join(args.results_dir, dataset_name)
os.makedirs(dataset_results_dir, exist_ok=True)
prot_match_fig.write_html(
    os.path.join(dataset_results_dir, "number_of_proteome_matches.html")
)
pep_fig.write_html(
    os.path.join(dataset_results_dir, "peptide_precision_coverage.html")
)
aa_fig.write_html(
    os.path.join(dataset_results_dir, "AA_precision_coverage.html")
)
output_metrics = pd.DataFrame(output_metrics).T
output_metrics.to_csv(os.path.join(dataset_results_dir, "metrics.csv"))