-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdeeplc_novel.py
194 lines (156 loc) · 8.13 KB
/
deeplc_novel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
"""
This script is used to evaluate the performance of DeepLC on the GCA dataset. The GRCh38 peptides are used to train DeepLC,
and the GCA peptides are used to evaluate the performance of DeepLC. For each PSM, the best PEP is selected.
The training is performed by SampleID and the evaluation is performed by SampleID. The final output is a csv file with the
name gca_peptides_for_deeplc_99thperc.csv and gca_peptides_for_deeplc_95thperc.csv. The 99th percentile file contains the
peptides with an error percentile lower than 99%, and the 95th percentile file contains the peptides with an error percentile.
@author: Yasset Perez-Riverol (https://github.com/ypriverol)
"""
import hashlib
import click
# specific packages
import pandas as pd
import tensorflow as tf
# import deeplc packages
from deeplc import DeepLC
from matplotlib import pyplot as plt
# Default
import logging
logger = logging.getLogger(__name__)
logger.setLevel(logging.ERROR)
logging.getLogger('deeplc').setLevel(logging.ERROR)
logging.getLogger('tensorflow').setLevel(logging.ERROR)
tf.get_logger().setLevel(logging.ERROR)
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")
from scipy.stats import percentileofscore
CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])
@click.group(context_settings=CONTEXT_SETTINGS)
def cli():
pass
@click.command("filter_deeplc", help="Run the ms2pip filtering process to remove low-quality peptides.")
@click.option("-p", "--canonical_peptide_file", type=click.Path(exists=True),
help="The path to the canonical peptide file.", required=True)
@click.option("-n", "--novel_peptide_file", type=click.Path(exists=True), help="The path to the novel peptide file.",
required=True)
@click.option("-o", "--output_folder", type=click.Path(), help="The path to the output folder.", required=True)
@click.option("-f", "--output_file_95perc", type=click.Path(),
help="The name of the output file for the 95th percentile.", required=True)
@click.option("-g", "--output_file_99perc", type=click.Path(),
help="The name of the output file for the 99th percentile.", required=True)
@click.option("-c", "--num_cores", type=int, help="The number of cores to use.", default=5)
@click.option("-v", "--verbose", is_flag=True, help="Print verbose output.")
@click.option("-s", "--num_samples",type=int, help="The number of samples to use.", default=10000)
@click.option("-gg", "--group_files", type=int, help="Group files by SampleID", default=0)
def filter_deeplc(canonical_peptide_file: str, novel_peptide_file: str, output_folder: str, output_file_95perc: str,
output_file_99perc: str, num_cores: int, verbose: bool = False, num_samples: int = 10000, group_files: int = 0):
all_gca = []
# Reading both files
if ".gz" in canonical_peptide_file:
df = pd.read_csv(canonical_peptide_file, sep=",",compression="gzip")
else:
df = pd.read_csv(canonical_peptide_file, sep=",")
if ".gz" in novel_peptide_file:
df_gca = pd.read_csv(novel_peptide_file, sep=",",compression="gzip")
else:
df_gca = pd.read_csv(novel_peptide_file, sep=",")
# In LFQ experiments, every PSM is associated with a SampleID but if is a TMT experiment, then each PSM is associated with more
# than one SampleID. In this case, we use reference_file_name to identify the SampleID.
if 'sample_id' not in df.columns:
df['sample_id'] = df['reference_file_name']
if 'sample_id' not in df_gca.columns:
df_gca['sample_id'] = df_gca['reference_file_name']
## Only use peptides that passed spectrumAI filtering
df_gca = df_gca[(df_gca['position'] == 'non-canonical') | (df_gca['flanking_ions_support'] == 'YES')]
df = df[df["tr"] < 25000]
# Create indexes for both dataframes
df.fillna("", inplace=True)
df.index = df['sample_id'] + "+" + df["seq"] + "+" + df["modifications"]
df_gca.fillna("", inplace=True)
df_gca.index = df_gca['sample_id'] + "+" + df_gca["seq"] + "+" + df_gca["modifications"]
number_groups = 0 # Counter for the number of samples processed
if group_files > 0:
def map_to_group(sample_id, num_groups):
# Calculate SHA-256 hash
hash_obj = hashlib.sha256(sample_id.encode())
hash_value = int(hash_obj.hexdigest(), 16)
# Map hash to group number (0 to num_groups - 1)
group_number = hash_value % num_groups
return group_number
# Apply the function to the 'Column2' to create a new column 'Group'
df['Group'] = df['sample_id'].apply(lambda x: map_to_group(x, group_files))
df_gca['Group'] = df_gca['sample_id'].apply(lambda x: map_to_group(x, group_files))
else:
df['Group'] = df['sample_id']
df_gca['Group'] = df_gca['sample_id']
# count total number of samples
total_samples = len(df["sample_id"].unique())
total_groups = len(df["Group"].unique())
print(f"Total number of samples: {total_samples} and total number of groups: {total_groups}\n")
for name, sub_df in tqdm(df.groupby("Group")):
sub_df_gca = df_gca[df_gca["Group"] == name]
# Use the best score grch modified peptide to train
sub_df.sort_values("posterior_error_probability", inplace=True)
sub_df_unique = sub_df.drop_duplicates(["seq", "modifications"])
if num_samples < len(sub_df_unique.index):
sub_df_unique = sub_df_unique.iloc[0:num_samples, :]
dlc = DeepLC(
deeplc_retrain=True,
n_epochs=20,
n_jobs=num_cores,verbose=verbose,
)
# Perform calibration, make predictions and calculate metrics
dlc.calibrate_preds(seq_df=sub_df_unique)
sub_df_gca["preds_tr"] = dlc.make_preds(seq_df=sub_df_gca)
sub_df_gca["error"] = sub_df_gca["tr"] - sub_df_gca["preds_tr"]
sub_df_gca["absolute error"] = abs(sub_df_gca["error"])
sub_df_gca["relative error"] = sub_df_gca["absolute error"] / sub_df_gca["tr"]
sub_df_gca["relative_error_percentile"] = sub_df_gca["relative error"].apply(
lambda x: percentileofscore(sub_df_gca["relative error"], x)
)
all_gca.append(sub_df_gca)
number_groups += 1
print(f"\nSample {name} done." + " % groups processed = " + str(round(number_groups / total_groups * 100, 4)) + "%" + " Number of Canonical PSMs: " + str(len(sub_df.index)) + " Number of GCA PSMs: " + str(len(sub_df_gca.index)), end="\n", flush=True) # Count samples: " + str(sample_count))
all_gca_df = pd.concat(all_gca)
plt.scatter(
all_gca_df["tr"],
all_gca_df["preds_tr"],
s=3,
alpha=0.1,
c=all_gca_df["relative_error_percentile"],
)
plt.plot([500, 6500], [500, 6500], c="grey")
plt.colorbar()
plt.xlabel("Experimental RT (min)")
plt.ylabel("Predicted RT (min)")
plt.savefig(output_folder + "/all_predictions.png")
plt.close()
plt.hist(all_gca_df["relative_error_percentile"], bins=100)
plt.vlines(99, 0, 1, color="black", label="99th percentile")
plt.vlines(95, 0, 1, color="grey", label="95th percentile")
plt.legend()
plt.savefig(output_folder + "/all_error_dist.png")
plt.close()
plt.scatter(all_gca_df["relative_error_percentile"], all_gca_df["error"], s=1)
plt.vlines(99, -4000, 4000, color="black", label="99th percentile")
plt.vlines(95, -4000, 4000, color="grey", label="95th percentile")
plt.legend()
plt.savefig(output_folder + "/all_error_perc.png")
plt.close()
plt.scatter(
all_gca_df["relative_error_percentile"],
all_gca_df["posterior_error_probability"],
s=4,
alpha=0.1,
)
plt.vlines(99, 0, 0.01, color="black", label="99th percentile")
plt.vlines(95, 0, 0.01, color="grey", label="95th percentile")
plt.legend()
plt.savefig(output_folder + "/all_error_perc_pep.png")
plt.close()
all_gca_df[all_gca_df["relative_error_percentile"] < 95].to_csv(output_file_95perc, index=False,compression="gzip")
all_gca_df[all_gca_df["relative_error_percentile"] < 99].to_csv(output_file_99perc, index=False,compression="gzip")
cli.add_command(filter_deeplc)
if __name__ == "__main__":
cli()