-
Notifications
You must be signed in to change notification settings - Fork 0
/
demo.py
126 lines (103 loc) · 5.01 KB
/
demo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import numpy as np
import pickle
import time
from datetime import timedelta
from scalable_comboKR import SurfaceKernelNystromDifference
from scalable_comboKR import ScalablecomboKRdiff as ComboKR2
"""
An example of how to run comboKR 2.0 code. Skipping the cross-validation here!
Based on O'Neil dataset
"""
import warnings
warnings.filterwarnings("ignore")
# decide with which regularisation parameter (early stopping interations) to run the experiment
n_iters = 5
# ======================================================================================================================
# load the needed data
print("Loading the data and building the required arrays")
# the kernel matrices
Kcell = np.load("data/Kc.npy")
Kdrug = np.load("data/Kd.npy")
# feature names, to identify which row&column in kernel matrices correspond to which cell/drug
cell_ids = np.load("data/cellfeat_ids.npy")
drug_ids = np.load("data/drugfeat_ids.npy")
# training & test split - identified with triplets indetifying cell line, drug1, and drug2.
# note for training: it is good practice to include both drug orders! todo check how I save this
# the ids in first column should all be present in cell_ids
# the ids in the last two columns should be from drug_ids
cdd_tr = np.load("data/cdd_tr.npy") # of size n_tr*3
cdd_tst = np.load("data/cdd_tst.npy") # of size n_tst*3
print("cdd_tr shape:", cdd_tr.shape)
print("cdd_tst shape:", cdd_tst.shape)
# load the fitted monotherapy and drug combo surface functions
# monotherapies as Hill equations
# dataframe always indicates the cell line and the drug id
with open("data/fitted_hills_full.df.pkl", "rb") as f:
hills = pickle.load(f)
# the parameters of the fitted BRAID functions to the surfaces
braids_all = np.load("data/fitted_braid_params_full.npy")
# the functions are in the same order as the rows in this one:
cdd_full = np.load("data/cdd_full.npy")
# for the test data, load the dose concentrations and the measured responses at those concentrations
# these have been vectorised; every row corresponds to one surface
# full data arrays are again ordered as the cdd_full
c1_all = np.load("data/conc1.npy")
c2_all = np.load("data/conc2.npy")
y_all = np.load("data/measurementsMedian.npy")
# select the subsets of data for training and testing
# find where in cdd_full the cdd_tr and cdd_tst indices are
tr_inds = []
for ii in range(cdd_tr.shape[0]):
tr_inds.append(np.where((cdd_full[:, 0] == cdd_tr[ii, 0]) &
(cdd_full[:, 1] == cdd_tr[ii, 1]) &
(cdd_full[:, 2] == cdd_tr[ii, 2]))[0][0])
tst_inds = []
for ii in range(cdd_tst.shape[0]):
tst_inds.append(np.where((cdd_full[:, 0] == cdd_tst[ii, 0]) &
(cdd_full[:, 1] == cdd_tst[ii, 1]) &
(cdd_full[:, 2] == cdd_tst[ii, 2]))[0][0])
tr_inds = np.array(tr_inds)
tst_inds = np.array(tst_inds)
braids_tr = braids_all[tr_inds, :]
# it is good practice to "double" the training data, i.e. consider both drug pair orders in training
cdd_tr = np.vstack((cdd_tr, np.hstack((cdd_tr[:, 0][:, None], cdd_tr[:, 2][:, None], cdd_tr[:, 1][:, None]))))
braids_tr2 = braids_tr[:, [0, 2, 1, 3, 5, 4, 7, 6, 8]]
braids_tr = np.vstack((braids_tr, braids_tr2))
c1_tst = c1_all[tst_inds, :]
c2_tst = c2_all[tst_inds, :]
y_tst = y_all[tst_inds, :]
# ======================================================================================================================
# train the model
#
# The output data approximation
output_nystrom = SurfaceKernelNystromDifference(tol=1e-1, kernel="rbf", max_is_100=False)
output_nystrom.create_approximation()
combokr2 = ComboKR2(Kcell, Kdrug, cell_ids, drug_ids, output_nystrom, hills=hills, max_is_100=False)
# train (basically just initialise stuff, due to memory concers the actual training happens in predict)
combokr2.train(cdd_tr, braids_tr, n_iters)
# predict => Expected running time: about 20 minutes
print("Starting to train and test")
print("This might take a while due to the step of fitting BRAID surfaces to the GD solution..")
t0 = time.process_time()
# predict with projected gradient descent
preds, pred_braid_params = combokr2.predict_with_gd_nadam_and_braid(cdd_tst, c1_tst, c2_tst) # 0:18:54.285831
# # predict with candidate set
# preds, pred_braid_params = combokr2.predict_with_candidates(cdd_tst, c1_tst, c2_tst) # 0:07:56.296070
t1 = time.process_time()
print("It took this long to train and test comboKR 2.0:", timedelta(seconds=t1-t0))
# check how well it went
print("Pearson correlation:", np.round(np.corrcoef(y_tst.ravel(), preds.ravel())[0, 1], 3))
from scipy.stats import spearmanr
print("Spearman correlation:", np.round(spearmanr(y_tst.ravel(), preds.ravel())[0], 3))
# results with the "predict_with_gd_nadam_and_braid":
# Pearson correlation: 0.939
# Spearman correlation: 0.938
# results with the "predict_with_candidates":
# Pearson correlation: 0.935
# Spearman correlation: 0.934
import matplotlib.pyplot as plt
plt.figure()
plt.scatter(y_tst.ravel(), preds.ravel(), s=0.02)
plt.xlabel("groundtruth")
plt.ylabel("prediction")
plt.show()