# -*- coding: utf-8 -*-
"""
Created on Sat Apr 25 15:44:37 2015

@author: Brian Donovan ([email protected])

This module contains functions that help choose the gamma and tolerance values for
RPCA (Robust PCA) - these are the tuning inputs to op_modified.opursuit().
"""

from tools import *  # provides logMsg, scale_and_center, pca, DefaultPool
from numpy import square, column_stack, arange
import numpy as np
from op_modified import opursuit, multiple_op, obj_func
from random import uniform, normalvariate
from scipy.stats import norm
import csv
from sys import stdout
from multiprocessing import Pool

class ConvergenceException(Exception):
    # Raised when the parameter search keeps getting stuck and exceeds the
    # maximum number of random restarts.
    def __init__(self, num_guesses):
        self.num_guesses = num_guesses

def fast_rank(A):
    # Computes the numerical rank of a matrix via SVD - singular values below
    # a small threshold (1e-4) are treated as zero.
    u, s, v = np.linalg.svd(A)
    rank = np.sum(s > .0001)
    return rank
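
# Example (hypothetical matrix): the sum of two independent rank-1 outer
# products has numerical rank 2.
#
#   A = np.outer([1, 2, 3], [4, 5, 6]) + np.outer([1, 0, 1], [0, 1, 0])
#   fast_rank(A)   # -> 2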

# Helper method for tuning gamma and tolerance. Guesses a new value for a parameter,
# given a low bound and a high bound.
# Params:
#   current_val - the current value of the parameter
#   lo_bound - an approximate lower bound; it is unlikely that a value below this
#              will be generated
#   hi_bound - an approximate upper bound; it is unlikely that a value above this
#              will be generated
#   SEARCH_RATE - if one of the bounds is missing, guesses will extend in that
#                 direction at a rate proportional to this
#   BACKTRACK_PROB - the probability of generating a point outside of the bounds
# Returns: the new guess, plus the (possibly widened) lo_bound and hi_bound
def guess_param(current_val, lo_bound, hi_bound, SEARCH_RATE=5, BACKTRACK_PROB=.1,
                hard_lower_bound=.0001, hard_upper_bound=10):
    lo_val = lo_bound
    hi_val = hi_bound
    # If we are missing either of the bounds, artificially create them using the
    # search rate. The original arguments are checked here (not the derived values),
    # so that when both bounds are missing the search expands symmetrically to
    # [current_val/SEARCH_RATE, current_val*SEARCH_RATE].
    if(lo_val is None):
        if(hi_bound is not None):
            lo_val = hi_bound / SEARCH_RATE
        else:
            lo_val = current_val / SEARCH_RATE
    if(hi_val is None):
        if(lo_bound is not None):
            hi_val = lo_bound * SEARCH_RATE
        else:
            hi_val = current_val * SEARCH_RATE
    # Create a normal distribution centered between the two bounds, and ensure
    # that there is only a BACKTRACK_PROB probability of generating a point
    # outside of that range
    mean = (hi_val + lo_val) / 2
    (lo_conf_bound, hi_conf_bound) = norm.interval(1 - BACKTRACK_PROB)
    sd = (hi_val - lo_val) / (hi_conf_bound * 2)
    # Draw a random guess from the distribution, clamp it to the hard limits,
    # and widen the soft bounds if the guess falls outside of them
    current_val = normalvariate(mean, sd)
    current_val = max(current_val, hard_lower_bound)
    current_val = min(current_val, hard_upper_bound)
    if(lo_bound is not None):
        lo_bound = min(lo_bound, current_val)
    if(hi_bound is not None):
        hi_bound = max(hi_bound, current_val)
    return current_val, lo_bound, hi_bound
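
# Example (hypothetical values): with both bounds known, the guess is drawn near
# the midpoint, and the bounds are widened only if the draw escapes them.
#
#   val, lo, hi = guess_param(0.5, lo_bound=0.25, hi_bound=1.0)
#   # val falls in (0.25, 1.0) with probability 1 - BACKTRACK_PROB, and
#   # lo <= val <= hi holds afterwards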

# Helper method for increasing_tolerance_search (this is where most of the heavy
# lifting is done). Uses a randomized bisection technique to choose gamma and tol
# such that the number of outlier columns in the C matrix and the rank of the L
# matrix meet some target values.
# Params:
#   vectors - the data to perform RPCA on, a list of Numpy column vectors
#   gamma_guess - an initial guess for gamma
#   tol_guess - an initial guess for the tolerance
#   lo_target_c_perc - lower bound for the desired percentage of outliers
#   hi_target_c_perc - upper bound for the desired percentage of outliers
#   lo_target_num_pcs - lower bound for the desired rank of the data
#   hi_target_num_pcs - upper bound for the desired rank of the data
# Returns: the chosen gamma and tolerance, the number of guesses it took, and the
# final L and C matrices
def tune_gamma_and_tol(vectors, gamma_guess=.5, tol_guess=1e-2,
                       lo_target_c_perc=.04, hi_target_c_perc=.10,
                       lo_target_num_pcs=10, hi_target_num_pcs=15):
    BACKTRACK_PROB = .0000001
    MAX_NUM_RESETS = 5
    num_resets = 0
    data_matrix = column_stack(vectors)
    O = (data_matrix!=0)*1  # Observation matrix - 1 where we have data, 0 where we do not
    # Initially, we don't have any bounds on our search
    lo_gamma = None
    hi_gamma = None
    gamma = gamma_guess
    lo_tol = None
    hi_tol = None
    tol_perc = tol_guess
    num_guesses = 0
    while(True):
        num_guesses += 1
        logMsg("BS (%d , %d): Trying gamma=%f, tol=%f" % (num_guesses, hi_target_num_pcs, gamma, tol_perc))
        stdout.flush()
        try:
            L, C, term, n_iter = multiple_op(data_matrix, O, gamma, tol_perc=tol_perc)
            num_pca_dimensions = fast_rank(L)
            # Fraction of columns of C that contain any nonzero entry, i.e. the
            # fraction of vectors flagged as outliers
            c_vals = [(sum(square(C[:,i]))!=0)*1 for i in xrange(C.shape[1])]
            c_perc = float(sum(c_vals)) / len(c_vals)
            logMsg(">>>>>> pcs=%d, outliers=%f" % (num_pca_dimensions, c_perc))
            # If we have found acceptable values, stop
            if(c_perc >= lo_target_c_perc and c_perc <= hi_target_c_perc
                    and num_pca_dimensions >= lo_target_num_pcs
                    and num_pca_dimensions <= hi_target_num_pcs):
                break
            # Otherwise, classify which quadrant we are in relative to the lower
            # target bounds, and tighten the corresponding search bound so the next
            # guess moves gamma or tol in the right direction
            if(num_pca_dimensions >= lo_target_num_pcs and c_perc >= lo_target_c_perc):
                logMsg("#rank too high and too many outliers - increase tolerance")
                lo_tol = tol_perc
            elif(num_pca_dimensions >= lo_target_num_pcs and c_perc <= lo_target_c_perc):
                logMsg("#rank too high and too few outliers - decrease gamma")
                hi_gamma = gamma
            elif(num_pca_dimensions <= lo_target_num_pcs and c_perc >= lo_target_c_perc):
                logMsg("#rank too low and too many outliers - increase gamma")
                lo_gamma = gamma
            elif(num_pca_dimensions <= lo_target_num_pcs and c_perc <= lo_target_c_perc):
                logMsg("#rank too low and too few outliers - decrease tolerance")
                hi_tol = tol_perc
            # Make new randomized guesses within the current search bounds
            (gamma, lo_gamma, hi_gamma) = guess_param(gamma, lo_gamma, hi_gamma,
                                                      SEARCH_RATE=2, BACKTRACK_PROB=BACKTRACK_PROB)
            (tol_perc, lo_tol, hi_tol) = guess_param(tol_perc, lo_tol, hi_tol,
                                                     SEARCH_RATE=5, BACKTRACK_PROB=BACKTRACK_PROB,
                                                     hard_upper_bound=1.0)
            logMsg("%s < gamma < %s , %s < tol < %s" % tuple(map(str, [lo_gamma, hi_gamma, lo_tol, hi_tol])))
            stdout.flush()
        except Exception as e:
            logMsg(str(e))
            # Collapse the tolerance bounds so the stuck check below triggers a reset
            lo_tol = .01
            hi_tol = .01
        # If either pair of bounds has collapsed to (nearly) a single point without
        # meeting the targets, the search is stuck - allow more backtracking and
        # restart from a randomly perturbed point
        if((lo_tol is not None and hi_tol is not None and hi_tol/lo_tol > .99 and hi_tol/lo_tol < 1.01)
                or (lo_gamma is not None and hi_gamma is not None and hi_gamma/lo_gamma > .99 and hi_gamma/lo_gamma < 1.01)):
            print("----------------Got stuck")
            stdout.flush()
            BACKTRACK_PROB = .3
            num_resets += 1
            if(num_resets > MAX_NUM_RESETS):
                raise ConvergenceException(num_guesses)
            else:
                hi_gamma = None
                lo_gamma = None
                hi_tol = None
                lo_tol = None
                gamma *= (2**uniform(-1, 1))
                tol_perc *= (10**uniform(-1, 1))
    print(obj_func(L, C, gamma))
    return gamma, tol_perc, num_guesses, L, C
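
# Example (hypothetical call): with the defaults, the search targets 4%-10% of
# columns flagged as outliers and a rank between 10 and 15.
#
#   gamma, tol, n, L, C = tune_gamma_and_tol(vectors, gamma_guess=.5,
#                                            tol_guess=1e-2)
#   # Raises ConvergenceException if the search resets more than MAX_NUM_RESETS times.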

# Searches for values of gamma and tol such that the number of outliers is between
# 5% and 10%, and the rank of the data is between 10 and 15. The upper rank bound
# is relaxed in increments of 5 whenever the search fails to converge.
# Params:
#   vectors - the data to perform RPCA on, a list of Numpy column vectors
def increasing_tolerance_search(vectors):
    hi_num_pcs = 15
    num_guesses = 0
    while(True):
        try:
            gamma, tol, part_num_guesses, L, C = tune_gamma_and_tol(vectors,
                gamma_guess=.75, tol_guess=.35,
                lo_target_c_perc=.05, hi_target_c_perc=.10,
                lo_target_num_pcs=10, hi_target_num_pcs=hi_num_pcs)
            num_guesses += part_num_guesses
            break
        except ConvergenceException as e:
            num_guesses += e.num_guesses
            logMsg(" -------------------- Relaxing Condition ------------------------- ")
            stdout.flush()
            hi_num_pcs += 5
    return gamma, tol, num_guesses, hi_num_pcs, L, C
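
# Example (a sketch): once the search succeeds, the columns of C with any nonzero
# entries are the flagged outliers, mirroring the c_vals computation above.
#
#   gamma, tol, n, rank_ceiling, L, C = increasing_tolerance_search(vectors)
#   outlier_cols = [i for i in xrange(C.shape[1]) if sum(square(C[:, i])) != 0]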

###############################################################################
# Everything below this line is for testing purposes

# Yields (data_matrix, O, tol_perc, gamma) tuples over a grid of tolerance and
# gamma values, suitable for mapping tryParameters over a pool
def paramIterator(data_matrix, O):
    tols = arange(.001, .005, .0001)
    gammas = arange(.3, 1.1, .01)
    for tol_perc in tols:
        for gamma in gammas:
            yield (data_matrix, O, tol_perc, gamma)

# Runs opursuit once for a single (tolerance, gamma) setting and reports the
# resulting outlier percentage, rank, and iteration count. Takes one tuple
# argument so it can be used with Pool.map().
def tryParameters(params):
    (data_matrix, O, tol_perc, gamma) = params
    logMsg("Trying tol=%f, gamma=%f" % (tol_perc, gamma))
    L, C, term, n_iter = opursuit(data_matrix, O, gamma, tol_perc=tol_perc, eps_ratio=10)
    c_vals = [(sum(square(C[:,i]))!=0)*1 for i in xrange(C.shape[1])]
    c_perc = float(sum(c_vals)) / len(c_vals)
    centered_L = scale_and_center(L, scale=False)
    pcs, robust_lowdim_data = pca(centered_L, 100000)
    num_pca_dimensions = pcs.shape[1]
    print([tol_perc, gamma, c_perc, num_pca_dimensions, n_iter])
    return [tol_perc, gamma, c_perc, num_pca_dimensions, n_iter]

# Sweeps the (tolerance, gamma) grid from paramIterator and saves the results
# to param_sweep.csv
def sweepGammaAndTol(vectors, pool=DefaultPool()):
    data_matrix = column_stack(vectors)
    O = (data_matrix!=0)*1  # Observation matrix - 1 where we have data, 0 where we do not
    it = paramIterator(data_matrix, O)
    vals = pool.map(tryParameters, it)
    with open('param_sweep.csv', 'w') as f:
        w = csv.writer(f)
        w.writerow(['tol', 'gamma', 'outliers', 'rank', 'iter'])
        for val in vals:
            w.writerow(val)
    print("Done")

# Compares the effect of different eps_ratio values on the opursuit solution,
# holding gamma and the tolerance fixed
def compare_eps(vectors):
    data_matrix = column_stack(vectors)
    O = (data_matrix!=0)*1  # Observation matrix - 1 where we have data, 0 where we do not
    gamma = .5
    tol_perc = .00000001
    for e in [2, 5, 10, 20, 50, 100]:
        L, C, term, n_iter = opursuit(data_matrix, O, gamma, tol_perc=tol_perc, eps_ratio=e)
        obj = obj_func(L, C, gamma)
        c_vals = [(sum(square(C[:,i]))!=0)*1 for i in xrange(C.shape[1])]
        c_perc = float(sum(c_vals)) / len(c_vals)
        centered_L = scale_and_center(L, scale=False)
        pcs, robust_lowdim_data = pca(centered_L, 100000)
        num_pca_dimensions = pcs.shape[1]
        print("eps=%d term=%f n_iter=%d obj=%f rank=%d c_perc=%f" % (e, term, n_iter, obj, num_pca_dimensions, c_perc))