forked from jjberry/translational-DBN
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbackprop.py
412 lines (372 loc) · 15.7 KB
/
backprop.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
import sys
import os
home = os.path.expanduser("~")
sys.path.append(os.path.join(home, 'gnumpy'))
import gnumpy as gp
import numpy as np
import scipy
import scipy.optimize
import deepnet
class NeuralNet(object):
'''
Implementation of a Multi-Layer Perception trained by backprop. This class
accepts pre-trained networks for use in a deep neural network. Pre-trained
nets should consist of a list of objects, where each object has a W and hbias
variables containing numpy arrays, n_hidden containing the number of
hidden units, and hidtype containing a string with the activation type, i.e.
"sigmoid".
'''
def __init__(self, network=None, layer_sizes=None, layer_types=None):
layers = []
if (network != None):
# copy the weights from the given network onto the GPU
for rbm in network:
l = Layer(rbm.W, rbm.hbias, rbm.n_hidden, rbm.hidtype)
layers.append(l)
else:
# if no pre-trained network is given, initialize random weights
assert layer_sizes is not None
assert layer_types is not None
assert len(layer_sizes) == len(layer_types)
# randomize the network weights according to the Bottou proposition
# this is borrowed from the ffnet project:
# http://ffnet.sourceforge.net/_modules/ffnet.html#ffnet.randomweights
n = 0
for i in range(len(layer_sizes)-1):
n += layer_sizes[i]*layer_sizes[i+1]
n += layer_sizes[i+1]
bound = 2.38 / np.sqrt(n)
for i in range(len(layer_sizes)-1):
W = np.zeros((layer_sizes[i+1]*layer_sizes[i],))
for j in range(W.size):
W[j] = np.random.uniform(-bound, bound)
W = W.reshape((layer_sizes[i+1], layer_sizes[i]))
hbias = np.zeros((layer_sizes[i+1],))
for j in range(hbias.size):
hbias[j] = np.random.uniform(-bound, bound)
hbias = hbias.reshape((layer_sizes[i+1],1))
l = Layer(W, hbias, layer_sizes[i+1], layer_types[i+1])
layers.append(l)
self.network = layers
def run_through_network(self, data, net=None):
'''
Gets the output of the top layer of the network given input data on the
bottom.
args:
array data: the input data
obj net: the network to use, default is self.network
returns:
array hid: the activation of the top layer
'''
if net is None:
net = self.network
hid = data
for layer in net:
vis = gp.garray(hid)
hid = self.get_activation(layer, vis)
gp.free_reuse_cache()
return hid
def get_activation(self, layer, data):
'''
Gets the activation of a single layer given input data
args:
obj layer: the layer object
array data: the input data
returns:
array hid: the output of the layer
'''
if not hasattr(layer, 'n_hidden'):
layer = layer[0]
hid = np.zeros((data.shape[0], layer.n_hidden))
breaks = range(0, hid.shape[0], 128)
breaks.append(hid.shape[0])
for i in range(len(breaks)-1):
s = breaks[i]
e = breaks[i+1]
act = gp.dot(data[s:e], layer.W.T) + layer.hbias.T
if layer.hidtype == 'sigmoid':
hid[s:e] = (act.logistic()).as_numpy_array()
else:
hid[s:e] = act.as_numpy_array()
return hid
def train(self, network, data, targets, validX=None, validT=None, max_iter=100,
validErrFunc='classification', targetCost='linSquaredErr', initialfit=5,
cg_iter=20):
'''
Trains the network using backprop
args:
list[obj] network: the network
array data: the training data
array targets: the training labels
array validX: the validation data (optional)
array validT: the validation labels (optional)
int max_iter: the maximum number of backprop iterations
string validErrFunc: determines which kind of network to train,
i.e. classification or reconstruction
string targetCost: determines which cost function to use, i.e.
linSquaredErr, crossEntropy, or softMax
linSquaredErr works only for gaussian output units
softmax works only for exp output units (not implemented)
int initialfit: if n>0, top layer only will be trained for n iterations
int cg_iter: the max number of iterations for conjugate gradient
optimization, default=20
'''
# initialize parameteres
self.validErrFunc = validErrFunc
self.targetCost = targetCost
self.n, self.m = data.shape
self.cg_iter = cg_iter
numunits = 0
for i in range(len(self.network)):
numunits = numunits + self.network[i].W.shape[1] + \
self.network[i].hbias.shape[0]
self.numunits = numunits
self.batch_size = 1024
self.weights = np.ones((self.n,1))
# For estimating test error
tindex = np.arange(self.n)
np.random.shuffle(tindex)
tinds = tindex[:(np.min([self.batch_size, self.n]))]
# Perform gradient descent
print "Starting %d iterations of backprop." % max_iter
if (initialfit>0):
# This gets the activation of next to last layer to train top layer
transformedX = self.run_through_network(data, network[:-1])
for i in range(max_iter):
trainerr = self.getError(network, data[tinds,:], targets[tinds,:],
self.weights[tinds])
if validX is not None:
validerr = self.getError(network, validX, validT,
np.ones((validX.shape[0],)))
print "Iteration %3d: TrainErr = %4.3f, ValidErr = %4.3f" % \
(i+1, trainerr, validerr)
else:
print "Iteration %3d: TrainErr = %4.3f" %(i+1, trainerr)
# Train the top layer only for initialfit iters
if (i < initialfit):
toplayer = self.doBackprop(transformedX, targets, [network[-1]])
network[-1] = toplayer[0]
else:
network = self.doBackprop(data, targets, network)
# Print the final training error
trainerr = self.getError(network, data[tinds,:], targets[tinds,:],
self.weights[tinds])
if validX is not None:
validerr = self.getError(network, validX, validT,
np.ones((validX.shape[0],)))
print "Final : TrainErr = %4.3f, ValidErr = %4.3f" % \
(trainerr, validerr)
else:
print "Final : TrainErr = %4.3f" %(trainerr)
return network
def getError(self, network, X, T, weights):
'''
Calculates the error for either classification or reconstruction during
backprop
args:
list[obj] network: the network to use
X: the input data
T: the input targets
weights: weights used for backprop
This function is designed to be called by the train() method
'''
err = 0
result = self.run_through_network(X, network)
if self.validErrFunc == 'classification':
for i in range(X.shape[0]):
ind = np.argmax(result[i,:])
targ = np.argmax(T[i,:])
if ind != targ:
err = err + weights[i]
else:
for i in range(X.shape[0]):
err = err + np.sqrt(np.sum(np.square(result[i,:]-T[i,:])))*weights[i]
validerr = err / np.sum(weights)
return validerr
def doBackprop(self, data, targets, network):
'''
Executes 1 iteration of backprop
args:
array data: the training data
array targets: the training targets
list[obj] network: the network
This function is designed to be called by the train() method
'''
no_layers = len(network)
index = np.arange(self.n)
np.random.shuffle(index)
nbatches = len(range(0,self.n, self.batch_size))
count = 0
for batch in range(0, self.n, self.batch_size):
if batch + 2*self.batch_size > self.n:
batchend = self.n
else:
batchend = batch + self.batch_size
# Select current batch
tmpX = data[index[batch:batchend],:]
tmpT = targets[index[batch:batchend],:]
tmpW = self.weights[index[batch:batchend],:]
# flatten out the weights and store them in v
v = []
for i in range(no_layers):
w = network[i].W.as_numpy_array()
b = network[i].hbias.as_numpy_array()
v.extend((w.reshape((w.shape[0]*w.shape[1],))).tolist())
v.extend((b.reshape((b.shape[0]*b.shape[1],))).tolist())
v = np.asarray(v)
# Conjugate gradient minimiziation
result = scipy.optimize.minimize(self.backprop_gradient, v,
args=(network, tmpX, tmpT, tmpW),
method='CG', jac=True, options={'maxiter': self.cg_iter})
if (count%10 == 0):
print "batch %d of %d. success: %s" %(count+1, nbatches,
str(result.success))
count += 1
v = result.x
# unflatten v and put new weights back
ind =0
for i in range(no_layers):
h,w = network[i].W.shape
network[i].W = gp.garray((v[ind:(ind+h*w)]).reshape((h,w)))
ind += h*w
b = len(network[i].hbias)
network[i].hbias = gp.garray((v[ind:(ind+b)]).reshape((b,1)))
ind += b
# debugging help
#print "=================="
#print "W 1", network[0].W.shape
#print network[0].W
#print "bias 1", network[0].hbias.shape
#print network[0].hbias
#print "W 2", network[1].W.shape
#print network[1].W
#print "bias 2", network[1].hbias.shape
#print network[1].hbias
#print "=================="
return network
def backprop_gradient(self, v, network, X, targets, weights):
'''
Calculates the value of the cost function and the gradient for CG
optimization.
args:
array v: the 1d vector of weights
list[obj] network: the network
array X: training data
array targets: the training targets
array weights: the backprop weights
returns:
array cost: the value of the cost function
array grad: the value of the gradient
This function is called by scipy's minimize function during optimization
'''
if len(v.shape) == 1:
v = v.reshape((v.shape[0],1))
# initialize variables
n = X.shape[0]
numHiddenLayers = len(network)
# put the v weights back into the network
ind =0
for i in range(numHiddenLayers):
h,w = network[i].W.shape
network[i].W = gp.garray((v[ind:(ind+h*w)]).reshape((h,w)))
ind += h*w
b = network[i].hbias.shape[0]
network[i].hbias = gp.garray(v[ind:(ind+b)]).reshape((b,1))
ind += b
# Run data through the network, keeping activations of each layer
acts = [X] # a list of numpy arrays
hid = X
for layer in network:
vis = gp.garray(hid)
hid = self.get_activation(layer, vis)
acts.append(hid)
gp.free_reuse_cache()
# store the gradients
dW = []
db = []
# Compute the value of the cost function
if self.targetCost == 'crossEntropy':
# see www.stanford.edu/group/pdplab/pdphandbook/handbookch6.html
cost = (-1.0/n) * np.sum(np.sum(targets * np.log(acts[-1]) + \
(1.0 - targets) * np.log(1.0 - acts[-1]), axis=1) * weights.T)
Ix = (acts[-1] - targets) / n
else: #self.targetCost == 'linSquaredErr':
cost = 0.5 * np.sum(np.sum(np.square(acts[-1] - targets), axis=1) * \
weights.T)
Ix = (acts[-1] - targets)
Ix *= np.tile(weights, (1, Ix.shape[1])).reshape((Ix.shape[0],Ix.shape[1]))
Ix = gp.garray(Ix)
# Compute the gradients
for i in range(numHiddenLayers-1,-1,-1):
# augment activations with ones
acts[i] = gp.garray(acts[i])
acts[i] = gp.concatenate((acts[i], gp.ones((n,1))), axis=1)
# compute delta in next layer
delta = gp.dot(acts[i].T, Ix)
# split delta into weights and bias parts
dW.append(delta[:-1,:].T)
db.append(delta[-1,:].T)
# backpropagate the error
if i > 0:
if network[i-1].hidtype == 'sigmoid':
Ix = gp.dot(Ix,gp.concatenate((network[i].W,network[i].hbias),
axis=1)) * acts[i] * (1.0 - acts[i])
elif network[i-1].hidtype == 'gaussian':
Ix = gp.dot(Ix,gp.concatentate((network[i].W,network[i].hbias),
axis=1))
Ix = Ix[:,:-1]
gp.free_reuse_cache()
dW.reverse()
db.reverse()
# Convert gradient information
grad = np.zeros_like(v)
ind = 0
for i in range(numHiddenLayers):
grad[ind:(ind+dW[i].size)] = \
(dW[i].reshape((dW[i].shape[0]*dW[i].shape[1],1))).as_numpy_array()
ind += dW[i].size
grad[ind:(ind+db[i].size),0] = db[i].as_numpy_array()
ind += db[i].size
grad = grad.reshape((grad.shape[0],))
return cost, grad
class Layer(object):
'''
A hidden layer object
args:
array W: the weight array
array hbias: the bias weights
int n_hidden: the number of hidden units
string hidtype: the activation function "sigmoid" or "gaussian"
'''
def __init__(self, W, hbias, n_hidden, hidtype):
self.W = gp.garray(W)
# convert 1d arrays to 2d
if len(hbias.shape) == 1:
hbias = hbias.reshape((hbias.shape[0],1))
self.hbias = gp.garray(hbias)
self.n_hidden = n_hidden
self.hidtype = hidtype
def demo_xor():
'''Demonstration of backprop with classic XOR example
'''
data = np.array([[0.,0.],[0.,1.],[1.,0.],[1.,1.]])
targets = np.array([[0.],[1.],[1.],[0.]])
nn = NeuralNet(layer_sizes=[2,2,1], layer_types=['sigmoid','sigmoid','sigmoid'])
print "initial parameters"
print "=================="
print "W 1", nn.network[0].W.shape
print nn.network[0].W
print "bias 1", nn.network[0].hbias.shape
print nn.network[0].hbias
print "W 2", nn.network[1].W.shape
print nn.network[1].W
print "bias 2", nn.network[1].hbias.shape
print nn.network[1].hbias
print "=================="
net = nn.train(nn.network, data, targets, max_iter=10, targetCost='crossEntropy',
initialfit=0, cg_iter=100)
print "network test:"
output = nn.run_through_network(data, net)
print output
if __name__ == "__main__":
demo_xor()