
Commit f120f38

More verbosity when compiling optimizers
1 parent de11134 commit f120f38

1 file changed: 86 additions, 12 deletions


nmt_keras/model_zoo.py

+86 −12
@@ -169,17 +169,9 @@ def setOptimizer(self, **kwargs):
         if self.params.get('ACCUMULATE_GRADIENTS', 1) > 1 and self.params['OPTIMIZER'].lower() != 'adam':
             logging.warning('Gradient accumulate is only implemented for the Adam optimizer. Setting "ACCUMULATE_GRADIENTS" to 1.')
             self.params['ACCUMULATE_GRADIENTS'] = 1
-        if self.verbose > 0:
-            logging.info("Preparing optimizer: %s [LR: %s - LOSS: %s - "
-                         "CLIP_C %s - CLIP_V %s - LR_OPTIMIZER_DECAY %s - ACCUMULATE_GRADIENTS %s] and compiling." %
-                         (str(self.params['OPTIMIZER']),
-                          str(self.params.get('LR', 0.01)),
-                          str(self.params.get('LOSS', 'categorical_crossentropy')),
-                          str(self.params.get('CLIP_C', 0.)),
-                          str(self.params.get('CLIP_V', 0.)),
-                          str(self.params.get('LR_OPTIMIZER_DECAY', 0.0)),
-                          str(self.params.get('ACCUMULATE_GRADIENTS', 1))
-                          ))
+
+        optimizer_str = '\t LR: ' + str(self.params.get('LR', 0.01)) + \
+                        '\n\t LOSS: ' + str(self.params.get('LOSS', 'categorical_crossentropy'))
 
         if self.params.get('USE_TF_OPTIMIZER', False) and K.backend() == 'tensorflow':
             if self.params['OPTIMIZER'].lower() not in ['sgd', 'adagrad', 'adadelta', 'rmsprop', 'adam']:
@@ -200,19 +192,36 @@ def setOptimizer(self, **kwargs):
                 optimizer = TFOptimizer(tf.train.MomentumOptimizer(self.params.get('LR', 0.01),
                                                                    self.params.get('MOMENTUM', 0.0),
                                                                    use_nesterov=self.params.get('NESTEROV_MOMENTUM', False)))
+                optimizer_str += '\n\t MOMENTUM: ' + str(self.params.get('MOMENTUM', 0.0)) + \
+                                 '\n\t NESTEROV: ' + str(self.params.get('NESTEROV_MOMENTUM', False))
+
             elif self.params['OPTIMIZER'].lower() == 'adam':
                 optimizer = TFOptimizer(tf.train.AdamOptimizer(learning_rate=self.params.get('LR', 0.01),
+                                                               beta1=self.params.get('BETA_1', 0.9),
+                                                               beta2=self.params.get('BETA_2', 0.999),
                                                                epsilon=self.params.get('EPSILON', 1e-7)))
+                optimizer_str += '\n\t BETA_1: ' + str(self.params.get('BETA_1', 0.9)) + \
+                                 '\n\t BETA_2: ' + str(self.params.get('BETA_2', 0.999)) + \
+                                 '\n\t EPSILON: ' + str(self.params.get('EPSILON', 1e-7))
+
             elif self.params['OPTIMIZER'].lower() == 'adagrad':
                 optimizer = TFOptimizer(tf.train.AdagradOptimizer(self.params.get('LR', 0.01)))
+
             elif self.params['OPTIMIZER'].lower() == 'rmsprop':
                 optimizer = TFOptimizer(tf.train.RMSPropOptimizer(self.params.get('LR', 0.01),
                                                                   decay=self.params.get('LR_OPTIMIZER_DECAY', 0.0),
                                                                   momentum=self.params.get('MOMENTUM', 0.0),
                                                                   epsilon=self.params.get('EPSILON', 1e-7)))
+                optimizer_str += '\n\t MOMENTUM: ' + str(self.params.get('MOMENTUM', 0.9)) + \
+                                 '\n\t EPSILON: ' + str(self.params.get('EPSILON', 1e-7))
+
             elif self.params['OPTIMIZER'].lower() == 'adadelta':
                 optimizer = TFOptimizer(tf.train.AdadeltaOptimizer(learning_rate=self.params.get('LR', 0.01),
+                                                                   rho=self.params.get('RHO', 0.95),
                                                                    epsilon=self.params.get('EPSILON', 1e-7)))
+                optimizer_str += '\n\t RHO: ' + str(self.params.get('RHO', 0.9)) + \
+                                 '\n\t EPSILON: ' + str(self.params.get('EPSILON', 1e-7))
+
             else:
                 raise Exception('\tThe chosen optimizer is not implemented.')
         else:
@@ -223,6 +232,8 @@ def setOptimizer(self, **kwargs):
                                 nesterov=self.params.get('NESTEROV_MOMENTUM', False),
                                 clipnorm=self.params.get('CLIP_C', 0.),
                                 clipvalue=self.params.get('CLIP_V', 0.))
+                optimizer_str += '\n\t MOMENTUM: ' + str(self.params.get('MOMENTUM', 0.0)) + \
+                                 '\n\t NESTEROV: ' + str(self.params.get('NESTEROV_MOMENTUM', False))
 
             elif self.params['OPTIMIZER'].lower() == 'rsmprop':
                 optimizer = RMSprop(lr=self.params.get('LR', 0.001),
@@ -231,13 +242,16 @@ def setOptimizer(self, **kwargs):
                                     clipnorm=self.params.get('CLIP_C', 0.),
                                     clipvalue=self.params.get('CLIP_V', 0.),
                                     epsilon=self.params.get('EPSILON', 1e-7))
+                optimizer_str += '\n\t RHO: ' + str(self.params.get('RHO', 0.9)) + \
+                                 '\n\t EPSILON: ' + str(self.params.get('EPSILON', 1e-7))
 
             elif self.params['OPTIMIZER'].lower() == 'adagrad':
                 optimizer = Adagrad(lr=self.params.get('LR', 0.01),
                                     decay=self.params.get('LR_OPTIMIZER_DECAY', 0.0),
                                     clipnorm=self.params.get('CLIP_C', 0.),
                                     clipvalue=self.params.get('CLIP_V', 0.),
                                     epsilon=self.params.get('EPSILON', 1e-7))
+                optimizer_str += '\n\t EPSILON: ' + str(self.params.get('EPSILON', 1e-7))
 
             elif self.params['OPTIMIZER'].lower() == 'adadelta':
                 optimizer = Adadelta(lr=self.params.get('LR', 1.0),
@@ -246,6 +260,8 @@ def setOptimizer(self, **kwargs):
                                      clipnorm=self.params.get('CLIP_C', 0.),
                                      clipvalue=self.params.get('CLIP_V', 0.),
                                      epsilon=self.params.get('EPSILON', 1e-7))
+                optimizer_str += '\n\t RHO: ' + str(self.params.get('RHO', 0.9)) + \
+                                 '\n\t EPSILON: ' + str(self.params.get('EPSILON', 1e-7))
 
             elif self.params['OPTIMIZER'].lower() == 'adam':
                 if self.params.get('ACCUMULATE_GRADIENTS') > 1:
@@ -258,6 +274,11 @@ def setOptimizer(self, **kwargs):
                                                clipvalue=self.params.get('CLIP_V', 0.),
                                                epsilon=self.params.get('EPSILON', 1e-7),
                                                accum_iters=self.params.get('ACCUMULATE_GRADIENTS'))
+                    optimizer_str += '\n\t BETA_1: ' + str(self.params.get('BETA_1', 0.9)) + \
+                                     '\n\t BETA_2: ' + str(self.params.get('BETA_2', 0.999)) + \
+                                     '\n\t AMSGRAD: ' + str(self.params.get('AMSGRAD', False)) + \
+                                     '\n\t ACCUMULATE_GRADIENTS: ' + str(self.params.get('ACCUMULATE_GRADIENTS')) + \
+                                     '\n\t EPSILON: ' + str(self.params.get('EPSILON', 1e-7))
                 else:
                     optimizer = Adam(lr=self.params.get('LR', 0.001),
                                      beta_1=self.params.get('BETA_1', 0.9),
@@ -267,6 +288,10 @@ def setOptimizer(self, **kwargs):
                                      clipnorm=self.params.get('CLIP_C', 0.),
                                      clipvalue=self.params.get('CLIP_V', 0.),
                                      epsilon=self.params.get('EPSILON', 1e-7))
+                    optimizer_str += '\n\t BETA_1: ' + str(self.params.get('BETA_1', 0.9)) + \
+                                     '\n\t BETA_2: ' + str(self.params.get('BETA_2', 0.999)) + \
+                                     '\n\t AMSGRAD: ' + str(self.params.get('AMSGRAD', False)) + \
+                                     '\n\t EPSILON: ' + str(self.params.get('EPSILON', 1e-7))
 
             elif self.params['OPTIMIZER'].lower() == 'adamax':
                 optimizer = Adamax(lr=self.params.get('LR', 0.002),
@@ -276,7 +301,9 @@ def setOptimizer(self, **kwargs):
                                    clipnorm=self.params.get('CLIP_C', 0.),
                                    clipvalue=self.params.get('CLIP_V', 0.),
                                    epsilon=self.params.get('EPSILON', 1e-7))
-
+                optimizer_str += '\n\t BETA_1: ' + str(self.params.get('BETA_1', 0.9)) + \
+                                 '\n\t BETA_2: ' + str(self.params.get('BETA_2', 0.999)) + \
+                                 '\n\t EPSILON: ' + str(self.params.get('EPSILON', 1e-7))
             elif self.params['OPTIMIZER'].lower() == 'nadam':
                 optimizer = Nadam(lr=self.params.get('LR', 0.002),
                                   beta_1=self.params.get('BETA_1', 0.9),
@@ -285,10 +312,57 @@ def setOptimizer(self, **kwargs):
                                   clipnorm=self.params.get('CLIP_C', 0.),
                                   clipvalue=self.params.get('CLIP_V', 0.),
                                   epsilon=self.params.get('EPSILON', 1e-7))
+                optimizer_str += '\n\t BETA_1: ' + str(self.params.get('BETA_1', 0.9)) + \
+                                 '\n\t BETA_2: ' + str(self.params.get('BETA_2', 0.999)) + \
+                                 '\n\t EPSILON: ' + str(self.params.get('EPSILON', 1e-7))
+
+            elif self.params['OPTIMIZER'].lower() == 'sgdhd':
+                optimizer = SGDHD(lr=self.params.get('LR', 0.002),
+                                  clipnorm=self.params.get('CLIP_C', 10.),
+                                  clipvalue=self.params.get('CLIP_V', 0.),
+                                  hypergrad_lr=self.params.get('HYPERGRAD_LR', 0.001))
+                optimizer_str += '\n\t HYPERGRAD_LR: ' + str(self.params.get('HYPERGRAD_LR', 0.001))
+
+            elif self.params['OPTIMIZER'].lower() == 'qhsgd':
+                optimizer = QHSGD(lr=self.params.get('LR', 0.002),
+                                  momentum=self.params.get('MOMENTUM', 0.0),
+                                  quasi_hyperbolic_momentum=self.params.get('QUASI_HYPERBOLIC_MOMENTUM', 0.0),
+                                  decay=self.params.get('LR_OPTIMIZER_DECAY', 0.0),
+                                  nesterov=self.params.get('NESTEROV_MOMENTUM', False),
+                                  dampening=self.params.get('DAMPENING', 0.),
+                                  clipnorm=self.params.get('CLIP_C', 10.),
+                                  clipvalue=self.params.get('CLIP_V', 0.))
+                optimizer_str += '\n\t MOMENTUM: ' + str(self.params.get('MOMENTUM', 0.0)) + \
+                                 '\n\t QUASI_HYPERBOLIC_MOMENTUM: ' + str(self.params.get('QUASI_HYPERBOLIC_MOMENTUM', 0.0)) + \
+                                 '\n\t DAMPENING: ' + str(self.params.get('DAMPENING', 0.0)) + \
+                                 '\n\t NESTEROV: ' + str(self.params.get('NESTEROV_MOMENTUM', False))
+
+            elif self.params['OPTIMIZER'].lower() == 'qhsgdhd':
+                optimizer = QHSGDHD(lr=self.params.get('LR', 0.002),
+                                    momentum=self.params.get('MOMENTUM', 0.0),
+                                    quasi_hyperbolic_momentum=self.params.get('QUASI_HYPERBOLIC_MOMENTUM', 0.0),
+                                    dampening=self.params.get('DAMPENING', 0.),
+                                    hypergrad_lr=self.params.get('HYPERGRAD_LR', 0.001),
+                                    decay=self.params.get('LR_OPTIMIZER_DECAY', 0.0),
+                                    nesterov=self.params.get('NESTEROV_MOMENTUM', False),
+                                    clipnorm=self.params.get('CLIP_C', 10.),
+                                    clipvalue=self.params.get('CLIP_V', 0.))
+                optimizer_str += '\n\t MOMENTUM: ' + str(self.params.get('MOMENTUM', 0.0)) + \
+                                 '\n\t QUASI_HYPERBOLIC_MOMENTUM: ' + str(self.params.get('QUASI_HYPERBOLIC_MOMENTUM', 0.0)) + \
+                                 '\n\t HYPERGRAD_LR: ' + str(self.params.get('HYPERGRAD_LR', 0.001)) + \
+                                 '\n\t DAMPENING: ' + str(self.params.get('DAMPENING', 0.0)) + \
+                                 '\n\t NESTEROV: ' + str(self.params.get('NESTEROV_MOMENTUM', False))
             else:
                 logging.info('\tWARNING: The modification of the LR is not implemented for the chosen optimizer.')
                 optimizer = eval(self.params['OPTIMIZER'])
 
+        optimizer_str += '\n\t CLIP_C ' + str(self.params.get('CLIP_C', 0.)) + \
+                         '\n\t CLIP_V ' + str(self.params.get('CLIP_V', 0.)) + \
+                         '\n\t LR_OPTIMIZER_DECAY ' + str(self.params.get('LR_OPTIMIZER_DECAY', 0.0)) + \
+                         '\n\t ACCUMULATE_GRADIENTS ' + str(self.params.get('ACCUMULATE_GRADIENTS', 1)) + '\n'
+        if self.verbose > 0:
+            logging.info("Preparing optimizer and compiling. Optimizer configuration: \n" + optimizer_str)
+
         if hasattr(self, 'multi_gpu_model') and self.multi_gpu_model is not None:
             model_to_compile = self.multi_gpu_model
         else:
