-
Notifications
You must be signed in to change notification settings - Fork 30
Open
Description
Hi Felix,
I am studying your C51 code and trying to replicate Rainbow DQN, but I am unsure whether the action-advantage stream should be computed as:
"action_advantage = Lambda(lambda a: a[:, :, :] - K.mean(a[:, :, :], keepdims=True), output_shape=(action_size, z_atoms,))(action_advantage)"
or:
"action_advantage = Lambda(lambda a: a[:, :, :] - K.expand_dims(K.mean(a[:, :, :], axis=1), axis=1), output_shape=(self.action_size, self.z_atoms,))(action_advantage)"
Could you please give me a hand?
Thank you very much for your help.
`
def build_network(self, input_shape, action_size, algorithm=Algorithm.RAINBOW, network_type=NetworkType.RESIDUAL, z_atoms=51):
    """Build and compile a dueling, distributional residual network.

    Layout: a Conv/BatchNorm/ReLU stem, ``self.n_residual_block`` residual
    blocks, a Flatten, then two NoisyDense streams (state value and action
    advantage) that are summed and sliced into one output head per action.

    Args:
        input_shape: Shape passed to ``Input(shape=...)``.
        action_size: Number of actions; one output head is created per action.
        algorithm: Not read by this body.
        network_type: Not read by this body.
        z_atoms: Number of units in each per-action output head.

    Returns:
        None. The model is built and compiled, but this body neither returns
        it nor stores it on ``self``.
    """
    # NOTE(review): the compiled model is dropped at the end of this function
    # as written -- confirm whether a ``return model`` / ``self.model = model``
    # was lost from the original source.

    inputs_x = x = Input(shape=input_shape)

    # Convolutional stem.
    x = Conv2D(
        filters=self.cnn_filter_num,
        kernel_size=self.cnn_filter_size,
        padding="same",
        data_format=self.data_format,
        kernel_regularizer=l2(self.l2_reg),
    )(x)
    # NOTE(review): axis=1 presumably matches a channels-first
    # ``self.data_format`` -- confirm.
    x = BatchNormalization(axis=1)(x)
    x = Activation("relu")(x)

    # Residual tower: two Conv/BatchNorm stages per block plus a skip connection.
    for block_idx in range(self.n_residual_block):
        in_x = x
        x = Conv2D(
            filters=self.cnn_filter_num,
            kernel_size=self.cnn_filter_size,
            padding="same",
            data_format=self.data_format,
            kernel_regularizer=l2(self.l2_reg),
            name="res" + str(block_idx) + "_Conv1",
        )(x)
        x = BatchNormalization(axis=1, name="res" + str(block_idx) + "_batchnorm1")(x)
        x = Activation("relu")(x)
        x = Conv2D(
            filters=self.cnn_filter_num,
            kernel_size=self.cnn_filter_size,
            padding="same",
            data_format=self.data_format,
            kernel_regularizer=l2(self.l2_reg),
            name="res" + str(block_idx) + "_Conv2",
        )(x)
        x = BatchNormalization(axis=1, name="res" + str(block_idx) + "_batchnorm2")(x)
        x = Add()([in_x, x])
        x = Activation("relu")(x)

    x = Flatten()(x)

    # State-value stream: z_atoms units, given a singleton action axis so it
    # broadcasts against the advantage stream when the two are summed.
    state_value = NoisyDense(self.noisydense_units, self.noisydense_init_sigma, self.noisydense_activation)(x)
    state_value = NoisyDense(1 * z_atoms, self.noisydense_init_sigma, self.noisydense_activation_last)(state_value)
    # NOTE(review): the tensor produced here is (batch, 1, z_atoms) while the
    # declared output_shape says (action_size, z_atoms). The declaration is
    # left as-is because the declared shapes of the two summed streams then
    # agree -- confirm this is intentional.
    state_value = Lambda(
        lambda s: K.expand_dims(s, axis=1),
        output_shape=(action_size, z_atoms,),
    )(state_value)

    # Action-advantage stream: action_size * z_atoms units reshaped to
    # (batch, action_size, z_atoms).
    action_advantage = NoisyDense(self.noisydense_units, self.noisydense_init_sigma, self.noisydense_activation)(x)
    action_advantage = NoisyDense(action_size * z_atoms, self.noisydense_init_sigma, self.noisydense_activation_last)(action_advantage)
    action_advantage = Lambda(
        lambda a: K.reshape(a, [-1, action_size, z_atoms]),
        output_shape=(action_size, z_atoms,),
    )(action_advantage)
    # Dueling aggregation: centre the advantages by their mean over the
    # ACTION axis only (axis=1). A mean without ``axis`` would also average
    # over the batch and atom axes, coupling unrelated samples and atoms.
    action_advantage = Lambda(
        lambda a: a - K.mean(a, axis=1, keepdims=True),
        output_shape=(action_size, z_atoms,),
    )(action_advantage)

    # Value + centred advantage; the value's singleton action axis broadcasts.
    state_action_value = Add()([state_value, action_advantage])

    # One output head per action. The index is bound as a default argument so
    # each Lambda keeps its own action index instead of sharing the loop
    # variable (late-binding closure).
    output_distribution_list = [
        Lambda(lambda sa, idx=action_idx: sa[:, idx, :], output_shape=(z_atoms,))(state_action_value)
        for action_idx in range(action_size)
    ]

    model = Model(inputs=inputs_x, outputs=output_distribution_list)
    model.compile(loss='categorical_crossentropy', optimizer=rmsprop(lr=self.learning_rate))
Metadata
Metadata
Assignees
Labels
No labels