---
title: "Reinforcement learning"
---
# Reinforcement learning

This is just a teaser; run or adapt it if you like.

Objective: train a neural network capable of balancing a pole. The script below sketches a small deep Q-network (DQN) with an experience-replay buffer and an epsilon-greedy exploration policy.

The environment runs on a local server, so please install and start the [gym HTTP API](https://github.com/openai/gym-http-api) first. Alternatively, work through this [Colab notebook](https://colab.research.google.com/github/tensorflow/agents/blob/master/docs/tutorials/6_reinforce_tutorial.ipynb).
``` r
library(keras)
library(tensorflow)
library(gym)
set_random_seed(321L, disable_gpu = FALSE) # Already sets R's random seed.
remote_base = "http://127.0.0.1:5000"
client = create_GymClient(remote_base)
env = gym::env_create(client, "CartPole-v1")
gym::env_list_all(client)
env_reset(client, env)
# action = env_action_space_sample(client, env)
step = env_step(client, env, 1)
env_reset(client, env)
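# Note: goal_steps, score_requirement, and initial_games are not used below.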
goal_steps = 500
score_requirement = 60
initial_games = 1000
state_size = 4L
action_size = 2L
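# gamma discounts future rewards; actions are chosen epsilon-greedily, with
# epsilon decaying toward epsilon_min during replay.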
gamma = 0.95
epsilon = 0.95
epsilon_min = 0.01
epsilon_decay = 0.995
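# Q-network: maps the 4-dimensional CartPole state to one Q-value per action.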
model = keras_model_sequential()
model %>%
  layer_dense(input_shape = c(state_size), units = 20L, activation = "relu") %>%
  layer_dense(units = 20L, activation = "relu") %>%
  layer_dense(action_size, activation = "linear")

model %>%
  keras::compile(loss = loss_mean_squared_error, optimizer = optimizer_adamax())
# Replay buffer layout: state 1:4, action 5, reward 6, next_state 7:10, done 11.
memory = matrix(0, nrow = 8000L, ncol = 11L)
counter = 1

remember = function(memory, state, action, reward, next_state, done){
  row = ((counter - 1L) %% nrow(memory)) + 1L # Overwrite the oldest row once full.
  memory[row, ] = as.numeric(c(state, action, reward, next_state, done))
  counter <<- counter + 1
  return(memory)
}
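# Epsilon-greedy policy: with probability epsilon take a random action,
# otherwise the action with the highest predicted Q-value.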
act = function(state){
  if(runif(1) <= epsilon){ return(sample(0:1, 1)) }
  act_prob = predict(model, matrix(state, nrow = 1L))
  return(which.max(act_prob) - 1L)
}
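# Experience replay: sample stored transitions and regress the Q-network toward
# the one-step target r + gamma * max_a' Q(next_state, a').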
replay = function(batch_size = 25L, memory, counter){
  indices = sample.int(min(counter, nrow(memory)), batch_size)
  batch = memory[indices, , drop = FALSE]
  for(i in 1:nrow(batch)){
    target = batch[i, 6] # Reward.
    action = batch[i, 5] # Action.
    state = matrix(batch[i, 1:4], nrow = 1L)
    next_state = matrix(batch[i, 7:10], nrow = 1L)
    if(!batch[i, 11]){ # Not done: add the discounted value of the best next action.
      target = batch[i, 6] + gamma * max(predict(model, next_state))
    }
    target_f = predict(model, state)
    target_f[action + 1L] = target # Update only the Q-value of the action taken.
    model$fit(x = state, y = target_f, epochs = 1L, verbose = 0L)
    if(epsilon > epsilon_min){ epsilon <<- epsilon_decay * epsilon }
  }
}
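# Training loop: play episodes, store each transition in the replay buffer, and
# learn from minibatches once enough experience has been collected.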
done = 0
for(e in 1:100){
  state = unlist(env_reset(client, env))
  for(time in 1:500){
    action = act(state)
    response = env_step(client, env, action = action)
    done = as.integer(response$done)
    # Terminal steps are penalized instead of receiving the usual reward.
    if(!done){ reward = response$reward }
    else{ reward = -10 }
    next_state = unlist(response$observation)
    memory = remember(memory, state, action, reward, next_state, done)
    state = next_state
    if(done){
      cat("episode:", e, "/ 100 score:", time, " eps:", epsilon, "\n")
      break
    }
    if(counter > 32L){ replay(32L, memory, counter - 1L) }
  }
}
```
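Once training finishes, you can watch the learned policy without exploration. This is a minimal sketch that assumes the `client`, `env`, and `model` objects from the script above are still in scope:

``` r
# Greedy rollout: always take the action with the highest predicted Q-value.
state = unlist(env_reset(client, env))
for(time in 1:500){
  q_values = predict(model, matrix(state, nrow = 1L))
  action = which.max(q_values) - 1L
  response = env_step(client, env, action = action)
  state = unlist(response$observation)
  if(response$done){
    cat("Greedy episode finished after", time, "steps\n")
    break
  }
}
```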