# mnist.jl
using MLDatasets: MNIST
using Flux
using LinearAlgebra
using Plots
using Images
using CUDA, cuDNN
using Random
using Statistics
nbrdata = 10_000 # number of training samples taken from the MNIST dataset (with a bigger GPU than a GTX 1050 you can increase this)
device = gpu # if you prefer the CPU, change this to cpu (you can then also drop the device transfers below)
dataset = MNIST(:train)
testset = MNIST(:test)
"""
This function preprocesses the data, meaning, we reshape the data from (28,28,L) to (28,28,1,L) to account for the convolutional layers,
also, we permute the first two dimensions of x_train, this is about how julia Images.jl works if one day you want to see the images,
finally, we transform the targets into one-hot encoding (L length vectors of classes (0 to 9) ) -> 10 by L Matrix of proba)
note : there is a function in flux to one-hot encode arrays but *i thought it would be clearer here.
Send the data to gpu and return them.
"""
function preprocess(features, targets, L, device)
    x_train = features[:, :, 1:L] .|> Float32
    x_train = reshape(x_train, 28, 28, 1, L)
    x_train = permutedims(x_train, (2, 1, 3, 4))
    yy = targets[1:L] .|> Float32
    y_train = zeros(Int, 10, L)
    for i in 1:L
        j = floor(Int, yy[i]) + 1   # class k goes to row k+1
        y_train[j, i] = 1
    end
    x_train = x_train |> device
    y_train = y_train |> device
    return x_train, y_train
end
features = dataset.features
targets = dataset.targets
x_train, y_train = preprocess(features, targets, nbrdata, device)
features_test = testset.features
targets_test = testset.targets
x_test, y_test = preprocess(features_test, targets_test, 10_000, device)
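"""
As mentioned above, Flux ships a one-hot helper. A minimal sketch of the equivalent encoding using
Flux.onehotbatch, purely as a sanity check; nothing below depends on it.
"""
let y_onehot = Flux.onehotbatch(targets[1:nbrdata], 0:9)   # 10-by-nbrdata one-hot matrix
    @assert Matrix(y_onehot) == Matrix(y_train |> cpu)     # same encoding as the manual loop above
end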
"""
Here we define our neural network, mostly compose of convolutional and pooling layers.
architecture :
the x that will be input is of size (28,28,1,L),
Conv((3,3),1=>4,tanh) will create 3*3= 9 filters of size 1*4=4 giving 9*4 = 36 parameters + 4 bias = 40 parameters with a tanh activation function,
the result (see convolution for further details) will be of size ((28-2),(28-2),4,L) = (26,26,4,L),
then MaxPool((2,2)) will reduce the size of the image to (26,26,4,L) = (13,13,4,L), mostly by summing over 2 by 2 submatrix,
then Conv((3,3),4=>8,tanh) will create 3*3= 9 filters of size 4*8=32 giving 9*32 = 288 parameters + 8 bias = 296 parameters with a tanh activation function,
the result will be of size ((13-2),(13-2),8,L) = (11,11,8,L),
again, MaxPool((2,2)) will reduce the size of the image to (11,11,8,L) = (5,5,8,L),
then we flatten the result into a vector of size (5*5*8,L) = (200,L),
then we add a dropout of 0.4 to avoid overfitting, (will set some coefs of the 200 vector to 0 with a probability of 0.4),
Next, we add a fully connected layer of size 50 (200*50 + 50 parameters = 10_050 parameters) with a tanh activation function,
the output will be of size (50,L) ,
and finally, another fully connected layer of size 10 (50*10 + 10 parameters = 510 parameters) with a softmax activation function
We get 10_896 parameters in total to optimise, it is actually quite a fin model compare to some used for MNIST, so don't expect too much from it.
"""
model = Chain(Conv((3,3), 1=>4, tanh),
              MaxPool((2,2)),
              Conv((3,3), 4=>8, tanh),
              MaxPool((2,2)),
              Flux.flatten,
              Flux.Dropout(0.4f0),
              Dense((((28-2)÷2-2)÷2)^2 * 8 => 50, tanh),
              Dense(50 => 10),
              Flux.softmax) |> device
model(x_train) # warm-up forward pass; also checks that the shapes line up
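"""
A hedged sanity check of the shapes and parameter count claimed above. Chain supports indexing,
so we can look at each stage's output on a single image; purely illustrative, nothing below depends on it.
"""
let x1 = x_train[:, :, :, 1:1]
    @show size(model[1](x1))     # (26, 26, 4, 1) after the first Conv
    @show size(model[1:2](x1))   # (13, 13, 4, 1) after the first MaxPool
    @show size(model[1:4](x1))   # (5, 5, 8, 1) after the second Conv + MaxPool
    @show size(model[1:5](x1))   # (200, 1) after flatten
    @show size(model(x1))        # (10, 1) class probabilities
    # 10_896 parameters, as counted above; newer Flux versions also offer Flux.trainables for this
    @show sum(length, Flux.params(model))
end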
"""
The loss used here is a crossentropy loss, won't explain it here, google it.
"""
loss(model, x, y) = Flux.crossentropy(model(x), y)
@show loss(model,x_test,y_test)
Flux.gradient(loss, model, x_test[:, :, :, 1:1], y_test[:, 1:1]); # one-sample gradient, just to check that everything differentiates
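"""
A hedged illustration of what Flux.crossentropy computes: for one-hot targets it is (by default) the mean
over the batch of -log(probability assigned to the true class). The made-up 3-class, 2-sample batch below
is only for illustration; Flux adds a tiny epsilon inside the log, so the numbers agree up to that.
"""
let p = Float32[0.7 0.2; 0.2 0.5; 0.1 0.3], y = Float32[1 0; 0 1; 0 0]
    manual = mean(-sum(y .* log.(p); dims = 1))
    @show manual Flux.crossentropy(p, y)
end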
"""
The optimiser used here is Adam with a learning rate of 0.01, again, won't explain it here. try Descent(0.01) if you want.
"""
opt = Flux.setup(Adam(0.01), model)
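# Hedged alternative mentioned above: plain gradient descent with the same learning rate.
# It usually converges more slowly than Adam on this model.
# opt = Flux.setup(Descent(0.01), model)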
"""
This is where the magic goes, we train the model, it's really simple to read.
"""
function train_model(model, x, y, epochs, opt, bs)
    @assert 1 <= bs <= size(x, 4)
    @inbounds for i in 1:epochs
        @show i, loss(model, x, y)
        # split the data into shuffled mini-batches: with batchsize 500 and 10_000 samples,
        # this yields an iterator of 20 batches of 500 samples per epoch
        data = Flux.DataLoader((x, y), batchsize = bs, shuffle = true)
        for d in data                              # d[1]: a batch of inputs, d[2]: the matching targets
            g = Flux.gradient(loss, model, d[1], d[2])
            Flux.update!(opt, model, g[1])         # update the parameters from the gradient using the optimiser
        end
    end
end
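"""
A hedged illustration of what DataLoader produces with the sizes used below; not needed for training.
"""
let dl = Flux.DataLoader((x_train, y_train), batchsize = 500, shuffle = true)
    @show length(dl)            # 20 mini-batches per epoch for 10_000 samples
    @show size(first(dl)[1])    # (28, 28, 1, 500)
end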
"""
Turn a 10-by-L matrix of class scores (or one-hot targets) back into a vector of predicted digits 0..9.
"""
function evalClass(y)
    argmax.(eachcol(y)) .- 1
end
"""
Build the confusion matrix from Res, whose first column is the predicted class and second column the true class.
Rows of the result are the true class, columns the predicted class.
"""
function confusion_matrix(Res)
    nclasses = maximum(Res) + 1        # sized from the labels actually present, so the indices below never overflow
    cm = zeros(Int, nclasses, nclasses)
    for r in eachrow(Res)
        pred_class = r[1] + 1
        true_class = r[2] + 1
        cm[true_class, pred_class] += 1
    end
    return cm
end
"""
Per-class precision, recall and F1 score from the confusion matrix (rows = true class, columns = predicted class).
"""
function compute_metrics(cm)
    n = size(cm, 1)
    for i in 1:n
        tp = cm[i, i]
        fp = sum(cm[:, i]) - tp              # predicted as class i but actually something else
        fn = sum(cm[i, :]) - tp              # actually class i but predicted as something else
        tn = sum(cm) - tp - fp - fn          # not used below, kept for completeness
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        f1 = 2 * (precision * recall) / (precision + recall)
        println("Class $(i-1): Precision = $precision, Recall = $recall, F1 Score = $f1")
    end
end
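"""
A tiny hedged check of those formulas on a hand-made 2-by-2 confusion matrix
(rows = true class, columns = predicted class): for class 0, tp = 8, fp = 1, fn = 2,
so precision = 8/9 ≈ 0.889 and recall = 8/10 = 0.8.
"""
compute_metrics([8 2; 1 7])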
Flux.Optimisers.adjust!(opt, 0.01) # (re)set the learning rate; handy if you re-run the training below with a different rate
# Little note: if this feels really slow, first give it a second so the GPU wakes up; otherwise, try lowering the number of samples or shrinking the model. It takes around 40 s on my machine, just to give you an idea.
@time train_model(model,x_train,y_train,12,opt,500)
@show loss(model,x_test,y_test)
# Evaluation on the training set: first column of Res is the predicted class, second the true class.
Res = hcat(evalClass(model(x_train) |> cpu), evalClass(y_train |> cpu))
accuracy = mean(Res[:, 1] .== Res[:, 2])
println("Training accuracy: $accuracy")
cm = confusion_matrix(Res)
println("Confusion Matrix: \n", cm)
compute_metrics(cm)
# Same evaluation on the held-out test set.
Res = hcat(evalClass(model(x_test) |> cpu), evalClass(y_test |> cpu))
accuracy = mean(Res[:, 1] .== Res[:, 2])
println("Test accuracy: $accuracy")
cm = confusion_matrix(Res)
println("Confusion Matrix: \n", cm)
compute_metrics(cm)
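"""
The Images and Plots imports at the top are only needed if you want to look at a digit.
A hedged sketch (the permutation done in preprocess means the image is already oriented the Images.jl way):
"""
let i = 1
    img = Gray.(x_test[:, :, 1, i] |> cpu)                     # one test digit as a greyscale image
    pred = evalClass(model(x_test[:, :, :, i:i]) |> cpu)[1]
    println("Predicted class for test image $i: $pred")
    # display(img)   # uncomment in an interactive session to actually see the digit
end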