# imdb using GloVe.R
#
# What is happening here.
# We read in the original IMDB reviews, tokenize them, and assign each one a
# label. The samples (review, label) are then split into training and
# validation sets.
# Next, the GloVe word embeddings are read: 400,000 words, each with a
# 100-dimensional embedding vector. These vectors can serve as the weights of
# an embedding layer, so the next step is to pack them into a matrix that is
# used to initialize (and freeze) the embedding layer's weights.
# A model predicting the sentiment of a review is then trained on only 200
# samples on top of that pretrained embedding layer.
# Question: why should a pretrained embedding layer help at all in achieving
# better prediction accuracy for this problem?
#
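# A likely answer (not verified here): with only 200 labelled reviews there is
# far too little data to learn useful 100-dimensional word vectors from
# scratch, so a randomly initialized embedding layer mostly memorizes the tiny
# training set. The GloVe vectors were fit on a much larger corpus and already
# place semantically related words close together, so the dense layers on top
# only have to learn the sentiment mapping, not the word geometry.
#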
library(keras)
imdb_dir <- "~/Documents/GitHub/Keras/aclImdb"
train_dir <- file.path(imdb_dir, "train")
labels <- c()
texts <- c()
for (label_type in c("neg", "pos")) {
  label <- switch(label_type, neg = 0, pos = 1)
  dir_name <- file.path(train_dir, label_type)
  for (fname in list.files(dir_name, pattern = glob2rx("*.txt"),
                           full.names = TRUE)) {
    texts <- c(texts, readChar(fname, file.info(fname)$size))
    labels <- c(labels, label)
  }
}
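# At this point texts holds the raw training reviews (12,500 negative followed
# by 12,500 positive) and labels the matching 0/1 sentiment for each one.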
# Tokenizing the text
maxlen <- 100                 # cut reviews off after 100 words
training_samples <- 200       # train on only 200 samples
validation_samples <- 10000   # validate on 10,000 samples
max_words <- 10000            # consider only the top 10,000 words in the dataset
tokenizer <- text_tokenizer(num_words = max_words) %>%
  fit_text_tokenizer(texts)
sequences <- texts_to_sequences(tokenizer, texts)
word_index <- tokenizer$word_index
cat("Found", length(word_index), "unique tokens.\n")
# pad_sequences() truncates longer reviews and zero-pads shorter ones so every
# row of data has exactly maxlen entries.
data <- pad_sequences(sequences, maxlen = maxlen)
labels <- as.array(labels)
cat("Shape of data tensor:", dim(data), "\n")
cat("Shape of label tensor:", dim(labels), "\n")
indices <- sample(1:nrow(data))
training_indices <- indices[1:training_samples]
validation_indices <- indices[(training_samples + 1):
                                (training_samples + validation_samples)]
x_train <- data[training_indices,]
y_train <- labels[training_indices]
x_val <- data[validation_indices,]
y_val <- labels[validation_indices]
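# Note: the shuffle above matters because the reviews were read in order, all
# negative first and then all positive; training_samples is deliberately tiny
# (200 reviews) to simulate a small-data setting.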
# Parsing the GloVe word-embeddings file
#
# Each line of glove.6B.100d.txt is a word followed by its 100 coefficients,
# separated by spaces. The vectors are stored in an environment used as a
# hash map from word to numeric vector.
glove_dir <- "~/Documents/GitHub/Keras/GloVe/glove.6B"
lines <- readLines(file.path(glove_dir, "glove.6B.100d.txt"))
embeddings_index <- new.env(hash = TRUE, parent = emptyenv())
for (i in seq_along(lines)) {
  line <- lines[[i]]
  values <- strsplit(line, " ")[[1]]
  word <- values[[1]]
  embeddings_index[[word]] <- as.double(values[-1])
}
cat("Found", length(embeddings_index), "word vectors.\n")
# Preparing the GloVe word-embeddings matrix
#
embedding_dim <- 100
embedding_matrix <- array(0, c(max_words, embedding_dim))
for (word in names(word_index)) {
  index <- word_index[[word]]
  if (index < max_words) {
    embedding_vector <- embeddings_index[[word]]
    # Words not found in the GloVe index keep an all-zero row. Row 1 of the
    # matrix (Keras index 0) is reserved as a padding placeholder rather than a
    # real word, hence the index + 1 offset.
    if (!is.null(embedding_vector))
      embedding_matrix[index + 1, ] <- embedding_vector
  }
}
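# Quick illustrative sanity check (assumes the common word "the" appears in
# both the IMDB vocabulary and GloVe): its row should no longer be all zeros.
all(embedding_matrix[word_index[["the"]] + 1, ] == 0)   # expected: FALSE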
model <- keras_model_sequential() %>%
  layer_embedding(input_dim = max_words, output_dim = embedding_dim,
                  input_length = maxlen) %>%
  layer_flatten() %>%
  layer_dense(units = 32, activation = "relu") %>%
  layer_dense(units = 1, activation = "sigmoid")
summary(model)
# Load the GloVe matrix into the embedding layer and freeze it so the
# pretrained vectors are not updated during training. Note that get_layer()
# in the R interface indexes layers from 1, so the embedding layer is index 1.
get_layer(model, index = 1) %>%
  set_weights(list(embedding_matrix)) %>%
  freeze_weights()
model %>% compile(
  optimizer = "rmsprop",
  loss = "binary_crossentropy",
  metrics = c("acc")
)
history <- model %>% fit(
  x_train, y_train,
  epochs = 20,
  batch_size = 32,
  validation_data = list(x_val, y_val)
)
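# Optional: inspect the training and validation curves (keras provides a plot
# method for the returned history object).
plot(history)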
save_model_weights_hdf5(model, "pre_trained_glove_model.h5")
###############################################################################
#
# Model evaluation
#
test_dir <- file.path(imdb_dir, "test")
labels <- c()
texts <- c()
for (label_type in c("neg", "pos")) {
  label <- switch(label_type, neg = 0, pos = 1)
  dir_name <- file.path(test_dir, label_type)
  for (fname in list.files(dir_name, pattern = glob2rx("*.txt"),
                           full.names = TRUE)) {
    texts <- c(texts, readChar(fname, file.info(fname)$size))
    labels <- c(labels, label)
  }
}
sequences <- texts_to_sequences(tokenizer, texts)
x_test <- pad_sequences(sequences, maxlen = maxlen)
y_test <- as.array(labels)
model %>%
  load_model_weights_hdf5("pre_trained_glove_model.h5") %>%
  evaluate(x_test, y_test)
# $loss
# [1] 0.8601566
#
# $acc
# [1] 0.56116
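# Roughly 56% test accuracy: with only 200 training reviews the model overfits
# quickly, so the absolute numbers stay modest; the comparison of interest is
# against the same architecture trained without pretrained embeddings below.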
################################################################################
#
# Training the same model without pretrained word embeddings
#
model <- keras_model_sequential() %>%
  layer_embedding(input_dim = max_words, output_dim = embedding_dim,
                  input_length = maxlen) %>%
  layer_flatten() %>%
  layer_dense(units = 32, activation = "relu") %>%
  layer_dense(units = 1, activation = "sigmoid")
model %>% compile(
  optimizer = "rmsprop",
  loss = "binary_crossentropy",
  metrics = c("acc")
)
history <- model %>% fit(
  x_train, y_train,
  epochs = 20,
  batch_size = 32,
  validation_data = list(x_val, y_val)
)
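# For a like-for-like comparison with the pretrained-embedding run above, this
# baseline can be evaluated on the same held-out test set (sketch; reuses the
# x_test and y_test prepared in the evaluation section):
model %>% evaluate(x_test, y_test)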