main.R
library(data.table)
library(tidytext)
library(magrittr)
library(dplyr)
library(keras)  # the calls below (text_tokenizer, pad_sequences, layer_*) are from the keras package, not kerasR
options(scipen = 999)
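# Input CSVs: `question_text` holds the raw question; train additionally has a binary `target`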
train <- fread('../input/train.csv', data.table = FALSE)
test <- fread('../input/test.csv', data.table = FALSE)
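# Vocabulary cap for the tokenizer and fixed length that sequences are padded/truncated to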
max_words <- 15000
maxlen <- 64
texts <- train$question_text
tokenizer <- text_tokenizer(num_words = max_words) %>%
fit_text_tokenizer(texts)
sequences <- texts_to_sequences(tokenizer, texts)
word_index <- tokenizer$word_index
data <- pad_sequences(sequences, maxlen = maxlen)
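# Optional sanity check (not in the original script): the padded matrix should be nrow(train) x maxlen
cat("padded data:", paste(dim(data), collapse = " x "), "\n")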
set.seed(1337)
embeddings <- readLines('../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec')
embeddings_index <- new.env(hash = TRUE, parent = emptyenv())
embeddings <- embeddings[2:length(embeddings)]  # the first line of a fastText .vec file is a "<vocab> <dim>" header
for (i in seq_along(embeddings)) {
  values <- strsplit(embeddings[[i]], " ")[[1]]
  word <- values[[1]]                                # first token is the word itself
  embeddings_index[[word]] <- as.double(values[-1])  # the rest are the 300 vector components
}
# Build the embedding matrix; rows are offset by 1 because Keras reserves index 0 for padding
word_vectors <- array(0, c(max_words, 300))
for (word in names(word_index)) {
  index <- word_index[[word]]  # the tokenizer's integer index for this word
  if (index < max_words) {
    embedding_vector <- embeddings_index[[word]]
    if (!is.null(embedding_vector))
      word_vectors[index + 1, ] <- embedding_vector
  }
}
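# Optional diagnostic (an assumption, not in the original script): share of the
# tokenizer vocabulary that actually has a pretrained vector
hits <- vapply(names(word_index), function(w) !is.null(embeddings_index[[w]]), logical(1))
cat(sprintf("embedding coverage: %.1f%% of the tokenizer vocabulary\n", 100 * mean(hits)))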
labels <- train$target
# Shuffle the rows; every row is kept for training (there is no hold-out split here)
indices <- sample(1:nrow(data))
training_indices <- indices[1:nrow(data)]
x_train <- data[training_indices, ]
y_train <- labels[training_indices]
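# A common variant (an assumption, not what this script does): hold out part of the
# shuffled indices for validation and pass x_val/y_val to fit() below, e.g.:
#   val_n <- floor(0.1 * length(indices))
#   val_indices <- indices[1:val_n]
#   training_indices <- indices[(val_n + 1):length(indices)]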
input <- layer_input(
  shape = list(NULL),
  dtype = "int32",
  name = "input"
)
output <- input %>%
  layer_embedding(input_dim = max_words, output_dim = 300, name = "embedding") %>%
  layer_lstm(units = maxlen, dropout = 0.25, recurrent_dropout = 0.25,
             return_sequences = FALSE, name = "lstm") %>%
  layer_dense(units = 128, activation = "relu", name = "dense") %>%
  layer_dense(units = 1, activation = "sigmoid", name = "predictions")
model <- keras_model(input, output)
# Load the pretrained fastText vectors into the embedding layer and freeze it
embedding_layer <- get_layer(model, name = "embedding")
set_weights(embedding_layer, list(word_vectors))
freeze_weights(embedding_layer)
model %>% compile(
optimizer = optimizer_adam(),
loss = "binary_crossentropy",
metrics = "binary_accuracy"
)
history <- model %>% fit(
x_train,
y_train,
batch_size = 2048,
epochs = 35,
view_metrics = FALSE,
verbose = 0
)
print(model)
print(history)
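# The test set is loaded above but never scored; a minimal inference sketch
# (assumptions: test.csv has `qid` and `question_text` columns as in the usual
# Kaggle layout, and the 0.5 threshold is illustrative)
test_sequences <- texts_to_sequences(tokenizer, test$question_text)
test_data <- pad_sequences(test_sequences, maxlen = maxlen)
preds <- predict(model, test_data)
fwrite(data.frame(qid = test$qid, prediction = as.integer(preds > 0.5)), 'submission.csv')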