-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy path3_Word2vec_Model.R
127 lines (96 loc) · 4.84 KB
/
3_Word2vec_Model.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
## 3 - Word2Vec Model
#
# This script loads the sentiment score dataset created by the `2_Sentiment_Score_Creator.R`
# script and then goes through the process of building a classifier to make new sentiment
# predictions based on this data. This particular script uses Word2Vec to create the word
# embeddings and a standard Spark ML logistic regression classifier to create the model.
library(dplyr)
library(sparklyr)
# Create the Spark connection.
#
# `spark_master` is the single place to choose where Spark runs. It feeds both
# the `spark.master` config entry and the `master` argument of
# `spark_connect()`, so the two values cannot drift apart.
spark_master <- "local"

config <- spark_config()
config$spark.driver.memory <- "8g"
# Set explicitly because passing master = "local" to spark_connect() on its own
# was having issues. If you're running on your local machine, you shouldn't
# need it.
config$spark.master <- spark_master
config$`sparklyr.cores.local` <- 2
config$`sparklyr.shell.driver-memory` <- "8G"
config$spark.memory.fraction <- 0.9

# If you want to run in distributed mode, change `spark_master` above to
# "yarn-client" and uncomment these lines.
#config$spark.executor.memory <- "8g"
#config$spark.executor.cores <- "2"

# If you're running this on CML in distributed mode, uncomment the following and
# make sure your project has an environment variable named STORAGE that points
# to the right hive warehouse storage location.
# See: https://github.com/fastforwardlabs/cml_churn_demo_mlops/blob/master/0_bootstrap.py
#storage <- Sys.getenv("STORAGE")
#config$spark.yarn.access.hadoopFileSystems <- storage

sc <- spark_connect(master = spark_master, config = config)

# Load the dataset from the previous step
sentence_scores <- spark_read_parquet(sc, "data/sentence_scores/")
## Creating a word2vec representation
#
# To train the model we need a numeric representation of each sentence that can be passed to
# the Logistic Regression classifier model. This is known as a word embedding, and the process
# we're using here is the built-in Spark Word2Vec function
# (`ft_word2vec` - https://spark.rstudio.com/reference/ft_word2vec/). `ft_word2vec` is a
# transformer and will create a new column with a numeric representation of each sentence.
# The data set is split into a test and training set for later validation. There are some
# other steps needed to get each sentence into the right format for the word2vec operation
# to run; these are part of the overall pipeline.
# This includes:
# - removing punctuation
# - tokenizing (separating the sentence into individual words)
# - stop word removal (words like: "a, the, if, and" are not useful)
# To make this a single-step pipeline and not have to use `regexp_replace` separately, the
# code below creates a new dplyr query, `ft_remove_punctuation`, and uses the
# `ft_dplyr_transformer` function to put it into a format that makes it compatible with an
# `ml_pipeline` pipeline.
# Lazy dplyr query (not a function, despite the `ft_` prefix) that cleans the
# `spoken_words` column of `sentence_scores`. It is handed to
# `ft_dplyr_transformer()` when the pipeline is assembled.
ft_remove_punctuation <- sentence_scores %>%
  mutate(
    # First pass: delete apostrophes (replaced with the empty string).
    spoken_words = regexp_replace(spoken_words, "\'", ""),
    # Second pass: replace each remaining punctuation character with a space.
    spoken_words = regexp_replace(spoken_words, "[_\"():;,.!?\\-]", " ")
  )
# Assemble the (unfitted) feature pipeline that turns `spoken_words` into a
# Word2Vec vector stored in the `result` column.
word2vec_pipeline <- ml_pipeline(sc) %>%
  # Stage 1: punctuation clean-up defined by the `ft_remove_punctuation` query.
  ft_dplyr_transformer(tbl = ft_remove_punctuation) %>%
  # Stage 2: split each sentence into a list of words.
  ft_tokenizer(
    input_col = "spoken_words",
    output_col = "word_list"
  ) %>%
  # Stage 3: drop stop words from the token list.
  ft_stop_words_remover(
    input_col = "word_list",
    output_col = "wo_stop_words"
  ) %>%
  # Stage 4: learn the embedding and emit one vector per sentence.
  ft_word2vec(
    input_col = "wo_stop_words",
    output_col = "result",
    vector_size = 400,
    min_count = 5,
    max_iter = 25,
    step_size = 0.0125
  )
# Fit the pipeline on the full sentence dataset. This can take a long time, so
# go get a coffee.
w2v_model_fitted <- word2vec_pipeline %>%
  ml_fit(sentence_scores)

# Persist the fitted pipeline so the Shiny app can load it. The path is relative
# to the local working directory because Spark is running in local mode here;
# change it if you are running in distributed mode.
ml_save(w2v_model_fitted, path = "models/pipeline_w2v", overwrite = TRUE)
## Logistic Regression Model
#
# For this logistic regression model, the word2vec vector created above will be used to
# predict the `sent_binary` values. This is a fairly basic operation. The complexity is in
# creating the vector representation of the sentences.
# Run the fitted pipeline over the sentences to add the `result` vector column.
w2v_transformed <- ml_transform(w2v_model_fitted, sentence_scores)

# Split the data into a 70/30 train / test set. The seed is fixed so that the
# split, and therefore the evaluation metric reported later, is reproducible
# from one run to the next.
w2v_transformed_split <- w2v_transformed %>%
  sdf_random_split(training = 0.7, test = 0.3, seed = 42L)

# Train the model: predict `sent_binary` from the Word2Vec vector in `result`.
lr_model_w2v <- w2v_transformed_split$training %>%
  ml_logistic_regression(
    sent_binary ~ result,
    max_iter = 500,
    elastic_net_param = 0.0,
    reg_param = 0.01
  )
# The model can be evaluated using the `ml_binary_classification_evaluator`
# function. First score the held-out test set.
pred_lr_test <- ml_predict(lr_model_w2v, w2v_transformed_split$test)

# Area under the ROC curve is computed from the raw prediction column. The
# binary evaluator has no `prediction_col` argument (that belongs to the
# multiclass and regression evaluators), so the column is named explicitly via
# `raw_prediction_col`.
ml_binary_classification_evaluator(
  pred_lr_test,
  label_col = "label",
  raw_prediction_col = "rawPrediction",
  metric_name = "areaUnderROC"
)
# The original run reported roughly 89%, which seems reasonable.
# Persist the trained logistic regression model for use by the Shiny app,
# replacing any previously saved copy at the same path.
ml_save(lr_model_w2v, path = "models/lr_model_w2v", overwrite = TRUE)