-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathDecision_tree_prediction.R
94 lines (76 loc) · 3.3 KB
/
Decision_tree_prediction.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#Loading required R packages
library(tidyverse)
library(caret)
library(rpart)
library(party)
library(MLmetrics)
# Walk-forward decision-tree prediction of the GPI column, one country at a
# time.
#
# For every "all_variables_<country>.csv" file found in `path1`:
#   1. the first `train_set` fraction of the rows forms the initial training
#      window and the remaining rows form the test set;
#   2. for each test row, an rpart tree is fitted with caret (10-fold CV,
#      tuneLength = 10) on all rows that precede it, and that single row is
#      predicted; the selected cp and the variable importances are recorded;
#   3. predictions, importances, error metrics and two PDF plots are written
#      to `path2`.
#
# NOTE(review): no RNG seed is set, so the cross-validation folds (and hence
# the selected cp) can differ between runs.

# Input directory with one CSV per country, and output directory for results.
path1 <- "../../all_variables_and_GPI_monthly_all_countries"
path2 <- "../../../dt_results"

# Fraction of each series (taken from the start) used as the initial training
# window. It is also embedded in every output file name.
train_set <- 0.5

# `pattern` is a regular expression, not a glob: anchor it so that only files
# following the expected "all_variables_<country>.csv" naming are picked up.
country_files <- list.files(path1, pattern = "^all_variables_.+\\.csv$")

# Make sure the output directory exists before any (slow) training starts,
# otherwise the first write.csv() would fail after the work has been done.
if (!dir.exists(path2)) {
  dir.create(path2, recursive = TRUE)
}

for (country_file in country_files) {
  # Everything between the "all_variables_" prefix and the ".csv" extension is
  # the country name (it may itself contain underscores or dots).
  country <- sub("^all_variables_(.+)\\.csv$", "\\1", country_file)
  print(country)

  # Load the data
  file_df <- file.path(path1, country_file)
  df_country_initial <- read.csv(file_df, stringsAsFactors = FALSE)
  drops <- c("MonthYear")
  df_country <- df_country_initial[
    , !(names(df_country_initial) %in% drops), drop = FALSE
  ]

  if (!("GPI" %in% names(df_country))) {
    warning("Skipping ", country, ": no GPI column in ", file_df,
            call. = FALSE)
    next
  }

  # Split the data into training and test set (chronological, no shuffling)
  n_obs <- nrow(df_country)
  n_train <- round(n_obs * train_set)
  n_test <- n_obs - n_train
  if (n_train < 1L || n_test < 1L) {
    warning("Skipping ", country, ": not enough rows to build a ",
            "training and a test set", call. = FALSE)
    next
  }
  test.data <- tail(df_country, n_test)

  # Preallocated per-step results: selected complexity parameter, one-step
  # prediction, and variable importances of each fitted model.
  cparameters <- numeric(n_test)
  predictions <- numeric(n_test)
  importance_by_step <- vector("list", n_test)

  for (k in seq_len(n_test)) {
    # Expanding window: the initial training rows plus every test row that
    # precedes the one being predicted.
    train.data <- df_country[seq_len(n_train + k - 1L), , drop = FALSE]

    model <- train(
      GPI ~ .,
      data = train.data,
      method = "rpart",
      trControl = trainControl("cv", number = 10),
      tuneLength = 10
    )

    # Complexity parameter chosen by cross-validation
    cparameters[k] <- model$bestTune$cp

    # One-step prediction for the k-th test row
    predictions[k] <- predict(model, newdata = test.data[k, , drop = FALSE])

    # Variables' importance
    importance_by_step[[k]] <- rownames_to_column(
      varImp(model)$importance,
      var = "var_name"
    )
  }

  # Common prefix of every output file for this country
  out_prefix <- file.path(path2, paste0(country, "_dt_", train_set))

  # Save cp and predictions
  predictions_cp <- data.frame(cp = cparameters, predictions = predictions)
  write.csv(predictions_cp, paste0(out_prefix, "_predictions_cp.csv"))

  # Save the important variables from each training
  df_important_var <- do.call(rbind, importance_by_step)
  write.csv(df_important_var, paste0(out_prefix, "_important_variables.csv"))

  # Model performance metrics (each computed on predictions vs. observed GPI)
  results_analytics <- data.frame(
    RMSE = RMSE(predictions, test.data$GPI),
    Rsquare = R2(predictions, test.data$GPI),
    Mape = MAPE(predictions, test.data$GPI),
    Pearson = cor(test.data$GPI, predictions, method = "pearson")
  )
  print(results_analytics)

  # Save result analytics
  write.csv(results_analytics, paste0(out_prefix, "_results.csv"),
            row.names = TRUE)

  # Scatterplot predicted VS actual data (actual on x, predicted on y)
  pdf(file = paste0(out_prefix, "_scatterplot_gpi", ".pdf"))
  plot(test.data$GPI, predictions,
       xlab = "Actual GPI", ylab = "Predicted GPI")
  abline(lm(predictions ~ test.data$GPI))
  dev.off()

  # Plot predicted VS actual data over the test period
  pdf(file = paste0(out_prefix, "_trends_pred_actual_gpi", ".pdf"))
  # Use a y-range covering both series so the predicted line is not clipped.
  plot(test.data$GPI, type = "l", col = "red",
       main = "Predicted VS Actual GPI", sub = "",
       ylab = "gpi values", xlab = "months",
       ylim = range(c(test.data$GPI, predictions), na.rm = TRUE))
  lines(predictions, col = "blue")
  dev.off()

  # No explicit cleanup is needed before the next iteration: every
  # per-country object above is reassigned before it is used again.
}