<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8" />
<meta name="generator" content="pandoc" />
<meta http-equiv="X-UA-Compatible" content="IE=EDGE" />
<title>Tree-based models</title>
<script src="site_libs/header-attrs-2.25/header-attrs.js"></script>
<script src="site_libs/jquery-3.6.0/jquery-3.6.0.min.js"></script>
<meta name="viewport" content="width=device-width, initial-scale=1" />
<link href="site_libs/bootstrap-3.3.5/css/flatly.min.css" rel="stylesheet" />
<script src="site_libs/bootstrap-3.3.5/js/bootstrap.min.js"></script>
<script src="site_libs/bootstrap-3.3.5/shim/html5shiv.min.js"></script>
<script src="site_libs/bootstrap-3.3.5/shim/respond.min.js"></script>
<style>h1 {font-size: 34px;}
h1.title {font-size: 38px;}
h2 {font-size: 30px;}
h3 {font-size: 24px;}
h4 {font-size: 18px;}
h5 {font-size: 16px;}
h6 {font-size: 12px;}
code {color: inherit; background-color: rgba(0, 0, 0, 0.04);}
pre:not([class]) { background-color: white }</style>
<script src="site_libs/navigation-1.1/tabsets.js"></script>
<link href="site_libs/highlightjs-9.12.0/default.css" rel="stylesheet" />
<script src="site_libs/highlightjs-9.12.0/highlight.js"></script>
<link href="site_libs/vembedr-0.1.5/css/vembedr.css" rel="stylesheet" />
<link href="site_libs/font-awesome-6.4.2/css/all.min.css" rel="stylesheet" />
<link href="site_libs/font-awesome-6.4.2/css/v4-shims.min.css" rel="stylesheet" />
<style type="text/css">
code{white-space: pre-wrap;}
span.smallcaps{font-variant: small-caps;}
span.underline{text-decoration: underline;}
div.column{display: inline-block; vertical-align: top; width: 50%;}
div.hanging-indent{margin-left: 1.5em; text-indent: -1.5em;}
ul.task-list{list-style: none;}
</style>
<style type="text/css">code{white-space: pre;}</style>
<script type="text/javascript">
if (window.hljs) {
hljs.configure({languages: []});
hljs.initHighlightingOnLoad();
if (document.readyState && document.readyState === "complete") {
window.setTimeout(function() { hljs.initHighlighting(); }, 0);
}
}
</script>
<style type = "text/css">
.main-container {
max-width: 940px;
margin-left: auto;
margin-right: auto;
}
img {
max-width:100%;
}
.tabbed-pane {
padding-top: 12px;
}
.html-widget {
margin-bottom: 20px;
}
button.code-folding-btn:focus {
outline: none;
}
summary {
display: list-item;
}
details > summary > p:only-child {
display: inline;
}
pre code {
padding: 0;
}
</style>
<style type="text/css">
.dropdown-submenu {
position: relative;
}
.dropdown-submenu>.dropdown-menu {
top: 0;
left: 100%;
margin-top: -6px;
margin-left: -1px;
border-radius: 0 6px 6px 6px;
}
.dropdown-submenu:hover>.dropdown-menu {
display: block;
}
.dropdown-submenu>a:after {
display: block;
content: " ";
float: right;
width: 0;
height: 0;
border-color: transparent;
border-style: solid;
border-width: 5px 0 5px 5px;
border-left-color: #cccccc;
margin-top: 5px;
margin-right: -10px;
}
.dropdown-submenu:hover>a:after {
border-left-color: #adb5bd;
}
.dropdown-submenu.pull-left {
float: none;
}
.dropdown-submenu.pull-left>.dropdown-menu {
left: -100%;
margin-left: 10px;
border-radius: 6px 0 6px 6px;
}
</style>
<script type="text/javascript">
// manage active state of menu based on current page
$(document).ready(function () {
// active menu anchor
href = window.location.pathname
href = href.substr(href.lastIndexOf('/') + 1)
if (href === "")
href = "index.html";
var menuAnchor = $('a[href="' + href + '"]');
// mark the anchor link active (and if it's in a dropdown, also mark that active)
var dropdown = menuAnchor.closest('li.dropdown');
if (window.bootstrap) { // Bootstrap 4+
menuAnchor.addClass('active');
dropdown.find('> .dropdown-toggle').addClass('active');
} else { // Bootstrap 3
menuAnchor.parent().addClass('active');
dropdown.addClass('active');
}
// Navbar adjustments
var navHeight = $(".navbar").first().height() + 15;
var style = document.createElement('style');
var pt = "padding-top: " + navHeight + "px; ";
var mt = "margin-top: -" + navHeight + "px; ";
var css = "";
// offset scroll position for anchor links (for fixed navbar)
for (var i = 1; i <= 6; i++) {
css += ".section h" + i + "{ " + pt + mt + "}\n";
}
style.innerHTML = "body {" + pt + "padding-bottom: 40px; }\n" + css;
document.head.appendChild(style);
});
</script>
<!-- tabsets -->
<style type="text/css">
.tabset-dropdown > .nav-tabs {
display: inline-table;
max-height: 500px;
min-height: 44px;
overflow-y: auto;
border: 1px solid #ddd;
border-radius: 4px;
}
.tabset-dropdown > .nav-tabs > li.active:before, .tabset-dropdown > .nav-tabs.nav-tabs-open:before {
content: "\e259";
font-family: 'Glyphicons Halflings';
display: inline-block;
padding: 10px;
border-right: 1px solid #ddd;
}
.tabset-dropdown > .nav-tabs.nav-tabs-open > li.active:before {
content: "\e258";
font-family: 'Glyphicons Halflings';
border: none;
}
.tabset-dropdown > .nav-tabs > li.active {
display: block;
}
.tabset-dropdown > .nav-tabs > li > a,
.tabset-dropdown > .nav-tabs > li > a:focus,
.tabset-dropdown > .nav-tabs > li > a:hover {
border: none;
display: inline-block;
border-radius: 4px;
background-color: transparent;
}
.tabset-dropdown > .nav-tabs.nav-tabs-open > li {
display: block;
float: none;
}
.tabset-dropdown > .nav-tabs > li {
display: none;
}
</style>
<!-- code folding -->
</head>
<body>
<div class="container-fluid main-container">
<div class="navbar navbar-default navbar-fixed-top" role="navigation">
<div class="container">
<div class="navbar-header">
<button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-bs-toggle="collapse" data-target="#navbar" data-bs-target="#navbar">
<span class="icon-bar"></span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
</button>
<a class="navbar-brand" href="index.html">Machine Learning for Public Policy</a>
</div>
<div id="navbar" class="navbar-collapse collapse">
<ul class="nav navbar-nav">
</ul>
<ul class="nav navbar-nav navbar-right">
<li>
<a href="index.html">
<span class="fa fa-home"></span>
Home
</a>
</li>
<li>
<a href="intro.html">
<span class="fa fa-duotone fa-robot"></span>
Introduction
</a>
</li>
<li>
<a href="predictionpolicy.html">
<span class="fa fa-line-chart"></span>
Prediction Policy Problems
</a>
</li>
<li>
<a href="classification.html">
<span class="fa fa-solid fa-gears"></span>
Classification:Logistic
</a>
</li>
<li>
<a href="treebasedmodels.html">
<span class="fa fa-tree"></span>
TreeModels:RandomForests
</a>
</li>
<li>
<a href="fairml.html">
<span class="fa fa-graduation-cap"></span>
Fair ML/Data Ethics
</a>
</li>
<li>
<a href="NeuralNets.html">
<span class="fa fa-superpowers"></span>
Neural Networks
</a>
</li>
<li>
<a href="PolicyChallenge.html">
<span class="fa fa-thin fa-bolt-lightning"></span>
Policy Challenge
</a>
</li>
<li>
<a href="discussionboard.html">
<span class="fa fa-solid fa-comments"></span>
Discussion Board
</a>
</li>
</ul>
</div><!--/.nav-collapse -->
</div><!--/.container -->
</div><!--/.navbar -->
<div id="header">
<h1 class="title toc-ignore">Tree-based models</h1>
</div>
<div id="tree-based-models-for-classification-problems"
class="section level2 tabset tabset-fade tabset-pills">
<h2 class="tabset tabset-fade tabset-pills"><strong>Tree-based models
for classification problems </strong></h2>
<p><strong>A general overview of tree-based methods</strong></p>
<p>An introduction to Tree-based machine learning models is given to us
by <a href="https://www.linkedin.com/in/fraroma/">Dr. Francisco
Rosales</a>, Assistant Professor at ESAN University (Perú) and Lead Data
Scientist (<a
href="https://www.linkedin.com/company/breinhub/">BREIN</a>). You can
watch the pre-recorded session below:</p>
<center>
<div class="vembedr">
<div>
<iframe src="https://www.youtube.com/embed/l7s3k2TlQeY" width="533" height="300" frameborder="0" allowfullscreen="" data-external="1"></iframe>
</div>
</div>
</center>
<p>Some key points to keep in mind when working through the practical
exercise include:</p>
<ul>
<li><p>Tree-based methods work for both classification and regression
problems.</p></li>
<li><p>Decision Trees are both a logical and a technical tool:</p>
<ul>
<li><p>they involve stratifying or segmenting the predictor space into a
number of simple regions</p></li>
<li><p>from each region, we obtain a relevant metric (e.g. mean/average)
and then use that information to make predictions about the observations
that belong to that region</p></li>
</ul></li>
<li><p>Decision Trees are the simplest version of a tree-based method.
To improve on a simple splitting algorithm, there exist ensemble
learning techniques such as bagging and boosting:</p>
<ul>
<li><p>bagging: also known as bootstrap aggregating, it is an ensemble
technique used to decrease a model’s variance. A <strong>Random
Forest</strong> is a tree-based method that builds on the concept of
bagging. The main idea behind a Random Forest model is that, if you draw
repeated bootstrap resamples of the data that would be used to create a
single decision tree, create one tree for each of these resamples, and
then use a method to “average” the results of all of these different
trees, you should end up with a better model.</p></li>
<li><p>boosting: an ensemble technique mainly used to decrease a model’s
bias. Like bagging, we grow multiple trees from our training dataset.
However, whilst bagging uses the bootstrap to create the various
resamples (from which each tree is born), boosting grows each tree
sequentially, using information from the previously built tree. Boosting
therefore doesn’t use the bootstrap: instead, each tree is fit to a
modified version of the original dataset (each subsequent tree is built
from the residuals of the previous model). A short sketch contrasting
the two ideas follows this list.<br />
<br></p></li>
</ul></li>
</ul>
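<p>To make the contrast concrete, below is a minimal sketch of both
ideas on a simulated toy regression, using single rpart trees as
building blocks. It is illustrative only: the toy data, the number of
trees B, and the shrinkage step of 0.1 are arbitrary choices, not part
of the case study.</p>
<pre class="r"><code>library(rpart) # single-tree building block for both sketches
set.seed(42)
toy <- data.frame(x = runif(200))
toy$y <- sin(2 * pi * toy$x) + rnorm(200, sd = 0.2)
# Bagging: grow B trees on bootstrap resamples, then average their predictions
B <- 25
bag_preds <- sapply(1:B, function(b) {
    idx <- sample(nrow(toy), replace = TRUE) # bootstrap resample
    predict(rpart(y ~ x, data = toy[idx, ]), newdata = toy) # one tree per resample
})
bag_fit <- rowMeans(bag_preds) # averaging across trees lowers variance
# Boosting: grow shallow trees sequentially, each fit to the current residuals
boost_fit <- rep(0, nrow(toy))
res <- toy$y
for (b in 1:B) {
    tree_b <- rpart(res ~ x, data = data.frame(x = toy$x, res = res),
                    control = rpart.control(maxdepth = 2)) # a "weak" learner
    boost_fit <- boost_fit + 0.1 * predict(tree_b) # small (shrunken) update
    res <- toy$y - boost_fit # refit to what is still unexplained
}</code></pre>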
<p>To conclude our Malawi case study, we will apply a Random Forest
algorithm to our classification problem: given a set of features X
(e.g. ownership of a toilet, size of household, etc.), how likely are we
to correctly identify an individual’s income class? Recall that we have
already approached this problem with a linear regression model (and a
lasso linear model) and with a logistic classification (i.e. an eager
learner model). Whilst there was no improvement between the linear and
the lasso linear model, we did increase our model’s predictive ability
when we switched from a linear prediction to a classification approach.
I had previously claimed that the improvement was marginal, but since
the model will be used to determine who does and who doesn’t get an
income supplement (i.e. who is an eligible recipient of a cash transfer,
as part of Malawi’s social protection policies), any improvement is
critical and we should try various methods until we find the one that
best fits our data. <br></p>
<p>Some discussion points before the practical:</p>
<ul>
<li><p>Why did we decide to switch models (from linear to
classification)?</p></li>
<li><p>Intuitively, why did a classification model perform better than a
linear regression at predicting an individual’s social class based on
their monthly per capita consumption?</p></li>
<li><p>How would a Random Forest classification approach improve our
predictive ability? (hint: the answer may be similar to the previous
one)</p></li>
</ul>
<div id="r-practical" class="section level3">
<h3><strong>R practical</strong></h3>
<p><br> As always, start by opening the libraries that you’ll need to
reproduce the script below. We will continue to use the Caret library
for machine learning purposes, and some other general libraries for data
wrangling and visualisation. <br></p>
<pre class="r"><code>rm(list = ls()) # this line cleans your Global Environment.
setwd("/Users/michellegonzalez/Documents/GitHub/Machine-Learning-for-Public-Policy") # set your working directory
# Do not forget to install a package with the install.packages() function if it's the first time you use it!
library(dplyr) # core package for dataframe manipulation. Usually installed and loaded with the tidyverse, but sometimes needs to be loaded in conjunction to avoid warnings.
library(tidyverse) # a large collection of packages for data manipulation and visualisation.
library(caret) # a library with key functions that streamline the process for predictive modelling
library(skimr) # a package with a set of functions to describe dataframes and more
library(plyr) # a package for data wrangling
library(party) # provides a user-friendly interface for creating and analyzing decision trees using recursive partitioning
library(rpart) # recursive partitioning and regression trees
library(rpart.plot) # visualising decision trees
library(rattle) # to obtain a fancy wrapper for the rpart.plot
library(RColorBrewer) # import more colours
# import data
data_malawi <- read_csv("malawi.csv") # the file is directly read from the working directory/folder previously set</code></pre>
<p><br></p>
<p>For this exercise, we will skip all the data pre-processing steps. At
this point, we are all well acquainted with the Malawi dataset, and
should be able to create our binary outcome, poor (or not), and clean
the dataset in general. If you need to, you can always go back to the <a
href="https://www.ml4publicpolicy.com/classification.html">Logistic
Classification tab</a> and repeat the data preparation process described
there. <br></p>
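<p>If you only need a reminder of the outcome-creation step, a minimal
sketch is below. The names <code>consumption</code> and
<code>poverty_line</code> are placeholders rather than the actual column
names; use the definitions from the Logistic Classification tab.</p>
<pre class="r"><code># A minimal sketch with hypothetical names ('consumption', 'poverty_line'):
# flag individuals below the poverty line as poor, and store the outcome
# as a factor so that caret treats the task as classification
data_malawi$poor <- factor(ifelse(data_malawi$consumption < poverty_line, "Y", "N"),
                           levels = c("Y", "N"))</code></pre>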
<h3>
Data Split and Fit
</h3>
<pre class="r"><code>set.seed(1234) # ensures reproducibility of our data split
# data partitioning: train and test datasets
train_idx <- createDataPartition(data_malawi$poor, p = .8, list = FALSE, times = 1)
Train_df <- data_malawi[ train_idx,]
Test_df <- data_malawi[-train_idx,]
# data fit: fit a random forest model
# (be warned that this may take longer to run than previous models)
rf_train <- train(poor ~ .,
data = Train_df,
method = "ranger" # estimates a Random Forest algorithm via the ranger pkg (you may need to install the ranger pkg)
)
# First glimpse at our random forest model
print(rf_train)</code></pre>
<pre><code>## Random Forest
##
## 9025 samples
## 29 predictor
## 2 classes: 'Y', 'N'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 9025, 9025, 9025, 9025, 9025, 9025, ...
## Resampling results across tuning parameters:
##
## mtry splitrule Accuracy Kappa
## 2 gini 0.8108829 0.5557409
## 2 extratrees 0.7698647 0.4280448
## 16 gini 0.7999474 0.5472253
## 16 extratrees 0.8023850 0.5525424
## 30 gini 0.7946432 0.5359787
## 30 extratrees 0.7974024 0.5425408
##
## Tuning parameter 'min.node.size' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 2, splitrule = gini
## and min.node.size = 1.</code></pre>
<p><br> If you read the final box of the print() output, you’ll notice
that, given our input Y and X features and no other information, the
optimal random forest model uses the following:</p>
<ul>
<li><p>mtry = 2: mtry is the number of variables to sample at random at
each split. This is the number we feed to the recursive partitioning
algorithm. At each split, the algorithm draws a fresh random set of mtry
(= 2) candidate variables and picks the best split point among
them.</p></li>
<li><p>splitrule = gini: the splitting rule/algorithm used. Gini, or the
Gini Impurity, is a probability that ranges from <span
class="math inline">\(0\)</span> to <span
class="math inline">\(1\)</span>. The lower the value, the purer the
node (see the quick numerical aside after this list). Recall that a node
that is <span class="math inline">\(100\%\)</span> pure includes only
data from a single class (no noise!), and therefore the splitting
stops.</p></li>
<li><p>Accuracy (or <span class="math inline">\(1\)</span> - the error
rate): at <span class="math inline">\(0.81\)</span>, it improves on our
eager learner classification (logistic) approach by <span
class="math inline">\(0.01\)</span>, a reasonably high level of
accuracy.</p></li>
<li><p>Kappa (adjusted accuracy): at <span
class="math inline">\(0.55\)</span>, it indicates that our random forest
model (on the training data) seems to perform the same as our logistic
model. To make a proper comparison, we need to look at the out-of-sample
prediction evaluation statistics.</p></li>
</ul>
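<p>As a quick numerical aside (not part of the case study output), the
two-class Gini impurity of a node with class proportions <span
class="math inline">\(p\)</span> and <span class="math inline">\(1 -
p\)</span> is <span class="math inline">\(G = 1 - p^2 - (1 -
p)^2\)</span>:</p>
<pre class="r"><code># Gini impurity for a two-class node, as a function of the share p of class Y
gini <- function(p) 1 - p^2 - (1 - p)^2
gini(0.5) # 0.50: a maximally mixed node
gini(0.9) # 0.18: a fairly pure node
gini(1.0) # 0.00: a perfectly pure node, so splitting stops</code></pre>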
<br>
<h3>
Out-of-sample predictions
</h3>
<p><br></p>
<pre class="r"><code># make predictions using the trained model and the test dataset
set.seed(12345)
pr1 <- predict(rf_train, Test_df, type = "raw")
head(pr1) # Yes and No output</code></pre>
<pre><code>## [1] Y Y Y Y Y Y
## Levels: Y N</code></pre>
<pre class="r"><code># evaluate the predictions using the ConfusionMatrix function from Caret pkg
confusionMatrix(pr1, Test_df[["poor"]], positive = "Y") # positive = "Y" indicates that our category of interest is Y (1)</code></pre>
<pre><code>## Confusion Matrix and Statistics
##
## Reference
## Prediction Y N
## Y 1344 324
## N 122 465
##
## Accuracy : 0.8022
## 95% CI : (0.7852, 0.8185)
## No Information Rate : 0.6501
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5379
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9168
## Specificity : 0.5894
## Pos Pred Value : 0.8058
## Neg Pred Value : 0.7922
## Prevalence : 0.6501
## Detection Rate : 0.5960
## Detection Prevalence : 0.7397
## Balanced Accuracy : 0.7531
##
## 'Positive' Class : Y
## </code></pre>
<p>Based on our out-of-sample predictions, the Random Forest algorithm
seems to yield accuracy very similar to that of the logistic
classification algorithm. The performance metrics (accuracy,
sensitivity, specificity, kappa) are the same ones we use for most
classification problems. If you want a refresher on what they mean and
how to interpret them, go back one session for a more thorough
explanation!</p>
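<p>As a sanity check, the headline metrics above can be recovered by
hand from the confusion matrix counts (with Y as the positive
class):</p>
<pre class="r"><code># Hand check using the counts from the confusion matrix above
TP <- 1344; FN <- 122 # poor individuals predicted correctly / missed
FP <- 324;  TN <- 465 # non-poor individuals flagged as poor / predicted correctly
TP / (TP + FN)                  # sensitivity ~ 0.9168
TN / (TN + FP)                  # specificity ~ 0.5894
(TP + TN) / (TP + TN + FP + FN) # accuracy    ~ 0.8022</code></pre>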
<br>
<h3>
Fine-tuning parameters
</h3>
<p><br></p>
<p>We can try to improve our Random Forest model by fine-tuning its
hyperparameters, searching over a grid of candidate values with
cross-validation:</p>
<pre class="r"><code># prepare the grid (create a larger random draw space)
tuneGrid <- expand.grid(mtry = c(1,2, 3, 4),
splitrule = c("gini", "extratrees"),
min.node.size = c(1, 3, 5))
# prepare the folds
trControl <- trainControl( method = "cv",
number=5,
search = 'grid',
classProbs = TRUE,
savePredictions = "final"
) # 5-folds cross-validation
# fine-tune the model with optimised parameters
# (again, be ready to wait a few minutes for this to run)
rf_train_tuned <- train(poor ~ .,
data = Train_df,
method = "ranger",
tuneGrid = tuneGrid,
trControl = trControl
)
# let's see how the fine-tuned model fared
print(rf_train_tuned)</code></pre>
<pre><code>## Random Forest
##
## 9025 samples
## 29 predictor
## 2 classes: 'Y', 'N'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 7219, 7220, 7220, 7221, 7220
## Resampling results across tuning parameters:
##
## mtry splitrule min.node.size Accuracy Kappa
## 1 gini 1 0.7819382 0.4594359
## 1 gini 3 0.7822701 0.4593051
## 1 gini 5 0.7826031 0.4600870
## 1 extratrees 1 0.7404976 0.3310123
## 1 extratrees 3 0.7396100 0.3287570
## 1 extratrees 5 0.7404971 0.3308480
## 2 gini 1 0.8142918 0.5674387
## 2 gini 3 0.8134056 0.5653133
## 2 gini 5 0.8137385 0.5661639
## 2 extratrees 1 0.7828240 0.4695433
## 2 extratrees 3 0.7840429 0.4730871
## 2 extratrees 5 0.7830448 0.4705301
## 3 gini 1 0.8160649 0.5769315
## 3 gini 3 0.8144026 0.5730246
## 3 gini 5 0.8156218 0.5755611
## 3 extratrees 1 0.8089749 0.5519067
## 3 extratrees 3 0.8073122 0.5469591
## 3 extratrees 5 0.8070911 0.5471121
## 4 gini 1 0.8139609 0.5730598
## 4 gini 3 0.8157331 0.5778242
## 4 gini 5 0.8146244 0.5748192
## 4 extratrees 1 0.8115228 0.5636714
## 4 extratrees 3 0.8122979 0.5662566
## 4 extratrees 5 0.8131842 0.5681043
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 3, splitrule = gini
## and min.node.size = 1.</code></pre>
<p><br> Fine-tuning the parameters has not done much for our in-sample
model. The chosen splitting rule is the same, and mtry only moved from 2
to 3. The clearest improvement is in the (training set) Kappa, from
roughly <span class="math inline">\(0.56\)</span> to <span
class="math inline">\(0.58\)</span>. Will out-of-sample predictions
improve? <br></p>
<pre class="r"><code># make predictions using the trained model and the test dataset
set.seed(12345)
pr2 <- predict(rf_train_tuned, Test_df, type = "raw")
head(pr2) # Yes and No output</code></pre>
<pre><code>## [1] Y Y Y Y Y Y
## Levels: Y N</code></pre>
<pre class="r"><code># evaluate the predictions using the ConfusionMatrix function from Caret pkg
confusionMatrix(pr2, Test_df[["poor"]], positive = "Y") # positive = "Y" indicates that our category of interest is Y (1)</code></pre>
<pre><code>## Confusion Matrix and Statistics
##
## Reference
## Prediction Y N
## Y 1316 291
## N 150 498
##
## Accuracy : 0.8044
## 95% CI : (0.7875, 0.8206)
## No Information Rate : 0.6501
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5516
##
## Mcnemar's Test P-Value : 2.617e-11
##
## Sensitivity : 0.8977
## Specificity : 0.6312
## Pos Pred Value : 0.8189
## Neg Pred Value : 0.7685
## Prevalence : 0.6501
## Detection Rate : 0.5836
## Detection Prevalence : 0.7126
## Balanced Accuracy : 0.7644
##
## 'Positive' Class : Y
## </code></pre>
<p><br> Consistent with the improvements on the train set, the
out-of-sample predictions also return a higher adjusted accuracy (Kappa
statistic) and improved specificity, at the cost of slightly lower
sensitivity. The gains are small (e.g. a Kappa increase of roughly <span
class="math inline">\(0.01\)</span>), but we’ll take what we can
get.</p>
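<p>For the curious, the Kappa statistic can also be reproduced by hand
from the tuned confusion matrix above: it rescales observed accuracy by
the accuracy expected from chance agreement alone.</p>
<pre class="r"><code># Cohen's Kappa from the tuned confusion matrix above
n  <- 1316 + 291 + 150 + 498 # total test observations
po <- (1316 + 498) / n # observed accuracy (~0.8044)
pe <- ((1316 + 291) * (1316 + 150) + # chance agreement on Y ...
       (150 + 498) * (291 + 498)) / n^2 # ... plus chance agreement on N
(po - pe) / (1 - pe) # Kappa ~ 0.5516</code></pre>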
<p><br> These results also show that the biggest prediction improvements
happen when we make big decisions - such as foregoing the variability of
continuous outcomes in favour of classes. Exploring classification
algorithms - in this case a logistic and a random forest model - was
definitely worthwhile, but did not yield large returns on our predictive
abilities.</p>
<h3>
Visualising our model
</h3>
<p><br></p>
<p>To close the chapter, let’s have a quick look at the sort of plots we
can make with a Random Forest algorithm.</p>
<pre class="r"><code># we'll need to re-estimate the rf model using rpart
MyRandomForest <- rpart(poor ~ ., data = Train_df)
# visualise the decision tree (first of many in the forest)
fancyRpartPlot(MyRandomForest, palettes = c("Oranges","Blues"), main = "Visualising nodes and splits") </code></pre>
<p><img src="treebasedmodels_files/figure-html/unnamed-chunk-8-1.png" width="672" /></p>
<p>The fancy Rpart Plot returns the flow chart that we have now learned
to call a decision tree. Recall that we have used different packages
(and different specifications) for the Random Forest, so the
visualisation we’re looking at now is not an exact replica of our
preferred fine-tuned model. It is, nonetheless, a good way to help you
understand how classifications and decisions are made with tree-based
methods. If you’d like an in-depth explanation of the plot, you can
visit the <a href="http://www.milbo.org/rpart-plot/prp.pdf">Rpart.plot
pkg documentation</a>.</p>
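<p>Beyond the single-tree flow chart, it is often informative to ask
which features drive the forest’s predictions. A hedged sketch: caret’s
train() forwards extra arguments to ranger, so re-training with ranger’s
impurity-based importance lets us rank predictors with varImp(). (This
re-runs the model, so expect another wait.)</p>
<pre class="r"><code># Re-train the forest, asking ranger to track impurity-based importance
rf_imp <- train(poor ~ .,
                data = Train_df,
                method = "ranger",
                importance = "impurity" # passed through by caret to ranger
)
varImp(rf_imp) # ranks features by their contribution to node purity</code></pre>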
</div>
<div id="python-practical" class="section level3">
<h3><strong>Python practical</strong></h3>
<p><br> As always, start by opening the libraries that you’ll need to
reproduce the script below. We will continue to use the scikit-learn
library for machine learning purposes, and some other general libraries
for data wrangling and visualisation. <br></p>
<pre class="python"><code>#==== Python version: 3.10.12 ====#
# Opening libraries
import sklearn as sk # our trusted Machine Learning library
from sklearn.model_selection import train_test_split # split the dataset into train and test
from sklearn.model_selection import cross_val_score # to obtain the cross-validation score
from sklearn.model_selection import cross_validate # to perform cross-validation
from sklearn.ensemble import RandomForestClassifier # to perform a Random Forest classification model
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay # returns performance evaluation metrics
from sklearn.model_selection import RandomizedSearchCV # for fine-tuning parameters
from scipy.stats import randint # generate random integer
# Tree visualisation
from sklearn.tree import export_graphviz
from IPython.display import Image # for Jupyter Notebook users
import graphviz as gv
# Non-ML libraries
import random # for random state
import csv # a library to read and write csv files
import numpy as np # a library for handling numerical arrays and matrices
import pandas as pd # a library to help us easily navigate and manipulate dataframes
import seaborn as sns # a data visualisation library
import matplotlib.pyplot as plt # a data visualisation library
# Uploading data
malawi = pd.read_csv('/Users/michellegonzalez/Documents/GitHub/Machine-Learning-for-Public-Policy/malawi.csv')</code></pre>
<p><br></p>
<p>For this exercise, we will skip all the data pre-processing steps. At
this point, we are all well acquainted with the Malawi dataset, and
should be able to create our binary outcome, poor (or not), and clean
the dataset in general. If you need to, you can always go back to the <a
href="https://www.ml4publicpolicy.com/classification.html">Logistic
Classification tab</a> and repeat the data preparation process described
there. <br></p>
<h3>
Data Split and Fit
</h3>
<p>Let’s use a simple 80:20 split for train and test data subsets.</p>
<pre class="python"><code># First, recall the df structure
malawi.info() # returns the column number, e.g. hhsize = column number 0, hhsize2 = 1... etc.</code></pre>
<pre><code>## <class 'pandas.core.frame.DataFrame'>
## Int64Index: 11280 entries, 10101002025 to 31202086374
## Data columns (total 29 columns):
## # Column Non-Null Count Dtype
## --- ------ -------------- -----
## 0 hhsize 11280 non-null int64
## 1 hhsize2 11280 non-null int64
## 2 agehead 11280 non-null int64
## 3 agehead2 11280 non-null int64
## 4 north 11280 non-null category
## 5 central 11280 non-null category
## 6 rural 11280 non-null category
## 7 nevermarried 11280 non-null category
## 8 sharenoedu 11280 non-null float64
## 9 shareread 11280 non-null float64
## 10 nrooms 11280 non-null int64
## 11 floor_cement 11280 non-null category
## 12 electricity 11280 non-null category
## 13 flushtoilet 11280 non-null category
## 14 soap 11280 non-null category
## 15 bed 11280 non-null category
## 16 bike 11280 non-null category
## 17 musicplayer 11280 non-null category
## 18 coffeetable 11280 non-null category
## 19 iron 11280 non-null category
## 20 dimbagarden 11280 non-null category
## 21 goats 11280 non-null category
## 22 dependratio 11280 non-null float64
## 23 hfem 11280 non-null category
## 24 grassroof 11280 non-null category
## 25 mortarpestle 11280 non-null category
## 26 table 11280 non-null category
## 27 clock 11280 non-null category
## 28 Poor 11280 non-null category
## dtypes: category(21), float64(3), int64(5)
## memory usage: 1.0 MB</code></pre>
<pre class="python"><code>
# Then, split!
X = malawi.iloc[:, 0:28] # X is a matrix with all variables except the last one, which conveniently is our binary target; note that iloc's upper bound is exclusive
y = malawi.iloc[:, 28] # y is a vector containing our target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12345) # random_state is for reproducibility purposes</code></pre>
<p>Now, let’s fit a Random Forest model:</p>
<pre class="python"><code># data fit: fit a random forest model
rf = RandomForestClassifier(random_state=42) # empty random forest object
rf.fit(X_train, y_train) # fit the rf classifier using the training data</code></pre>
<pre><code>## RandomForestClassifier(random_state=42)</code></pre>
<p>We have now successfully trained a Random Forest model, and there is
no need to go over in-sample predictions. We can simply evaluate the
model’s ability to make out-of-sample predictions.</p>
<br>
<h3>
Out-of-sample predictions
</h3>
<p><br></p>
<pre class="python"><code># predict our test-dataset target variable based on the trained model
y_pred = rf.predict(X_test)
# evaluate the prediction's performance (estimate accuracy score, report confusion matrix)
# create a confusion matrix object (we're improving from our previous confusion matrix exploration ;))
cm = confusion_matrix(y_test, y_pred)
print("Accuracy:", accuracy_score(y_test, y_pred))</code></pre>
<pre><code>## Accuracy: 0.8098404255319149</code></pre>
<pre class="python"><code>print("Precision:", precision_score(y_test, y_pred))</code></pre>
<pre><code>## Precision: 0.8230818008877616</code></pre>
<pre class="python"><code>print("Recall:", recall_score(y_test, y_pred))</code></pre>
<pre><code>## Recall: 0.8964088397790055</code></pre>
<pre class="python"><code>print("Confusion Matrix:", cm)</code></pre>
<pre><code>## Confusion Matrix: [[ 529 279]
## [ 150 1298]]</code></pre>
<pre class="python"><code>ConfusionMatrixDisplay(confusion_matrix=cm).plot() # create confusion matrix plot</code></pre>
<pre><code>## <sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay object at 0x17109a4a0></code></pre>
<pre class="python"><code>plt.show() # display confusion matrix plot created above</code></pre>
<p><img src="treebasedmodels_files/figure-html/unnamed-chunk-13-1.png" width="672" />
Based on our out-of-sample predictions, the Random Forest algorithm
seems to yield pretty similar accuracy in its predictions as the
logistic classification algorithm. If you want a reminder of how to
interpret accuracy, precision and recall scores or how to read the
confusion matrix, go back one session for a thorough explanation!</p>
<br>
<h3>
Fine-tuning parameters
</h3>
<p><br></p>
<p>To improve the performance of our random forest model, we can try
hyperparameter tuning. You can think of the process as optimising the
settings that govern how the model learns. In Python, and for a random
forest model, we can use RandomizedSearchCV to find the optimal
parameters within a range of candidate values.</p>
<pre class="python"><code># define hyperparameters and their ranges in a "parameter_distance" dictionary
parameter_distance = {'n_estimators': randint(50,500),
'max_depth': randint(1,10)
}
# n_estimators: the number of decision trees in the forest (at least 50 and at most 500)
# max_depth: the maximum depth of each decision tree (at least 1 split, and at most 20 splits of the tree into branches)</code></pre>
<p><br> There are other hyperparameters, but a search of the optimal
value of these is a good start to our model optimisation! <br></p>
<pre class="python"><code># Please note that the script below might take a while to run (don't be alarmed if you have to wait a couple of minutes)
# Use a random search to find the best hyperparameters
random_search = RandomizedSearchCV(rf,
param_distributions = parameter_distance,
n_iter=5,
cv=5,
random_state=42)
# Fit the random search object to the training data
random_search.fit(X_train, y_train)</code></pre>
<pre><code>## RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),
##                    n_iter=5,
##                    param_distributions={'max_depth': <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x2bae44d90>,
##                                         'n_estimators': <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x2bad506a0>},
##                    random_state=42)</code></pre>
<pre class="python"><code># create an object / variable that containes the best hyperparameters, according to our search:
best_rf_hype = random_search.best_estimator_
print('Best random forest hyperparameters:', random_search.best_params_)</code></pre>
<pre><code>## Best random forest hyperparameters: {'max_depth': 8, 'n_estimators': 238}</code></pre>
<p><strong>Now we can re-train our model using the retrieved
hyperparameters and evaluate the out-of-sample predictions of the
model.</strong></p>
<pre class="python"><code># for simplicity, store the best parameters again in a variable called x
x = random_search.best_params_
# Train the ranfom forest model using the best max_depth and n_estimators
rf_best = RandomForestClassifier(**x, random_state=1234) # pass the integers from the best parameters with **
rf_best.fit(X_train, y_train)</code></pre>
<pre><code>## RandomForestClassifier(max_depth=8, n_estimators=238, random_state=1234)</code></pre>
<pre class="python"><code># Make out-of-sample predictions
y_pred_hype = rf_best.predict(X_test)
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred_hype)
recall = recall_score(y_test, y_pred_hype)
precision = precision_score(y_test, y_pred_hype)
print(f"Accuracy with best hyperparameters: {accuracy}")</code></pre>
<pre><code>## Accuracy with best hyperparameters: 0.8067375886524822</code></pre>
<pre class="python"><code>print(f"Recall with best hyperparameters: {recall}")</code></pre>
<pre><code>## Recall with best hyperparameters: 0.9233425414364641</code></pre>
<pre class="python"><code>print(f"Precision with best hyperparameters: {precision}")</code></pre>
<pre><code>## Precision with best hyperparameters: 0.8044524669073405</code></pre>
<p><br> It looks like fine-tuning our model parameters has not improved
our accuracy or most other performance evaluation metrics. The one clear
gain is in the recall score, which rose by about <span
class="math inline">\(0.03\)</span>, while precision dipped
slightly.</p>
<p>These results show that the biggest prediction improvements happen
when we make big decisions - such as foregoing the variability of
continuous outcomes in favour of classes. Exploring classification
algorithms - in this case a logistic and a random forest model - was
definitely worthwhile, but did not yield large returns on our predictive
abilities. <br></p>
<h3>
Visualising our model
</h3>
<p><br></p>
<p>To close the chapter, let’s have a quick look at the sort of plots we
can make with a Random Forest algorithm. While we cannot visualise the
entirety of the forest, we can certainly have a look at the first two or
three trees in our forest.</p>
<pre class="python"><code># Select the first (recall in python, the firs element is 0) decision-tree to display from our random forest object:
# (Alternatively, use a for-loop to display the first two, three, four... trees)
tree = rf_best.estimators_[0] # select the first tree from the foress
# transform the tree into a graph object
dot_data = export_graphviz(tree,
feature_names=X_train.columns, # names of columns selected from the X_train dataset
filled=True,
max_depth=2, # how many layers/dimensions we want to display, only 2 after the initial branch in this case
impurity=False,
proportion=True
)
graph = gv.Source(dot_data) # gv.Source helps us display the DOT language source of the graph (needed for rendering the image)
graph.render('tree_visualisation', format='png') # this will save the tree visualisation directly into your working folder</code></pre>
<pre><code>## 'tree_visualisation.png'</code></pre>
<center>
<div class="float">
<img src="tree_visualisation.png" alt="Decision Tree" />
<div class="figcaption">Decision Tree</div>
</div>
</center>
</div>
<div id="practice-at-home" class="section level3">
<h3><strong>Practice at home</strong></h3>
<p>As usual, you can replicate this exercise using the Bolivia dataset.
To let us know that you’re still active, please answer this <a
href="https://maastrichtuniversity.eu.qualtrics.com/jfe/form/SV_1RNFd75exNEZVvU">Qualtrics-based
question</a>.</p>
</div>
</div>
<hr>
<p style="text-align: center;">Copyright © 2022 <i class="fa-light fa-person-to-portal"></i> Michelle González Amador & Stephan Dietrich <i class="fa-light fa-person-from-portal"></i>. All rights reserved.</p>
<p style="text-align: center;"><a href="https://github.com/michelleg06/Machine-Learning-for-Public-Policy" class="fa fa-github"></a></p>
</div>
<script>
// add bootstrap table styles to pandoc tables
function bootstrapStylePandocTables() {
$('tr.odd').parent('tbody').parent('table').addClass('table table-condensed');
}
$(document).ready(function () {
bootstrapStylePandocTables();
});
</script>
<!-- tabsets -->
<script>
$(document).ready(function () {
window.buildTabsets("TOC");
});
$(document).ready(function () {
$('.tabset-dropdown > .nav-tabs > li').click(function () {
$(this).parent().toggleClass('nav-tabs-open');
});
});
</script>
<!-- code folding -->