-
Notifications
You must be signed in to change notification settings - Fork 15
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
38 changed files
with
4,928 additions
and
0 deletions.
There are no files selected for viewing
119 changes: 119 additions & 0 deletions
119
docs/articles/AutoML24Workshop & MScThesis/01_MSc_multiclass_datasets_selection.Rmd
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
--- | ||
title: "MSc multiclass datasets selection.Rmd" | ||
author: "Hubert Ruczyński" | ||
date: "`r Sys.Date()`" | ||
output: | ||
html_document: | ||
toc: yes | ||
toc_float: yes | ||
toc_collapsed: yes | ||
theme: lumen | ||
toc_depth: 3 | ||
number_sections: yes | ||
code_folding: hide | ||
latex_engine: xelatex | ||
--- | ||
|
||
```{css, echo=FALSE} | ||
body .main-container { | ||
max-width: 1820px !important; | ||
width: 1820px !important; | ||
} | ||
body { | ||
max-width: 1820px !important; | ||
width: 1820px !important; | ||
font-family: Helvetica !important; | ||
font-size: 16pt !important; | ||
} | ||
h1,h2,h3,h4,h5,h6{ | ||
font-size: 24pt !important; | ||
} | ||
``` | ||
|
||
# Imports | ||
|
||
We import the forester package to use the check_data function. | ||
|
||
```{r message=FALSE, warning=FALSE} | ||
library(forester) | ||
``` | ||
|
||
# Loading data | ||
|
||
We load the data from the RData files. | ||
|
||
```{r} | ||
# Both inputs are read with readRDS(), i.e. each file holds a single serialized
# R object despite the .RData extension.
regression <- readRDS(file = "regression_bench.RData")
CC18 <- readRDS(file = "CC18.RData")
``` | ||
|
||
# Multiclass tasks selection | ||
|
||
At first, we check which CC18 datasets are multiclass, as the benchmark consists of both binary and multiclass classification tasks. | ||
|
||
```{r} | ||
# A task counts as multiclass when its `class` factor has more than two levels.
# vapply() + which() replaces the index loop: it does not grow a vector
# element by element and it copes with an empty CC18 list, where
# 1:length(CC18) would iterate over c(1, 0) and fail.
is_multiclass <- vapply(
  CC18,
  function(task) length(levels(task$class)) > 2,
  logical(1)
)
# unname() keeps the printed result a plain vector of positions.
multiclass_indexes <- which(unname(is_multiclass))
multiclass_indexes
multiclass_CC18 <- CC18[multiclass_indexes]
``` | ||
|
||
# Selecting a subset of tasks | ||
|
||
Next, we analyse the sizes of the multiclass datasets and make sure that we choose representatives with different characteristics and reasonable sizes (not too big or too small). | ||
|
||
```{r} | ||
# Print position, name and dimensions (as returned by dim()) of every
# multiclass task. seq_along() makes the loop a no-op for an empty list
# instead of indexing elements 1 and 0.
for (i in seq_along(multiclass_CC18)) {
  cat(
    "Dataset Index:", i,
    "\n Name:", names(multiclass_CC18)[i],
    "\n Dimensionality:", dim(multiclass_CC18[[i]]),
    "\n"
  )
}
``` | ||
|
||
Eventually we end up with the following selection of datasets: | ||
|
||
```{r} | ||
# Two-stage manual pick: `small_idx` shortlists ten tasks from the listing
# above, then five of those are kept. Net effect: positions 2, 5, 10, 15 and 21
# of the full multiclass list.
# NOTE: multiclass_CC18 is overwritten here, so re-running this chunk on its
# own would select from the already reduced list.
small_idx <- c(2, 3, 4, 5, 7, 10, 15, 16, 17, 21)
multiclass_CC18_small <- multiclass_CC18[small_idx]
multiclass_CC18 <- multiclass_CC18_small[c(1, 4, 6, 7, 10)]

# seq_along() instead of 1:length() so an empty selection prints nothing
# rather than failing on a non-existent element.
for (i in seq_along(multiclass_CC18)) {
  cat(
    "Dataset Index:", i,
    "\n Name:", names(multiclass_CC18)[i],
    "\n Dimensionality:", dim(multiclass_CC18[[i]]),
    "\n"
  )
}
``` | ||
|
||
# Adding wine_quality task | ||
|
||
However, one of the regression datasets, `wine_quality`, was included in the regression benchmark by mistake: it is actually a multiclass task, so we also add it here. | ||
|
||
```{r} | ||
# wine_quality is the second entry of the regression benchmark list. Appending
# it by name puts it right after the five tasks selected above (position 6)
# without hard-coding that position.
multiclass_CC18[["wine_quality"]] <- regression[[2]]

# seq_along() instead of 1:length() keeps the listing safe for an empty list.
for (i in seq_along(multiclass_CC18)) {
  cat(
    "Dataset Index:", i,
    "\n Name:", names(multiclass_CC18)[i],
    "\n Dimensionality:", dim(multiclass_CC18[[i]]),
    "\n"
  )
}
``` | ||
|
||
# Data check | ||
|
||
Additionally, we run forester's data check for all tasks. | ||
|
||
```{r warning=FALSE} | ||
# Run forester's check_data() on every selected task.
# The target column is "class" for every task except wine_quality, whose
# target is "quality". The task is recognised by its name rather than by the
# magic position 6, so the loop survives reordering of the list.
for (i in seq_along(multiclass_CC18)) {
  task_name <- names(multiclass_CC18)[i]
  cat(
    "Dataset Index:", i,
    "\n Name:", task_name,
    "\n Dimensionality:", dim(multiclass_CC18[[i]]),
    "\n\n"
  )
  # identical() stays a clean TRUE/FALSE even for a missing (NA/NULL) name.
  target <- if (identical(task_name, "wine_quality")) "quality" else "class"
  check_data(multiclass_CC18[[i]], target)
}
``` | ||
|
||
|
||
# Saving the outcomes | ||
|
||
```{r} | ||
# Persist the final selection (five CC18 tasks plus wine_quality) as a single
# RDS object; the file keeps the .RData extension used by the other inputs.
saveRDS(object = multiclass_CC18, file = "multiclass_CC18.RData")
``` | ||
|
138 changes: 138 additions & 0 deletions
138
docs/articles/AutoML24Workshop & MScThesis/02_MSc_altered_datasets_creation.Rmd
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,138 @@ | ||
--- | ||
title: "MSc_altered_datasets_creation" | ||
author: "Hubert Ruczyński" | ||
date: "`r Sys.Date()`" | ||
output: | ||
html_document: | ||
toc: yes | ||
toc_float: yes | ||
toc_collapsed: yes | ||
theme: lumen | ||
toc_depth: 3 | ||
number_sections: yes | ||
code_folding: hide | ||
latex_engine: xelatex | ||
--- | ||
|
||
```{css, echo=FALSE} | ||
body .main-container { | ||
max-width: 1820px !important; | ||
width: 1820px !important; | ||
} | ||
body { | ||
max-width: 1820px !important; | ||
width: 1820px !important; | ||
font-family: Helvetica !important; | ||
font-size: 16pt !important; | ||
} | ||
h1,h2,h3,h4,h5,h6{ | ||
font-size: 24pt !important; | ||
} | ||
``` | ||
|
||
# Imports | ||
|
||
We import the forester package to use the check_data function. | ||
|
||
```{r message=FALSE, warning=FALSE} | ||
library(forester) | ||
``` | ||
|
||
# Loading data | ||
|
||
We load the data from the RData files. | ||
|
||
```{r} | ||
# All three inputs are single-object RDS files (read with readRDS()) despite
# their .RData extension.
MSc_binary_CC18 <- readRDS(file = "binary_CC18.RData")
MSc_multiclass_CC18 <- readRDS(file = "multiclass_CC18.RData")
# Only regression tasks 1, 3, 4, 6 and 7 are kept. Entry 2 is the wine_quality
# task, which the dataset-selection notebook treats as a multiclass problem.
MSc_regression_bench <- readRDS(file = "regression_bench.RData")[c(1, 3, 4, 6, 7)]
``` | ||
|
||
# Altering function | ||
|
||
As the original datasets come from benchmarks, their quality is much higher than that of regular ML tasks. Thus, we alter the datasets in order to create lower-quality, more real-life examples. | ||
|
||
To achieve that we introduce the following changes: | ||
|
||
- Adding ID column, | ||
|
||
- Adding static columns, | ||
|
||
- Duplicating existing columns, | ||
|
||
- Introducing the missing values for 3 columns where 5%, 10%, and 15% of observations are missing. | ||
|
||
```{r} | ||
# Degrade a clean benchmark data frame so that it resembles a messier task.
#
# The function appends five columns (ID, static_obvious, static_less_obvious,
# duplicate_1, duplicate_2) and injects missing values into three of the
# original columns (5%, 10% and 15% of the rows). The last column of `df` is
# never duplicated and never receives NAs (presumably it is the target).
#
# All random draws use fixed seeds, so the output is reproducible; as a side
# effect every call resets the global RNG state.
#
# Args:
#   df: data.frame with at least 5 columns besides the last one.
# Returns:
#   `df` with the five extra columns and the injected missing values.
alter_df <- function(df) {
  n_rows <- nrow(df)
  n_candidates <- ncol(df) - 1

  # Two columns are duplicated and three *different* columns receive NAs, so
  # five candidates are required. With fewer, sample() either fails with an
  # obscure message or, when handed a single number, silently samples from
  # 1:that_number and hits unintended columns.
  if (n_candidates < 5) {
    stop(
      "alter_df() needs at least 5 columns besides the last one, got ",
      n_candidates, ".",
      call. = FALSE
    )
  }
  org_df_id <- seq_len(n_candidates)

  # Adding ID column
  df$ID <- seq_len(n_rows)

  # Adding static columns: one constant, one that is 'a' for ~99.5% of rows
  df$static_obvious <- rep(1, n_rows)
  n_majority <- as.integer(n_rows * 0.995)
  df$static_less_obvious <- c(rep("a", n_majority), rep("b", n_rows - n_majority))

  # Duplicating existing columns
  set.seed(123)
  dup_cols <- sample(org_df_id, 2)
  df$duplicate_1 <- df[[dup_cols[1]]]
  df$duplicate_2 <- df[[dup_cols[2]]]

  # Introducing the missing values into three columns that were not duplicated
  row_idx <- seq_len(n_rows)
  set.seed(234)
  na_cols <- sample(setdiff(org_df_id, dup_cols), 3)
  na_seeds <- c(345, 456, 567)
  na_fractions <- c(0.05, 0.1, 0.15)
  for (k in seq_along(na_cols)) {
    # One seed per column keeps each draw independent of the others.
    set.seed(na_seeds[k])
    miss_idx <- sample(row_idx, n_rows * na_fractions[k])
    df[miss_idx, na_cols[k]] <- NA
  }

  df
}
``` | ||
|
||
# Alteration | ||
|
||
We alter the selected subset of datasets from all tasks. | ||
|
||
```{r} | ||
# Create a degraded "-mod" copy of two tasks per problem type and append it to
# the corresponding list. `[[ ]]` is used instead of `$` because `$` partially
# matches list names and could silently pick up a different dataset if the
# requested one were missing.
MSc_binary_CC18[["credit-g-mod"]] <- alter_df(MSc_binary_CC18[["credit-g"]])
MSc_binary_CC18[["phoneme-mod"]] <- alter_df(MSc_binary_CC18[["phoneme"]])

MSc_regression_bench[["elevators-mod"]] <- alter_df(MSc_regression_bench[["elevators"]])
MSc_regression_bench[["kin8nm-mod"]] <- alter_df(MSc_regression_bench[["kin8nm"]])

MSc_multiclass_CC18[["satimage-mod"]] <- alter_df(MSc_multiclass_CC18[["satimage"]])
MSc_multiclass_CC18[["car-mod"]] <- alter_df(MSc_multiclass_CC18[["car"]])

# Reduce the binary list to the ten tasks used further on. Positions 36 and 37
# are expected to be the two "-mod" entries appended above, i.e. the loaded
# list is assumed to hold 35 tasks - TODO confirm.
MSc_binary_CC18 <- MSc_binary_CC18[c(1, 2, 3, 4, 5, 19, 25, 26, 36, 37)]
``` | ||
|
||
# Data check | ||
|
||
Additionally, we run forester's data check for all modified tasks. | ||
|
||
```{r} | ||
# Run forester's check_data() on each *modified* dataset. Every call now
# addresses the "-mod" entry named in its label; previously all but the first
# call checked the unmodified original. alter_df() keeps the original column
# names, so the target names are unchanged.
cat("Credit-g-mod\n")
s <- check_data(MSc_binary_CC18[["credit-g-mod"]], "class")
cat("Phoneme-mod\n")
s <- check_data(MSc_binary_CC18[["phoneme-mod"]], "Class")
cat("Elevators-mod\n")
s <- check_data(MSc_regression_bench[["elevators-mod"]], "Goal")
cat("Kin8nm-mod\n")
s <- check_data(MSc_regression_bench[["kin8nm-mod"]], "y")
cat("Satimage-mod\n")
s <- check_data(MSc_multiclass_CC18[["satimage-mod"]], "class")
cat("Car-mod\n")
s <- check_data(MSc_multiclass_CC18[["car-mod"]], "class")
``` | ||
|
||
# Saving altered datasets | ||
|
||
```{r} | ||
# MSc_binary_CC18 was already reduced to its ten selected tasks right after the
# alteration step. Applying the same positional subset again here would index
# past the end of the shortened list and replace the last five entries
# (including both "-mod" datasets) with NULL, so the list is saved as is.
saveRDS(MSc_binary_CC18, "MSc_binary_CC18.RData")
saveRDS(MSc_regression_bench, "MSc_regression_bench.RData")
saveRDS(MSc_multiclass_CC18, "MSc_multiclass_CC18.RData")
``` |
Oops, something went wrong.