Add MSc files

ModelOriented · Jun 5, 2024 · d2eced9 · d2eced9
1 parent 80595d8
commit d2eced9
Show file tree

Hide file tree

Showing 38 changed files with 4,928 additions and 0 deletions.
diff --git a/docs/articles/AutoML24Workshop & MScThesis/01_MSc_multiclass_datasets_selection.Rmd b/docs/articles/AutoML24Workshop & MScThesis/01_MSc_multiclass_datasets_selection.Rmd
@@ -0,0 +1,119 @@
+---
+title: "MSc multiclass datasets selection.Rmd"
+author: "Hubert Ruczyński"
+date: "`r Sys.Date()`"
+output:
+  html_document:
+    toc: yes
+    toc_float: yes
+    toc_collapsed: yes
+    theme: lumen
+    toc_depth: 3
+    number_sections: yes
+    code_folding: hide
+    latex_engine: xelatex
+---
+
+```{css, echo=FALSE}
+/* Force the main content container of the rendered page to a fixed 1820px width. */
+body .main-container {
+  max-width: 1820px !important;
+  width: 1820px !important;
+}
+/* Same fixed width for the body itself, plus the base font (Helvetica, 16pt). */
+body {
+  max-width: 1820px !important;
+  width: 1820px !important;
+  font-family: Helvetica !important;
+  font-size: 16pt !important;
+}
+/* All heading levels share one size (24pt). */
+h1,h2,h3,h4,h5,h6{
+  font-size: 24pt !important;
+}
+```
+
+# Imports
+
+We import the forester package to use the check_data function.
+
+```{r message=FALSE, warning=FALSE}
+library(forester)
+```
+
+# Loading data
+
+We load the data from the RData files.
+
+```{r}
+# Despite the .RData extension, both files hold a single serialized object
+# each, hence readRDS() rather than load().
+regression <- readRDS(file = "regression_bench.RData")
+CC18 <- readRDS(file = "CC18.RData")
+```
+
+# Multiclass tasks selection
+
+First, we check which CC18 datasets are multiclass, as the benchmark consists of both binary and multiclass classification tasks.
+
+```{r}
+# Flag every dataset whose `class` factor has more than two levels.
+is_multiclass <- vapply(
+  CC18,
+  function(dataset) length(levels(dataset$class)) > 2,
+  logical(1)
+)
+# which() yields integer(0) for an empty or all-FALSE input, so an empty
+# benchmark list no longer iterates over the c(1, 0) produced by 1:length(),
+# and the index vector is not grown element by element.
+multiclass_indexes <- which(is_multiclass, useNames = FALSE)
+multiclass_indexes
+multiclass_CC18 <- CC18[multiclass_indexes]
+```
+
+# Selecting a subset of tasks
+
+Next, we analyse the sizes of the multiclass datasets and make sure that we choose representatives with different characteristics and reasonable sizes (neither too big nor too small).
+
+```{r}
+# Print the position, name and dimensions of every multiclass dataset.
+# seq_along() keeps the loop from running over c(1, 0) when the list is empty.
+for (i in seq_along(multiclass_CC18)) {
+  cat('Dataset Index:', i, '\n  Name:', names(multiclass_CC18)[i], '\n  Dimensionality:', dim(multiclass_CC18[[i]]), '\n')
+}
+```
+
+Eventually we end up with the following selection of datasets:
+
+```{r}
+# Two-stage manual selection: first a pool of ten candidate datasets, then
+# five of those candidates (positions refer to the candidate pool).
+small_idx <- c(2, 3, 4, 5, 7, 10, 15, 16, 17, 21)
+multiclass_CC18_small <- multiclass_CC18[small_idx]
+multiclass_CC18 <- multiclass_CC18_small[c(1, 4, 6, 7, 10)]
+# seq_along() keeps the loop from running over c(1, 0) when the list is empty.
+for (i in seq_along(multiclass_CC18)) {
+  cat('Dataset Index:', i, '\n  Name:', names(multiclass_CC18)[i], '\n  Dimensionality:', dim(multiclass_CC18[[i]]), '\n')
+}
+```
+
+# Adding wine_quality task
+
+However, one of the datasets in the regression benchmark, `wine_quality`, is in fact a multiclass task that was included there by mistake, so we also add it here.
+
+```{r}
+# Append the dataset under its name instead of a hard-coded position; with the
+# five datasets selected so far it still lands at index 6.
+# NOTE(review): regression[[2]] is assumed to be wine_quality - confirm against
+# names(regression).
+multiclass_CC18[["wine_quality"]] <- regression[[2]]
+# seq_along() keeps the loop from running over c(1, 0) when the list is empty.
+for (i in seq_along(multiclass_CC18)) {
+  cat('Dataset Index:', i, '\n  Name:', names(multiclass_CC18)[i], '\n  Dimensionality:', dim(multiclass_CC18[[i]]), '\n')
+}
+```
+
+# Data check
+
+Additionally, we run forester's data check for all tasks.
+
+```{r warning=FALSE}
+# Run forester's check_data() on every selected dataset.
+# seq_along() keeps the loop from running over c(1, 0) when the list is empty.
+for (i in seq_along(multiclass_CC18)) {
+  cat('Dataset Index:', i, '\n  Name:', names(multiclass_CC18)[i], '\n  Dimensionality:', dim(multiclass_CC18[[i]]), '\n\n')
+  # wine_quality is the only dataset whose target column is `quality`; pick it
+  # by name so the check does not depend on the dataset sitting at position 6.
+  if (identical(names(multiclass_CC18)[i], "wine_quality")) {
+    target <- "quality"
+  } else {
+    target <- "class"
+  }
+  check_data(multiclass_CC18[[i]], target)
+}
+```
+
+
+# Saving the outcomes
+
+```{r}
+# Written with saveRDS(), so the file must be read back with readRDS() even
+# though it carries an .RData extension.
+saveRDS(object = multiclass_CC18, file = "multiclass_CC18.RData")
+```
+
diff --git a/docs/articles/AutoML24Workshop & MScThesis/02_MSc_altered_datasets_creation.Rmd b/docs/articles/AutoML24Workshop & MScThesis/02_MSc_altered_datasets_creation.Rmd
@@ -0,0 +1,138 @@
+---
+title: "MSc_altered_datasets_creation"
+author: "Hubert Ruczyński"
+date: "`r Sys.Date()`"
+output:
+  html_document:
+    toc: yes
+    toc_float: yes
+    toc_collapsed: yes
+    theme: lumen
+    toc_depth: 3
+    number_sections: yes
+    code_folding: hide
+    latex_engine: xelatex
+---
+
+```{css, echo=FALSE}
+/* Force the main content container of the rendered page to a fixed 1820px width. */
+body .main-container {
+  max-width: 1820px !important;
+  width: 1820px !important;
+}
+/* Same fixed width for the body itself, plus the base font (Helvetica, 16pt). */
+body {
+  max-width: 1820px !important;
+  width: 1820px !important;
+  font-family: Helvetica !important;
+  font-size: 16pt !important;
+}
+/* All heading levels share one size (24pt). */
+h1,h2,h3,h4,h5,h6{
+  font-size: 24pt !important;
+}
+```
+
+# Imports
+
+We import the forester package to use the check_data function.
+
+```{r message=FALSE, warning=FALSE}
+library(forester)
+```
+
+# Loading data
+
+We load the data from the RData files.
+
+```{r}
+# Despite the .RData extension, every file holds a single serialized object.
+MSc_binary_CC18 <- readRDS(file = "binary_CC18.RData")
+MSc_multiclass_CC18 <- readRDS(file = "multiclass_CC18.RData")
+
+# Only the regression tasks at positions 1, 3, 4, 6 and 7 are kept.
+MSc_regression_bench <- readRDS(file = "regression_bench.RData")[c(1, 3, 4, 6, 7)]
+```
+
+# Altering function
+
+As the original datasets come from benchmarks, their quality is much higher than that of regular ML tasks. Thus, we alter the datasets in order to create lower-quality, more realistic examples.
+
+To achieve that we introduce the following changes:
+
+-   Adding ID column,
+
+-   Adding static columns,
+
+-   Duplicating existing columns,
+
+-   Introducing the missing values for 3 columns where 5%, 10%, and 15% of observations are missing.
+
+```{r}
+#' Degrade a benchmark dataset so that it resembles lower-quality data.
+#'
+#' Appends an `ID` column, two (near-)static columns and two duplicated feature
+#' columns, then blanks out 5%, 10% and 15% of the rows in three further
+#' feature columns. Every column except the last one is treated as a feature,
+#' so the last column is never duplicated or blanked.
+#'
+#' The function calls `set.seed()` with fixed seeds, which makes the result
+#' reproducible but also overwrites the caller's global RNG state.
+#'
+#' @param df A data frame with at least five feature columns followed by one
+#'   further column (presumably the target - confirm for each dataset).
+#' @return `df` with five extra columns (`ID`, `static_obvious`,
+#'   `static_less_obvious`, `duplicate_1`, `duplicate_2`) and injected `NA`s.
+alter_df <- function(df) {
+  # Two features are duplicated and three different ones receive NAs. With
+  # fewer candidates sample() either errors or, when handed a single number,
+  # silently draws from 1:k and may hit the already duplicated columns.
+  stopifnot(is.data.frame(df), ncol(df) >= 6)
+
+  n_rows <- nrow(df)
+  feature_ids <- seq_len(ncol(df) - 1)
+
+  # Adding ID column
+  df$ID <- seq_len(n_rows)
+
+  # Adding static columns: a constant one, and one where only the trailing
+  # ~0.5% of rows differ from the rest.
+  n_major <- as.integer(n_rows * 0.995)
+  df$static_obvious <- rep(1, n_rows)
+  df$static_less_obvious <- c(rep('a', n_major), rep('b', n_rows - n_major))
+
+  # Duplicating existing columns
+  set.seed(123)
+  dup_ids <- sample(feature_ids, 2)
+  df$duplicate_1 <- df[, dup_ids[1]]
+  df$duplicate_2 <- df[, dup_ids[2]]
+
+  # Introducing the missing values into three features that were not
+  # duplicated; each column gets its own seed and its own share of rows.
+  row_idx <- seq_len(n_rows)
+  set.seed(234)
+  na_ids <- sample(feature_ids[!feature_ids %in% dup_ids], 3)
+  na_seeds <- c(345, 456, 567)
+  na_fractions <- c(0.05, 0.1, 0.15)
+  for (k in seq_along(na_ids)) {
+    set.seed(na_seeds[k])
+    # A fractional size is truncated by sample().
+    miss_idx <- sample(row_idx, n_rows * na_fractions[k])
+    df[miss_idx, na_ids[k]] <- NA
+  }
+
+  df
+}
+```
+
+# Alteration
+
+We alter the selected subset of datasets from all tasks.
+
+```{r}
+# Store an altered copy of two datasets per task type under a "-mod" name,
+# leaving the originals in place.
+MSc_binary_CC18$`credit-g-mod`       <- alter_df(MSc_binary_CC18$`credit-g`)
+MSc_binary_CC18$`phoneme-mod`        <- alter_df(MSc_binary_CC18$phoneme)
+MSc_regression_bench$`elevators-mod` <- alter_df(MSc_regression_bench$elevators)
+MSc_regression_bench$`kin8nm-mod`    <- alter_df(MSc_regression_bench$kin8nm)
+MSc_multiclass_CC18$`satimage-mod`   <- alter_df(MSc_multiclass_CC18$satimage)
+MSc_multiclass_CC18$`car-mod`        <- alter_df(MSc_multiclass_CC18$car)
+# Positional selection of ten binary datasets; it depends on the order in which
+# the "-mod" entries were added above.
+# NOTE(review): presumably positions 36 and 37 are the two "-mod" entries -
+# confirm against the length of the list stored in binary_CC18.RData.
+MSc_binary_CC18      <- MSc_binary_CC18[c(1, 2, 3, 4, 5, 19, 25, 26, 36, 37)]
+```
+
+# Data check
+
+Additionally, we run forester's data check for all modified tasks.
+
+```{r}
+# Run forester's check_data() on the altered ("-mod") copies. Previously only
+# credit-g-mod was checked in its altered form; the other five calls inspected
+# the untouched originals even though their labels said "-mod".
+cat('Credit-g-mod\n')
+s <- check_data(MSc_binary_CC18$`credit-g-mod`, 'class')
+cat('Phoneme-mod\n')
+s <- check_data(MSc_binary_CC18$`phoneme-mod`, 'Class')
+cat('Elevators-mod\n')
+s <- check_data(MSc_regression_bench$`elevators-mod`, 'Goal')
+cat('Kin8nm-mod\n')
+s <- check_data(MSc_regression_bench$`kin8nm-mod`, 'y')
+cat('Satimage-mod\n')
+s <- check_data(MSc_multiclass_CC18$`satimage-mod`, 'class')
+cat('Car-mod\n')
+s <- check_data(MSc_multiclass_CC18$`car-mod`, 'class')
+```
+
+# Saving altered datasets
+
+```{r}
+# The binary list was already reduced to its ten selected datasets right after
+# the altered copies were created. Applying the same positional subset a
+# second time would index past the end of that ten-element list and store NULL
+# entries in place of the last five datasets (including both "-mod" sets), so
+# the list is saved as it is.
+saveRDS(MSc_binary_CC18,      "MSc_binary_CC18.RData")
+saveRDS(MSc_regression_bench, "MSc_regression_bench.RData")
+saveRDS(MSc_multiclass_CC18,  "MSc_multiclass_CC18.RData")
+```