From 59dfbd1f8e6dad8853ee135c94a61491902d0d3c Mon Sep 17 00:00:00 2001 From: egillax Date: Mon, 27 Nov 2023 14:54:04 +0100 Subject: [PATCH] fixed linting comments for Hades style guide and black for python --- .Rbuildignore | 1 + .gitignore | 1 + NAMESPACE | 2 +- R/Dataset.R | 23 +- R/DeepPatientLevelPrediction.R | 7 +- R/Estimator.R | 416 ++++++++++++--------- R/HelperFunctions.R | 36 +- R/LRFinder.R | 17 +- R/MLP.R | 52 +-- R/ResNet.R | 105 +++--- R/TrainingCache-class.R | 31 +- R/Transformer.R | 105 +++--- inst/python/Dataset.py | 107 ++++-- inst/python/Estimator.py | 230 +++++++----- inst/python/LrFinder.py | 36 +- inst/python/MLP.py | 59 +-- inst/python/ResNet.py | 83 ++-- inst/python/Transformer.py | 144 +++---- man/DeepPatientLevelPrediction.Rd | 3 +- man/setDefaultResNet.Rd | 2 +- man/setEstimator.Rd | 15 +- man/setMultiLayerPerceptron.Rd | 11 +- man/setResNet.Rd | 21 +- man/setTransformer.Rd | 15 +- man/{TrainingCache.Rd => trainingCache.Rd} | 40 +- tests/testthat/test-TrainingCache.R | 2 +- 26 files changed, 871 insertions(+), 693 deletions(-) rename man/{TrainingCache.Rd => trainingCache.Rd} (79%) diff --git a/.Rbuildignore b/.Rbuildignore index ac33fdc..307ed11 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -8,3 +8,4 @@ ^extras$ ^deploy.sh$ ^compare_versions$ +^.mypy_cache$ \ No newline at end of file diff --git a/.gitignore b/.gitignore index e54def2..feec6ee 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,4 @@ renv.lock extras/ .Renviron inst/python/__pycache__ +.mypy_cache diff --git a/NAMESPACE b/NAMESPACE index 4806f0d..5f8c13e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,5 @@ # Generated by roxygen2: do not edit by hand -export(TrainingCache) export(fitEstimator) export(gridCvDeep) export(predictDeepEstimator) @@ -10,6 +9,7 @@ export(setEstimator) export(setMultiLayerPerceptron) export(setResNet) export(setTransformer) +export(trainingCache) importFrom(dplyr,"%>%") importFrom(reticulate,py_to_r) importFrom(reticulate,r_to_py) diff --git a/R/Dataset.R b/R/Dataset.R index f4b5170..4ba4e81 100644 --- a/R/Dataset.R +++ b/R/Dataset.R @@ -15,21 +15,22 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
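Note on the NAMESPACE hunk above: the exported R6 generator `TrainingCache` is renamed to `trainingCache`, so callers construct the training cache through the new name. A minimal sketch of the call-site change, with `analysisPath` standing in as a placeholder path:

# before this patch
trainCache <- TrainingCache$new(analysisPath)
# after this patch (same R6 class, lowerCamelCase generator name)
trainCache <- trainingCache$new(analysisPath)
trainCache$isFull()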
-createDataset <- function(data, labels, plpModel=NULL) { +createDataset <- function(data, labels, plpModel = NULL) { path <- system.file("python", package = "DeepPatientLevelPrediction") - Dataset <- reticulate::import_from_path("Dataset", path = path)$Data + dataset <- reticulate::import_from_path("Dataset", path = path)$Data if (is.null(attributes(data)$path)) { # sqlite object attributes(data)$path <- attributes(data)$dbname } if (is.null(plpModel)) { - data <- Dataset(r_to_py(normalizePath(attributes(data)$path)), - r_to_py(labels$outcomeCount)) + data <- dataset(r_to_py(normalizePath(attributes(data)$path)), + r_to_py(labels$outcomeCount)) + } else { + numericalFeatures <- + r_to_py(as.array(which(plpModel$covariateImportance$isNumeric))) + data <- dataset(r_to_py(normalizePath(attributes(data)$path)), + numerical_features = numericalFeatures) } - else { - data <- Dataset(r_to_py(normalizePath(attributes(data)$path)), - numerical_features = r_to_py(as.array(which(plpModel$covariateImportance$isNumeric))) ) - } - - return(data) -} \ No newline at end of file + + return(data) +} diff --git a/R/DeepPatientLevelPrediction.R b/R/DeepPatientLevelPrediction.R index e77812e..137b53a 100644 --- a/R/DeepPatientLevelPrediction.R +++ b/R/DeepPatientLevelPrediction.R @@ -18,7 +18,8 @@ #' DeepPatientLevelPrediction #' -#' @description A package containing deep learning extensions for developing prediction models using data in the OMOP CDM +#' @description A package containing deep learning extensions for developing +#' prediction models using data in the OMOP CDM #' #' @docType package #' @name DeepPatientLevelPrediction @@ -28,9 +29,7 @@ NULL .onLoad <- function(libname, pkgname) { - # use superassignment to update global reference + # use superassignment to update global reference reticulate::configure_environment(pkgname) torch <<- reticulate::import("torch", delay_load = TRUE) } - - diff --git a/R/Estimator.R b/R/Estimator.R index 7705279..656b647 100644 --- a/R/Estimator.R +++ b/R/Estimator.R @@ -26,36 +26,42 @@ #' @param weightDecay what weight_decay to use #' @param batchSize batchSize to use #' @param epochs how many epochs to train for -#' @param device what device to train on, can be a string or a function to that evaluates +#' @param device what device to train on, can be a string or a function to +#' that evaluates #' to the device during runtime #' @param optimizer which optimizer to use #' @param scheduler which learning rate scheduler to use #' @param criterion loss function to use -#' @param earlyStopping If earlyStopping should be used which stops the training of your metric is not improving -#' @param metric either `auc` or `loss` or a custom metric to use. This is the metric used for scheduler and earlyStopping. -#' Needs to be a list with function `fun`, mode either `min` or `max` and a `name`, -#' `fun` needs to be a function that takes in prediction and labels and outputs a score. +#' @param earlyStopping If earlyStopping should be used which stops the +#' training of your metric is not improving +#' @param metric either `auc` or `loss` or a custom metric to use. This is the +#' metric used for scheduler and earlyStopping. +#' Needs to be a list with function `fun`, mode either `min` or `max` and a +#' `name`, +#' `fun` needs to be a function that takes in prediction and labels and +#' outputs a score. 
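To make the custom `metric` shape described above concrete, a small illustrative sketch (the accuracy function and the 0.5 threshold are hypothetical, not part of the package):

# a custom metric is a list with `fun`, `mode` and `name`;
# `fun` takes predictions and labels and returns a single score
accuracyMetric <- list(
  fun = function(prediction, labels) mean((prediction > 0.5) == labels),
  mode = "max",
  name = "accuracy"
)
estimatorSettings <- setEstimator(learningRate = "auto",
                                  metric = accuracyMetric)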
#' @param seed seed to initialize weights of model with #' @export -setEstimator <- function(learningRate='auto', - weightDecay = 0.0, - batchSize = 512, - epochs = 30, - device='cpu', - optimizer = torch$optim$AdamW, - scheduler = list(fun=torch$optim$lr_scheduler$ReduceLROnPlateau, - params=list(patience=1)), - criterion = torch$nn$BCEWithLogitsLoss, - earlyStopping = list(useEarlyStopping=TRUE, - params = list(patience=4)), - metric = "auc", - seed = NULL +setEstimator <- function(learningRate = "auto", + weightDecay = 0.0, + batchSize = 512, + epochs = 30, + device = "cpu", + optimizer = torch$optim$AdamW, + scheduler = list(fun = torch$optim$lr_scheduler$ReduceLROnPlateau, + params = list(patience = 1)), + criterion = torch$nn$BCEWithLogitsLoss, + earlyStopping = list(useEarlyStopping = TRUE, + params = list(patience = 4)), + metric = "auc", + seed = NULL ) { - + checkIsClass(learningRate, c("numeric", "character")) if (inherits(learningRate, "character")) { - if (learningRate != "auto"){ - stop(paste0('Learning rate should be either a numeric or "auto", you provided: ', learningRate)) + if (learningRate != "auto") { + stop(paste0('Learning rate should be either a numeric or "auto", + you provided: ', learningRate)) } } checkIsClass(weightDecay, "numeric") @@ -67,49 +73,57 @@ setEstimator <- function(learningRate='auto', checkIsClass(earlyStopping, c("list", "NULL")) checkIsClass(metric, c("character", "list")) checkIsClass(seed, c("numeric", "integer", "NULL")) - - - if (length(learningRate)==1 && learningRate=='auto') {findLR <- TRUE} else {findLR <- FALSE} + + + if (length(learningRate) == 1 && learningRate == "auto") { + findLR <- TRUE + } else { + findLR <- FALSE + } if (is.null(seed)) { seed <- as.integer(sample(1e5, 1)) } - estimatorSettings <- list(learningRate=learningRate, - weightDecay=weightDecay, - batchSize=batchSize, - epochs=epochs, - device=device, - earlyStopping=earlyStopping, - findLR=findLR, - metric=metric, - seed=seed[1]) - - optimizer <- rlang::enquo(optimizer) + estimatorSettings <- list(learningRate = learningRate, + weightDecay = weightDecay, + batchSize = batchSize, + epochs = epochs, + device = device, + earlyStopping = earlyStopping, + findLR = findLR, + metric = metric, + seed = seed[1]) + + optimizer <- rlang::enquo(optimizer) estimatorSettings$optimizer <- function() rlang::eval_tidy(optimizer) - class(estimatorSettings$optimizer) <- c("delayed", class(estimatorSettings$optimizer)) - + class(estimatorSettings$optimizer) <- c("delayed", + class(estimatorSettings$optimizer)) + criterion <- rlang::enquo(criterion) estimatorSettings$criterion <- function() rlang::eval_tidy(criterion) - class(estimatorSettings$criterion) <- c("delayed", class(estimatorSettings$criterion)) - + class(estimatorSettings$criterion) <- c("delayed", + class(estimatorSettings$criterion)) + scheduler <- rlang::enquo(scheduler) estimatorSettings$scheduler <- function() rlang::eval_tidy(scheduler) - class(estimatorSettings$scheduler) <-c("delayed", class(estimatorSettings$scheduler)) + class(estimatorSettings$scheduler) <- c("delayed", + class(estimatorSettings$scheduler)) if (is.function(device)) { - class(estimatorSettings$device) <- c("delayed", class(estimatorSettings$device)) + class(estimatorSettings$device) <- c("delayed", + class(estimatorSettings$device)) } - + paramsToTune <- list() for (name in names(estimatorSettings)) { param <- estimatorSettings[[name]] if (length(param) > 1 && is.atomic(param)) { - paramsToTune[[paste0('estimator.',name)]] <- param + 
paramsToTune[[paste0("estimator.", name)]] <- param } if ("params" %in% names(param)) { for (name2 in names(param[["params"]])) { param2 <- param[["params"]][[name2]] if (length(param2) > 1) { - paramsToTune[[paste0('estimator.',name,'.',name2)]] <- param2 + paramsToTune[[paste0("estimator.", name, ".", name2)]] <- param2 } } } @@ -118,7 +132,7 @@ setEstimator <- function(learningRate='auto', return(estimatorSettings) } - + #' fitEstimator #' #' @description @@ -137,12 +151,12 @@ fitEstimator <- function(trainData, analysisPath, ...) { start <- Sys.time() - + # check covariate data if (!FeatureExtraction::isCovariateData(trainData$covariateData)) { stop("Needs correct covariateData") } - + if (!is.null(trainData$folds)) { trainData$labels <- merge(trainData$labels, trainData$fold, by = "rowId") } @@ -150,9 +164,9 @@ fitEstimator <- function(trainData, covariateData = trainData$covariateData, cohort = trainData$labels ) - + covariateRef <- mappedCovariateData$covariateRef - + outLoc <- PatientLevelPrediction::createTempModelLoc() cvResult <- do.call( what = gridCvDeep, @@ -164,10 +178,12 @@ fitEstimator <- function(trainData, analysisPath = analysisPath ) ) - - hyperSummary <- do.call(rbind, lapply(cvResult$paramGridSearch, function(x) x$hyperSummary)) + + hyperSummary <- do.call(rbind, lapply(cvResult$paramGridSearch, + function(x) x$hyperSummary)) prediction <- cvResult$prediction - incs <- rep(1, covariateRef %>% dplyr::tally() %>% + incs <- rep(1, covariateRef %>% + dplyr::tally() %>% dplyr::collect() %>% as.integer()) covariateRef <- covariateRef %>% @@ -178,25 +194,30 @@ fitEstimator <- function(trainData, covariateValue = 0, isNumeric = .data$columnId %in% cvResult$numericalIndex ) - + comp <- start - Sys.time() result <- list( - model = cvResult$estimator, # file.path(outLoc), - + model = cvResult$estimator, + preprocessing = list( - featureEngineering = attr(trainData$covariateData, "metaData")$featureEngineering, - tidyCovariates = attr(trainData$covariateData, "metaData")$tidyCovariateDataSettings, - requireDenseMatrix = F + featureEngineering = attr(trainData$covariateData, + "metaData")$featureEngineering, + tidyCovariates = attr(trainData$covariateData, + "metaData")$tidyCovariateDataSettings, + requireDenseMatrix = FALSE ), prediction = prediction, modelDesign = PatientLevelPrediction::createModelDesign( targetId = attr(trainData, "metaData")$targetId, outcomeId = attr(trainData, "metaData")$outcomeId, - restrictPlpDataSettings = attr(trainData, "metaData")$restrictPlpDataSettings, + restrictPlpDataSettings = attr(trainData, + "metaData")$restrictPlpDataSettings, covariateSettings = attr(trainData, "metaData")$covariateSettings, populationSettings = attr(trainData, "metaData")$populationSettings, - featureEngineeringSettings = attr(trainData$covariateData, "metaData")$featureEngineeringSettings, - preprocessSettings = attr(trainData$covariateData, "metaData")$preprocessSettings, + featureEngineeringSettings = attr(trainData$covariateData, + "metaData")$featureEngineeringSettings, + preprocessSettings = attr(trainData$covariateData, + "metaData")$preprocessSettings, modelSettings = modelSettings, splitSettings = attr(trainData, "metaData")$splitSettings, sampleSettings = attr(trainData, "metaData")$sampleSettings @@ -214,12 +235,12 @@ fitEstimator <- function(trainData, ), covariateImportance = covariateRef ) - + class(result) <- "plpModel" attr(result, "predictionFunction") <- "predictDeepEstimator" attr(result, "modelType") <- "binary" attr(result, "saveType") <- 
modelSettings$saveType - + return(result) } @@ -242,32 +263,31 @@ predictDeepEstimator <- function(plpModel, } if ("plpData" %in% class(data)) { mappedData <- PatientLevelPrediction::MapIds(data$covariateData, - cohort = cohort, - mapping = plpModel$covariateImportance %>% - dplyr::select( - "columnId", - "covariateId" - ) + cohort = cohort, + mapping = plpModel$covariateImportance %>% + dplyr::select("columnId", "covariateId") ) - data <- createDataset(mappedData, plpModel=plpModel) + data <- createDataset(mappedData, plpModel = plpModel) } - + # get predictions prediction <- cohort if (is.character(plpModel$model)) { modelSettings <- plpModel$modelDesign$modelSettings - model <- torch$load(file.path(plpModel$model, "DeepEstimatorModel.pt"), map_location = "cpu") - estimator <- createEstimator(modelType=modelSettings$modelType, - modelParameters=model$model_parameters, - estimatorSettings=model$estimator_settings) + model <- torch$load(file.path(plpModel$model, + "DeepEstimatorModel.pt"), + map_location = "cpu") + estimator <- createEstimator(modelType = modelSettings$modelType, + modelParameters = model$model_parameters, + estimatorSettings = model$estimator_settings) estimator$model$load_state_dict(model$model_state_dict) prediction$value <- estimator$predict_proba(data) } else { prediction$value <- plpModel$model$predict_proba(data) } - + attr(prediction, "metaData")$modelType <- attr(plpModel, "modelType") - + return(prediction) } @@ -289,15 +309,16 @@ gridCvDeep <- function(mappedData, modelSettings, modelLocation, analysisPath) { - ParallelLogger::logInfo(paste0("Running hyperparameter search for ", modelSettings$modelType, " model")) - + ParallelLogger::logInfo(paste0("Running hyperparameter search for ", + modelSettings$modelType, " model")) + ########################################################################### - + paramSearch <- modelSettings$param - + # TODO below chunk should be in a setupCache function - trainCache <- TrainingCache$new(analysisPath) - if (trainCache$isParamGridIdentical(paramSearch)) { + trainCache <- trainingCache$new(analysisPath) + if (trainCache$isParamGridIdentical(paramSearch)) { gridSearchPredictons <- trainCache$getGridSearchPredictions() } else { gridSearchPredictons <- list() @@ -305,123 +326,146 @@ gridCvDeep <- function(mappedData, trainCache$saveGridSearchPredictions(gridSearchPredictons) trainCache$saveModelParams(paramSearch) } - - dataset <- createDataset(data=mappedData, labels=labels) - - fitParams <- names(paramSearch[[1]])[grepl("^estimator", names(paramSearch[[1]]))] + + dataset <- createDataset(data = mappedData, labels = labels) + + fitParams <- names(paramSearch[[1]])[grepl("^estimator", + names(paramSearch[[1]]))] findLR <- modelSettings$estimatorSettings$findLR if (!trainCache$isFull()) { for (gridId in trainCache$getLastGridSearchIndex():length(paramSearch)) { - ParallelLogger::logInfo(paste0("Running hyperparameter combination no ", gridId)) - ParallelLogger::logInfo(paste0("HyperParameters: ")) - ParallelLogger::logInfo(paste(names(paramSearch[[gridId]]), paramSearch[[gridId]], collapse = " | ")) - currentModelParams <- paramSearch[[gridId]][modelSettings$modelParamNames] - - currentEstimatorSettings <- fillEstimatorSettings(modelSettings$estimatorSettings, fitParams, - paramSearch[[gridId]]) - - # initiate prediction - prediction <- NULL - - fold <- labels$index - ParallelLogger::logInfo(paste0("Max fold: ", max(fold))) - currentModelParams$catFeatures <- dataset$get_cat_features()$shape[[1]] - 
currentModelParams$numFeatures <- dataset$get_numerical_features()$shape[[1]] - if (findLR) { - LrFinder <- createLRFinder(modelType = modelSettings$modelType, - modelParameters = currentModelParams, - estimatorSettings = currentEstimatorSettings - ) - lr <- LrFinder$get_lr(dataset) - ParallelLogger::logInfo(paste0("Auto learning rate selected as: ", lr)) - currentEstimatorSettings$learningRate <- lr - } - - learnRates <- list() - for (i in 1:max(fold)) { - ParallelLogger::logInfo(paste0("Fold ", i)) - trainDataset <- torch$utils$data$Subset(dataset, indices = as.integer(which(fold != i) - 1)) # -1 for python 0-based indexing - testDataset <- torch$utils$data$Subset(dataset, indices = as.integer(which(fold == i) -1)) # -1 for python 0-based indexing - - estimator <- createEstimator(modelType=modelSettings$modelType, - modelParameters=currentModelParams, - estimatorSettings=currentEstimatorSettings) - estimator$fit(trainDataset, testDataset) - - ParallelLogger::logInfo("Calculating predictions on left out fold set...") - - prediction <- rbind( - prediction, - predictDeepEstimator( - plpModel = estimator, - data = testDataset, - cohort = labels[fold == i, ] + ParallelLogger::logInfo(paste0("Running hyperparameter combination no ", + gridId)) + ParallelLogger::logInfo(paste0("HyperParameters: ")) + ParallelLogger::logInfo(paste(names(paramSearch[[gridId]]), + paramSearch[[gridId]], collapse = " | ")) + currentModelParams <- paramSearch[[gridId]][modelSettings$modelParamNames] + + currentEstimatorSettings <- + fillEstimatorSettings(modelSettings$estimatorSettings, fitParams, + paramSearch[[gridId]]) + + # initiate prediction + prediction <- NULL + + fold <- labels$index + ParallelLogger::logInfo(paste0("Max fold: ", max(fold))) + currentModelParams$catFeatures <- dataset$get_cat_features()$shape[[1]] + currentModelParams$numFeatures <- + dataset$get_numerical_features()$shape[[1]] + if (findLR) { + lrFinder <- createLRFinder(modelType = modelSettings$modelType, + modelParameters = currentModelParams, + estimatorSettings = currentEstimatorSettings ) + lr <- lrFinder$get_lr(dataset) + ParallelLogger::logInfo(paste0("Auto learning rate selected as: ", lr)) + currentEstimatorSettings$learningRate <- lr + } + + learnRates <- list() + for (i in 1:max(fold)) { + ParallelLogger::logInfo(paste0("Fold ", i)) + trainDataset <- + torch$utils$data$Subset(dataset, + indices = as.integer(which(fold != i) - 1)) + # -1 for python 0-based indexing + testDataset <- + torch$utils$data$Subset(dataset, + indices = as.integer(which(fold == i) - 1)) + # -1 for python 0-based indexing + + estimator <- createEstimator(modelType = modelSettings$modelType, + modelParameters = currentModelParams, + estimatorSettings = + currentEstimatorSettings) + estimator$fit(trainDataset, testDataset) + + ParallelLogger::logInfo("Calculating predictions on left out + fold set...") + + prediction <- rbind( + prediction, + predictDeepEstimator( + plpModel = estimator, + data = testDataset, + cohort = labels[fold == i, ] + ) + ) + learnRates[[i]] <- list( + LRs = estimator$learn_rate_schedule, + bestEpoch = estimator$best_epoch + ) + } + maxIndex <- which.max(unlist(sapply(learnRates, `[`, 2))) + gridSearchPredictons[[gridId]] <- list( + prediction = prediction, + param = paramSearch[[gridId]], + gridPerformance = + PatientLevelPrediction::computeGridPerformance(prediction, + paramSearch[[gridId]]) ) - learnRates[[i]] <- list( - LRs = estimator$learn_rate_schedule, - bestEpoch = estimator$best_epoch - ) - } - maxIndex <- 
which.max(unlist(sapply(learnRates, `[`, 2))) - gridSearchPredictons[[gridId]] <- list( - prediction = prediction, - param = paramSearch[[gridId]], - gridPerformance = PatientLevelPrediction::computeGridPerformance(prediction, paramSearch[[gridId]]) - ) - gridSearchPredictons[[gridId]]$gridPerformance$hyperSummary$learnRates <- rep(list(unlist(learnRates[[maxIndex]]$LRs)), - nrow(gridSearchPredictons[[gridId]]$gridPerformance$hyperSummary)) - gridSearchPredictons[[gridId]]$param$learnSchedule <- learnRates[[maxIndex]] - - - # remove all predictions that are not the max performance - indexOfMax <- which.max(unlist(lapply(gridSearchPredictons, function(x) x$gridPerformance$cvPerformance))) - for (i in seq_along(gridSearchPredictons)) { - if (!is.null(gridSearchPredictons[[i]])) { - if (i != indexOfMax) { - gridSearchPredictons[[i]]$prediction <- list(NULL) + gridSearchPredictons[[gridId]]$gridPerformance$hyperSummary$learnRates <- + rep(list(unlist(learnRates[[maxIndex]]$LRs)), + nrow(gridSearchPredictons[[gridId]]$gridPerformance$hyperSummary)) + gridSearchPredictons[[gridId]]$param$learnSchedule <- + learnRates[[maxIndex]] + + # remove all predictions that are not the max performance + indexOfMax <- + which.max(unlist(lapply(gridSearchPredictons, + function(x) x$gridPerformance$cvPerformance))) + for (i in seq_along(gridSearchPredictons)) { + if (!is.null(gridSearchPredictons[[i]])) { + if (i != indexOfMax) { + gridSearchPredictons[[i]]$prediction <- list(NULL) + } } } + ParallelLogger::logInfo(paste0("Caching all grid search results and + prediction for best combination ", + indexOfMax)) + trainCache$saveGridSearchPredictions(gridSearchPredictons) } - ParallelLogger::logInfo(paste0("Caching all grid search results and prediction for best combination ", indexOfMax)) - trainCache$saveGridSearchPredictions(gridSearchPredictons) - } } paramGridSearch <- lapply(gridSearchPredictons, function(x) x$gridPerformance) # get best params - indexOfMax <- which.max(unlist(lapply(gridSearchPredictons, function(x) x$gridPerformance$cvPerformance))) + indexOfMax <- + which.max(unlist(lapply(gridSearchPredictons, + function(x) x$gridPerformance$cvPerformance))) finalParam <- gridSearchPredictons[[indexOfMax]]$param paramGridSearch <- lapply(gridSearchPredictons, function(x) x$gridPerformance) - + # get best CV prediction cvPrediction <- gridSearchPredictons[[indexOfMax]]$prediction cvPrediction$evaluationType <- "CV" - + ParallelLogger::logInfo("Training final model using optimal parameters") # get the params modelParams <- finalParam[modelSettings$modelParamNames] - - + + # create the dir if (!dir.exists(file.path(modelLocation))) { - dir.create(file.path(modelLocation), recursive = T) + dir.create(file.path(modelLocation), recursive = TRUE) } - + modelParams$catFeatures <- dataset$get_cat_features()$shape[[1]] modelParams$numFeatures <- dataset$get_numerical_features()$shape[[1]] - - - estimatorSettings <- fillEstimatorSettings(modelSettings$estimatorSettings, fitParams, + + + estimatorSettings <- fillEstimatorSettings(modelSettings$estimatorSettings, + fitParams, finalParam) estimatorSettings$learningRate <- finalParam$learnSchedule$LRs[[1]] estimator <- createEstimator(modelType = modelSettings$modelType, modelParameters = modelParams, estimatorSettings = estimatorSettings) - + numericalIndex <- dataset$get_numerical_features() estimator$fit_whole_training_set(dataset, finalParam$learnSchedule$LRs) - + ParallelLogger::logInfo("Calculating predictions on all train data...") prediction <- 
predictDeepEstimator( plpModel = estimator, @@ -429,7 +473,7 @@ gridCvDeep <- function(mappedData, cohort = labels ) prediction$evaluationType <- "Train" - + prediction <- rbind( prediction, cvPrediction @@ -438,9 +482,10 @@ gridCvDeep <- function(mappedData, prediction <- prediction %>% dplyr::select(-"index") - prediction$cohortStartDate <- as.Date(prediction$cohortStartDate, origin = "1970-01-01") - - + prediction$cohortStartDate <- as.Date(prediction$cohortStartDate, + origin = "1970-01-01") + + # save torch code here estimator$save(modelLocation, "DeepEstimatorModel.pt") return( @@ -454,23 +499,25 @@ gridCvDeep <- function(mappedData, ) } -# utility function to add instances of parameters to estimatorSettings during grid search +# utility function to add instances of parameters to estimatorSettings +# during grid search fillEstimatorSettings <- function(estimatorSettings, fitParams, paramSearch) { for (fp in fitParams) { components <- strsplit(fp, "[.]")[[1]] - - if (length(components)==2) { + + if (length(components) == 2) { estimatorSettings[[components[[2]]]] <- paramSearch[[fp]] } else { - estimatorSettings[[components[[2]]]]$params[[components[[3]]]] <- paramSearch[[fp]] + estimatorSettings[[components[[2]]]]$params[[components[[3]]]] <- + paramSearch[[fp]] } } return(estimatorSettings) } -# utility function to evaluate any expressions or call functions passed as settings +# utility function to evaluate any expressions or call functions passed as +# settings evalEstimatorSettings <- function(estimatorSettings) { - for (set in names(estimatorSettings)) { if (inherits(estimatorSettings[[set]], "delayed")) { estimatorSettings[[set]] <- estimatorSettings[[set]]() @@ -484,16 +531,15 @@ createEstimator <- function(modelType, estimatorSettings) { path <- system.file("python", package = "DeepPatientLevelPrediction") - Model <- reticulate::import_from_path(modelType, path=path)[[modelType]] - - Estimator <- reticulate::import_from_path("Estimator", path=path)$Estimator - + model <- reticulate::import_from_path(modelType, path = path)[[modelType]] + estimator <- reticulate::import_from_path("Estimator", path = path)$Estimator + modelParameters <- camelCaseToSnakeCaseNames(modelParameters) estimatorSettings <- camelCaseToSnakeCaseNames(estimatorSettings) estimatorSettings <- evalEstimatorSettings(estimatorSettings) - - estimator <- Estimator(model = Model, + + estimator <- estimator(model = model, model_parameters = modelParameters, estimator_settings = estimatorSettings) return(estimator) -} \ No newline at end of file +} diff --git a/R/HelperFunctions.R b/R/HelperFunctions.R index 350baf9..a1aa43c 100644 --- a/R/HelperFunctions.R +++ b/R/HelperFunctions.R @@ -41,40 +41,40 @@ camelCaseToSnakeCaseNames <- function(object) { } #' helper function to check class of input -#' +#' #' @param parameter the input parameter to check #' @param classes which classes it should belong to (one or more) -checkIsClass<- function(parameter,classes) { - name = deparse(substitute(parameter)) +checkIsClass <- function(parameter, classes) { + name <- deparse(substitute(parameter)) if (!inherits(x = parameter, what = classes)) { - ParallelLogger::logError(paste0(name, ' should be of class:', classes, ' ')) - stop(paste0(name, ' is wrong class')) + ParallelLogger::logError(paste0(name, " should be of class:", classes, " ")) + stop(paste0(name, " is wrong class")) } return(TRUE) } #' helper function to check that input is higher than a certain value -#' +#' #' @param parameter the input parameter to check, can be 
a vector
#' @param value which value it should be higher than
-checkHigher <- function(parameter,value) {
-  name = deparse(substitute(parameter))
-  if (!is.numeric(parameter) | all(parameter<=value)) {
-    ParallelLogger::logError(paste0(name, ' needs to be > ',value))
-    stop(paste0(name, ' needs to be > ', value))
+checkHigher <- function(parameter, value) {
+  name <- deparse(substitute(parameter))
+  if (!is.numeric(parameter) || all(parameter <= value)) {
+    ParallelLogger::logError(paste0(name, " needs to be > ", value))
+    stop(paste0(name, " needs to be > ", value))
   }
   return(TRUE)
 }
 
 #' helper function to check that input is higher or equal than a certain value
-#' 
+#'
 #' @param parameter the input parameter to check, can be a vector
 #' @param value which value it should be higher or equal than
-checkHigherEqual <- function(parameter,value) {
-  name = deparse(substitute(parameter))
-  if (!is.numeric(parameter) | all(parameter<value)) {
-    ParallelLogger::logError(paste0(name, ' needs to be >= ',value))
-    stop(paste0(name, ' needs to be >= ', value))
+checkHigherEqual <- function(parameter, value) {
+  name <- deparse(substitute(parameter))
+  if (!is.numeric(parameter) || all(parameter < value)) {
+    ParallelLogger::logError(paste0(name, " needs to be >= ", value))
+    stop(paste0(name, " needs to be >= ", value))
   }
   return(TRUE)
-}
\ No newline at end of file
+}
diff --git a/R/LRFinder.R b/R/LRFinder.R
index dfe8ee3..63cb62c 100644
--- a/R/LRFinder.R
+++ b/R/LRFinder.R
@@ -18,22 +18,21 @@
 createLRFinder <- function(modelType,
                            modelParameters,
                            estimatorSettings,
-                           lrSettings=NULL) {
+                           lrSettings = NULL) {
   path <- system.file("python", package = "DeepPatientLevelPrediction")
-  LRFinderClass <- reticulate::import_from_path("LrFinder", path = path)$LrFinder
-
-  model <- reticulate::import_from_path(modelType, path=path)[[modelType]]
-
+  lrFinderClass <-
+    reticulate::import_from_path("LrFinder", path = path)$LrFinder
+
+  model <- reticulate::import_from_path(modelType, path = path)[[modelType]]
   modelParameters <- camelCaseToSnakeCaseNames(modelParameters)
   estimatorSettings <- camelCaseToSnakeCaseNames(estimatorSettings)
   if (!is.null(lrSettings)) {
     lrSettings <- camelCaseToSnakeCaseNames(lrSettings)
   }
-
+
   estimatorSettings <- evalEstimatorSettings(estimatorSettings)
-
-
-  lrFinder <- LRFinderClass(model=model,
+
+  lrFinder <- lrFinderClass(model = model,
                             model_parameters = modelParameters,
                             estimator_settings = estimatorSettings,
                             lr_settings = lrSettings)
diff --git a/R/MLP.R b/R/MLP.R
index 6ad35e6..771e244 100644
--- a/R/MLP.R
+++ b/R/MLP.R
@@ -26,12 +26,15 @@
 #'
 #'
 #' @param numLayers Number of layers in network, default: 1:8
-#' @param sizeHidden Amount of neurons in each default layer, default: 2^(6:9) (64 to 512)
-#' @param dropout How much dropout to apply after first linear, default: seq(0, 0.3, 0.05)
-#' @param sizeEmbedding Size of embedding layer, default: 2^(6:9) (64 to 512)
+#' @param sizeHidden Amount of neurons in each default layer,
+#' default: 2^(6:9) (64 to 512)
+#' @param dropout How much dropout to apply after first linear,
+#' default: seq(0, 0.3, 0.05)
+#' @param sizeEmbedding Size of embedding layer, default: 2^(6:9) (64 to 512)
 #' @param estimatorSettings settings of Estimator created with `setEstimator`
-#' @param hyperParamSearch Which kind of hyperparameter search to use random sampling or exhaustive grid search. default: 'random'
-#' @param randomSample How many random samples from hyperparameter space to use
+#' @param hyperParamSearch Which kind of hyperparameter search to use random
+#' sampling or exhaustive grid search.
default: 'random' +#' @param randomSample How many random samples from hyperparameter space to use #' @param randomSampleSeed Random seed to sample hyperparameter combinations #' #' @export @@ -39,35 +42,34 @@ setMultiLayerPerceptron <- function(numLayers = c(1:8), sizeHidden = c(2^(6:9)), dropout = c(seq(0, 0.3, 0.05)), sizeEmbedding = c(2^(6:9)), - estimatorSettings = setEstimator( - learningRate = 'auto', - weightDecay = c(1e-6, 1e-3), - batchSize = 1024, - epochs = 30, - device="cpu"), + estimatorSettings = + setEstimator(learningRate = "auto", + weightDecay = c(1e-6, 1e-3), + batchSize = 1024, + epochs = 30, + device = "cpu"), hyperParamSearch = "random", randomSample = 100, randomSampleSeed = NULL) { - checkIsClass(numLayers, c("integer", "numeric")) checkHigherEqual(numLayers, 1) checkIsClass(sizeHidden, c("integer", "numeric")) checkHigherEqual(sizeHidden, 1) - + checkIsClass(dropout, c("numeric")) checkHigherEqual(dropout, 0) - + checkIsClass(sizeEmbedding, c("numeric", "integer")) checkHigherEqual(sizeEmbedding, 1) - + checkIsClass(hyperParamSearch, "character") - + checkIsClass(randomSample, c("numeric", "integer")) checkHigherEqual(randomSample, 1) - + checkIsClass(randomSampleSeed, c("numeric", "integer", "NULL")) - + paramGrid <- list( numLayers = numLayers, sizeHidden = sizeHidden, @@ -78,16 +80,20 @@ setMultiLayerPerceptron <- function(numLayers = c(1:8), paramGrid <- c(paramGrid, estimatorSettings$paramsToTune) param <- PatientLevelPrediction::listCartesian(paramGrid) - if (hyperParamSearch == "random" && randomSample>length(param)) { - stop(paste("\n Chosen amount of randomSamples is higher than the amount of possible hyperparameter combinations.", - "\n randomSample:", randomSample,"\n Possible hyperparameter combinations:", length(param), + if (hyperParamSearch == "random" && randomSample > length(param)) { + stop(paste("\n Chosen amount of randomSamples is higher than the + amount of possible hyperparameter combinations.", + "\n randomSample:", randomSample, "\n Possible hyperparameter + combinations:", length(param), "\n Please lower the amount of randomSamples")) } if (hyperParamSearch == "random") { - suppressWarnings(withr::with_seed(randomSampleSeed, {param <- param[sample(length(param), randomSample)]})) + suppressWarnings(withr::with_seed(randomSampleSeed, + {param <- param[sample(length(param), + randomSample)]})) } - attr(param, 'settings')$modelType <- "MLP" + attr(param, "settings")$modelType <- "MLP" results <- list( fitFunction = "fitEstimator", diff --git a/R/ResNet.R b/R/ResNet.R index d20aa1d..32e7f3a 100644 --- a/R/ResNet.R +++ b/R/ResNet.R @@ -22,18 +22,19 @@ #' Creates settings for a default ResNet model #' #' @details -#' Model architecture from by https://arxiv.org/abs/2106.11959 . +#' Model architecture from by https://arxiv.org/abs/2106.11959 . #' Hyperparameters chosen by a experience on a few prediction problems. 
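The hyperparameter search in setMultiLayerPerceptron above (and in the other set* functions) follows the same pattern: build the full Cartesian grid, then optionally keep a random subset of `randomSample` combinations. A rough sketch of that idea with made-up values, not the package's exact code path:

# full grid of 2 x 2 = 4 combinations
paramGrid <- list(numLayers = 1:2, sizeHidden = c(64, 128))
param <- PatientLevelPrediction::listCartesian(paramGrid)
randomSample <- 2
if (randomSample > length(param)) {
  stop("randomSample is larger than the number of combinations")
}
# draw the subset reproducibly, as the package does with withr::with_seed()
withr::with_seed(42, {
  param <- param[sample(length(param), randomSample)]
})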
#' #' @param estimatorSettings created with ```setEstimator``` #' @export -setDefaultResNet <- function(estimatorSettings=setEstimator(learningRate='auto', - weightDecay=1e-6, - device='cpu', - batchSize=1024, - epochs=50, - seed=NULL)) { +setDefaultResNet <- function(estimatorSettings = + setEstimator(learningRate = "auto", + weightDecay = 1e-6, + device = "cpu", + batchSize = 1024, + epochs = 50, + seed = NULL)) { resnetSettings <- setResNet(numLayers = 6, sizeHidden = 512, hiddenFactor = 2, @@ -41,9 +42,9 @@ setDefaultResNet <- function(estimatorSettings=setEstimator(learningRate='auto', hiddenDropout = 0.4, sizeEmbedding = 256, estimatorSettings = estimatorSettings, - hyperParamSearch = 'random', + hyperParamSearch = "random", randomSample = 1) - attr(resnetSettings, 'settings')$name <- 'defaultResnet' + attr(resnetSettings, "settings")$name <- "defaultResnet" return(resnetSettings) } @@ -58,14 +59,21 @@ setDefaultResNet <- function(estimatorSettings=setEstimator(learningRate='auto', #' #' #' @param numLayers Number of layers in network, default: 1:16 -#' @param sizeHidden Amount of neurons in each default layer, default: 2^(6:10) (64 to 1024) -#' @param hiddenFactor How much to grow the amount of neurons in each ResLayer, default: 1:4 -#' @param residualDropout How much dropout to apply after last linear layer in ResLayer, default: seq(0, 0.3, 0.05) -#' @param hiddenDropout How much dropout to apply after first linear layer in ResLayer, default: seq(0, 0.3, 0.05) -#' @param sizeEmbedding Size of embedding layer, default: 2^(6:9) (64 to 512) +#' @param sizeHidden Amount of neurons in each default layer, default: +#' 2^(6:10) (64 to 1024) +#' @param hiddenFactor How much to grow the amount of neurons in each +#' ResLayer, default: 1:4 +#' @param residualDropout How much dropout to apply after last linear layer +#' in ResLayer, default: seq(0, 0.3, 0.05) +#' @param hiddenDropout How much dropout to apply after first linear layer +#' in ResLayer, default: seq(0, 0.3, 0.05) +#' @param sizeEmbedding Size of embedding layer, default: 2^(6:9) +#' '(64 to 512) #' @param estimatorSettings created with ```setEstimator``` -#' @param hyperParamSearch Which kind of hyperparameter search to use random sampling or exhaustive grid search. default: 'random' -#' @param randomSample How many random samples from hyperparameter space to use +#' @param hyperParamSearch Which kind of hyperparameter search to use random +#' sampling or exhaustive grid search. 
default: 'random' +#' @param randomSample How many random samples from hyperparameter space +#' to use #' @param randomSampleSeed Random seed to sample hyperparameter combinations #' @export setResNet <- function(numLayers = c(1:8), @@ -74,61 +82,62 @@ setResNet <- function(numLayers = c(1:8), residualDropout = c(seq(0, 0.5, 0.05)), hiddenDropout = c(seq(0, 0.5, 0.05)), sizeEmbedding = c(2^(6:9)), - estimatorSettings = setEstimator(learningRate='auto', - weightDecay=c(1e-6, 1e-3), - device='cpu', - batchSize=1024, - epochs=30, - seed=NULL), + estimatorSettings = + setEstimator(learningRate = "auto", + weightDecay = c(1e-6, 1e-3), + device = "cpu", + batchSize = 1024, + epochs = 30, + seed = NULL), hyperParamSearch = "random", randomSample = 100, - randomSampleSeed = NULL) -{ + randomSampleSeed = NULL) { checkIsClass(numLayers, c("integer", "numeric")) checkHigherEqual(numLayers, 1) - + checkIsClass(sizeHidden, c("integer", "numeric")) checkHigherEqual(sizeHidden, 1) - + checkIsClass(residualDropout, "numeric") checkHigherEqual(residualDropout, 0) - + checkIsClass(hiddenDropout, "numeric") checkHigherEqual(hiddenDropout, 0) - + checkIsClass(sizeEmbedding, c("integer", "numeric")) checkHigherEqual(sizeEmbedding, 1) - + checkIsClass(hyperParamSearch, "character") - + checkIsClass(randomSample, c("numeric", "integer")) checkHigherEqual(randomSample, 1) - + checkIsClass(randomSampleSeed, c("numeric", "integer", "NULL")) - - paramGrid <- list( - numLayers = numLayers, - sizeHidden = sizeHidden, - hiddenFactor = hiddenFactor, - residualDropout = residualDropout, - hiddenDropout = hiddenDropout, - sizeEmbedding = sizeEmbedding) - + + paramGrid <- list(numLayers = numLayers, + sizeHidden = sizeHidden, + hiddenFactor = hiddenFactor, + residualDropout = residualDropout, + hiddenDropout = hiddenDropout, + sizeEmbedding = sizeEmbedding) paramGrid <- c(paramGrid, estimatorSettings$paramsToTune) - + param <- PatientLevelPrediction::listCartesian(paramGrid) - if (hyperParamSearch == "random" && randomSample>length(param)) { - stop(paste("\n Chosen amount of randomSamples is higher than the amount of possible hyperparameter combinations.", - "\n randomSample:", randomSample,"\n Possible hyperparameter combinations:", length(param), - "\n Please lower the amount of randomSamples")) + if (hyperParamSearch == "random" && randomSample > length(param)) { + stop(paste("\n Chosen amount of randomSamples is higher than the amount of + possible hyperparameter combinations.", "\n randomSample:", + randomSample, "\n Possible hyperparameter combinations:", + length(param), "\n Please lower the amount of randomSamples")) } - + if (hyperParamSearch == "random") { - suppressWarnings(withr::with_seed(randomSampleSeed, {param <- param[sample(length(param), randomSample)]})) + suppressWarnings(withr::with_seed(randomSampleSeed, + {param <- param[sample(length(param), + randomSample)]})) } - attr(param, 'settings')$modelType <- "ResNet" + attr(param, "settings")$modelType <- "ResNet" results <- list( fitFunction = "fitEstimator", param = param, diff --git a/R/TrainingCache-class.R b/R/TrainingCache-class.R index be626d5..c72f588 100644 --- a/R/TrainingCache-class.R +++ b/R/TrainingCache-class.R @@ -2,9 +2,8 @@ #' @description #' Parameter caching for training persistence and continuity #' @export -TrainingCache <- R6::R6Class( +trainingCache <- R6::R6Class( "TrainingCache", - private = list( .paramPersistence = list( gridSearchPredictions = NULL, @@ -12,30 +11,27 @@ TrainingCache <- R6::R6Class( ), .paramContinuity = list(), 
.saveDir = NULL, - writeToFile = function() { saveRDS(private$.paramPersistence, file.path(private$.saveDir)) }, - readFromFile = function() { - private$.paramPersistence <- readRDS(file.path(private$.saveDir)) + private$.paramPersistence <- readRDS(file.path(private$.saveDir)) } ), - public = list( #' @description #' Creates a new training cache #' @param inDir Path to the analysis directory initialize = function(inDir) { private$.saveDir <- file.path(inDir, "paramPersistence.rds") - + if (file.exists(private$.saveDir)) { private$readFromFile() } else { private$writeToFile() } }, - + #' @description #' Checks whether the parameter grid in the model settings is identical to #' the cached parameters. @@ -44,7 +40,7 @@ TrainingCache <- R6::R6Class( isParamGridIdentical = function(inModelParams) { return(identical(inModelParams, private$.paramPersistence$modelParams)) }, - + #' @description #' Saves the grid search results to the training cache #' @param inGridSearchPredictions Grid search predictions @@ -53,7 +49,7 @@ TrainingCache <- R6::R6Class( inGridSearchPredictions private$writeToFile() }, - + #' @description #' Saves the parameter grid to the training cache #' @param inModelParams Parameter grid from the model settings @@ -61,21 +57,22 @@ TrainingCache <- R6::R6Class( private$.paramPersistence$modelParams <- inModelParams private$writeToFile() }, - + #' @description #' Gets the grid search results from the training cache #' @returns Grid search results from the training cache getGridSearchPredictions = function() { return(private$.paramPersistence$gridSearchPredictions) }, - + #' @description #' Check if cache is full #' @returns Boolen isFull = function() { - return(all(unlist(lapply(private$.paramPersistence$gridSearchPredictions, function(x) !is.null(x$gridPerformance))))) - }, - + return(all(unlist(lapply(private$.paramPersistence$gridSearchPredictions, + function(x) !is.null(x$gridPerformance))))) + }, + #' @description #' Gets the last index from the cached grid search #' @returns Last grid search index @@ -88,11 +85,11 @@ TrainingCache <- R6::R6Class( return(1) } else { return(which(sapply(private$.paramPersistence$gridSearchPredictions, - is.null))[1]) + is.null))[1]) } } }, - + #' @description #' Remove the training cache from the analysis path dropCache = function() { diff --git a/R/Transformer.R b/R/Transformer.R index 8b0d0c4..cd4a968 100644 --- a/R/Transformer.R +++ b/R/Transformer.R @@ -24,13 +24,13 @@ #' @param estimatorSettings created with `setEstimator` #' #' @export -setDefaultTransformer <- function(estimatorSettings=setEstimator( - learningRate = 'auto', - weightDecay = 1e-4, - batchSize=512, - epochs=10, - seed=NULL, - device='cpu') +setDefaultTransformer <- function(estimatorSettings = + setEstimator(learningRate = "auto", + weightDecay = 1e-4, + batchSize = 512, + epochs = 10, + seed = NULL, + device = "cpu") ) { transformerSettings <- setTransformer(numBlocks = 3, dimToken = 192, @@ -40,10 +40,10 @@ setDefaultTransformer <- function(estimatorSettings=setEstimator( ffnDropout = 0.1, resDropout = 0.0, dimHidden = 256, - estimatorSettings=estimatorSettings, - hyperParamSearch = 'random', + estimatorSettings = estimatorSettings, + hyperParamSearch = "random", randomSample = 1) - attr(transformerSettings, 'settings')$name <- 'defaultTransformer' + attr(transformerSettings, "settings")$name <- "defaultTransformer" return(transformerSettings) } @@ -54,75 +54,81 @@ setDefaultTransformer <- function(estimatorSettings=setEstimator( #' #' @param numBlocks number of 
transformer blocks #' @param dimToken dimension of each token (embedding size) -#' @param dimOut dimension of output, usually 1 for binary problems +#' @param dimOut dimension of output, usually 1 for binary +#' problems #' @param numHeads number of attention heads #' @param attDropout dropout to use on attentions #' @param ffnDropout dropout to use in feedforward block #' @param resDropout dropout to use in residual connections #' @param dimHidden dimension of the feedworward block -#' @param dimHiddenRatio dimension of the feedforward block as a ratio of dimToken (embedding size) +#' @param dimHiddenRatio dimension of the feedforward block as a ratio +#' of dimToken (embedding size) #' @param estimatorSettings created with `setEstimator` -#' @param hyperParamSearch what kind of hyperparameter search to do, default 'random' -#' @param randomSample How many samples to use in hyperparameter search if random -#' @param randomSampleSeed Random seed to sample hyperparameter combinations +#' @param hyperParamSearch what kind of hyperparameter search to do, +#' default 'random' +#' @param randomSample How many samples to use in hyperparameter +#' search if random +#' @param randomSampleSeed Random seed to sample hyperparameter +#' combinations #' #' @export -setTransformer <- function(numBlocks = 3, - dimToken = 96, +setTransformer <- function(numBlocks = 3, + dimToken = 96, dimOut = 1, - numHeads = 8, - attDropout = 0.25, + numHeads = 8, + attDropout = 0.25, ffnDropout = 0.25, - resDropout = 0, - dimHidden = 512, + resDropout = 0, + dimHidden = 512, dimHiddenRatio = NULL, - estimatorSettings=setEstimator(weightDecay = 1e-6, - batchSize=1024, - epochs=10, - seed=NULL), + estimatorSettings = setEstimator(weightDecay = 1e-6, + batchSize = 1024, + epochs = 10, + seed = NULL), hyperParamSearch = "random", - randomSample = 1, + randomSample = 1, randomSampleSeed = NULL) { - + checkIsClass(numBlocks, c("integer", "numeric")) checkHigherEqual(numBlocks, 1) - + checkIsClass(dimToken, c("integer", "numeric")) checkHigherEqual(dimToken, 1) - + checkIsClass(dimOut, c("integer", "numeric")) checkHigherEqual(dimOut, 1) - + checkIsClass(numHeads, c("integer", "numeric")) checkHigherEqual(numHeads, 1) checkIsClass(attDropout, c("numeric")) checkHigherEqual(attDropout, 0) - + checkIsClass(ffnDropout, c("numeric")) checkHigherEqual(ffnDropout, 0) - + checkIsClass(resDropout, c("numeric")) checkHigherEqual(resDropout, 0) - + checkIsClass(dimHidden, c("integer", "numeric", "NULL")) if (!is.null(dimHidden)) { checkHigherEqual(dimHidden, 1) } - + checkIsClass(dimHiddenRatio, c("numeric", "NULL")) if (!is.null(dimHiddenRatio)) { checkHigher(dimHiddenRatio, 0) } - + checkIsClass(hyperParamSearch, "character") - + checkIsClass(randomSample, c("numeric", "integer")) checkHigherEqual(randomSample, 1) - + checkIsClass(randomSampleSeed, c("numeric", "integer", "NULL")) - - if (any(with(expand.grid(dimToken = dimToken, numHeads = numHeads), dimToken %% numHeads != 0))) { + + if (any(with(expand.grid(dimToken = dimToken, numHeads = numHeads), + dimToken %% numHeads != 0))) { stop(paste( "dimToken needs to divisible by numHeads. 
dimToken =", dimToken, "is not divisible by numHeads =", numHeads @@ -157,21 +163,24 @@ setTransformer <- function(numBlocks = 3, if (!is.null(dimHiddenRatio)) { param <- lapply(param, function(x) { - x$dimHidden <- round(x$dimToken*x$dimHidden, digits = 0) + x$dimHidden <- round(x$dimToken * x$dimHidden, digits = 0) return(x) }) } - - if (hyperParamSearch == "random" && randomSample>length(param)) { - stop(paste("\n Chosen amount of randomSamples is higher than the amount of possible hyperparameter combinations.", - "\n randomSample:", randomSample,"\n Possible hyperparameter combinations:", length(param), - "\n Please lower the amount of randomSample")) + + if (hyperParamSearch == "random" && randomSample > length(param)) { + stop(paste("\n Chosen amount of randomSamples is higher than the amount of + possible hyperparameter combinations.", "\n randomSample:", + randomSample, "\n Possible hyperparameter combinations:", + length(param), "\n Please lower the amount of randomSample")) } if (hyperParamSearch == "random") { - suppressWarnings(withr::with_seed(randomSampleSeed, {param <- param[sample(length(param), randomSample)]})) + suppressWarnings(withr::with_seed(randomSampleSeed, + {param <- param[sample(length(param), + randomSample)]})) } - attr(param, 'settings')$modelType <- "Transformer" + attr(param, "settings")$modelType <- "Transformer" results <- list( fitFunction = "fitEstimator", param = param, @@ -186,4 +195,4 @@ setTransformer <- function(numBlocks = 3, class(results) <- "modelSettings" return(results) -} \ No newline at end of file +} diff --git a/inst/python/Dataset.py b/inst/python/Dataset.py index 98d9ed3..a81dfc1 100644 --- a/inst/python/Dataset.py +++ b/inst/python/Dataset.py @@ -8,29 +8,32 @@ class Data(Dataset): - def __init__(self, - data, - labels=None, - numerical_features=None): + def __init__(self, data, labels=None, numerical_features=None): """ data: path to a covariates dataframe either arrow dataset or sqlite object labels: a list of either 0 or 1, 1 if the patient got the outcome numerical_features: list of indices where the numerical features are """ start = time.time() - if pathlib.Path(data).suffix == '.sqlite': + if pathlib.Path(data).suffix == ".sqlite": data = urllib.parse.quote(data) - data = pl.read_database("SELECT * from covariates", - connection=f"sqlite://{data}").lazy() + data = pl.read_database( + "SELECT * from covariates", connection=f"sqlite://{data}" + ).lazy() else: - data = pl.scan_ipc(pathlib.Path(data).joinpath('covariates/*.arrow')) - observations = data.select(pl.col('rowId').max()).collect()[0, 0] + data = pl.scan_ipc(pathlib.Path(data).joinpath("covariates/*.arrow")) + observations = data.select(pl.col("rowId").max()).collect()[0, 0] # detect features are numeric if numerical_features is None: - self.numerical_features = data.groupby(by='columnId') \ - .n_unique().filter(pl.col('covariateValue') > 1).select('columnId').collect()['columnId'] + self.numerical_features = ( + data.groupby(by="columnId") + .n_unique() + .filter(pl.col("covariateValue") > 1) + .select("columnId") + .collect()["columnId"] + ) else: - self.numerical_features = pl.Series('num', numerical_features) + self.numerical_features = pl.Series("num", numerical_features) if labels: self.target = torch.as_tensor(labels) @@ -42,42 +45,68 @@ def __init__(self, # create newColumnId from 1 (or zero?) 
until # catColumns # select rowId and newColumnId # rename newColumnId to columnId and sort by it - data_cat = data.filter(~pl.col('columnId') - .is_in(self.numerical_features)) \ - .sort(by='columnId').with_row_count('newColumnId'). \ - with_columns(pl.col('newColumnId').first().over('columnId') - .rank(method="dense")) \ - .select(pl.col('rowId'), pl.col('newColumnId').alias('columnId')).sort('rowId') \ - .with_columns(pl.col('rowId') - 1).collect() + data_cat = ( + data.filter(~pl.col("columnId").is_in(self.numerical_features)) + .sort(by="columnId") + .with_row_count("newColumnId") + .with_columns( + pl.col("newColumnId").first().over("columnId").rank(method="dense") + ) + .select(pl.col("rowId"), pl.col("newColumnId").alias("columnId")) + .sort("rowId") + .with_columns(pl.col("rowId") - 1) + .collect() + ) cat_tensor = torch.as_tensor(data_cat.to_numpy()) - tensor_list = torch.split(cat_tensor[:, 1], torch.unique_consecutive(cat_tensor[:, 0], return_counts=True)[1]. - tolist()) + tensor_list = torch.split( + cat_tensor[:, 1], + torch.unique_consecutive(cat_tensor[:, 0], return_counts=True)[1].tolist(), + ) # because of subjects without cat features, I need to create a list with all zeroes and then insert # my tensorList. That way I can still index the dataset correctly. total_list = [torch.as_tensor((0,))] * observations - idx = data_cat['rowId'].unique().to_list() + idx = data_cat["rowId"].unique().to_list() for i, i2 in enumerate(idx): total_list[i2] = tensor_list[i] self.cat = torch.nn.utils.rnn.pad_sequence(total_list, batch_first=True) - self.cat_features = data_cat['columnId'].unique() + self.cat_features = data_cat["columnId"].unique() # numerical data, # N x C, dense matrix with values for N patients/visits for C numerical features if pl.count(self.numerical_features) == 0: self.num = None else: - numerical_data = data.filter(pl.col('columnId').is_in(self.numerical_features)).sort(by='columnId'). \ - with_row_count('newColumnId').with_columns(pl.col('newColumnId').first().over('columnId'). 
- rank(method="dense") - 1, pl.col('rowId') - 1) \ - .select(pl.col('rowId'), pl.col('newColumnId').alias('columnId'), pl.col('covariateValue')).collect() - indices = torch.as_tensor(numerical_data.select(['rowId', 'columnId']).to_numpy(), dtype=torch.long) - values = torch.as_tensor(numerical_data.select('covariateValue').to_numpy(), dtype=torch.float) - self.num = torch.sparse_coo_tensor(indices=indices.T, - values=values.squeeze(), - size=(observations, pl.count(self.numerical_features))).to_dense() + numerical_data = ( + data.filter(pl.col("columnId").is_in(self.numerical_features)) + .sort(by="columnId") + .with_row_count("newColumnId") + .with_columns( + pl.col("newColumnId").first().over("columnId").rank(method="dense") + - 1, + pl.col("rowId") - 1, + ) + .select( + pl.col("rowId"), + pl.col("newColumnId").alias("columnId"), + pl.col("covariateValue"), + ) + .collect() + ) + indices = torch.as_tensor( + numerical_data.select(["rowId", "columnId"]).to_numpy(), + dtype=torch.long, + ) + values = torch.as_tensor( + numerical_data.select("covariateValue").to_numpy(), dtype=torch.float + ) + self.num = torch.sparse_coo_tensor( + indices=indices.T, + values=values.squeeze(), + size=(observations, pl.count(self.numerical_features)), + ).to_dense() delta = time.time() - start - print(f'Processed data in {delta:.2f} seconds') + print(f"Processed data in {delta:.2f} seconds") def get_numerical_features(self): return self.numerical_features @@ -90,13 +119,15 @@ def __len__(self): def __getitem__(self, item): if self.num is not None: - batch = {"cat": self.cat[item, :], - "num": self.num[item, :]} + batch = {"cat": self.cat[item, :], "num": self.num[item, :]} else: - batch = {"cat": self.cat[item, :].squeeze(), - "num": None} + batch = {"cat": self.cat[item, :].squeeze(), "num": None} if batch["cat"].dim() == 1 and not isinstance(item, list): batch["cat"] = batch["cat"].unsqueeze(0) - if batch["num"] is not None and batch["num"].dim() == 1 and not isinstance(item, list): + if ( + batch["num"] is not None + and batch["num"].dim() == 1 + and not isinstance(item, list) + ): batch["num"] = batch["num"].unsqueeze(0) return [batch, self.target[item].squeeze()] diff --git a/inst/python/Estimator.py b/inst/python/Estimator.py index cd534e2..983bde6 100644 --- a/inst/python/Estimator.py +++ b/inst/python/Estimator.py @@ -12,10 +12,7 @@ class Estimator: A class that wraps around pytorch models. 
""" - def __init__(self, - model, - model_parameters, - estimator_settings): + def __init__(self, model, model_parameters, estimator_settings): self.seed = estimator_settings["seed"] if callable(estimator_settings["device"]): self.device = estimator_settings["device"]() @@ -36,31 +33,51 @@ def __init__(self, self.previous_epochs = int(estimator_settings.get("previous_epochs", 0)) self.model.to(device=self.device) - self.optimizer = estimator_settings["optimizer"](params=self.model.parameters(), - lr=self.learning_rate, - weight_decay=self.weight_decay) + self.optimizer = estimator_settings["optimizer"]( + params=self.model.parameters(), + lr=self.learning_rate, + weight_decay=self.weight_decay, + ) self.criterion = estimator_settings["criterion"]() - if "metric" in estimator_settings.keys() and estimator_settings["metric"] is not None: + if ( + "metric" in estimator_settings.keys() + and estimator_settings["metric"] is not None + ): self.metric = estimator_settings["metric"] - if type(self.metric) == str: + if isinstance(self.metric, str): if self.metric == "auc": - self.metric = {"name": "auc", - "mode": "max"} + self.metric = {"name": "auc", "mode": "max"} elif self.metric == "loss": - self.metric = {"name": "loss", - "mode": "min"} - if "scheduler" in estimator_settings.keys() and estimator_settings["scheduler"] is not None: + self.metric = {"name": "loss", "mode": "min"} + if ( + "scheduler" in estimator_settings.keys() + and estimator_settings["scheduler"] is not None + ): estimator_settings["scheduler"]["params"]["mode"] = self.metric["mode"] - if "early_stopping" in estimator_settings.keys() and estimator_settings["early_stopping"] is not None: - estimator_settings["early_stopping"]["params"]["mode"] = self.metric["mode"] - - if "scheduler" in estimator_settings.keys() and estimator_settings["scheduler"] is not None: - self.scheduler = estimator_settings["scheduler"]["fun"](self.optimizer, - **estimator_settings["scheduler"]["params"]) - - if "early_stopping" in estimator_settings.keys() and estimator_settings["early_stopping"] is not None: - self.early_stopper = EarlyStopping(**estimator_settings["early_stopping"]["params"]) + if ( + "early_stopping" in estimator_settings.keys() + and estimator_settings["early_stopping"] is not None + ): + estimator_settings["early_stopping"]["params"]["mode"] = self.metric[ + "mode" + ] + + if ( + "scheduler" in estimator_settings.keys() + and estimator_settings["scheduler"] is not None + ): + self.scheduler = estimator_settings["scheduler"]["fun"]( + self.optimizer, **estimator_settings["scheduler"]["params"] + ) + + if ( + "early_stopping" in estimator_settings.keys() + and estimator_settings["early_stopping"] is not None + ): + self.early_stopper = EarlyStopping( + **estimator_settings["early_stopping"]["params"] + ) else: self.early_stopper = None @@ -69,21 +86,24 @@ def __init__(self, self.learn_rate_schedule = None def fit(self, dataset, test_dataset): - - train_dataloader = DataLoader(dataset=dataset, - batch_size=None, - sampler=BatchSampler( - sampler=RandomSampler(dataset), - batch_size=self.batch_size, - drop_last=True - )) - test_dataloader = DataLoader(dataset=test_dataset, - batch_size=None, - sampler=BatchSampler( - sampler=SequentialSampler(test_dataset), - batch_size=self.batch_size, - drop_last=False - )) + train_dataloader = DataLoader( + dataset=dataset, + batch_size=None, + sampler=BatchSampler( + sampler=RandomSampler(dataset), + batch_size=self.batch_size, + drop_last=True, + ), + ) + test_dataloader = DataLoader( + 
dataset=test_dataset, + batch_size=None, + sampler=BatchSampler( + sampler=SequentialSampler(test_dataset), + batch_size=self.batch_size, + drop_last=False, + ), + ) trained_epochs = dict() times = list() @@ -105,19 +125,25 @@ def fit(self, dataset, test_dataset): times.append(round(delta_time, 3)) if self.early_stopper: - self.early_stopper(scores['metric']) + self.early_stopper(scores["metric"]) if self.early_stopper.improved: model_state_dict[epoch] = self.model.state_dict() trained_epochs[epoch] = current_epoch if self.early_stopper.early_stop: print("Early stopping, validation metric stopped improving") - print(f'Average time per epoch was: {torch.mean(torch.as_tensor(times)).item():.2f} seconds') - self.finish_fit(all_scores, model_state_dict, trained_epochs, learning_rates) + print( + f"Average time per epoch was: {torch.mean(torch.as_tensor(times)).item():.2f} seconds" + ) + self.finish_fit( + all_scores, model_state_dict, trained_epochs, learning_rates + ) return else: model_state_dict[epoch] = self.model.state_dict() trained_epochs[epoch] = current_epoch - print(f'Average time per epoch was: {torch.mean(torch.as_tensor(times)).item()} seconds') + print( + f"Average time per epoch was: {torch.mean(torch.as_tensor(times)).item()} seconds" + ) self.finish_fit(all_scores, model_state_dict, trained_epochs, learning_rates) return @@ -170,48 +196,68 @@ def score(self, dataloader): def finish_fit(self, scores, model_state_dict, epoch, learning_rates): if self.metric["mode"] == "max": - best_epoch_index = torch.argmax(torch.as_tensor([x["metric"] for x in scores])).item() + best_epoch_index = torch.argmax( + torch.as_tensor([x["metric"] for x in scores]) + ).item() elif self.metric["mode"] == "min": - best_epoch_index = torch.argmin(torch.as_tensor([x["metric"] for x in scores])).item() + best_epoch_index = torch.argmin( + torch.as_tensor([x["metric"] for x in scores]) + ).item() best_model_state_dict = model_state_dict[best_epoch_index] self.model.load_state_dict(best_model_state_dict) self.best_epoch = epoch[best_epoch_index] - self.best_score = {"loss": scores[best_epoch_index]["loss"], - "auc": scores[best_epoch_index]["auc"]} - self.learn_rate_schedule = learning_rates[:(best_epoch_index+1)] + self.best_score = { + "loss": scores[best_epoch_index]["loss"], + "auc": scores[best_epoch_index]["auc"], + } + self.learn_rate_schedule = learning_rates[: (best_epoch_index + 1)] print(f"Loaded best model (based on AUC) from epoch {self.best_epoch}") print(f"ValLoss: {self.best_score['loss']}") print(f"valAUC: {self.best_score['auc']}") - if self.metric and self.metric["name"] != "auc" and self.metric["name"] != "loss": + if ( + self.metric + and self.metric["name"] != "auc" + and self.metric["name"] != "loss" + ): self.best_score[self.metric["name"]] = scores[best_epoch_index]["metric"] print(f"{self.metric['name']}: {self.best_score[self.metric['name']]}") return def print_progress(self, scores, training_loss, delta_time, current_epoch): - if self.metric and self.metric["name"] != "auc" and self.metric["name"] != "loss": - print(f"Epochs: {current_epoch} | Val {self.metric['name']}: {scores['metric']:.3f} " - f"| Val AUC: {scores['auc']:.3f} | Val Loss: {scores['loss']:.3f} " - f"| Train Loss: {training_loss:.3f} | Time: {delta_time:.3f} seconds " - f"| LR: {self.optimizer.param_groups[0]['lr']}") + if ( + self.metric + and self.metric["name"] != "auc" + and self.metric["name"] != "loss" + ): + print( + f"Epochs: {current_epoch} | Val {self.metric['name']}: {scores['metric']:.3f} " + f"| Val 
AUC: {scores['auc']:.3f} | Val Loss: {scores['loss']:.3f} " + f"| Train Loss: {training_loss:.3f} | Time: {delta_time:.3f} seconds " + f"| LR: {self.optimizer.param_groups[0]['lr']}" + ) else: - print(f"Epochs: {current_epoch} " - f"| Val AUC: {scores['auc']:.3f} " - f"| Val Loss: {scores['loss']:.3f} " - f"| Train Loss: {training_loss:.3f} " - f"| Time: {delta_time:.3f} seconds " - f"| LR: {self.optimizer.param_groups[0]['lr']}") + print( + f"Epochs: {current_epoch} " + f"| Val AUC: {scores['auc']:.3f} " + f"| Val Loss: {scores['loss']:.3f} " + f"| Train Loss: {training_loss:.3f} " + f"| Time: {delta_time:.3f} seconds " + f"| LR: {self.optimizer.param_groups[0]['lr']}" + ) return def fit_whole_training_set(self, dataset, learning_rates=None): - dataloader = DataLoader(dataset=dataset, - batch_size=None, - sampler=BatchSampler( - sampler=RandomSampler(dataset), - batch_size=self.batch_size, - drop_last=True - )) + dataloader = DataLoader( + dataset=dataset, + batch_size=None, + sampler=BatchSampler( + sampler=RandomSampler(dataset), + batch_size=self.batch_size, + drop_last=True, + ), + ) if isinstance(learning_rates, list): self.best_epoch = len(learning_rates) elif ~isinstance(learning_rates, list): @@ -221,7 +267,7 @@ def fit_whole_training_set(self, dataset, learning_rates=None): self.best_epoch = self.epochs for epoch in range(self.best_epoch): - self.optimizer.param_groups[0]['lr'] = learning_rates[epoch] + self.optimizer.param_groups[0]["lr"] = learning_rates[epoch] self.fit_epoch(dataloader) return @@ -231,19 +277,21 @@ def save(self, path, name): model_state_dict=self.model.state_dict(), model_parameters=self.model_parameters, estimator_settings=self.estimator_settings, - epoch=self.epochs) - torch.save(out, - f=save_path) + epoch=self.epochs, + ) + torch.save(out, f=save_path) return save_path def predict_proba(self, dataset): - dataloader = DataLoader(dataset=dataset, - batch_size=None, - sampler=BatchSampler( - sampler=SequentialSampler(dataset), - batch_size=self.batch_size, - drop_last=False - )) + dataloader = DataLoader( + dataset=dataset, + batch_size=None, + sampler=BatchSampler( + sampler=SequentialSampler(dataset), + batch_size=self.batch_size, + drop_last=False, + ), + ) with torch.no_grad(): predictions = list() self.model.eval() @@ -261,15 +309,11 @@ def predict(self, dataset, threshold=None): # use outcome rate threshold = dataset.target.sum().item() / len(dataset) predicted_class = predictions > threshold + return predicted_class class EarlyStopping: - - def __init__(self, - patience=3, - delta=0, - verbose=True, - mode='max'): + def __init__(self, patience=3, delta=0, verbose=True, mode="max"): self.patience = patience self.counter = 0 self.verbose = verbose @@ -280,9 +324,8 @@ def __init__(self, self.previous_score = 0 self.mode = mode - def __call__(self, - metric): - if self.mode == 'max': + def __call__(self, metric): + if self.mode == "max": score = metric else: score = -1 * metric @@ -293,8 +336,9 @@ def __call__(self, self.counter += 1 self.improved = False if self.verbose: - print(f"Early stopping counter: {self.counter}" - f" out of {self.patience}") + print( + f"Early stopping counter: {self.counter}" f" out of {self.patience}" + ) if self.counter >= self.patience: self.early_stop = True else: @@ -304,12 +348,12 @@ def __call__(self, self.previous_score = score -def batch_to_device(batch, device='cpu'): +def batch_to_device(batch, device="cpu"): if torch.is_tensor(batch): batch = batch.to(device=device) else: for ix, b in enumerate(batch): - if type(b) 
is str: + if isinstance(b, str): key = b b = batch[b] else: @@ -338,7 +382,7 @@ def compute_auc(y_true, y_pred): float: Computed AUC score. """ # Ensure inputs are sorted by predicted score - y_pred_sorted, sorted_indices = torch.sort(y_pred, descending=True) + _, sorted_indices = torch.sort(y_pred, descending=True) y_true_sorted = y_true[sorted_indices] # Get the number of positive and negative examples @@ -351,7 +395,3 @@ def compute_auc(y_true, y_pred): # Compute AUC auc = num_crossings / (n_pos * n_neg) return auc - - - - diff --git a/inst/python/LrFinder.py b/inst/python/LrFinder.py index 9d5bd0c..e7141cd 100644 --- a/inst/python/LrFinder.py +++ b/inst/python/LrFinder.py @@ -2,17 +2,13 @@ import torch from torch.optim.lr_scheduler import _LRScheduler -from torch.utils.data import DataLoader, BatchSampler, RandomSampler from tqdm import tqdm from Estimator import batch_to_device class ExponentialSchedulerPerBatch(_LRScheduler): - - def __init__(self, optimizer, - end_lr, - num_iter): + def __init__(self, optimizer, end_lr, num_iter): self.end_lr = end_lr self.num_iter = num_iter super(ExponentialSchedulerPerBatch, self).__init__(optimizer, last_epoch=-1) @@ -23,14 +19,9 @@ def get_lr(self): class LrFinder: - - def __init__(self, - model, - model_parameters, - estimator_settings, - lr_settings): + def __init__(self, model, model_parameters, estimator_settings, lr_settings): if lr_settings is None: - lr_settings = {} + lr_settings = {} min_lr = lr_settings.get("min_lr", 1e-7) max_lr = lr_settings.get("max_lr", 1) num_lr = lr_settings.get("num_lr", 100) @@ -50,13 +41,16 @@ def __init__(self, self.smooth = smooth self.divergence_threshold = divergence_threshold - self.optimizer = estimator_settings['optimizer'](params=self.model.parameters(), - lr=self.min_lr) + self.optimizer = estimator_settings["optimizer"]( + params=self.model.parameters(), lr=self.min_lr + ) - self.scheduler = ExponentialSchedulerPerBatch(self.optimizer, self.max_lr, self.num_lr) + self.scheduler = ExponentialSchedulerPerBatch( + self.optimizer, self.max_lr, self.num_lr + ) self.criterion = estimator_settings["criterion"]() - self.batch_size = int(estimator_settings['batch_size']) + self.batch_size = int(estimator_settings["batch_size"]) self.losses = None self.loss_index = None @@ -74,7 +68,9 @@ def get_lr(self, dataset): out = self.model(batch[0]) loss = self.criterion(out, batch[1]) if self.smooth is not None and i != 0: - losses[i] = self.smooth * loss.item() + (1 - self.smooth) * losses[i - 1] + losses[i] = ( + self.smooth * loss.item() + (1 - self.smooth) * losses[i - 1] + ) else: losses[i] = loss.item() lrs[i] = self.optimizer.param_groups[0]["lr"] @@ -90,13 +86,15 @@ def get_lr(self, dataset): best_loss = losses[i] if losses[i] > (self.divergence_threshold * best_loss): - print(f"Loss diverged - stopped early - iteration {i} out of {self.num_lr}") + print( + f"Loss diverged - stopped early - iteration {i} out of {self.num_lr}" + ) break # find LR where gradient is highest but before global minimum is reached # I added -5 to make sure it is not still in the minimum global_minimum = torch.argmin(losses) - gradient = torch.diff(losses[:(global_minimum - 5)+1]) + gradient = torch.diff(losses[: (global_minimum - 5) + 1]) smallest_gradient = torch.argmin(gradient) suggested_lr = lrs[smallest_gradient] diff --git a/inst/python/MLP.py b/inst/python/MLP.py index 3b8b03b..cd049a6 100644 --- a/inst/python/MLP.py +++ b/inst/python/MLP.py @@ -4,17 +4,18 @@ class MLP(nn.Module): - - def __init__(self, - cat_features: int, - 
num_features: int, - size_embedding: int, - size_hidden: int, - num_layers: int, - activation=nn.ReLU, - normalization=nn.BatchNorm1d, - dropout=None, - d_out: int = 1): + def __init__( + self, + cat_features: int, + num_features: int, + size_embedding: int, + size_hidden: int, + num_layers: int, + activation=nn.ReLU, + normalization=nn.BatchNorm1d, + dropout=None, + d_out: int = 1, + ): super(MLP, self).__init__() self.name = "MLP" cat_features = int(cat_features) @@ -23,21 +24,25 @@ def __init__(self, size_hidden = int(size_hidden) num_layers = int(num_layers) d_out = int(d_out) - - self.embedding = nn.EmbeddingBag(cat_features + 1, - size_embedding, - padding_idx=0) + + self.embedding = nn.EmbeddingBag( + cat_features + 1, size_embedding, padding_idx=0 + ) if num_features != 0 and num_features is not None: self.num_embedding = NumericalEmbedding(num_features, size_embedding) self.first_layer = nn.Linear(size_embedding, size_hidden) - self.layers = nn.ModuleList(MLPLayer(size_hidden=size_hidden, - normalization=normalization, - activation=activation, - dropout=dropout) - for _ in range(num_layers)) + self.layers = nn.ModuleList( + MLPLayer( + size_hidden=size_hidden, + normalization=normalization, + activation=activation, + dropout=dropout, + ) + for _ in range(num_layers) + ) self.last_norm = normalization(size_hidden) self.head = nn.Linear(size_hidden, d_out) @@ -62,12 +67,14 @@ def forward(self, input): class MLPLayer(nn.Module): - def __init__(self, - size_hidden=64, - normalization=nn.BatchNorm1d, - activation=nn.ReLU, - dropout=0.0, - bias=True): + def __init__( + self, + size_hidden=64, + normalization=nn.BatchNorm1d, + activation=nn.ReLU, + dropout=0.0, + bias=True, + ): super(MLPLayer, self).__init__() self.norm = normalization(size_hidden) self.activation = activation() diff --git a/inst/python/ResNet.py b/inst/python/ResNet.py index cef4b49..9df35d8 100644 --- a/inst/python/ResNet.py +++ b/inst/python/ResNet.py @@ -5,22 +5,23 @@ class ResNet(nn.Module): - - def __init__(self, - cat_features: int, - num_features: int = 0, - size_embedding: int = 256, - size_hidden: int = 256, - num_layers: int = 2, - hidden_factor: int = 1, - activation=nn.ReLU, - normalization=nn.BatchNorm1d, - hidden_dropout=0, - residual_dropout=0, - dim_out: int = 1, - concat_num=True): + def __init__( + self, + cat_features: int, + num_features: int = 0, + size_embedding: int = 256, + size_hidden: int = 256, + num_layers: int = 2, + hidden_factor: int = 1, + activation=nn.ReLU, + normalization=nn.BatchNorm1d, + hidden_dropout=0, + residual_dropout=0, + dim_out: int = 1, + concat_num=True, + ): super(ResNet, self).__init__() - self.name = 'ResNet' + self.name = "ResNet" cat_features = int(cat_features) num_features = int(num_features) size_embedding = int(size_embedding) @@ -28,12 +29,9 @@ def __init__(self, num_layers = int(num_layers) hidden_factor = int(hidden_factor) dim_out = int(dim_out) - - + self.embedding = nn.EmbeddingBag( - num_embeddings=cat_features + 1, - embedding_dim=size_embedding, - padding_idx=0 + num_embeddings=cat_features + 1, embedding_dim=size_embedding, padding_idx=0 ) if num_features != 0 and not concat_num: self.num_embedding = NumericalEmbedding(num_features, size_embedding) @@ -45,9 +43,17 @@ def __init__(self, res_hidden = size_hidden * hidden_factor - self.layers = nn.ModuleList(ResLayer(size_hidden, res_hidden, normalization, - activation, hidden_dropout, residual_dropout) - for _ in range(num_layers)) + self.layers = nn.ModuleList( + ResLayer( + size_hidden, + res_hidden, + 
normalization, + activation, + hidden_dropout, + residual_dropout, + ) + for _ in range(num_layers) + ) self.last_norm = normalization(size_hidden) @@ -58,7 +64,11 @@ def __init__(self, def forward(self, x): x_cat = x["cat"] x_cat = self.embedding(x_cat) - if "num" in x.keys() and x["num"] is not None and self.num_embedding is not None: + if ( + "num" in x.keys() + and x["num"] is not None + and self.num_embedding is not None + ): x_num = x["num"] # take the average af numerical and categorical embeddings x = (x_cat + self.num_embedding(x_num).mean(dim=1)) / 2 @@ -79,14 +89,15 @@ def forward(self, x): class ResLayer(nn.Module): - - def __init__(self, - size_hidden, - res_hidden, - normalization, - activation, - hidden_dropout=None, - residual_dropout=None): + def __init__( + self, + size_hidden, + res_hidden, + normalization, + activation, + hidden_dropout=None, + residual_dropout=None, + ): super(ResLayer, self).__init__() self.norm = normalization(size_hidden) @@ -114,10 +125,7 @@ def forward(self, input): class NumericalEmbedding(nn.Module): - def __init__(self, - num_embeddings, - embedding_dim, - bias=True): + def __init__(self, num_embeddings, embedding_dim, bias=True): super(NumericalEmbedding, self).__init__() self.weight = nn.Parameter(torch.empty(num_embeddings, embedding_dim)) if bias: @@ -134,6 +142,3 @@ def forward(self, input): if self.bias is not None: x = x + self.bias[None] return x - - - diff --git a/inst/python/Transformer.py b/inst/python/Transformer.py index 1c95b36..ddbe929 100644 --- a/inst/python/Transformer.py +++ b/inst/python/Transformer.py @@ -18,23 +18,24 @@ def forward(self, x): class Transformer(nn.Module): - - def __init__(self, - cat_features: int, - num_features: int, - num_blocks: int, - dim_token: int, - num_heads: int, - att_dropout, - ffn_dropout, - res_dropout, - dim_hidden: int, - dim_out: int = 1, - head_activation=nn.ReLU, - activation=ReGLU, - ffn_norm=nn.LayerNorm, - head_norm=nn.LayerNorm, - att_norm=nn.LayerNorm): + def __init__( + self, + cat_features: int, + num_features: int, + num_blocks: int, + dim_token: int, + num_heads: int, + att_dropout, + ffn_dropout, + res_dropout, + dim_hidden: int, + dim_out: int = 1, + head_activation=nn.ReLU, + activation=ReGLU, + ffn_norm=nn.LayerNorm, + head_norm=nn.LayerNorm, + att_norm=nn.LayerNorm, + ): super(Transformer, self).__init__() self.name = "Transformer" cat_features = int(cat_features) @@ -44,8 +45,10 @@ def __init__(self, num_heads = int(num_heads) dim_hidden = int(dim_hidden) dim_out = int(dim_out) - - self.categorical_embedding = nn.Embedding(cat_features + 1, dim_token, padding_idx=0) + + self.categorical_embedding = nn.Embedding( + cat_features + 1, dim_token, padding_idx=0 + ) if num_features != 0 and num_features is not None: self.numerical_embedding = NumericalEmbedding(num_features, dim_token) @@ -56,27 +59,35 @@ def __init__(self, self.layers = nn.ModuleList([]) for layer_idx in range(num_blocks): - layer = nn.ModuleDict({ - "attention": nn.MultiheadAttention(dim_token, num_heads, - dropout=att_dropout), - "ffn": FeedForwardBlock(dim_token, dim_hidden, - bias_first=True, - bias_second=True, - dropout=ffn_dropout, - activation=activation), - "attention_res_dropout": nn.Dropout(res_dropout), - "ffn_res_dropout": nn.Dropout(res_dropout), - "ffn_norm": ffn_norm(dim_token) - }) + layer = nn.ModuleDict( + { + "attention": nn.MultiheadAttention( + dim_token, num_heads, dropout=att_dropout + ), + "ffn": FeedForwardBlock( + dim_token, + dim_hidden, + bias_first=True, + bias_second=True, + 
dropout=ffn_dropout, + activation=activation, + ), + "attention_res_dropout": nn.Dropout(res_dropout), + "ffn_res_dropout": nn.Dropout(res_dropout), + "ffn_norm": ffn_norm(dim_token), + } + ) if layer_idx != 0: layer["attention_norm"] = att_norm(dim_token) self.layers.append(layer) - self.head = Head(dim_token, - bias=True, - activation=head_activation, - normalization=head_norm, - dim_out=dim_out) + self.head = Head( + dim_token, + bias=True, + activation=head_activation, + normalization=head_norm, + dim_out=dim_out, + ) def forward(self, x): mask = torch.where(x["cat"] == 0, True, False) @@ -84,31 +95,34 @@ def forward(self, x): if self.use_numerical: num = self.numerical_embedding(x["num"]) x = torch.cat([cat, num], dim=1) - mask = torch.cat([mask, torch.zeros([x.shape[0], - num.shape[1]], - device=mask.device, - dtype=mask.dtype)], - dim=1) + mask = torch.cat( + [ + mask, + torch.zeros( + [x.shape[0], num.shape[1]], device=mask.device, dtype=mask.dtype + ), + ], + dim=1, + ) else: x = cat x = self.class_token(x) - mask = torch.cat([mask, torch.zeros([x.shape[0], 1], - device=mask.device, - dtype=mask.dtype)], - dim=1) + mask = torch.cat( + [mask, torch.zeros([x.shape[0], 1], device=mask.device, dtype=mask.dtype)], + dim=1, + ) for i, layer in enumerate(self.layers): x_residual = self.start_residual(layer, "attention", x) - if i == len(self.layers)-1: + if i == len(self.layers) - 1: dims = x_residual.shape x_residual = layer["attention"]( x_residual[:, -1].view([dims[0], 1, dims[2]]).transpose(0, 1), x_residual.transpose(0, 1), x_residual.transpose(0, 1), - mask + mask, ) - attn_weights = x_residual[1] x_residual = x_residual[0] x = x[:, -1].view([dims[0], 1, dims[2]]) else: @@ -116,7 +130,7 @@ def forward(self, x): x_residual.transpose(0, 1), x_residual.transpose(0, 1), x_residual.transpose(0, 1), - mask + mask, )[0] x = self.end_residual(layer, "attention", x, x_residual.transpose(0, 1)) @@ -141,14 +155,15 @@ def end_residual(layer, stage, x, x_residual): class FeedForwardBlock(nn.Module): - - def __init__(self, - dim_token, - dim_hidden, - bias_first=True, - bias_second=True, - dropout=0.0, - activation=ReGLU): + def __init__( + self, + dim_token, + dim_hidden, + bias_first=True, + bias_second=True, + dropout=0.0, + activation=ReGLU, + ): super(FeedForwardBlock, self).__init__() self.linear0 = nn.Linear(dim_token, int(dim_hidden * 2), bias=bias_first) self.activation = activation() @@ -164,13 +179,7 @@ def forward(self, x): class Head(nn.Module): - - def __init__(self, - dim_in, - bias, - activation, - normalization, - dim_out): + def __init__(self, dim_in, bias, activation, normalization, dim_out): super(Head, self).__init__() self.normalization = normalization(dim_in) self.activation = activation() @@ -183,17 +192,16 @@ def forward(self, x): x = self.linear(x) return x -class ClassToken(nn.Module): - def __init__(self, - dim_token): +class ClassToken(nn.Module): + def __init__(self, dim_token): super(ClassToken, self).__init__() self.weight = nn.Parameter(torch.empty(dim_token, 1)) nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5)) def expand(self, dims): new_dims = [1] * (len(dims) - 1) - return self.weight.view(new_dims + [-1]).expand(dims +[-1]) + return self.weight.view(new_dims + [-1]).expand(dims + [-1]) def forward(self, x): return torch.cat([x, self.expand([x.shape[0], 1])], dim=1) diff --git a/man/DeepPatientLevelPrediction.Rd b/man/DeepPatientLevelPrediction.Rd index c97f163..9eb3b36 100644 --- a/man/DeepPatientLevelPrediction.Rd +++ b/man/DeepPatientLevelPrediction.Rd 
@@ -5,5 +5,6 @@ \alias{DeepPatientLevelPrediction} \title{DeepPatientLevelPrediction} \description{ -A package containing deep learning extensions for developing prediction models using data in the OMOP CDM +A package containing deep learning extensions for developing +prediction models using data in the OMOP CDM } diff --git a/man/setDefaultResNet.Rd b/man/setDefaultResNet.Rd index c26f354..e1d7f46 100644 --- a/man/setDefaultResNet.Rd +++ b/man/setDefaultResNet.Rd @@ -16,6 +16,6 @@ setDefaultResNet( Creates settings for a default ResNet model } \details{ -Model architecture from by https://arxiv.org/abs/2106.11959 . +Model architecture from by https://arxiv.org/abs/2106.11959 . Hyperparameters chosen by a experience on a few prediction problems. } diff --git a/man/setEstimator.Rd b/man/setEstimator.Rd index fcb33ab..b8424a3 100644 --- a/man/setEstimator.Rd +++ b/man/setEstimator.Rd @@ -28,7 +28,8 @@ setEstimator( \item{epochs}{how many epochs to train for} -\item{device}{what device to train on, can be a string or a function to that evaluates +\item{device}{what device to train on, can be a string or a function to +that evaluates to the device during runtime} \item{optimizer}{which optimizer to use} @@ -37,11 +38,15 @@ to the device during runtime} \item{criterion}{loss function to use} -\item{earlyStopping}{If earlyStopping should be used which stops the training of your metric is not improving} +\item{earlyStopping}{If earlyStopping should be used which stops the +training of your metric is not improving} -\item{metric}{either `auc` or `loss` or a custom metric to use. This is the metric used for scheduler and earlyStopping. -Needs to be a list with function `fun`, mode either `min` or `max` and a `name`, -`fun` needs to be a function that takes in prediction and labels and outputs a score.} +\item{metric}{either `auc` or `loss` or a custom metric to use. This is the +metric used for scheduler and earlyStopping. +Needs to be a list with function `fun`, mode either `min` or `max` and a +`name`, +`fun` needs to be a function that takes in prediction and labels and +outputs a score.} \item{seed}{seed to initialize weights of model with} } diff --git a/man/setMultiLayerPerceptron.Rd b/man/setMultiLayerPerceptron.Rd index f3676cd..a5f96d7 100644 --- a/man/setMultiLayerPerceptron.Rd +++ b/man/setMultiLayerPerceptron.Rd @@ -19,15 +19,18 @@ setMultiLayerPerceptron( \arguments{ \item{numLayers}{Number of layers in network, default: 1:8} -\item{sizeHidden}{Amount of neurons in each default layer, default: 2^(6:9) (64 to 512)} +\item{sizeHidden}{Amount of neurons in each default layer, +default: 2^(6:9) (64 to 512)} -\item{dropout}{How much dropout to apply after first linear, default: seq(0, 0.3, 0.05)} +\item{dropout}{How much dropout to apply after first linear, +default: seq(0, 0.3, 0.05)} -\item{sizeEmbedding}{Size of embedding layer, default: 2^(6:9) (64 to 512)} +\item{sizeEmbedding}{Size of embedding default: 2^(6:9) (64 to 512)} \item{estimatorSettings}{settings of Estimator created with `setEstimator`} -\item{hyperParamSearch}{Which kind of hyperparameter search to use random sampling or exhaustive grid search. default: 'random'} +\item{hyperParamSearch}{Which kind of hyperparameter search to use random +sampling or exhaustive grid search. 
default: 'random'} \item{randomSample}{How many random samples from hyperparameter space to use} diff --git a/man/setResNet.Rd b/man/setResNet.Rd index fbe3d77..4f4c245 100644 --- a/man/setResNet.Rd +++ b/man/setResNet.Rd @@ -21,21 +21,28 @@ setResNet( \arguments{ \item{numLayers}{Number of layers in network, default: 1:16} -\item{sizeHidden}{Amount of neurons in each default layer, default: 2^(6:10) (64 to 1024)} +\item{sizeHidden}{Amount of neurons in each default layer, default: +2^(6:10) (64 to 1024)} -\item{hiddenFactor}{How much to grow the amount of neurons in each ResLayer, default: 1:4} +\item{hiddenFactor}{How much to grow the amount of neurons in each +ResLayer, default: 1:4} -\item{residualDropout}{How much dropout to apply after last linear layer in ResLayer, default: seq(0, 0.3, 0.05)} +\item{residualDropout}{How much dropout to apply after last linear layer +in ResLayer, default: seq(0, 0.3, 0.05)} -\item{hiddenDropout}{How much dropout to apply after first linear layer in ResLayer, default: seq(0, 0.3, 0.05)} +\item{hiddenDropout}{How much dropout to apply after first linear layer +in ResLayer, default: seq(0, 0.3, 0.05)} -\item{sizeEmbedding}{Size of embedding layer, default: 2^(6:9) (64 to 512)} +\item{sizeEmbedding}{Size of embedding layer, default: 2^(6:9) +'(64 to 512)} \item{estimatorSettings}{created with ```setEstimator```} -\item{hyperParamSearch}{Which kind of hyperparameter search to use random sampling or exhaustive grid search. default: 'random'} +\item{hyperParamSearch}{Which kind of hyperparameter search to use random +sampling or exhaustive grid search. default: 'random'} -\item{randomSample}{How many random samples from hyperparameter space to use} +\item{randomSample}{How many random samples from hyperparameter space +to use} \item{randomSampleSeed}{Random seed to sample hyperparameter combinations} } diff --git a/man/setTransformer.Rd b/man/setTransformer.Rd index 7de56c4..9afcbd5 100644 --- a/man/setTransformer.Rd +++ b/man/setTransformer.Rd @@ -26,7 +26,8 @@ setTransformer( \item{dimToken}{dimension of each token (embedding size)} -\item{dimOut}{dimension of output, usually 1 for binary problems} +\item{dimOut}{dimension of output, usually 1 for binary +problems} \item{numHeads}{number of attention heads} @@ -38,15 +39,19 @@ setTransformer( \item{dimHidden}{dimension of the feedworward block} -\item{dimHiddenRatio}{dimension of the feedforward block as a ratio of dimToken (embedding size)} +\item{dimHiddenRatio}{dimension of the feedforward block as a ratio +of dimToken (embedding size)} \item{estimatorSettings}{created with `setEstimator`} -\item{hyperParamSearch}{what kind of hyperparameter search to do, default 'random'} +\item{hyperParamSearch}{what kind of hyperparameter search to do, +default 'random'} -\item{randomSample}{How many samples to use in hyperparameter search if random} +\item{randomSample}{How many samples to use in hyperparameter +search if random} -\item{randomSampleSeed}{Random seed to sample hyperparameter combinations} +\item{randomSampleSeed}{Random seed to sample hyperparameter +combinations} } \description{ A transformer model diff --git a/man/TrainingCache.Rd b/man/trainingCache.Rd similarity index 79% rename from man/TrainingCache.Rd rename to man/trainingCache.Rd index c82bb23..77d2efc 100644 --- a/man/TrainingCache.Rd +++ b/man/trainingCache.Rd @@ -1,7 +1,7 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/TrainingCache-class.R -\name{TrainingCache} -\alias{TrainingCache} 
+\name{trainingCache}
+\alias{trainingCache}
 \title{TrainingCache}
 \value{
 Whether the provided and cached parameter grid is identical
@@ -18,15 +18,15 @@ Parameter caching for training persistence and continuity
 \section{Methods}{
 \subsection{Public methods}{
 \itemize{
-\item \href{#method-TrainingCache-new}{\code{TrainingCache$new()}}
-\item \href{#method-TrainingCache-isParamGridIdentical}{\code{TrainingCache$isParamGridIdentical()}}
-\item \href{#method-TrainingCache-saveGridSearchPredictions}{\code{TrainingCache$saveGridSearchPredictions()}}
-\item \href{#method-TrainingCache-saveModelParams}{\code{TrainingCache$saveModelParams()}}
-\item \href{#method-TrainingCache-getGridSearchPredictions}{\code{TrainingCache$getGridSearchPredictions()}}
-\item \href{#method-TrainingCache-isFull}{\code{TrainingCache$isFull()}}
-\item \href{#method-TrainingCache-getLastGridSearchIndex}{\code{TrainingCache$getLastGridSearchIndex()}}
-\item \href{#method-TrainingCache-dropCache}{\code{TrainingCache$dropCache()}}
-\item \href{#method-TrainingCache-clone}{\code{TrainingCache$clone()}}
+\item \href{#method-TrainingCache-new}{\code{trainingCache$new()}}
+\item \href{#method-TrainingCache-isParamGridIdentical}{\code{trainingCache$isParamGridIdentical()}}
+\item \href{#method-TrainingCache-saveGridSearchPredictions}{\code{trainingCache$saveGridSearchPredictions()}}
+\item \href{#method-TrainingCache-saveModelParams}{\code{trainingCache$saveModelParams()}}
+\item \href{#method-TrainingCache-getGridSearchPredictions}{\code{trainingCache$getGridSearchPredictions()}}
+\item \href{#method-TrainingCache-isFull}{\code{trainingCache$isFull()}}
+\item \href{#method-TrainingCache-getLastGridSearchIndex}{\code{trainingCache$getLastGridSearchIndex()}}
+\item \href{#method-TrainingCache-dropCache}{\code{trainingCache$dropCache()}}
+\item \href{#method-TrainingCache-clone}{\code{trainingCache$clone()}}
 }
 }
 \if{html}{\out{<hr>}}
@@ -35,7 +35,7 @@
 \subsection{Method \code{new()}}{
 Creates a new training cache
 \subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{TrainingCache$new(inDir)}\if{html}{\out{</div>}}
+\if{html}{\out{<div class="r">}}\preformatted{trainingCache$new(inDir)}\if{html}{\out{</div>}}
 }
 \subsection{Arguments}{
@@ -53,7 +53,7 @@ Creates a new training cache
 Checks whether the parameter grid in the model settings is identical
 to the cached parameters.
 \subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{TrainingCache$isParamGridIdentical(inModelParams)}\if{html}{\out{</div>}}
+\if{html}{\out{<div class="r">}}\preformatted{trainingCache$isParamGridIdentical(inModelParams)}\if{html}{\out{</div>}}
 }
 \subsection{Arguments}{
@@ -70,7 +70,7 @@ the cached parameters.
 \subsection{Method \code{saveGridSearchPredictions()}}{
 Saves the grid search results to the training cache
 \subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{TrainingCache$saveGridSearchPredictions(inGridSearchPredictions)}\if{html}{\out{</div>}}
+\if{html}{\out{<div class="r">}}\preformatted{trainingCache$saveGridSearchPredictions(inGridSearchPredictions)}\if{html}{\out{</div>}}
 }
 \subsection{Arguments}{
@@ -87,7 +87,7 @@ Saves the grid search results to the training cache
 \subsection{Method \code{saveModelParams()}}{
 Saves the parameter grid to the training cache
 \subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{TrainingCache$saveModelParams(inModelParams)}\if{html}{\out{</div>}}
+\if{html}{\out{<div class="r">}}\preformatted{trainingCache$saveModelParams(inModelParams)}\if{html}{\out{</div>}}
 }
 \subsection{Arguments}{
@@ -104,7 +104,7 @@ Saves the parameter grid to the training cache
 \subsection{Method \code{getGridSearchPredictions()}}{
 Gets the grid search results from the training cache
 \subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{TrainingCache$getGridSearchPredictions()}\if{html}{\out{</div>}}
+\if{html}{\out{<div class="r">}}\preformatted{trainingCache$getGridSearchPredictions()}\if{html}{\out{</div>}}
 }
 }
@@ -114,7 +114,7 @@ Gets the grid search results from the training cache
 \subsection{Method \code{isFull()}}{
 Check if cache is full
 \subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{TrainingCache$isFull()}\if{html}{\out{</div>}}
+\if{html}{\out{<div class="r">}}\preformatted{trainingCache$isFull()}\if{html}{\out{</div>}}
 }
 }
@@ -124,7 +124,7 @@ Check if cache is full
 \subsection{Method \code{getLastGridSearchIndex()}}{
 Gets the last index from the cached grid search
 \subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{TrainingCache$getLastGridSearchIndex()}\if{html}{\out{</div>}}
+\if{html}{\out{<div class="r">}}\preformatted{trainingCache$getLastGridSearchIndex()}\if{html}{\out{</div>}}
 }
 }
@@ -134,7 +134,7 @@ Gets the last index from the cached grid search
 \subsection{Method \code{dropCache()}}{
 Remove the training cache from the analysis path
 \subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{TrainingCache$dropCache()}\if{html}{\out{</div>}}
+\if{html}{\out{<div class="r">}}\preformatted{trainingCache$dropCache()}\if{html}{\out{</div>}}
 }
 }
@@ -144,7 +144,7 @@ Remove the training cache from the analysis path
 \subsection{Method \code{clone()}}{
 The objects of this class are cloneable with this method.
 \subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{TrainingCache$clone(deep = FALSE)}\if{html}{\out{</div>}}
+\if{html}{\out{<div class="r">}}\preformatted{trainingCache$clone(deep = FALSE)}\if{html}{\out{</div>}}
 }
 \subsection{Arguments}{
diff --git a/tests/testthat/test-TrainingCache.R b/tests/testthat/test-TrainingCache.R
index debe95c..f96d7b8 100644
--- a/tests/testthat/test-TrainingCache.R
+++ b/tests/testthat/test-TrainingCache.R
@@ -14,7 +14,7 @@ resNetSettings <- setResNet(numLayers = c(1, 2, 4),
                             randomSample = 3,
                             randomSampleSeed = NULL)

-trainCache <- TrainingCache$new(testLoc)
+trainCache <- trainingCache$new(testLoc)
 paramSearch <- resNetSettings$param

 test_that("Training cache exists on disk", {