diff --git a/.Rbuildignore b/.Rbuildignore index d003b0a..ac33fdc 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,6 +1,7 @@ ^.*\.Rproj$ ^\.Rproj\.user$ ^\.github$ +^\.idea$ ^_pkgdown\.yml$ ^docs$ ^pkgdown$ diff --git a/.github/workflows/R_CDM_check_hades.yaml b/.github/workflows/R_CDM_check_hades.yaml index f064053..fa5072b 100644 --- a/.github/workflows/R_CDM_check_hades.yaml +++ b/.github/workflows/R_CDM_check_hades.yaml @@ -22,7 +22,7 @@ jobs: config: - {os: windows-latest, r: 'release'} - {os: macOS-latest, r: 'release'} - - {os: ubuntu-20.04, r: 'release', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"} + - {os: ubuntu-22.04, r: 'release', rspm: "https://packagemanager.rstudio.com/cran/__linux__/jammy/latest"} env: GITHUB_PAT: ${{ secrets.GH_TOKEN }} @@ -43,7 +43,6 @@ jobs: CDM5_SQL_SERVER_PASSWORD: ${{ secrets.CDM5_SQL_SERVER_PASSWORD }} CDM5_SQL_SERVER_SERVER: ${{ secrets.CDM5_SQL_SERVER_SERVER }} CDM5_SQL_SERVER_USER: ${{ secrets.CDM5_SQL_SERVER_USER }} - TORCH_INSTALL: '1' steps: - uses: actions/checkout@v2 @@ -52,9 +51,13 @@ jobs: with: r-version: ${{ matrix.config.r }} + - uses: actions/setup-python@v4 + with: + python-version: '3.10' + - uses: r-lib/actions/setup-tinytex@v2 - - uses: r-lib/actions/setup-pandoc@v1 + - uses: r-lib/actions/setup-pandoc@v2 - name: Install system requirements if: runner.os == 'Linux' @@ -70,7 +73,22 @@ jobs: with: extra-packages: any::rcmdcheck needs: check - + + - name: setup r-reticulate venv + shell: Rscript {0} + run: | + python_packages <- + c("polars", "tqdm", "connectorx", "pyarrow", "scikit-learn") + + library(reticulate) + virtualenv_create("r-reticulate", Sys.which("python")) + virtualenv_install("r-reticulate", python_packages) + virtualenv_install("r-reticulate", "torch", pip_options = c("--index-url https://download.pytorch.org/whl/cpu")) + + path_to_python <- virtualenv_python("r-reticulate") + writeLines(sprintf("RETICULATE_PYTHON=%s", path_to_python), + Sys.getenv("GITHUB_ENV")) + - uses: r-lib/actions/check-r-package@v2 with: args: 'c("--no-manual", "--as-cran")' diff --git a/.gitignore b/.gitignore index c567877..56db315 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,6 @@ results config.yml docs +.idea/ +renv.lock +extras/ \ No newline at end of file diff --git a/DESCRIPTION b/DESCRIPTION index c2a17c7..496abc8 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: DeepPatientLevelPrediction Type: Package Title: Deep Learning For Patient Level Prediction Using Data In The OMOP Common Data Model -Version: 1.1.6 +Version: 2.0.0 Date: 18-04-2023 Authors@R: c( person("Egill", "Fridgeirsson", email = "e.fridgeirsson@erasmusmc.nl", role = c("aut", "cre")), @@ -22,10 +22,10 @@ Imports: dplyr, FeatureExtraction (>= 3.0.0), ParallelLogger (>= 2.0.0), - PatientLevelPrediction (>= 6.0.4), + PatientLevelPrediction (>= 6.3.2), rlang, - torch (>= 0.10.0), - withr + withr, + reticulate (>= 1.31) Suggests: devtools, Eunomia, @@ -44,3 +44,14 @@ Remotes: RoxygenNote: 7.2.3 Encoding: UTF-8 Config/testthat/edition: 3 +Config/reticulate: + list( + packages = list( + list(package = "torch"), + list(package = "polars"), + list(package = "tqdm"), + list(package = "connectorx"), + list(package = "scikit-learn"), + list(package = "pyarrow") + ) + ) diff --git a/NAMESPACE b/NAMESPACE index bea6721..4806f0d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,11 +1,8 @@ # Generated by roxygen2: do not edit by hand -export(Dataset) -export(Estimator) export(TrainingCache) export(fitEstimator) export(gridCvDeep) 
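The Python environment that the CI workflow above builds can be reproduced locally before installing the package. A minimal sketch using reticulate, with the virtualenv name and package list taken from the workflow step (it assumes a Python 3 interpreter is on the PATH; the CPU-only torch index URL can be dropped for a GPU build):

library(reticulate)
# python packages the backend relies on (see also Config/reticulate in DESCRIPTION)
pythonPackages <- c("polars", "tqdm", "connectorx", "pyarrow", "scikit-learn")
virtualenv_create("r-reticulate", Sys.which("python"))
virtualenv_install("r-reticulate", pythonPackages)
virtualenv_install("r-reticulate", "torch",
                   pip_options = c("--index-url https://download.pytorch.org/whl/cpu"))
use_virtualenv("r-reticulate")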
-export(lrFinder) export(predictDeepEstimator) export(setDefaultResNet) export(setDefaultTransformer) @@ -14,4 +11,6 @@ export(setMultiLayerPerceptron) export(setResNet) export(setTransformer) importFrom(dplyr,"%>%") +importFrom(reticulate,py_to_r) +importFrom(reticulate,r_to_py) importFrom(rlang,.data) diff --git a/NEWS.md b/NEWS.md index eb6a59b..3da704b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,14 @@ +DeepPatientLevelPrediction 2.0.0 +====================== + - New backend which uses pytorch through reticulate instead of torch in R + - All models ported over to python + - Dataset class now in python + - Estimator class in python + - Learning rate finder in python + - Added input checks and tests for wrong inputs + - Training-cache for single hyperparameter combination added + - Fixed empty test for training-cache + DeepPatientLevelPrediction 1.1.6 ====================== - Caching and resuming of hyperparameter iterations diff --git a/R/Dataset.R b/R/Dataset.R index 3b0dd8d..f4b5170 100644 --- a/R/Dataset.R +++ b/R/Dataset.R @@ -1,124 +1,35 @@ -#' A torch dataset -#' @export -Dataset <- torch::dataset( - name = "myDataset", - #' @param data a dataframe like object with the covariates - #' @param labels a dataframe with the labels - #' @param numericalIndex in what column numeric data is in (if any) - initialize = function(data, labels = NULL, numericalIndex = NULL) { - # determine numeric - if (is.null(numericalIndex)) { - numericalIndex <- data %>% - dplyr::arrange(columnId) %>% - dplyr::group_by(columnId) %>% - dplyr::summarise(n = dplyr::n_distinct(.data$covariateValue)) %>% - dplyr::collect() %>% - dplyr::pull(n) > 1 - self$numericalIndex <- numericalIndex - } else { - self$numericalIndex <- NULL - } - - # add labels if training (make 0 vector for prediction) - if (!is.null(labels)) { - self$target <- torch::torch_tensor(labels) - } else { - self$target <- torch::torch_tensor(rep(0, data %>% dplyr::summarize(m=max(rowId)) %>% - dplyr::collect() %>% dplyr::pull())) - } - # Weight to add in loss function to positive class - self$posWeight <- (self$target == 0)$sum() / self$target$sum() - - # add features - catColumns <- which(!numericalIndex) - dataCat <- dplyr::filter(data, columnId %in% catColumns) %>% - dplyr::arrange(columnId) %>% - dplyr::group_by(columnId) %>% - dplyr::collect() %>% - dplyr::mutate(newColumnId = dplyr::cur_group_id()) %>% - dplyr::ungroup() %>% - dplyr::select(c("rowId", "newColumnId")) %>% - dplyr::rename(columnId = newColumnId) %>% - dplyr::arrange(rowId) - start <- Sys.time() - catTensor <- torch::torch_tensor(cbind(dataCat$rowId, dataCat$columnId)) - catTensor <- catTensor[catTensor[,1]$argsort(),] - tensorList <- torch::torch_split(catTensor[,2], - as.numeric(torch::torch_unique_consecutive(catTensor[,1], - return_counts = TRUE)[[3]])) - - # because of subjects without cat features, I need to create a list with all zeroes and then insert - # my tensorList. That way I can still index the dataset correctly. 
- totalList <- as.list(integer(length(self$target))) - totalList[unique(dataCat$rowId)] <- tensorList - self$lengths <- lengths - self$cat <- torch::nn_utils_rnn_pad_sequence(totalList, batch_first = T) - delta <- Sys.time() - start - ParallelLogger::logInfo("Data conversion for dataset took ", signif(delta, 3), " ", attr(delta, "units")) - if (sum(numericalIndex) == 0) { - self$num <- NULL - } else { - numericalData <- data %>% - dplyr::filter(columnId %in% !!which(numericalIndex)) %>% - dplyr::collect() - numericalData <- numericalData %>% - dplyr::group_by(columnId) %>% - dplyr::mutate(newId = dplyr::cur_group_id()) - indices <- torch::torch_tensor(cbind(numericalData$rowId, numericalData$newId), dtype = torch::torch_long())$t_() - values <- torch::torch_tensor(numericalData$covariateValue, dtype = torch::torch_float32()) - self$num <- torch::torch_sparse_coo_tensor( - indices = indices, - values = values, - size = c(self$target$shape, sum(numericalIndex)) - )$to_dense() - } - }, - getNumericalIndex = function() { - return( - self$numericalIndex - ) - }, - numCatFeatures = function() { - return( - sum(!self$numericalIndex) - ) - }, - numNumFeatures = function() { - if (!is.null(self$num)) { - return(self$num$shape[2]) - } else { - return(0) - } - }, - .getbatch = function(item) { - if (length(item) == 1) { - return(self$.getBatchSingle(item)) - } else { - return(self$.getBatchRegular(item)) - } - }, - .getBatchSingle = function(item) { - # add leading singleton dimension since models expects 2d tensors - batch <- list( - cat = self$cat[item]$unsqueeze(1), - num = self$num[item]$unsqueeze(1) - ) - return(list( - batch = batch, - target = self$target[item]$unsqueeze(1) - )) - }, - .getBatchRegular = function(item) { - batch <- list( - cat = self$cat[item], - num = self$num[item] - ) - return(list( - batch = batch, - target = self$target[item] - )) - }, - .length = function() { - self$target$size()[[1]] # shape[1] +# @file Dataset.R +# +# Copyright 2023 Observational Health Data Sciences and Informatics +# +# This file is part of DeepPatientLevelPrediction +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+createDataset <- function(data, labels, plpModel=NULL) { + path <- system.file("python", package = "DeepPatientLevelPrediction") + Dataset <- reticulate::import_from_path("Dataset", path = path)$Data + if (is.null(attributes(data)$path)) { + # sqlite object + attributes(data)$path <- attributes(data)$dbname } -) + if (is.null(plpModel)) { + data <- Dataset(r_to_py(normalizePath(attributes(data)$path)), + r_to_py(labels$outcomeCount)) + } + else { + data <- Dataset(r_to_py(normalizePath(attributes(data)$path)), + numerical_features = r_to_py(as.array(which(plpModel$covariateImportance$isNumeric))) ) + } + + return(data) +} \ No newline at end of file diff --git a/R/DeepPatientLevelPrediction.R b/R/DeepPatientLevelPrediction.R index 6070709..e77812e 100644 --- a/R/DeepPatientLevelPrediction.R +++ b/R/DeepPatientLevelPrediction.R @@ -23,5 +23,14 @@ #' @docType package #' @name DeepPatientLevelPrediction #' @importFrom dplyr %>% +#' @importFrom reticulate r_to_py py_to_r #' @importFrom rlang .data NULL + +.onLoad <- function(libname, pkgname) { + # use superassignment to update global reference + reticulate::configure_environment(pkgname) + torch <<- reticulate::import("torch", delay_load = TRUE) +} + + diff --git a/R/Estimator-class.R b/R/Estimator-class.R deleted file mode 100644 index 5d2fe76..0000000 --- a/R/Estimator-class.R +++ /dev/null @@ -1,468 +0,0 @@ -# @file Estimator-class.R -# -# Copyright 2022 Observational Health Data Sciences and Informatics -# -# This file is part of DeepPatientLevelPrediction -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#' Estimator -#' @description -#' A generic R6 class that wraps around a torch nn module and can be used to -#' fit and predict the model defined in that module. 
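createDataset is only called internally (from gridCvDeep during training and from predictDeepEstimator at prediction time), but a rough sketch of the two call patterns shows what the python Data class receives. It assumes `mappedData` is the covariate object coming from PatientLevelPrediction, whose path/dbname attribute points at the sqlite file, and `labels` is a data frame with an outcomeCount column:

# training: pass the labels so the python dataset knows the outcomes
dataset <- createDataset(data = mappedData, labels = labels)

# prediction: no labels; numeric columns are taken from the fitted model instead
dataset <- createDataset(mappedData, plpModel = plpModel)

# the python object exposes helpers used later in gridCvDeep
nCat <- dataset$get_cat_features()$shape[[1]]
nNum <- dataset$get_numerical_features()$shape[[1]]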
-#' @export -Estimator <- R6::R6Class( - classname = "Estimator", - lock_objects = FALSE, - public = list( - #' @description - #' Creates a new estimator - #' @param modelType The torch nn module to use as model - #' @param modelParameters Parameters to initialize the model - #' @param estimatorSettings Parameters required for the estimator fitting - initialize = function(modelType, - modelParameters, - estimatorSettings) { - self$seed <- estimatorSettings$seed - if (is.function(estimatorSettings$device)) { - device <- estimatorSettings$device() - } else { - device <- estimatorSettings$device - } - self$device <- device - torch::torch_manual_seed(seed=self$seed) - self$model <- do.call(modelType, modelParameters) - self$modelParameters <- modelParameters - self$estimatorSettings <- estimatorSettings - self$epochs <- self$itemOrDefaults(estimatorSettings, "epochs", 10) - self$learningRate <- self$itemOrDefaults(estimatorSettings, "learningRate", 1e-3) - self$l2Norm <- self$itemOrDefaults(estimatorSettings, "weightDecay", 1e-5) - self$batchSize <- self$itemOrDefaults(estimatorSettings, "batchSize", 1024) - self$prefix <- self$itemOrDefaults(estimatorSettings, "prefix", self$model$name) - - self$previousEpochs <- self$itemOrDefaults(estimatorSettings, "previousEpochs", 0) - self$model$to(device = self$device) - - self$optimizer <- estimatorSettings$optimizer( - params = self$model$parameters, - lr = self$learningRate, - weight_decay = self$l2Norm - ) - self$criterion <- estimatorSettings$criterion() - - if (!is.null(estimatorSettings$metric)) { - self$metric <- estimatorSettings$metric - if (is.character(self$metric)) { - if (self$metric == "auc") { - self$metric <- list(name="auc", - mode="max") - } else if (self$metric == "loss") { - self$metric <- list(name="loss", - mode="min") - } - } - if (!is.null(estimatorSettings$scheduler)) { - estimatorSettings$scheduler$params$mode <- self$metrix$mode - } - if (!is.null(estimatorSettings$earlyStopping)) { - estimatorSettings$earlyStopping$params$mode <- self$metric$mode - } - } - - if (!is.null(estimatorSettings$scheduler)) { - self$scheduler <- do.call(estimatorSettings$scheduler$fun, - c(self$optimizer, estimatorSettings$scheduler$params)) - } - - # gradient accumulation is useful when training large numbers where - # you can only fit few samples on the GPU in each batch. 
- self$gradAccumulationIter <- 1 - - if (!is.null(estimatorSettings$earlyStopping) && estimatorSettings$earlyStopping$useEarlyStopping) { - self$earlyStopper <- do.call(EarlyStopping$new, estimatorSettings$earlyStopping$params) - } else { - self$earlyStopper <- NULL - } - - self$bestScore <- NULL - self$bestEpoch <- NULL - }, - - #' @description fits the estimator - #' @param dataset a torch dataset to use for model fitting - #' @param testDataset a torch dataset to use for early stopping - fit = function(dataset, testDataset) { - allScores <- list() - batchIndex <- torch::torch_randperm(length(dataset)) + 1L - batchIndex <- split(batchIndex, ceiling(seq_along(batchIndex) / self$batchSize)) - - testBatchIndex <- 1:length(testDataset) - testBatchIndex <- split(testBatchIndex, ceiling(seq_along(testBatchIndex) / self$batchSize)) - - modelStateDict <- list() - epoch <- list() - times <- list() - learnRates <- list() - for (epochI in 1:self$epochs) { - startTime <- Sys.time() - trainLoss <- self$fitEpoch(dataset, batchIndex) - endTime <- Sys.time() - - # predict on test data - scores <- self$score(testDataset, testBatchIndex) - delta <- endTime - startTime - currentEpoch <- epochI + self$previousEpochs - lr <- self$optimizer$param_groups[[1]]$lr - self$printProgress(scores, trainLoss, delta, currentEpoch) - self$scheduler$step(scores$metric) - allScores[[epochI]] <- scores - learnRates <- c(learnRates, lr) - times <- c(times, round(delta, 3)) - if (!is.null(self$earlyStopper)) { - self$earlyStopper$call(scores$metric) - if (self$earlyStopper$improved) { - # here it saves the results to lists rather than files - modelStateDict[[epochI]] <- lapply(self$model$state_dict(), function(x) x$detach()$cpu()) - epoch[[epochI]] <- currentEpoch - } - if (self$earlyStopper$earlyStop) { - ParallelLogger::logInfo("Early stopping, validation metric stopped improving") - ParallelLogger::logInfo("Average time per epoch was: ", round(mean(as.numeric(times)), 3), " ", units(delta)) - self$finishFit(allScores, modelStateDict, epoch, learnRates) - return(invisible(self)) - } - } else { - modelStateDict[[epochI]] <- lapply(self$model$state_dict(), function(x) x$detach()$cpu()) - epoch[[epochI]] <- currentEpoch - } - } - ParallelLogger::logInfo("Average time per epoch was: ", round(mean(as.numeric(times)), 3), " ", units(delta)) - self$finishFit(allScores, modelStateDict, epoch, learnRates) - invisible(self) - }, - - #' @description - #' fits estimator for one epoch (one round through the data) - #' @param dataset torch dataset to use for fitting - #' @param batchIndex indices of batches - fitEpoch = function(dataset, batchIndex) { - trainLosses <- torch::torch_empty(length(batchIndex)) - ix <- 1 - self$model$train() - progressBar <- utils::txtProgressBar(style = 3) - for (b in batchIndex) { - self$optimizer$zero_grad() - batch <- batchToDevice(dataset[b], device=self$device) - out <- self$model(batch[[1]]) - loss <- self$criterion(out, batch[[2]]) - loss$backward() - - self$optimizer$step() - trainLosses[ix] <- loss$detach() - utils::setTxtProgressBar(progressBar, ix / length(batchIndex)) - ix <- ix + 1 - } - close(progressBar) - trainLosses$mean()$item() - }, - - #' @description - #' calculates loss and auc after training for one epoch - #' @param dataset The torch dataset to use to evaluate loss and auc - #' @param batchIndex Indices of batches in the dataset - #' @return list with average loss and auc in the dataset - score = function(dataset, batchIndex) { - torch::with_no_grad({ - loss <- 
torch::torch_empty(c(length(batchIndex))) - predictions <- list() - targets <- list() - self$model$eval() - ix <- 1 - for (b in batchIndex) { - batch <- batchToDevice(dataset[b], device=self$device) - pred <- self$model(batch[[1]]) - predictions <- c(predictions, pred) - targets <- c(targets, batch[[2]]) - loss[ix] <- self$criterion(pred, batch[[2]]) - ix <- ix + 1 - } - mean_loss <- loss$mean()$item() - predictionsClass <- data.frame( - value = as.matrix(torch::torch_sigmoid(torch::torch_cat(predictions)$cpu())), - outcomeCount = as.matrix(torch::torch_cat(targets)$cpu()) - ) - attr(predictionsClass, "metaData")$modelType <- "binary" - auc <- PatientLevelPrediction::computeAuc(predictionsClass) - scores <- list() - if (!is.null(self$metric)) { - if (self$metric$name == "auc") { - scores$metric <- auc - } else if (self$metric$name == "loss") { - scores$metric <- mean_loss - } else { - metric <- self$metric$fun(predictionsClass$value, predictionsClass$outcomeCount) - scores$metric <- metric - } - } - scores$auc <- auc - scores$loss <- mean_loss - }) - return(scores) - }, - - #' @description - #' operations that run when fitting is finished - #' @param scores validation scores - #' @param modelStateDict fitted model parameters - #' @param epoch list of epochs fit - #' @param learnRates learning rate sequence used so far - finishFit = function(scores, modelStateDict, epoch, learnRates) { - if (self$metric$mode=="max") { - bestEpochInd <- which.max(unlist(lapply(scores, function(x) x$metric))) - } - else if (self$metric$mode=="min") { - bestEpochInd <- which.min(unlist(lapply(scores, function(x) x$metric))) - } - - bestModelStateDict <- lapply(modelStateDict[[bestEpochInd]], function(x) x$to(device = self$device)) - self$model$load_state_dict(bestModelStateDict) - - bestEpoch <- epoch[[bestEpochInd]] - self$bestEpoch <- bestEpoch - self$bestScore <- list( - loss = scores[[bestEpochInd]]$loss, - auc = scores[[bestEpochInd]]$auc - ) - self$learnRateSchedule <- learnRates[1:bestEpochInd] - - ParallelLogger::logInfo("Loaded best model (based on AUC) from epoch ", bestEpoch) - ParallelLogger::logInfo("ValLoss: ", self$bestScore$loss) - ParallelLogger::logInfo("valAUC: ", self$bestScore$auc) - if (!is.null(self$metric) && (!self$metric$name=='auc') && (!self$metric$name=='loss')) { - self$bestScore[[self$metric$name]] <- scores[[bestEpochInd]]$metric - ParallelLogger::logInfo(self$metric$name,": ", self$bestScore[[self$metric$name]]) - } - }, - - #' @description Print out training progress per epoch - #' @param scores scores returned by `self$score` - #' @param trainLoss training loss - #' @param delta how long did the epoch take - #' @param currentEpoch the current epoch number - printProgress = function(scores, trainLoss, delta, currentEpoch) { - if (!is.null(self$metric) && (!self$metric$name=='auc') && (!self$metric$name=='loss')) { - ParallelLogger::logInfo( - "Epochs: ", currentEpoch, - " | Val ", self$metric$name, ": ", round(scores$metric, 3), - " | Val AUC: ", round(scores$auc, 3), - " | Val Loss: ", round(scores$loss, 3), - " | Train Loss: ", round(trainLoss, 3), - " | Time: ", round(delta, 3), " ", - units(delta), - " | LR: ", self$optimizer$param_groups[[1]]$lr - ) - } else { - ParallelLogger::logInfo( - "Epochs: ", currentEpoch, - " | Val AUC: ", round(scores$auc, 3), - " | Val Loss: ", round(scores$loss, 3), - " | Train Loss: ", round(trainLoss, 3), - " | Time: ", round(delta, 3), " ", - units(delta), - " | LR: ", self$optimizer$param_groups[[1]]$lr - ) - } - }, - - #' @description - 
#' Fits whole training set on a specific number of epochs - #' @param dataset torch dataset - #' @param learnRates learnRateSchedule from CV - fitWholeTrainingSet = function(dataset, learnRates = NULL) { - if (length(learnRates) > 1) { - self$bestEpoch <- length(learnRates) - } else if (is.null(self$bestEpoch)) { - self$bestEpoch <- self$epochs - } - # TODO constant LR - - batchIndex <- torch::torch_randperm(length(dataset)) + 1L - batchIndex <- split(batchIndex, ceiling(seq_along(batchIndex) / self$batchSize)) - for (epoch in 1:self$bestEpoch) { - self$optimizer$param_groups[[1]]$lr <- learnRates[[epoch]] - self$fitEpoch(dataset, batchIndex) - } - }, - - #' @description - #' save model and those parameters needed to reconstruct it - #' @param path where to save the model - #' @param name name of file - #' @return the path to saved model - save = function(path, name) { - savePath <- file.path(path, name) - torch::torch_save( - list( - modelStateDict = self$model$state_dict(), - modelParameters = self$modelParameters, - estimatorSettings = self$estimatorSettings, - epoch = self$epochs - ), - savePath - ) - return(savePath) - }, - - - #' @description - #' predicts and outputs the probabilities - #' @param dataset Torch dataset to create predictions for - #' @return predictions as probabilities - predictProba = function(dataset) { - batchIndex <- 1:length(dataset) - batchIndex <- split(batchIndex, ceiling(seq_along(batchIndex) / self$batchSize)) - torch::with_no_grad({ - predictions <- torch::torch_empty(length(dataset), device=self$device) - self$model$eval() - progressBar <- utils::txtProgressBar(style = 3) - ix <- 1 - coro::loop(for (b in batchIndex) { - batch <- batchToDevice(dataset[b], self$device) - target <- batch$target - pred <- self$model(batch$batch) - predictions[b] <- torch::torch_sigmoid(pred) - utils::setTxtProgressBar(progressBar, ix / length(batchIndex)) - ix <- ix + 1 - }) - predictions <- as.array(predictions$cpu()) - close(progressBar) - }) - return(predictions) - }, - - #' @description - #' predicts and outputs the class - #' @param dataset A torch dataset to create predictions for - #' @param threshold Which threshold to use for predictions - #' @return The predicted class for the data in the dataset - predict = function(dataset, threshold = NULL) { - predictions <- self$predictProba(dataset) - - if (is.null(threshold)) { - # use outcome rate - threshold <- dataset$target$sum()$item() / length(dataset) - } - predicted_class <- as.integer(predictions > threshold) - return(predicted_class) - }, - - #' @description - #' select item from list, and if it's null sets a default - #' @param list A list with items - #' @param item Which list item to retrieve - #' @param default The value to return if list doesn't have item - #' @return the list item or default - itemOrDefaults = function(list, item, default = NULL) { - value <- list[[item]] - if (is.null(value)) default else value - } - ) -) - -#' Earlystopping class -#' @description -#' Stops training if a loss or metric has stopped improving -EarlyStopping <- R6::R6Class( - classname = "EarlyStopping", - lock_objects = FALSE, - public = list( - #' @description - #' Creates a new earlyStopping object - #' @param patience Stop after this number of epochs if loss doesn't improve - #' @param delta How much does the loss need to improve to count as improvement - #' @param verbose If information should be printed out - #' @param mode either `min` or `max` depending on metric to be used for earlyStopping - #' @return a new 
earlystopping object - initialize = function(patience = 3, delta = 0, verbose = TRUE, - mode='max') { - self$patience <- patience - self$counter <- 0 - self$verbose <- verbose - self$bestScore <- NULL - self$earlyStop <- FALSE - self$improved <- FALSE - self$delta <- delta - self$previousScore <- 0 - self$mode <- mode - }, - #' @description - #' call the earlystopping object and increment a counter if loss is not - #' improving - #' @param metric the current metric value - call = function(metric) { - if (self$mode=='max') { - score <- metric - } else { - score <- -1 * metric - } - if (is.null(self$bestScore)) { - self$bestScore <- score - self$improved <- TRUE - } else if (score < self$bestScore + self$delta) { - self$counter <- self$counter + 1 - self$improved <- FALSE - if (self$verbose) { - ParallelLogger::logInfo( - "EarlyStopping counter: ", self$counter, - " out of ", self$patience - ) - } - if (self$counter >= self$patience) { - self$earlyStop <- TRUE - } - } else { - self$bestScore <- score - self$counter <- 0 - self$improved <- TRUE - } - self$previousScore <- score - } - ) -) - -#' sends a batch of data to device -#' @description -#' sends a batch of data to device -#' assumes batch includes lists of tensors to arbitrary nested depths -#' @param batch the batch to send, usually a list of torch tensors -#' @param device which device to send batch to -#' @return the batch on the required device -batchToDevice = function(batch, device) { - if (class(batch)[1] == "torch_tensor") { - batch <- batch$to(device = device) - } else { - ix <- 1 - for (b in batch) { - if (class(b)[1] == "torch_tensor") { - b <- b$to(device = device) - } else { - b <- batchToDevice(b, device) - } - if (!is.null(b)) { - batch[[ix]] <- b - } - ix <- ix + 1 - } - } - return(batch) -} diff --git a/R/Estimator.R b/R/Estimator.R index 529c315..573c6a6 100644 --- a/R/Estimator.R +++ b/R/Estimator.R @@ -42,34 +42,65 @@ setEstimator <- function(learningRate='auto', batchSize = 512, epochs = 30, device='cpu', - optimizer = torch::optim_adamw, - scheduler = list(fun=torch::lr_reduce_on_plateau, + optimizer = torch$optim$AdamW, + scheduler = list(fun=torch$optim$lr_scheduler$ReduceLROnPlateau, params=list(patience=1)), - criterion = torch::nn_bce_with_logits_loss, + criterion = torch$nn$BCEWithLogitsLoss, earlyStopping = list(useEarlyStopping=TRUE, params = list(patience=4)), metric = "auc", seed = NULL ) { + + checkIsClass(learningRate, c("numeric", "character")) + if (inherits(learningRate, "character")) { + if (learningRate != "auto"){ + stop(paste0('Learning rate should be either a numeric or "auto", you provided: ', learningRate)) + } + } + checkIsClass(weightDecay, "numeric") + checkHigherEqual(weightDecay, 0.0) + checkIsClass(batchSize, c("numeric", "integer")) + checkHigher(batchSize, 0) + checkIsClass(epochs, c("numeric", "integer")) + checkHigher(epochs, 0) + checkIsClass(device, c("character", "function")) + checkIsClass(scheduler, "list") + checkIsClass(earlyStopping, c("list", "NULL")) + checkIsClass(metric, c("character", "list")) + checkIsClass(seed, c("numeric", "integer", "NULL")) + + if (length(learningRate)==1 && learningRate=='auto') {findLR <- TRUE} else {findLR <- FALSE} if (is.null(seed)) { seed <- as.integer(sample(1e5, 1)) } - - estimatorSettings <- list(learningRate=learningRate, weightDecay=weightDecay, batchSize=batchSize, epochs=epochs, device=device, - optimizer=optimizer, - scheduler=scheduler, - criterion=criterion, earlyStopping=earlyStopping, findLR=findLR, metric=metric, - seed=seed[1] 
- ) + seed=seed[1]) + + optimizer <- rlang::enquo(optimizer) + estimatorSettings$optimizer <- function() rlang::eval_tidy(optimizer) + class(estimatorSettings$optimizer) <- c("delayed", class(estimatorSettings$optimizer)) + + criterion <- rlang::enquo(criterion) + estimatorSettings$criterion <- function() rlang::eval_tidy(criterion) + class(estimatorSettings$criterion) <- c("delayed", class(estimatorSettings$criterion)) + + scheduler <- rlang::enquo(scheduler) + estimatorSettings$scheduler <- function() rlang::eval_tidy(scheduler) + class(estimatorSettings$scheduler) <-c("delayed", class(estimatorSettings$scheduler)) + + if (is.function(device)) { + class(estimatorSettings$device) <- c("delayed", class(estimatorSettings$device)) + } + paramsToTune <- list() for (name in names(estimatorSettings)) { param <- estimatorSettings[[name]] @@ -77,7 +108,7 @@ setEstimator <- function(learningRate='auto', paramsToTune[[paste0('estimator.',name)]] <- param } if ("params" %in% names(param)) { - for (name2 in names(param[["params"]])) { + for (name2 in names(param[["params"]])) { param2 <- param[["params"]][[name2]] if (length(param2) > 1) { paramsToTune[[paste0('estimator.',name,'.',name2)]] <- param2 @@ -86,9 +117,10 @@ setEstimator <- function(learningRate='auto', } } estimatorSettings$paramsToTune <- paramsToTune + return(estimatorSettings) } - + #' fitEstimator #' #' @description @@ -138,14 +170,15 @@ fitEstimator <- function(trainData, hyperSummary <- do.call(rbind, lapply(cvResult$paramGridSearch, function(x) x$hyperSummary)) prediction <- cvResult$prediction incs <- rep(1, covariateRef %>% dplyr::tally() %>% - dplyr::collect () %>% - as.integer) + dplyr::collect() %>% + as.integer()) covariateRef <- covariateRef %>% + dplyr::arrange("columnId") %>% dplyr::collect() %>% dplyr::mutate( included = incs, covariateValue = 0, - isNumeric = cvResult$numericalIndex + isNumeric = .data$columnId %in% cvResult$numericalIndex ) comp <- start - Sys.time() @@ -218,27 +251,23 @@ predictDeepEstimator <- function(plpModel, "covariateId" ) ) - data <- Dataset(mappedData$covariates, - numericalIndex = plpModel$covariateImportance$isNumeric - ) + data <- createDataset(mappedData, plpModel=plpModel) } # get predictions prediction <- cohort if (is.character(plpModel$model)) { - model <- torch::torch_load(file.path(plpModel$model, "DeepEstimatorModel.pt"), device = "cpu") - estimator <- Estimator$new( - modelType = plpModel$modelDesign$modelSettings$modelType, - modelParameters = model$modelParameters, - estimatorSettings = model$estimatorSettings - ) - estimator$model$load_state_dict(model$modelStateDict) - prediction$value <- estimator$predictProba(data) + modelSettings <- plpModel$modelDesign$modelSettings + model <- torch$load(file.path(plpModel$model, "DeepEstimatorModel.pt"), map_location = "cpu") + estimator <- createEstimator(modelType=modelSettings$modelType, + modelParameters=model$model_parameters, + estimatorSettings=model$estimator_settings) + estimator$model$load_state_dict(model$model_state_dict) + prediction$value <- estimator$predict_proba(data) } else { - prediction$value <- plpModel$model$predictProba(data) + prediction$value <- plpModel$model$predict_proba(data) } - attr(prediction, "metaData")$modelType <- attr(plpModel, "modelType") return(prediction) @@ -267,9 +296,10 @@ gridCvDeep <- function(mappedData, ########################################################################### paramSearch <- modelSettings$param - trainCache <- TrainingCache$new(analysisPath) - if 
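For reference, a sketch of how the reworked setEstimator is intended to be called: the torch defaults now point at the Python module imported on package load, and the optimizer/criterion/scheduler arguments are captured lazily (via rlang::enquo above) so they are only evaluated once the Python backend is available.

estimatorSettings <- setEstimator(
  learningRate = "auto",           # "auto" turns on the learning rate finder
  weightDecay = c(1e-6, 1e-3),     # vector-valued settings become part of the hyperparameter grid
  batchSize = 1024,
  epochs = 30,
  device = "cpu",                  # or a function returning a device, evaluated later
  optimizer = torch$optim$AdamW,   # pytorch objects accessed through reticulate
  metric = "auc",
  seed = 42
)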
(trainCache$isParamGridIdentical(paramSearch)) { + # TODO below chunk should be in a setupCache function + trainCache <- TrainingCache$new(analysisPath) + if (trainCache$isParamGridIdentical(paramSearch)) { gridSearchPredictons <- trainCache$getGridSearchPredictions() } else { gridSearchPredictons <- list() @@ -278,55 +308,46 @@ gridCvDeep <- function(mappedData, trainCache$saveModelParams(paramSearch) } - dataset <- Dataset(mappedData$covariates, labels$outcomeCount) - - estimatorSettings <- modelSettings$estimatorSettings + dataset <- createDataset(data=mappedData, labels=labels) fitParams <- names(paramSearch[[1]])[grepl("^estimator", names(paramSearch[[1]]))] - + findLR <- modelSettings$estimatorSettings$findLR for (gridId in trainCache$getLastGridSearchIndex():length(paramSearch)) { ParallelLogger::logInfo(paste0("Running hyperparameter combination no ", gridId)) ParallelLogger::logInfo(paste0("HyperParameters: ")) ParallelLogger::logInfo(paste(names(paramSearch[[gridId]]), paramSearch[[gridId]], collapse = " | ")) - modelParams <- paramSearch[[gridId]][modelSettings$modelParamNames] - + currentModelParams <- paramSearch[[gridId]][modelSettings$modelParamNames] - estimatorSettings <- fillEstimatorSettings(estimatorSettings, fitParams, + currentEstimatorSettings <- fillEstimatorSettings(modelSettings$estimatorSettings, fitParams, paramSearch[[gridId]]) - + # initiate prediction - prediction <- c() + prediction <- NULL fold <- labels$index ParallelLogger::logInfo(paste0("Max fold: ", max(fold))) - modelParams$catFeatures <- dataset$numCatFeatures() - modelParams$numFeatures <- dataset$numNumFeatures() - - if (estimatorSettings$findLR) { - lr <- lrFinder(dataset=dataset, - modelType = modelSettings$modelType, - modelParams = modelParams, - estimatorSettings = estimatorSettings) + currentModelParams$catFeatures <- dataset$get_cat_features()$shape[[1]] + currentModelParams$numFeatures <- dataset$get_numerical_features()$shape[[1]] + if (findLR) { + LrFinder <- createLRFinder(modelType = modelSettings$modelType, + modelParameters = currentModelParams, + estimatorSettings = currentEstimatorSettings + ) + lr <- LrFinder$get_lr(dataset) ParallelLogger::logInfo(paste0("Auto learning rate selected as: ", lr)) - estimatorSettings$learningRate <- lr + currentEstimatorSettings$learningRate <- lr } - learnRates <- list() for (i in 1:max(fold)) { ParallelLogger::logInfo(paste0("Fold ", i)) - trainDataset <- torch::dataset_subset(dataset, indices = which(fold != i)) - testDataset <- torch::dataset_subset(dataset, indices = which(fold == i)) - estimator <- Estimator$new( - modelType = modelSettings$modelType, - modelParameters = modelParams, - estimatorSettings = estimatorSettings - ) - - estimator$fit( - trainDataset, - testDataset - ) + trainDataset <- torch$utils$data$Subset(dataset, indices = as.integer(which(fold != i) - 1)) # -1 for python 0-based indexing + testDataset <- torch$utils$data$Subset(dataset, indices = as.integer(which(fold == i) -1)) # -1 for python 0-based indexing + + estimator <- createEstimator(modelType=modelSettings$modelType, + modelParameters=currentModelParams, + estimatorSettings=currentEstimatorSettings) + estimator$fit(trainDataset, testDataset) ParallelLogger::logInfo("Calculating predictions on left out fold set...") @@ -339,8 +360,8 @@ gridCvDeep <- function(mappedData, ) ) learnRates[[i]] <- list( - LRs = estimator$learnRateSchedule, - bestEpoch = estimator$bestEpoch + LRs = estimator$learn_rate_schedule, + bestEpoch = estimator$best_epoch ) } maxIndex <- 
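Because the folds are now handed to torch.utils.data.Subset on the Python side, the 1-based R indices have to be shifted to 0-based integers, which is what the `- 1` above does. A small sketch, assuming `fold` is the fold assignment taken from `labels$index`:

i <- 1  # current cross-validation fold
trainIdx <- as.integer(which(fold != i) - 1)  # 0-based indices for python
testIdx <- as.integer(which(fold == i) - 1)
trainDataset <- torch$utils$data$Subset(dataset, indices = trainIdx)
testDataset <- torch$utils$data$Subset(dataset, indices = testIdx)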
which.max(unlist(sapply(learnRates, `[`, 2))) @@ -374,20 +395,20 @@ gridCvDeep <- function(mappedData, if (!dir.exists(file.path(modelLocation))) { dir.create(file.path(modelLocation), recursive = T) } - modelParams$catFeatures <- dataset$numCatFeatures() - modelParams$numFeatures <- dataset$numNumFeatures() - estimatorSettings <- fillEstimatorSettings(estimatorSettings, fitParams, - finalParam) + modelParams$catFeatures <- dataset$get_cat_features()$shape[[1]] + modelParams$numFeatures <- dataset$get_numerical_features()$shape[[1]] - estimator <- Estimator$new( - modelType = modelSettings$modelType, - modelParameters = modelParams, - estimatorSettings = estimatorSettings - ) - numericalIndex <- dataset$getNumericalIndex() - estimator$fitWholeTrainingSet(dataset, finalParam$learnSchedule$LRs) + estimatorSettings <- fillEstimatorSettings(modelSettings$estimatorSettings, fitParams, + finalParam) + estimatorSettings$learningRate <- finalParam$learnSchedule$LRs[[1]] + estimator <- createEstimator(modelType = modelSettings$modelType, + modelParameters = modelParams, + estimatorSettings = estimatorSettings) + + numericalIndex <- dataset$get_numerical_features() + estimator$fit_whole_training_set(dataset, finalParam$learnSchedule$LRs) ParallelLogger::logInfo("Calculating predictions on all train data...") prediction <- predictDeepEstimator( @@ -410,14 +431,13 @@ gridCvDeep <- function(mappedData, # save torch code here estimator$save(modelLocation, "DeepEstimatorModel.pt") - return( list( estimator = modelLocation, prediction = prediction, finalParam = finalParam, paramGridSearch = paramGridSearch, - numericalIndex = numericalIndex + numericalIndex = numericalIndex$to_list() ) ) } @@ -434,4 +454,34 @@ fillEstimatorSettings <- function(estimatorSettings, fitParams, paramSearch) { } } return(estimatorSettings) +} + +# utility function to evaluate any expressions or call functions passed as settings +evalEstimatorSettings <- function(estimatorSettings) { + + for (set in names(estimatorSettings)) { + if (inherits(estimatorSettings[[set]], "delayed")) { + estimatorSettings[[set]] <- estimatorSettings[[set]]() + } + } + estimatorSettings +} + +createEstimator <- function(modelType, + modelParameters, + estimatorSettings) { + path <- system.file("python", package = "DeepPatientLevelPrediction") + + Model <- reticulate::import_from_path(modelType, path=path)[[modelType]] + + Estimator <- reticulate::import_from_path("Estimator", path=path)$Estimator + + modelParameters <- camelCaseToSnakeCaseNames(modelParameters) + estimatorSettings <- camelCaseToSnakeCaseNames(estimatorSettings) + estimatorSettings <- evalEstimatorSettings(estimatorSettings) + + estimator <- Estimator(model = Model, + model_parameters = modelParameters, + estimator_settings = estimatorSettings) + return(estimator) } \ No newline at end of file diff --git a/R/HelperFunctions.R b/R/HelperFunctions.R new file mode 100644 index 0000000..350baf9 --- /dev/null +++ b/R/HelperFunctions.R @@ -0,0 +1,80 @@ +# @file HelperFunctions.R +# +# Copyright 2023 Observational Health Data Sciences and Informatics +# +# This file is part of DeepPatientLevelPrediction +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#' Convert a camel case string to snake case
+#'
+#' @param string The string to be converted
+#'
+#' @return
+#' A string
+#'
+camelCaseToSnakeCase <- function(string) {
+  string <- gsub("([A-Z])", "_\\1", string)
+  string <- tolower(string)
+  string <- gsub("([a-z])([0-9])", "\\1_\\2", string)
+  return(string)
+}
+
+#' Convert the names of an object from camel case to snake case
+#'
+#' @param object The object of which the names should be converted
+#'
+#' @return
+#' The same object, but with converted names.
+camelCaseToSnakeCaseNames <- function(object) {
+  names(object) <- camelCaseToSnakeCase(names(object))
+  return(object)
+}
+
+#' helper function to check class of input
+#'
+#' @param parameter the input parameter to check
+#' @param classes which classes it should belong to (one or more)
+checkIsClass<- function(parameter,classes) {
+  name = deparse(substitute(parameter))
+  if (!inherits(x = parameter, what = classes)) {
+    ParallelLogger::logError(paste0(name, ' should be of class:', classes, ' '))
+    stop(paste0(name, ' is wrong class'))
+  }
+  return(TRUE)
+}
+
+#' helper function to check that input is higher than a certain value
+#'
+#' @param parameter the input parameter to check, can be a vector
+#' @param value which value it should be higher than
+checkHigher <- function(parameter,value) {
+  name = deparse(substitute(parameter))
+  if (!is.numeric(parameter) | all(parameter<=value)) {
+    ParallelLogger::logError(paste0(name, ' needs to be > ',value))
+    stop(paste0(name, ' needs to be > ', value))
+  }
+  return(TRUE)
+}
+
+#' helper function to check that input is higher or equal than a certain value
+#'
+#' @param parameter the input parameter to check, can be a vector
+#' @param value which value it should be higher or equal than
+checkHigherEqual <- function(parameter,value) {
+  name = deparse(substitute(parameter))
+  if (!is.numeric(parameter) | all(parameter<value)) {
+    ParallelLogger::logError(paste0(name, ' needs to be >= ',value))
+    stop(paste0(name, ' needs to be >= ', value))
+  }
+  return(TRUE)
+}
\ No newline at end of file
diff --git a/R/LRFinder.R b/R/LRFinder.R
index 9f670e4..dfe8ee3 100644
--- a/R/LRFinder.R
+++ b/R/LRFinder.R
@@ -1,122 +1,42 @@
-lrPerBatch <- torch::lr_scheduler(
-  "lrPerBatch",
-  initialize = function(
-    optimizer,
-    startLR = 1e-7,
-    endLR = 1.0,
-    nIters = 100,
-    lastEpoch = -1,
-    verbose = FALSE) {
-
-    self$optimizer <- optimizer
-    self$endLR <- endLR
-    self$base_lrs <- startLR
-    self$iterations <- nIters
-    self$last_epoch <- lastEpoch
-    self$multiplier <- (endLR/startLR)^(1/nIters)
-
-    super$initialize(optimizer, last_epoch=lastEpoch, verbose)
-
-  },
+# @file LRFinder.R
+#
+# Copyright 2023 Observational Health Data Sciences and Informatics
+#
+# This file is part of DeepPatientLevelPrediction
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +createLRFinder <- function(modelType, + modelParameters, + estimatorSettings, + lrSettings=NULL) { + path <- system.file("python", package = "DeepPatientLevelPrediction") + LRFinderClass <- reticulate::import_from_path("LrFinder", path = path)$LrFinder - get_lr = function() { - if (self$last_epoch > 0) { - lrs <- numeric(length(self$optimizer$param_groups)) - for (i in seq_along(self$optimizer$param_groups)) { - lrs[i] <- self$base_lrs[[i]] * (self$endLR / self$base_lrs[[i]]) ^ (self$last_epoch/(self$iterations-1)) - } - } else { - lrs <- as.numeric(self$base_lrs) - } - lrs + model <- reticulate::import_from_path(modelType, path=path)[[modelType]] + + modelParameters <- camelCaseToSnakeCaseNames(modelParameters) + estimatorSettings <- camelCaseToSnakeCaseNames(estimatorSettings) + if (!is.null(lrSettings)) { + lrSettings <- camelCaseToSnakeCaseNames(lrSettings) } -) + estimatorSettings <- evalEstimatorSettings(estimatorSettings) + + + lrFinder <- LRFinderClass(model=model, + model_parameters = modelParameters, + estimator_settings = estimatorSettings, + lr_settings = lrSettings) -#' Find learning rate that decreases loss the most -#' @description Method originated from https://arxiv.org/abs/1506.01186 but this -#' implementation draws inspiration from various other implementations such as -#' pytorch lightning, fastai, luz and pytorch-lr-finder. -#' @param dataset torch dataset, training dataset -#' @param modelType the function used to initialize the model -#' @param modelParams parameters used to initialize model -#' @param estimatorSettings settings for estimator to fit model -#' @param minLR lower bound of learning rates to search through -#' @param maxLR upper bound of learning rates to search through -#' @param numLR number of learning rates to go through -#' @param smooth smoothing to use on losses -#' @param divergenceThreshold if loss increases this amount above the minimum, stop. 
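The learning rate finder is now also implemented in Python; createLRFinder wires the same model type and settings into it that the estimator uses, roughly as gridCvDeep does when learningRate = "auto". A sketch of that internal call (variable names illustrative):

lrFinder <- createLRFinder(
  modelType = "ResNet",                      # name of the python model class to probe
  modelParameters = currentModelParams,      # includes catFeatures / numFeatures
  estimatorSettings = currentEstimatorSettings
)
lr <- lrFinder$get_lr(dataset)               # suggested learning rate for this dataset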
-#' @export -lrFinder <- function(dataset, modelType, modelParams, estimatorSettings, - minLR=1e-7, maxLR=1, numLR=100, smooth=0.05, - divergenceThreshold=4) { - torch::torch_manual_seed(seed=estimatorSettings$seed) - model <- do.call(modelType, modelParams) - if (is.function(estimatorSettings$device)) { - device = estimatorSettings$device() - } else {device = estimatorSettings$device} - model$to(device=device) - - optimizer <- estimatorSettings$optimizer(model$parameters, lr=minLR) - - # made a special lr scheduler for this task - scheduler <- lrPerBatch(optimizer = optimizer, - startLR = minLR, - endLR = maxLR, - nIters = numLR) - - criterion <- estimatorSettings$criterion() - - batchIndex <- seq(length(dataset)) - set.seed(estimatorSettings$seed) - - losses <- numeric(numLR) - lrs <- numeric(numLR) - ParallelLogger::logInfo('\nSearching for best learning rate') - progressBar <- utils::txtProgressBar(style = 3) - for (i in seq(numLR)) { - optimizer$zero_grad() - - batch <- dataset[sample(batchIndex, estimatorSettings$batchSize)] - batch <- batchToDevice(batch, device=device) - - output <- model(batch$batch) - - loss <- criterion(output, batch$target) - if (!is.null(smooth) && i != 1) { - losses[i] <- smooth * loss$item() + (1 - smooth) * losses[i-1] - } else { - losses[i] <- loss$item() - } - lrs[i] <- optimizer$param_groups[[1]]$lr - - loss$backward() - optimizer$step() - scheduler$step() - utils::setTxtProgressBar(progressBar, i / numLR) - - if (i == 1) { - bestLoss <- losses[i] - } else { - if (losses[i] < bestLoss) { - bestLoss <- losses[i] - } - } - - if (losses[i] > (divergenceThreshold * bestLoss)) { - ParallelLogger::logInfo("\nLoss diverged - stopped early") - break - } - - } - - # find LR where gradient is highest but before global minimum is reached - # I added -5 to make sure it is not still in the minimum - globalMinimum <- which.min(losses) - grad <- as.numeric(torch::torch_diff(torch::torch_tensor(losses[1:(globalMinimum-5)]))) - smallestGrad <- which.min(grad) - - suggestedLR <- lrs[smallestGrad] - - return(suggestedLR) -} \ No newline at end of file + return(lrFinder) +} diff --git a/R/MLP.R b/R/MLP.R index 2b9c8b1..6ad35e6 100644 --- a/R/MLP.R +++ b/R/MLP.R @@ -25,8 +25,8 @@ #' Model architecture #' #' -#' @param numLayers Number of layers in network, default: 1:16 -#' @param sizeHidden Amount of neurons in each default layer, default: 2^(6:10) (64 to 1024) +#' @param numLayers Number of layers in network, default: 1:8 +#' @param sizeHidden Amount of neurons in each default layer, default: 2^(6:9) (64 to 512) #' @param dropout How much dropout to apply after first linear, default: seq(0, 0.3, 0.05) #' @param sizeEmbedding Size of embedding layer, default: 2^(6:9) (64 to 512) #' @param estimatorSettings settings of Estimator created with `setEstimator` @@ -37,38 +37,57 @@ #' @export setMultiLayerPerceptron <- function(numLayers = c(1:8), sizeHidden = c(2^(6:9)), - dropout = c(seq(0, 0.5, 0.05)), + dropout = c(seq(0, 0.3, 0.05)), sizeEmbedding = c(2^(6:9)), estimatorSettings = setEstimator( learningRate = 'auto', weightDecay = c(1e-6, 1e-3), batchSize = 1024, epochs = 30, - device="cpu"), + device="cpu"), hyperParamSearch = "random", randomSample = 100, randomSampleSeed = NULL) { + + checkIsClass(numLayers, c("integer", "numeric")) + checkHigherEqual(numLayers, 1) + checkIsClass(sizeHidden, c("integer", "numeric")) + checkHigherEqual(sizeHidden, 1) + + checkIsClass(dropout, c("numeric")) + checkHigherEqual(dropout, 0) + + checkIsClass(sizeEmbedding, c("numeric", 
"integer")) + checkHigherEqual(sizeEmbedding, 1) + + checkIsClass(hyperParamSearch, "character") + + checkIsClass(randomSample, c("numeric", "integer")) + checkHigherEqual(randomSample, 1) + + checkIsClass(randomSampleSeed, c("numeric", "integer", "NULL")) + paramGrid <- list( numLayers = numLayers, sizeHidden = sizeHidden, dropout = dropout, sizeEmbedding = sizeEmbedding ) - + paramGrid <- c(paramGrid, estimatorSettings$paramsToTune) - + param <- PatientLevelPrediction::listCartesian(paramGrid) if (hyperParamSearch == "random" && randomSample>length(param)) { - stop(paste("\n Chosen amount of randomSamples is higher than the amount of possible hyperparameter combinations.", + stop(paste("\n Chosen amount of randomSamples is higher than the amount of possible hyperparameter combinations.", "\n randomSample:", randomSample,"\n Possible hyperparameter combinations:", length(param), "\n Please lower the amount of randomSamples")) } - + if (hyperParamSearch == "random") { suppressWarnings(withr::with_seed(randomSampleSeed, {param <- param[sample(length(param), randomSample)]})) } - attr(param, 'settings')$modelType <- "MLP" + attr(param, 'settings')$modelType <- "MLP" results <- list( fitFunction = "fitEstimator", @@ -86,82 +105,3 @@ setMultiLayerPerceptron <- function(numLayers = c(1:8), return(results) } - - -MLP <- torch::nn_module( - name = "MLP", - initialize = function(catFeatures, numFeatures = 0, sizeEmbedding, sizeHidden, numLayers, - activation = torch::nn_relu, - normalization = torch::nn_batch_norm1d, dropout = NULL, - d_out = 1) { - self$embedding <- torch::nn_embedding_bag( - num_embeddings = catFeatures + 1, - embedding_dim = sizeEmbedding, - padding_idx = 1 - ) - if (numFeatures != 0) { - self$numEmbedding <- numericalEmbedding(numFeatures, sizeEmbedding) - } - - self$first_layer <- torch::nn_linear(sizeEmbedding, sizeHidden) - - - self$layers <- torch::nn_module_list(lapply( - 1:numLayers, - function(x) { - MLPLayer( - sizeHidden, - normalization, activation, - dropout - ) - } - )) - self$lastNorm <- normalization(sizeHidden) - self$head <- torch::nn_linear(sizeHidden, d_out) - - self$lastAct <- activation() - }, - forward = function(x) { - x_cat <- x$cat - x_num <- x$num - x_cat <- self$embedding(x_cat + 1L) # padding_idx is 1 - if (!is.null(x_num)) { - x <- (x_cat + self$numEmbedding(x_num)$mean(dim = 2)) / 2 - } else { - x <- x_cat - } - x <- self$first_layer(x) - - for (i in 1:length(self$layers)) { - x <- self$layers[[i]](x) - } - x <- self$lastNorm(x) - x <- self$lastAct(x) - x <- self$head(x) - x <- x$squeeze(-1) - return(x) - } -) - -MLPLayer <- torch::nn_module( - name = "MLPLayer", - initialize = function(sizeHidden = 64, - normalization = torch::nn_batch_norm1d, - activation = torch::nn_relu, - dropout = 0.0, bias = TRUE) { - self$norm <- normalization(sizeHidden) - self$activation <- activation() - self$linear <- torch::nn_linear(sizeHidden, sizeHidden, bias = bias) - - if (!is.null(dropout) | !dropout == 0.0) { - self$dropout <- torch::nn_dropout(p = dropout) - } - }, - forward = function(x) { - x <- self$linear(self$norm(x)) - if (!is.null(self$dropout)) { - x <- self$dropout(x) - } - return(self$activation(x)) - } -) diff --git a/R/ResNet.R b/R/ResNet.R index dc610b5..d20aa1d 100644 --- a/R/ResNet.R +++ b/R/ResNet.R @@ -84,6 +84,28 @@ setResNet <- function(numLayers = c(1:8), randomSample = 100, randomSampleSeed = NULL) { + checkIsClass(numLayers, c("integer", "numeric")) + checkHigherEqual(numLayers, 1) + + checkIsClass(sizeHidden, c("integer", "numeric")) + 
checkHigherEqual(sizeHidden, 1) + + checkIsClass(residualDropout, "numeric") + checkHigherEqual(residualDropout, 0) + + checkIsClass(hiddenDropout, "numeric") + checkHigherEqual(hiddenDropout, 0) + + checkIsClass(sizeEmbedding, c("integer", "numeric")) + checkHigherEqual(sizeEmbedding, 1) + + checkIsClass(hyperParamSearch, "character") + + checkIsClass(randomSample, c("numeric", "integer")) + checkHigherEqual(randomSample, 1) + + checkIsClass(randomSampleSeed, c("numeric", "integer", "NULL")) + paramGrid <- list( numLayers = numLayers, sizeHidden = sizeHidden, @@ -121,99 +143,3 @@ setResNet <- function(numLayers = c(1:8), return(results) } - -ResNet <- torch::nn_module( - name = "ResNet", - initialize = function(catFeatures, numFeatures = 0, sizeEmbedding, sizeHidden, numLayers, - hiddenFactor, activation = torch::nn_relu, - normalization = torch::nn_batch_norm1d, hiddenDropout = NULL, - residualDropout = NULL, d_out = 1, concatNum=TRUE) { - self$embedding <- torch::nn_embedding_bag( - num_embeddings = catFeatures + 1, - embedding_dim = sizeEmbedding, - padding_idx = 1 - ) - if (numFeatures != 0 & concatNum != TRUE) { - self$numEmbedding <- numericalEmbedding(numFeatures, sizeEmbedding) - } else { - self$numEmbedding <- NULL - sizeEmbedding <- sizeEmbedding + numFeatures - } - - self$first_layer <- torch::nn_linear(sizeEmbedding, sizeHidden) - - resHidden <- sizeHidden * hiddenFactor - - self$layers <- torch::nn_module_list(lapply( - 1:numLayers, - function(x) { - ResLayer( - sizeHidden, resHidden, - normalization, activation, - hiddenDropout, - residualDropout - ) - } - )) - self$lastNorm <- normalization(sizeHidden) - self$head <- torch::nn_linear(sizeHidden, d_out) - - self$lastAct <- activation() - }, - forward = function(x) { - x_cat <- x$cat - x_num <- x$num - x_cat <- self$embedding(x_cat + 1L) # padding_idx is 1 - if (!is.null(x_num) & (!is.null(self$numEmbedding))) { - x <- (x_cat + self$numEmbedding(x_num)$mean(dim = 2)) / 2 - } else if (!is.null(x_num) & is.null(self$numEmbedding)) { - x <- torch::torch_cat(list(x_cat, x_num), dim = 2L) - } else { - x <- x_cat - } - x <- self$first_layer(x) - - for (i in 1:length(self$layers)) { - x <- self$layers[[i]](x) - } - x <- self$lastNorm(x) - x <- self$lastAct(x) - x <- self$head(x) - x <- x$squeeze(-1) - return(x) - } -) - -ResLayer <- torch::nn_module( - name = "ResLayer", - initialize = function(sizeHidden, resHidden, normalization, - activation, hiddenDropout = NULL, residualDropout = NULL) { - self$norm <- normalization(sizeHidden) - self$linear0 <- torch::nn_linear(sizeHidden, resHidden) - self$linear1 <- torch::nn_linear(resHidden, sizeHidden) - - if (!is.null(hiddenDropout)) { - self$hiddenDropout <- torch::nn_dropout(p = hiddenDropout) - } - if (!is.null(residualDropout)) { - self$residualDropout <- torch::nn_dropout(p = residualDropout) - } - - self$activation <- activation() - }, - forward = function(x) { - z <- x - z <- self$norm(z) - z <- self$linear0(z) - z <- self$activation(z) - if (!is.null(self$hiddenDropout)) { - z <- self$hiddenDropout(z) - } - z <- self$linear1(z) - if (!is.null(self$residualDropout)) { - z <- self$residualDropout(z) - } - x <- z + x - return(x) - } -) diff --git a/R/TrainingCache-class.R b/R/TrainingCache-class.R index fac668e..8577f31 100644 --- a/R/TrainingCache-class.R +++ b/R/TrainingCache-class.R @@ -76,8 +76,13 @@ TrainingCache <- R6::R6Class( if (is.null(private$.paramPersistence$gridSearchPredictions)) { return(1) } else { - 
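The argument checks added to setResNet above mean that malformed hyperparameter specifications now fail at specification time rather than deep inside model fitting. A hypothetical illustration, with values chosen only for the example:

resnetSettings <- setResNet(numLayers = c(2, 4),
                            sizeHidden = 256,
                            hiddenDropout = 0.1,
                            residualDropout = 0.1,
                            randomSample = 2)
# whereas a non-positive layer count is rejected by checkHigherEqual(numLayers, 1)
try(setResNet(numLayers = 0))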
return(which(sapply(private$.paramPersistence$gridSearchPredictions, + # if only a single hyperparameter combination is assessed return 1 + if (length(private$.paramPersistence$gridSearchPredictions) == 1) { + return(1) + } else { + return(which(sapply(private$.paramPersistence$gridSearchPredictions, is.null))[1]) + } } }, diff --git a/R/Transformer.R b/R/Transformer.R index 7e25ff7..8b0d0c4 100644 --- a/R/Transformer.R +++ b/R/Transformer.R @@ -17,12 +17,12 @@ # limitations under the License. #' Create default settings for a non-temporal transformer -#' +#' #' @description A transformer model with default hyperparameters #' @details from https://arxiv.org/abs/2106.11959 #' Default hyperparameters from paper #' @param estimatorSettings created with `setEstimator` -#' +#' #' @export setDefaultTransformer <- function(estimatorSettings=setEstimator( learningRate = 'auto', @@ -67,16 +67,61 @@ setDefaultTransformer <- function(estimatorSettings=setEstimator( #' @param randomSampleSeed Random seed to sample hyperparameter combinations #' #' @export -setTransformer <- function(numBlocks = 3, dimToken = 96, dimOut = 1, - numHeads = 8, attDropout = 0.25, ffnDropout = 0.25, - resDropout = 0, dimHidden = 512, dimHiddenRatio = NULL, +setTransformer <- function(numBlocks = 3, + dimToken = 96, + dimOut = 1, + numHeads = 8, + attDropout = 0.25, + ffnDropout = 0.25, + resDropout = 0, + dimHidden = 512, + dimHiddenRatio = NULL, estimatorSettings=setEstimator(weightDecay = 1e-6, batchSize=1024, epochs=10, seed=NULL), hyperParamSearch = "random", - randomSample = 1, randomSampleSeed = NULL) { + randomSample = 1, + randomSampleSeed = NULL) { + + checkIsClass(numBlocks, c("integer", "numeric")) + checkHigherEqual(numBlocks, 1) + + checkIsClass(dimToken, c("integer", "numeric")) + checkHigherEqual(dimToken, 1) + + checkIsClass(dimOut, c("integer", "numeric")) + checkHigherEqual(dimOut, 1) + + checkIsClass(numHeads, c("integer", "numeric")) + checkHigherEqual(numHeads, 1) + checkIsClass(attDropout, c("numeric")) + checkHigherEqual(attDropout, 0) + + checkIsClass(ffnDropout, c("numeric")) + checkHigherEqual(ffnDropout, 0) + + checkIsClass(resDropout, c("numeric")) + checkHigherEqual(resDropout, 0) + + checkIsClass(dimHidden, c("integer", "numeric", "NULL")) + if (!is.null(dimHidden)) { + checkHigherEqual(dimHidden, 1) + } + + checkIsClass(dimHiddenRatio, c("numeric", "NULL")) + if (!is.null(dimHiddenRatio)) { + checkHigher(dimHiddenRatio, 0) + } + + checkIsClass(hyperParamSearch, "character") + + checkIsClass(randomSample, c("numeric", "integer")) + checkHigherEqual(randomSample, 1) + + checkIsClass(randomSampleSeed, c("numeric", "integer", "NULL")) + if (any(with(expand.grid(dimToken = dimToken, numHeads = numHeads), dimToken %% numHeads != 0))) { stop(paste( "dimToken needs to divisible by numHeads. 
dimToken =", dimToken, @@ -91,10 +136,10 @@ setTransformer <- function(numBlocks = 3, dimToken = 96, dimOut = 1, )) } else { if (!is.null(dimHiddenRatio)) { - dimHidden <- round(dimToken*dimHiddenRatio, digits = 0) + dimHidden <- dimHiddenRatio } } - + paramGrid <- list( numBlocks = numBlocks, dimToken = dimToken, @@ -105,21 +150,28 @@ setTransformer <- function(numBlocks = 3, dimToken = 96, dimOut = 1, ffnDropout = ffnDropout, resDropout = resDropout ) - + paramGrid <- c(paramGrid, estimatorSettings$paramsToTune) - + param <- PatientLevelPrediction::listCartesian(paramGrid) + if (!is.null(dimHiddenRatio)) { + param <- lapply(param, function(x) { + x$dimHidden <- round(x$dimToken*x$dimHidden, digits = 0) + return(x) + }) + } + if (hyperParamSearch == "random" && randomSample>length(param)) { - stop(paste("\n Chosen amount of randomSamples is higher than the amount of possible hyperparameter combinations.", + stop(paste("\n Chosen amount of randomSamples is higher than the amount of possible hyperparameter combinations.", "\n randomSample:", randomSample,"\n Possible hyperparameter combinations:", length(param), "\n Please lower the amount of randomSample")) } - + if (hyperParamSearch == "random") { suppressWarnings(withr::with_seed(randomSampleSeed, {param <- param[sample(length(param), randomSample)]})) } - attr(param, 'settings')$modelType <- "Transformer" + attr(param, 'settings')$modelType <- "Transformer" results <- list( fitFunction = "fitEstimator", param = param, @@ -134,225 +186,4 @@ setTransformer <- function(numBlocks = 3, dimToken = 96, dimOut = 1, class(results) <- "modelSettings" return(results) -} - - -Transformer <- torch::nn_module( - name = "Transformer", - initialize = function(catFeatures, numFeatures, numBlocks, dimToken, dimOut = 1, - numHeads, attDropout, ffnDropout, resDropout, - headActivation = torch::nn_relu, - activation = NULL, - ffnNorm = torch::nn_layer_norm, - headNorm = torch::nn_layer_norm, - attNorm = torch::nn_layer_norm, - dimHidden) { - activation <- nn_reglu - self$Categoricalembedding <- Embedding(catFeatures + 1, dimToken) # + 1 for padding idx - self$numericalEmbedding <- numericalEmbedding(numFeatures, dimToken) - self$classToken <- ClassToken(dimToken) - - self$layers <- torch::nn_module_list(lapply( - 1:numBlocks, - function(x) { - layer <- torch::nn_module_list() - layer$add_module("attention", torch::nn_multihead_attention(dimToken, numHeads, - dropout = attDropout, - bias = TRUE - )) - layer$add_module("ffn", FeedForwardBlock(dimToken, dimHidden, - biasFirst = TRUE, - biasSecond = TRUE, - dropout = ffnDropout, - activation = activation - )) - layer$add_module("attentionResDropout", torch::nn_dropout(resDropout)) - layer$add_module("ffnResDropout", torch::nn_dropout(resDropout)) - layer$add_module("ffnNorm", ffnNorm(dimToken)) - - if (x != 1) { - layer$add_module("attentionNorm", attNorm(dimToken)) - } - return(layer) - } - )) - self$head <- Head(dimToken, - bias = TRUE, activation = headActivation, - headNorm, dimOut - ) - }, - forward = function(x) { - mask <- torch::torch_where(x$cat == 0, TRUE, FALSE) - input <- x - cat <- self$Categoricalembedding(x$cat) - if (!is.null(input$num)) { - num <- self$numericalEmbedding(input$num) - x <- torch::torch_cat(list(cat, num), dim = 2L) - mask <- torch::torch_cat(list(mask, torch::torch_zeros(c( - x$shape[1], - num$shape[2] - ), - device = mask$device, - dtype = mask$dtype - )), - dim = 2L - ) - } else { - x <- cat - } - x <- self$classToken(x) - mask <- torch::torch_cat(list(mask, 
torch::torch_zeros(c(x$shape[1], 1), - device = mask$device, - dtype = mask$dtype - )), - dim = 2L - ) - for (i in 1:length(self$layers)) { - layer <- self$layers[[i]] - xResidual <- self$startResidual(layer, "attention", x) - - if (i == length(self$layers)) { - dims <- xResidual$shape - # in final layer take only attention on CLS token - xResidual <- layer$attention( - xResidual[, -1]$view(c(dims[1], 1, dims[3]))$transpose(1, 2), - xResidual$transpose(1, 2), - xResidual$transpose(1, 2), mask - ) - attnWeights <- xResidual[[2]] - xResidual <- xResidual[[1]] - x <- x[, -1]$view(c(dims[1], 1, dims[3])) - } else { - # attention input is seq_length x batch_size x embedding_dim - xResidual <- layer$attention( - xResidual$transpose(1, 2), - xResidual$transpose(1, 2), - xResidual$transpose(1, 2), - mask, - )[[1]] - } - x <- self$endResidual(layer, "attention", x, xResidual$transpose(1, 2)) - - xResidual <- self$startResidual(layer, "ffn", x) - xResidual <- layer$ffn(xResidual) - x <- self$endResidual(layer, "ffn", x, xResidual) - } - x <- self$head(x)[, 1] # remove singleton dimension - return(x) - }, - startResidual = function(layer, stage, x) { - xResidual <- x - normKey <- paste0(stage, "Norm") - if (normKey %in% names(as.list(layer))) { - xResidual <- layer[[normKey]](xResidual) - } - return(xResidual) - }, - endResidual = function(layer, stage, x, xResidual) { - dropoutKey <- paste0(stage, "ResDropout") - xResidual <- layer[[dropoutKey]](xResidual) - x <- x + xResidual - return(x) - } -) - - -FeedForwardBlock <- torch::nn_module( - name = "FeedForwardBlock", - initialize = function(dimToken, dimHidden, biasFirst, biasSecond, - dropout, activation) { - self$linearFirst <- torch::nn_linear(dimToken, dimHidden * 2, biasFirst) - self$activation <- activation() - self$dropout <- torch::nn_dropout(dropout) - self$linearSecond <- torch::nn_linear(dimHidden, dimToken, biasSecond) - }, - forward = function(x) { - x <- self$linearFirst(x) - x <- self$activation(x) - x <- self$dropout(x) - x <- self$linearSecond(x) - return(x) - } -) - -Head <- torch::nn_module( - name = "Head", - initialize = function(dimIn, bias, activation, normalization, dimOut) { - self$normalization <- normalization(dimIn) - self$activation <- activation() - self$linear <- torch::nn_linear(dimIn, dimOut, bias) - }, - forward = function(x) { - x <- x[, -1] # ? 
- x <- self$normalization(x) - x <- self$activation(x) - x <- self$linear(x) - return(x) - } -) - -Embedding <- torch::nn_module( - name = "Embedding", - initialize = function(numEmbeddings, embeddingDim) { - self$embedding <- torch::nn_embedding(numEmbeddings, embeddingDim, padding_idx = 1) - }, - forward = function(x_cat) { - x <- self$embedding(x_cat + 1L) # padding idx is 1L - return(x) - } -) - -numericalEmbedding <- torch::nn_module( - name = "numericalEmbedding", - initialize = function(numEmbeddings, embeddingDim, bias = TRUE) { - self$weight <- torch::nn_parameter(torch::torch_empty(numEmbeddings, embeddingDim)) - if (bias) { - self$bias <- torch::nn_parameter(torch::torch_empty(numEmbeddings, embeddingDim)) - } else { - self$bias <- NULL - } - - for (parameter in list(self$weight, self$bias)) { - if (!is.null(parameter)) { - torch::nn_init_kaiming_uniform_(parameter, a = sqrt(5)) - } - } - }, - forward = function(x) { - x <- self$weight$unsqueeze(1) * x$unsqueeze(-1) - if (!is.null(self$bias)) { - x <- x + self$bias$unsqueeze(1) - } - return(x) - } -) - -# adds a class token embedding to embeddings -ClassToken <- torch::nn_module( - name = "ClassToken", - initialize = function(dimToken) { - self$weight <- torch::nn_parameter(torch::torch_empty(dimToken, 1)) - torch::nn_init_kaiming_uniform_(self$weight, a = sqrt(5)) - }, - expand = function(dims) { - newDims <- vector("integer", length(dims) - 1) + 1 - return(self$weight$view(c(newDims, -1))$expand(c(dims, -1))) - }, - forward = function(x) { - return(torch::torch_cat(c(x, self$expand(c(dim(x)[[1]], 1))), dim = 2)) - } -) - -nn_reglu <- torch::nn_module( - name = "ReGlu", - forward = function(x) { - return(reglu(x)) - } -) - - -reglu <- function(x) { - chunks <- x$chunk(2, dim = -1) - - return(chunks[[1]] * torch::nnf_relu(chunks[[2]])) -} +} \ No newline at end of file diff --git a/extras/example.R b/extras/example.R index fa989b1..99a6c08 100644 --- a/extras/example.R +++ b/extras/example.R @@ -4,51 +4,70 @@ library(PatientLevelPrediction) library(DeepPatientLevelPrediction) data(plpDataSimulationProfile) -sampleSize <- 1e4 +sampleSize <- 1e3 plpData <- simulatePlpData( - plpDataSimulationProfile, - n = sampleSize -) + plpDataSimulationProfile, + n = sampleSize + ) + populationSet <- PatientLevelPrediction::createStudyPopulationSettings( requireTimeAtRisk = F, riskWindowStart = 1, - riskWindowEnd = 365) + riskWindowEnd = 365*5) + +# +# modelSettings <- setDefaultTransformer(estimatorSettings = setEstimator( +# learningRate = "auto", +# batchSize=64L, +# epochs = 10L +# )) - -modelSettings <- setResNet(numLayers = 2, sizeHidden = 64, hiddenFactor = 1, - residualDropout = 0, hiddenDropout = 0.2, normalization = 'BatchNorm', - activation = 'RelU', sizeEmbedding = 512, weightDecay = 1e-6, - learningRate = 3e-4, seed = 42, hyperParamSearch = 'random', - randomSample = 1, device = 'cuda:0',batchSize = 32,epochs = 10) +modelSettings <- setDefaultResNet(estimatorSettings = setEstimator( + learningRate = "auto", + weightDecay = 1e-06, + device="cuda:0", + batchSize=128L, + epochs=50L, + seed=42 +)) -# modelSettings <- setTransformer(numBlocks=1, dimToken = 33, dimOut = 1, numHeads = 3, -# attDropout = 0.2, ffnDropout = 0.2, resDropout = 0, -# dimHidden = 8, batchSize = 32, hyperParamSearch = 'random', -# weightDecay = 1e-6, learningRate = 3e-4, epochs = 10, -# device = 'cuda:0', randomSamples = 1, seed = 42) +modelSettings <- setResNet(numLayers = c(1L, 2L), + sizeHidden = 72L, + hiddenFactor = 1L, + residualDropout = 0.0, + 
hiddenDropout = 0.0, + sizeEmbedding = 64L, + estimatorSettings = setEstimator( + learningRate = 3e-4, + batchSize = 128L, + epochs = 10L, + device = "cpu", + seed = 42 + ), + randomSample = 2) res2 <- PatientLevelPrediction::runPlp( -plpData = plpData, -outcomeId = 3, -modelSettings = modelSettings, -analysisId = 'Test', -analysisName = 'Testing DeepPlp', -populationSettings = populationSet, -splitSettings = createDefaultSplitSetting(), -sampleSettings = createSampleSettings(), # none -featureEngineeringSettings = createFeatureEngineeringSettings(), # none -preprocessSettings = createPreprocessSettings(), -logSettings = createLogSettings(verbosity='TRACE'), -executeSettings = createExecuteSettings( - runSplitData = T, - runSampleData = F, - runfeatureEngineering = F, - runPreprocessData = T, - runModelDevelopment = T, - runCovariateSummary = T -), -saveDirectory = '~/test/new_plp/' + plpData = plpData, + outcomeId = unique(plpData$outcomes$outcomeId)[[1]], + modelSettings = modelSettings, + analysisId = 'Test', + analysisName = 'Testing DeepPlp', + populationSettings = populationSet, + splitSettings = createDefaultSplitSetting(splitSeed = 123), + sampleSettings = createSampleSettings("underSample"), # none + featureEngineeringSettings = createFeatureEngineeringSettings(), # none + preprocessSettings = createPreprocessSettings(normalize = F), + logSettings = createLogSettings(verbosity='TRACE'), + executeSettings = createExecuteSettings( + runSplitData = T, + runSampleData = T, + runfeatureEngineering = F, + runPreprocessData = T, + runModelDevelopment = T, + runCovariateSummary = F + ), + saveDirectory = '~/test/resnet/' ) diff --git a/inst/python/Dataset.py b/inst/python/Dataset.py new file mode 100644 index 0000000..15d749b --- /dev/null +++ b/inst/python/Dataset.py @@ -0,0 +1,102 @@ +import time +import pathlib +import urllib + +import polars as pl +import torch +from torch.utils.data import Dataset + + +class Data(Dataset): + def __init__(self, + data, + labels=None, + numerical_features=None): + """ + data: path to a covariates dataframe either arrow dataset or sqlite object + labels: a list of either 0 or 1, 1 if the patient got the outcome + numerical_features: list of indices where the numerical features are + """ + start = time.time() + if pathlib.Path(data).suffix == '.sqlite': + data = urllib.parse.quote(data) + data = pl.read_database("SELECT * from covariates", + connection_uri=f"sqlite://{data}").lazy() + else: + data = pl.scan_ipc(pathlib.Path(data).joinpath('covariates/*.arrow')) + observations = data.select(pl.col('rowId').max()).collect()[0, 0] + # detect features are numeric + if numerical_features is None: + self.numerical_features = data.groupby(by='columnId') \ + .n_unique().filter(pl.col('covariateValue') > 1).select('columnId').collect()['columnId'] + else: + self.numerical_features = pl.Series('num', numerical_features) + + if labels: + self.target = torch.as_tensor(labels) + else: + self.target = torch.zeros(size=(observations,)) + + # filter by categorical columns, + # sort and group_by columnId + # create newColumnId from 1 (or zero?) until # catColumns + # select rowId and newColumnId + # rename newColumnId to columnId and sort by it + data_cat = data.filter(~pl.col('columnId') + .is_in(self.numerical_features)) \ + .sort(by='columnId').with_row_count('newColumnId'). 
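A minimal sketch (invented toy data, not the real covariate schema) of the dense re-indexing being built in the chain above: rank(method="dense") maps arbitrary columnId values onto consecutive indices that can later feed an embedding table.
toy = pl.DataFrame({'rowId': [1, 1, 2], 'columnId': [9, 5, 12]})
toy = toy.with_columns(pl.col('columnId').rank(method='dense').alias('newColumnId'))
# columnId 5 -> 1, 9 -> 2, 12 -> 3: gaps in the original ids disappear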
\ + with_columns(pl.col('newColumnId').first().over('columnId') + .rank(method="dense")) \ + .select(pl.col('rowId'), pl.col('newColumnId').alias('columnId')).sort('rowId') \ + .with_columns(pl.col('rowId') - 1).collect() + cat_tensor = torch.as_tensor(data_cat.to_numpy()) + tensor_list = torch.split(cat_tensor[:, 1], torch.unique_consecutive(cat_tensor[:, 0], return_counts=True)[1]. + tolist()) + + # because of subjects without cat features, I need to create a list with all zeroes and then insert + # my tensorList. That way I can still index the dataset correctly. + total_list = [torch.as_tensor((0,))] * observations + idx = data_cat['rowId'].unique().to_list() + for i, i2 in enumerate(idx): + total_list[i2] = tensor_list[i] + self.cat = torch.nn.utils.rnn.pad_sequence(total_list, batch_first=True) + self.cat_features = data_cat['columnId'].unique() + + # numerical data, + # N x C, dense matrix with values for N patients/visits for C numerical features + if pl.count(self.numerical_features) == 0: + self.num = None + else: + numerical_data = data.filter(pl.col('columnId').is_in(self.numerical_features)). \ + with_row_count('newColumnId').with_columns(pl.col('newColumnId').first().over('columnId'). + rank(method="dense") - 1, pl.col('rowId') - 1) \ + .select(pl.col('rowId'), pl.col('newColumnId').alias('columnId'), pl.col('covariateValue')).collect() + indices = torch.as_tensor(numerical_data.select(['rowId', 'columnId']).to_numpy(), dtype=torch.long) + values = torch.as_tensor(numerical_data.select('covariateValue').to_numpy(), dtype=torch.float) + self.num = torch.sparse_coo_tensor(indices=indices.T, + values=values.squeeze(), + size=(observations, pl.count(self.numerical_features))).to_dense() + delta = time.time() - start + print(f'Processed data in {delta:.2f} seconds') + + def get_numerical_features(self): + return self.numerical_features + + def get_cat_features(self): + return self.cat_features + + def __len__(self): + return self.target.size()[0] + + def __getitem__(self, item): + if self.num is not None: + batch = {"cat": self.cat[item, :], + "num": self.num[item, :]} + else: + batch = {"cat": self.cat[item, :].squeeze(), + "num": None} + if batch["cat"].dim() == 1 and not isinstance(item, list): + batch["cat"] = batch["cat"].unsqueeze(0) + if batch["num"] is not None and batch["num"].dim() == 1 and not isinstance(item, list): + batch["num"] = batch["num"].unsqueeze(0) + return [batch, self.target[item].squeeze()] diff --git a/inst/python/Estimator.py b/inst/python/Estimator.py new file mode 100644 index 0000000..f8a182f --- /dev/null +++ b/inst/python/Estimator.py @@ -0,0 +1,355 @@ +import time +import pathlib + +import torch +from torch.utils.data import DataLoader, BatchSampler, RandomSampler, SequentialSampler +import torch.nn.functional as F +from tqdm import tqdm +from sklearn.metrics import roc_auc_score + + +class Estimator: + """ + A class that wraps around pytorch models. 
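For orientation, a sketch of what indexing the Data class above returns; the file path is a placeholder and the shapes depend on the input data.
# dataset = Data('plpData/covariates.sqlite')   # hypothetical path
# batch, target = dataset[[0, 1]]
# batch['cat']: integer tensor (2, max_features_per_row), zero-padded categorical indices
# batch['num']: float tensor (2, n_numeric), or None when no numeric covariates exist
# target: tensor of length 2 with the binary outcome labels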
+ """ + + def __init__(self, + model, + model_parameters, + estimator_settings): + self.seed = estimator_settings["seed"] + if callable(estimator_settings["device"]): + self.device = estimator_settings["device"]() + else: + self.device = estimator_settings["device"] + + torch.manual_seed(seed=self.seed) + self.model = model(**model_parameters) + self.model_parameters = model_parameters + self.estimator_settings = estimator_settings + + self.epochs = int(estimator_settings.get("epochs", 5)) + self.learning_rate = estimator_settings.get("learning_rate", 3e-4) + self.weight_decay = estimator_settings.get("weight_decay", 1e-5) + self.batch_size = int(estimator_settings.get("batch_size", 1024)) + self.prefix = estimator_settings.get("prefix", self.model.name) + + self.previous_epochs = int(estimator_settings.get("previous_epochs", 0)) + self.model.to(device=self.device) + + self.optimizer = estimator_settings["optimizer"](params=self.model.parameters(), + lr=self.learning_rate, + weight_decay=self.weight_decay) + self.criterion = estimator_settings["criterion"]() + + if "metric" in estimator_settings.keys() and estimator_settings["metric"] is not None: + self.metric = estimator_settings["metric"] + if type(self.metric) == str: + if self.metric == "auc": + self.metric = {"name": "auc", + "mode": "max"} + elif self.metric == "loss": + self.metric = {"name": "loss", + "mode": "min"} + if "scheduler" in estimator_settings.keys() and estimator_settings["scheduler"] is not None: + estimator_settings["scheduler"]["params"]["mode"] = self.metric["mode"] + if "early_stopping" in estimator_settings.keys() and estimator_settings["early_stopping"] is not None: + estimator_settings["early_stopping"]["params"]["mode"] = self.metric["mode"] + + if "scheduler" in estimator_settings.keys() and estimator_settings["scheduler"] is not None: + self.scheduler = estimator_settings["scheduler"]["fun"](self.optimizer, + **estimator_settings["scheduler"]["params"]) + + if "early_stopping" in estimator_settings.keys() and estimator_settings["early_stopping"] is not None: + self.early_stopper = EarlyStopping(**estimator_settings["early_stopping"]["params"]) + else: + self.early_stopper = None + + self.best_score = None + self.best_epoch = None + self.learn_rate_schedule = None + + def fit(self, dataset, test_dataset): + + train_dataloader = DataLoader(dataset=dataset, + batch_size=None, + sampler=BatchSampler( + sampler=RandomSampler(dataset), + batch_size=self.batch_size, + drop_last=True + )) + test_dataloader = DataLoader(dataset=test_dataset, + batch_size=None, + sampler=BatchSampler( + sampler=SequentialSampler(test_dataset), + batch_size=self.batch_size, + drop_last=False + )) + + trained_epochs = dict() + times = list() + learning_rates = list() + all_scores = list() + model_state_dict = dict() + for epoch in range(self.epochs): + start_time = time.time() + training_loss = self.fit_epoch(train_dataloader) + scores = self.score(test_dataloader) + end_time = time.time() + delta_time = end_time - start_time + current_epoch = epoch + self.previous_epochs + lr = self.optimizer.param_groups[0]["lr"] + self.print_progress(scores, training_loss, delta_time, current_epoch) + self.scheduler.step(scores["metric"]) + all_scores.append(scores) + learning_rates.append(lr) + times.append(round(delta_time, 3)) + + if self.early_stopper: + self.early_stopper(scores['metric']) + if self.early_stopper.improved: + model_state_dict[epoch] = self.model.state_dict() + trained_epochs[epoch] = current_epoch + if 
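For orientation, roughly the settings dictionary the constructor above consumes when driven from R through reticulate; the keys mirror the lookups in the code, while the concrete values here are only an example.
# example_settings = {
#     "seed": 42, "device": "cpu", "epochs": 5,
#     "learning_rate": 3e-4, "weight_decay": 1e-5, "batch_size": 1024,
#     "optimizer": torch.optim.AdamW,
#     "criterion": torch.nn.BCEWithLogitsLoss,
#     "metric": "auc",   # expanded above to {"name": "auc", "mode": "max"}
#     "scheduler": {"fun": torch.optim.lr_scheduler.ReduceLROnPlateau,
#                   "params": {"patience": 1}},
#     "early_stopping": {"params": {"patience": 4}},
# }
# estimator = Estimator(ResNet, model_parameters, example_settings)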
self.early_stopper.early_stop: + print("Early stopping, validation metric stopped improving") + print(f'Average time per epoch was: {torch.mean(torch.as_tensor(times)).item():.2f} seconds') + self.finish_fit(all_scores, model_state_dict, trained_epochs, learning_rates) + return + else: + model_state_dict[epoch] = self.model.state_dict() + trained_epochs[epoch] = current_epoch + print(f'Average time per epoch was: {torch.mean(torch.as_tensor(times)).item()} seconds') + self.finish_fit(all_scores, model_state_dict, trained_epochs, learning_rates) + return + + def fit_epoch(self, dataloader): + training_losses = torch.empty(len(dataloader)) + self.model.train() + index = 0 + for batch in tqdm(dataloader): + self.optimizer.zero_grad() + batch = batch_to_device(batch, device=self.device) + out = self.model(batch[0]) + loss = self.criterion(out, batch[1]) + loss.backward() + + self.optimizer.step() + training_losses[index] = loss.detach() + index += 1 + return training_losses.mean().item() + + def score(self, dataloader): + with torch.no_grad(): + loss = torch.empty(len(dataloader)) + predictions = list() + targets = list() + self.model.eval() + index = 0 + for batch in tqdm(dataloader): + batch = batch_to_device(batch, device=self.device) + pred = self.model(batch[0]) + predictions.append(pred) + targets.append(batch[1]) + loss[index] = self.criterion(pred, batch[1]) + index += 1 + mean_loss = loss.mean().item() + predictions = torch.concat(predictions) + targets = torch.concat(targets) + auc = roc_auc_score(targets.cpu(), predictions.cpu()) + # auc = compute_auc(predictions, targets) + scores = dict() + if self.metric: + if self.metric["name"] == "auc": + scores["metric"] = auc + elif self.metric["name"] == "loss": + scores["metric"] = mean_loss + else: + metric = self.metric["fun"](predictions, targets) + scores["metric"] = metric + scores["auc"] = auc + scores["loss"] = mean_loss + return scores + + def finish_fit(self, scores, model_state_dict, epoch, learning_rates): + if self.metric["mode"] == "max": + best_epoch_index = torch.argmax(torch.as_tensor([x["metric"] for x in scores])).item() + elif self.metric["mode"] == "min": + best_epoch_index = torch.argmin(torch.as_tensor([x["metric"] for x in scores])).item() + + best_model_state_dict = model_state_dict[best_epoch_index] + self.model.load_state_dict(best_model_state_dict) + + self.best_epoch = epoch[best_epoch_index] + self.best_score = {"loss": scores[best_epoch_index]["loss"], + "auc": scores[best_epoch_index]["auc"]} + self.learn_rate_schedule = learning_rates[:(best_epoch_index+1)] + print(f"Loaded best model (based on AUC) from epoch {self.best_epoch}") + print(f"ValLoss: {self.best_score['loss']}") + print(f"valAUC: {self.best_score['auc']}") + if self.metric and self.metric["name"] != "auc" and self.metric["name"] != "loss": + self.best_score[self.metric["name"]] = scores[best_epoch_index]["metric"] + print(f"{self.metric['name']}: {self.best_score[self.metric['name']]}") + return + + def print_progress(self, scores, training_loss, delta_time, current_epoch): + if self.metric and self.metric["name"] != "auc" and self.metric["name"] != "loss": + print(f"Epochs: {current_epoch} | Val {self.metric['name']}: {scores['metric']:.3f} " + f"| Val AUC: {scores['auc']:.3f} | Val Loss: {scores['loss']:.3f} " + f"| Train Loss: {training_loss:.3f} | Time: {delta_time:.3f} seconds " + f"| LR: {self.optimizer.param_groups[0]['lr']}") + else: + print(f"Epochs: {current_epoch} " + f"| Val AUC: {scores['auc']:.3f} " + f"| Val Loss: 
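A toy trace (invented metric values) of the early-stopping bookkeeping used in the loop above, with the EarlyStopping class defined later in this file:
stopper = EarlyStopping(patience=2, delta=0, verbose=False, mode="max")
for val_auc in [0.60, 0.65, 0.64, 0.63]:
    stopper(val_auc)
# 0.60 and 0.65 count as improvements; the two later values do not,
# so stopper.early_stop is True after the last call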
{scores['loss']:.3f} " + f"| Train Loss: {training_loss:.3f} " + f"| Time: {delta_time:.3f} seconds " + f"| LR: {self.optimizer.param_groups[0]['lr']}") + return + + def fit_whole_training_set(self, dataset, learning_rates=None): + dataloader = DataLoader(dataset=dataset, + batch_size=None, + sampler=BatchSampler( + sampler=RandomSampler(dataset), + batch_size=self.batch_size, + drop_last=True + )) + if isinstance(learning_rates, list): + self.best_epoch = len(learning_rates) + elif ~isinstance(learning_rates, list): + learning_rates = [learning_rates] + self.best_epoch = len(learning_rates) + else: + self.best_epoch = self.epochs + + for epoch in range(self.best_epoch): + self.optimizer.param_groups[0]['lr'] = learning_rates[epoch] + self.fit_epoch(dataloader) + return + + def save(self, path, name): + save_path = pathlib.Path(path).joinpath(name) + out = dict( + model_state_dict=self.model.state_dict(), + model_parameters=self.model_parameters, + estimator_settings=self.estimator_settings, + epoch=self.epochs) + torch.save(out, + f=save_path) + return save_path + + def predict_proba(self, dataset): + dataloader = DataLoader(dataset=dataset, + batch_size=None, + sampler=BatchSampler( + sampler=SequentialSampler(dataset), + batch_size=self.batch_size, + drop_last=False + )) + with torch.no_grad(): + predictions = list() + self.model.eval() + for batch in tqdm(dataloader): + batch = batch_to_device(batch, device=self.device) + pred = self.model(batch[0]) + predictions.append(torch.sigmoid(pred)) + predictions = torch.concat(predictions).cpu().numpy() + return predictions + + def predict(self, dataset, threshold=None): + predictions = self.predict_proba(dataset) + + if threshold is None: + # use outcome rate + threshold = dataset.target.sum().item() / len(dataset) + predicted_class = predictions > threshold + + +class EarlyStopping: + + def __init__(self, + patience=3, + delta=0, + verbose=True, + mode='max'): + self.patience = patience + self.counter = 0 + self.verbose = verbose + self.best_score = None + self.early_stop = False + self.improved = False + self.delta = delta + self.previous_score = 0 + self.mode = mode + + def __call__(self, + metric): + if self.mode == 'max': + score = metric + else: + score = -1 * metric + if self.best_score is None: + self.best_score = score + self.improved = True + elif score < (self.best_score + self.delta): + self.counter += 1 + self.improved = False + if self.verbose: + print(f"Early stopping counter: {self.counter}" + f" out of {self.patience}") + if self.counter >= self.patience: + self.early_stop = True + else: + self.best_score = score + self.counter = 0 + self.improved = True + self.previous_score = score + + +def batch_to_device(batch, device='cpu'): + if torch.is_tensor(batch): + batch = batch.to(device=device) + else: + for ix, b in enumerate(batch): + if type(b) is str: + key = b + b = batch[b] + else: + key = None + if b is None: + continue + if torch.is_tensor(b): + b_out = b.to(device=device) + else: + b_out = batch_to_device(b, device) + if b_out is not None: + if key is not None: + batch[key] = b_out + else: + batch[ix] = b_out + return batch + + +def compute_auc(input, target, n_threshold=1000): + threshold = torch.linspace(0, 1.0, n_threshold).to(device=input.device) + pred_label = input >= threshold[:, None, None] + input_target = pred_label * target + + cum_tp = F.pad(input_target.sum(dim=-1).rot90(1, [1, 0]), (1, 0), value=0.0) + cum_fp = F.pad( + (pred_label.sum(dim=-1) - input_target.sum(dim=-1)).rot90(1, [1, 0]), + (1, 0), + 
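The batch_to_device helper above walks nested lists and dicts; a minimal sketch of the intended use (toy tensors):
example = [{"cat": torch.zeros(4, 10, dtype=torch.long), "num": None},
           torch.zeros(4)]
example = batch_to_device(example, device="cpu")  # e.g. "cuda:0" on a GPU machine
# tensors inside the containers are moved to the device; None entries are skipped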
value=0.0, + ) + + if len(cum_tp.shape) > 1: + factor = cum_tp[:, -1] * cum_fp[:, -1] + else: + factor = cum_tp[-1] * cum_fp[-1] + # Set AUROC to 0.5 when the target contains all ones or all zeros. + auroc = torch.where( + factor == 0, + 0.5, + torch.trapz(cum_tp, cum_fp).double() / factor, + ) + return auroc.item() diff --git a/inst/python/LrFinder.py b/inst/python/LrFinder.py new file mode 100644 index 0000000..4c24a38 --- /dev/null +++ b/inst/python/LrFinder.py @@ -0,0 +1,105 @@ +import random + +import torch +from torch.optim.lr_scheduler import _LRScheduler +from torch.utils.data import DataLoader, BatchSampler, RandomSampler +from tqdm import tqdm + +from Estimator import batch_to_device + + +class ExponentialSchedulerPerBatch(_LRScheduler): + + def __init__(self, optimizer, + end_lr, + num_iter): + self.end_lr = end_lr + self.num_iter = num_iter + super(ExponentialSchedulerPerBatch, self).__init__(optimizer, last_epoch=-1) + + def get_lr(self): + r = self.last_epoch / (self.num_iter - 1) + return [base_lr * (self.end_lr / base_lr) ** r for base_lr in self.base_lrs] + + +class LrFinder: + + def __init__(self, + model, + model_parameters, + estimator_settings, + lr_settings): + if lr_settings is None: + lr_settings = {} + min_lr = lr_settings.get("min_lr", 1e-7) + max_lr = lr_settings.get("max_lr", 1) + num_lr = lr_settings.get("num_lr", 100) + smooth = lr_settings.get("smooth", 0.05) + divergence_threshold = lr_settings.get("divergence_threshold", 4) + torch.manual_seed(seed=estimator_settings["seed"]) + self.model = model(**model_parameters) + if callable(estimator_settings["device"]): + self.device = estimator_settings["device"]() + else: + self.device = estimator_settings["device"] + self.model.to(device=self.device) + self.min_lr = min_lr + self.max_lr = max_lr + self.num_lr = num_lr + self.smooth = smooth + self.divergence_threshold = divergence_threshold + + self.optimizer = estimator_settings['optimizer'](params=self.model.parameters(), + lr=self.min_lr) + + self.scheduler = ExponentialSchedulerPerBatch(self.optimizer, self.max_lr, self.num_lr) + + self.criterion = estimator_settings["criterion"]() + self.batch_size = estimator_settings['batch_size'] + self.losses = None + self.loss_index = None + + def get_lr(self, dataset): + batch_index = torch.arange(0, len(dataset), 1).tolist() + + losses = torch.empty(size=(self.num_lr,), dtype=torch.float) + lrs = torch.empty(size=(self.num_lr,), dtype=torch.float) + for i in tqdm(range(self.num_lr)): + self.optimizer.zero_grad() + random_batch = random.sample(batch_index, 32) + batch = dataset[random_batch] + batch = batch_to_device(batch, self.device) + + out = self.model(batch[0]) + loss = self.criterion(out, batch[1]) + if self.smooth is not None and i != 0: + losses[i] = self.smooth * loss.item() + (1 - self.smooth) * losses[i - 1] + else: + losses[i] = loss.item() + lrs[i] = self.optimizer.param_groups[0]["lr"] + + loss.backward() + self.optimizer.step() + self.scheduler.step() + + if i == 0: + best_loss = losses[i] + else: + if losses[i] < best_loss: + best_loss = losses[i] + + if losses[i] > (self.divergence_threshold * best_loss): + print(f"Loss diverged - stopped early - iteration {i} out of {self.num_lr}") + break + + # find LR where gradient is highest but before global minimum is reached + # I added -5 to make sure it is not still in the minimum + global_minimum = torch.argmin(losses) + gradient = torch.diff(losses[:(global_minimum - 5)+1]) + smallest_gradient = torch.argmin(gradient) + + suggested_lr = 
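The scheduler above sweeps the learning rate geometrically per batch; a worked example with the default lr_settings (min_lr = 1e-7, max_lr = 1, num_lr = 100):
# lr_i = min_lr * (max_lr / min_lr) ** (i / (num_lr - 1))
#   i = 0   -> 1e-7
#   i = 50  -> roughly 3.4e-4
#   i = 99  -> 1.0
# The suggested rate is read off where the smoothed loss falls fastest, a few
# steps before the loss minimum (hence the "- 5" above).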
lrs[smallest_gradient] + self.losses = losses + self.loss_index = smallest_gradient + self.lrs = lrs + return suggested_lr.item() diff --git a/inst/python/MLP.py b/inst/python/MLP.py new file mode 100644 index 0000000..3b8b03b --- /dev/null +++ b/inst/python/MLP.py @@ -0,0 +1,83 @@ +from torch import nn + +from ResNet import NumericalEmbedding + + +class MLP(nn.Module): + + def __init__(self, + cat_features: int, + num_features: int, + size_embedding: int, + size_hidden: int, + num_layers: int, + activation=nn.ReLU, + normalization=nn.BatchNorm1d, + dropout=None, + d_out: int = 1): + super(MLP, self).__init__() + self.name = "MLP" + cat_features = int(cat_features) + num_features = int(num_features) + size_embedding = int(size_embedding) + size_hidden = int(size_hidden) + num_layers = int(num_layers) + d_out = int(d_out) + + self.embedding = nn.EmbeddingBag(cat_features + 1, + size_embedding, + padding_idx=0) + + if num_features != 0 and num_features is not None: + self.num_embedding = NumericalEmbedding(num_features, size_embedding) + + self.first_layer = nn.Linear(size_embedding, size_hidden) + + self.layers = nn.ModuleList(MLPLayer(size_hidden=size_hidden, + normalization=normalization, + activation=activation, + dropout=dropout) + for _ in range(num_layers)) + self.last_norm = normalization(size_hidden) + self.head = nn.Linear(size_hidden, d_out) + + self.last_act = activation() + + def forward(self, input): + x_cat = input["cat"] + x_cat = self.embedding(x_cat) + if "num" in input.keys() and self.num_embedding is not None: + x_num = input["num"] + x = (x_cat + self.num_embedding(x_num).mean(dim=1)) / 2 + else: + x = x_cat + x = self.first_layer(x) + for layer in self.layers: + x = layer(x) + x = self.last_norm(x) + x = self.last_act(x) + x = self.head(x) + x = x.squeeze(-1) + return x + + +class MLPLayer(nn.Module): + def __init__(self, + size_hidden=64, + normalization=nn.BatchNorm1d, + activation=nn.ReLU, + dropout=0.0, + bias=True): + super(MLPLayer, self).__init__() + self.norm = normalization(size_hidden) + self.activation = activation() + self.linear = nn.Linear(size_hidden, size_hidden, bias=bias) + + if dropout != 0.0 or dropout is not None: + self.dropout = nn.Dropout(p=dropout) + + def forward(self, x): + x = self.linear(x) + if self.dropout: + x = self.dropout(x) + return self.activation(x) diff --git a/inst/python/ResNet.py b/inst/python/ResNet.py new file mode 100644 index 0000000..f680eb2 --- /dev/null +++ b/inst/python/ResNet.py @@ -0,0 +1,139 @@ +import math + +import torch +from torch import nn + + +class ResNet(nn.Module): + + def __init__(self, + cat_features: int, + num_features: int = 0, + size_embedding: int = 256, + size_hidden: int = 256, + num_layers: int = 2, + hidden_factor: int = 1, + activation=nn.ReLU, + normalization=nn.BatchNorm1d, + hidden_dropout=0, + residual_dropout=0, + dim_out: int = 1, + concat_num=True): + super(ResNet, self).__init__() + self.name = 'ResNet' + cat_features = int(cat_features) + num_features = int(num_features) + size_embedding = int(size_embedding) + size_hidden = int(size_hidden) + num_layers = int(num_layers) + hidden_factor = int(hidden_factor) + dim_out = int(dim_out) + + + self.embedding = nn.EmbeddingBag( + num_embeddings=cat_features + 1, + embedding_dim=size_embedding, + padding_idx=0 + ) + if num_features != 0 and not concat_num: + self.num_embedding = NumericalEmbedding(num_features, size_embedding) + else: + self.num_embedding = None + size_embedding = size_embedding + num_features + + self.first_layer = 
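A toy shape check (invented sizes) of how the zero-padded categorical matrix produced by Dataset.py is consumed by the EmbeddingBag used in MLP and ResNet:
bag = nn.EmbeddingBag(num_embeddings=6, embedding_dim=3, padding_idx=0)
cat = torch.tensor([[2, 5, 0, 0],
                    [1, 3, 4, 0]])
out = bag(cat)  # shape (2, 3): one pooled embedding per row; the padding
                # index 0 should be excluded from the reduction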
nn.Linear(size_embedding, size_hidden) + + res_hidden = size_hidden * hidden_factor + + self.layers = nn.ModuleList(ResLayer(size_hidden, res_hidden, normalization, + activation, hidden_dropout, residual_dropout) + for _ in range(num_layers)) + + self.last_norm = normalization(size_hidden) + + self.head = nn.Linear(size_hidden, dim_out) + + self.last_act = activation() + + def forward(self, x): + x_cat = x["cat"] + x_cat = self.embedding(x_cat) + if "num" in x.keys() and x["num"] is not None and self.num_embedding is not None: + x_num = x["num"] + # take the average af numerical and categorical embeddings + x = (x_cat + self.num_embedding(x_num).mean(dim=1)) / 2 + elif "num" in x.keys() and x["num"] is not None and self.num_embedding is None: + x_num = x["num"] + # concatenate numerical to categorical embedding + x = torch.cat([x_cat, x_num], dim=1) + else: + x = x_cat + x = self.first_layer(x) + for layer in self.layers: + x = layer(x) + x = self.last_norm(x) + x = self.last_act(x) + x = self.head(x) + x = x.squeeze(-1) + return x + + +class ResLayer(nn.Module): + + def __init__(self, + size_hidden, + res_hidden, + normalization, + activation, + hidden_dropout=None, + residual_dropout=None): + super(ResLayer, self).__init__() + + self.norm = normalization(size_hidden) + self.linear0 = nn.Linear(size_hidden, res_hidden) + self.linear1 = nn.Linear(res_hidden, size_hidden) + + if hidden_dropout is not None: + self.hidden_dropout = nn.Dropout(p=hidden_dropout) + if residual_dropout is not None: + self.residual_dropout = nn.Dropout(p=residual_dropout) + self.activation = activation() + + def forward(self, input): + z = input + z = self.norm(z) + z = self.linear0(z) + z = self.activation(z) + if self.hidden_dropout is not None: + z = self.hidden_dropout(z) + z = self.linear1(z) + if self.residual_dropout is not None: + z = self.residual_dropout(z) + z = z + input + return z + + +class NumericalEmbedding(nn.Module): + def __init__(self, + num_embeddings, + embedding_dim, + bias=True): + super(NumericalEmbedding, self).__init__() + self.weight = nn.Parameter(torch.empty(num_embeddings, embedding_dim)) + if bias: + self.bias = nn.Parameter(torch.empty(num_embeddings, embedding_dim)) + else: + self.bias = None + + for parameter in [self.weight, self.bias]: + if parameter is not None: + nn.init.kaiming_uniform_(parameter, a=math.sqrt(5)) + + def forward(self, input): + x = self.weight.unsqueeze(0) * input.unsqueeze(-1) + if self.bias is not None: + x = x + self.bias.unsqueeze(-1) + return x + + + diff --git a/inst/python/Transformer.py b/inst/python/Transformer.py new file mode 100644 index 0000000..5944e1b --- /dev/null +++ b/inst/python/Transformer.py @@ -0,0 +1,196 @@ +import math + +import torch +from torch import nn +import torch.nn.functional as F + +from ResNet import NumericalEmbedding + + +def reglu(x): + a, b = x.chunk(2, dim=-1) + return a * F.relu(b) + + +class ReGLU(nn.Module): + def forward(self, x): + return reglu(x) + + +class Transformer(nn.Module): + + def __init__(self, + cat_features: int, + num_features: int, + num_blocks: int, + dim_token: int, + num_heads: int, + att_dropout, + ffn_dropout, + res_dropout, + dim_hidden: int, + dim_out: int = 1, + head_activation=nn.ReLU, + activation=ReGLU, + ffn_norm=nn.LayerNorm, + head_norm=nn.LayerNorm, + att_norm=nn.LayerNorm): + super(Transformer, self).__init__() + self.name = "Transformer" + cat_features = int(cat_features) + num_features = int(num_features) + num_blocks = int(num_blocks) + dim_token = int(dim_token) + num_heads = 
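Assuming toy sizes, a single ResLayer from ResNet.py above preserves the hidden width, which is what makes the residual addition valid:
layer = ResLayer(size_hidden=16, res_hidden=32, normalization=nn.BatchNorm1d,
                 activation=nn.ReLU, hidden_dropout=0.1, residual_dropout=0.1)
out = layer(torch.randn(8, 16))  # output shape stays (8, 16): x + f(x)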
int(num_heads) + dim_hidden = int(dim_hidden) + dim_out = int(dim_out) + + self.categorical_embedding = nn.Embedding(cat_features + 1, dim_token, padding_idx=0) + + if num_features != 0 and num_features is not None: + self.numerical_embedding = NumericalEmbedding(num_features, dim_token) + self.class_token = ClassToken(dim_token) + + self.layers = nn.ModuleList([]) + for layer_idx in range(num_blocks): + layer = nn.ModuleDict({ + "attention": nn.MultiheadAttention(dim_token, num_heads, + dropout=att_dropout), + "ffn": FeedForwardBlock(dim_token, dim_hidden, + bias_first=True, + bias_second=True, + dropout=ffn_dropout, + activation=activation), + "attention_res_dropout": nn.Dropout(res_dropout), + "ffn_res_dropout": nn.Dropout(res_dropout), + "ffn_norm": ffn_norm(dim_token) + }) + if layer_idx != 0: + layer["attention_norm"] = att_norm(dim_token) + self.layers.append(layer) + + self.head = Head(dim_token, + bias=True, + activation=head_activation, + normalization=head_norm, + dim_out=dim_out) + + def forward(self, x): + mask = torch.where(x["cat"] == 0, True, False) + cat = self.categorical_embedding(x["cat"]) + if "num" in x.keys() and self.numerical_embedding is not None: + num = self.numerical_embedding(x["num"]) + x = torch.cat([cat, num], dim=1) + mask = torch.cat([mask, torch.zeros([x.shape[0], + num.shape[1]], + device=mask.device, + dtype=mask.dtype)], + dim=1) + else: + x = cat + x = self.class_token(x) + mask = torch.cat([mask, torch.zeros([x.shape[0], 1], + device=mask.device, + dtype=mask.dtype)], + dim=1) + + for i, layer in enumerate(self.layers): + x_residual = self.start_residual(layer, "attention", x) + + if i == len(self.layers)-1: + dims = x_residual.shape + x_residual = layer["attention"]( + x_residual[:, -1].view([dims[0], 1, dims[2]]).transpose(0, 1), + x_residual.transpose(0, 1), + x_residual.transpose(0, 1), + mask + ) + attn_weights = x_residual[1] + x_residual = x_residual[0] + x = x[:, -1].view([dims[0], 1, dims[2]]) + else: + x_residual = layer["attention"]( + x_residual.transpose(0, 1), + x_residual.transpose(0, 1), + x_residual.transpose(0, 1), + mask + )[0] + x = self.end_residual(layer, "attention", x, x_residual.transpose(0, 1)) + + x_residual = self.start_residual(layer, "ffn", x) + x_residual = layer["ffn"](x_residual) + x = self.end_residual(layer, "ffn", x, x_residual) + + x = self.head(x)[:, 0] + return x + + @staticmethod + def start_residual(layer, stage, x): + norm = f"{stage}_norm" + if norm in layer.keys(): + x = layer[stage + "_norm"](x) + return x + + @staticmethod + def end_residual(layer, stage, x, x_residual): + x_residual = layer[f"{stage}_res_dropout"](x_residual) + return x + x_residual + + +class FeedForwardBlock(nn.Module): + + def __init__(self, + dim_token, + dim_hidden, + bias_first=True, + bias_second=True, + dropout=0.0, + activation=ReGLU): + super(FeedForwardBlock, self).__init__() + self.linear0 = nn.Linear(dim_token, int(dim_hidden * 2), bias=bias_first) + self.activation = activation() + self.dropout = nn.Dropout(p=dropout) + self.linear1 = nn.Linear(dim_hidden, dim_token, bias=bias_second) + + def forward(self, x): + x = self.linear0(x) + x = self.activation(x) + x = self.dropout(x) + x = self.linear1(x) + return x + + +class Head(nn.Module): + + def __init__(self, + dim_in, + bias, + activation, + normalization, + dim_out): + super(Head, self).__init__() + self.normalization = normalization(dim_in) + self.activation = activation() + self.linear = nn.Linear(dim_in, dim_out, bias=bias) + + def forward(self, x): + x = x[:, -1] + 
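A small shape sketch (toy sizes) of the ReGLU feed-forward block above: the first linear layer doubles the hidden width so that ReGLU can halve it again.
ffn = FeedForwardBlock(dim_token=8, dim_hidden=16, dropout=0.0, activation=ReGLU)
out = ffn(torch.randn(4, 3, 8))  # (4, 3, 8): 8 -> 32, ReGLU halves to 16, back to 8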
x = self.normalization(x) + x = self.activation(x) + x = self.linear(x) + return x + +class ClassToken(nn.Module): + + def __init__(self, + dim_token): + super(ClassToken, self).__init__() + self.weight = nn.Parameter(torch.empty(dim_token, 1)) + nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5)) + + def expand(self, dims): + new_dims = [1] * (len(dims) - 1) + return self.weight.view(new_dims + [-1]).expand(dims +[-1]) + + def forward(self, x): + return torch.cat([x, self.expand([x.shape[0], 1])], dim=1) diff --git a/inst/python/__init__.py b/inst/python/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/man/Dataset.Rd b/man/Dataset.Rd deleted file mode 100644 index eb12468..0000000 --- a/man/Dataset.Rd +++ /dev/null @@ -1,18 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/Dataset.R -\name{Dataset} -\alias{Dataset} -\title{A torch dataset} -\usage{ -Dataset(data, labels = NULL, numericalIndex = NULL) -} -\arguments{ -\item{data}{a dataframe like object with the covariates} - -\item{labels}{a dataframe with the labels} - -\item{numericalIndex}{in what column numeric data is in (if any)} -} -\description{ -A torch dataset -} diff --git a/man/EarlyStopping.Rd b/man/EarlyStopping.Rd deleted file mode 100644 index 53581b2..0000000 --- a/man/EarlyStopping.Rd +++ /dev/null @@ -1,78 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/Estimator-class.R -\name{EarlyStopping} -\alias{EarlyStopping} -\title{Earlystopping class} -\description{ -Stops training if a loss or metric has stopped improving -} -\section{Methods}{ -\subsection{Public methods}{ -\itemize{ -\item \href{#method-EarlyStopping-new}{\code{EarlyStopping$new()}} -\item \href{#method-EarlyStopping-call}{\code{EarlyStopping$call()}} -\item \href{#method-EarlyStopping-clone}{\code{EarlyStopping$clone()}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-EarlyStopping-new}{}}} -\subsection{Method \code{new()}}{ -Creates a new earlyStopping object -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{EarlyStopping$new(patience = 3, delta = 0, verbose = TRUE, mode = "max")}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{patience}}{Stop after this number of epochs if loss doesn't improve} - -\item{\code{delta}}{How much does the loss need to improve to count as improvement} - -\item{\code{verbose}}{If information should be printed out} - -\item{\code{mode}}{either `min` or `max` depending on metric to be used for earlyStopping} -} -\if{html}{\out{
}} -} -\subsection{Returns}{ -a new earlystopping object -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-EarlyStopping-call}{}}} -\subsection{Method \code{call()}}{ -call the earlystopping object and increment a counter if loss is not -improving -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{EarlyStopping$call(metric)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{metric}}{the current metric value} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-EarlyStopping-clone}{}}} -\subsection{Method \code{clone()}}{ -The objects of this class are cloneable with this method. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{EarlyStopping$clone(deep = FALSE)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{deep}}{Whether to make a deep clone.} -} -\if{html}{\out{
}} -} -} -} diff --git a/man/Estimator.Rd b/man/Estimator.Rd deleted file mode 100644 index 96692f0..0000000 --- a/man/Estimator.Rd +++ /dev/null @@ -1,278 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/Estimator-class.R -\name{Estimator} -\alias{Estimator} -\title{Estimator} -\description{ -A generic R6 class that wraps around a torch nn module and can be used to -fit and predict the model defined in that module. -} -\section{Methods}{ -\subsection{Public methods}{ -\itemize{ -\item \href{#method-Estimator-new}{\code{Estimator$new()}} -\item \href{#method-Estimator-fit}{\code{Estimator$fit()}} -\item \href{#method-Estimator-fitEpoch}{\code{Estimator$fitEpoch()}} -\item \href{#method-Estimator-score}{\code{Estimator$score()}} -\item \href{#method-Estimator-finishFit}{\code{Estimator$finishFit()}} -\item \href{#method-Estimator-printProgress}{\code{Estimator$printProgress()}} -\item \href{#method-Estimator-fitWholeTrainingSet}{\code{Estimator$fitWholeTrainingSet()}} -\item \href{#method-Estimator-save}{\code{Estimator$save()}} -\item \href{#method-Estimator-predictProba}{\code{Estimator$predictProba()}} -\item \href{#method-Estimator-predict}{\code{Estimator$predict()}} -\item \href{#method-Estimator-itemOrDefaults}{\code{Estimator$itemOrDefaults()}} -\item \href{#method-Estimator-clone}{\code{Estimator$clone()}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Estimator-new}{}}} -\subsection{Method \code{new()}}{ -Creates a new estimator -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Estimator$new(modelType, modelParameters, estimatorSettings)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{modelType}}{The torch nn module to use as model} - -\item{\code{modelParameters}}{Parameters to initialize the model} - -\item{\code{estimatorSettings}}{Parameters required for the estimator fitting} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Estimator-fit}{}}} -\subsection{Method \code{fit()}}{ -fits the estimator -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Estimator$fit(dataset, testDataset)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{dataset}}{a torch dataset to use for model fitting} - -\item{\code{testDataset}}{a torch dataset to use for early stopping} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Estimator-fitEpoch}{}}} -\subsection{Method \code{fitEpoch()}}{ -fits estimator for one epoch (one round through the data) -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Estimator$fitEpoch(dataset, batchIndex)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{dataset}}{torch dataset to use for fitting} - -\item{\code{batchIndex}}{indices of batches} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Estimator-score}{}}} -\subsection{Method \code{score()}}{ -calculates loss and auc after training for one epoch -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Estimator$score(dataset, batchIndex)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{dataset}}{The torch dataset to use to evaluate loss and auc} - -\item{\code{batchIndex}}{Indices of batches in the dataset} -} -\if{html}{\out{
}} -} -\subsection{Returns}{ -list with average loss and auc in the dataset -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Estimator-finishFit}{}}} -\subsection{Method \code{finishFit()}}{ -operations that run when fitting is finished -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Estimator$finishFit(scores, modelStateDict, epoch, learnRates)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{scores}}{validation scores} - -\item{\code{modelStateDict}}{fitted model parameters} - -\item{\code{epoch}}{list of epochs fit} - -\item{\code{learnRates}}{learning rate sequence used so far} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Estimator-printProgress}{}}} -\subsection{Method \code{printProgress()}}{ -Print out training progress per epoch -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Estimator$printProgress(scores, trainLoss, delta, currentEpoch)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{scores}}{scores returned by `self$score`} - -\item{\code{trainLoss}}{training loss} - -\item{\code{delta}}{how long did the epoch take} - -\item{\code{currentEpoch}}{the current epoch number} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Estimator-fitWholeTrainingSet}{}}} -\subsection{Method \code{fitWholeTrainingSet()}}{ -Fits whole training set on a specific number of epochs -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Estimator$fitWholeTrainingSet(dataset, learnRates = NULL)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{dataset}}{torch dataset} - -\item{\code{learnRates}}{learnRateSchedule from CV} -} -\if{html}{\out{
}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Estimator-save}{}}} -\subsection{Method \code{save()}}{ -save model and those parameters needed to reconstruct it -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Estimator$save(path, name)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{path}}{where to save the model} - -\item{\code{name}}{name of file} -} -\if{html}{\out{
}} -} -\subsection{Returns}{ -the path to saved model -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Estimator-predictProba}{}}} -\subsection{Method \code{predictProba()}}{ -predicts and outputs the probabilities -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Estimator$predictProba(dataset)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{dataset}}{Torch dataset to create predictions for} -} -\if{html}{\out{
}} -} -\subsection{Returns}{ -predictions as probabilities -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Estimator-predict}{}}} -\subsection{Method \code{predict()}}{ -predicts and outputs the class -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Estimator$predict(dataset, threshold = NULL)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{dataset}}{A torch dataset to create predictions for} - -\item{\code{threshold}}{Which threshold to use for predictions} -} -\if{html}{\out{
}} -} -\subsection{Returns}{ -The predicted class for the data in the dataset -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Estimator-itemOrDefaults}{}}} -\subsection{Method \code{itemOrDefaults()}}{ -select item from list, and if it's null sets a default -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Estimator$itemOrDefaults(list, item, default = NULL)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{list}}{A list with items} - -\item{\code{item}}{Which list item to retrieve} - -\item{\code{default}}{The value to return if list doesn't have item} -} -\if{html}{\out{
}} -} -\subsection{Returns}{ -the list item or default -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-Estimator-clone}{}}} -\subsection{Method \code{clone()}}{ -The objects of this class are cloneable with this method. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{Estimator$clone(deep = FALSE)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{deep}}{Whether to make a deep clone.} -} -\if{html}{\out{
}} -} -} -} diff --git a/man/batchToDevice.Rd b/man/batchToDevice.Rd deleted file mode 100644 index a19e34c..0000000 --- a/man/batchToDevice.Rd +++ /dev/null @@ -1,20 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/Estimator-class.R -\name{batchToDevice} -\alias{batchToDevice} -\title{sends a batch of data to device} -\usage{ -batchToDevice(batch, device) -} -\arguments{ -\item{batch}{the batch to send, usually a list of torch tensors} - -\item{device}{which device to send batch to} -} -\value{ -the batch on the required device -} -\description{ -sends a batch of data to device -assumes batch includes lists of tensors to arbitrary nested depths -} diff --git a/man/camelCaseToSnakeCase.Rd b/man/camelCaseToSnakeCase.Rd new file mode 100644 index 0000000..224e795 --- /dev/null +++ b/man/camelCaseToSnakeCase.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/HelperFunctions.R +\name{camelCaseToSnakeCase} +\alias{camelCaseToSnakeCase} +\title{Convert a camel case string to snake case} +\usage{ +camelCaseToSnakeCase(string) +} +\arguments{ +\item{string}{The string to be converted} +} +\value{ +A string +} +\description{ +Convert a camel case string to snake case +} diff --git a/man/camelCaseToSnakeCaseNames.Rd b/man/camelCaseToSnakeCaseNames.Rd new file mode 100644 index 0000000..a4f27ff --- /dev/null +++ b/man/camelCaseToSnakeCaseNames.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/HelperFunctions.R +\name{camelCaseToSnakeCaseNames} +\alias{camelCaseToSnakeCaseNames} +\title{Convert the names of an object from camel case to snake case} +\usage{ +camelCaseToSnakeCaseNames(object) +} +\arguments{ +\item{object}{The object of which the names should be converted} +} +\value{ +The same object, but with converted names. 
+} +\description{ +Convert the names of an object from camel case to snake case +} diff --git a/man/checkHigher.Rd b/man/checkHigher.Rd new file mode 100644 index 0000000..bab0d76 --- /dev/null +++ b/man/checkHigher.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/HelperFunctions.R +\name{checkHigher} +\alias{checkHigher} +\title{helper function to check that input is higher than a certain value} +\usage{ +checkHigher(parameter, value) +} +\arguments{ +\item{parameter}{the input parameter to check, can be a vector} + +\item{value}{which value it should be higher than} +} +\description{ +helper function to check that input is higher than a certain value +} diff --git a/man/checkHigherEqual.Rd b/man/checkHigherEqual.Rd new file mode 100644 index 0000000..60230e6 --- /dev/null +++ b/man/checkHigherEqual.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/HelperFunctions.R +\name{checkHigherEqual} +\alias{checkHigherEqual} +\title{helper function to check that input is higher or equal than a certain value} +\usage{ +checkHigherEqual(parameter, value) +} +\arguments{ +\item{parameter}{the input parameter to check, can be a vector} + +\item{value}{which value it should be higher or equal than} +} +\description{ +helper function to check that input is higher or equal than a certain value +} diff --git a/man/checkIsClass.Rd b/man/checkIsClass.Rd new file mode 100644 index 0000000..5af3ad8 --- /dev/null +++ b/man/checkIsClass.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/HelperFunctions.R +\name{checkIsClass} +\alias{checkIsClass} +\title{helper function to check class of input} +\usage{ +checkIsClass(parameter, classes) +} +\arguments{ +\item{parameter}{the input parameter to check} + +\item{classes}{which classes it should belong to (one or more)} +} +\description{ +helper function to check class of input +} diff --git a/man/lrFinder.Rd b/man/lrFinder.Rd deleted file mode 100644 index 74f31d2..0000000 --- a/man/lrFinder.Rd +++ /dev/null @@ -1,42 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/LRFinder.R -\name{lrFinder} -\alias{lrFinder} -\title{Find learning rate that decreases loss the most} -\usage{ -lrFinder( - dataset, - modelType, - modelParams, - estimatorSettings, - minLR = 1e-07, - maxLR = 1, - numLR = 100, - smooth = 0.05, - divergenceThreshold = 4 -) -} -\arguments{ -\item{dataset}{torch dataset, training dataset} - -\item{modelType}{the function used to initialize the model} - -\item{modelParams}{parameters used to initialize model} - -\item{estimatorSettings}{settings for estimator to fit model} - -\item{minLR}{lower bound of learning rates to search through} - -\item{maxLR}{upper bound of learning rates to search through} - -\item{numLR}{number of learning rates to go through} - -\item{smooth}{smoothing to use on losses} - -\item{divergenceThreshold}{if loss increases this amount above the minimum, stop.} -} -\description{ -Method originated from https://arxiv.org/abs/1506.01186 but this -implementation draws inspiration from various other implementations such as -pytorch lightning, fastai, luz and pytorch-lr-finder. 
-} diff --git a/man/setEstimator.Rd b/man/setEstimator.Rd index 8454c55..fcb33ab 100644 --- a/man/setEstimator.Rd +++ b/man/setEstimator.Rd @@ -10,9 +10,10 @@ setEstimator( batchSize = 512, epochs = 30, device = "cpu", - optimizer = torch::optim_adamw, - scheduler = list(fun = torch::lr_reduce_on_plateau, params = list(patience = 1)), - criterion = torch::nn_bce_with_logits_loss, + optimizer = torch$optim$AdamW, + scheduler = list(fun = torch$optim$lr_scheduler$ReduceLROnPlateau, params = + list(patience = 1)), + criterion = torch$nn$BCEWithLogitsLoss, earlyStopping = list(useEarlyStopping = TRUE, params = list(patience = 4)), metric = "auc", seed = NULL diff --git a/man/setMultiLayerPerceptron.Rd b/man/setMultiLayerPerceptron.Rd index d6ab36c..f3676cd 100644 --- a/man/setMultiLayerPerceptron.Rd +++ b/man/setMultiLayerPerceptron.Rd @@ -7,7 +7,7 @@ setMultiLayerPerceptron( numLayers = c(1:8), sizeHidden = c(2^(6:9)), - dropout = c(seq(0, 0.5, 0.05)), + dropout = c(seq(0, 0.3, 0.05)), sizeEmbedding = c(2^(6:9)), estimatorSettings = setEstimator(learningRate = "auto", weightDecay = c(1e-06, 0.001), batchSize = 1024, epochs = 30, device = "cpu"), @@ -17,9 +17,9 @@ setMultiLayerPerceptron( ) } \arguments{ -\item{numLayers}{Number of layers in network, default: 1:16} +\item{numLayers}{Number of layers in network, default: 1:8} -\item{sizeHidden}{Amount of neurons in each default layer, default: 2^(6:10) (64 to 1024)} +\item{sizeHidden}{Amount of neurons in each default layer, default: 2^(6:9) (64 to 512)} \item{dropout}{How much dropout to apply after first linear, default: seq(0, 0.3, 0.05)} diff --git a/tests/testthat/setup.R b/tests/testthat/setup.R index fde852b..adf0dcb 100644 --- a/tests/testthat/setup.R +++ b/tests/testthat/setup.R @@ -1,11 +1,7 @@ library(PatientLevelPrediction) -if(Sys.getenv('GITHUB_ACTIONS') == 'true' & torch::torch_is_installed() != FALSE) { - torch::install_torch() -} - testLoc <- tempdir() - +torch <- reticulate::import("torch") # get connection and data from Eunomia connectionDetails <- Eunomia::getEunomiaConnectionDetails() Eunomia::createCohorts(connectionDetails) @@ -69,13 +65,16 @@ mappedData <- PatientLevelPrediction::MapIds( cohort = trainData$Train$labels ) +path <- system.file("python", package = "DeepPatientLevelPrediction") +Dataset <- reticulate::import_from_path("Dataset", path = path) +if (is.null(attributes(mappedData)$path)) { + # sqlite object + attributes(mappedData)$path <- attributes(mappedData)$dbname +} - -dataset <- Dataset( - data = mappedData$covariates, - labels = trainData$Train$labels$outcomeCount, - numericalIndex = NULL +dataset <- Dataset$Data( + data = reticulate::r_to_py(normalizePath(attributes(mappedData)$path)), + labels = reticulate::r_to_py(trainData$Train$labels$outcomeCount), ) - -small_dataset <- torch::dataset_subset(dataset, (1:round(length(dataset)/3))) +small_dataset <- torch$utils$data$Subset(dataset, (1:round(length(dataset)/3))) diff --git a/tests/testthat/test-Dataset.R b/tests/testthat/test-Dataset.R index b949394..940f9d1 100644 --- a/tests/testthat/test-Dataset.R +++ b/tests/testthat/test-Dataset.R @@ -1,19 +1,6 @@ -test_that("dataset correct class", { - testthat::expect_true("myDataset" %in% class(dataset)) -}) - -test_that("length of index correct", { - testthat::expect_equal( - length(dataset$getNumericalIndex()), - dplyr::n_distinct(mappedData$covariates %>% - dplyr::collect() %>% - dplyr::pull(covariateId)) - ) -}) - test_that("number of num and cat features sum correctly", { testthat::expect_equal( - 
dataset$numNumFeatures() + dataset$numCatFeatures(), + length(dataset$get_numerical_features()) + length(dataset$get_cat_features()), dplyr::n_distinct(mappedData$covariates %>% dplyr::collect() %>% dplyr::pull(covariateId)) ) @@ -21,10 +8,10 @@ test_that("number of num and cat features sum correctly", { test_that("length of dataset correct", { - expect_equal(length(dataset), dataset$cat$shape[1]) - expect_equal(length(dataset), dataset$num$shape[1]) + expect_equal(length(dataset), dataset$cat$shape[0]) + expect_equal(length(dataset), dataset$num$shape[0]) expect_equal( - dataset$.length(), + length(dataset), dplyr::n_distinct(mappedData$covariates %>% dplyr::collect() %>% dplyr::pull(rowId)) ) @@ -35,30 +22,30 @@ test_that(".getbatch works", { # get one sample out <- dataset[10] - # output should be a list of two items, the batch and targets, + # output should be a list of two items, the batch in pos 1 and targets in pos 2, # the batch is what goes to the model expect_equal(length(out), 2) # targets should be binary - expect_true(out$target$item() %in% c(0, 1)) + expect_true(out[[2]]$item() %in% c(0, 1)) # shape of batch is correct - expect_equal(length(out$batch), 2) - expect_equal(out$batch$cat$shape[1], 1) - expect_equal(out$batch$num$shape[1], 1) + expect_equal(length(out[[1]]), 2) + expect_equal(out[[1]]$cat$shape[0], 1) + expect_equal(out[[1]]$num$shape[0], 1) # shape of target - expect_equal(out$target$shape[1], 1) + expect_equal(out[[2]]$shape$numel(), 1) # get a whole batch - out <- dataset[10:(10 + batch_size - 1)] + out <- dataset[10:(10 + batch_size)] expect_equal(length(out), 2) - expect_true(all(torch::as_array(out$target) %in% c(0, 1))) + expect_true(all(out[[2]]$numpy() %in% c(0, 1))) - expect_equal(length(out$batch), 2) - expect_equal(out$batch$cat$shape[1], 16) - expect_equal(out$batch$num$shape[1], 16) + expect_equal(length(out[[1]]), 2) + expect_equal(out[[1]]$cat$shape[0], 16) + expect_equal(out[[1]]$num$shape[0], 16) - expect_equal(out$target$shape[1], 16) + expect_equal(out[[2]]$shape[0], 16) }) diff --git a/tests/testthat/test-Estimator.R b/tests/testthat/test-Estimator.R index a803003..b9ecadd 100644 --- a/tests/testthat/test-Estimator.R +++ b/tests/testthat/test-Estimator.R @@ -1,66 +1,84 @@ -catFeatures <- small_dataset$dataset$numCatFeatures() -numFeatures <- small_dataset$dataset$numNumFeatures() - -modelType <- ResNet +catFeatures <- small_dataset$dataset$get_cat_features()$shape[[1]] +numFeatures <- small_dataset$dataset$get_numerical_features()$shape[[1]] modelParameters <- list( - catFeatures = catFeatures, - numFeatures = numFeatures, - sizeEmbedding = 16, - sizeHidden = 16, - numLayers = 2, - hiddenFactor = 2 + cat_features = catFeatures, + num_features = numFeatures, + size_embedding = 16, + size_hidden = 16, + num_layers = 2, + hidden_factor = 2 ) estimatorSettings <- setEstimator(learningRate = 3e-4, weightDecay = 0.0, batchSize = 128, epochs = 5, - device = 'cpu') - -estimator <- Estimator$new( - modelType = modelType, - modelParameters = modelParameters, - estimatorSettings = estimatorSettings -) + device = 'cpu', + seed=42, + optimizer=torch$optim$AdamW, + criterion=torch$nn$BCEWithLogitsLoss, + metric='loss', + scheduler= list(fun=torch$optim$lr_scheduler$ReduceLROnPlateau, + params=list(patience=1)), + earlyStopping=NULL) + +modelType <- "ResNet" +estimator <- createEstimator(modelType = modelType, + modelParameters = modelParameters, + estimatorSettings = estimatorSettings) test_that("Estimator initialization works", { # count parameters in both 
instances + path <- system.file("python", package = "DeepPatientLevelPrediction") + ResNet <- reticulate::import_from_path(modelType, path=path)[[modelType]] + testthat::expect_equal( - sum(sapply(estimator$model$parameters, function(x) prod(x$shape))), - sum(sapply(do.call(modelType, modelParameters)$parameters, function(x) prod(x$shape))) + sum(reticulate::iterate(estimator$model$parameters(), function(x) x$numel())), + sum(reticulate::iterate(do.call(ResNet, modelParameters)$parameters(), + function(x) x$numel())) ) testthat::expect_equal( - estimator$modelParameters, + estimator$model_parameters, modelParameters ) +}) - # check the function that results the value from a list - val <- estimator$itemOrDefaults(list(param = 1, test = 3), "param", default = NULL) - expect_equal(val, 1) - val <- estimator$itemOrDefaults(list(param = 1, test = 3), "paramater", default = NULL) - expect_true(is.null(val)) - +test_that("Estimator detects wrong inputs", { + + testthat::expect_error(setEstimator(learningRate='notAuto')) + testthat::expect_error(setEstimator(weightDecay = -1)) + testthat::expect_error(setEstimator(weightDecay = "text")) + testthat::expect_error(setEstimator(batchSize = 0)) + testthat::expect_error(setEstimator(batchSize = "text")) + testthat::expect_error(setEstimator(epochs = 0)) + testthat::expect_error(setEstimator(epochs = "test")) + testthat::expect_error(setEstimator(device = 1)) + testthat::expect_error(setEstimator(scheduler = "notList")) + testthat::expect_error(setEstimator(earlyStopping = "notListorNull")) + testthat::expect_error(setEstimator(metric = 1)) + testthat::expect_error(setEstimator(seed = "32")) }) + sink(nullfile()) estimator$fit(small_dataset, small_dataset) sink() test_that("estimator fitting works", { - expect_true(!is.null(estimator$bestEpoch)) - expect_true(!is.null(estimator$bestScore$loss)) - expect_true(!is.null(estimator$bestScore$auc)) + expect_true(!is.null(estimator$best_epoch)) + expect_true(!is.null(estimator$best_score$loss)) + expect_true(!is.null(estimator$best_score$auc)) old_weights <- estimator$model$head$weight$mean()$item() sink(nullfile()) - estimator$fitWholeTrainingSet(small_dataset, estimator$learnRateSchedule) + estimator$fit_whole_training_set(small_dataset, estimator$learn_rate_schedule) sink() - expect_equal(estimator$optimizer$param_groups[[1]]$lr, tail(estimator$learnRateSchedule, 1)[[1]]) + expect_equal(estimator$optimizer$param_groups[[1]]$lr, tail(estimator$learn_rate_schedule, 1)[[1]]) new_weights <- estimator$model$head$weight$mean()$item() @@ -72,7 +90,7 @@ test_that("estimator fitting works", { expect_true(file.exists(file.path(testLoc, "estimator.pt"))) sink(nullfile()) - preds <- estimator$predictProba(dataset) + preds <- estimator$predict_proba(dataset) sink() expect_lt(max(preds), 1) @@ -95,11 +113,10 @@ test_that("estimator fitting works", { device = 'cpu', metric= "loss") - estimator <- Estimator$new( - modelType = modelType, - modelParameters = modelParameters, - estimatorSettings = estimatorSettings - ) + estimator <- createEstimator(modelType=modelType, + modelParameters=modelParameters, + estimatorSettings=estimatorSettings) + sink(nullfile()) estimator$fit(small_dataset, small_dataset) sink() @@ -108,24 +125,27 @@ test_that("estimator fitting works", { expect_equal(estimator$metric$name, "loss") sink(nullfile()) - estimator$fitWholeTrainingSet(small_dataset, estimator$learnRateSchedule) + estimator$fit_whole_training_set(small_dataset, estimator$learn_rate_schedule) sink() - 
expect_equal(estimator$learnRateSchedule[[estimator$bestEpoch]], + expect_equal(estimator$learn_rate_schedule[[estimator$best_epoch]], estimator$optimizer$param_groups[[1]]$lr) }) test_that("early stopping works", { - earlyStop <- EarlyStopping$new(patience = 3, delta = 0, verbose = FALSE) - testthat::expect_true(is.null(earlyStop$bestScore)) - testthat::expect_false(earlyStop$earlyStop) - earlyStop$call(0.5) - testthat::expect_equal(earlyStop$bestScore, 0.5) - earlyStop$call(0.4) - earlyStop$call(0.4) - earlyStop$call(0.4) - testthat::expect_true(earlyStop$earlyStop) + EarlyStopping <- reticulate::import_from_path("Estimator", path=path)$EarlyStopping + earlyStop <- EarlyStopping(patience = 3, delta = 0, verbose = FALSE) + + + testthat::expect_true(is.null(earlyStop$best_score)) + testthat::expect_false(earlyStop$early_stop) + earlyStop(0.5) + testthat::expect_equal(earlyStop$best_score, 0.5) + earlyStop(0.4) + earlyStop(0.4) + earlyStop(0.4) + testthat::expect_true(earlyStop$early_stop) }) modelSettings <- setResNet( @@ -176,21 +196,22 @@ test_that("predictDeepEstimator works", { }) test_that("batchToDevice works", { + batch_to_device <- reticulate::import_from_path("Estimator", path=path)$batch_to_device # since we can't test moving to gpu there is a fake device called meta # which we can use to test of the device is updated estimator$device <- "meta" b <- 1:10 - batch <- batchToDevice(dataset[b], device=estimator$device) - + batch <- batch_to_device(dataset[b], device=estimator$device) + devices <- lapply( lapply(unlist(batch, recursive = TRUE), function(x) x$device), - function(x) x == torch::torch_device(type = "meta") + function(x) x == torch$device(type = "meta") ) # test that all are meta expect_true(all(devices == TRUE)) - numDevice <- batchToDevice(dataset[b]$batch$num, device=estimator$device) - expect_true(numDevice$device==torch::torch_device(type="meta")) + numDevice <- batch_to_device(dataset[b][[1]]$num, device=estimator$device) + expect_true(numDevice$device==torch$device(type="meta")) }) test_that("Estimator without earlyStopping works", { @@ -201,17 +222,16 @@ test_that("Estimator without earlyStopping works", { epochs = 1, device = 'cpu', earlyStopping = NULL) - estimator2 <- Estimator$new( - modelType = modelType, - modelParameters = modelParameters, - estimatorSettings = estimatorSettings - ) + + estimator2 <- createEstimator(modelType = modelType, + modelParameters = modelParameters, + estimatorSettings=estimatorSettings) sink(nullfile()) estimator2$fit(small_dataset, small_dataset) sink() - expect_null(estimator2$earlyStopper) - expect_true(!is.null(estimator2$bestEpoch)) + expect_null(estimator2$early_stopper) + expect_true(!is.null(estimator2$best_epoch)) }) @@ -226,27 +246,25 @@ test_that("Early stopper can use loss and stops early", { patience=1)), metric = 'loss', seed=42) - estimator <- Estimator$new( - modelType=modelType, - modelParameters = modelParameters, - estimatorSettings = estimatorSettings - ) + estimator <- createEstimator(modelType = modelType, + modelParameters = modelParameters, + estimatorSettings = estimatorSettings) sink(nullfile()) estimator$fit(small_dataset, small_dataset) sink() - expect_true(estimator$bestEpoch < estimator$epochs) + expect_true(estimator$best_epoch < estimator$epochs) }) test_that('Custom metric in estimator works', { metric_fun <- function(predictions, labels) { - positive <- predictions[labels == 1] - negative <- predictions[labels == 0] - pr <- PRROC::pr.curve(scores.class0 = positive, scores.class1 = negative) + pr 
<- PRROC::pr.curve(scores.class0 = torch$sigmoid(predictions)$numpy(), + weights.class0 = labels$numpy()) auprc <- pr$auc.integral + reticulate::r_to_py(auprc) } estimatorSettings <- setEstimator(learningRate = 3e-4, @@ -257,10 +275,9 @@ test_that('Custom metric in estimator works', { metric=list(fun=metric_fun, name="auprc", mode="max")) - - estimator <- Estimator$new(modelType = modelType, - modelParameters = modelParameters, - estimatorSettings = estimatorSettings) + estimator <- createEstimator(modelType = modelType, + modelParameters = modelParameters, + estimatorSettings = estimatorSettings) expect_true(is.function(estimator$metric$fun)) expect_true(is.character(estimator$metric$name)) expect_true(estimator$metric$mode %in% c("min", "max")) @@ -269,7 +286,7 @@ test_that('Custom metric in estimator works', { estimator$fit(small_dataset, small_dataset) sink() - expect_true(estimator$bestScore[["auprc"]]>0) + expect_true(estimator$best_score[["auprc"]]>0) }) @@ -309,31 +326,40 @@ test_that("device as a function argument works", { dev } } - - estimatorSettings <- setEstimator(device=getDevice) + + estimatorSettings <- setEstimator(device=getDevice, + learningRate = 3e-4) model <- setDefaultResNet(estimatorSettings = estimatorSettings) model$param[[1]]$catFeatures <- 10 - - estimator <- Estimator$new(modelType="ResNet", - modelParameters = model$param[[1]], - estimatorSettings = estimatorSettings) + + estimator <- createEstimator(modelType = modelType, + modelParameters = model$param[[1]], + estimatorSettings = estimatorSettings) expect_equal(estimator$device, "cpu") Sys.setenv("testDeepPLPDevice" = "meta") - estimatorSettings <- setEstimator(device=getDevice) + estimatorSettings <- setEstimator(device=getDevice, + learningRate = 3e-4) model <- setDefaultResNet(estimatorSettings = estimatorSettings) model$param[[1]]$catFeatures <- 10 - estimator <- Estimator$new(modelType="ResNet", - modelParameters = model$param[[1]], - estimatorSettings = estimatorSettings) + estimator <- createEstimator(modelType = modelType, + modelParameters = model$param[[1]], + estimatorSettings = estimatorSettings) expect_equal(estimator$device, "meta") Sys.unsetenv("testDeepPLPDevice") }) + +# test_that("estimatorSettings can be saved and loaded with correct objects", { +# settings <- setEstimator() +# +# saveRDS(settings,file=file.path(testLoc, 'settings.RDS')) +# +# }) \ No newline at end of file diff --git a/tests/testthat/test-LRFinder.R b/tests/testthat/test-LRFinder.R index f6c263e..0a5d700 100644 --- a/tests/testthat/test-LRFinder.R +++ b/tests/testthat/test-LRFinder.R @@ -1,18 +1,23 @@ +ResNet <- reticulate::import_from_path("ResNet", path)$ResNet +lrFinderClass <- reticulate::import_from_path("LrFinder", path=path)$LrFinder + test_that("LR scheduler that changes per batch works", { - model <- ResNet(catFeatures = 10, numFeatures = 1, - sizeEmbedding = 32, sizeHidden = 64, - numLayers = 1, hiddenFactor = 1) - optimizer <- torch::optim_adamw(model$parameters, lr=1e-7) + model <- ResNet(cat_features = 10L, num_features = 1L, + size_embedding = 32L, size_hidden = 64L, + num_layers = 1L, hidden_factor = 1L) + optimizer <- torch$optim$AdamW(model$parameters(), lr=1e-7) - scheduler <- lrPerBatch(optimizer, - startLR = 1e-7, - endLR = 1e-2, - nIters = 5) + ExponentialSchedulerPerBatch <- reticulate::import_from_path("LrFinder", + path=path)$ExponentialSchedulerPerBatch + scheduler <- ExponentialSchedulerPerBatch(optimizer, + end_lr = 1e-2, + num_iter = 5) expect_equal(scheduler$last_epoch, 0) 
expect_equal(scheduler$optimizer$param_groups[[1]]$lr, 1e-7) for (i in 1:5) { + optimizer$step() scheduler$step() } @@ -23,19 +28,21 @@ test_that("LR scheduler that changes per batch works", { test_that("LR finder works", { - - lr <- lrFinder(dataset, modelType = ResNet, modelParams = list(catFeatures=dataset$numCatFeatures(), - numFeatures=dataset$numNumFeatures(), - sizeEmbedding=32, - sizeHidden=64, - numLayers=1, - hiddenFactor=1), - estimatorSettings = setEstimator(batchSize=32, - seed = 42), - minLR = 3e-4, - maxLR = 10.0, - numLR = 20, - divergenceThreshold = 1.1) + lrFinder <- createLRFinder(modelType="ResNet", + modelParameters = list(cat_features=length(dataset$get_cat_features()), + num_features=length(dataset$get_numerical_features()), + size_embedding=32L, + size_hidden=64L, + num_layers=1L, + hidden_factor=1L), + estimatorSettings = setEstimator(batchSize = 32L, + seed=42), + lrSettings = list(minLr=3e-4, + maxLr=10.0, + numLr=20L, + divergenceThreshold=1.1)) + + lr <- lrFinder$get_lr(dataset) expect_true(lr<=10.0) expect_true(lr>=3e-4) @@ -46,19 +53,23 @@ test_that("LR finder works with device specified by a function", { deviceFun <- function(){ dev = "cpu" } - lr <- lrFinder(dataset, modelType = ResNet, modelParams = list(catFeatures=dataset$numCatFeatures(), - numFeatures=dataset$numNumFeatures(), - sizeEmbedding=8, - sizeHidden=16, - numLayers=1, - hiddenFactor=1), - estimatorSettings = setEstimator(batchSize=32, + lrFinder <- createLRFinder(model = "ResNet", + modelParameters = list(cat_features=length(dataset$get_cat_features()), + num_features=length(dataset$get_numerical_features()), + size_embedding=8L, + size_hidden=16L, + num_layers=1L, + hidden_factor=1L), + estimatorSettings = setEstimator(batchSize=32L, seed = 42, device = deviceFun), - minLR = 3e-4, - maxLR = 10.0, - numLR = 20, - divergenceThreshold = 1.1) + lrSettings = list(minLr=3e-4, + maxLr=10.0, + numLr=20L, + divergenceThreshold=1.1) + ) + lr <- lrFinder$get_lr(dataset) + expect_true(lr<=10.0) expect_true(lr>=3e-4) diff --git a/tests/testthat/test-MLP.R b/tests/testthat/test-MLP.R index 4daa6aa..5cbd7a5 100644 --- a/tests/testthat/test-MLP.R +++ b/tests/testthat/test-MLP.R @@ -1,9 +1,9 @@ modelSettings <- setMultiLayerPerceptron( - numLayers = c(2), - sizeHidden = c(32), + numLayers = 2, + sizeHidden = 32, dropout = c(0.1), - sizeEmbedding = c(32), + sizeEmbedding = 32, estimatorSettings = setEstimator( learningRate=c(3e-4), weightDecay = c(1e-6), @@ -82,40 +82,48 @@ test_that("MLP with runPlp working checks", { test_that("MLP nn-module works ", { + MLP <- reticulate::import_from_path("MLP", path=path)$MLP model <- MLP( - catFeatures = 5, numFeatures = 1, sizeEmbedding = 5, - sizeHidden = 16, numLayers = 1, - activation = torch::nn_relu, - normalization = torch::nn_batch_norm1d, dropout = 0.3, - d_out = 1 + cat_features = 5, + num_features = 1, + size_embedding = 5, + size_hidden = 16, + num_layers = 1, + activation = torch$nn$ReLU, + normalization = torch$nn$BatchNorm1d, + dropout = 0.3 ) - pars <- sum(sapply(model$parameters, function(x) prod(x$shape))) + pars <- sum(reticulate::iterate(model$parameters(), function(x) x$numel())) # expected number of parameters expect_equal(pars, 489) input <- list() - input$cat <- torch::torch_randint(0, 5, c(10, 5), dtype = torch::torch_long()) - input$num <- torch::torch_randn(10, 1, dtype = torch::torch_float32()) + input$cat <- torch$randint(0L, 5L, c(10L, 5L), dtype=torch$long) + input$num <- torch$randn(10L, 1L, dtype=torch$float32) output <- model(input) # 
output is correct shape - expect_equal(output$shape, 10) + expect_equal(output$shape[0], 10L) input$num <- NULL model <- MLP( - catFeatures = 5, numFeatures = 0, sizeEmbedding = 5, - sizeHidden = 16, numLayers = 1, - activation = torch::nn_relu, - normalization = torch::nn_batch_norm1d, dropout = 0.3, - d_out = 1 + cat_features = 5L, + num_features = 0, + size_embedding = 5L, + size_hidden = 16L, + num_layers = 1L, + activation = torch$nn$ReLU, + normalization = torch$nn$BatchNorm1d, + dropout = 0.3, + d_out = 1L ) output <- model(input) # model works without numeric variables - expect_equal(output$shape, 10) + expect_equal(output$shape[0], 10L) }) @@ -127,11 +135,13 @@ test_that("Errors are produced by settings function", { sizeHidden = 128, dropout = 0.0, sizeEmbedding = 128, - weightDecay = 1e-6, - learningRate = 0.01, - seed = 42, hyperParamSearch = 'random', - randomSample = randomSample)) + estimatorSettings = setEstimator( + learningRate = 'auto', + weightDecay = c(1e-3), + batchSize = 1024, + epochs = 30, + device="cpu"))) }) diff --git a/tests/testthat/test-ResNet.R b/tests/testthat/test-ResNet.R index 7a380a4..29dfb8e 100644 --- a/tests/testthat/test-ResNet.R +++ b/tests/testthat/test-ResNet.R @@ -1,11 +1,11 @@ resSet <- setResNet( - numLayers = c(2), - sizeHidden = c(32), - hiddenFactor = c(2), - residualDropout = c(0.1), - hiddenDropout = c(0.1), - sizeEmbedding = c(32), + numLayers = 2, + sizeHidden = 32, + hiddenFactor = 2, + residualDropout = 0.1, + hiddenDropout = 0.1, + sizeEmbedding = 32, estimatorSettings = setEstimator(learningRate="auto", weightDecay = c(1e-6), seed=42, @@ -22,12 +22,12 @@ test_that("setResNet works", { testthat::expect_true(length(resSet$param) > 0) - expect_error(setResNet(numLayers = c(2), - sizeHidden = c(32), - hiddenFactor = c(2), - residualDropout = c(0.1), - hiddenDropout = c(0.1), - sizeEmbedding = c(32), + expect_error(setResNet(numLayers = 2, + sizeHidden = 32, + hiddenFactor = 2, + residualDropout = 0.1, + hiddenDropout = 0.1, + sizeEmbedding = 32, estimatorSettings = setEstimator(learningRate=c(3e-4), weightDecay = c(1e-6), seed=42, @@ -89,40 +89,51 @@ test_that("ResNet with runPlp working checks", { test_that("ResNet nn-module works ", { + ResNet <- reticulate::import_from_path("ResNet", path=path)$ResNet model <- ResNet( - catFeatures = 5, numFeatures = 1, sizeEmbedding = 5, - sizeHidden = 16, numLayers = 1, hiddenFactor = 2, - activation = torch::nn_relu, - normalization = torch::nn_batch_norm1d, hiddenDropout = 0.3, - residualDropout = 0.3, d_out = 1 + cat_features = 5, + num_features = 1, + size_embedding = 5, + size_hidden = 16, + num_layers = 1, + hidden_factor = 2, + activation = torch$nn$ReLU, + normalization = torch$nn$BatchNorm1d, + hidden_dropout = 0.3, + residual_dropout = 0.3 ) - pars <- sum(sapply(model$parameters, function(x) prod(x$shape))) + pars <- sum(reticulate::iterate(model$parameters(), function(x) x$numel())) # expected number of parameters expect_equal(pars, 1295) input <- list() - input$cat <- torch::torch_randint(0, 5, c(10, 5), dtype = torch::torch_long()) - input$num <- torch::torch_randn(10, 1, dtype = torch::torch_float32()) + input$cat <- torch$randint(0L, 5L, c(10L, 5L), dtype = torch$long) + input$num <- torch$randn(10L, 1L, dtype = torch$float32) output <- model(input) # output is correct shape - expect_equal(output$shape, 10) + expect_equal(output$shape[0], 10L) input$num <- NULL model <- ResNet( - catFeatures = 5, numFeatures = 0, sizeEmbedding = 5, - sizeHidden = 16, numLayers = 1, hiddenFactor = 2, - 
activation = torch::nn_relu, - normalization = torch::nn_batch_norm1d, hiddenDropout = 0.3, - residualDropout = 0.3, d_out = 1 + cat_features = 5, + num_features = 0, + size_embedding = 5, + size_hidden = 16, + num_layers = 1, + hidden_factor = 2, + activation = torch$nn$ReLU, + normalization = torch$nn$BatchNorm1d, + hidden_dropout = 0.3, + residual_dropout = 0.3 ) output <- model(input) # model works without numeric variables - expect_equal(output$shape, 10) + expect_equal(output$shape[0], 10L) }) test_that("Default Resnet works", { @@ -148,9 +159,9 @@ test_that("Errors are produced by settings function", { residualDropout = 0.0, hiddenDropout = 0.0, sizeEmbedding = 128, - weightDecay = 1e-6, - learningRate = 0.01, - seed = 42, + estimatorSettings = setEstimator(weightDecay = 1e-6, + learningRate = 0.01, + seed = 42), hyperParamSearch = 'random', randomSample = randomSample)) }) diff --git a/tests/testthat/test-TrainingCache.R b/tests/testthat/test-TrainingCache.R index feca8a4..eb4ab17 100644 --- a/tests/testthat/test-TrainingCache.R +++ b/tests/testthat/test-TrainingCache.R @@ -1,17 +1,17 @@ resNetSettings <- setResNet(numLayers = c(1, 2, 4), - sizeHidden = 2^6, + sizeHidden = 64, hiddenFactor = 1, residualDropout = 0.5, hiddenDropout = 0.5, - sizeEmbedding = 2^6, - estimatorSettings = setEstimator(learningRate='auto', + sizeEmbedding = 64, + estimatorSettings = setEstimator(learningRate=3e-4, weightDecay=1e-3, device='cpu', - batchSize=1024, - epochs=30, + batchSize=64, + epochs=1, seed=NULL), hyperParamSearch = "random", - randomSample = 2, + randomSample = 3, randomSampleSeed = NULL) trainCache <- TrainingCache$new(testLoc) @@ -84,4 +84,6 @@ test_that("Estimator can resume training from cache", { } ) sink() + trainCache <- TrainingCache$new(analysisPath) + testthat::expect_equal(is.na(trainCache$getLastGridSearchIndex()), TRUE) }) diff --git a/tests/testthat/test-Transformer.R b/tests/testthat/test-Transformer.R index 94c2e48..b3e421f 100644 --- a/tests/testthat/test-Transformer.R +++ b/tests/testthat/test-Transformer.R @@ -1,7 +1,12 @@ settings <- setTransformer( - numBlocks = 1, dimToken = 8, dimOut = 1, - numHeads = 2, attDropout = 0.0, ffnDropout = 0.2, - resDropout = 0.0, dimHidden = 32, + numBlocks = 1, + dimToken = 8, + dimOut = 1, + numHeads = 2, + attDropout = 0.0, + ffnDropout = 0.2, + resDropout = 0.0, + dimHidden = 32, estimatorSettings = setEstimator(learningRate = 3e-4, batchSize=64, epochs=1), @@ -39,35 +44,48 @@ test_that("fitEstimator with Transformer works", { }) test_that("transformer nn-module works", { + Transformer <- reticulate::import_from_path("Transformer", path=path)$Transformer model <- Transformer( - catFeatures = 5, numFeatures = 1, numBlocks = 2, - dimToken = 16, numHeads = 2, attDropout = 0, ffnDropout = 0, - resDropout = 0, dimHidden = 32 + cat_features = 5, + num_features = 1, + num_blocks = 2, + dim_token = 16, + num_heads = 2, + att_dropout = 0, + ffn_dropout = 0, + res_dropout = 0, + dim_hidden = 32 ) - pars <- sum(sapply(model$parameters, function(x) prod(x$shape))) + pars <- sum(reticulate::iterate(model$parameters(), function(x) x$numel())) # expected number of parameters expect_equal(pars, 5697) input <- list() - input$cat <- torch::torch_randint(0, 5, c(10, 5), dtype = torch::torch_long()) - input$num <- torch::torch_randn(10, 1, dtype = torch::torch_float32()) + input$cat <- torch$randint(0L, 5L, c(10L, 5L), dtype = torch$long) + input$num <- torch$randn(10L, 1L, dtype = torch$float32) output <- model(input) # output is correct shape, size 
of batch - expect_equal(output$shape, 10) + expect_equal(output$shape[0], 10L) input$num <- NULL model <- Transformer( - catFeatures = 5, numFeatures = 0, numBlocks = 2, - dimToken = 16, numHeads = 2, attDropout = 0, ffnDropout = 0, - resDropout = 0, dimHidden = 32 + cat_features = 5, + num_features = 0, + num_blocks = 2, + dim_token = 16, + num_heads = 2, + att_dropout = 0, + ffn_dropout = 0, + res_dropout = 0, + dim_hidden = 32 ) output <- model(input) - expect_equal(output$shape, 10) + expect_equal(output$shape[0], 10L) }) test_that("Default Transformer works", { @@ -90,3 +108,21 @@ test_that("Errors are produced by settings function", { expect_error(setTransformer(randomSample = randomSample)) }) + +test_that("dimHidden ratio works as expected", { + randomSample <- 4 + dimToken <- c(64, 128, 256, 512) + dimHiddenRatio <- 2 + modelSettings <- setTransformer(dimToken = dimToken, + dimHiddenRatio = dimHiddenRatio, + dimHidden = NULL, + randomSample = randomSample) + dimHidden <- unlist(lapply(modelSettings$param, function(x) x$dimHidden)) + tokens <- unlist(lapply(modelSettings$param, function(x) x$dimToken)) + testthat::expect_true(all(dimHidden == dimHiddenRatio * tokens)) + testthat::expect_error(setTransformer(dimHidden = NULL, + dimHiddenRatio = NULL)) + testthat::expect_error(setTransformer(dimHidden = 256, + dimHiddenRatio = 4/3)) + +}) diff --git a/vignettes/BuildingDeepModels.Rmd b/vignettes/BuildingDeepModels.Rmd index adbfd98..e95e052 100644 --- a/vignettes/BuildingDeepModels.Rmd +++ b/vignettes/BuildingDeepModels.Rmd @@ -228,18 +228,18 @@ combinations are 2*2*2*2 or 16 but specify ```randomSample=10``` to only try ```{r, eval=FALSE} modelSettings <- setMultiLayerPerceptron( - numLayers = c(3, 5), - sizeHidden = c(64, 128), + numLayers = c(3L, 5L), + sizeHidden = c(64L, 128L), dropout = c(0.2), - sizeEmbedding = c(32, 64), + sizeEmbedding = c(32L, 64L), estimatorSettings = setEstimator( learningRate = c(1e-3, 1e-4), weightDecay = c(1e-5), - batchSize = c(128), - epochs=c(5), - seed=12 + batchSize = c(128L), + epochs=c(5L), + seed=12L ), - randomSample=10 + randomSample=10L ) mlpResult <- PatientLevelPrediction::runPlp( @@ -347,18 +347,18 @@ input only includes one option. 
```{r, eval=FALSE} resset <- setResNet( - numLayers = c(2), - sizeHidden = c(32), - hiddenFactor = c(2), + numLayers = c(2L), + sizeHidden = c(32L), + hiddenFactor = c(2L), residualDropout = c(0.1), hiddenDropout = c(0.1), - sizeEmbedding = c(32), + sizeEmbedding = c(32L), estimatorSettings = setEstimator(learningRate = c(3e-4), weightDecay = c(1e-6), #device='cuda:0', # uncomment to use GPU - batchSize = 128, - epochs = 3, - seed = 42), + batchSize = 128L, + epochs = 3L, + seed = 42L), hyperParamSearch = 'random', randomSample = 1 ) @@ -438,22 +438,22 @@ block ```{r, eval=FALSE} -modelSettings <- setTransformer(numBlocks = 3, - dimToken = 32, +modelSettings <- setTransformer(numBlocks = 3L, + dimToken = 32L, dimOut = 1, - numHeads = 4, + numHeads = 4L, attDropout = 0.25, ffnDropout = 0.25, resDropout = 0, - dimHidden = 128, + dimHidden = 128L, estimatorSettings = setEstimator( learningRate = 3e-4, weightDecay = 1e-6, - batchSize = 128, - epochs = 10, + batchSize = 128L, + epochs = 10L, device = 'cpu' ), - randomSample=1) + randomSample=1L) diff --git a/vignettes/FirstModel.Rmd b/vignettes/FirstModel.Rmd index 4aaeee3..789ec26 100644 --- a/vignettes/FirstModel.Rmd +++ b/vignettes/FirstModel.Rmd @@ -120,8 +120,8 @@ library(DeepPatientLevelPrediction) modelSettings <- setDefaultResNet( estimatorSettings = setEstimator(learningRate=3e-4, device="cpu", - batchSize=256, - epochs=3) + batchSize=256L, + epochs=3L) ) ``` diff --git a/vignettes/Installing.Rmd b/vignettes/Installing.Rmd index 8d77599..5b88419 100644 --- a/vignettes/Installing.Rmd +++ b/vignettes/Installing.Rmd @@ -45,6 +45,7 @@ This vignette describes how you need to install the Observational Health Data Sc Under Windows the OHDSI Deep Patient Level Prediction (DeepPLP) package requires installing: - R ( ) - (R \>= 4.0.0, but latest is recommended) +- Python - The package is tested with python 3.10, but \>= 3.8 should work - Rstudio ( ) - Java ( ) - RTools () @@ -54,6 +55,7 @@ Under Mac and Linux the OHDSI deepPLP package requires installing: - R ( ) - (R \>= 4.0.0, but latest is recommended) +- Python - The package is tested with python 3.10, but \>= 3.8 should work - Rstudio ( ) - Java ( ) - Xcode command line tools(run in terminal: xcode-select --install) [MAC USERS ONLY] @@ -66,6 +68,25 @@ If you do not want the official release you could install the bleeding edge vers Note that the latest develop branch could contain bugs, please report them to us if you experience problems. +## Installing Python environment + +Since the package uses `pytorch` through `reticulate`, a working python installation is required. The package is tested with python 3.10. An easy way to install python is to use miniconda through `reticulate`: + +```{r, echo = TRUE, message = FALSE, warning = FALSE,tidy=FALSE,eval=FALSE} +install.packages('reticulate') +reticulate::install_miniconda() +``` + +By default `install_miniconda()` creates an environment `r-reticulate` with `python 3.9`. To use `python 3.10` instead, we can do: + +```{r, echo = TRUE, message = FALSE, warning = FALSE,tidy=FALSE,eval=FALSE} +reticulate::conda_install(envname = 'r-reticulate', packages=c('python=3.10')) +``` + +Then we can install `DeepPatientLevelPrediction`, and it should install the required python packages in this environment.
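(Editor's note, not part of the diff above.) The Installing vignette hunk just added creates an `r-reticulate` conda environment and expects the package to pull in its python dependencies on installation. A minimal sketch of how one might verify that setup from R, assuming the `r-reticulate` environment created above; `use_condaenv()`, `py_config()` and `py_module_available()` are standard reticulate functions, and `torch` is the python module the package now uses through reticulate:

```r
# Sanity check, assuming the r-reticulate conda environment created above
library(reticulate)
use_condaenv("r-reticulate", required = TRUE)  # bind this R session to that environment
py_config()                                    # should report the python 3.10 from r-reticulate
py_module_available("torch")                   # TRUE once the python dependencies are installed
```

If `py_module_available("torch")` returns `FALSE`, reinstalling `DeepPatientLevelPrediction` (or adding the missing modules manually with `reticulate::py_install()`) should resolve it.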
+ +If instead you want to use a specific python environment you can set the environment variable `RETICULATE_PYTHON` to point to the python executable of that environment in your `.Renviron` file. You need to do this before installing `DeepPatientLevelPrediction`. + ## Installing DeepPatientLevelPrediction using remotes To install using `remotes` run: @@ -77,12 +98,6 @@ remotes::install_github("OHDSI/PatientLevelPrediction") remotes::install_github("OHDSI/DeepPatientLevelPrediction") ``` -DeepPLP relies on [torch for R](https://torch.mlverse.org/). When torch is installed the user -will be prompted if libtorch and lantern binaries should be downloaded. These binaries are necessary -for the package to run. - -If you are using DeepPLP in an offline environment the function `torch::install_torch_from_file()` can be used. This will first require to download and move the correct binaries to the offline environment. See [torch installation guide](https://torch.mlverse.org/docs/articles/installation.html) for more detailed instructions. - When installing make sure to close any other Rstudio sessions that are using `DeepPatientLevelPrediction` or any dependency. Keeping Rstudio sessions open can cause locks on windows that prevent the package installing. # Testing Installation @@ -103,20 +118,20 @@ populationSettings <- PatientLevelPrediction::createStudyPopulationSettings( riskWindowStart = 1, riskWindowEnd = 365) # a very simple resnet -modelSettings <- setResNet(numLayers = 2, - sizeHidden = 64, - hiddenFactor = 1, +modelSettings <- setResNet(numLayers = 2L, + sizeHidden = 64L, + hiddenFactor = 1L, residualDropout = 0, hiddenDropout = 0.2, - sizeEmbedding = 64, + sizeEmbedding = 64L, estimatorSettings = setEstimator(learningRate = 3e-4, weightDecay = 1e-6, device='cpu', - batchSize=128, - epochs=3, + batchSize=128L, + epochs=3L, seed = 42), hyperParamSearch = 'random', - randomSample = 1) + randomSample = 1L) plpResults <- PatientLevelPrediction::runPlp(plpData = plpData, outcomeId = 3, @@ -148,4 +163,4 @@ citation("DeepPatientLevelPrediction") **Please reference this paper if you use the PLP Package in your work:** -[Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data. J Am Med Inform Assoc. 2018;25(8):969-975.](http://dx.doi.org/10.1093/jamia/ocy032) \ No newline at end of file +[Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data. J Am Med Inform Assoc. 2018;25(8):969-975.](http://dx.doi.org/10.1093/jamia/ocy032)
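(Editor's note, not part of the diff.) For the `RETICULATE_PYTHON` route described in the Installing vignette above, a hypothetical illustration; the path below is a placeholder for your own environment's python executable, and `usethis` is only an optional convenience for opening `.Renviron`:

```r
# Hypothetical .Renviron entry; the path is a placeholder, point it at the python
# executable of the environment you want to use, before installing the package:
# RETICULATE_PYTHON=/path/to/your/env/bin/python

# One convenient way to open .Renviron for editing (assumes usethis is installed):
usethis::edit_r_environ()
```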