diff --git a/.gitignore b/.gitignore
index 56db315..94cb359 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,4 +7,5 @@ config.yml
 docs
 .idea/
 renv.lock
-extras/
\ No newline at end of file
+extras/
+.Renviron
\ No newline at end of file
diff --git a/DESCRIPTION b/DESCRIPTION
index 0df769f..d4c1a53 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: DeepPatientLevelPrediction
 Type: Package
 Title: Deep Learning For Patient Level Prediction Using Data In The OMOP Common Data Model
-Version: 2.0.0
+Version: 2.0.1
 Date: 18-04-2023
 Authors@R: c(
     person("Egill", "Fridgeirsson", email = "e.fridgeirsson@erasmusmc.nl", role = c("aut", "cre")),
diff --git a/NEWS.md b/NEWS.md
index 3da704b..70ccf12 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,15 @@
+DeepPatientLevelPrediction 2.0.1
+======================
+  - Connection parameter fixed to be in line with newest polars
+  - Fixed a bug where LRFinder used a hardcoded batch size
+  - Seed is now used in LRFinder so it's reproducible
+  - Fixed a bug in NumericalEmbedding
+  - Fixed a bug for Transformer and numerical features
+  - Fixed a bug when resuming from a full TrainingCache (thanks Zoey Jiang and Linying Zhang)
+  - Updated installation documentation after feedback from HADES hackathon
+  - Fixed a bug where order of numeric features wasn't conserved between training and test set
+  - TrainingCache now only saves prediction dataframe for the best performing model 
+
 DeepPatientLevelPrediction 2.0.0
 ======================
   - New backend which uses pytorch through reticulate instead of torch in R
diff --git a/R/Estimator.R b/R/Estimator.R
index f804368..7705279 100644
--- a/R/Estimator.R
+++ b/R/Estimator.R
@@ -310,7 +310,8 @@ gridCvDeep <- function(mappedData,
   
   fitParams <- names(paramSearch[[1]])[grepl("^estimator", names(paramSearch[[1]]))]
   findLR <- modelSettings$estimatorSettings$findLR
-  for (gridId in trainCache$getLastGridSearchIndex():length(paramSearch)) {
+  if (!trainCache$isFull()) {
+    for (gridId in trainCache$getLastGridSearchIndex():length(paramSearch)) {
     ParallelLogger::logInfo(paste0("Running hyperparameter combination no ", gridId))
     ParallelLogger::logInfo(paste0("HyperParameters: "))
     ParallelLogger::logInfo(paste(names(paramSearch[[gridId]]), paramSearch[[gridId]], collapse = " | "))
@@ -363,25 +364,38 @@ gridCvDeep <- function(mappedData,
       )
     }
     maxIndex <- which.max(unlist(sapply(learnRates, `[`, 2)))
-    paramSearch[[gridId]]$learnSchedule <- learnRates[[maxIndex]]
-    
     gridSearchPredictons[[gridId]] <- list(
       prediction = prediction,
-      param = paramSearch[[gridId]]
+      param = paramSearch[[gridId]],
+      gridPerformance =  PatientLevelPrediction::computeGridPerformance(prediction, paramSearch[[gridId]])
     )
+    gridSearchPredictons[[gridId]]$gridPerformance$hyperSummary$learnRates <- rep(list(unlist(learnRates[[maxIndex]]$LRs)), 
+                                                                                  nrow(gridSearchPredictons[[gridId]]$gridPerformance$hyperSummary))   
+    gridSearchPredictons[[gridId]]$param$learnSchedule <- learnRates[[maxIndex]]
+    
 
+    # remove all predictions that are not the max performance
+    indexOfMax <- which.max(unlist(lapply(gridSearchPredictons, function(x) x$gridPerformance$cvPerformance)))
+    for (i in seq_along(gridSearchPredictons)) {
+      if (!is.null(gridSearchPredictons[[i]])) {
+        if (i != indexOfMax) {
+          gridSearchPredictons[[i]]$prediction <- list(NULL)
+        }
+      }
+    }
+    ParallelLogger::logInfo(paste0("Caching all grid search results and prediction for best combination ", indexOfMax))
     trainCache$saveGridSearchPredictions(gridSearchPredictons)
   }
+  }
+  paramGridSearch <- lapply(gridSearchPredictons, function(x) x$gridPerformance)
+  # get best params
+  indexOfMax <- which.max(unlist(lapply(gridSearchPredictons, function(x) x$gridPerformance$cvPerformance)))
+  finalParam <- gridSearchPredictons[[indexOfMax]]$param
+
+  paramGridSearch <- lapply(gridSearchPredictons, function(x) x$gridPerformance)
   
-  # get best para (this could be modified to enable any metric instead of AUC, just need metric input in function)
-  paramGridSearch <- lapply(gridSearchPredictons, function(x) {
-    do.call(PatientLevelPrediction::computeGridPerformance, x)
-  }) # cvAUCmean, cvAUC, param
-  
-  optimalParamInd <- which.max(unlist(lapply(paramGridSearch, function(x) x$cvPerformance)))
-  finalParam <- paramGridSearch[[optimalParamInd]]$param
-  
-  cvPrediction <- gridSearchPredictons[[optimalParamInd]]$prediction
+  # get best CV prediction
+  cvPrediction <- gridSearchPredictons[[indexOfMax]]$prediction
   cvPrediction$evaluationType <- "CV"
   
   ParallelLogger::logInfo("Training final model using optimal parameters")
diff --git a/R/TrainingCache-class.R b/R/TrainingCache-class.R
index 8577f31..be626d5 100644
--- a/R/TrainingCache-class.R
+++ b/R/TrainingCache-class.R
@@ -69,6 +69,13 @@ TrainingCache <- R6::R6Class(
       return(private$.paramPersistence$gridSearchPredictions)
     },
     
+    #' @description
+    #' Check if cache is full, i.e. every grid search entry has a computed `gridPerformance`
+    #' @returns Boolean
+    isFull = function() {
+      all(vapply(private$.paramPersistence$gridSearchPredictions, function(entry) !is.null(entry$gridPerformance), logical(1)))
+    },
+    
     #' @description
     #' Gets the last index from the cached grid search
     #' @returns Last grid search index
diff --git a/extras/example.R b/extras/example.R
deleted file mode 100644
index 99a6c08..0000000
--- a/extras/example.R
+++ /dev/null
@@ -1,73 +0,0 @@
-# testing code (requires sequential branch of FeatureExtraction):
-# rm(list = ls())
-library(PatientLevelPrediction)
-library(DeepPatientLevelPrediction)
-
-data(plpDataSimulationProfile)
-sampleSize <- 1e3
-plpData <- simulatePlpData(
-   plpDataSimulationProfile,
-   n = sampleSize
- )
-
-
-populationSet <- PatientLevelPrediction::createStudyPopulationSettings(
-  requireTimeAtRisk = F, 
-  riskWindowStart = 1, 
-  riskWindowEnd = 365*5)
-
-# 
-# modelSettings <- setDefaultTransformer(estimatorSettings = setEstimator(
-#   learningRate = "auto",
-#   batchSize=64L,
-#   epochs = 10L
-# ))
-
-modelSettings <- setDefaultResNet(estimatorSettings = setEstimator(
-  learningRate = "auto",
-  weightDecay = 1e-06,
-  device="cuda:0",
-  batchSize=128L,
-  epochs=50L,
-  seed=42
-))
-
-modelSettings <- setResNet(numLayers = c(1L, 2L),
-                           sizeHidden = 72L,
-                           hiddenFactor = 1L,
-                           residualDropout = 0.0,
-                           hiddenDropout = 0.0,
-                           sizeEmbedding = 64L,
-                           estimatorSettings = setEstimator(
-                             learningRate = 3e-4,
-                             batchSize = 128L,
-                             epochs = 10L,
-                             device = "cpu",
-                             seed = 42
-                           ),
-                           randomSample = 2)
-
-res2 <- PatientLevelPrediction::runPlp(
-  plpData = plpData,
-  outcomeId = unique(plpData$outcomes$outcomeId)[[1]],
-  modelSettings = modelSettings,
-  analysisId = 'Test',
-  analysisName = 'Testing DeepPlp',
-  populationSettings = populationSet,
-  splitSettings = createDefaultSplitSetting(splitSeed = 123),
-  sampleSettings = createSampleSettings("underSample"),  # none
-  featureEngineeringSettings = createFeatureEngineeringSettings(), # none
-  preprocessSettings = createPreprocessSettings(normalize = F),
-  logSettings = createLogSettings(verbosity='TRACE'),
-  executeSettings = createExecuteSettings(
-    runSplitData = T,
-    runSampleData = T,
-    runfeatureEngineering = F,
-    runPreprocessData = T,
-    runModelDevelopment = T,
-    runCovariateSummary = F
-  ),
-  saveDirectory = '~/test/resnet/'
-)
-
-
diff --git a/inst/python/Dataset.py b/inst/python/Dataset.py
index 15d749b..98d9ed3 100644
--- a/inst/python/Dataset.py
+++ b/inst/python/Dataset.py
@@ -21,7 +21,7 @@ def __init__(self,
         if pathlib.Path(data).suffix == '.sqlite':
             data = urllib.parse.quote(data)
             data = pl.read_database("SELECT * from covariates",
-                                    connection_uri=f"sqlite://{data}").lazy()
+                                    connection=f"sqlite://{data}").lazy()
         else:
             data = pl.scan_ipc(pathlib.Path(data).joinpath('covariates/*.arrow'))
         observations = data.select(pl.col('rowId').max()).collect()[0, 0]
@@ -67,7 +67,7 @@ def __init__(self,
         if pl.count(self.numerical_features) == 0:
             self.num = None
         else:
-            numerical_data = data.filter(pl.col('columnId').is_in(self.numerical_features)). \
+            numerical_data = data.filter(pl.col('columnId').is_in(self.numerical_features)).sort(by='columnId'). \
                 with_row_count('newColumnId').with_columns(pl.col('newColumnId').first().over('columnId').
                                                            rank(method="dense") - 1, pl.col('rowId') - 1) \
                 .select(pl.col('rowId'), pl.col('newColumnId').alias('columnId'), pl.col('covariateValue')).collect()
diff --git a/inst/python/LrFinder.py b/inst/python/LrFinder.py
index 4c24a38..9d5bd0c 100644
--- a/inst/python/LrFinder.py
+++ b/inst/python/LrFinder.py
@@ -37,6 +37,7 @@ def __init__(self,
         smooth = lr_settings.get("smooth", 0.05)
         divergence_threshold = lr_settings.get("divergence_threshold", 4)
         torch.manual_seed(seed=estimator_settings["seed"])
+        self.seed = estimator_settings["seed"]
         self.model = model(**model_parameters)
         if callable(estimator_settings["device"]):
             self.device = estimator_settings["device"]()
@@ -55,18 +56,18 @@ def __init__(self,
         self.scheduler = ExponentialSchedulerPerBatch(self.optimizer, self.max_lr, self.num_lr)
 
         self.criterion = estimator_settings["criterion"]()
-        self.batch_size = estimator_settings['batch_size']
+        self.batch_size = int(estimator_settings['batch_size'])
         self.losses = None
         self.loss_index = None
 
     def get_lr(self, dataset):
         batch_index = torch.arange(0, len(dataset), 1).tolist()
-
+        random.seed(self.seed)
         losses = torch.empty(size=(self.num_lr,), dtype=torch.float)
         lrs = torch.empty(size=(self.num_lr,), dtype=torch.float)
         for i in tqdm(range(self.num_lr)):
             self.optimizer.zero_grad()
-            random_batch = random.sample(batch_index, 32)
+            random_batch = random.sample(batch_index, self.batch_size)
             batch = dataset[random_batch]
             batch = batch_to_device(batch, self.device)
 
diff --git a/inst/python/ResNet.py b/inst/python/ResNet.py
index f680eb2..cef4b49 100644
--- a/inst/python/ResNet.py
+++ b/inst/python/ResNet.py
@@ -130,9 +130,9 @@ def __init__(self,
                 nn.init.kaiming_uniform_(parameter, a=math.sqrt(5))
 
     def forward(self, input):
-        x = self.weight.unsqueeze(0) * input.unsqueeze(-1)
+        x = self.weight[None] * input[..., None]
         if self.bias is not None:
-            x = x + self.bias.unsqueeze(-1)
+            x = x + self.bias[None]
         return x
 
 
diff --git a/inst/python/Transformer.py b/inst/python/Transformer.py
index 5944e1b..1c95b36 100644
--- a/inst/python/Transformer.py
+++ b/inst/python/Transformer.py
@@ -49,6 +49,9 @@ def __init__(self,
 
         if num_features != 0 and num_features is not None:
             self.numerical_embedding = NumericalEmbedding(num_features, dim_token)
+            self.use_numerical = True
+        else:
+            self.use_numerical = False
         self.class_token = ClassToken(dim_token)
 
         self.layers = nn.ModuleList([])
@@ -78,7 +81,7 @@ def __init__(self,
     def forward(self, x):
         mask = torch.where(x["cat"] == 0, True, False)
         cat = self.categorical_embedding(x["cat"])
-        if "num" in x.keys() and self.numerical_embedding is not None:
+        if self.use_numerical:
             num = self.numerical_embedding(x["num"])
             x = torch.cat([cat, num], dim=1)
             mask = torch.cat([mask, torch.zeros([x.shape[0],
diff --git a/man/TrainingCache.Rd b/man/TrainingCache.Rd
index 0a7ec7b..c82bb23 100644
--- a/man/TrainingCache.Rd
+++ b/man/TrainingCache.Rd
@@ -8,6 +8,8 @@ Whether the provided and cached parameter grid is identical
 
 Grid search results from the training cache
 
+Boolean
+
 Last grid search index
 }
 \description{
@@ -21,6 +23,7 @@ Parameter caching for training persistence and continuity
 \item \href{#method-TrainingCache-saveGridSearchPredictions}{\code{TrainingCache$saveGridSearchPredictions()}}
 \item \href{#method-TrainingCache-saveModelParams}{\code{TrainingCache$saveModelParams()}}
 \item \href{#method-TrainingCache-getGridSearchPredictions}{\code{TrainingCache$getGridSearchPredictions()}}
+\item \href{#method-TrainingCache-isFull}{\code{TrainingCache$isFull()}}
 \item \href{#method-TrainingCache-getLastGridSearchIndex}{\code{TrainingCache$getLastGridSearchIndex()}}
 \item \href{#method-TrainingCache-dropCache}{\code{TrainingCache$dropCache()}}
 \item \href{#method-TrainingCache-clone}{\code{TrainingCache$clone()}}
@@ -104,6 +107,16 @@ Gets the grid search results from the training cache
 \if{html}{\out{<div class="r">}}\preformatted{TrainingCache$getGridSearchPredictions()}\if{html}{\out{</div>}}
 }
 
+}
+\if{html}{\out{<hr>}}
+\if{html}{\out{<a id="method-TrainingCache-isFull"></a>}}
+\if{latex}{\out{\hypertarget{method-TrainingCache-isFull}{}}}
+\subsection{Method \code{isFull()}}{
+Check if cache is full
+\subsection{Usage}{
+\if{html}{\out{<div class="r">}}\preformatted{TrainingCache$isFull()}\if{html}{\out{</div>}}
+}
+
 }
 \if{html}{\out{<hr>}}
 \if{html}{\out{<a id="method-TrainingCache-getLastGridSearchIndex"></a>}}
diff --git a/tests/testthat/setup.R b/tests/testthat/setup.R
index adf0dcb..7cd0fee 100644
--- a/tests/testthat/setup.R
+++ b/tests/testthat/setup.R
@@ -78,3 +78,21 @@ dataset <- Dataset$Data(
 )
 small_dataset <- torch$utils$data$Subset(dataset, (1:round(length(dataset)/3)))
 
+modelSettings <- setResNet(
+  numLayers = 1, sizeHidden = 16, hiddenFactor = 1,
+  residualDropout = c(0, 0.2), hiddenDropout = 0,
+  sizeEmbedding = 16, hyperParamSearch = "random",
+  randomSample = 2,
+  setEstimator(epochs=1,
+               learningRate = 3e-4)
+)
+fitEstimatorPath <- file.path(testLoc, 'fitEstimator')
+if (!dir.exists(fitEstimatorPath)) {
+  dir.create(fitEstimatorPath)
+}
+fitEstimatorResults <- fitEstimator(trainData$Train, 
+                                    modelSettings = modelSettings, 
+                                    analysisId = 1, 
+                                    analysisPath = fitEstimatorPath)
+
+
diff --git a/tests/testthat/test-Estimator.R b/tests/testthat/test-Estimator.R
index 9cb1382..b4dd0a4 100644
--- a/tests/testthat/test-Estimator.R
+++ b/tests/testthat/test-Estimator.R
@@ -146,25 +146,12 @@ test_that("early stopping works", {
   testthat::expect_true(earlyStop$early_stop)
 })
 
-modelSettings <- setResNet(
-  numLayers = 1, sizeHidden = 16, hiddenFactor = 1,
-  residualDropout = 0, hiddenDropout = 0,
-  sizeEmbedding = 16, hyperParamSearch = "random",
-  randomSample = 1,
-  setEstimator(epochs=1,
-               learningRate = 3e-4)
-)
-
-sink(nullfile())
-results <- fitEstimator(trainData$Train, modelSettings = modelSettings, analysisId = 1, analysisPath = testLoc)
-sink()
-
 test_that("Estimator fit function works", {
-  expect_true(!is.null(results$trainDetails$trainingTime))
+  expect_true(!is.null(fitEstimatorResults$trainDetails$trainingTime))
 
-  expect_equal(class(results), "plpModel")
-  expect_equal(attr(results, "modelType"), "binary")
-  expect_equal(attr(results, "saveType"), "file")
+  expect_equal(class(fitEstimatorResults), "plpModel")
+  expect_equal(attr(fitEstimatorResults, "modelType"), "binary")
+  expect_equal(attr(fitEstimatorResults, "saveType"), "file")
   fakeTrainData <- trainData
   fakeTrainData$train$covariateData <- list(fakeCovData <- c("Fake"))
   expect_error(fitEstimator(fakeTrainData$train, modelSettings, analysisId = 1, analysisPath = testLoc))
@@ -184,7 +171,7 @@ test_that("predictDeepEstimator works", {
   # input is a plpModel and data
   sink(nullfile())
   predictions <- predictDeepEstimator(
-    plpModel = results, data = trainData$Test,
+    plpModel = fitEstimatorResults, data = trainData$Test,
     trainData$Test$labels
   )
   sink()
@@ -369,4 +356,4 @@ test_that("estimatorSettings can be saved and loaded with correct python objects
   testthat::expect_false(reticulate::py_is_null_xptr(optimizer))
   testthat::expect_false(reticulate::py_is_null_xptr(scheduler$fun))
   testthat::expect_false(reticulate::py_is_null_xptr(criterion))
-})
\ No newline at end of file
+})
diff --git a/tests/testthat/test-TrainingCache.R b/tests/testthat/test-TrainingCache.R
index eb4ab17..debe95c 100644
--- a/tests/testthat/test-TrainingCache.R
+++ b/tests/testthat/test-TrainingCache.R
@@ -47,43 +47,32 @@ test_that("Param grid predictions can be cached", {
 })
 
 test_that("Estimator can resume training from cache", {
-  modelPath <- tempdir()
-  analysisPath <- file.path(modelPath, "Analysis_TrainCacheResNet")
-  dir.create(analysisPath)
-  trainCache <- TrainingCache$new(analysisPath)
-  trainCache$saveModelParams(paramSearch)
+  trainCache <- readRDS(file.path(fitEstimatorPath, "paramPersistence.rds"))
+  newPath <- file.path(testLoc, 'resume')
+  dir.create(newPath)
+  
+  # remove last row
+  trainCache$gridSearchPredictions[[2]] <- NULL
+  length(trainCache$gridSearchPredictions) <- 2
+  
+  # save new cache
+  saveRDS(trainCache, file=file.path(newPath, "paramPersistence.rds"))
   
   sink(nullfile())
-  res2 <- tryCatch(
-    {
-      PatientLevelPrediction::runPlp(
-        plpData = plpData,
-        outcomeId = 3,
-        modelSettings = resNetSettings,
-        analysisId = "Analysis_TrainCacheResNet",
-        analysisName = "Testing Training Cache",
-        populationSettings = populationSet,
-        splitSettings = PatientLevelPrediction::createDefaultSplitSetting(),
-        sampleSettings = PatientLevelPrediction::createSampleSettings(), # none
-        featureEngineeringSettings = PatientLevelPrediction::createFeatureEngineeringSettings(), # none
-        preprocessSettings = PatientLevelPrediction::createPreprocessSettings(),
-        executeSettings = PatientLevelPrediction::createExecuteSettings(
-          runSplitData = T,
-          runSampleData = F,
-          runfeatureEngineering = F,
-          runPreprocessData = T,
-          runModelDevelopment = T,
-          runCovariateSummary = F
-        ),
-        saveDirectory = modelPath
-      )
-    },
-    error = function(e) {
-      print(e)
-      return(NULL)
-    }
-  )
+  fitEstimatorResults <- fitEstimator(trainData$Train, 
+                                      modelSettings = modelSettings, 
+                                      analysisId = 1, 
+                                      analysisPath = newPath)
   sink()
-  trainCache <- TrainingCache$new(analysisPath)
-  testthat::expect_equal(is.na(trainCache$getLastGridSearchIndex()), TRUE)
+ 
+  newCache <- readRDS(file.path(newPath, "paramPersistence.rds"))
+  testthat::expect_equal(nrow(newCache$gridSearchPredictions[[2]]$gridPerformance$hyperSummary), 4)
+})
+
+test_that("Prediction is cached for optimal parameters", {
+  testCache <- readRDS(file.path(fitEstimatorPath, "paramPersistence.rds"))
+  indexOfMax <- which.max(unlist(lapply(testCache$gridSearchPredictions, function(x) x$gridPerformance$cvPerformance)))
+  indexOfMin <- which.min(unlist(lapply(testCache$gridSearchPredictions, function(x) x$gridPerformance$cvPerformance)))
+  testthat::expect_equal(class(testCache$gridSearchPredictions[[indexOfMax]]$prediction), class(data.frame()))
+  testthat::expect_null(testCache$gridSearchPredictions[[indexOfMin]]$prediction[[1]])
 })
diff --git a/tests/testthat/test-Transformer.R b/tests/testthat/test-Transformer.R
index b3e421f..043cbe7 100644
--- a/tests/testthat/test-Transformer.R
+++ b/tests/testthat/test-Transformer.R
@@ -85,6 +85,9 @@ test_that("transformer nn-module works", {
     dim_hidden = 32
   )
   output <- model(input)
+  expect_equal(output$shape[0], 10L)  
+  input$num <- reticulate::py_none()
+  output <- model(input)
   expect_equal(output$shape[0], 10L)
 })
 
@@ -126,3 +129,32 @@ test_that("dimHidden ratio works as expected", {
                                         dimHiddenRatio = 4/3))
 
 })
+
+test_that("numerical embedding works as expected", {
+  embeddings <- 32L # size of embeddings
+  features <- 2L # number of numerical features
+  patients <- 9L 
+  
+  numTensor <- torch$randn(c(patients, features))
+  
+  numericalEmbeddingClass <- reticulate::import_from_path("ResNet", path=path)$NumericalEmbedding
+  numericalEmbedding <- numericalEmbeddingClass(num_embeddings = features,
+                                                embedding_dim = embeddings,
+                                                bias = TRUE)
+  out <- numericalEmbedding(numTensor)
+  
+  # should be patients x features x embedding size
+  expect_equal(out$shape[[0]], patients)
+  expect_equal(out$shape[[1]], features)
+  expect_equal(out$shape[[2]], embeddings)
+  
+  numericalEmbedding <- numericalEmbeddingClass(num_embeddings = features,
+                                                embedding_dim = embeddings,
+                                                bias = FALSE)
+  
+  out <- numericalEmbedding(numTensor)
+  expect_equal(out$shape[[0]], patients)
+  expect_equal(out$shape[[1]], features)
+  expect_equal(out$shape[[2]], embeddings)
+  
+  })
diff --git a/vignettes/Installing.Rmd b/vignettes/Installing.Rmd
index 5b88419..1aded4d 100644
--- a/vignettes/Installing.Rmd
+++ b/vignettes/Installing.Rmd
@@ -52,7 +52,7 @@ Under Windows the OHDSI Deep Patient Level Prediction (DeepPLP) package requires
 
 ## Mac/Linux Users
 
-Under Mac and Linux the OHDSI deepPLP package requires installing:
+Under Mac and Linux the OHDSI DeepPLP package requires installing:
 
 -   R (<https://cran.cnr.berkeley.edu/> ) - (R \>= 4.0.0, but latest is recommended)
 -   Python - The package is tested with python 3.10, but \>= 3.8 should work
@@ -83,9 +83,15 @@ By default `install_minconda()` creates an environment `r-reticulate` with `pyth
 reticulate::conda_install(envname = 'r-reticulate', packages=c('python=3.10'))
 ```
 
-Then when we can install `DeepPatientLevelPrediction` and it should install the required python packages in this environment.
+If reticulate is having issues finding the conda installation you can use the function `reticulate::miniconda_path()` to find the default installation location for your miniconda installation. Then you can force reticulate to use the newly generated environment by setting the environment variable `RETICULATE_PYTHON` to point to the python binary in the environment. For example by adding the following to the `.Renviron` file:
 
-If instead you want to use a specific python environment you can set the environment variable `RETICULATE_PYTHON` to point to the python executable of that environment in your `.Renviron` file. You need to do this before installing `DeepPatientLevelPrediction`.
+```         
+RETICULATE_PYTHON="/path/to/miniconda/envs/r-reticulate/bin/python"
+```
+
+Then you need to restart your R session. To verify that `reticulate` finds the correct version, you can call `reticulate::py_config()`.
+
+Once you have a working python environment that reticulate can locate you can install `DeepPatientLevelPrediction`. If you want to use a specific python environment you can set the environment variable `RETICULATE_PYTHON` to point to the python executable of that environment in your `.Renviron` file. You need to do this before installing `DeepPatientLevelPrediction`.
 
 ## Installing DeepPatientLevelPrediction using remotes
 
@@ -93,11 +99,18 @@ To install using `remotes` run:
 
 ```{r, echo = TRUE, message = FALSE, warning = FALSE,tidy=FALSE,eval=FALSE}
 install.packages("remotes")
-remotes::install_github("OHDSI/FeatureExtraction")
-remotes::install_github("OHDSI/PatientLevelPrediction")
 remotes::install_github("OHDSI/DeepPatientLevelPrediction")
 ```
 
+This should install the required python packages. If that doesn't happen it can be triggered by calling:
+
+```
+library(DeepPatientLevelPrediction)
+torch$randn(10L)
+```
+
+This should print out a tensor with ten different values. 
+
 When installing make sure to close any other Rstudio sessions that are using `DeepPatientLevelPrediction` or any dependency. Keeping Rstudio sessions open can cause locks on windows that prevent the package installing.
 
 # Testing Installation
@@ -107,7 +120,7 @@ library(PatientLevelPrediction)
 library(DeepPatientLevelPrediction)
 
 data(plpDataSimulationProfile)
-sampleSize <- 1e4
+sampleSize <- 1e3
 plpData <- simulatePlpData(
   plpDataSimulationProfile,
   n = sampleSize