diff --git a/.Rbuildignore b/.Rbuildignore index 91114bf..ac00a31 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,2 +1,6 @@ ^.*\.Rproj$ ^\.Rproj\.user$ +^\.github$ +^_pkgdown\.yml$ +^docs$ +^pkgdown$ diff --git a/.github/.gitignore b/.github/.gitignore new file mode 100644 index 0000000..2d19fc7 --- /dev/null +++ b/.github/.gitignore @@ -0,0 +1 @@ +*.html diff --git a/.github/workflows/R_CDM_check_hades.yaml b/.github/workflows/R_CDM_check_hades.yaml new file mode 100644 index 0000000..7445265 --- /dev/null +++ b/.github/workflows/R_CDM_check_hades.yaml @@ -0,0 +1,164 @@ +# For help debugging build failures open an issue on the RStudio community with the 'github-actions' tag. +# https://community.rstudio.com/new-topic?category=Package%20development&tags=github-actions +on: + push: + branches: + - '**' + pull_request: + branches: + - '**' + +name: R-CMD-check + +jobs: + R-CMD-check: + runs-on: ${{ matrix.config.os }} + + name: ${{ matrix.config.os }} (${{ matrix.config.r }}) + + strategy: + fail-fast: false + matrix: + config: + - {os: windows-latest, r: 'release'} + - {os: macOS-latest, r: 'release'} + - {os: ubuntu-20.04, r: 'release', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"} + + env: + GITHUB_PAT: ${{ secrets.GH_TOKEN }} + R_REMOTES_NO_ERRORS_FROM_WARNINGS: true + RSPM: ${{ matrix.config.rspm }} + CDM5_ORACLE_CDM_SCHEMA: ${{ secrets.CDM5_ORACLE_CDM_SCHEMA }} + CDM5_ORACLE_OHDSI_SCHEMA: ${{ secrets.CDM5_ORACLE_OHDSI_SCHEMA }} + CDM5_ORACLE_PASSWORD: ${{ secrets.CDM5_ORACLE_PASSWORD }} + CDM5_ORACLE_SERVER: ${{ secrets.CDM5_ORACLE_SERVER }} + CDM5_ORACLE_USER: ${{ secrets.CDM5_ORACLE_USER }} + CDM5_POSTGRESQL_CDM_SCHEMA: ${{ secrets.CDM5_POSTGRESQL_CDM_SCHEMA }} + CDM5_POSTGRESQL_OHDSI_SCHEMA: ${{ secrets.CDM5_POSTGRESQL_OHDSI_SCHEMA }} + CDM5_POSTGRESQL_PASSWORD: ${{ secrets.CDM5_POSTGRESQL_PASSWORD }} + CDM5_POSTGRESQL_SERVER: ${{ secrets.CDM5_POSTGRESQL_SERVER }} + CDM5_POSTGRESQL_USER: ${{ secrets.CDM5_POSTGRESQL_USER }} + CDM5_SQL_SERVER_CDM_SCHEMA: ${{ secrets.CDM5_SQL_SERVER_CDM_SCHEMA }} + CDM5_SQL_SERVER_OHDSI_SCHEMA: ${{ secrets.CDM5_SQL_SERVER_OHDSI_SCHEMA }} + CDM5_SQL_SERVER_PASSWORD: ${{ secrets.CDM5_SQL_SERVER_PASSWORD }} + CDM5_SQL_SERVER_SERVER: ${{ secrets.CDM5_SQL_SERVER_SERVER }} + CDM5_SQL_SERVER_USER: ${{ secrets.CDM5_SQL_SERVER_USER }} + TORCH_INSTALL: 1 # so torch runs torch::install_torch() + + steps: + - uses: actions/checkout@v2 + + - uses: r-lib/actions/setup-r@v2 + with: + r-version: ${{ matrix.config.r }} + + - uses: r-lib/actions/setup-tinytex@v2 + + - uses: r-lib/actions/setup-pandoc@v1 + + - name: Install system requirements + if: runner.os == 'Linux' + run: | + sudo apt-get install -y libssh-dev + Rscript -e 'install.packages("remotes")' + while read -r cmd + do + eval sudo $cmd + done < <(Rscript -e 'writeLines(remotes::system_requirements("ubuntu", "20.04"))') + + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: any::rcmdcheck + needs: check + + - uses: r-lib/actions/check-r-package@v2 + with: + args: 'c("--no-manual", "--as-cran")' + error-on: '"warning"' + check-dir: '"check"' + + - name: Upload source package + if: success() && runner.os == 'macOS' && github.event_name != 'pull_request' && github.ref == 'refs/heads/main' + uses: actions/upload-artifact@v2 + with: + name: package_tarball + path: check/*.tar.gz + + - name: Install covr + if: runner.os == 'Windows' + run: | + remotes::install_cran("covr") + shell: Rscript {0} + + - name: Test coverage + if: runner.os == 'Windows' + run: 
covr::codecov() + shell: Rscript {0} + + Release: + needs: R-CMD-Check + + runs-on: macOS-latest + + env: + GH_TOKEN: ${{ secrets.GH_TOKEN }} + + if: ${{ github.event_name != 'pull_request' && github.ref == 'refs/heads/main' }} + + steps: + + - uses: actions/checkout@v2 + with: + fetch-depth: 0 + + - name: Check if version has increased + run: | + echo "new_version="$(perl compare_versions --tag) >> $GITHUB_ENV + + - name: Display new version number + if: ${{ env.new_version != '' }} + run: | + echo "${{ env.new_version }}" + + - name: Create release + if: ${{ env.new_version != '' }} + uses: actions/create-release@v1 + env: + GITHUB_TOKEN: ${{ secrets.GH_TOKEN }} + with: + tag_name: ${{ env.new_version }} + release_name: Release ${{ env.new_version }} + body: | + See NEWS.md for release notes. + draft: false + prerelease: false + + - uses: r-lib/actions/setup-r@v2 + if: ${{ env.new_version != '' }} + + - name: Install drat + if: ${{ env.new_version != '' }} + run: | + install.packages('drat') + shell: Rscript {0} + + - name: Remove any tarballs that already exists + if: ${{ env.new_version != '' }} + run: | + rm -f *.tar.gz + + - name: Download package tarball + if: ${{ env.new_version != '' }} + uses: actions/download-artifact@v2 + with: + name: package_tarball + + - name: Push to drat + if: ${{ env.new_version != '' }} + run: | + bash deploy.sh + - name: Push to BroadSea + if: ${{ env.new_version != '' }} + run: | + curl --data "build=true" -X POST https://registry.hub.docker.com/u/ohdsi/broadsea-methodslibrary/trigger/f0b51cec-4027-4781-9383-4b38b42dd4f5/ + diff --git a/.github/workflows/nightly_cleanup_Hades.yml b/.github/workflows/nightly_cleanup_Hades.yml new file mode 100644 index 0000000..3f9d7ce --- /dev/null +++ b/.github/workflows/nightly_cleanup_Hades.yml @@ -0,0 +1,19 @@ + +name: 'nightly artifacts cleanup' +on: + schedule: + - cron: '0 1 * * *' # every night at 1 am UTC + +jobs: + remove-old-artifacts: + runs-on: ubuntu-latest + timeout-minutes: 10 + + steps: + - name: Remove old artifacts + uses: c-hive/gha-remove-artifacts@v1 + with: + age: '7 days' + # Optional inputs + # skip-tags: true + skip-recent: 1 diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml new file mode 100644 index 0000000..7a5e8ac --- /dev/null +++ b/.github/workflows/pkgdown.yaml @@ -0,0 +1,46 @@ +# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples +# Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help
+on:
+  push:
+    branches: [main, develop]
+  release:
+    types: [published]
+  workflow_dispatch:
+
+name: pkgdown
+
+jobs:
+  pkgdown:
+    runs-on: ubuntu-latest
+    # Only restrict concurrency for non-PR jobs
+    concurrency:
+      group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }}
+    env:
+      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
+    steps:
+      - uses: actions/checkout@v2
+
+      - uses: r-lib/actions/setup-pandoc@v2
+
+      - uses: r-lib/actions/setup-r@v2
+        with:
+          use-public-rspm: true
+
+      - uses: r-lib/actions/setup-r-dependencies@v2
+        with:
+          extra-packages: any::pkgdown, ohdsi/OhdsiRTools
+          needs: website
+
+      - name: Build site
+        run: Rscript -e 'pkgdown::build_site_github_pages(new_process = FALSE, install = TRUE)'
+
+      - name: Fix Hades Logo
+        run: Rscript -e 'OhdsiRTools::fixHadesLogo()'
+
+      - name: Deploy to GitHub pages 🚀
+        if: github.event_name != 'pull_request'
+        uses: JamesIves/github-pages-deploy-action@4.1.4
+        with:
+          clean: false
+          branch: gh-pages
+          folder: docs
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..c567877
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+.Rproj.user
+.Rhistory
+.RData
+.Ruserdata
+results
+config.yml
+docs
diff --git a/DESCRIPTION b/DESCRIPTION
index 42e28a4..8fa61a7 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,49 +1,42 @@
 Package: DeepPatientLevelPrediction
 Type: Package
-Title: Package for deep learning patient level prediction using data in the OMOP Common Data
-  Model
-Version: 0.0.1
-Date: 2021-06-07
+Title: Deep Learning For Patient Level Prediction Using Data In The OMOP Common Data Model
+Version: 1.0.0
+Date: 29-08-2022
 Authors@R: c(
-    person("Jenna", "Reps", email = "jreps@its.jnj.com", role = c("aut", "cre")),
+    person("Jenna", "Reps", email = "jreps@its.jnj.com", role = c("aut")),
+    person("Egill", "Fridgeirsson", email = "e.fridgeirsson@erasmusmc.nl", role = c("aut", "cre")),
     person("Seng", "Chan You", role = c("aut")),
-    person("Egill", "Friogeirsson", role = c("aut"))
+    person("Chungsoo", "Kim", role = c("aut")),
+    person("Henrik", "John", role = c("aut"))
 )
-
-Maintainer: Jenna Reps
-Description: A package for creating deep learning patient level prediction models following
-the OHDSI PatientLevelPrediction framework.
+Maintainer: Egill Fridgeirsson
+Description: A package for creating deep learning patient level prediction models following the OHDSI PatientLevelPrediction framework.
 License: Apache License 2.0
 URL: https://ohdsi.github.io/PatientLevelPrediction, https://github.com/OHDSI/DeepPatientLevelPrediction
 BugReports: https://github.com/OHDSI/DeepPatientLevelPrediction/issues
 VignetteBuilder: knitr
 Depends:
-  R (>= 3.3.0),
-  FeatureExtraction (>= 3.0.0)
+  R (>= 3.5.0)
 Imports:
-  Andromeda,
-  DatabaseConnector (>= 3.0.0),
   dplyr,
-  knitr,
-  magrittr,
-  Matrix,
+  data.table,
+  FeatureExtraction (>= 3.0.0),
   ParallelLogger (>= 2.0.0),
-  reshape2,
-  reticulate (> 1.16),
-  RSQLite,
-  slam,
-  SqlRender (>= 1.1.3),
-  tibble,
-  tidyr,
+  PatientLevelPrediction,
+  rlang,
+  torch (>= 0.8.0)
 Suggests:
   devtools,
-  keras,
+  Eunomia,
+  knitr,
+  markdown,
   plyr,
-  tensorflow,
   testthat
 Remotes:
-  ohdsi/FeatureExtraction
-LinkingTo: Rcpp
-NeedsCompilation: yes
-RoxygenNote: 7.1.1
+  ohdsi/PatientLevelPrediction@develop,
+  ohdsi/FeatureExtraction,
+  ohdsi/Eunomia
+RoxygenNote: 7.2.1
 Encoding: UTF-8
+Config/testthat/edition: 3
diff --git a/NAMESPACE b/NAMESPACE
new file mode 100644
index 0000000..a102062
--- /dev/null
+++ b/NAMESPACE
@@ -0,0 +1,14 @@
+# Generated by roxygen2: do not edit by hand
+
+export(Dataset)
+export(Estimator)
+export(fitEstimator)
+export(gridCvDeep)
+export(predictDeepEstimator)
+export(setMultiLayerPerceptron)
+export(setResNet)
+export(setTransformer)
+import(data.table)
+importFrom(data.table,":=")
+importFrom(dplyr,"%>%")
+importFrom(rlang,.data)
diff --git a/NEWS.md b/NEWS.md
new file mode 100644
index 0000000..c0bdb3a
--- /dev/null
+++ b/NEWS.md
@@ -0,0 +1,6 @@
+DeepPatientLevelPrediction 1.0.0
+==========================
+
+- created an Estimator R6 class to handle the model fitting
+- Added three non-temporal models. An MLP, a ResNet and a Transformer
+- Created tests and documentation for the package
diff --git a/R/CIReNN.R b/R/CIReNN.R
deleted file mode 100644
index 4cdaa81..0000000
--- a/R/CIReNN.R
+++ /dev/null
@@ -1,1162 +0,0 @@
-# @file CIReNN.R
-# Code edited from OHDSI contributor @chandryou CIReNN branch
-#
-# Copyright 2020 Observational Health Data Sciences and Informatics
-#
-# This file is part of PatientLevelPrediction
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# BuildCIReNN<-function(outcomes=ff::as.ffdf(population[,c('rowId','y')]),
-#                       covariates = result$data,
-#                       indexFolder=indexFolder){
-#
-# }
-
-#' Create setting for CIReNN model
-#'
-#' @param numberOfRNNLayer The number of RNN layer, only 1, 2, or 3 layers available now. eg. 1, c(1,2), c(1,2,3)
-#' @param units The number of units of RNN layer - as a list of vectors
-#' @param recurrentDropout The reccurrent dropout rate (regularisation)
-#' @param layerDropout The layer dropout rate (regularisation)
-#' @param lr Learning rate
-#' @param decay Learning rate decay over each update.
-#' @param outcomeWeight The weight of the outcome class in the loss function. Default is 0, which will be replaced by balanced weight.
-#' @param batchSize The number of data points to use per training batch -#' @param epochs Number of times to iterate over dataset -#' @param earlyStoppingMinDelta minimum change in the monitored quantity to qualify as an improvement for early stopping, i.e. an absolute change of less than min_delta in loss of validation data, will count as no improvement. -#' @param earlyStoppingPatience Number of epochs with no improvement after which training will be stopped. -#' @param useDeepEnsemble logical (either TRUE or FALSE) value for using Deep Ensemble -#' @param numberOfEnsembleNetwork Integer, Number of Ensemble. If you want to use Deep Ensemble, this number should be greater than 1. -#' @param bayes logical (either TRUE or FALSE) value for using Bayesian Drop Out Layer to measure uncertainty. If it is TRUE, both Epistemic and Aleatoric uncertainty will be measured through Bayesian Drop Out layer -#' @param useDeepEnsemble logical (either TRUE or FALSE) value for using Deep Ensemble (Lakshminarayanan et al., 2017) to measure uncertainty. It cannot be used together with Bayesian deep learing. -#' @param numberOfEnsembleNetwork Integer. Number of network used for Deep Ensemble (Lakshminarayanan et al recommended 5). -#' @param useVae logical (either TRUE or FALSE) value for using Variational AutoEncoder before RNN -#' @param vaeDataSamplingProportion Data sampling proportion for VAE -#' @param vaeValidationSplit Validation split proportion for VAE -#' @param vaeBatchSize batch size for VAE -#' @param vaeLatentDim Number of latent dimesion for VAE -#' @param vaeIntermediateDim Number of intermediate dimesion for VAE -#' @param vaeEpoch Number of times to interate over dataset for VAE -#' @param vaeEpislonStd Epsilon -#' @param useGPU logical (either TRUE or FALSE) value. If you have GPUs in your machine, and want to use multiple GPU for deep learning, set this value as TRUE -#' @param maxGPUs Integer, If you will use GPU, how many GPUs will be used for deep learning in VAE? GPU parallelisation for deep learning will be activated only when parallel vae is true. Integer >= 2 or list of integers, number of GPUs or list of GPU IDs on which to create model replicas. -#' @param seed Random seed used by deep learning model -#' @importFrom zeallot %<-% -#' @examples -#' \dontrun{ -#' model.CIReNN <- setCIReNN() -#' } -#' @export -setCIReNN <- function(numberOfRNNLayer=c(1),units=c(128, 64), recurrentDropout=c(0.2), layerDropout=c(0.2), - lr =c(1e-4), decay=c(1e-5), outcomeWeight = c(0), batchSize = c(100), - epochs= c(100), earlyStoppingMinDelta = c(1e-4), earlyStoppingPatience = c(10), - bayes = T, useDeepEnsemble = F, numberOfEnsembleNetwork = 5, - useVae = T, vaeDataSamplingProportion = 0.1,vaeValidationSplit= 0.2, - vaeBatchSize = 100L, vaeLatentDim = 10L, vaeIntermediateDim = 256L, - vaeEpoch = 100L, vaeEpislonStd = 1.0, useGPU = FALSE, maxGPUs = 2, - seed=1234 ){ - - ensure_installed("keras") - ensure_installed("tensorflow") - ensure_installed("plyr") - - if( sum(!( numberOfRNNLayer %in% c(1,2,3)))!=0 ) stop ('Only 1,2 or 3 is available now. 
') - if(!((all.equal(numberOfEnsembleNetwork, as.integer(numberOfEnsembleNetwork))) & (numberOfEnsembleNetwork>=1) )) { - stop ('Number of ensemble network should be a natural number') } - - # if(class(indexFolder)!='character') - # stop('IndexFolder must be a character') - # if(length(indexFolder)>1) - # stop('IndexFolder must be one') - # - # if(class(units)!='numeric') - # stop('units must be a numeric value >0 ') - # if(units<1) - # stop('units must be a numeric value >0 ') - # - # #if(length(units)>1) - # # stop('units can only be a single value') - # - # if(class(recurrent_dropout)!='numeric') - # stop('dropout must be a numeric value >=0 and <1') - # if( (recurrent_dropout<0) | (recurrent_dropout>=1)) - # stop('dropout must be a numeric value >=0 and <1') - # if(class(layer_dropout)!='numeric') - # stop('layer_dropout must be a numeric value >=0 and <1') - # if( (layer_dropout<0) | (layer_dropout>=1)) - # stop('layer_dropout must be a numeric value >=0 and <1') - # if(class(lr)!='numeric') - # stop('lr must be a numeric value >0') - # if(lr<=0) - # stop('lr must be a numeric value >0') - # if(class(decay)!='numeric') - # stop('decay must be a numeric value >=0') - # if(decay<=0) - # stop('decay must be a numeric value >=0') - # if(class(outcome_weight)!='numeric') - # stop('outcome_weight must be a numeric value >=0') - # if(outcome_weight<=0) - # stop('outcome_weight must be a numeric value >=0') - # if(class(batch_size)!='numeric') - # stop('batch_size must be an integer') - # if(batch_size%%1!=0) - # stop('batch_size must be an integer') - # if(class(epochs)!='numeric') - # stop('epochs must be an integer') - # if(epochs%%1!=0) - # stop('epochs must be an integer') - # if(!class(seed)%in%c('numeric','NULL')) - # stop('Invalid seed') - #if(class(UsetidyCovariateData)!='logical') - # stop('UsetidyCovariateData must be an TRUE or FALSE') - - result <- list(model='fitCIReNN', param=split(expand.grid( - numberOfRNNLayer=numberOfRNNLayer,units=units, recurrentDropout=recurrentDropout, - layerDropout=layerDropout, - lr =lr, decay=decay, outcomeWeight=outcomeWeight, epochs= epochs, - earlyStoppingMinDelta = earlyStoppingMinDelta, earlyStoppingPatience = earlyStoppingPatience, - bayes= bayes, useDeepEnsemble = useDeepEnsemble,numberOfEnsembleNetwork = numberOfEnsembleNetwork, - useVae= useVae,vaeDataSamplingProportion = vaeDataSamplingProportion, vaeValidationSplit= vaeValidationSplit, - vaeBatchSize = vaeBatchSize, vaeLatentDim = vaeLatentDim, vaeIntermediateDim = vaeIntermediateDim, - vaeEpoch = vaeEpoch, vaeEpislonStd = vaeEpislonStd, useGPU = useGPU, maxGPUs = maxGPUs, - seed=ifelse(is.null(seed),'NULL', seed)), - - 1:(length(numberOfRNNLayer)*length(units)*length(recurrentDropout)*length(layerDropout)*length(lr)*length(decay)*length(outcomeWeight)*length(earlyStoppingMinDelta)*length(earlyStoppingPatience)*length(epochs)*max(1,length(seed)))), - name='CIReNN' - ) - - class(result) <- 'modelSettings' - return(result) -} - - -fitCIReNN <- function(plpData,population, param, search='grid', quiet=F, - outcomeId, cohortId, ...){ - # check plpData is coo format: - if (!FeatureExtraction::isCovariateData(plpData$covariateData)) - stop("Needs correct covariateData") - - metaData <- attr(population, 'metaData') - if(!is.null(population$indexes)) - population <- population[population$indexes>0,] - attr(population, 'metaData') <- metaData - - start<-Sys.time() - - result<- toSparseM(plpData,population,map=NULL, temporal=T) - data <- result$data - covariateMap <- result$map - - #remove result to 
save memory - rm(result) - - if(param[[1]]$useVae){ - #Sampling the data for bulding VAE - vaeSampleData <- data[sample(seq(dim(data)[1]), floor(dim(data)[1]*param[[1]]$vaeDataSamplingProportion),replace=FALSE),,] - - #Build VAE - vae <- buildVae(vaeSampleData, vaeValidationSplit= param[[1]]$vaeValidationSplit, - vaeBatchSize = param[[1]]$vaeBatchSize, vaeLatentDim = param[[1]]$vaeLatentDim, vaeIntermediateDim = param[[1]]$vaeIntermediateDim, - vaeEpoch = param[[1]]$vaeEpoch, vaeEpislonStd = param[[1]]$vaeEpislonStd, useGPU= param[[1]]$useGPU, maxGPUs= param[[1]]$maxGPUs, temporal = TRUE) - #remove sample data for VAE to save memory - vaeSampleData <- NULL - - vaeEnDecoder<- vae[[1]] - vaeEncoder <- vae[[2]] - - #Embedding by using VAE encoder - data <- plyr::aaply(as.array(data), 2, function(x) stats::predict(vaeEncoder, x, batch_size = param$vaeBatchSize)) - data <- aperm(data, perm = c(2,1,3))#rearrange of dimension - - ##Check the performance of vae - # decodedVaeData<-plyr::aaply(as.array(data), 2, function(x) predict(vaeEnDecoder, x, batch_size = param$vaeBatchSzie)) - # decodedVaeData<-aperm(decodedVaeData, c(2,1,3)) - # a1=Epi::ROC(form=as.factor(as.vector(data))~as.vector(decodedVaeData),plot="ROC") - - }else { - vaeEnDecoder <- NULL - vaeEncoder <- NULL - } - - #one-hot encoding - population$y <- keras::to_categorical(population$outcomeCount, 2)#[,2] #population$outcomeCount - - # do cross validation to find hyperParameter - datas <- list(population=population, plpData=data) - - #remove data to save memory - data <- NULL - - #Selection of hyperparameters - hyperParamSel <- lapply(param, function(x) do.call(trainCIReNN, c(x,datas,train=TRUE) )) - hyperSummary <- cbind(do.call(rbind, lapply(hyperParamSel, function(x) x$hyperSum))) - hyperSummary <- as.data.frame(hyperSummary) - hyperSummary$auc <- unlist(lapply(hyperParamSel, function (x) x$auc)) - hyperParamSel <- unlist(lapply(hyperParamSel, function(x) x$auc)) - - #now train the final model and return coef - bestInd <- which.max(abs(unlist(hyperParamSel)-0.5))[1] - finalModel<-do.call(trainCIReNN, c(param[[bestInd]],datas, train=FALSE)) - - covariateRef <- as.data.frame(plpData$covariateData$covariateRef) - incs <- rep(1, nrow(covariateRef)) - covariateRef$included <- incs - covariateRef$covariateValue <- rep(0, nrow(covariateRef)) - - #modelTrained <- file.path(outLoc) - param.best <- param[[bestInd]] - - comp <- start-Sys.time() - - # train prediction - prediction <- finalModel$prediction - finalModel$prediction <- NULL - - # return model location - result <- list(model = finalModel$model, - trainCVAuc = -1, # ToDo decide on how to deal with this - hyperParamSearch = hyperSummary, - modelSettings = list(model='fitCIReNN',modelParameters=param.best), - metaData = plpData$metaData, - populationSettings = attr(population, 'metaData'), - outcomeId = outcomeId, - cohortId = cohortId, - varImp = covariateRef, - trainingTime = comp, - covariateMap = covariateMap, - useDeepEnsemble = param.best$useDeepEnsemble, - numberOfEnsembleNetwork = param.best$numberOfEnsembleNetwork, - useVae = param.best$useVae, - vaeBatchSize = param.best$vaeBatchSize, - vaeEnDecoder = vaeEnDecoder, - vaeEncoder = vaeEncoder, - predictionTrain = prediction - ) - class(result) <- 'plpModel' - attr(result, 'type') <- 'deep' - if(param.best$useDeepEnsemble){ - attr(result, 'type') <- 'deepEnsemble' - } - if(param.best$bayes){ - attr(result, 'type') <- 'BayesianDeep' - } - attr(result, 'predictionType') <- 'binary' - - return(result) -} - 
-trainCIReNN<-function(plpData, population, - numberOfRNNLayer=1,units=128, recurrentDropout=0.2, layerDropout=0.2, - lr =1e-4, decay=1e-5, outcomeWeight = 0, batchSize = 100, - epochs= 100, earlyStoppingMinDelta = c(1e-4), earlyStoppingPatience = c(10), - bayes = T, useDeepEnsemble = F,numberOfEnsembleNetwork =3, - useVae = T, vaeDataSamplingProportion = 0.1,vaeValidationSplit= 0.2, - vaeBatchSize = 100L, vaeLatentDim = 10L, vaeIntermediateDim = 256L, - vaeEpoch = 100L, vaeEpislonStd = 1.0, useGPU = FALSE, maxGPUs = 2, - seed=NULL, train=TRUE){ - - mu <- function(){return(NULL)} - sigma <- function(){return(NULL)} - - output_dim = 2 #output dimension for outcomes - num_MC_samples = 100 #sample number for MC sampling in Bayesian Deep Learning Prediction - if(outcomeWeight == 0){ - outcomeWeight = round(sum(population$outcomeCount==0)/sum(population$outcomeCount>=1),1) #if outcome weight = 0, then it means balanced weight - } - #heteroscedatic loss function - heteroscedastic_loss = function(y_true, y_pred) { - mean = y_pred[, 1:output_dim] - log_var = y_pred[, (output_dim + 1):(output_dim * 2)] - precision = keras::k_exp(-log_var) - keras::k_sum(precision * (y_true - mean) ^ 2 + log_var, axis = 2) - } - - if(!is.null(population$indexes) && train==T){ - writeLines(paste('Training recurrent neural network with ',length(unique(population$indexes)),' fold CV')) - - index_vect <- unique(population$indexes) - perform <- c() - - # create prediction matrix to store all predictions - predictionMat <- population - predictionMat$value <- 0 - attr(predictionMat, "metaData") <- list(predictionType = "binary") - - for(index in 1:length(index_vect )){ - writeLines(paste('Fold ',index, ' -- with ', sum(population$indexes!=index),'train rows')) - - if(useDeepEnsemble){ - predList<-list() - for (i in seq(numberOfEnsembleNetwork)){ - #print(i) - ParallelLogger::logInfo(paste(i,'th process is started')) - pred <- createEnsembleNetwork(train = train, plpData=plpData,population=population,batchSize=batchSize,epochs = epochs, - earlyStoppingMinDelta=earlyStoppingMinDelta, earlyStoppingPatience=earlyStoppingPatience, - train_rows=train_rows,index=index,lr=lr,decay=decay, - units=units,recurrentDropout=recurrentDropout,numberOfRNNLayer=numberOfRNNLayer, - layerDropout=layerDropout, useGPU = useGPU, maxGPUs = maxGPUs) - ParallelLogger::logInfo(paste(i,'th process is ended started')) - predList <- append(predList,pred) - } - model <- predList - - # batch prediciton - maxVal <- sum(population$indexes==index) - batches <- lapply(1:ceiling(maxVal/batchSize), function(x) ((x-1)*batchSize+1):min((x*batchSize),maxVal)) - prediction <- population[population$indexes==index,] - prediction$value <- 0 - prediction$sigmas <- 0 - - for(batch in batches){ - - for (i in seq(numberOfEnsembleNetwork)){ - if(i==1){ - muMatrix <- data.frame() - sigmaMatrix <-data.frame() - } - c(mu,sigma) %<-% predList[[i]](inputs=list(as.array(plpData[population$rowId[population$indexes==index],,][batch,,]))) - muMatrix <- rbind(muMatrix,t(as.data.frame(mu[,2]))) - sigmaMatrix <- rbind(sigmaMatrix,t(as.data.frame(sigma[,2]))) - } - - muMean <- apply(muMatrix,2,mean) - muSq <- muMatrix^2 - sigmaSq <- sigmaMatrix^2 - sigmaMean <- apply(sigmaMatrix,2,mean) - sigmaResult = apply(muSq+sigmaSq,2, mean)- muMean^2 - - prediction$value[batch] <- c(muMean) - #if prediction$value is negative, make this positive - prediction$sigmas[batch] <- c(sigmaResult) - - } - prediction$value[prediction$value>1] <- 1 - prediction$value[prediction$value<0] <- 0 - 
#prediction$value[batch] <- mu[,2] - #prediction$sigmas[batch] <- sigma[,2] - - #writeLines(paste0(dim(pred[,2]), collapse='-')) - #writeLines(paste0(pred[1,2], collapse='-')) - - attr(prediction, "metaData") <- list(predictionType = "binary") - aucVal <- computeAuc(prediction) - perform <- c(perform,aucVal) - - # add the fold predictions and compute AUC after loop - predictionMat$value[population$indexes==index] <- prediction$value - - }else{ - layerInput <- keras::layer_input(shape = c(dim(plpData)[2],dim(plpData)[3])) - if(useGPU){ - ##GRU layer - if(numberOfRNNLayer==1){ - layerOutput <- layerInput %>% - keras::layer_cudnn_gru(units=units, #time step x number of features - return_sequences=FALSE) %>% - keras::layer_dropout(layerDropout) - } - if(numberOfRNNLayer==2){ - layerOutput <- layerInput %>% - keras::layer_cudnn_gru(units=units, #time step x number of features - return_sequences=TRUE) %>% - keras::layer_dropout(layerDropout) %>% - keras::layer_cudnn_gru(units=units, return_sequences=FALSE) %>% - keras::layer_dropout(layerDropout) - } - if(numberOfRNNLayer==3){ - layerOutput <- layerInput %>% - keras::layer_cudnn_gru(units=units, #time step x number of features - return_sequences=TRUE) %>% - keras::layer_dropout(layerDropout) %>% - keras::layer_cudnn_gru(units=units, return_sequences=TRUE) %>% - keras::layer_dropout(layerDropout) %>% - keras::layer_cudnn_gru(units=units, return_sequences=FALSE) %>% - keras::layer_dropout(layerDropout) - } - }else{ - ##GRU layer - if(numberOfRNNLayer == 1){ - layerOutput <- layerInput %>% - keras::layer_gru(units=units, recurrent_dropout = recurrentDropout, - return_sequences=FALSE) - } - if(numberOfRNNLayer > 1 ){ - layerInput %>% # !ISSUE : "missing layerOutput <- "? - keras::layer_gru(units=units, recurrent_dropout = recurrentDropout, - return_sequences=TRUE) - } - if(numberOfRNNLayer == 2){ - layerOutput <- layerInput %>% - keras::layer_gru(units=units, recurrent_dropout = recurrentDropout, - return_sequences=FALSE) - } - if(numberOfRNNLayer==3){ - layerOutput <- layerInput %>% - keras::layer_gru(units=units, recurrent_dropout = recurrentDropout, - return_sequences=TRUE) %>% - # ISSUE- I removed layerOutput <- layerInput %>% as this was after a pipe - keras::layer_gru(units=units, recurrent_dropout = recurrentDropout, - return_sequences=FALSE) - } - } - - earlyStopping = keras::callback_early_stopping(monitor = "val_loss", patience=earlyStoppingPatience, - mode="auto",min_delta = earlyStoppingMinDelta) - reduceLr = keras::callback_reduce_lr_on_plateau(monitor="val_loss", factor =0.1, - patience = 5,mode = "auto", - min_delta = 1e-5, cooldown = 0, min_lr = 0) - - class_weight=list("0"=1,"1"=outcomeWeight) - - if(bayes){ - mean = layerOutput %>% - layer_concrete_dropout(layer = keras::layer_dense(units = output_dim)) - - log_var = layerOutput %>% - layer_concrete_dropout(layer = keras::layer_dense(units = output_dim)) - - output = keras::layer_concatenate(list(mean, log_var)) - model = keras::keras_model(layerInput, output) - #model = keras::keras_model(keras::layer_input(shape = c(dim(plpData)[2],dim(plpData)[3])), output) - model %>% keras::compile( - optimizer = "adam", - loss = heteroscedastic_loss, - metrics = c(keras::custom_metric("heteroscedastic_loss", heteroscedastic_loss)) - ) - - }else{ - model<- layerInput %>% - keras::layer_dense(units=2, activation='softmax') - model %>% keras::compile( - loss = 'binary_crossentropy', - metrics = c('accuracy'), - optimizer = keras::optimizer_rmsprop(lr = lr,decay = decay) - ) - } - - data <- 
plpData[population$rowId[population$indexes!=index],,] - - #Extract validation set first - 10k people or 5% - valN <- min(10000,sum(population$indexes!=index)*0.05) - val_rows <- sample(1:sum(population$indexes!=index), valN, replace=FALSE) - train_rows <- c(1:sum(population$indexes!=index))[-val_rows] - - sampling_generator <- function(data, population, batchSize, train_rows, index){ - function(){ - gc() - rows<-sample(train_rows, batchSize, replace=FALSE) - - list(as.array(data[rows,,]), population$y[population$indexes!=index,][rows,]) - } - } - - - history <- model %>% keras::fit_generator(sampling_generator(data,population,batchSize,train_rows, index), - steps_per_epoch = sum(population$indexes!=index)/batchSize, - epochs=epochs, - validation_data=list(as.array(data[val_rows,,]), - population$y[population$indexes!=index,][val_rows,]), - callbacks=list(earlyStopping,reduceLr), - class_weight=class_weight) - - # batch prediciton - maxVal <- sum(population$indexes==index) - batches <- lapply(1:ceiling(maxVal/batchSize), function(x) ((x-1)*batchSize+1):min((x*batchSize),maxVal)) - prediction <- population[population$indexes==index,] - prediction$value <- 0 - - if(bayes){ - prediction$epistemicUncertainty <- 0 - prediction$aleatoricUncertainty <- 0 - for(batch in batches){ - MC_samples <- array(0, dim = c(num_MC_samples, length(batch), 2 * output_dim)) - for (k in 1:num_MC_samples){ - MC_samples[k,, ] = stats::predict(model, as.array(plpData[population$rowId[population$indexes==index],,][batch,,])) - #keras::predict_proba(model, as.array(plpData[population$rowId[population$indexes==index],,][batch,,])) - } - pred <- apply(MC_samples[,,output_dim], 2, mean) - epistemicUncertainty <- apply(MC_samples[,,output_dim], 2, stats::var) - logVar = MC_samples[, , output_dim * 2] - - if(length(dim(logVar))<=1){ - aleatoricUncertainty = exp(mean(logVar)) - }else{ - aleatoricUncertainty = exp(colMeans(logVar)) - } - - prediction$value[batch] <- pred - prediction$epistemicUncertainty[batch] = epistemicUncertainty - prediction$aleatoricUncertainty[batch] = aleatoricUncertainty - - } - - }else{ - for(batch in batches){ - pred <- keras::predict_proba(model, as.array(plpData[population$rowId[population$indexes==index],,][batch,,])) - prediction$value[batch] <- pred[,2] - #writeLines(paste0(dim(pred[,2]), collapse='-')) - #writeLines(paste0(pred[1,2], collapse='-')) - } - } - prediction$value[prediction$value>1] <- 1 - prediction$value[prediction$value<0] <- 0 - attr(prediction, "metaData") <- list(predictionType = "binary") - aucVal <- computeAuc(prediction) - perform <- c(perform,aucVal) - - # add the fold predictions and compute AUC after loop - predictionMat$value[population$indexes==index] <- prediction$value - # add uncertainty - predictionMat$aleatoricUncertainty[population$indexes==index] <- prediction$aleatoricUncertainty - predictionMat$epistemicUncertainty[population$indexes==index] <- prediction$epistemicUncertainty - } - - } - - auc <- computeAuc(predictionMat) - foldPerm <- perform - - # Output ---------------------------------------------------------------- - param.val <- paste0('RNNlayer Number: ', numberOfRNNLayer, '-- units: ',units,'-- recurrentDropout: ', recurrentDropout, - 'layerDropout: ',layerDropout,'-- lr: ', lr, - '-- decay: ', decay,'-- outcomeWeight',outcomeWeight, '-- batchSize: ',batchSize, '-- epochs: ', epochs) - writeLines('==========================================') - writeLines(paste0('CIReNN with parameters:', param.val,' obtained an AUC of ',auc)) - 
writeLines('==========================================') - - } else { - if(useDeepEnsemble){ - predList<-list() - for (i in seq(numberOfEnsembleNetwork)){ - #print(i) - pred <- createEnsembleNetwork(train = train, plpData=plpData,population=population,batchSize=batchSize,epochs = epochs, - earlyStoppingMinDelta=earlyStoppingMinDelta, earlyStoppingPatience=earlyStoppingPatience, - train_rows=train_rows,index=index,lr=lr,decay=decay, - units=units,recurrentDropout=recurrentDropout,numberOfRNNLayer=numberOfRNNLayer, - layerDropout=layerDropout, useGPU = useGPU, maxGPUs = maxGPUs) - - predList <- append(predList,pred) - } - model <- predList - - # batch prediciton - maxVal <- nrow(population) - batches <- lapply(1:ceiling(maxVal/batchSize), function(x) ((x-1)*batchSize+1):min((x*batchSize),maxVal)) - prediction <- population - prediction$value <- 0 - prediction$sigmas <- 0 - - for(batch in batches){ - - for (i in seq(numberOfEnsembleNetwork)){ - if(i == 1){ - muMatrix <- data.frame() - sigmaMatrix <-data.frame() - } - c(mu,sigma) %<-% predList[[i]](inputs=list(as.array(plpData[batch,,]))) - muMatrix <- rbind(muMatrix,t(as.data.frame(mu[,2]))) - sigmaMatrix <- rbind(sigmaMatrix,t(as.data.frame(sigma[,2]))) - } - - muMean <- apply(muMatrix,2,mean) - muSq <- muMatrix^2 - sigmaSq <- sigmaMatrix^2 - sigmaMean <- apply(sigmaMatrix,2,mean) - sigmaResult = apply(muSq+sigmaSq,2, mean)- muMean^2 - - prediction$value[batch] <- c(muMean) - prediction$sigmas[batch] <- c(sigmaResult) - } - prediction$value[prediction$value>1] <- 1 - prediction$value[prediction$value<0] <- 0 - - - }else{ - layerInput <- keras::layer_input(shape = c(dim(plpData)[2],dim(plpData)[3])) - if(useGPU){ - ##GRU layer - if(numberOfRNNLayer==1){ - layerOutput <- layerInput %>% - keras::layer_cudnn_gru(units=units, #time step x number of features - return_sequences=FALSE) %>% - keras::layer_dropout(layerDropout) - } - if(numberOfRNNLayer==2){ - layerOutput <- layerInput %>% - keras::layer_cudnn_gru(units=units, #time step x number of features - return_sequences=TRUE) %>% - keras::layer_dropout(layerDropout) %>% - keras::layer_cudnn_gru(units=units, return_sequences=FALSE) %>% - keras::layer_dropout(layerDropout) - } - if(numberOfRNNLayer==3){ - layerOutput <- layerInput %>% - keras::layer_cudnn_gru(units=units, #time step x number of features - return_sequences=TRUE) %>% - keras::layer_dropout(layerDropout) %>% - keras::layer_cudnn_gru(units=units, return_sequences=TRUE) %>% - keras::layer_dropout(layerDropout) %>% - keras::layer_cudnn_gru(units=units, return_sequences=FALSE) %>% - keras::layer_dropout(layerDropout) - } - }else{ - ##GRU layer - if(numberOfRNNLayer==1){ - layerOutput <- layerInput %>% - keras::layer_gru(units=units, recurrent_dropout = recurrentDropout, - return_sequences=FALSE) - } - if(numberOfRNNLayer>1 ){ - layerInput %>% # ISSUE - "layerInput <- " missing? - keras::layer_gru(units=units, recurrent_dropout = recurrentDropout, - return_sequences=TRUE) - } - if(numberOfRNNLayer==2){ - layerOutput <- layerInput %>% - keras::layer_gru(units=units, recurrent_dropout = recurrentDropout, - return_sequences=FALSE) - } - if(numberOfRNNLayer==3){ - layerOutput <- layerInput %>% - keras::layer_gru(units=units, recurrent_dropout = recurrentDropout, - return_sequences=TRUE) %>% - #layerOutput <- layerInput %>% ISSUE - pipe above? 
- keras::layer_gru(units=units, recurrent_dropout = recurrentDropout, - return_sequences=FALSE) - } - } - - earlyStopping = keras::callback_early_stopping(monitor = "val_loss", patience=earlyStoppingPatience, - mode="auto",min_delta = earlyStoppingMinDelta) - reduceLr = keras::callback_reduce_lr_on_plateau(monitor="val_loss", factor =0.1, - patience = 5,mode = "auto", - min_delta = 1e-5, cooldown = 0, min_lr = 0) - - class_weight = list("0" = 1, - "1" = outcomeWeight) - - if(bayes){ - mean = layerOutput %>% - layer_concrete_dropout(layer = keras::layer_dense(units = output_dim)) - - log_var = layerOutput %>% - layer_concrete_dropout(layer = keras::layer_dense(units = output_dim)) - - output = keras::layer_concatenate(list(mean, log_var)) - model = keras::keras_model(layerInput, output) - #model = keras::keras_model(keras::layer_input(shape = c(dim(plpData)[2],dim(plpData)[3])), output) - model %>% keras::compile( - optimizer = "adam", - loss = heteroscedastic_loss, - metrics = c(keras::custom_metric("heteroscedastic_loss", heteroscedastic_loss)) - ) - - }else{ - model <- layerInput %>% - keras::layer_dense(units=2, activation='softmax') - model %>% keras::compile( - loss = 'binary_crossentropy', - metrics = c('accuracy'), - optimizer = keras::optimizer_rmsprop(lr = lr,decay = decay) - ) - } - - data <- plpData[population$rowId,,] - - #Extract validation set first - 10k people or 5% - valN <- min(10000,length(population$indexes)*0.05) - val_rows <- sample(1:length(population$indexes), valN, replace=FALSE) - train_rows <- c(1:length(population$indexes))[-val_rows] - - sampling_generator2 <- function(data, population, batchSize, train_rows){ - function(){ - gc() - rows<-sample(train_rows, batchSize, replace=FALSE) - list(as.array(data[rows,,]), population$y[rows,]) - } - } - - - history <- model %>% keras::fit_generator(sampling_generator2(data,population,batchSize,train_rows), - steps_per_epoch = nrow(population[-val_rows,])/batchSize, - epochs=epochs, - validation_data=list(as.array(data[val_rows,,]), - population$y[val_rows,]), - callbacks=list(earlyStopping,reduceLr), - class_weight=class_weight, - view_metrics=F) - - - # batched prediciton - maxVal <- nrow(population) - batches <- lapply(1:ceiling(maxVal/batchSize), function(x) ((x-1)*batchSize+1):min((x*batchSize),maxVal)) - prediction <- population - prediction$value <- 0 - - if(bayes){ - prediction$epistemicUncertainty <- 0 - prediction$aleatoricUncertainty <- 0 - for(batch in batches){ - MC_samples <- array(0, dim = c(num_MC_samples, length(batch), 2 * output_dim)) - for (k in 1:num_MC_samples){ - MC_samples[k,, ] = stats::predict(model, as.array(plpData[batch,,])) - #keras::predict_proba(model, as.array(plpData[population$rowId[population$indexes==index],,][batch,,])) - } - pred <- apply(MC_samples[,,output_dim], 2, mean) - epistemicUncertainty <- apply(MC_samples[,,output_dim], 2, stats::var) - logVar = MC_samples[, , output_dim * 2] - if(length(dim(logVar)) <= 1){ - aleatoricUncertainty = exp(mean(logVar)) - }else{ - aleatoricUncertainty = exp(colMeans(logVar)) - - } - prediction$value[batch] <- pred - prediction$epistemicUncertainty[batch] = epistemicUncertainty - prediction$aleatoricUncertainty[batch] = aleatoricUncertainty - } - - }else{ - for(batch in batches){ - pred <- keras::predict_on_batch(model, as.array(plpData[batch,,])) - prediction$value[batch] <- pred[,2] - } - - } - prediction$value[prediction$value>1] <- 1 - prediction$value[prediction$value<0] <- 0 - - attr(prediction, "metaData") <- list(predictionType = 
"binary") - auc <- computeAuc(prediction) - foldPerm <- auc - predictionMat <- prediction - - } - - } - result <- list(model=model, - auc=auc, - prediction = predictionMat, - hyperSum = unlist(list(numberOfRNNLayer=numberOfRNNLayer, - units=units, recurrentDropout=recurrentDropout, - layerDropout=layerDropout,lr =lr, decay=decay,outcomeWeight=outcomeWeight, - batchSize = batchSize, epochs= epochs, earlyStoppingMinDelta = earlyStoppingMinDelta, - earlyStoppingPatience=earlyStoppingPatience, - useDeepEnsemble = useDeepEnsemble, - numberOfEnsembleNetwork =numberOfEnsembleNetwork, - useVae = useVae, vaeDataSamplingProportion =vaeDataSamplingProportion ,vaeValidationSplit = vaeValidationSplit, - vaeBatchSize =vaeBatchSize, - vaeLatentDim = vaeLatentDim, - vaeIntermediateDim = vaeIntermediateDim, - vaeEpoch = vaeEpoch, vaeEpislonStd = vaeEpislonStd)) - ) - return(result) - -} - -#function for building vae -buildVae<-function(data, vaeValidationSplit= 0.2, vaeBatchSize = 100L, vaeLatentDim = 10L, vaeIntermediateDim = 256L, - vaeEpoch = 100L, vaeEpislonStd = 1.0, useGPU= FALSE, maxGPUs = NULL, temporal = TRUE){ - if (temporal){ - dataSample <- data %>% - apply(3, as.numeric) - } else{ - dataSample <- data - } - originalDim <- dim(dataSample)[2] - K <- keras::backend() - x <- keras::layer_input (shape =originalDim) - h <- keras::layer_dense (x, vaeIntermediateDim, activation = 'relu') - z_mean <- keras::layer_dense(h, vaeLatentDim) - z_log_var <- keras::layer_dense(h, vaeLatentDim) - - sampling<- function(arg){ - z_mean <- arg[,1:vaeLatentDim] - z_log_var <- arg[, (vaeLatentDim+1):(2*vaeLatentDim)] - - epsilon <- keras::k_random_normal( - shape = c(keras::k_shape(z_mean)[[1]]), - mean = 0., - stddev = vaeEpislonStd - ) - - z_mean + keras::k_exp(z_log_var/2)*epsilon - } - - z <- keras::layer_concatenate(list(z_mean, z_log_var)) %>% - keras::layer_lambda(sampling) - - #we instantiate these layers separately so as to reuse them later - decoder_h <- keras::layer_dense(units = vaeIntermediateDim, activation = 'relu') - decoder_mean <- keras::layer_dense (units = originalDim, activation = 'sigmoid') - h_decoded <- decoder_h (z) - x_decoded_mean <- decoder_mean(h_decoded) - - #end-to-end autoencoder - vae <- keras::keras_model (x,x_decoded_mean) - #encoder, from inputs to latent space - encoder <- keras::keras_model(x, z_mean) - - #generator, from latent space to reconstruted inputs - decoder_input <- keras::layer_input (shape = vaeLatentDim) - h_decoded_2 <- decoder_h(decoder_input) - x_decoded_mean_2 <- decoder_mean(h_decoded_2) - generator <- keras::keras_model (decoder_input, x_decoded_mean_2) - - vae_loss <- function(x, x_decoded_mean){ - xent_loss <- (originalDim/1.0)* keras::loss_binary_crossentropy(x, x_decoded_mean) - k1_loss <- -0.5 * keras::k_mean(1 + z_log_var - keras::k_square(z_mean) - keras::k_exp(z_log_var), axis = -1L) - xent_loss + k1_loss - } - #Activating parallelisation of GPU in encoder - if(useGPU & (maxGPUs>1) ){ - vae <- keras::multi_gpu_model(vae,gpus = maxGPUs) - } - - vae %>% keras::compile (optimizer = "rmsprop", loss = vae_loss) - #if (!is.null(dataValidation)) dataValidation<-list(dataValidation,dataValidation) - vaeEarlyStopping=keras::callback_early_stopping(monitor = "val_loss", patience=5,mode="auto",min_delta = 1e-3) - naanStopping = keras::callback_terminate_on_naan() - csvLogging = keras::callback_csv_logger (filename="./vae.csv",separator = ",", append =TRUE ) - - vae %>% keras::fit ( - dataSample,dataSample - ,shuffle = TRUE - ,epochs = vaeEpoch - ,batch_size = 
vaeBatchSize - #,validation_data = dataValidation - ,validation_split = vaeValidationSplit - ,callbacks = list(vaeEarlyStopping, - csvLogging, - naanStopping) - ) - return (list (vae,encoder)) -} - -#Defining Gaussian Layer for Deep Ensemble -GaussianLayer <- R6::R6Class("GaussianLayer", - inherit = keras::KerasLayer, - - public = list( - output_dim = NULL, - kernel_1 = NULL, - kernel_2 = NULL, - bias_1 = NULL, - bias_2 = NULL, - - initialize = function(output_dim){ - self$output_dim <- output_dim - }, - build = function(input_shape){ - super$build(input_shape) - - self$kernel_1 = self$add_weight(name = 'kernel_1', - shape = list(as.integer(input_shape[[2]]), self$output_dim), #list(30, self$output_dim),#shape = keras::shape(30, self$output_dim), - initializer = keras::initializer_glorot_normal(), - trainable = TRUE) - self$kernel_2 = self$add_weight(name = 'kernel_2', - shape = list(as.integer(input_shape[[2]]), self$output_dim),#list(30, self$output_dim), #shape = keras::shape(30, self$output_dim), - initializer = keras::initializer_glorot_normal(), - trainable = TRUE) - self$bias_1 = self$add_weight(name = 'bias_1', - shape = list(self$output_dim), #shape = keras::shape(self$output_dim), - initializer = keras::initializer_glorot_normal(), - trainable = TRUE) - self$bias_2 = self$add_weight(name = 'bias_2', - shape = list(self$output_dim), #shape = keras::shape(self$output_dim), - initializer = keras::initializer_glorot_normal(), - trainable = TRUE) - }, - - call = function(x, mask = NULL){ - output_mu = keras::k_dot(x, self$kernel_1) + self$bias_1 - output_sig = keras::k_dot(x, self$kernel_2) + self$bias_2 - output_sig_pos = keras::k_log(1 + keras::k_exp(output_sig)) + 1e-06 - return (list(output_mu, output_sig_pos)) - }, - - - compute_output_shape = function(input_shape){ - return (list ( - list(input_shape[[1]], self$output_dim), - list(input_shape[[1]], self$output_dim) ) - ) - } - ) -) - -#define layer wrapper function for Deep Ensemble -layer_custom <- function(object, output_dim, name = NULL, trainable = TRUE) { - keras::create_layer(GaussianLayer, object, list( - output_dim = as.integer(output_dim), - name = name, - trainable = trainable - )) -} - -#Custom loss function for Deep Ensemble -custom_loss <- function(sigma){ - gaussian_loss <- function(y_true,y_pred){ - tensorflow::tf$reduce_mean(0.5*tensorflow::tf$log(sigma) + 0.5*tensorflow::tf$div(tensorflow::tf$square(y_true - y_pred), sigma)) + 1e-6 - } - return(gaussian_loss) -} - -#Create Deep Ensemble Network function -createEnsembleNetwork<-function(train, plpData,population,batchSize,epochs, earlyStoppingPatience, earlyStoppingMinDelta, - train_rows=NULL,index=NULL,lr,decay, - units,recurrentDropout,numberOfRNNLayer,layerDropout, useGPU = useGPU, maxGPUs = maxGPUs){ - - mu <- function(){return(NULL)} - sigma <- function(){return(NULL)} - - if(useGPU){ - ##GRU layer - layerInput <- keras::layer_input(shape = c(dim(plpData)[2],dim(plpData)[3])) - - if(numberOfRNNLayer==1){ - layers <- layerInput %>% keras::layer_cudnn_gru(units=units, #time step x number of features - return_sequences=FALSE) %>% - keras::layer_dropout(layerDropout) %>% - keras::layer_dense(units=2, activation='softmax') - } - if(numberOfRNNLayer==2){ - layers <- layerInput %>% keras::layer_cudnn_gru(units=units, #time step x number of features - return_sequences=TRUE) %>% - keras::layer_dropout(layerDropout) %>% - keras::layer_gru(units=units, return_sequences=FALSE) %>% - keras::layer_dropout(layerDropout) %>% - keras::layer_dense(units=2, 
activation='softmax') - } - if(numberOfRNNLayer==3){ - layers <- layerInput %>% keras::layer_cudnn_gru(units=units, #time step x number of features - return_sequences=TRUE) %>% - keras::layer_dropout(layerDropout) %>% - keras::layer_cudnn_gru(units=units, return_sequences=TRUE) %>% - keras::layer_dropout(layerDropout) %>% - keras::layer_cudnn_gru(units=units, return_sequences=FALSE) %>% - keras::layer_dropout(layerDropout) %>% - keras::layer_dense(units=2, activation='softmax') - } - }else{ - ##GRU layer - layerInput <- keras::layer_input(shape = c(dim(plpData)[2],dim(plpData)[3])) - - if(numberOfRNNLayer==1){ - layers <- layerInput %>% keras::layer_gru(units=units, recurrent_dropout = recurrentDropout, #time step x number of features - return_sequences=FALSE) %>% keras::layer_dropout(layerDropout) %>% - keras::layer_dense(units=2, activation='softmax') - } - if(numberOfRNNLayer==2){ - layers <- layerInput %>% keras::layer_gru(units=units, recurrent_dropout = recurrentDropout, #time step x number of features - return_sequences=TRUE) %>% - keras::layer_gru(units=units, recurrent_dropout = recurrentDropout,return_sequences=FALSE)%>% keras::layer_dropout(layerDropout) %>% - keras::layer_dense(units=2, activation='softmax') - } - if(numberOfRNNLayer==3){ - layers <- layerInput %>% keras::layer_gru(units=units, recurrent_dropout = recurrentDropout, #time step x number of features - return_sequences=TRUE) %>% - keras::layer_gru(units=units, recurrent_dropout = recurrentDropout,return_sequences=TRUE) %>% - keras::layer_gru(units=units, recurrent_dropout = recurrentDropout,return_sequences=FALSE) %>% keras::layer_dropout(layerDropout) %>% - keras::layer_dense(units=2, activation='softmax') - } - } - - c(mu,sigma) %<-% layer_custom(layers, 2, name = 'main_output') - - model <- keras::keras_model(inputs = layerInput,outputs=mu) - - earlyStopping=keras::callback_early_stopping(monitor = "val_loss", patience=earlyStoppingPatience, - mode="auto",min_delta = earlyStoppingMinDelta) - #Currently using Parallel GPU makes errors in Deep Ensemble - #if(useGPU & (maxGPUs>1) ) model <- keras::multi_gpu_model(model,gpus = maxGPUs) - - model %>% keras::compile( - loss = custom_loss(!!sigma), - optimizer = keras::optimizer_rmsprop(lr = lr,decay = decay) - ) - - - if(!is.null(population$indexes) && train==T){ - data <- plpData[population$rowId[population$indexes!=index],,] - #Extract validation set first - 10k people or 5% - valN <- min(10000,sum(population$indexes!=index)*0.05) - val_rows<-sample(1:sum(population$indexes!=index), valN, replace=FALSE) - train_rows <- c(1:sum(population$indexes!=index))[-val_rows] - - sampling_generator<-function(data, population, batchSize, train_rows, index){ - function(){ - gc() - rows<-sample(train_rows, batchSize, replace=FALSE) - - list(as.array(data[rows,,]), population$y[population$indexes!=index,][rows,]) - } - } - - history <- model %>% keras::fit_generator(sampling_generator(data,population,batchSize,train_rows, index), - steps_per_epoch = sum(population$indexes!=index)/batchSize, - epochs=epochs, - validation_data=list(as.array(data[val_rows,,]), - population$y[population$indexes!=index,][val_rows,])#, - ,callbacks=list(earlyStopping) - #callbacks=list(earlyStopping,reduceLr), - ) - - }else{ - data <- plpData[population$rowId,,] - - #Extract validation set first - 10k people or 5% - valN <- min(10000,length(population$indexes)*0.05) - val_rows<-sample(1:length(population$indexes), valN, replace=FALSE) - train_rows <- c(1:length(population$indexes))[-val_rows] - - 
sampling_generator2<-function(data, population, batchSize, train_rows){ - function(){ - gc() - rows<-sample(train_rows, batchSize, replace=FALSE) - list(as.array(data[rows,,]), population$y[rows,]) - } - } - - - history <- model %>% keras::fit_generator(sampling_generator2(data,population,batchSize,train_rows), - steps_per_epoch = nrow(population[-val_rows,])/batchSize, - epochs=epochs, - validation_data=list(as.array(data[val_rows,,]), - population$y[val_rows,]), - callbacks=list(earlyStopping), - view_metrics=F) - - } - #ParallelLogger::logInfo('right before get_intermediate') - layer_name = 'main_output' - get_intermediate = keras::k_function(inputs=list(model$input), - outputs=model$get_layer(layer_name)$output) - #ParallelLogger::logInfo('right after get_intermediate') - return(get_intermediate) -} - -#Custom layer for Bayesian Drop Out Layer -ConcreteDropout <- R6::R6Class("ConcreteDropout", - - inherit = keras::KerasWrapper, - - public = list( - weight_regularizer = NULL, - dropout_regularizer = NULL, - init_min = NULL, - init_max = NULL, - is_mc_dropout = NULL, - supports_masking = TRUE, - p_logit = NULL, - p = NULL, - - initialize = function(weight_regularizer, - dropout_regularizer, - init_min, - init_max, - is_mc_dropout) { - self$weight_regularizer <- weight_regularizer - self$dropout_regularizer <- dropout_regularizer - self$is_mc_dropout <- is_mc_dropout - self$init_min <- keras::k_log(init_min) - keras::k_log(1 - init_min) - self$init_max <- keras::k_log(init_max) - keras::k_log(1 - init_max) - }, - - build = function(input_shape) { - super$build(input_shape) - - self$p_logit <- super$add_weight( - name = "p_logit", - shape = keras::shape(1), - initializer = keras::initializer_random_uniform(self$init_min, self$init_max), - trainable = TRUE - ) - - self$p <- keras::k_sigmoid(self$p_logit) - - input_dim <- input_shape[[2]] - - weight <- private$py_wrapper$layer$kernel - - kernel_regularizer <- self$weight_regularizer * - keras::k_sum(keras::k_square(weight)) / - (1 - self$p) - - dropout_regularizer <- self$p * keras::k_log(self$p) - dropout_regularizer <- dropout_regularizer + - (1 - self$p) * keras::k_log(1 - self$p) - dropout_regularizer <- dropout_regularizer * - self$dropout_regularizer * - keras::k_cast(input_dim, keras::k_floatx()) - - regularizer <- keras::k_sum(kernel_regularizer + dropout_regularizer) - super$add_loss(regularizer) - }, - - concrete_dropout = function(x) { - eps <- keras::k_cast_to_floatx(keras::k_epsilon()) - temp <- 0.1 - - unif_noise <- keras::k_random_uniform(shape = keras::k_shape(x)) - - drop_prob <- keras::k_log(self$p + eps) - - keras::k_log(1 - self$p + eps) + - keras::k_log(unif_noise + eps) - - keras::k_log(1 - unif_noise + eps) - drop_prob <- keras::k_sigmoid(drop_prob / temp) - - random_tensor <- 1 - drop_prob - - retain_prob <- 1 - self$p - x <- x * random_tensor - x <- x / retain_prob - x - }, - - call = function(x, mask = NULL, training = NULL) { - if (self$is_mc_dropout) { - super$call(self$concrete_dropout(x)) - } else { - k_in_train_phase( - function() - super$call(self$concrete_dropout(x)), - super$call(x), - training = training - ) - } - } - ) -) -#define layer wrapper for Bayesian Drop-out layer -layer_concrete_dropout <- function(object, - layer, - weight_regularizer = 1e-6, - dropout_regularizer = 1e-5, - init_min = 0.1, - init_max = 0.1, - is_mc_dropout = TRUE, - name = NULL, - trainable = TRUE) { - keras::create_wrapper(ConcreteDropout, object, list( - layer = layer, - weight_regularizer = weight_regularizer, - 
dropout_regularizer = dropout_regularizer, - init_min = init_min, - init_max = init_max, - is_mc_dropout = is_mc_dropout, - name = name, - trainable = trainable - )) -} diff --git a/R/CNNTorch.R b/R/CNNTorch.R deleted file mode 100644 index b2d021d..0000000 --- a/R/CNNTorch.R +++ /dev/null @@ -1,178 +0,0 @@ -# @file CNNTorch.R -# -# Copyright 2020 Observational Health Data Sciences and Informatics -# -# This file is part of PatientLevelPrediction -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#' Create setting for CNN model with python -#' @param nbfilters The number of filters -#' @param epochs The number of epochs -#' @param seed A seed for the model -#' @param class_weight The class weight used for imbalanced data: -#' 0: Inverse ratio between positives and negatives -#' -1: Focal loss -#' @param type It can be normal 'CNN', 'CNN_LSTM', CNN_MLF' with multiple kernels with different kernel size, -#' 'CNN_MIX', 'ResNet' and 'CNN_MULTI' -#' -#' @examples -#' \dontrun{ -#' model.cnnTorch <- setCNNTorch() -#' } -#' @export -setCNNTorch <- function(nbfilters=c(16, 32), epochs=c(20, 50), seed=0, class_weight = 0, type = 'CNN'){ - - ParallelLogger::logWarn('This model has broken - please use setCNN() or setCNN2() instead ') - - # set seed - if(is.null(seed[1])){ - seed <- as.integer(sample(100000000,1)) - } - - result <- list(model='fitCNNTorch', param=split(expand.grid(nbfilters=nbfilters, - epochs=epochs, seed=seed[1], - class_weight = class_weight, type = type), - 1:(length(nbfilters)*length(epochs)) ), - name='CNN Torch') - - class(result) <- 'modelSettings' - - return(result) -} - - -fitCNNTorch <- function(population, plpData, param, search='grid', quiet=F, - outcomeId, cohortId, ...){ - - # check plpData is libsvm format or convert if needed - if (!FeatureExtraction::isCovariateData(plpData$covariateData)) - stop("Needs correct covariateData") - - if(colnames(population)[ncol(population)]!='indexes'){ - warning('indexes column not present as last column - setting all index to 1') - population$indexes <- rep(1, nrow(population)) - } - - - start <- Sys.time() - - population$rowIdPython <- population$rowId-1 #to account for python/r index difference #subjectId - pPopulation <- as.matrix(population[,c('rowIdPython','outcomeCount','indexes')]) - - result <- toSparseTorchPython(plpData,population, map=NULL, temporal=T) - - outLoc <- createTempModelLoc() - # clear the existing model pickles - for(file in dir(outLoc)) - file.remove(file.path(outLoc,file)) - - # do cross validation to find hyperParameter - hyperParamSel <- lapply(param, function(x) do.call(trainCNNTorch, listAppend(x, - list(plpData = result$data, - population = pPopulation, - train=TRUE, - modelOutput=outLoc)) )) - - hyperSummary <- cbind(do.call(rbind, param), unlist(hyperParamSel)) - - #now train the final model and return coef - bestInd <- which.max(abs(unlist(hyperParamSel)-0.5))[1] - finalModel <- do.call(trainCNNTorch, listAppend(param[[bestInd]], - list(plpData = result$data, - population = pPopulation, - train=FALSE, - 
modelOutput=outLoc))) - - - covariateRef <- as.data.frame(plpData$covariateData$covariateRef) - incs <- rep(1, nrow(covariateRef)) - covariateRef$included <- incs - covariateRef$covariateValue <- rep(0, nrow(covariateRef)) - - modelTrained <- file.path(outLoc) - param.best <- param[[bestInd]] - - comp <- start-Sys.time() - - # train prediction - pred <- as.matrix(finalModel) - pred[,1] <- pred[,1] + 1 # adding one to convert from python to r indexes - colnames(pred) <- c('rowId','outcomeCount','indexes', 'value') - pred <- as.data.frame(pred) - attr(pred, "metaData") <- list(predictionType="binary") - - pred$value <- 1-pred$value - prediction <- merge(population, pred[,c('rowId','value')], by='rowId') - - - # return model location - result <- list(model = modelTrained, - trainCVAuc = -1, # ToDo decide on how to deal with this - hyperParamSearch = hyperSummary, - modelSettings = list(model='fitCNNTorch',modelParameters=param.best), - metaData = plpData$metaData, - populationSettings = attr(population, 'metaData'), - outcomeId=outcomeId, - cohortId=cohortId, - varImp = covariateRef, - trainingTime =comp, - dense=1, - covariateMap=result$map, # I think this is need for new data to map the same? - predictionTrain = prediction - ) - class(result) <- 'plpModel' - attr(result, 'type') <- 'pythonReticulate' - attr(result, 'predictionType') <- 'binary' - - return(result) -} - - -trainCNNTorch <- function(plpData, population, epochs=50, nbfilters = 16, seed=0, class_weight= 0, type = 'CNN', train=TRUE, modelOutput, quiet=F){ - - train_deeptorch <- function(){return(NULL)} - - python_dir <- system.file(package='PatientLevelPrediction','python') - e <- environment() - reticulate::source_python(system.file(package='PatientLevelPrediction','python','deepTorchFunctions.py'), envir = e) - - - result <- train_deeptorch(population = population, - plpData = plpData, - epochs = as.integer(epochs), - nbfilters = as.integer(nbfilters), - seed = as.integer(seed), - class_weight = as.double(class_weight), - model_type = as.character(type), - train = train, - modelOutput = modelOutput, - quiet = quiet - ) - - if(train){ - # then get the prediction - pred <- as.matrix(result) - colnames(pred) <- c('rowId','outcomeCount','indexes', 'value') - pred <- as.data.frame(pred) - attr(pred, "metaData") <- list(predictionType="binary") - - pred$value <- 1-pred$value - auc <- computeAuc(pred) - writeLines(paste0('Model obtained CV AUC of ', auc)) - return(auc) - } - - return(result) - -} diff --git a/R/CovNN.R b/R/CovNN.R deleted file mode 100644 index 4c05ffb..0000000 --- a/R/CovNN.R +++ /dev/null @@ -1,535 +0,0 @@ -# @file CovNN.R -# -# Copyright 2020 Observational Health Data Sciences and Informatics -# -# This file is part of PatientLevelPrediction -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -#' Create setting for multi-resolution CovNN model (stucture based on https://arxiv.org/pdf/1608.00647.pdf CNN1) -#' -#' @param batchSize The number of samples to used in each batch during model training -#' @param outcomeWeight The weight assined to the outcome (make greater than 1 to reduce unballanced label issue) -#' @param lr The learning rate -#' @param decay The decay of the learning rate -#' @param dropout [currently not used] the dropout rate for regularisation -#' @param epochs The number of times data is used to train the model (e.g., epoches=1 means data only used once to train) -#' @param filters The number of columns output by each convolution -#' @param kernelSize The number of time dimensions used for each convolution -#' @param loss The loss function implemented -#' @param seed The random seed -#' -#' @examples -#' \dontrun{ -#' model.CovNN <- setCovNN() -#' } -#' @export -setCovNN <- function(batchSize = 1000, - outcomeWeight=1, - lr=0.00001, - decay=0.000001, - dropout=0, - epochs = 10, - filters = 3, kernelSize = 10, - loss = "binary_crossentropy", - seed=NULL ){ - #[TODO: add input checks...] - - ensure_installed("keras") - - if(!is.null(seed)){ - warning('seed currently not implemented in CovNN') - } - - - result <- list(model='fitCovNN', param=split(expand.grid( - batchSize=batchSize, - outcomeWeight=outcomeWeight, - lr=lr, - decay=decay, - dropout=dropout, - epochs= epochs,filters=filters, - kernelSize=kernelSize,loss =loss, - seed=ifelse(is.null(seed),'NULL', seed)), - 1:(length(batchSize)*length(outcomeWeight)*length(epochs)* - length(filters)*length(lr)*length(decay)* - length(kernelSize)*length(loss)*max(1,length(seed)))), - name='CovNN' - ) - - class(result) <- 'modelSettings' - return(result) -} - - -fitCovNN <- function(plpData,population, param, search='grid', quiet=F, - outcomeId, cohortId, ...){ - # check plpData is coo format: - if (!FeatureExtraction::isCovariateData(plpData$covariateData)) - stop("Needs correct covariateData") - if(is.null(plpData$timeRef)){ - stop('Data not temporal...') - } - - metaData <- attr(population, 'metaData') - if(!is.null(population$indexes)) - population <- population[population$indexes>0,] - attr(population, 'metaData') <- metaData - - start<-Sys.time() - - result<- toSparseM(plpData,population,map=NULL, temporal=T) - - data <- result$data#[population$rowId,,] - #data<-as.array(data) -- cant make dense on big data! 
- - #one-hot encoding - population$y <- keras::to_categorical(population$outcomeCount, 2) - #colnames(population$y) <- c('0','1') - - # do cross validation to find hyperParameter - datas <- list(population=population, plpData=data) - hyperParamSel <- lapply(param, function(x) do.call(trainCovNN, c(x,datas,train=TRUE) )) - - hyperSummary <- cbind(do.call(rbind, lapply(hyperParamSel, function(x) x$hyperSum))) - hyperSummary <- as.data.frame(hyperSummary) - hyperSummary$auc <- unlist(lapply(hyperParamSel, function (x) x$auc)) - hyperParamSel<-unlist(lapply(hyperParamSel, function(x) x$auc)) - - #now train the final model and return coef - bestInd <- which.max(abs(unlist(hyperParamSel)-0.5))[1] - finalModel<-do.call(trainCovNN, c(param[[bestInd]],datas, train=FALSE)) - - covariateRef <- as.data.frame(plpData$covariateData$covariateRef) - incs <- rep(1, nrow(covariateRef)) - covariateRef$included <- incs - covariateRef$covariateValue <- rep(0, nrow(covariateRef)) - - #modelTrained <- file.path(outLoc) - param.best <- param[[bestInd]] - - comp <- start-Sys.time() - - # train prediction - prediction <- finalModel$prediction - finalModel$prediction <- NULL - - # return model location - result <- list(model = finalModel$model, - trainCVAuc = -1, # ToDo decide on how to deal with this - hyperParamSearch = hyperSummary, - modelSettings = list(model='fitCovNN',modelParameters=param.best), - metaData = plpData$metaData, - populationSettings = attr(population, 'metaData'), - outcomeId=outcomeId, - cohortId=cohortId, - varImp = covariateRef, - trainingTime =comp, - covariateMap=result$map, - predictionTrain = prediction - ) - class(result) <- 'plpModel' - attr(result, 'type') <- 'deepMulti' - attr(result, 'inputs') <- '3' - attr(result, 'predictionType') <- 'binary' - - return(result) -} - -trainCovNN<-function(plpData, population, - outcomeWeight=1, lr=0.0001, decay=0.9, - dropout=0.5, filters=3, - kernelSize = dim(plpData)[3], - batchSize, epochs, loss= "binary_crossentropy", - seed=NULL, train=TRUE){ - - if(!is.null(population$indexes) && train==T){ - writeLines(paste('Training covolutional multi-resolution neural network with ',length(unique(population$indexes)),' fold CV')) - - index_vect <- unique(population$indexes) - perform <- c() - - # create prediction matrix to store all predictions - predictionMat <- population - predictionMat$value <- 0 - attr(predictionMat, "metaData") <- list(predictionType = "binary") - - if(kernelSize>dim(plpData)[3]){ - kernelSize <- dim(plpData)[3] -1 - warning('kernelsize reduced') - } - - for(index in 1:length(index_vect )){ - writeLines(paste('Fold ',index, ' -- with ', sum(population$indexes!=index),'train rows')) - - #submodel1 <- keras::keras_model_sequential() - - submodel1_input <- keras::layer_input(shape=c(dim(plpData)[2], dim(plpData)[3]), - name='submodel1_input') - submodel1_output <- submodel1_input %>% - # Begin with 1D convolutional layer - keras::layer_max_pooling_1d(pool_size = kernelSize) %>% - keras::layer_conv_1d( - #input_shape = c(dim(plpData)[2], dim(plpData)[3]+1-kernelSize), - filters = filters, - kernel_size = floor(dim(plpData)[2]/kernelSize), - padding = "valid" - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - keras::layer_flatten() - - #submodel2 <- keras::keras_model_sequential() - #submodel2 %>% - # keras::layer_reshape(input_shape=c(dim(plpData)[2], dim(plpData)[3]), - # target_shape = c(dim(plpData)[2], dim(plpData)[3])) %>% - submodel2_input <- 
keras::layer_input(shape=c(dim(plpData)[2], dim(plpData)[3]), - name='submodel2_input') - submodel2_output <- submodel2_input %>% - # Begin with 1D convolutional layer - keras::layer_max_pooling_1d(pool_size = floor(sqrt(kernelSize))) %>% - keras::layer_conv_1d( - #input_shape = c(dim(plpData)[2], dim(plpData)[3]+1-floor(sqrt(kernelSize))), - filters = filters, - kernel_size = floor(dim(plpData)[2]/floor(sqrt(kernelSize))), - padding = "valid" - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - keras::layer_flatten() - - - #submodel3 <- keras::keras_model_sequential() - #submodel3 %>% - submodel3_input <- keras::layer_input(shape=c(dim(plpData)[2], dim(plpData)[3]), - name='submodel3_input') - submodel3_output <- submodel3_input %>% - # Begin with 1D convolutional layer - keras::layer_conv_1d( - input_shape = c(dim(plpData)[2], dim(plpData)[3]), - filters = filters, - kernel_size = kernelSize, - padding = "valid" - ) %>% - # Normalize the activations of the previous layer - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - keras::layer_max_pooling_1d(pool_size = floor(sqrt(kernelSize))) %>% - keras::layer_conv_1d( - filters = filters, - kernel_size = floor((dim(plpData)[2]+1-kernelSize)/floor(sqrt(kernelSize))), - padding = "valid", - use_bias = T - ) %>% - keras::layer_flatten() - - #model <- keras::keras_model_sequential() - - deep_output <- keras::layer_concatenate(list(submodel1_output, - submodel2_output, - submodel3_output)) %>% - keras::layer_dropout(rate=dropout) %>% - # add fully connected layer 2 - keras::layer_dense( - units = 100, - activation = "linear", use_bias=T - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - - # =========== FULLY CONNECTED LAYER 2 - # add drop out of 0.5 - keras::layer_dropout(rate=dropout) %>% - # add fully connected layer 2 - keras::layer_dense( - units = 100, - activation = "linear", use_bias=T - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - - # =========== FINAL LAYER - keras::layer_dropout(rate=dropout) %>% - keras::layer_dense(name = 'final', - units = 2, - activation = "linear", use_bias=T - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'sigmoid', name='deep_output') - - model <- keras::keras_model( - inputs = c(submodel1_input, submodel2_input, submodel3_input), - outputs = c(deep_output) - ) - - - # Prepare model for training - model %>% keras::compile( - loss = "binary_crossentropy", - metrics = c('accuracy'), - optimizer = keras::optimizer_rmsprop(lr = lr,decay = decay) - ) - earlyStopping=keras::callback_early_stopping(monitor = "val_loss", patience=10,mode="auto",min_delta = 1e-4) - reduceLr=keras::callback_reduce_lr_on_plateau(monitor="val_loss", factor =0.1, - patience = 5,mode = "auto", min_delta = 1e-5, cooldown = 0, min_lr = 0) - - class_weight=list("0"=1,"1"=outcomeWeight) - - data <- plpData[population$rowId[population$indexes!=index],,] - - #Extract validation set first - 10k people or 5% - valN <- min(10000,sum(population$indexes!=index)*0.05) - val_rows<-sample(1:sum(population$indexes!=index), valN, replace=FALSE) - train_rows <- c(1:sum(population$indexes!=index))[-val_rows] - - sampling_generator<-function(data, population, batchSize, train_rows, index){ - function(){ - gc() - rows<-sample(train_rows, batchSize, replace=FALSE) - - list(list(as.array(data[rows,,]), - as.array(data[rows,,]), - 
as.array(data[rows,,])), - population$y[population$indexes!=index,1:2][rows,]) - } - } - - - #print(table(population$y)) - - if(length(train_rows) < batchSize){ - # checking if this fixes issue with batchsize too big - batchSize <- length(train_rows) - ParallelLogger::logInfo('Reduce batchSize to training size') - } - - history <- model %>% keras::fit_generator(sampling_generator(data,population,batchSize,train_rows, index), - steps_per_epoch = length(train_rows)/batchSize, - epochs=epochs, - validation_data=list(list(as.array(data[val_rows,,]), - as.array(data[val_rows,,]), - as.array(data[val_rows,,])), - population$y[population$indexes!=index,1:2][val_rows,]), - callbacks=list(earlyStopping,reduceLr), - class_weight=class_weight) - - - - - # batch prediciton - maxVal <- sum(population$indexes==index) - batches <- lapply(1:ceiling(maxVal/batchSize), function(x) ((x-1)*batchSize+1):min((x*batchSize),maxVal)) - prediction <- population[population$indexes==index,] - prediction$value <- 0 - for(batch in batches){ - pred <- keras::predict_on_batch(model, list(as.array(plpData[population$rowId[population$indexes==index],,][batch,,]), - as.array(plpData[population$rowId[population$indexes==index],,][batch,,]), - as.array(plpData[population$rowId[population$indexes==index],,][batch,,]))) - prediction$value[batch] <- as.double(pred[,2]) - } - - attr(prediction, "metaData") <- list(predictionType = "binary") - aucVal <- computeAuc(prediction) - perform <- c(perform,aucVal) - - # add the fold predictions and compute AUC after loop - predictionMat$value[population$indexes==index] <- prediction$value - - } - - auc <- computeAuc(predictionMat) - foldPerm <- perform - - # Output ---------------------------------------------------------------- - param.val <- paste0('outcomeWeight: ', outcomeWeight, - '-- kernelSize: ',paste0(kernelSize,collapse ='-'), - '-- filters: ', filters, '--loss: ', loss, '-- lr: ', lr, '-- decay: ', decay, - '-- dropout: ', dropout, '-- batchSize: ',batchSize, '-- epochs: ', epochs) - writeLines('==========================================') - writeLines(paste0('CovNN with parameters:', param.val,' obtained an AUC of ',auc)) - writeLines('==========================================') - } else { - - #Initialize model - submodel1_input <- keras::layer_input(shape=c(dim(plpData)[2], dim(plpData)[3]), - name='submodel1_input') - submodel1_output <- submodel1_input %>% - # Begin with 1D convolutional layer - keras::layer_max_pooling_1d(pool_size = kernelSize) %>% - keras::layer_conv_1d( - #input_shape = c(dim(plpData)[2], dim(plpData)[3]+1-kernelSize), - filters = filters, - kernel_size = floor(dim(plpData)[2]/kernelSize), - padding = "valid" - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - keras::layer_flatten() - - submodel2_input <- keras::layer_input(shape=c(dim(plpData)[2], dim(plpData)[3]), - name='submodel2_input') - submodel2_output <- submodel2_input %>% - # Begin with 1D convolutional layer - keras::layer_max_pooling_1d(pool_size = floor(sqrt(kernelSize))) %>% - keras::layer_conv_1d( - #input_shape = c(dim(plpData)[2], dim(plpData)[3]+1-floor(sqrt(kernelSize))), - filters = filters, - kernel_size = floor(dim(plpData)[2]/floor(sqrt(kernelSize))), - padding = "valid" - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - keras::layer_flatten() - - - submodel3_input <- keras::layer_input(shape=c(dim(plpData)[2], dim(plpData)[3]), - name='submodel3_input') - submodel3_output <- 
submodel3_input %>% - # Begin with 1D convolutional layer - keras::layer_conv_1d( - input_shape = c(dim(plpData)[2], dim(plpData)[3]), - filters = filters, - kernel_size = kernelSize, - padding = "valid" - ) %>% - # Normalize the activations of the previous layer - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - keras::layer_max_pooling_1d(pool_size = floor(sqrt(kernelSize))) %>% - keras::layer_conv_1d( - filters = filters, - kernel_size = floor((dim(plpData)[2]+1-kernelSize)/floor(sqrt(kernelSize))), - padding = "valid", - use_bias = T - ) %>% - keras::layer_flatten() - - deep_output <- keras::layer_concatenate(list(submodel1_output, - submodel2_output, - submodel3_output)) %>% - keras::layer_dropout(rate=dropout) %>% - # add fully connected layer 2 - keras::layer_dense( - units = 100, - activation = "linear", use_bias=T - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - - # =========== FULLY CONNECTED LAYER 2 - # add drop out of 0.5 - keras::layer_dropout(rate=dropout) %>% - # add fully connected layer 2 - keras::layer_dense( - units = 100, - activation = "linear", use_bias=T - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - - # =========== FINAL LAYER - keras::layer_dropout(rate=dropout) %>% - keras::layer_dense(name = 'final', - units = 2, - activation = "linear", use_bias=T - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'sigmoid', name='deep_output') - - model <- keras::keras_model( - inputs = c(submodel1_input, submodel2_input, submodel3_input), - outputs = c(deep_output) - ) - - - # Prepare model for training - model %>% keras::compile( - loss = "binary_crossentropy",#loss, - metrics = c('accuracy'), - optimizer = keras::optimizer_rmsprop(lr = lr,decay = decay) - ) - earlyStopping=keras::callback_early_stopping(monitor = "val_loss", patience=10,mode="auto",min_delta = 1e-4) - reduceLr=keras::callback_reduce_lr_on_plateau(monitor="val_loss", factor =0.1, - patience = 5,mode = "auto", min_delta = 1e-5, cooldown = 0, min_lr = 0) - - - class_weight=list("0"=1,"1"=outcomeWeight) - - data <- plpData[population$rowId,,] - - #Extract validation set first - 10k people or 5% - valN <- min(10000,length(population$indexes)*0.05) - val_rows<-sample(1:length(population$indexes), valN, replace=FALSE) - train_rows <- c(1:length(population$indexes))[-val_rows] - - - sampling_generator2<-function(data, population, batchSize, train_rows){ - function(){ - gc() - rows<-sample(train_rows, batchSize, replace=FALSE) - list(list(as.array(data[rows,,]), - as.array(data[rows,,]), - as.array(data[rows,,])), population$y[rows,]) - } - } - - - if(length(train_rows) < batchSize){ - # checking if this fixes issue with batchsize too big - batchSize <- length(train_rows) - ParallelLogger::logInfo('Reduce batchSize to training size') - } - - history <- model %>% keras::fit_generator(sampling_generator2(data,population,batchSize,train_rows), - steps_per_epoch = length(train_rows)/batchSize, - epochs=epochs, - validation_data=list(list(as.array(data[val_rows,,]), - as.array(data[val_rows,,]), - as.array(data[val_rows,,])), - population$y[val_rows,]), - callbacks=list(earlyStopping,reduceLr), - class_weight=class_weight, - view_metrics=F) - - # batched prediciton - maxVal <- nrow(population) - batches <- lapply(1:ceiling(maxVal/batchSize), function(x) ((x-1)*batchSize+1):min((x*batchSize),maxVal)) - prediction <- population - 
prediction$value <- 0 - for(batch in batches){ - pred <- keras::predict_on_batch(model, list(as.array(plpData[batch,,]), - as.array(plpData[batch,,]), - as.array(plpData[batch,,]))) - prediction$value[batch] <- as.double(pred[,2]) - } - - attr(prediction, "metaData") <- list(predictionType = "binary") - auc <- computeAuc(prediction) - foldPerm <- auc - predictionMat <- prediction - } - - result <- list(model=model, - auc=auc, - prediction = predictionMat, - hyperSum = unlist(list(batchSize = batchSize, lr=lr, decay=decay, - outcomeWeight=outcomeWeight, - dropout=dropout,filters=filters, - kernelSize=paste0(kernelSize,collapse ='-'), - epochs=epochs, loss=loss, - fold_auc=foldPerm)) - ) - - return(result) - -} diff --git a/R/CovNN2.R b/R/CovNN2.R deleted file mode 100644 index de7bd45..0000000 --- a/R/CovNN2.R +++ /dev/null @@ -1,461 +0,0 @@ -# @file CovNN2.R -# -# Copyright 2020 Observational Health Data Sciences and Informatics -# -# This file is part of PatientLevelPrediction -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#' Create setting for CovNN2 model - convolution across input and time - https://arxiv.org/pdf/1608.00647.pdf -#' -#' @param batchSize The number of samples to used in each batch during model training -#' @param outcomeWeight The weight assined to the outcome (make greater than 1 to reduce unballanced label issue) -#' @param lr The learning rate -#' @param decay The decay of the learning rate -#' @param dropout [currently not used] the dropout rate for regularisation -#' @param epochs The number of times data is used to train the model (e.g., epoches=1 means data only used once to train) -#' @param filters The number of columns output by each convolution -#' @param kernelSize The number of time dimensions used for each convolution -#' @param loss The loss function implemented -#' @param seed The random seed -#' -#' @examples -#' \dontrun{ -#' model.CovNN <- setCovNN() -#' } -#' @export -setCovNN2 <- function(batchSize = 1000, - outcomeWeight=1, - lr=0.00001, - decay=0.000001, - dropout=0, - epochs = 10, - filters = 3, kernelSize = 10, - loss = "binary_crossentropy", - seed=NULL ){ - #[TODO: add input checks...] 
- - ensure_installed("keras") - - if(!is.null(seed)){ - warning('seed currently not implemented in CovNN') - } - - result <- list(model='fitCovNN2', param=split(expand.grid( - batchSize=batchSize, - outcomeWeight=outcomeWeight, - lr=lr, - decay=decay, - dropout=dropout, - epochs= epochs,filters=filters, - kernelSize=kernelSize,loss =loss, - seed=ifelse(is.null(seed),'NULL', seed)), - 1:(length(batchSize)*length(outcomeWeight)*length(epochs)* - length(filters)*length(lr)*length(decay)* - length(kernelSize)*length(loss)*max(1,length(seed)))), - name='CovNN2' - ) - - class(result) <- 'modelSettings' - return(result) -} - - -fitCovNN2 <- function(plpData,population, param, search='grid', quiet=F, - outcomeId, cohortId, ...){ - # check plpData is coo format: - if (!FeatureExtraction::isCovariateData(plpData$covariateData)) - stop("Needs correct covariateData") - if(is.null(plpData$timeRef)){ - stop('Data not temporal...') - } - - metaData <- attr(population, 'metaData') - if(!is.null(population$indexes)) - population <- population[population$indexes>0,] - attr(population, 'metaData') <- metaData - - start<-Sys.time() - - result<- toSparseM(plpData,population,map=NULL, temporal=T) - - data <- result$data#[population$rowId,,] - #data<-as.array(data) -- cant make dense on big data! - - #one-hot encoding - population$y <- keras::to_categorical(population$outcomeCount, 2) - #colnames(population$y) <- c('0','1') - - # do cross validation to find hyperParameter - datas <- list(population=population, plpData=data) - hyperParamSel <- lapply(param, function(x) do.call(trainCovNN2, c(x,datas,train=TRUE) )) - - hyperSummary <- cbind(do.call(rbind, lapply(hyperParamSel, function(x) x$hyperSum))) - hyperSummary <- as.data.frame(hyperSummary) - hyperSummary$auc <- unlist(lapply(hyperParamSel, function (x) x$auc)) - hyperParamSel<-unlist(lapply(hyperParamSel, function(x) x$auc)) - - #now train the final model and return coef - bestInd <- which.max(abs(unlist(hyperParamSel)-0.5))[1] - finalModel<-do.call(trainCovNN2, c(param[[bestInd]],datas, train=FALSE)) - - covariateRef <- as.data.frame(plpData$covariateData$covariateRef) - incs <- rep(1, nrow(covariateRef)) - covariateRef$included <- incs - covariateRef$covariateValue <- rep(0, nrow(covariateRef)) - - #modelTrained <- file.path(outLoc) - param.best <- param[[bestInd]] - - comp <- start-Sys.time() - - # train prediction - prediction <- finalModel$prediction - finalModel$prediction <- NULL - - # return model location - result <- list(model = finalModel$model, - trainCVAuc = -1, # ToDo decide on how to deal with this - hyperParamSearch = hyperSummary, - modelSettings = list(model='fitCovNN2',modelParameters=param.best), - metaData = plpData$metaData, - populationSettings = attr(population, 'metaData'), - outcomeId=outcomeId, - cohortId=cohortId, - varImp = covariateRef, - trainingTime =comp, - covariateMap=result$map, - predictionTrain = prediction - ) - class(result) <- 'plpModel' - attr(result, 'type') <- 'deep' - attr(result, 'predictionType') <- 'binary' - - return(result) -} - -trainCovNN2<-function(plpData, population, - outcomeWeight=1, lr=0.0001, decay=0.9, - dropout=0.5, filters=3, - kernelSize = dim(plpData)[3], - batchSize, epochs, loss= "binary_crossentropy", - seed=NULL, train=TRUE){ - - if(!is.null(population$indexes) && train==T){ - writeLines(paste('Training covolutional neural network (input and time) with ',length(unique(population$indexes)),' fold CV')) - - index_vect <- unique(population$indexes) - perform <- c() - - # create prediction 
matrix to store all predictions - predictionMat <- population - predictionMat$value <- 0 - attr(predictionMat, "metaData") <- list(predictionType = "binary") - - for(index in 1:length(index_vect )){ - writeLines(paste('Fold ',index, ' -- with ', sum(population$indexes!=index),'train rows')) - - model <- keras::keras_model_sequential() %>% - - ##model_input <- keras::layer_input(shape=c(dim(plpData)[2], dim(plpData)[3]), - ## name='initial_input') %>% - keras::layer_permute(dims = c(2,1), - input_shape = c(dim(plpData)[2], dim(plpData)[3])) %>% - - # Begin with 1D convolutional layer across input - hidden layer 1 - ##model_output <- keras::layer_input(shape=c(dim(plpData)[3], dim(plpData)[2]), - ## name='second_input') %>% - keras::layer_conv_1d( - filters = filters, - kernel_size = dim(plpData)[3]-2, - padding = "valid" - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - # second layer across input - hidden layer 2 time x filters - keras::layer_conv_1d( - filters = filters, - kernel_size = 3,#filters, - padding = "valid" - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - - # permute bact to time x var - keras::layer_permute(dims = c(2,1)) %>% - #max pool over time and conv - keras::layer_max_pooling_1d(pool_size = 2) %>% - keras::layer_conv_1d( - filters = filters, - kernel_size = 2, - padding = "valid" - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - keras::layer_flatten() %>% - # final 2 deep layers with dropout and batchnorm/ relu activation - keras::layer_dropout(rate=dropout) %>% - # add fully connected layer 2 - keras::layer_dense( - units = 100, - activation = "linear", use_bias=T - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - - # =========== FULLY CONNECTED LAYER 2 - # add drop out of 0.5 - keras::layer_dropout(rate=dropout) %>% - # add fully connected layer 2 - keras::layer_dense( - units = 100, - activation = "linear", use_bias=T - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - - # =========== FINAL LAYER - keras::layer_dropout(rate=dropout) %>% - keras::layer_dense(name = 'final', - units = 2, - activation = "linear", use_bias=T - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'sigmoid', name='main_output') - - ##model <- keras::keras_model( - ## inputs = c(model_input), - ## outputs = c(model_output) - ##) - - # Prepare model for training - model %>% keras::compile( - loss = "binary_crossentropy", - metrics = c('accuracy'), - optimizer = keras::optimizer_rmsprop(lr = lr,decay = decay) - ) - earlyStopping=keras::callback_early_stopping(monitor = "val_loss", patience=10,mode="auto",min_delta = 1e-4) - reduceLr=keras::callback_reduce_lr_on_plateau(monitor="val_loss", factor =0.1, - patience = 5,mode = "auto", min_delta = 1e-5, cooldown = 0, min_lr = 0) - - class_weight=list("0"=1,"1"=outcomeWeight) - - data <- plpData[population$rowId[population$indexes!=index],,] - - #Extract validation set first - 10k people or 5% - valN <- min(10000,sum(population$indexes!=index)*0.05) - val_rows<-sample(1:sum(population$indexes!=index), valN, replace=FALSE) - train_rows <- c(1:sum(population$indexes!=index))[-val_rows] - - sampling_generator<-function(data, population, batchSize, train_rows, index){ - function(){ - gc() - rows<-sample(train_rows, batchSize, replace=FALSE) - - 
list(as.array(data[rows,,]), - population$y[population$indexes!=index,1:2][rows,]) - } - } - - - #print(table(population$y)) - if(length(train_rows) < batchSize){ - # checking if this fixes issue with batchsize too big - batchSize <- length(train_rows) - ParallelLogger::logInfo('Reducing batchSize to training size') - } - - history <- model %>% keras::fit_generator(sampling_generator(data,population,batchSize,train_rows, index), - steps_per_epoch = length(train_rows)/batchSize, - epochs=epochs, - validation_data=list(as.array(data[val_rows,,]), - population$y[population$indexes!=index,1:2][val_rows,]), - callbacks=list(earlyStopping,reduceLr), - class_weight=class_weight) - - - - - # batch prediciton - maxVal <- sum(population$indexes==index) - batches <- lapply(1:ceiling(maxVal/batchSize), function(x) ((x-1)*batchSize+1):min((x*batchSize),maxVal)) - prediction <- population[population$indexes==index,] - prediction$value <- 0 - for(batch in batches){ - pred <- keras::predict_on_batch(model, as.array(plpData[population$rowId[population$indexes==index],,][batch,,])) - prediction$value[batch] <- as.double(pred[,2]) - } - - attr(prediction, "metaData") <- list(predictionType = "binary") - aucVal <- computeAuc(prediction) - perform <- c(perform,aucVal) - - # add the fold predictions and compute AUC after loop - predictionMat$value[population$indexes==index] <- prediction$value - - } - - auc <- computeAuc(predictionMat) - foldPerm <- perform - - # Output ---------------------------------------------------------------- - param.val <- paste0('outcomeWeight: ', outcomeWeight, - '-- kernelSize: ',paste0(kernelSize,collapse ='-'), - '-- filters: ', filters, '--loss: ', loss, '-- lr: ', lr, '-- decay: ', decay, - '-- dropout: ', dropout, '-- batchSize: ',batchSize, '-- epochs: ', epochs) - writeLines('==========================================') - writeLines(paste0('CovNN with parameters:', param.val,' obtained an AUC of ',auc)) - writeLines('==========================================') - } else { - - model <- keras::keras_model_sequential() %>% - - keras::layer_permute(dims = c(2,1), - input_shape = c(dim(plpData)[2], dim(plpData)[3])) %>% - - # Begin with 1D convolutional layer across input - hidden layer 1 - keras::layer_conv_1d( - filters = filters, - kernel_size = dim(plpData)[3]-2, - padding = "valid" - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - # second layer across input - hidden layer 2 time x filters - keras::layer_conv_1d( - filters = filters, - kernel_size = 3, - padding = "valid" - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - - # permute bact to time x var - keras::layer_permute(dims = c(2,1)) %>% - #max pool over time and conv - keras::layer_max_pooling_1d(pool_size = 2) %>% - keras::layer_conv_1d( - filters = filters, - kernel_size = 2, - padding = "valid" - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - keras::layer_flatten() %>% - # final 2 deep layers with dropout and batchnorm/ relu activation - keras::layer_dropout(rate=dropout) %>% - # add fully connected layer 2 - keras::layer_dense( - units = 100, - activation = "linear", use_bias=T - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - - # =========== FULLY CONNECTED LAYER 2 - # add drop out of 0.5 - keras::layer_dropout(rate=dropout) %>% - # add fully connected layer 2 - keras::layer_dense( - units = 100, - activation 
= "linear", use_bias=T - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - - # =========== FINAL LAYER - keras::layer_dropout(rate=dropout) %>% - keras::layer_dense(name = 'final', - units = 2, - activation = "linear", use_bias=T - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'sigmoid', name='main_output') - - - - # Prepare model for training - model %>% keras::compile( - loss = "binary_crossentropy",#loss, - metrics = c('accuracy'), - optimizer = keras::optimizer_rmsprop(lr = lr,decay = decay) - ) - earlyStopping=keras::callback_early_stopping(monitor = "val_loss", patience=10,mode="auto",min_delta = 1e-4) - reduceLr=keras::callback_reduce_lr_on_plateau(monitor="val_loss", factor =0.1, - patience = 5,mode = "auto", min_delta = 1e-5, cooldown = 0, min_lr = 0) - - - class_weight=list("0"=1,"1"=outcomeWeight) - - data <- plpData[population$rowId,,] - - #Extract validation set first - 10k people or 5% - valN <- min(10000,length(population$indexes)*0.05) - val_rows<-sample(1:length(population$indexes), valN, replace=FALSE) - train_rows <- c(1:length(population$indexes))[-val_rows] - - - sampling_generator2<-function(data, population, batchSize, train_rows){ - function(){ - gc() - rows<-sample(train_rows, batchSize, replace=FALSE) - list(as.array(data[rows,,]), population$y[rows,]) - } - } - - if(length(train_rows) < batchSize){ - # checking if this fixes issue with batchsize too big - batchSize <- length(train_rows) - ParallelLogger::logInfo('Reducing batchSize to training size') - } - - - history <- model %>% keras::fit_generator(sampling_generator2(data,population,batchSize,train_rows), - steps_per_epoch = length(train_rows)/batchSize, - epochs=epochs, - validation_data=list(as.array(data[val_rows,,]), - population$y[val_rows,]), - callbacks=list(earlyStopping,reduceLr), - class_weight=class_weight, - view_metrics=F) - - # batched prediciton - maxVal <- nrow(population) - batches <- lapply(1:ceiling(maxVal/batchSize), function(x) ((x-1)*batchSize+1):min((x*batchSize),maxVal)) - prediction <- population - prediction$value <- 0 - for(batch in batches){ - pred <- keras::predict_on_batch(model, as.array(plpData[batch,,])) - prediction$value[batch] <- as.double(pred[,2]) - } - - attr(prediction, "metaData") <- list(predictionType = "binary") - auc <- computeAuc(prediction) - foldPerm <- auc - predictionMat <- prediction - } - - result <- list(model=model, - auc=auc, - prediction = predictionMat, - hyperSum = unlist(list(batchSize = batchSize, lr=lr, decay=decay, - outcomeWeight=outcomeWeight, - dropout=dropout,filters=filters, - kernelSize=paste0(kernelSize,collapse ='-'), - epochs=epochs, loss=loss, - fold_auc=foldPerm)) - ) - - return(result) - -} diff --git a/R/Dataset.R b/R/Dataset.R new file mode 100644 index 0000000..73464d1 --- /dev/null +++ b/R/Dataset.R @@ -0,0 +1,120 @@ +#' A torch dataset +#' @import data.table +#' @export +Dataset <- torch::dataset( + name = "myDataset", + #' @param data a dataframe like object with the covariates + #' @param labels a dataframe with the labels + #' @param numericalIndex in what column numeric data is in (if any) + initialize = function(data, labels = NULL, numericalIndex = NULL) { + # determine numeric + if (is.null(numericalIndex)) { + numericalIndex <- data %>% + dplyr::group_by(columnId) %>% + dplyr::collect() %>% + dplyr::summarise(n = dplyr::n_distinct(.data$covariateValue)) %>% + dplyr::pull(n) > 1 + self$numericalIndex <- numericalIndex + } else { + 
self$numericalIndex <- NULL + } + + # add labels if training (make 0 vector for prediction) + if (!is.null(labels)) { + self$target <- torch::torch_tensor(labels) + } else { + self$target <- torch::torch_tensor(rep(0, data %>% dplyr::distinct(rowId) + %>% dplyr::collect() %>% nrow())) + } + # Weight to add in loss function to positive class + self$posWeight <- (self$target == 0)$sum() / self$target$sum() + + # add features + catColumns <- which(!numericalIndex) + dataCat <- dplyr::filter(data, columnId %in% catColumns) %>% + dplyr::arrange(columnId) %>% + dplyr::group_by(columnId) %>% + dplyr::collect() %>% + dplyr::mutate(newColumnId = dplyr::cur_group_id()) %>% + dplyr::ungroup() %>% + dplyr::select(c("rowId", "newColumnId")) %>% + dplyr::rename(columnId = newColumnId) + # the fastest way I found so far to convert data using data.table + # 1.5 min for 100k rows :( + dt <- data.table::data.table(rows = dataCat$rowId, cols = dataCat$columnId) + maxFeatures <- max(dt[, .N, by = rows][, N]) + start <- Sys.time() + tensorList <- lapply(1:max(data %>% dplyr::pull(rowId)), function(x) { + torch::torch_tensor(dt[rows == x, cols]) + }) + self$lengths <- lengths + self$cat <- torch::nn_utils_rnn_pad_sequence(tensorList, batch_first = T) + delta <- Sys.time() - start + ParallelLogger::logInfo("Data conversion for dataset took ", signif(delta, 3), " ", attr(delta, "units")) + if (sum(numericalIndex) == 0) { + self$num <- NULL + } else { + numericalData <- data %>% + dplyr::filter(columnId %in% !!which(numericalIndex)) %>% + dplyr::collect() + numericalData <- numericalData %>% + dplyr::group_by(columnId) %>% + dplyr::mutate(newId = dplyr::cur_group_id()) + indices <- torch::torch_tensor(cbind(numericalData$rowId, numericalData$newId), dtype = torch::torch_long())$t_() + values <- torch::torch_tensor(numericalData$covariateValue, dtype = torch::torch_float32()) + self$num <- torch::torch_sparse_coo_tensor( + indices = indices, + values = values, + size = c(self$target$shape, sum(numericalIndex)) + )$to_dense() + } + }, + getNumericalIndex = function() { + return( + self$numericalIndex + ) + }, + numCatFeatures = function() { + return( + sum(!self$numericalIndex) + ) + }, + numNumFeatures = function() { + if (!is.null(self$num)) { + return(self$num$shape[2]) + } else { + return(0) + } + }, + .getbatch = function(item) { + if (length(item) == 1) { + return(self$.getBatchSingle(item)) + } else { + return(self$.getBatchRegular(item)) + } + }, + .getBatchSingle = function(item) { + # add leading singleton dimension since models expects 2d tensors + batch <- list( + cat = self$cat[item]$unsqueeze(1), + num = self$num[item]$unsqueeze(1) + ) + return(list( + batch = batch, + target = self$target[item]$unsqueeze(1) + )) + }, + .getBatchRegular = function(item) { + batch <- list( + cat = self$cat[item], + num = self$num[item] + ) + return(list( + batch = batch, + target = self$target[item] + )) + }, + .length = function() { + self$target$size()[[1]] # shape[1] + } +) diff --git a/R/DeepNN.R b/R/DeepNN.R deleted file mode 100644 index 14c57b0..0000000 --- a/R/DeepNN.R +++ /dev/null @@ -1,557 +0,0 @@ -# @file DeepNN.R -# -# Copyright 2020 Observational Health Data Sciences and Informatics -# -# This file is part of PatientLevelPrediction -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#' Create setting for DeepNN model -#' -#' @param units The number of units of the deep network - as a list of vectors -#' @param layer_dropout The layer dropout rate (regularisation) -#' @param lr Learning rate -#' @param decay Learning rate decay over each update. -#' @param outcome_weight The weight of the outcome class in the loss function -#' @param batch_size The number of data points to use per training batch -#' @param epochs Number of times to iterate over dataset -#' @param seed Random seed used by deep learning model -#' -#' @examples -#' \dontrun{ -#' model <- setDeepNN() -#' } -#' @export -setDeepNN <- function(units=list(c(128, 64), 128), layer_dropout=c(0.2), - lr =c(1e-4), decay=c(1e-5), outcome_weight = c(1.0), batch_size = c(100), - epochs= c(100), seed=NULL ){ - - ensure_installed("keras") - - # if(class(indexFolder)!='character') - # stop('IndexFolder must be a character') - # if(length(indexFolder)>1) - # stop('IndexFolder must be one') - # - # if(class(units)!='numeric') - # stop('units must be a numeric value >0 ') - # if(units<1) - # stop('units must be a numeric value >0 ') - # - # #if(length(units)>1) - # # stop('units can only be a single value') - # - # if(class(recurrent_dropout)!='numeric') - # stop('dropout must be a numeric value >=0 and <1') - # if( (recurrent_dropout<0) | (recurrent_dropout>=1)) - # stop('dropout must be a numeric value >=0 and <1') - # if(class(layer_dropout)!='numeric') - # stop('layer_dropout must be a numeric value >=0 and <1') - # if( (layer_dropout<0) | (layer_dropout>=1)) - # stop('layer_dropout must be a numeric value >=0 and <1') - # if(class(lr)!='numeric') - # stop('lr must be a numeric value >0') - # if(lr<=0) - # stop('lr must be a numeric value >0') - # if(class(decay)!='numeric') - # stop('decay must be a numeric value >=0') - # if(decay<=0) - # stop('decay must be a numeric value >=0') - # if(class(outcome_weight)!='numeric') - # stop('outcome_weight must be a numeric value >=0') - # if(outcome_weight<=0) - # stop('outcome_weight must be a numeric value >=0') - # if(class(batch_size)!='numeric') - # stop('batch_size must be an integer') - # if(batch_size%%1!=0) - # stop('batch_size must be an integer') - # if(class(epochs)!='numeric') - # stop('epochs must be an integer') - # if(epochs%%1!=0) - # stop('epochs must be an integer') - # if(!class(seed)%in%c('numeric','NULL')) - # stop('Invalid seed') - #if(class(UsetidyCovariateData)!='logical') - # stop('UsetidyCovariateData must be an TRUE or FALSE') - - param <- expand.grid(units=units, - layer_dropout=layer_dropout, - lr =lr, decay=decay, outcome_weight=outcome_weight,epochs= epochs, - seed=ifelse(is.null(seed),'NULL', seed)) - param$units1=unlist(lapply(param$units, function(x) x[1])) - param$units2=unlist(lapply(param$units, function(x) x[2])) - param$units3=unlist(lapply(param$units, function(x) x[3])) - - result <- list(model='fitDeepNN', param=split(param, - 1:(length(units)*length(layer_dropout)*length(lr)*length(decay)*length(outcome_weight)*length(epochs)*max(1,length(seed)))), - name='DeepNN' - ) - - class(result) <- 'modelSettings' - return(result) -} 
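The new torch Dataset added in R/Dataset.R above converts a long-format covariate table (rowId, columnId, covariateValue) into a padded tensor of categorical feature indices plus a dense tensor of numeric features, inferring which columns are numeric from the number of distinct covariate values. The sketch below is illustrative only and is not part of this changeset: the toy covariates and labels are invented, and the assumption that labels is a plain numeric outcome vector (suggested by torch::torch_tensor(labels) in the code, despite the roxygen saying "a dataframe") is called out explicitly.

library(DeepPatientLevelPrediction)

# Toy long-format covariate table in the shape Dataset() expects:
# columns 1 and 2 only ever take the value 1, so the automatic detection
# treats them as categorical; column 3 has two distinct values and is
# treated as numeric.
covariates <- data.frame(
  rowId          = c(1, 1, 2, 3, 1, 3),
  columnId       = c(1, 2, 1, 1, 3, 3),
  covariateValue = c(1, 1, 1, 1, 0.5, 2.0)
)
labels <- c(0, 1, 0)  # assumed: one binary outcome per rowId

ds <- Dataset(data = covariates, labels = labels)

ds$.length()                # 3 rows/persons
ds$numCatFeatures()         # 2 categorical covariates
ds$numNumFeatures()         # 1 numeric covariate
batch <- ds$.getbatch(1:2)  # list(batch = list(cat, num), target)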
- - -fitDeepNN <- function(plpData,population, param, search='grid', quiet=F, - outcomeId, cohortId, ...){ - # check plpData is coo format: - if (!FeatureExtraction::isCovariateData(plpData$covariateData)){ - stop('DeepNN requires correct covariateData') - } - if(!is.null(plpData$timeRef)){ - warning('Data temporal but deepNN uses non-temporal data...') - } - - metaData <- attr(population, 'metaData') - if(!is.null(population$indexes)) - population <- population[population$indexes>0,] - attr(population, 'metaData') <- metaData - - start<-Sys.time() - - result<- toSparseM(plpData,population,map=NULL, temporal=F) - data <- result$data - - #one-hot encoding - population$y <- keras::to_categorical(population$outcomeCount, length(unique(population$outcomeCount))) - - # do cross validation to find hyperParameter - datas <- list(population=population, plpData=data) - hyperParamSel <- lapply(param, function(x) do.call(trainDeepNN, c(x,datas,train=TRUE) )) - - hyperSummary <- cbind(do.call(rbind, lapply(hyperParamSel, function(x) x$hyperSum))) - hyperSummary <- as.data.frame(hyperSummary) - hyperSummary$auc <- unlist(lapply(hyperParamSel, function (x) x$auc)) - hyperParamSel<-unlist(lapply(hyperParamSel, function(x) x$auc)) - - #now train the final model and return coef - bestInd <- which.max(abs(unlist(hyperParamSel)-0.5))[1] - finalModel<-do.call(trainDeepNN, c(param[[bestInd]],datas, train=FALSE)) - - covariateRef <- as.data.frame(plpData$covariateData$covariateRef) - incs <- rep(1, nrow(covariateRef)) - covariateRef$included <- incs - covariateRef$covariateValue <- rep(0, nrow(covariateRef)) - - #modelTrained <- file.path(outLoc) - param.best <- param[[bestInd]] - - comp <- start-Sys.time() - - # train prediction - prediction <- finalModel$prediction - finalModel$prediction <- NULL - - # return model location - result <- list(model = finalModel$model, - trainCVAuc = -1, # ToDo decide on how to deal with this - hyperParamSearch = hyperSummary, - modelSettings = list(model='fitDeepNN',modelParameters=param.best), - metaData = plpData$metaData, - populationSettings = attr(population, 'metaData'), - outcomeId=outcomeId, - cohortId=cohortId, - varImp = covariateRef, - trainingTime =comp, - covariateMap=result$map, - predictionTrain = prediction - ) - class(result) <- 'plpModel' - attr(result, 'type') <- 'deep' - attr(result, 'predictionType') <- 'binary' - - return(result) -} - -trainDeepNN<-function(plpData, population, - units1=128, units2= NULL, units3=NULL, - layer_dropout=0.2, - lr =1e-4, decay=1e-5, outcome_weight = 1.0, batch_size = 100, - epochs= 100, seed=NULL, train=TRUE, ...){ - - ParallelLogger::logInfo(paste('Training deep neural network with ',length(unique(population$indexes)),' fold CV')) - if(!is.null(population$indexes) && train==T){ - index_vect <- unique(population$indexes) - perform <- c() - - # create prediction matrix to store all predictions - predictionMat <- population - predictionMat$value <- 0 - attr(predictionMat, "metaData") <- list(predictionType = "binary") - - for(index in 1:length(index_vect )){ - ParallelLogger::logInfo(paste('Fold ',index, ' -- with ', sum(population$indexes!=index),'train rows')) - - model <- keras::keras_model_sequential() - - if(is.na(units2)){ - model %>% - keras::layer_dense(units=units1, #activation='identify', - input_shape=ncol(plpData)) %>% - keras::layer_dropout(layer_dropout) %>% - keras::layer_dense(units=2, activation='sigmoid', use_bias = T) - } else if(is.na(units3)){ - model %>% - keras::layer_dense(units=units1, 
#activation='identify', - input_shape=ncol(plpData)) %>% - keras::layer_dropout(layer_dropout) %>% - keras::layer_dense(units=units2 #,activation='identify' - ) %>% - keras::layer_dropout(layer_dropout) %>% - keras::layer_dense(units=2, activation='sigmoid', use_bias = T) - } else{ - model %>% - keras::layer_dense(units=units1, #activation='identify', - input_shape=ncol(plpData)) %>% - keras::layer_dropout(layer_dropout) %>% - keras::layer_dense(units=units2 #,activation='identify' - ) %>% - keras::layer_dropout(layer_dropout) %>% - keras::layer_dense(units=units3 #,activation='identify' - ) %>% - keras::layer_dropout(layer_dropout) %>% - keras::layer_dense(units=2, activation='sigmoid', use_bias = T) - } - - - # Prepare model for training - model %>% keras::compile( - loss = "binary_crossentropy", - metrics = c('accuracy'), - optimizer = keras::optimizer_rmsprop(lr = lr,decay = decay) - ) - earlyStopping=keras::callback_early_stopping(monitor = "val_loss", patience=10,mode="auto",min_delta = 1e-4) - reduceLr=keras::callback_reduce_lr_on_plateau(monitor="val_loss", factor =0.1, - patience = 5,mode = "auto", min_delta = 1e-5, cooldown = 0, min_lr = 0) - - class_weight=list("0"=1,"1"=outcome_weight) - - data <- plpData[population$rowId[population$indexes!=index],] - - #Extract validation set first - 10k people or 5% - valN <- min(10000,sum(population$indexes!=index)*0.05) - val_rows<-sample(1:sum(population$indexes!=index), valN, replace=FALSE) - train_rows <- c(1:sum(population$indexes!=index))[-val_rows] - - sampling_generator<-function(data, population, batch_size, train_rows, index){ - function(){ - gc() - rows<-sample(train_rows, batch_size, replace=FALSE) - - list(as.array(data[rows,]), - population$y[population$indexes!=index,1:2][rows,]) - } - } - - - #print(table(population$y)) - - history <- model %>% keras::fit_generator(sampling_generator(data,population,batch_size,train_rows, index), - steps_per_epoch = floor(sum(population$indexes!=index)/batch_size), - epochs=epochs, - validation_data=list(as.array(data[val_rows,]), - population$y[population$indexes!=index,1:2][val_rows,]), - callbacks=list(earlyStopping,reduceLr), - class_weight=class_weight) - - - # batch prediciton - maxVal <- sum(population$indexes==index) - batches <- lapply(1:ceiling(maxVal/batch_size), function(x) ((x-1)*batch_size+1):min((x*batch_size),maxVal)) - prediction <- population[population$indexes==index,] - prediction$value <- 0 - for(batch in batches){ - pred <- keras::predict_proba(model, as.array(plpData[population$rowId[population$indexes==index],][batch,,drop=FALSE])) - prediction$value[batch] <- pred[,2] - } - - attr(prediction, "metaData") <- list(predictionType = "binary") - aucVal <- computeAuc(prediction) - perform <- c(perform,aucVal) - - # add the fold predictions and compute AUC after loop - predictionMat$value[population$indexes==index] <- prediction$value - - } - - auc <- computeAuc(predictionMat) - foldPerm <- perform - - # Output ---------------------------------------------------------------- - param.val <- paste0('units1: ',units1,'units2: ',units2,'units3: ',units3, - 'layer_dropout: ',layer_dropout,'-- lr: ', lr, - '-- decay: ', decay, '-- batch_size: ',batch_size, '-- epochs: ', epochs) - ParallelLogger::logInfo('==========================================') - ParallelLogger::logInfo(paste0('DeepNN with parameters:', param.val,' obtained an AUC of ',auc)) - ParallelLogger::logInfo('==========================================') - - } else { - - model <- keras::keras_model_sequential() - 
if(is.na(units2)){ - model %>% - keras::layer_dense(units=units1, #activation='identify', - input_shape=ncol(plpData)) %>% - keras::layer_dropout(layer_dropout) %>% - keras::layer_dense(units=2, activation='sigmoid', use_bias = T) - } else if(is.na(units3)){ - model %>% - keras::layer_dense(units=units1, #activation='identify', - input_shape=ncol(plpData)) %>% - keras::layer_dropout(layer_dropout) %>% - keras::layer_dense(units=units2 #,activation='identify' - ) %>% - keras::layer_dropout(layer_dropout) %>% - keras::layer_dense(units=2, activation='sigmoid', use_bias = T) - } else{ - model %>% - keras::layer_dense(units=units1, #activation='identify', - input_shape=ncol(plpData)) %>% - keras::layer_dropout(layer_dropout) %>% - keras::layer_dense(units=units2 #,activation='identify' - ) %>% - keras::layer_dropout(layer_dropout) %>% - keras::layer_dense(units=units3 #,activation='identify' - ) %>% - keras::layer_dropout(layer_dropout) %>% - keras::layer_dense(units=2, activation='sigmoid', use_bias = T) - } - - # Prepare model for training - model %>% keras::compile( - loss = "binary_crossentropy", - metrics = c('accuracy'), - optimizer = keras::optimizer_rmsprop(lr = lr,decay = decay) - ) - earlyStopping=keras::callback_early_stopping(monitor = "val_loss", patience=10,mode="auto",min_delta = 1e-4) - reduceLr=keras::callback_reduce_lr_on_plateau(monitor="val_loss", factor =0.1, - patience = 5,mode = "auto", min_delta = 1e-5, cooldown = 0, min_lr = 0) - - class_weight=list("0"=1,"1"=outcome_weight) - - #Extract validation set first - 10k people or 5% - valN <- min(10000,nrow(population)*0.05) - val_rows<-sample(1:nrow(population), valN, replace=FALSE) - train_rows <- c(1:nrow(population))[-val_rows] - - sampling_generator2<-function(data, population, batch_size, train_rows){ - function(){ - gc() - rows<-sample(train_rows, batch_size, replace=FALSE) - - list(as.array(data[rows,,]), - population$y[,1:2][rows,]) - } - } - - history <- model %>% keras::fit_generator(sampling_generator2(plpData,population,batch_size,train_rows), - steps_per_epoch = nrow(population)/batch_size, - epochs=epochs, - validation_data=list(as.array(plpData[val_rows,,]), - population$y[val_rows,1:2]), - callbacks=list(earlyStopping,reduceLr), - class_weight=class_weight) - - - # batch prediciton - maxVal <- nrow(population) - batches <- lapply(1:ceiling(maxVal/batch_size), function(x) ((x-1)*batch_size+1):min((x*batch_size),maxVal)) - prediction <- population - prediction$value <- 0 - for(batch in batches){ - pred <- keras::predict_proba(model, as.array(plpData[batch,,drop=FALSE])) - prediction$value[batch] <- pred[,2] - } - - attr(prediction, "metaData") <- list(predictionType = "binary") - auc <- computeAuc(prediction) - foldPerm <- auc - predictionMat <- prediction - } - - result <- list(model=model, - auc=auc, - prediction = predictionMat, - hyperSum = unlist(list(units1=units1,units2=units2,units3=units3, - layer_dropout=layer_dropout,lr =lr, decay=decay, - batch_size = batch_size, epochs= epochs)) - ) - return(result) - -} - - - -#' [Under development] Transfer learning -#' -#' @param plpResult The plp result when training a kersa deep learning model on big data -#' @param plpData The new data to fine tune the model on -#' @param population The population for the new data -#' @param fixLayers boolean specificying whether to fix weights in model being transferred -#' @param includeTop If TRUE the final layer of the model being transferred is removed -#' @param addLayers vector specifying nodes in each layer to add 
e.g. c(100,10) will add another layer with 100 nodels and then a final layer with 10 -#' @param layerDropout Add dropout to each new layer (binary vector length of addLayers) -#' @param layerActivation Activation function for each new layer (string vector length of addLayers) -#' @param outcomeWeight The weight to assign the class 1 when training the model -#' @param batchSize Size of each batch for updating layers -#' @param epochs Number of epoches to run -#' @examples -#' \dontrun{ -#' modelSet <- setDeepNN() -#' plpResult <- runPlp(plpData, population, modelSettings = modelSet, ...) -#' -#' transferLearning(...) -#' } -#' @export -transferLearning <- function(plpResult, - plpData, - population, - fixLayers = T, - includeTop= F, - addLayers = c(100,10), - layerDropout = c(T,T), - layerActivation = c('relu','softmax'), - outcomeWeight = 1, - batchSize = 10000, - epochs=20){ - - # checks - if(!is.null(addLayers)){ - if(length(addLayers)!=length(layerDropout)){ - stop('Layer vector not same length as layer dropout vector') - } - if(length(addLayers)!=length(layerActivation)){ - stop('Layer vector not same length as layer activation vector') - } - } - if(batchSize > nrow(population)){ - warning('batchSize is too big for your data...') - batchSize = nrow(population)/10 - } - - if(!includeTop){ - # remove last layer if not dropout - if(length(grep('Dropout',as.character(plpResult$model$model$layers[[length(plpResult$model$model$layers)]])))==0){ - keras::pop_layer(plpResult$model$model) - } - } - - # create the base pre-trained - base_model <- plpResult$model$model - - # add our custom layers - predictions <- base_model$output - - - # fix the older layers - if(fixLayers){ - for (i in 1:length(base_model$layers)){ - try({base_model$layers[[i]]$trainable <- F}, silent = TRUE) - } - } - - ## add loop over settings here - move code to new function and call it - ##==== - # !!check this looping logic works - if(!is.null(addLayers)){ - for(i in 1:length(addLayers)){ - predictions <- keras::layer_dense(predictions,units = addLayers[i], activation = layerActivation[i]) - if(layerDropout[i]){ - predictions <- keras::layer_dropout(predictions, rate = 0.5) - } - } - } - - # add find layer for binary outcome - predictions <- keras::layer_dense(predictions,units = 2, activation = 'sigmoid') - - - # this is the model we will train - model <- keras::keras_model(inputs = base_model$input, outputs = predictions) - - # compile the model (should be done *after* setting layers to non-trainable) - model %>% keras::compile(optimizer = 'rmsprop', loss = 'binary_crossentropy', - metrics = c('accuracy')) - - - # make this input... 
- earlyStopping=keras::callback_early_stopping(monitor = "val_loss", patience=10,mode="auto",min_delta = 1e-4) - reduceLr=keras::callback_reduce_lr_on_plateau(monitor="val_loss", factor =0.1, - patience = 5,mode = "auto", min_delta = 1e-5, cooldown = 0, - min_lr = 0) - - class_weight=list("0"=1,"1"=outcomeWeight) - - - sampling_generator<-function(data, population, batchSize, train_rows){ - function(){ - gc() - rows<-sample(train_rows, batchSize, replace=FALSE) - - list(as.array(data[rows,]), - population$y[rows,1:2]) - } - } - - # convert plpdata to matrix: - metaData <- attr(population, 'metaData') - if(!is.null(population$indexes)) - population <- population[population$indexes>0,] - attr(population, 'metaData') <- metaData - - result<- toSparseM(plpData,population,map=plpResult$model$covariateMap, temporal=F) - data <- result$data - population$y <- keras::to_categorical(population$outcomeCount, length(unique(population$outcomeCount))) - - #Extract validation set first - 10k people or 5% - valN <- min(10000,nrow(population)*0.05) - val_rows<-sample(1:nrow(population), valN, replace=FALSE) - train_rows <- c(1:nrow(population))[-val_rows] - - history <- model %>% keras::fit_generator(sampling_generator(data,population,batchSize,train_rows), - steps_per_epoch = nrow(population)/batchSize, - epochs=epochs, - validation_data=list(as.array(data[val_rows,,]), - population$y[val_rows,1:2]), - callbacks=list(earlyStopping,reduceLr), - class_weight=class_weight) - - - # batch prediciton - maxVal <- nrow(population) - batches <- lapply(1:ceiling(maxVal/batchSize), function(x) ((x-1)*batchSize+1):min((x*batchSize),maxVal)) - prediction <- population - prediction$value <- 0 - for(batch in batches){ - pred <- model$predict(as.array(data[batch,,drop=FALSE])) # added drop=FALSE - prediction$value[batch] <- pred[,2] - } - - ##=== - - attr(prediction, "metaData") <- list(predictionType = "binary") - auc <- computeAuc(prediction) - foldPerm <- auc - predictionMat <- prediction - -result <- list(model=model, - auc=auc, - prediction = predictionMat, - hyperSum = list(fixLayers = fixLayers, - addLayers = addLayers, - layerDropout = layerDropout, - layerActivation = layerActivation)) - - return(result) - -} diff --git a/R/DeepPatientLevelPrediction.R b/R/DeepPatientLevelPrediction.R new file mode 100644 index 0000000..f6f5a19 --- /dev/null +++ b/R/DeepPatientLevelPrediction.R @@ -0,0 +1,28 @@ +# @file DeepPatientLevelPrediction.R +# +# Copyright 2022 Observational Health Data Sciences and Informatics +# +# This file is part of DeepPatientLevelPrediction +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +#' DeepPatientLevelPrediction +#' +#' @description A package containing deep learning extensions for developing prediction models using data in the OMOP CDM +#' +#' @docType package +#' @name DeepPatientLevelPrediction +#' @importFrom dplyr %>% +#' @importFrom data.table := +#' @importFrom rlang .data +NULL diff --git a/R/Estimator.R b/R/Estimator.R new file mode 100644 index 0000000..8abb048 --- /dev/null +++ b/R/Estimator.R @@ -0,0 +1,736 @@ +# @file Estimator.R +# +# Copyright 2022 Observational Health Data Sciences and Informatics +# +# This file is part of DeepPatientLevelPrediction +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#' setEstimator +#' +#' @description +#' Creates settings for the Estimator, which takes a model and trains it +#' +#' @name setEstimator +#' @param learningRate Which learning rate to use +#' @param weightDecay Which weight decay to use +#' @param optimizer Which optimizer to use +#' @param scheduler Which learning rate scheduler to use +#' @param criterion Loss function to use +#' @param posWeight Whether to add more weight to positive labels during training - this will result in miscalibrated models +#' @param earlyStopping Whether early stopping should be used, which stops training if your metric is not improving +#' @param earlyStoppingMetric Which metric to use for early stopping +#' @param patience Patience for the early stopper +#' @param hyperparameterMetric Which metric to use for hyperparameter selection: loss, auc, auprc or a custom function +NULL + +#' fitEstimator +#' +#' @description +#' Fits a deep learning estimator to data. +#' +#' @param trainData The data to use +#' @param modelSettings modelSettings object +#' @param analysisId Id of the analysis +#' @param ... Extra inputs +#' +#' @export +fitEstimator <- function(trainData, + modelSettings, + analysisId, + ...)
{ + start <- Sys.time() + + # check covariate data + if (!FeatureExtraction::isCovariateData(trainData$covariateData)) { + stop("Needs correct covariateData") + } + + param <- modelSettings$param + + # get the settings from the param + settings <- attr(param, "settings") + if (!is.null(trainData$folds)) { + trainData$labels <- merge(trainData$labels, trainData$fold, by = "rowId") + } + mappedCovariateData <- PatientLevelPrediction::MapIds( + covariateData = trainData$covariateData, + cohort = trainData$labels + ) + + covariateRef <- mappedCovariateData$covariateRef + + outLoc <- PatientLevelPrediction::createTempModelLoc() + cvResult <- do.call( + what = gridCvDeep, + args = list( + mappedData = mappedCovariateData, + labels = trainData$labels, + settings = settings, + modelLocation = outLoc, + paramSearch = param + ) + ) + + hyperSummary <- do.call(rbind, lapply(cvResult$paramGridSearch, function(x) x$hyperSummary)) + prediction <- cvResult$prediction + incs <- rep(1, covariateRef %>% dplyr::tally() %>% dplyr::pull()) + covariateRef <- covariateRef %>% + dplyr::collect() %>% + dplyr::mutate( + included = incs, + covariateValue = 0, + isNumeric = cvResult$numericalIndex + ) + + + comp <- start - Sys.time() + result <- list( + model = cvResult$estimator, # file.path(outLoc), + + preprocessing = list( + featureEngineering = attr(trainData$covariateData, "metaData")$featureEngineering, + tidyCovariates = attr(trainData$covariateData, "metaData")$tidyCovariateDataSettings, + requireDenseMatrix = settings$requiresDenseMatrix + ), + prediction = prediction, + modelDesign = PatientLevelPrediction::createModelDesign( + targetId = attr(trainData, "metaData")$targetId, + outcomeId = attr(trainData, "metaData")$outcomeId, + restrictPlpDataSettings = attr(trainData, "metaData")$restrictPlpDataSettings, + covariateSettings = attr(trainData, "metaData")$covariateSettings, + populationSettings = attr(trainData, "metaData")$populationSettings, + featureEngineeringSettings = attr(trainData$covariateData, "metaData")$featureEngineeringSettings, + preprocessSettings = attr(trainData$covariateData, "metaData")$preprocessSettings, + modelSettings = modelSettings, + splitSettings = attr(trainData, "metaData")$splitSettings, + sampleSettings = attr(trainData, "metaData")$sampleSettings + ), + trainDetails = list( + analysisId = analysisId, + analysisSource = "", + developementDatabase = attr(trainData, "metaData")$cdmDatabaseSchema, + attrition = attr(trainData, "metaData")$attrition, + trainingTime = paste(as.character(abs(comp)), attr(comp, "units")), + trainingDate = Sys.Date(), + modelName = settings$name, + finalModelParameters = cvResult$finalParam, + hyperParamSearch = hyperSummary + ), + covariateImportance = covariateRef + ) + + class(result) <- "plpModel" + attr(result, "predictionFunction") <- "predictDeepEstimator" + attr(result, "modelType") <- "binary" + attr(result, "saveType") <- attr(param, "settings")$saveType + + return(result) +} + +#' predictDeepEstimator +#' +#' @description +#' the prediction function for the estimator +#' +#' @param plpModel the plpModel +#' @param data plp data object or a torch dataset +#' @param cohort data.frame with the rowIds of the people +#' +#' @export +predictDeepEstimator <- function(plpModel, + data, + cohort) { + if (!"plpModel" %in% class(plpModel)) { + plpModel <- list(model = plpModel) + attr(plpModel, "modelType") <- "binary" + } + if ("plpData" %in% class(data)) { + mappedData <- PatientLevelPrediction::MapIds(data$covariateData, + cohort = cohort, + 
mapping = plpModel$covariateImportance %>% + dplyr::select( + .data$columnId, + .data$covariateId + ) + ) + data <- Dataset(mappedData$covariates, + numericalIndex = plpModel$covariateImportance$isNumeric + ) + } + + # get predictions + prediction <- cohort + if (is.character(plpModel$model)) { + model <- torch::torch_load(file.path(plpModel$model, "DeepEstimatorModel.pt"), device = "cpu") + estimator <- Estimator$new( + baseModel = attr(plpModel$modelDesign$modelSettings$param, "settings")$baseModel, + modelParameters = model$modelParameters, + fitParameters = model$fitParameters, + device = attr(plpModel$modelDesign$modelSettings$param, "settings")$device + ) + estimator$model$load_state_dict(model$modelStateDict) + prediction$value <- estimator$predictProba(data) + } else { + prediction$value <- plpModel$model$predictProba(data) + } + + + attr(prediction, "metaData")$modelType <- attr(plpModel, "modelType") + + return(prediction) +} + +#' gridCvDeep +#' +#' @description +#' Performs grid search for a deep learning estimator +#' +#' +#' @param mappedData Mapped data with covariates +#' @param labels Dataframe with the outcomes +#' @param settings Settings of the model +#' @param modelLocation Where to save the model +#' @param paramSearch model parameters to perform search over +#' +#' @export +gridCvDeep <- function(mappedData, + labels, + settings, + modelLocation, + paramSearch) { + modelName <- settings$modelName + modelParamNames <- settings$modelParamNames + fitParamNames <- c("weightDecay", "learningRate") + epochs <- settings$epochs + batchSize <- settings$batchSize + baseModel <- settings$baseModel + device <- settings$device + + ParallelLogger::logInfo(paste0("Running CV for ", modelName, " model")) + + ########################################################################### + + gridSearchPredictons <- list() + length(gridSearchPredictons) <- length(paramSearch) + dataset <- Dataset(mappedData$covariates, labels$outcomeCount) + for (gridId in 1:length(paramSearch)) { + ParallelLogger::logInfo(paste0("Running hyperparameter combination no ", gridId)) + ParallelLogger::logInfo(paste0("HyperParameters: ")) + ParallelLogger::logInfo(paste(names(paramSearch[[gridId]]), paramSearch[[gridId]], collapse = " | ")) + modelParams <- paramSearch[[gridId]][modelParamNames] + + fitParams <- paramSearch[[gridId]][fitParamNames] + fitParams$epochs <- epochs + fitParams$batchSize <- batchSize + + + # initiate prediction + prediction <- c() + + fold <- labels$index + ParallelLogger::logInfo(paste0("Max fold: ", max(fold))) + modelParams$catFeatures <- dataset$numCatFeatures() + modelParams$numFeatures <- dataset$numNumFeatures() + learnRates <- list() + for (i in 1:max(fold)) { + ParallelLogger::logInfo(paste0("Fold ", i)) + trainDataset <- torch::dataset_subset(dataset, indices = which(fold != i)) + testDataset <- torch::dataset_subset(dataset, indices = which(fold == i)) + # fitParams$posWeight <- trainDataset$dataset$posWeight + estimator <- Estimator$new( + baseModel = baseModel, + modelParameters = modelParams, + fitParameters = fitParams, + device = device + ) + + estimator$fit( + trainDataset, + testDataset + ) + + ParallelLogger::logInfo("Calculating predictions on left out fold set...") + + prediction <- rbind( + prediction, + predictDeepEstimator( + plpModel = estimator, + data = testDataset, + cohort = labels[fold == i, ] + ) + ) + learnRates[[i]] <- list( + LRs = estimator$learnRateSchedule, + bestEpoch = estimator$bestEpoch + ) + } + maxIndex <- 
which.max(unlist(sapply(learnRates, `[`, 2))) + paramSearch[[gridId]]$learnSchedule <- learnRates[[maxIndex]] + + gridSearchPredictons[[gridId]] <- list( + prediction = prediction, + param = paramSearch[[gridId]] + ) + } + # get best para (this could be modified to enable any metric instead of AUC, just need metric input in function) + paramGridSearch <- lapply(gridSearchPredictons, function(x) { + do.call(PatientLevelPrediction::computeGridPerformance, x) + }) # cvAUCmean, cvAUC, param + + optimalParamInd <- which.max(unlist(lapply(paramGridSearch, function(x) x$cvPerformance))) + finalParam <- paramGridSearch[[optimalParamInd]]$param + + cvPrediction <- gridSearchPredictons[[optimalParamInd]]$prediction + cvPrediction$evaluationType <- "CV" + + ParallelLogger::logInfo("Training final model using optimal parameters") + + # get the params + modelParams <- finalParam[modelParamNames] + fitParams <- finalParam[fitParamNames] + fitParams$epochs <- finalParam$learnSchedule$bestEpoch + fitParams$batchSize <- batchSize + fitParams$posWeight <- dataset$posWeight + # create the dir + if (!dir.exists(file.path(modelLocation))) { + dir.create(file.path(modelLocation), recursive = T) + } + modelParams$catFeatures <- dataset$numCatFeatures() + modelParams$numFeatures <- dataset$numNumFeatures() + + estimator <- Estimator$new( + baseModel = baseModel, + modelParameters = modelParams, + fitParameters = fitParams, + device = device + ) + numericalIndex <- dataset$getNumericalIndex() + + estimator$fitWholeTrainingSet(dataset, finalParam$learnSchedule$LRs) + + ParallelLogger::logInfo("Calculating predictions on all train data...") + prediction <- predictDeepEstimator( + plpModel = estimator, + data = dataset, + cohort = labels + ) + prediction$evaluationType <- "Train" + + prediction <- rbind( + prediction, + cvPrediction + ) + # modify prediction + prediction <- prediction %>% + dplyr::select(-.data$index) + + prediction$cohortStartDate <- as.Date(prediction$cohortStartDate, origin = "1970-01-01") + + + # save torch code here + estimator$save(modelLocation, "DeepEstimatorModel.pt") + + return( + list( + estimator = modelLocation, + prediction = prediction, + finalParam = finalParam, + paramGridSearch = paramGridSearch, + numericalIndex = numericalIndex + ) + ) +} + +#' Estimator +#' @description +#' A generic R6 class that wraps around a torch nn module and can be used to +#' fit and predict the model defined in that module. 
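+#'
+#' @examples
+#' \dontrun{
+#' # minimal usage sketch (illustrative only: the parameter values and the
+#' # trainDataset/testDataset torch datasets are assumptions, not package defaults)
+#' estimator <- Estimator$new(
+#'   baseModel = MLP,
+#'   modelParameters = list(
+#'     catFeatures = 10L, numFeatures = 0L, sizeEmbedding = 32L,
+#'     sizeHidden = 64L, numLayers = 2L, dropout = 0.1
+#'   ),
+#'   fitParameters = list(epochs = 5, learningRate = 3e-4, batchSize = 512),
+#'   device = "cpu"
+#' )
+#' estimator$fit(trainDataset, testDataset)
+#' predictedRisk <- estimator$predictProba(testDataset)
+#' }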
+#' @export +Estimator <- R6::R6Class( + classname = "Estimator", + lock_objects = FALSE, + public = list( + #' @description + #' Creates a new estimator + #' @param baseModel The torch nn module to use as model + #' @param modelParameters Parameters to initialize the baseModel + #' @param fitParameters Parameters required for the estimator fitting + #' @param optimizer A torch optimizer to use, default is Adam + #' @param criterion The torch loss function to use, defaults to + #' binary cross entropy with logits + #' @param scheduler learning rate scheduler to use + #' @param device Which device to use for fitting, default is cpu + #' @param patience Patience to use for early stopping + initialize = function(baseModel, + modelParameters, + fitParameters, + optimizer = torch::optim_adam, + criterion = torch::nn_bce_with_logits_loss, + scheduler = torch::lr_reduce_on_plateau, + device = "cpu", + patience = 4) { + self$device <- device + self$model <- do.call(baseModel, modelParameters) + self$modelParameters <- modelParameters + self$fitParameters <- fitParameters + self$epochs <- self$itemOrDefaults(fitParameters, "epochs", 10) + self$learningRate <- self$itemOrDefaults(fitParameters, "learningRate", 1e-3) + self$l2Norm <- self$itemOrDefaults(fitParameters, "weightDecay", 1e-5) + self$batchSize <- self$itemOrDefaults(fitParameters, "batchSize", 1024) + self$posWeight <- self$itemOrDefaults(fitParameters, "posWeight", 1) + self$prefix <- self$itemOrDefaults(fitParameters, "prefix", self$model$name) + + self$previousEpochs <- self$itemOrDefaults(fitParameters, "previousEpochs", 0) + self$model$to(device = self$device) + + self$optimizer <- optimizer( + params = self$model$parameters, + lr = self$learningRate, + weight_decay = self$l2Norm + ) + self$criterion <- criterion(torch::torch_tensor(self$posWeight, + device = self$device + )) + + self$scheduler <- scheduler(self$optimizer, + patience = 1, + verbose = FALSE, mode = "max" + ) + + # gradient accumulation is useful when training large numbers where + # you can only fit few samples on the GPU in each batch. 
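+      # A sketch of how the accumulation count would be used if it were wired into
+      # fitEpoch (the current loop steps the optimizer after every batch):
+      #   loss <- loss / self$gradAccumulationIter
+      #   if (ix %% self$gradAccumulationIter == 0) self$optimizer$step()
+      # i.e. gradients from several small batches are summed before one optimizer step.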
+ self$gradAccumulationIter <- 1 + + if (!is.null(patience)) { + self$earlyStopper <- EarlyStopping$new(patience = patience) + } else { + self$earlyStopper <- NULL + } + + self$bestScore <- NULL + self$bestEpoch <- NULL + }, + + #' @description fits the estimator + #' @param dataset a torch dataset to use for model fitting + #' @param testDataset a torch dataset to use for early stopping + fit = function(dataset, testDataset) { + valLosses <- c() + valAUCs <- c() + batchIndex <- torch::torch_randperm(length(dataset)) + 1L + batchIndex <- split(batchIndex, ceiling(seq_along(batchIndex) / self$batchSize)) + + testBatchIndex <- 1:length(testDataset) + testBatchIndex <- split(testBatchIndex, ceiling(seq_along(testBatchIndex) / self$batchSize)) + + modelStateDict <- list() + epoch <- list() + times <- list() + learnRates <- list() + for (epochI in 1:self$epochs) { + startTime <- Sys.time() + trainLoss <- self$fitEpoch(dataset, batchIndex) + endTime <- Sys.time() + + # predict on test data + scores <- self$score(testDataset, testBatchIndex) + delta <- endTime - startTime + currentEpoch <- epochI + self$previousEpochs + lr <- self$optimizer$param_groups[[1]]$lr + ParallelLogger::logInfo( + "Epochs: ", currentEpoch, + " | Val AUC: ", round(scores$auc, 3), + " | Val Loss: ", round(scores$loss, 3), + " | Train Loss: ", round(trainLoss, 3), + " | Time: ", round(delta, 3), " ", + units(delta), + " | LR: ", lr + ) + self$scheduler$step(scores$auc) + valLosses <- c(valLosses, scores$loss) + valAUCs <- c(valAUCs, scores$auc) + learnRates <- c(learnRates, lr) + times <- c(times, round(delta, 3)) + if (!is.null(self$earlyStopper)) { + self$earlyStopper$call(scores$auc) + if (self$earlyStopper$improved) { + # here it saves the results to lists rather than files + modelStateDict[[epochI]] <- lapply(self$model$state_dict(), function(x) x$detach()$cpu()) + epoch[[epochI]] <- currentEpoch + } + if (self$earlyStopper$earlyStop) { + ParallelLogger::logInfo("Early stopping, validation AUC stopped improving") + ParallelLogger::logInfo("Average time per epoch was: ", round(mean(as.numeric(times)), 3), " ", units(delta)) + self$finishFit(valAUCs, modelStateDict, valLosses, epoch, learnRates) + return(invisible(self)) + } + } else { + modelStateDict[[epochI]] <- lapply(self$model$state_dict(), function(x) x$detach()$cpu()) + epoch[[epochI]] <- currentEpoch + } + } + ParallelLogger::logInfo("Average time per epoch was: ", round(mean(as.numeric(times)), 3), " ", units(delta)) + self$finishFit(valAUCs, modelStateDict, valLosses, epoch, learnRates) + invisible(self) + }, + + #' @description + #' fits estimator for one epoch (one round through the data) + #' @param dataset torch dataset to use for fitting + #' @param batchIndex indices of batches + fitEpoch = function(dataset, batchIndex) { + trainLosses <- torch::torch_empty(length(batchIndex)) + ix <- 1 + self$model$train() + progressBar <- utils::txtProgressBar(style = 3) + coro::loop(for (b in batchIndex) { + self$optimizer$zero_grad() + batch <- self$batchToDevice(dataset[b]) + out <- self$model(batch[[1]]) + loss <- self$criterion(out, batch[[2]]) + loss$backward() + + self$optimizer$step() + trainLosses[ix] <- loss$detach() + utils::setTxtProgressBar(progressBar, ix / length(batchIndex)) + ix <- ix + 1 + }) + close(progressBar) + trainLosses$mean()$item() + }, + + #' @description + #' calculates loss and auc after training for one epoch + #' @param dataset The torch dataset to use to evaluate loss and auc + #' @param batchIndex Indices of batches in the dataset + #' 
@return list with average loss and auc in the dataset + score = function(dataset, batchIndex) { + torch::with_no_grad({ + loss <- torch::torch_empty(c(length(batchIndex))) + predictions <- list() + targets <- list() + self$model$eval() + ix <- 1 + coro::loop(for (b in batchIndex) { + batch <- self$batchToDevice(dataset[b]) + pred <- self$model(batch[[1]]) + predictions <- c(predictions, pred) + targets <- c(targets, batch[[2]]) + loss[ix] <- self$criterion(pred, batch[[2]]) + ix <- ix + 1 + }) + mean_loss <- loss$mean()$item() + predictionsClass <- data.frame( + value = as.matrix(torch::torch_sigmoid(torch::torch_cat(predictions)$cpu())), + outcomeCount = as.matrix(torch::torch_cat(targets)$cpu()) + ) + attr(predictionsClass, "metaData")$modelType <- "binary" + auc <- PatientLevelPrediction::computeAuc(predictionsClass) + }) + return(list(loss = mean_loss, auc = auc)) + }, + + #' @description + #' operations that run when fitting is finished + #' @param valAUCs validation AUC values + #' @param modelStateDict fitted model parameters + #' @param valLosses validation losses + #' @param epoch list of epochs fit + #' @param learnRates learning rate sequence used so far + finishFit = function(valAUCs, modelStateDict, valLosses, epoch, learnRates) { + bestEpochInd <- which.max(valAUCs) # change this if a different metric is used + + bestModelStateDict <- lapply(modelStateDict[[bestEpochInd]], function(x) x$to(device = self$device)) + self$model$load_state_dict(bestModelStateDict) + + bestEpoch <- epoch[[bestEpochInd]] + self$bestEpoch <- bestEpoch + self$bestScore <- list( + loss = valLosses[bestEpochInd], + auc = valAUCs[bestEpochInd] + ) + self$learnRateSchedule <- learnRates[1:bestEpochInd] + + ParallelLogger::logInfo("Loaded best model (based on AUC) from epoch ", bestEpoch) + ParallelLogger::logInfo("ValLoss: ", self$bestScore$loss) + ParallelLogger::logInfo("valAUC: ", self$bestScore$auc) + }, + + #' @description + #' Fits whole training set on a specific number of epochs + #' TODO What happens when learning rate changes per epochs? + #' Ideally I would copy the learning rate strategy from before + #' and adjust for different sizes ie more iterations/updates??? 
+ #' @param dataset torch dataset + #' @param learnRates learnRateSchedule from CV + fitWholeTrainingSet = function(dataset, learnRates = NULL) { + if (is.null(self$bestEpoch)) { + self$bestEpoch <- self$epochs + } + + batchIndex <- torch::torch_randperm(length(dataset)) + 1L + batchIndex <- split(batchIndex, ceiling(seq_along(batchIndex) / self$batchSize)) + for (epoch in 1:self$bestEpoch) { + self$optimizer$param_groups[[1]]$lr <- learnRates[[epoch]] + self$fitEpoch(dataset, batchIndex) + } + }, + + #' @description + #' save model and those parameters needed to reconstruct it + #' @param path where to save the model + #' @param name name of file + #' @return the path to saved model + save = function(path, name) { + savePath <- file.path(path, name) + torch::torch_save( + list( + modelStateDict = self$model$state_dict(), + modelParameters = self$modelParameters, + fitParameters = self$fitParameters, + epoch = self$epochs + ), + savePath + ) + return(savePath) + }, + + + #' @description + #' predicts and outputs the probabilities + #' @param dataset Torch dataset to create predictions for + #' @return predictions as probabilities + predictProba = function(dataset) { + batchIndex <- 1:length(dataset) + batchIndex <- split(batchIndex, ceiling(seq_along(batchIndex) / self$batchSize)) + torch::with_no_grad({ + predictions <- c() + self$model$eval() + coro::loop(for (b in batchIndex) { + batch <- self$batchToDevice(dataset[b]) + target <- batch$target + pred <- self$model(batch$batch) + predictions <- c(predictions, as.array(torch::torch_sigmoid(pred$cpu()))) + }) + }) + return(predictions) + }, + + #' @description + #' predicts and outputs the class + #' @param dataset A torch dataset to create predictions for + #' @param threshold Which threshold to use for predictions + #' @return The predicted class for the data in the dataset + predict = function(dataset, threshold = NULL) { + predictions <- self$predictProba(dataset) + + if (is.null(threshold)) { + # use outcome rate + threshold <- dataset$target$sum()$item() / length(dataset) + } + predicted_class <- as.integer(predictions > threshold) + return(predicted_class) + }, + + #' @description + #' sends a batch of data to device + #' assumes batch includes lists of tensors to arbitrary nested depths + #' @param batch the batch to send, usually a list of torch tensors + #' @return the batch on the required device + batchToDevice = function(batch) { + if (class(batch)[1] == "torch_tensor") { + batch <- batch$to(device = self$device) + } else { + ix <- 1 + for (b in batch) { + if (class(b)[1] == "torch_tensor") { + b <- b$to(device = self$device) + } else { + b <- self$batchToDevice(b) + } + if (!is.null(b)) { + batch[[ix]] <- b + } + ix <- ix + 1 + } + } + return(batch) + }, + + #' @description + #' select item from list, and if it's null sets a default + #' @param list A list with items + #' @param item Which list item to retrieve + #' @param default The value to return if list doesn't have item + #' @return the list item or default + itemOrDefaults = function(list, item, default = NULL) { + value <- list[[item]] + if (is.null(value)) default else value + } + ) +) + +#' Earlystopping class +#' @description +#' Stops training if a loss or metric has stopped improving +EarlyStopping <- R6::R6Class( + classname = "EarlyStopping", + lock_objects = FALSE, + public = list( + #' @description + #' Creates a new earlystopping object + #' @param patience Stop after this number of epochs if loss doesn't improve + #' @param delta How much does the loss need 
to improve to count as improvement + #' @param verbose If information should be printed out + #' @return a new earlystopping object + initialize = function(patience = 3, delta = 0, verbose = TRUE) { + self$patience <- patience + self$counter <- 0 + self$verbose <- verbose + self$bestScore <- NULL + self$earlyStop <- FALSE + self$improved <- FALSE + self$delta <- delta + self$previousScore <- 0 + }, + #' @description + #' call the earlystopping object and increment a counter if loss is not + #' improving + #' @param metric the current metric value + call = function(metric) { + score <- metric + if (is.null(self$bestScore)) { + self$bestScore <- score + self$improved <- TRUE + } else if (score < self$bestScore + self$delta) { + self$counter <- self$counter + 1 + self$improved <- FALSE + if (self$verbose) { + ParallelLogger::logInfo( + "EarlyStopping counter: ", self$counter, + " out of ", self$patience + ) + } + if (self$counter >= self$patience) { + self$earlyStop <- TRUE + } + } else { + self$bestScore <- score + self$counter <- 0 + self$improved <- TRUE + } + self$previousScore <- score + } + ) +) diff --git a/R/Formatting.R b/R/Formatting.R deleted file mode 100644 index 600c24b..0000000 --- a/R/Formatting.R +++ /dev/null @@ -1,457 +0,0 @@ -# @file formatting.R -# -# Copyright 2020 Observational Health Data Sciences and Informatics -# -# This file is part of PatientLevelPrediction -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#' Convert the plpData in COO format into a sparse R matrix -#' -#' @description -#' Converts the standard plpData to a sparse matrix -#' -#' @details -#' This function converts the covariate file from ffdf in COO format into a sparse matrix from -#' the package Matrix -#' @param plpData An object of type \code{plpData} with covariate in coo format - the patient level prediction -#' data extracted from the CDM. 
-#' @param population The population to include in the matrix -#' @param map A covariate map (telling us the column number for covariates) -#' @param temporal Whether you want to convert temporal data -#' @examples -#' #TODO -#' -#' @return -#' Returns a list, containing the data as a sparse matrix, the plpData covariateRef -#' and a data.frame named map that tells us what covariate corresponds to each column -#' This object is a list with the following components: \describe{ -#' \item{data}{A sparse matrix with the rows corresponding to each person in the plpData and the columns corresponding to the covariates.} -#' \item{covariateRef}{The plpData covariateRef.} -#' \item{map}{A data.frame containing the data column ids and the corresponding covariateId from covariateRef.} -#' } -#' -#' @export -toSparseM <- function(plpData,population, map=NULL, temporal=F){ - # check logger - if(length(ParallelLogger::getLoggers())==0){ - logger <- ParallelLogger::createLogger(name = "SIMPLE", - threshold = "INFO", - appenders = list(ParallelLogger::createConsoleAppender(layout = ParallelLogger::layoutTimestamp))) - ParallelLogger::registerLogger(logger) - } - - ParallelLogger::logInfo(paste0('starting toSparseM')) - - - ParallelLogger::logDebug(paste0('covariates nrow: ', nrow(plpData$covariateData$covariates))) - ParallelLogger::logDebug(paste0('covariateRef nrow: ', nrow(plpData$covariateData$covariateRef))) - - - #assign newIds to covariateRef - newcovariateData <- MapCovariates(plpData$covariateData, - population, - mapping=map) - - ParallelLogger::logDebug(paste0('Max covariateId in covariates: ',as.data.frame(newcovariateData$covariates %>% dplyr::summarise(max = max(.data$covariateId, na.rm=T))))) - ParallelLogger::logDebug(paste0('# covariates in covariateRef: ', nrow(newcovariateData$covariateRef))) - ParallelLogger::logDebug(paste0('Max rowId in covariates: ', as.data.frame(newcovariateData$covariates %>% dplyr::summarise(max = max(.data$rowId, na.rm=T))))) - - maxY <- as.data.frame(newcovariateData$mapping %>% dplyr::summarise(max=max(.data$newCovariateId, na.rm = TRUE)))$max - ParallelLogger::logDebug(paste0('Max newCovariateId in mapping: ',maxY)) - maxX <- max(population$rowId) - ParallelLogger::logDebug(paste0('Max rowId in population: ',maxX)) - - # chunk then add - if(!temporal){ - ParallelLogger::logInfo(paste0('toSparseM non temporal used')) - data <- Matrix::sparseMatrix(i=1, - j=1, - x=0, - dims=c(maxX,maxY)) - - dataEnv <- environment() - convertData1 <- function(batch,dataEnv) { - data <- get("data", envir = dataEnv) - data <- data + Matrix::sparseMatrix(i=as.data.frame(batch %>% dplyr::select(.data$rowId))$rowId, - j=as.data.frame(batch %>% dplyr::select(.data$covariateId))$covariateId, - x=as.data.frame(batch %>% dplyr::select(.data$covariateValue))$covariateValue, - dims=c(maxX,maxY)) - assign("data", data, envir = dataEnv) - return(NULL) - } - Andromeda::batchApply(newcovariateData$covariates, convertData1, batchSize = 100000, dataEnv = dataEnv) - - } else { - ParallelLogger::logInfo(paste0('toSparseM temporal used')) - - ParallelLogger::logTrace(paste0('Min time:', min(plpData$timeRef$timeId))) - ParallelLogger::logTrace(paste0('Max time:', max(plpData$timeRef$timeId))) - - # do we want to use for(i in sort(plpData$timeRef$timeId)){ ? 
- for(i in min(plpData$timeRef$timeId):max(plpData$timeRef$timeId)){ - - if(nrow(newcovariateData$covariates %>% dplyr::filter(.data$timeId==i))>0){ - ParallelLogger::logTrace(paste0('Found covariates for timeId ', i)) - - - # initiate the sparse matrix - dataPlp <- Matrix::sparseMatrix(i=1, - j=1, - x=0, - dims=c(maxX, maxY)) - dataEnv <- environment() - ParallelLogger::logTrace(paste0('Initiated Mapping covariates for timeId ', i)) - - - # add function to batch creating matrix from Andromeda data - convertData <- function(batch, dataEnv) { - dataPlp <- get("dataPlp", envir = dataEnv) - dataPlp <- dataPlp + Matrix::sparseMatrix(i=as.double(as.character(batch$rowId)), - j=as.double(as.character(batch$covariateId)), - x=batch$covariateValue, - dims=c(maxX,maxY)) - - assign("dataPlp", dataPlp, envir = dataEnv) - return(NULL) - } - - # add age for each time - tempCovs <- addAgeTemp(timeId = i,newcovariateData, plpData$timeRef) # EDITED adding newCov - if(!is.null(tempCovs)){ - Andromeda::batchApply(tempCovs, convertData, batchSize = 100000, dataEnv=dataEnv) - ParallelLogger::logTrace(paste0('Added any age covariates for timeId ', i)) - } - - # add non age temporal covs for each time - tempCovs <- addNonAgeTemp(timeId = i,newcovariateData) - if(!is.null(tempCovs)){ - Andromeda::batchApply(tempCovs, convertData, batchSize = 100000, dataEnv=dataEnv) - ParallelLogger::logTrace(paste0('Added non-age non-temporal covariates for timeId ', i)) - } - - # add non temporal covs - tempCovs <- newcovariateData$covariates %>% - dplyr::filter(!is.na(.data$timeId)) %>% - dplyr::filter(.data$timeId == i) - Andromeda::batchApply(tempCovs, convertData, batchSize = 100000, dataEnv=dataEnv) - - data_array <- slam::as.simple_sparse_array(dataPlp) - # remove dataPlp - #dataPlp <<- NULL - ParallelLogger::logTrace(paste0('Dim of data_array: ', paste0(dim(data_array), collapse='-'))) - - #extending one more dimesion to the array - data_array<-slam::extend_simple_sparse_array(data_array, MARGIN =c(1L)) - ParallelLogger::logTrace(paste0('Finished Mapping covariates for timeId ', i)) - } else { - data_array <- tryCatch(slam::simple_sparse_array(i=matrix(c(1,1,1), ncol = 3), - v=0, - dim=c(maxX,1, maxY)) - ) - - } - #binding arrays along the dimesion - if(i==min(plpData$timeRef$timeId)) { - result_array <- data_array - }else{ - result_array <- slam::abind_simple_sparse_array(result_array,data_array,MARGIN=2L) - } - } - data <- result_array - } - - ParallelLogger::logDebug(paste0('Sparse matrix with dimensionality: ', paste(dim(data), collapse=',') )) - - ParallelLogger::logInfo(paste0('finishing toSparseM')) - - result <- list(data=data, - covariateRef=as.data.frame(newcovariateData$covariateRef), - map=as.data.frame(newcovariateData$mapping)) - return(result) - -} - -# restricts to pop and saves/creates mapping -MapCovariates <- function(covariateData,population, mapping){ - - # to remove check notes - #covariateId <- oldCovariateId <- newCovariateId <- NULL - ParallelLogger::logInfo(paste0('starting MapCovariates')) - - newCovariateData <- Andromeda::andromeda(covariateRef = covariateData$covariateRef, - analysisRef = covariateData$analysisRef) - - # restrict to population for speed - ParallelLogger::logTrace('restricting to population for speed and mapping') - if(is.null(mapping)){ - mapping <- data.frame(oldCovariateId = as.data.frame(covariateData$covariateRef %>% dplyr::select(.data$covariateId)), - newCovariateId = 1:nrow(covariateData$covariateRef)) - } - 
if(sum(colnames(mapping)%in%c('oldCovariateId','newCovariateId'))!=2){ - colnames(mapping) <- c('oldCovariateId','newCovariateId') - } - covariateData$mapping <- mapping - covariateData$population <- data.frame(rowId = population[,'rowId']) - # assign new ids : - newCovariateData$covariates <- covariateData$covariates %>% - dplyr::inner_join(covariateData$population) %>% - dplyr::rename(oldCovariateId = .data$covariateId) %>% - dplyr::inner_join(covariateData$mapping) %>% - dplyr::select(- .data$oldCovariateId) %>% - dplyr::rename(covariateId = .data$newCovariateId) - covariateData$population <- NULL - covariateData$mapping <- NULL - - newCovariateData$mapping <- mapping - - ParallelLogger::logInfo(paste0('finished MapCovariates')) - - return(newCovariateData) -} - - - -#' Convert the plpData in COO format into a sparse python matrix using torch.sparse -#' -#' @description -#' Converts the standard plpData to a sparse matrix firectly into python -#' -#' @details -#' This function converts the covariate file from ffdf in COO format into a sparse matrix from -#' the package Matrix -#' @param plpData An object of type \code{plpData} with covariate in coo format - the patient level prediction -#' data extracted from the CDM. -#' @param population The population to include in the matrix -#' @param map A covariate map (telling us the column number for covariates) -#' @param temporal Whether to include timeId into tensor -#' @param pythonExePath Location of python exe you want to use -#' @examples -#' #TODO -#' -#' @return -#' Returns a list, containing the python object name of the sparse matrix, the plpData covariateRef -#' and a data.frame named map that tells us what covariate corresponds to each column -#' This object is a list with the following components: \describe{ -#' \item{data}{The python object name containing a sparse matrix with the rows corresponding to each person in the plpData and the columns corresponding to the covariates.} -#' \item{covariateRef}{The plpData covariateRef.} -#' \item{map}{A data.frame containing the data column ids and the corresponding covariateId from covariateRef.} -#' } -#' -#' @export -toSparseTorchPython <- function(plpData,population, map=NULL, temporal=F, pythonExePath=NULL){ - - map_python_initiate <- map_python <- function(){return(NULL)} - - # check logger - if(length(ParallelLogger::getLoggers())==0){ - logger <- ParallelLogger::createLogger(name = "SIMPLE", - threshold = "INFO", - appenders = list(ParallelLogger::createConsoleAppender(layout = 'layoutTimestamp'))) - ParallelLogger::registerLogger(logger) - } - - newcovariateData <- MapCovariates(plpData$covariateData, - population, - mapping=map) - - ParallelLogger::logDebug(paste0('Max ',as.data.frame(newcovariateData$covariates %>% dplyr::summarise(max = max(.data$covariateId, na.rm=T))))) - ParallelLogger::logDebug(paste0('# cols: ', nrow(newcovariateData$covariateRef))) - ParallelLogger::logDebug(paste0('Max rowId: ', as.data.frame(newcovariateData$covariates %>% dplyr::summarise(max = max(.data$rowId, na.rm=T))))) - - ParallelLogger::logTrace(paste0('Converting data into python sparse matrix...')) - - maxT <- NULL - if(temporal){ - maxT <- as.data.frame(newcovariateData$covariates$timeId %>% dplyr::summarise(max = max(.data$id, na.rm=T))) - ParallelLogger::logDebug(paste0('Max timeId: ', maxT)) - } - - maxCol <- as.data.frame(newcovariateData$mapping %>% dplyr::summarise(max=max(.data$newCovariateId,na.rm = TRUE)))$max - maxRow <- max(population$rowId) - - # source the python fucntion - e <- 
environment() - reticulate::source_python(system.file(package='PatientLevelPrediction','python','TorchMap.py'), envir = e) - - dataEnv <- e # adding to remove <<- - #dataPlp <<- map_python_initiate(maxCol = as.integer(maxCol), - dataPlp <- map_python_initiate(maxCol = as.integer(maxCol), - maxRow = as.integer(maxRow), - maxT= as.integer(maxT)) - - convertData <- function(batch, temporal=T, dataEnv) { - if(temporal){ - #dataPlp <<- map_python(matrix = dataPlp , - dataEnv$dataPlp <- map_python(matrix = dataEnv$dataPlp, - datas = as.matrix(as.data.frame(batch %>% dplyr::select(.data$rowId,.data$covariateId,.data$timeId,.data$covariateValue))), - maxCol = as.integer(maxCol), - maxRow = as.integer(maxRow), - maxT = as.integer(maxT)) - }else{ - # dataPlp <<- map_python(matrix = dataPlp , - dataEnv$dataPlp <- map_python(matrix = dataEnv$dataPlp, - datas = as.matrix(as.data.frame(batch %>% dplyr::select(.data$rowId,.data$covariateId,.data$covariateValue))), - maxCol = as.integer(maxCol), - maxRow = as.integer(maxRow), - maxT = NULL) - } - return(NULL) - } - - if(temporal==T){ - # add the age and non-temporal data - timeIds <- unique(plpData$timeRef$timeId) - for(timeId in timeIds){ - tempData <- addAgeTemp(timeId, newcovariateData) - if(!is.null(tempData)){ - Andromeda::batchApply(tempData, convertData,temporal =T, batchSize = 100000, dataEnv=dataEnv) - } - #tempData <- addNonAgeTemp(timeId,plpData.mapped) - what is plpData.mapped? - tempData <- addNonAgeTemp(timeId, newcovariateData) - if(!is.null(tempData)){ - Andromeda::batchApply(tempData, convertData,temporal =T, batchSize = 100000, dataEnv=dataEnv) - } - tempData <- NULL - } - - # add the rest - tempData <- newcovariateData$covariates %>% - dplyr::filter(.data$timeId!=0) %>% - dplyr::filter(!is.na(.data$timeId)) - Andromeda::batchApply(tempData, convertData,temporal =T, batchSize = 100000, dataEnv=dataEnv) - tempData <- NULL - } else { - Andromeda::batchApply(newcovariateData$covariates, convertData, - temporal =F, batchSize = 100000, dataEnv=dataEnv) - } - ##result <- dataEnv$dataPlp - ##dataPlp <<- NULL - ##dataEnv$dataPlp <- NULL - ParallelLogger::logTrace(paste0('Sparse python tensor converted')) - - result <- list(data=dataPlp, - covariateRef=as.data.frame(newcovariateData$covariateRef), - map=as.data.frame(newcovariateData$mapping)) - return(result) -} - - -# reformat the evaluation -reformatPerformance <- function(train, test, analysisId){ - - ParallelLogger::logInfo(paste0('starting reformatPerformance')) - - nr1 <- length(unlist(train$evaluationStatistics[-1])) - nr2 <- length(unlist(test$evaluationStatistics[-1])) - evaluationStatistics <- cbind(analysisId=rep(analysisId,nr1+nr2), - Eval=c(rep('train', nr1),rep('test', nr2)), - Metric = names(c(unlist(train$evaluationStatistics[-1]), - unlist(test$evaluationStatistics[-1]))), - Value = c(unlist(train$evaluationStatistics[-1]), - unlist(test$evaluationStatistics[-1])) - ) - - - if(!is.null(test$thresholdSummary) & !is.null(train$thresholdSummary)){ - nr1 <- nrow(train$thresholdSummary) - nr2 <- nrow(test$thresholdSummary) - thresholdSummary <- rbind(cbind(analysisId=rep(analysisId,nr1),Eval=rep('train', nr1), - train$thresholdSummary), - cbind(analysisId=rep(analysisId,nr2),Eval=rep('test', nr2), - test$thresholdSummary)) - } else{ - thresholdSummary <- NULL - } - - - if(!is.null(train$demographicSummary) & !is.null(test$demographicSummary)){ - nr1 <- nrow(train$demographicSummary) - nr2 <- nrow(test$demographicSummary) - demographicSummary <- 
rbind(cbind(analysisId=rep(analysisId,nr1),Eval=rep('train', nr1), - train$demographicSummary), - cbind(analysisId=rep(analysisId,nr2),Eval=rep('test', nr2), - test$demographicSummary)) - } else{ - demographicSummary <- NULL - } - - nr1 <- nrow(train$calibrationSummary) - nr2 <- nrow(test$calibrationSummary) - calibrationSummary <- rbind(cbind(analysisId=rep(analysisId,nr1),Eval=rep('train', nr1), - train$calibrationSummary), - cbind(analysisId=rep(analysisId,nr2),Eval=rep('test', nr2), - test$calibrationSummary)) - - if(!is.null(train$predictionDistribution) & !is.null(test$predictionDistribution)){ - nr1 <- nrow(train$predictionDistribution) - nr2 <- nrow(test$predictionDistribution) - predictionDistribution <- rbind(cbind(analysisId=rep(analysisId,nr1),Eval=rep('train', nr1), - train$predictionDistribution), - cbind(analysisId=rep(analysisId,nr2),Eval=rep('test', nr2), - test$predictionDistribution)) - } else { - predictionDistribution <- NULL - } - - result <- list(evaluationStatistics=evaluationStatistics, - thresholdSummary=thresholdSummary, - demographicSummary =demographicSummary, - calibrationSummary=calibrationSummary, - predictionDistribution=predictionDistribution) - - return(result) -} - - -# helpers for converting temporal PLP data to matrix/tensor -addAgeTemp <- function(timeId, newcovariateData, timeRef){ - - startDay <- as.data.frame(timeRef[timeRef$timeId==timeId,])$startDay - - ageId <- as.data.frame(newcovariateData$mapping %>% - dplyr::filter(.data$oldCovariateId == 1002) %>% - dplyr::select(.data$newCovariateId))$newCovariateId - - ageData <- newcovariateData$covariates%>% # changed from plpData$covariateData - dplyr::filter(.data$covariateId == ageId) %>% - dplyr::mutate(covariateValueNew = .data$covariateValue*365 + startDay, - timeId = timeId) %>% - dplyr::select(- .data$covariateValue) %>% - dplyr::rename(covariateValue = .data$covariateValueNew) %>% - dplyr::select(.data$rowId,.data$covariateId,.data$covariateValue, .data$timeId) - - if(nrow(ageData)==0){ - return(NULL) - } - return(ageData) -} - - -addNonAgeTemp <- function(timeId, newcovariateData){ - - ageId <- as.data.frame(newcovariateData$mapping %>% - dplyr::filter(.data$oldCovariateId == 1002) %>% - dplyr::select(.data$newCovariateId))$newCovariateId - - otherTempCovs <- newcovariateData$covariates%>% - dplyr::filter(is.na(.data$timeId)) %>% - dplyr::filter(.data$covariateId != ageId) %>% - dplyr::mutate(timeId = timeId) %>% - dplyr::select(.data$rowId,.data$covariateId,.data$covariateValue,.data$timeId) - - if(nrow(otherTempCovs)==0){ - return(NULL) - } - return(otherTempCovs) -} diff --git a/R/MLP.R b/R/MLP.R new file mode 100644 index 0000000..dc5f56a --- /dev/null +++ b/R/MLP.R @@ -0,0 +1,175 @@ +# @file MLP.R +# +# Copyright 2022 Observational Health Data Sciences and Informatics +# +# This file is part of DeepPatientLevelPrediction +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
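+
+# Illustrative usage sketch for the settings function defined below; the plpData,
+# outcomeId and saveDirectory objects are assumed to come from the usual
+# PatientLevelPrediction workflow and are not created here:
+#
+#   modelSettings <- setMultiLayerPerceptron(numLayers = 2, sizeHidden = 128,
+#                                            dropout = 0.1, sizeEmbedding = 128,
+#                                            weightDecay = 1e-6, learningRate = 3e-4,
+#                                            randomSample = 1, device = "cpu")
+#   results <- PatientLevelPrediction::runPlp(plpData = plpData,
+#                                             outcomeId = outcomeId,
+#                                             modelSettings = modelSettings,
+#                                             analysisId = "mlpExample",
+#                                             saveDirectory = saveDirectory)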
+ +#' setMultiLayerPerceptron +#' +#' @description +#' Creates settings for a Multilayer perceptron model +#' +#' @details +#' Model architecture +#' +#' +#' @param numLayers Number of layers in network, default: 1:16 +#' @param sizeHidden Amount of neurons in each default layer, default: 2^(6:10) (64 to 1024) +#' @param dropout How much dropout to apply after first linear, default: seq(0, 0.3, 0.05) +#' @param sizeEmbedding Size of embedding layer, default: 2^(6:9) (64 to 512) +#' @param weightDecay Weight decay to apply, default: c(1e-6, 1e-3) +#' @param learningRate Learning rate to use. default: c(1e-2, 1e-5) +#' @param seed Seed to use for sampling hyperparameter space +#' @param hyperParamSearch Which kind of hyperparameter search to use random sampling or exhaustive grid search. default: 'random' +#' @param randomSample How many random samples from hyperparameter space to use +#' @param device Which device to run analysis on, either 'cpu' or 'cuda', default: 'cpu' +#' @param batchSize Size of batch, default: 1024 +#' @param epochs Number of epochs to run, default: 10 +#' +#' @export +setMultiLayerPerceptron <- function(numLayers = c(1:8), + sizeHidden = c(2^(6:9)), + dropout = c(seq(0, 0.5, 0.05)), + sizeEmbedding = c(2^(6:9)), + weightDecay = c(1e-6, 1e-3), + learningRate = c(1e-2, 3e-4, 1e-5), + seed = NULL, + hyperParamSearch = "random", + randomSample = 100, + device = "cpu", + batchSize = 1024, + epochs = 30) { + if (is.null(seed)) { + seed <- as.integer(sample(1e5, 1)) + } + + paramGrid <- list( + numLayers = numLayers, + sizeHidden = sizeHidden, + dropout = dropout, + sizeEmbedding = sizeEmbedding, + weightDecay = weightDecay, + learningRate = learningRate, + seed = list(as.integer(seed[[1]])) + ) + + param <- PatientLevelPrediction::listCartesian(paramGrid) + + if (hyperParamSearch == "random") { + param <- param[sample(length(param), randomSample)] + } + + attr(param, "settings") <- list( + seed = seed[1], + device = device, + batchSize = batchSize, + epochs = epochs, + name = "MLP", + saveType = "file", + modelParamNames = c( + "numLayers", "sizeHidden", + "dropout", "sizeEmbedding" + ), + baseModel = "MLP" + ) + + results <- list( + fitFunction = "fitEstimator", + param = param + ) + + class(results) <- "modelSettings" + + return(results) +} + + +MLP <- torch::nn_module( + name = "MLP", + initialize = function(catFeatures, numFeatures = 0, sizeEmbedding, sizeHidden, numLayers, + activation = torch::nn_relu, + normalization = torch::nn_batch_norm1d, dropout = NULL, + d_out = 1) { + self$embedding <- torch::nn_embedding_bag( + num_embeddings = catFeatures + 1, + embedding_dim = sizeEmbedding, + padding_idx = 1 + ) + if (numFeatures != 0) { + self$numEmbedding <- numericalEmbedding(numFeatures, sizeEmbedding) + } + + self$first_layer <- torch::nn_linear(sizeEmbedding, sizeHidden) + + + self$layers <- torch::nn_module_list(lapply( + 1:numLayers, + function(x) { + MLPLayer( + sizeHidden, + normalization, activation, + dropout + ) + } + )) + self$lastNorm <- normalization(sizeHidden) + self$head <- torch::nn_linear(sizeHidden, d_out) + + self$lastAct <- activation() + }, + forward = function(x) { + x_cat <- x$cat + x_num <- x$num + x_cat <- self$embedding(x_cat + 1L) # padding_idx is 1 + if (!is.null(x_num)) { + x <- (x_cat + self$numEmbedding(x_num)$mean(dim = 2)) / 2 + } else { + x <- x_cat + } + x <- self$first_layer(x) + + for (i in 1:length(self$layers)) { + x <- self$layers[[i]](x) + } + x <- self$lastNorm(x) + x <- self$lastAct(x) + x <- self$head(x) + x <- x$squeeze(-1) 
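+    # x is now a vector of raw logits (one per person); the Estimator's default
+    # criterion (nn_bce_with_logits_loss) and the sigmoid in predictProba expect this scale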
+ return(x) + } +) + +MLPLayer <- torch::nn_module( + name = "MLPLayer", + initialize = function(sizeHidden = 64, + normalization = torch::nn_batch_norm1d, + activation = torch::nn_relu, + dropout = 0.0, bias = TRUE) { + self$norm <- normalization(sizeHidden) + self$activation <- activation() + self$linear <- torch::nn_linear(sizeHidden, sizeHidden, bias = bias) + + # only add a dropout module when a non-zero rate is supplied + if (!is.null(dropout) && dropout != 0.0) { + self$dropout <- torch::nn_dropout(p = dropout) + } + }, + forward = function(x) { + x <- self$linear(self$norm(x)) + if (!is.null(self$dropout)) { + x <- self$dropout(x) + } + return(self$activation(x)) + } +) diff --git a/R/Predict.R b/R/Predict.R deleted file mode 100644 index caad604..0000000 --- a/R/Predict.R +++ /dev/null @@ -1,643 +0,0 @@ -# @file predict.R -# -# Copyright 2020 Observational Health Data Sciences and Informatics -# -# This file is part of PatientLevelPrediction -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#' predictPlp -#' -#' @description -#' Predict the risk of the outcome using the input plpModel for the input plpData -#' @details -#' The function applied the trained model on the plpData to make predictions -#' @param plpModel An object of type \code{plpModel} - a patient level prediction model -#' @param population The population created using createStudyPopulation() who will have their risks predicted -#' @param plpData An object of type \code{plpData} - the patient level prediction -#' data extracted from the CDM. -#' @param index A data frame containing rowId: a vector of rowids and index: a vector of doubles the same length as the rowIds. If used, only the rowIds with a negative index value are used to calculate the prediction. -#' -#' @return -#' A dataframe containing the prediction for each person in the population with an attribute metaData containing prediction details.
-#' - -# parent predict that calls the others -#' @export -predictPlp <- function(plpModel, population, plpData, index=NULL){ - - if(is.null(plpModel)) - stop('No model input') - if(is.null(population)) - stop('No population input') - if(is.null(plpData)) - stop('No plpData input') - - # check logger - if(length(ParallelLogger::getLoggers())==0){ - logger <- ParallelLogger::createLogger(name = "SIMPLE", - threshold = "INFO", - appenders = list(ParallelLogger::createConsoleAppender(layout = ParallelLogger::layoutTimestamp))) - ParallelLogger::registerLogger(logger) - } - - # apply the feature transformations - if(!is.null(index)){ - ParallelLogger::logTrace(paste0('Calculating prediction for ',sum(index$index<0),' in test set')) - ind <- population$rowId%in%index$rowId[index$index<0] - } else{ - ParallelLogger::logTrace(paste0('Calculating prediction for ',nrow(population),' in dataset')) - ind <- rep(T, nrow(population)) - } - - # do the predction on the new data - if(class(plpModel)=='plpModel'){ - # extract the classifier type - prediction <- plpModel$predict(plpData=plpData,population=population[ind,]) - - if(nrow(prediction)!=nrow(population[ind,])) - ParallelLogger::logWarn(paste0('Dimension mismatch between prediction and population test cases. Population test: ',nrow(population[ind, ]), '-- Prediction:', nrow(prediction) )) - } else{ - ParallelLogger::logError('Non plpModel input') - stop() - } - - metaData <- list(predictionType = attr(plpModel, 'predictionType'), #"binary", - cohortId = attr(population,'metaData')$cohortId, - outcomeId = attr(population,'metaData')$outcomeId, - timepoint = attr(population,'metaData')$riskWindowEnd) - - attr(prediction, "metaData") <- metaData - return(prediction) -} - -# default patient level prediction prediction -predict.plp <- function(plpModel,population, plpData, ...){ - ## done in transform covariateData <- limitCovariatesToPopulation(plpData$covariateData, population$rowId) - ParallelLogger::logTrace('predict.plp - predictingProbabilities start') - prediction <- predictProbabilities(plpModel$model, population, - plpData$covariateData) - ParallelLogger::logTrace('predict.plp - predictingProbabilities end') - - # add baselineHazard function - if(!is.null(plpModel$model$baselineHazard)){ - time <- attr(population,'metaData')$riskWindowEnd - bhind <- which.min(abs(plpModel$model$baselineHazard$time-time)) - prediction$value <- 1-plpModel$model$baselineHazard$surv[bhind]^prediction$value - - attr(prediction, "baselineHazard") <- plpModel$model$baselineHazard$surv[bhind] - attr(prediction, "timePoint") <- time - attr(prediction, "offset") <- 0 - } - - return(prediction) -} - -# for gxboost -predict.xgboost <- function(plpModel,population, plpData, ...){ - result <- toSparseM(plpData, population, map=plpModel$covariateMap) - data <- result$data[population$rowId,, drop = F] - prediction <- data.frame(rowId=population$rowId, - value=stats::predict(plpModel$model, data) - ) - - prediction <- merge(population, prediction, by='rowId', all.x=T, fill=0) - prediction <- prediction[,colnames(prediction)%in%c('rowId','subjectId','cohortStartDate','outcomeCount','indexes', 'value','ageYear', 'gender')] # need to fix no index issue - attr(prediction, "metaData") <- list(predictionType = "binary") - return(prediction) - -} - -predict.pythonReticulate <- function(plpModel, population, plpData){ - - - python_predict <- python_predict_temporal <- function(){return(NULL)} - - e <- environment() - 
reticulate::source_python(system.file(package='PatientLevelPrediction','python','predictFunctions.py'), envir = e) - - - ParallelLogger::logInfo('Mapping covariates...') - if(!is.null(plpData$timeRef)){ - pdata <- toSparseTorchPython(plpData,population, map=plpModel$covariateMap, temporal=T) - pdata <- pdata$data - fun_predict <- python_predict_temporal - } else { - newData <- toSparseM(plpData, population, map=plpModel$covariateMap) - included <- plpModel$varImp$covariateId[plpModel$varImp$included>0] # does this include map? - included <- newData$map$newCovariateId[newData$map$oldCovariateId%in%included] - pdata <- reticulate::r_to_py(newData$data[,included, drop = F]) - fun_predict <- python_predict - } - - population$rowIdPython <- population$rowId-1 # -1 to account for python/r index difference - namesInd <- c('rowIdPython','rowId')%in%colnames(population) - namesInd <- c('rowIdPython','rowId')[namesInd] - pPopulation <- as.matrix(population[,namesInd]) - - # run the python predict code: - ParallelLogger::logInfo('Executing prediction...') - result <- fun_predict(population = pPopulation, - plpData = pdata, - model_loc = plpModel$model, - dense = ifelse(is.null(plpModel$dense),0,plpModel$dense), - autoencoder = F) - - #get the prediction from python and reformat: - ParallelLogger::logInfo('Returning results...') - prediction <- result - prediction <- as.data.frame(prediction) - attr(prediction, "metaData") <- list(predictionType="binary") - colnames(prediction) <- c(namesInd, 'value') - - # add 1 to rowId from python: - ##prediction$rowId <- prediction$rowId+1 - - # add subjectId and date: - prediction <- merge(prediction[,colnames(prediction)!='rowIdPython'], - population, - by='rowId') - prediction <- prediction[,colnames(prediction)%in%c('rowId','subjectId','cohortStartDate','outcomeCount','indexes', 'value','ageYear', 'gender')] - return(prediction) -} - -predict.pythonAuto <- function(plpModel, population, plpData){ - - python_predict <- function(){return(NULL)} - - ParallelLogger::logInfo('Mapping covariates...') - if(!is.null(plpData$timeRef)){ - pdata <- toSparseTorchPython(plpData,population, map=plpModel$covariateMap, temporal=T) - pdata <- pdata$data - } else { - newData <- toSparseM(plpData, population, map=plpModel$covariateMap) - included <- plpModel$varImp$covariateId[plpModel$varImp$included>0] # does this include map? 
- included <- newData$map$newCovariateId[newData$map$oldCovariateId%in%included] - pdata <- reticulate::r_to_py(newData$data[,included]) - } - - population$rowIdPython <- population$rowId-1 # -1 to account for python/r index difference - namesInd <- c('rowIdPython','rowId')%in%colnames(population) - namesInd <- c('rowIdPython','rowId')[namesInd] - pPopulation <- as.matrix(population[,namesInd]) - - # run the python predict code: - ParallelLogger::logInfo('Executing prediction...') - e <- environment() - reticulate::source_python(system.file(package='PatientLevelPrediction','python','predictFunctions.py'), envir = e) - - result <- python_predict(population = pPopulation, - plpData = pdata, - model_loc = plpModel$model, - dense = ifelse(is.null(plpModel$dense),0,plpModel$dense), - autoencoder = T) - #get the prediction from python and reformat: - ParallelLogger::logInfo('Returning results...') - prediction <- result - prediction <- as.data.frame(prediction) - attr(prediction, "metaData") <- list(predictionType="binary") - colnames(prediction) <- c(namesInd, 'value') - - prediction <- merge(prediction[,colnames(prediction)!='rowIdPython'], - population, - by='rowId') - prediction <- prediction[,colnames(prediction)%in%c('rowId','subjectId','cohortStartDate','outcomeCount','indexes', 'value','ageYear', 'gender')] - return(prediction) -} - - - -predict.knn <- function(plpData, population, plpModel, ...){ - ##covariateData <- limitCovariatesToPopulation(plpData$covariateData, population$rowId) - prediction <- BigKnn::predictKnn(covariates = plpData$covariateData$covariates, - cohorts= population[,!colnames(population)%in%'cohortStartDate'], - indexFolder = plpModel$model, - k = plpModel$modelSettings$modelParameters$k, - weighted = TRUE, - threads = plpModel$modelSettings$threads) - - # can add: threads = 1 in the future - - # return the cohorts as a data frame with the prediction added as - # a new column with the column name 'value' - prediction <- merge(population, prediction[,c('rowId','value')], by='rowId', - all.x=T, fill=0) - prediction$value[is.na(prediction$value)] <- 0 - - return(prediction) -} - - -predict.deep <- function(plpModel, population, plpData, ...){ - ensure_installed("plyr") - - temporal <- !is.null(plpData$timeRef) - ParallelLogger::logDebug(paste0('timeRef null: ',is.null(plpData$timeRef))) - if(temporal){ - ParallelLogger::logTrace('temporal') - result<-toSparseM(plpData,population,map=plpModel$covariateMap, temporal=T) - - data <-result$data[population$rowId,,] - if(!is.null(plpModel$useVae)){ - if(plpModel$useVae==TRUE){ - data<- plyr::aaply(as.array(data), 2, function(x) stats::predict(plpModel$vaeEncoder, x, batch_size = plpModel$vaeBatchSize)) - data<-aperm(data, perm = c(2,1,3))#rearrange of dimension - }} - - batch_size <- min(2000, length(population$rowId)) - maxVal <- length(population$rowId) - batches <- lapply(1:ceiling(maxVal/batch_size), function(x) ((x-1)*batch_size+1):min((x*batch_size),maxVal)) - prediction <- population - prediction$value <- 0 - for(batch in batches){ - pred <- keras::predict_on_batch(plpModel$model, as.array(data[batch,,])) - if(is.null(dim(pred))){ - prediction$value[batch] <- as.double(pred) - } else{ - prediction$value[batch] <- as.double(pred[,2]) - } - } - - prediction <- prediction[,colnames(prediction)%in%c('rowId','subjectId','cohortStartDate','outcomeCount','indexes', 'value','ageYear', 'gender')] # need to fix no index issue - return(prediction) - } else{ - result<-toSparseM(plpData,population,map=plpModel$covariateMap, 
temporal=F) - data <-result$data[population$rowId,] - - batch_size <- min(2000, length(population$rowId)) - maxVal <- length(population$rowId) - batches <- lapply(1:ceiling(maxVal/batch_size), function(x) ((x-1)*batch_size+1):min((x*batch_size),maxVal)) - prediction <- population - prediction$value <- 0 - for(batch in batches){ - pred <- keras::predict_on_batch(plpModel$model, as.array(data[batch,])) - prediction$value[batch] <- as.double(pred[,2]) - } - - prediction <- prediction[,colnames(prediction)%in%c('rowId','subjectId','cohortStartDate','outcomeCount','indexes', 'value','ageYear', 'gender')] # need to fix no index issue - return(prediction) - - } -} - -predict.BayesianDeep <- function(plpModel, population, plpData, ...){ - ensure_installed("plyr") - - temporal <- !is.null(plpData$timeRef) - ParallelLogger::logDebug(paste0('timeRef null: ',is.null(plpData$timeRef))) - if(temporal){ - ParallelLogger::logTrace('temporal') - result<-toSparseM(plpData,population,map=plpModel$covariateMap, temporal=T) - - data <-result$data[population$rowId,,] - if(!is.null(plpModel$useVae)){ - if(plpModel$useVae==TRUE){ - data<- plyr::aaply(as.array(data), 2, function(x) stats::predict(plpModel$vaeEncoder, x, batch_size = plpModel$vaeBatchSize)) - data<-aperm(data, perm = c(2,1,3))#rearrange of dimension - }} - - batch_size <- min(2000, length(population$rowId)) - maxVal <- length(population$rowId) - batches <- lapply(1:ceiling(maxVal/batch_size), function(x) ((x-1)*batch_size+1):min((x*batch_size),maxVal)) - prediction <- population - prediction$value <- 0 - prediction$epistemicUncertainty <- 0 - prediction$aleatoricUncertainty <- 0 - for(batch in batches){ - num_MC_samples = 100 - output_dim = 2 - pred <- keras::predict_on_batch(plpModel$model, as.array(data[batch,,])) - MC_samples <- array(0, dim = c(num_MC_samples, length(batch), 2 * output_dim)) - for (k in 1:num_MC_samples){ - MC_samples[k,, ] = stats::predict(plpModel$model, as.array(data[batch,,])) - #keras::predict_proba(model, as.array(plpData[population$rowId[population$indexes==index],,][batch,,])) - } - pred <- apply(MC_samples[,,output_dim], 2, mean) - epistemicUncertainty <- apply(MC_samples[,,output_dim], 2, stats::var) - logVar = MC_samples[, , output_dim * 2] - if(length(dim(logVar))<=1){ - aleatoricUncertainty = exp(mean(logVar)) - }else{ - aleatoricUncertainty = exp(colMeans(logVar)) - - } - prediction$value[batch] <- pred - prediction$epistemicUncertainty[batch] = epistemicUncertainty - prediction$aleatoricUncertainty[batch] = aleatoricUncertainty - #writeLines(paste0(dim(pred[,2]), collapse='-')) - #writeLines(paste0(pred[1,2], collapse='-')) - - } - prediction$value[prediction$value>1] <- 1 - prediction$value[prediction$value<0] <- 0 - prediction <- prediction[,colnames(prediction)%in%c('rowId','subjectId','cohortStartDate','outcomeCount','indexes', 'value','ageYear', 'gender', - 'epistemicUncertainty', 'aleatoricUncertainty')] # need to fix no index issue - return(prediction) - } else{ - result<-toSparseM(plpData,population,map=plpModel$covariateMap, temporal=F) - data <-result$data[population$rowId,] - - batch_size <- min(2000, length(population$rowId)) - maxVal <- length(population$rowId) - batches <- lapply(1:ceiling(maxVal/batch_size), function(x) ((x-1)*batch_size+1):min((x*batch_size),maxVal)) - prediction <- population - prediction$value <- 0 - for(batch in batches){ - num_MC_samples = 100 - output_dim =2 - MC_samples <- array(0, dim = c(num_MC_samples, length(batch), 2 * output_dim)) - for (k in 1:num_MC_samples){ - 
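# Each of the num_MC_samples stochastic forward passes is stored in MC_samples;
# after the loop, the per-person mean of column output_dim becomes the predicted
# risk, the variance across samples the epistemic uncertainty, and the
# exponential of the mean predicted log-variance (column output_dim * 2) the
# aleatoric uncertainty.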
MC_samples[k,, ] = stats::predict(plpModel$model, as.array(data[batch,,])) - #keras::predict_proba(model, as.array(plpData[population$rowId[population$indexes==index],,][batch,,])) - } - pred <- apply(MC_samples[,,output_dim], 2, mean) - epistemicUncertainty <- apply(MC_samples[,,output_dim], 2, stats::var) - logVar = MC_samples[, , output_dim * 2] - if(length(dim(logVar))<=1){ - aleatoricUncertainty = exp(mean(logVar)) - }else{ - aleatoricUncertainty = exp(colMeans(logVar)) - - } - prediction$value[batch] <- pred - prediction$epistemicUncertainty[batch] = epistemicUncertainty - prediction$aleatoricUncertainty[batch] = aleatoricUncertainty - #writeLines(paste0(dim(pred[,2]), collapse='-')) - #writeLines(paste0(pred[1,2], collapse='-')) - } - prediction$value[prediction$value>1] <- 1 - prediction$value[prediction$value<0] <- 0 - prediction <- prediction[,colnames(prediction)%in%c('rowId','subjectId','cohortStartDate','outcomeCount','indexes', 'value','ageYear', 'gender', - 'epistemicUncertainty', 'aleatoricUncertainty')] # need to fix no index issue - return(prediction) - - } -} - -predict.deepEnsemble <- function(plpModel, population, plpData, ...){ - ensure_installed("plyr") - - mu <- function(){return(NULL)} - sigma <- function(){return(NULL)} - - temporal <- !is.null(plpData$timeRef) - ParallelLogger::logDebug(paste0('timeRef null: ',is.null(plpData$timeRef))) - if(temporal){ - ParallelLogger::logTrace('temporal') - result<-toSparseM(plpData,population,map=plpModel$covariateMap, temporal=T) - - data <-result$data[population$rowId,,] - if(!is.null(plpModel$useVae)){ - if(plpModel$useVae==TRUE){ - data<- plyr::aaply(as.array(data), 2, function(x) stats::predict(plpModel$vaeEncoder, x, batch_size = plpModel$vaeBatchSize)) - data<-aperm(data, perm = c(2,1,3))#rearrange of dimension - }} - - batch_size <- min(2000, length(population$rowId)) - maxVal <- length(population$rowId) - batches <- lapply(1:ceiling(maxVal/batch_size), function(x) ((x-1)*batch_size+1):min((x*batch_size),maxVal)) - prediction <- population - prediction$value <- 0 - - prediction$sigmas <- 0 - for(batch in batches){ - for (i in seq(plpModel$modelSettings$modelParameters$numberOfEnsembleNetwork)){ - if(i==1){ - muMatrix <- data.frame() - sigmaMatrix <-data.frame() - } - c(mu,sigma) %<-% plpModel$model[[i]](inputs=list(as.array(data[batch,,]))) - muMatrix<-rbind(muMatrix,t(as.data.frame(mu[,2]))) - sigmaMatrix<-rbind(sigmaMatrix,t(as.data.frame(sigma[,2]))) - } - - muMean <- apply(muMatrix,2,mean) - muSq <- muMatrix^2 - sigmaSq <- sigmaMatrix^2 - sigmaMean <- apply(sigmaMatrix,2,mean) - sigmaResult=apply(muSq+sigmaSq,2, mean)- muMean^2 - - prediction$value[batch] <- c(muMean) - prediction$sigmas[batch] <- c(sigmaResult) - } - prediction <- prediction[,colnames(prediction)%in%c('rowId','subjectId','cohortStartDate','outcomeCount','indexes', 'value', 'sigmas')] # need to fix no index issue - #If the prediction value is negative, please add values to make all the values to positive. 
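# sigmaResult above is the variance of an equally weighted Gaussian mixture:
# Var = mean(mu^2 + sigma^2) - mean(mu)^2; the shift below only moves the
# ensemble means up so that the smallest predicted value becomes zero.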
- if(min(prediction$value)<0){prediction$value = prediction$value+ (min(prediction$value)* (-1)) } - return(prediction) - - } else{ - result<-toSparseM(plpData,population,map=plpModel$covariateMap, temporal=F) - data <-result$data[population$rowId,] - - batch_size <- min(2000, length(population$rowId)) - maxVal <- length(population$rowId) - batches <- lapply(1:ceiling(maxVal/batch_size), function(x) ((x-1)*batch_size+1):min((x*batch_size),maxVal)) - prediction <- population - prediction$value <- 0 - - prediction$sigmas <- 0 - for(batch in batches){ - for (i in seq(plpModel$modelSettings$modelParameters$numberOfEnsembleNetwork)){ - if(i==1){ - muMatrix <- data.frame() - sigmaMatrix <-data.frame() - } - c(mu,sigma) %<-% plpModel$model[[i]](inputs=list(as.array(data[batch,,]))) - muMatrix<-rbind(muMatrix,t(as.data.frame(mu[,2]))) - sigmaMatrix<-rbind(sigmaMatrix,t(as.data.frame(sigma[,2]))) - } - - muMean <- apply(muMatrix,2,mean) - muSq <- muMatrix^2 - sigmaSq <- sigmaMatrix^2 - sigmaMean <- apply(sigmaMatrix,2,mean) - sigmaResult=apply(muSq+sigmaSq,2, mean)- muMean^2 - - prediction$value[batch] <- c(muMean) - prediction$sigmas[batch] <- c(sigmaResult) - } - - prediction <- prediction[,colnames(prediction)%in%c('rowId','subjectId','cohortStartDate','outcomeCount','indexes', 'value', 'sigmas','ageYear', 'gender')] # need to fix no index issue - if(min(prediction$value)<0){prediction$value = prediction$value+ (min(prediction$value)* (-1)) } - return(prediction) - - } -} - -predict.deepMulti <- function(plpModel, population, plpData, ...){ - - repeats <- attr(plpModel, 'inputs') - - temporal <- !is.null(plpData$timeRef) - ParallelLogger::logDebug('timeRef null: ',paste0(is.null(plpData$timeRef))) - if(temporal){ - ParallelLogger::logTrace('temporal') - result<-toSparseM(plpData,population,map=plpModel$covariateMap, temporal=T) - ParallelLogger::logDebug('result$data dim: ',paste0(dim(result$data), collapse = '-')) - ParallelLogger::logDebug('population dim: ',paste0(dim(population), collapse = '-')) - data <-result$data[population$rowId,,] - - ParallelLogger::logInfo('running batch prediction') - batch_size <- min(2000, length(population$rowId)) - maxVal <- length(population$rowId) - batches <- lapply(1:ceiling(maxVal/batch_size), function(x) ((x-1)*batch_size+1):min((x*batch_size),maxVal)) - prediction <- population - prediction$value <- 0 - for(batch in batches){ - ParallelLogger::logDebug('batch length: ', length(batch)) - dat <- list() - length(dat) <- repeats - for( i in 1:repeats) {dat[[i]] <- as.array(data[batch,,])} - - pred <- keras::predict_on_batch(plpModel$model, dat) - if(is.null(dim(pred))){ - ParallelLogger::logDebug('Pred length: ', length(pred)) - prediction$value[batch] <- as.double(pred) - } else{ - ParallelLogger::logDebug('Pred dim: ', paste0(dim(pred), collapse = '-')) - prediction$value[batch] <- as.double(pred[,2]) - } - } - - prediction <- prediction[,colnames(prediction)%in%c('rowId','subjectId','cohortStartDate','outcomeCount','indexes', 'value','ageYear', 'gender')] # need to fix no index issue - return(prediction) - } else{ - result<-toSparseM(plpData,population,map=plpModel$covariateMap, temporal=F) - data <-result$data[population$rowId,] - - batch_size <- min(2000, length(population$rowId)) - maxVal <- length(population$rowId) - batches <- lapply(1:ceiling(maxVal/batch_size), function(x) ((x-1)*batch_size+1):min((x*batch_size),maxVal)) - prediction <- population - prediction$value <- 0 - for(batch in batches){ - dat <- list() - length(dat) <- repeats - for( i in 
1:repeats) {dat[[i]] <- as.array(data[batch,,])} - - pred <- keras::predict_on_batch(plpModel$model, dat) - prediction$value[batch] <- as.double(pred) - } - - prediction <- prediction[,colnames(prediction)%in%c('rowId','subjectId','cohortStartDate','outcomeCount','indexes', 'value','ageYear', 'gender')] # need to fix no index issue - return(prediction) - - } -} - - -#' Create predictive probabilities -#' -#' @details -#' Generates predictions for the population specified in plpData given the model. -#' -#' @return -#' The value column in the result data.frame is: logistic: probabilities of the outcome, poisson: -#' Poisson rate (per day) of the outome, survival: hazard rate (per day) of the outcome. -#' -#' @param predictiveModel An object of type \code{predictiveModel} as generated using -#' \code{\link{fitPlp}}. -#' @param population The population to calculate the prediction for -#' @param covariateData The covariateData containing the covariates for the population -#' @export -predictProbabilities <- function(predictiveModel, population, covariateData) { - start <- Sys.time() - - ParallelLogger::logTrace('predictProbabilities - predictAndromeda start') - prediction <- predictAndromeda(predictiveModel$coefficients, - population, - covariateData, - predictiveModel$modelType) - ParallelLogger::logTrace('predictProbabilities - predictAndromeda end') - prediction$time <- NULL - attr(prediction, "modelType") <- predictiveModel$modelType - attr(prediction, "cohortId") <- attr(population, "metadata")$cohortId - attr(prediction, "outcomeId") <- attr(population, "metadata")$outcomeId - - if (predictiveModel$modelType %in% c("survival", "cox")) { - attr(prediction, "timepoint") <- attr(population, "metadata")$riskWindowEnd - } - - delta <- Sys.time() - start - ParallelLogger::logInfo("Prediction took ", signif(delta, 3), " ", attr(delta, "units")) - return(prediction) -} - -#' Generated predictions from a regression model -#' -#' @param coefficients A names numeric vector where the names are the covariateIds, except for the -#' first value which is expected to be the intercept. -#' @param population A data frame containing the population to do the prediction for -#' @param covariateData An andromeda object containing the covariateData with predefined columns -#' (see below). -#' @param modelType Current supported types are "logistic", "poisson", "cox" or "survival". -#' -#' @details -#' These columns are expected in the outcome object: \tabular{lll}{ \verb{rowId} \tab(integer) \tab -#' Row ID is used to link multiple covariates (x) to a single outcome (y) \cr \verb{time} \tab(real) -#' \tab For models that use time (e.g. Poisson or Cox regression) this contains time \cr \tab -#' \tab(e.g. 
number of days) \cr } These columns are expected in the covariates object: \tabular{lll}{ -#' \verb{rowId} \tab(integer) \tab Row ID is used to link multiple covariates (x) to a single outcome -#' (y) \cr \verb{covariateId} \tab(integer) \tab A numeric identifier of a covariate \cr -#' \verb{covariateValue} \tab(real) \tab The value of the specified covariate \cr } -#' -#' @export -predictAndromeda <- function(coefficients, population, covariateData, modelType = "logistic") { - if (!(modelType %in% c("logistic", "poisson", "survival","cox"))) { - stop(paste("Unknown modelType:", modelType)) - } - if (!FeatureExtraction::isCovariateData(covariateData)){ - stop("Needs correct covariateData") - } - - intercept <- coefficients[names(coefficients)%in%'(Intercept)'] - if(length(intercept)==0) intercept <- 0 - coefficients <- coefficients[!names(coefficients)%in%'(Intercept)'] - coefficients <- data.frame(beta = as.numeric(coefficients), - covariateId = bit64::as.integer64(names(coefficients)) #!@ modified - ) - coefficients <- coefficients[coefficients$beta != 0, ] - if(sum(coefficients$beta != 0)>0){ - covariateData$coefficients <- coefficients - on.exit(covariateData$coefficients <- NULL, add = TRUE) - - prediction <- covariateData$covariates %>% - dplyr::inner_join(covariateData$coefficients, by= 'covariateId') %>% - dplyr::mutate(values = .data$covariateValue*.data$beta) %>% - dplyr::group_by(.data$rowId) %>% - dplyr::summarise(value = sum(.data$values, na.rm = TRUE)) %>% - dplyr::select(.data$rowId, .data$value) - - prediction <- as.data.frame(prediction) - prediction <- merge(population, prediction, by ="rowId", all.x = TRUE, fill = 0) - prediction$value[is.na(prediction$value)] <- 0 - prediction$value <- prediction$value + intercept - } else{ - warning('Model had no non-zero coefficients so predicted same for all population...') - prediction <- population - prediction$value <- rep(0, nrow(population)) + intercept - } - if (modelType == "logistic") { - link <- function(x) { - return(1/(1 + exp(0 - x))) - } - prediction$value <- link(prediction$value) - } else if (modelType == "poisson" || modelType == "survival" || modelType == "cox") { - prediction$value <- exp(prediction$value) - #if(max(prediction$value)>1){ - # prediction$value <- prediction$value/max(prediction$value) - #} - } - return(prediction) -} - diff --git a/R/RNNTorch.R b/R/RNNTorch.R deleted file mode 100644 index ad86b0e..0000000 --- a/R/RNNTorch.R +++ /dev/null @@ -1,173 +0,0 @@ -# @file RNNTorch.R -# -# Copyright 2020 Observational Health Data Sciences and Informatics -# -# This file is part of PatientLevelPrediction -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
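# A toy illustration (not package code) of the calculation predictAndromeda()
# above performs for modelType = "logistic": covariate values are multiplied by
# their coefficients, summed per rowId, the intercept is added, and the logistic
# link is applied. The covariateIds and coefficient values below are made up.
coefficients <- c("(Intercept)" = -2, "1002" = 0.5, "2001" = 1.2)
covariates <- data.frame(
  rowId = c(1, 1, 2),
  covariateId = c(1002, 2001, 1002),
  covariateValue = c(1, 1, 1)
)
linearPredictor <- tapply(
  covariates$covariateValue * coefficients[as.character(covariates$covariateId)],
  covariates$rowId,
  sum
) + coefficients[["(Intercept)"]]
risk <- 1 / (1 + exp(-linearPredictor)) # the value column returned for "logistic"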
- -#' Create setting for RNN model with python -#' @param hidden_size The hidden size -#' @param epochs The number of epochs -#' @param seed A seed for the model -#' @param class_weight The class weight used for imbalanced data: -#' 0: Inverse ratio between positives and negatives -#' -1: Focal loss -#' @param type It can be normal 'RNN', 'BiRNN' (bidirectional RNN) and 'GRU' -#' -#' @examples -#' \dontrun{ -#' model.rnnTorch <- setRNNTorch() -#' } -#' @export -setRNNTorch <- function(hidden_size=c(50, 100), epochs=c(20, 50), seed=0, class_weight = 0, type = 'RNN'){ - # check inputs - - # add warning about dense data.. - - - # set seed - if(is.null(seed[1])){ - seed <- as.integer(sample(100000000,1)) - } - - result <- list(model='fitRNNTorch', param=split(expand.grid(hidden_size=hidden_size, - epochs=epochs, seed=seed[1], - class_weight = class_weight, type = type), - 1:(length(hidden_size)*length(epochs)) ), - name='RNN Torch') - - class(result) <- 'modelSettings' - - return(result) -} - -fitRNNTorch <- function(population, plpData, param, search='grid', quiet=F, - outcomeId, cohortId, ...){ - - # check plpData is libsvm format or convert if needed - if (!FeatureExtraction::isCovariateData(plpData$covariateData)) - stop("Needs correct covariateData") - - if(colnames(population)[ncol(population)]!='indexes'){ - warning('indexes column not present as last column - setting all index to 1') - population$indexes <- rep(1, nrow(population)) - } - - start <- Sys.time() - - population$rowIdPython <- population$rowId-1 #to account for python/r index difference #subjectId - pPopulation <- as.matrix(population[,c('rowIdPython','outcomeCount','indexes')]) - - result <- toSparseTorchPython(plpData,population, map=NULL, temporal=T) - - outLoc <- createTempModelLoc() - # clear the existing model pickles - for(file in dir(outLoc)) - file.remove(file.path(outLoc,file)) - - # do cross validation to find hyperParameter - hyperParamSel <- lapply(param, function(x) do.call(trainRNNTorch, listAppend(x, - list(plpData = result$data, - population = pPopulation, - train=TRUE, - modelOutput=outLoc)) )) - - hyperSummary <- cbind(do.call(rbind, param), unlist(hyperParamSel)) - - #now train the final model and return coef - bestInd <- which.max(abs(unlist(hyperParamSel)-0.5))[1] - finalModel <- do.call(trainRNNTorch, listAppend(param[[bestInd]], - list(plpData = result$data, - population = pPopulation, - train=FALSE, - modelOutput=outLoc))) - - covariateRef <- as.data.frame(plpData$covariateData$covariateRef) - incs <- rep(1, nrow(covariateRef)) - covariateRef$included <- incs - covariateRef$covariateValue <- rep(0, nrow(covariateRef)) - - modelTrained <- file.path(outLoc) - param.best <- param[[bestInd]] - - comp <- start-Sys.time() - - # prediction on train set: - pred <- finalModel - pred[,1] <- pred[,1] + 1 # adding one to convert from python to r indexes - colnames(pred) <- c('rowId','outcomeCount','indexes', 'value') - pred <- as.data.frame(pred) - attr(pred, "metaData") <- list(predictionType="binary") - - pred$value <- 1-pred$value - prediction <- merge(population, pred[,c('rowId','value')], by='rowId') - - # return model location - result <- list(model = modelTrained, - trainCVAuc = -1, # ToDo decide on how to deal with this - hyperParamSearch = hyperSummary, - modelSettings = list(model='fitRNNTorch',modelParameters=param.best), - metaData = plpData$metaData, - populationSettings = attr(population, 'metaData'), - outcomeId=outcomeId, - cohortId=cohortId, - varImp = covariateRef, - trainingTime =comp, - 
dense=1, - covariateMap=result$map, # I think this is need for new data to map the same? - predictionTrain = prediction - ) - class(result) <- 'plpModel' - attr(result, 'type') <- 'pythonReticulate' - attr(result, 'predictionType') <- 'binary' - - return(result) -} - - -trainRNNTorch <- function(plpData, population, epochs=50, hidden_size = 100, seed=0, class_weight= 0, type = 'RNN', train=TRUE, modelOutput, quiet=F){ - - train_deeptorch <- function(){return(NULL)} - python_dir <- system.file(package='PatientLevelPrediction','python') - - e <- environment() - reticulate::source_python(system.file(package='PatientLevelPrediction','python','deepTorchFunctions.py'), envir = e) - - result <- train_deeptorch(population = population, - train = train, - plpData = plpData, - model_type = as.character(type), - epochs = as.integer(epochs), - hidden_size = as.integer(hidden_size), - class_weight = class_weight, - seed = seed, - quiet = quiet, - modelOutput = modelOutput) - - if(train){ - # then get the prediction - pred <- result - colnames(pred) <- c('rowId','outcomeCount','indexes', 'value') - pred <- as.data.frame(pred) - attr(pred, "metaData") <- list(predictionType="binary") - - pred$value <- 1-pred$value - auc <- computeAuc(pred) - writeLines(paste0('Model obtained CV AUC of ', auc)) - return(auc) - } - - return(result) - -} diff --git a/R/ResNet.R b/R/ResNet.R new file mode 100644 index 0000000..e8ce322 --- /dev/null +++ b/R/ResNet.R @@ -0,0 +1,194 @@ +# @file ResNet.R +# +# Copyright 2022 Observational Health Data Sciences and Informatics +# +# This file is part of DeepPatientLevelPrediction +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#' setResNet +#' +#' @description +#' Creates settings for a ResNet model +#' +#' @details +#' Model architecture from https://arxiv.org/abs/2106.11959 +#' +#' +#' @param numLayers Number of layers in network, default: 1:8 +#' @param sizeHidden Number of neurons in each hidden layer, default: 2^(6:9) (64 to 512) +#' @param hiddenFactor How much to grow the number of neurons in each ResLayer, default: 1:4 +#' @param residualDropout How much dropout to apply after last linear layer in ResLayer, default: seq(0, 0.5, 0.05) +#' @param hiddenDropout How much dropout to apply after first linear layer in ResLayer, default: seq(0, 0.5, 0.05) +#' @param sizeEmbedding Size of embedding layer, default: 2^(6:9) (64 to 512) +#' @param weightDecay Weight decay to apply, default: c(1e-6, 1e-3) +#' @param learningRate Learning rate to use, default: c(1e-2, 3e-4, 1e-5) +#' @param seed Seed to use for sampling hyperparameter space +#' @param hyperParamSearch Which kind of hyperparameter search to use, random sampling or exhaustive grid search,
default: 'random' +#' @param randomSample How many random samples from hyperparameter space to use +#' @param device Which device to run analysis on, either 'cpu' or 'cuda', default: 'cpu' +#' @param batchSize Size of batch, default: 1024 +#' @param epochs Number of epochs to run, default: 30 +#' +#' @export +setResNet <- function(numLayers = c(1:8), + sizeHidden = c(2^(6:9)), + hiddenFactor = c(1:4), + residualDropout = c(seq(0, 0.5, 0.05)), + hiddenDropout = c(seq(0, 0.5, 0.05)), + sizeEmbedding = c(2^(6:9)), + weightDecay = c(1e-6, 1e-3), + learningRate = c(1e-2, 3e-4, 1e-5), + seed = NULL, + hyperParamSearch = "random", + randomSample = 100, + device = "cpu", + batchSize = 1024, + epochs = 30) { + if (is.null(seed)) { + seed <- as.integer(sample(1e5, 1)) + } + + paramGrid <- list( + numLayers = numLayers, + sizeHidden = sizeHidden, + hiddenFactor = hiddenFactor, + residualDropout = residualDropout, + hiddenDropout = hiddenDropout, + sizeEmbedding = sizeEmbedding, + weightDecay = weightDecay, + learningRate = learningRate, + seed = list(as.integer(seed[[1]])) + ) + + param <- PatientLevelPrediction::listCartesian(paramGrid) + + if (hyperParamSearch == "random") { + param <- param[sample(length(param), randomSample)] + } + + attr(param, "settings") <- list( + seed = seed[1], + device = device, + batchSize = batchSize, + epochs = epochs, + name = "ResNet", + saveType = "file", + modelParamNames = c( + "numLayers", "sizeHidden", "hiddenFactor", + "residualDropout", "hiddenDropout", "sizeEmbedding" + ), + baseModel = "ResNet" + ) + + results <- list( + fitFunction = "fitEstimator", + param = param + ) + + class(results) <- "modelSettings" + + return(results) +} + +ResNet <- torch::nn_module( + name = "ResNet", + initialize = function(catFeatures, numFeatures = 0, sizeEmbedding, sizeHidden, numLayers, + hiddenFactor, activation = torch::nn_relu, + normalization = torch::nn_batch_norm1d, hiddenDropout = NULL, + residualDropout = NULL, d_out = 1) { + self$embedding <- torch::nn_embedding_bag( + num_embeddings = catFeatures + 1, + embedding_dim = sizeEmbedding, + padding_idx = 1 + ) + if (numFeatures != 0) { + self$numEmbedding <- numericalEmbedding(numFeatures, sizeEmbedding) + } + + self$first_layer <- torch::nn_linear(sizeEmbedding, sizeHidden) + + resHidden <- sizeHidden * hiddenFactor + + self$layers <- torch::nn_module_list(lapply( + 1:numLayers, + function(x) { + ResLayer( + sizeHidden, resHidden, + normalization, activation, + hiddenDropout, + residualDropout + ) + } + )) + self$lastNorm <- normalization(sizeHidden) + self$head <- torch::nn_linear(sizeHidden, d_out) + + self$lastAct <- activation() + }, + forward = function(x) { + x_cat <- x$cat + x_num <- x$num + x_cat <- self$embedding(x_cat + 1L) # padding_idx is 1 + if (!is.null(x_num)) { + x <- (x_cat + self$numEmbedding(x_num)$mean(dim = 2)) / 2 + # x <- torch::torch_cat(list(x_cat, x_num), dim = 2L) + } else { + x <- x_cat + } + x <- self$first_layer(x) + + for (i in 1:length(self$layers)) { + x <- self$layers[[i]](x) + } + x <- self$lastNorm(x) + x <- self$lastAct(x) + x <- self$head(x) + x <- x$squeeze(-1) + return(x) + } +) + +ResLayer <- torch::nn_module( + name = "ResLayer", + initialize = function(sizeHidden, resHidden, normalization, + activation, hiddenDropout = NULL, residualDropout = NULL) { + self$norm <- normalization(sizeHidden) + self$linear0 <- torch::nn_linear(sizeHidden, resHidden) + self$linear1 <- torch::nn_linear(resHidden, sizeHidden) + + if (!is.null(hiddenDropout)) { + self$hiddenDropout <-
torch::nn_dropout(p = hiddenDropout) + } + if (!is.null(residualDropout)) { + self$residualDropout <- torch::nn_dropout(p = residualDropout) + } + + self$activation <- activation() + }, + forward = function(x) { + z <- x + z <- self$norm(z) + z <- self$linear0(z) + z <- self$activation(z) + if (!is.null(self$hiddenDropout)) { + z <- self$hiddenDropout(z) + } + z <- self$linear1(z) + if (!is.null(self$residualDropout)) { + z <- self$residualDropout(z) + } + x <- z + x + return(x) + } +) diff --git a/R/SaveLoadPlp.R b/R/SaveLoadPlp.R deleted file mode 100644 index 4b7e6d6..0000000 --- a/R/SaveLoadPlp.R +++ /dev/null @@ -1,891 +0,0 @@ -# @file PlpSaveLoad.R -# -# Copyright 2020 Observational Health Data Sciences and Informatics -# -# This file is part of CohortMethod -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#' Get the patient level prediction data from the server -#' @description -#' This function executes a large set of SQL statements against the database in OMOP CDM format to -#' extract the data needed to perform the analysis. -#' -#' @details -#' Based on the arguments, the at risk cohort data is retrieved, as well as outcomes -#' occurring in these subjects. The at risk cohort is identified through -#' user-defined cohorts in a cohort table either inside the CDM instance or in a separate schema. -#' Similarly, outcomes are identified -#' through user-defined cohorts in a cohort table either inside the CDM instance or in a separate -#' schema. Covariates are automatically extracted from the appropriate tables within the CDM. -#' If you wish to exclude concepts from covariates you will need to -#' manually add the concept_ids and descendants to the \code{excludedCovariateConceptIds} of the -#' \code{covariateSettings} argument. -#' -#' @param connectionDetails An R object of type\cr\code{connectionDetails} created using the -#' function \code{createConnectionDetails} in the -#' \code{DatabaseConnector} package. -#' @param cdmDatabaseSchema The name of the database schema that contains the OMOP CDM -#' instance. Requires read permissions to this database. On SQL -#' Server, this should specifiy both the database and the schema, -#' so for example 'cdm_instance.dbo'. -#' @param oracleTempSchema For Oracle only: the name of the database schema where you want -#' all temporary tables to be managed. Requires create/insert -#' permissions to this database. -#' @param cohortId A unique identifier to define the at risk cohort. CohortId is -#' used to select the cohort_concept_id in the cohort-like table. -#' @param outcomeIds A list of cohort_definition_ids used to define outcomes (-999 mean no outcome gets downloaded). -#' @param studyStartDate A calendar date specifying the minimum date that a cohort index -#' date can appear. Date format is 'yyyymmdd'. -#' @param studyEndDate A calendar date specifying the maximum date that a cohort index -#' date can appear. Date format is 'yyyymmdd'. 
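# A minimal usage sketch for setResNet() defined above. The returned settings
# object is intended to be passed as modelSettings to
# PatientLevelPrediction::runPlp(); the plpData object and analysisId below are
# hypothetical placeholders (assumption: the standard PatientLevelPrediction
# workflow, which is not shown in this diff).
resnetSettings <- setResNet(
  numLayers = 2,
  sizeHidden = 128,
  hiddenFactor = 2,
  residualDropout = 0.1,
  hiddenDropout = 0.1,
  sizeEmbedding = 128,
  hyperParamSearch = "random",
  randomSample = 1,
  device = "cpu",
  batchSize = 512,
  epochs = 5
)
# plpResult <- PatientLevelPrediction::runPlp(
#   plpData = plpData,               # created elsewhere, e.g. with getPlpData()
#   modelSettings = resnetSettings,
#   analysisId = "ResNet_example"    # hypothetical id
# )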
Important: the study -#' end data is also used to truncate risk windows, meaning no outcomes -#' beyond the study end date will be considered. -#' @param cohortDatabaseSchema The name of the database schema that is the location where the -#' cohort data used to define the at risk cohort is available. -#' Requires read permissions to this database. -#' @param cohortTable The tablename that contains the at risk cohort. cohortTable has -#' format of COHORT table: cohort_concept_id, SUBJECT_ID, -#' COHORT_START_DATE, COHORT_END_DATE. -#' @param outcomeDatabaseSchema The name of the database schema that is the location where -#' the data used to define the outcome cohorts is available. -#' Requires read permissions to this database. -#' @param outcomeTable The tablename that contains the outcome cohorts. Expectation is -#' outcomeTable has format of COHORT table: -#' COHORT_DEFINITION_ID, SUBJECT_ID, COHORT_START_DATE, -#' COHORT_END_DATE. -#' @param cdmVersion Define the OMOP CDM version used: currently support "4", "5" and "6". -#' @param firstExposureOnly Should only the first exposure per subject be included? Note that -#' this is typically done in the \code{createStudyPopulation} function, -#' but can already be done here for efficiency reasons. -#' @param washoutPeriod The mininum required continuous observation time prior to index -#' date for a person to be included in the at risk cohort. Note that -#' this is typically done in the \code{createStudyPopulation} function, -#' but can already be done here for efficiency reasons. -#' @param sampleSize If not NULL, only this number of people will be sampled from the target population (Default NULL) -#' -#' @param covariateSettings An object of type \code{covariateSettings} as created using the -#' \code{createCovariateSettings} function in the -#' \code{FeatureExtraction} package. -#' @param excludeDrugsFromCovariates A redundant option -#' -#' @return -#' Returns an object of type \code{plpData}, containing information on the cohorts, their -#' outcomes, and baseline covariates. Information about multiple outcomes can be captured at once for -#' efficiency reasons. This object is a list with the following components: \describe{ -#' \item{outcomes}{A data frame listing the outcomes per person, including the time to event, and -#' the outcome id. Outcomes are not yet filtered based on risk window, since this is done at -#' a later stage.} \item{cohorts}{A data frame listing the persons in each cohort, listing their -#' exposure status as well as the time to the end of the observation period and time to the end of the -#' cohort (usually the end of the exposure era).} \item{covariates}{An ffdf object listing the -#' baseline covariates per person in the two cohorts. This is done using a sparse representation: -#' covariates with a value of 0 are omitted to save space.} \item{covariateRef}{An ffdf object describing the covariates that have been extracted.} -#' \item{metaData}{A list of objects with information on how the cohortMethodData object was -#' constructed.} } The generic \code{()} and \code{summary()} functions have been implemented for this object. 
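# A hedged sketch of a getPlpData() call as described above; the connection
# details, schema names, and cohort ids are placeholders, not values from this
# package.
connectionDetails <- DatabaseConnector::createConnectionDetails(
  dbms = "postgresql",
  server = "localhost/ohdsi",           # placeholder
  user = "user",
  password = "secret"
)
covariateSettings <- FeatureExtraction::createCovariateSettings(
  useDemographicsGender = TRUE,
  useDemographicsAgeGroup = TRUE,
  useConditionGroupEraLongTerm = TRUE
)
plpData <- getPlpData(
  connectionDetails = connectionDetails,
  cdmDatabaseSchema = "cdm",            # placeholder schema
  cohortDatabaseSchema = "results",     # placeholder schema
  cohortTable = "cohort",
  cohortId = 1,                         # at risk cohort definition id
  outcomeDatabaseSchema = "results",
  outcomeTable = "cohort",
  outcomeIds = 2,                       # outcome cohort definition id
  covariateSettings = covariateSettings
)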
-#' -#' @export -getPlpData <- function(connectionDetails, - cdmDatabaseSchema, - oracleTempSchema = cdmDatabaseSchema, - cohortId, - outcomeIds, - studyStartDate = "", - studyEndDate = "", - cohortDatabaseSchema = cdmDatabaseSchema, - cohortTable = "cohort", - outcomeDatabaseSchema = cdmDatabaseSchema, - outcomeTable = "cohort", - cdmVersion = "5", - firstExposureOnly = FALSE, - washoutPeriod = 0, - sampleSize = NULL, - covariateSettings, - excludeDrugsFromCovariates = FALSE) { - if (studyStartDate != "" && regexpr("^[12][0-9]{3}[01][0-9][0-3][0-9]$", studyStartDate) == -1) - stop("Study start date must have format YYYYMMDD") - if (studyEndDate != "" && regexpr("^[12][0-9]{3}[01][0-9][0-3][0-9]$", studyEndDate) == -1) - stop("Study end date must have format YYYYMMDD") - if(!is.null(sampleSize)){ - if(!class(sampleSize) %in% c('numeric', 'integer')) - stop("sampleSize must be numeric") - } - - if(is.null(cohortId)) - stop('User must input cohortId') - if(length(cohortId)>1) - stop('Currently only supports one cohortId at a time') - if(is.null(outcomeIds)) - stop('User must input outcomeIds') - #ToDo: add other checks the inputs are valid - - connection <- DatabaseConnector::connect(connectionDetails) - on.exit(DatabaseConnector::disconnect(connection)) - dbms <- connectionDetails$dbms - - writeLines("\nConstructing the at risk cohort") - if(!is.null(sampleSize)) writeLines(paste("\n Sampling ",sampleSize, " people")) - renderedSql <- SqlRender::loadRenderTranslateSql("CreateCohorts.sql", - packageName = "PatientLevelPrediction", - dbms = dbms, - oracleTempSchema = oracleTempSchema, - cdm_database_schema = cdmDatabaseSchema, - cohort_database_schema = cohortDatabaseSchema, - cohort_table = cohortTable, - cdm_version = cdmVersion, - cohort_id = cohortId, - study_start_date = studyStartDate, - study_end_date = studyEndDate, - first_only = firstExposureOnly, - washout_period = washoutPeriod, - use_sample = !is.null(sampleSize), - sample_number=sampleSize - ) - DatabaseConnector::executeSql(connection, renderedSql) - - writeLines("Fetching cohorts from server") - start <- Sys.time() - cohortSql <- SqlRender::loadRenderTranslateSql("GetCohorts.sql", - packageName = "PatientLevelPrediction", - dbms = dbms, - oracleTempSchema = oracleTempSchema, - cdm_version = cdmVersion) - cohorts <- DatabaseConnector::querySql(connection, cohortSql) - colnames(cohorts) <- SqlRender::snakeCaseToCamelCase(colnames(cohorts)) - metaData.cohort <- list(cohortId = cohortId, - studyStartDate = studyStartDate, - studyEndDate = studyEndDate) - - if(nrow(cohorts)==0) - stop('Target population is empty') - - delta <- Sys.time() - start - writeLines(paste("Loading cohorts took", signif(delta, 3), attr(delta, "units"))) - - #covariateSettings$useCovariateCohortIdIs1 <- TRUE - covariateData <- FeatureExtraction::getDbCovariateData(connection = connection, - oracleTempSchema = oracleTempSchema, - cdmDatabaseSchema = cdmDatabaseSchema, - cdmVersion = cdmVersion, - cohortTable = "#cohort_person", - cohortTableIsTemp = TRUE, - rowIdField = "row_id", - covariateSettings = covariateSettings) - # add indexes for tidyCov + covariate summary - Andromeda::createIndex(covariateData$covariates, c('rowId'), - indexName = 'covariates_rowId') - Andromeda::createIndex(covariateData$covariates, c('covariateId'), - indexName = 'covariates_covariateId') - Andromeda::createIndex(covariateData$covariates, c('covariateId', 'covariateValue'), - indexName = 'covariates_covariateId_value') - - if(max(outcomeIds)!=-999){ - writeLines("Fetching 
outcomes from server") - start <- Sys.time() - outcomeSql <- SqlRender::loadRenderTranslateSql("GetOutcomes.sql", - packageName = "PatientLevelPrediction", - dbms = dbms, - oracleTempSchema = oracleTempSchema, - cdm_database_schema = cdmDatabaseSchema, - outcome_database_schema = outcomeDatabaseSchema, - outcome_table = outcomeTable, - outcome_ids = outcomeIds, - cdm_version = cdmVersion) - outcomes <- DatabaseConnector::querySql(connection, outcomeSql) - colnames(outcomes) <- SqlRender::snakeCaseToCamelCase(colnames(outcomes)) - metaData.outcome <- data.frame(outcomeIds =outcomeIds) - attr(outcomes, "metaData") <- metaData.outcome - if(nrow(outcomes)==0) - stop('No Outcomes') - - metaData.cohort$attrition <- getCounts2(cohorts,outcomes, "Original cohorts") - attr(cohorts, "metaData") <- metaData.cohort - - delta <- Sys.time() - start - writeLines(paste("Loading outcomes took", signif(delta, 3), attr(delta, "units"))) - } else { - outcomes <- NULL - } - - - - - # Remove temp tables: - renderedSql <- SqlRender::loadRenderTranslateSql("RemoveCohortTempTables.sql", - packageName = "PatientLevelPrediction", - dbms = dbms, - oracleTempSchema = oracleTempSchema) - DatabaseConnector::executeSql(connection, renderedSql, progressBar = FALSE, reportOverallTime = FALSE) - #DatabaseConnector::disconnect(connection) - - metaData <- covariateData$metaData - metaData$call <- match.call() - metaData$call$connectionDetails = connectionDetails - metaData$call$connection = NULL - metaData$call$cdmDatabaseSchema = cdmDatabaseSchema - metaData$call$oracleTempSchema = oracleTempSchema - metaData$call$cohortId = cohortId - metaData$call$outcomeIds = outcomeIds - metaData$call$studyStartDate = studyStartDate - metaData$call$studyEndDate = studyEndDate - metaData$call$cohortDatabaseSchema = cohortDatabaseSchema - metaData$call$cohortTable = cohortTable - metaData$call$outcomeDatabaseSchema = outcomeDatabaseSchema - metaData$call$outcomeTable = outcomeTable - metaData$call$cdmVersion = cdmVersion - metaData$call$firstExposureOnly = firstExposureOnly - metaData$call$washoutPeriod = washoutPeriod - metaData$call$covariateSettings= covariateSettings - metaData$call$sampleSize = sampleSize - - # create the temporal settings (if temporal use) - timeReference <- NULL - if(!is.null(covariateSettings$temporal)){ - if(covariateSettings$temporal){ - # make sure time days populated - if(length(covariateSettings$temporalStartDays)>0){ - timeReference = data.frame(timeId=1:length(covariateSettings$temporalStartDays), - startDay = covariateSettings$temporalStartDays, - endDay = covariateSettings$temporalEndDays) - } - }} - - - result <- list(cohorts = cohorts, - outcomes = outcomes, - covariateData = covariateData, - timeRef = timeReference, - metaData = metaData) - - class(result) <- "plpData" - return(result) -} - - -#' Save the cohort data to folder -#' -#' @description -#' \code{savePlpData} saves an object of type plpData to folder. -#' -#' @param plpData An object of type \code{plpData} as generated using -#' \code{getPlpData}. -#' @param file The name of the folder where the data will be written. The folder should -#' not yet exist. -#' @param envir The environment for to evaluate variables when saving -#' @param overwrite Whether to force overwrite an existing file -#' @details -#' The data will be written to a set of files in the folder specified by the user. 
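# A small sketch of the save/load round trip described above; plpData is assumed
# to be an object created by getPlpData() and the folder is a placeholder.
folder <- file.path(tempdir(), "plpDataExample")   # placeholder location
savePlpData(plpData, file = folder)
plpDataReloaded <- loadPlpData(folder)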
-#' -#' @examples -#' # todo -#' -#' @export -savePlpData <- function(plpData, file, envir=NULL, overwrite=F) { - if (missing(plpData)) - stop("Must specify plpData") - if (missing(file)) - stop("Must specify file") - if (!class(plpData) %in% c("plpData","plpData.libsvm" )) - stop("Data not of class plpData") - if(dir.exists(file.path(file, "covariates"))){ - stop('Folder to save covariates already exists...') - } - - if(!dir.exists(file)){ - dir.create(file) - } - - # save the actual values in the metaData - # TODO - only do this if exists in parent or environ - if(is.null(plpData$metaData$call$sampleSize)){ # fixed a bug when sampleSize is NULL - plpData$metaData$call$sampleSize <- 'NULL' - } - for(i in 2:length(plpData$metaData$call)){ - if(!is.null(plpData$metaData$call[[i]])) - plpData$metaData$call[[i]] <- eval(plpData$metaData$call[[i]], envir = envir) - } - - #FeatureExtraction::saveCovariateData(covariateData = plpData$covariateData, file = file.path(file, "covariates")) - Andromeda::saveAndromeda(plpData$covariateData, file = file.path(file, "covariates"), maintainConnection = T) - saveRDS(plpData$timeRef, file = file.path(file, "timeRef.rds")) - saveRDS(plpData$cohorts, file = file.path(file, "cohorts.rds")) - saveRDS(plpData$outcomes, file = file.path(file, "outcomes.rds")) - saveRDS(plpData$metaData, file = file.path(file, "metaData.rds")) -} - -#' Load the cohort data from a folder -#' -#' @description -#' \code{loadPlpData} loads an object of type plpData from a folder in the file -#' system. -#' -#' @param file The name of the folder containing the data. -#' @param readOnly If true, the data is opened read only. -#' -#' @details -#' The data will be written to a set of files in the folder specified by the user. -#' -#' @return -#' An object of class plpData. -#' -#' @examples -#' # todo -#' -#' @export -loadPlpData <- function(file, readOnly = TRUE) { - if (!file.exists(file)) - stop(paste("Cannot find folder", file)) - if (!file.info(file)$isdir) - stop(paste("Not a folder", file)) - - result <- list(covariateData = FeatureExtraction::loadCovariateData(file = file.path(file, "covariates")), - timeRef = readRDS(file.path(file, "timeRef.rds")), - cohorts = readRDS(file.path(file, "cohorts.rds")), - outcomes = readRDS(file.path(file, "outcomes.rds")), - metaData = readRDS(file.path(file, "metaData.rds"))) - - class(result) <- "plpData" - - return(result) -} - - -#' @export -print.plpData <- function(x, ...) 
{ - writeLines("plpData object") - writeLines("") - writeLines(paste("At risk concept ID:", attr(x$cohorts, "metaData")$cohortId)) - writeLines(paste("Outcome concept ID(s):", paste(attr(x$outcomes, "metaData")$outcomeIds, collapse = ","))) -} - -#' @method summary plpData -#' @export -summary.plpData <- function(object,...){ - people <- length(unique(object$cohorts$subjectId)) - outcomeCounts <- data.frame(outcomeId = attr(object$outcomes, "metaData")$outcomeIds, - eventCount = 0, - personCount = 0) - for (i in 1:nrow(outcomeCounts)) { - outcomeCounts$eventCount[i] <- sum(object$outcomes$outcomeId == attr(object$outcomes, "metaData")$outcomeIds[i]) - outcomeCounts$personCount[i] <- length(unique(object$outcomes$rowId[object$outcomes$outcomeId == attr(object$outcomes, "metaData")$outcomeIds[i]])) - } - - covDetails <- FeatureExtraction::summary(object$covariateData) - result <- list(metaData = append(append(object$metaData, attr(object$cohorts, "metaData")), attr(object$outcomes, "metaData")), - people = people, - outcomeCounts = outcomeCounts, - covariateCount = covDetails$covariateCount, - covariateValueCount = covDetails$covariateValueCount) - class(result) <- "summary.plpData" - return(result) -} - -#' @export -print.summary.plpData <- function(x, ...) { - writeLines("plpData object summary") - writeLines("") - writeLines(paste("At risk cohort concept ID:", x$metaData$cohortId)) - writeLines(paste("Outcome concept ID(s):", x$metaData$outcomeIds, collapse = ",")) - writeLines("") - writeLines(paste("People:", paste(x$people))) - writeLines("") - writeLines("Outcome counts:") - outcomeCounts <- x$outcomeCounts - rownames(outcomeCounts) <- outcomeCounts$outcomeId - outcomeCounts$outcomeId <- NULL - colnames(outcomeCounts) <- c("Event count", "Person count") - stats::printCoefmat(outcomeCounts) - writeLines("") - writeLines("Covariates:") - writeLines(paste("Number of covariates:", x$covariateCount)) - writeLines(paste("Number of non-zero covariate values:", x$covariateValueCount)) -} - - -#' Saves the plp model -#' -#' @details -#' Saves the plp model to a user specificed folder -#' -#' @param plpModel A trained classifier returned by running \code{runPlp()$model} -#' @param dirPath A location to save the model to -#' -#' @export -savePlpModel <- function(plpModel, dirPath){ - if (missing(plpModel)) - stop("Must specify plpModel") - if (missing(dirPath)) - stop("Must specify directory path") - if (class(plpModel) != "plpModel") - stop("Not a plpModel") - - if(!dir.exists(dirPath)) dir.create(dirPath) - - - # If model is saved on hard drive move it... 
- #============================================================ - moveFile <- moveHdModel(plpModel, dirPath ) - if(!moveFile){ - ParallelLogger::logError('Moving model files error') - } - #============================================================ - - - # if deep (keras) then save hdfs - if(attr(plpModel, 'type')%in%c('deep', 'deepMulti','deepEnsemble')){ - - if(attr(plpModel, 'type')=='deepEnsemble'){ - tryCatch( - {#saveRDS(plpModel, file = file.path(dirPath, "deepEnsemble_model.rds")) - for (i in seq(plpModel$modelSettings$modelParameters$numberOfEnsembleNetwork)){ - model<-keras::serialize_model(plpModel$model[[i]], include_optimizer = TRUE) - keras::save_model_hdf5(model, filepath = file.path(dirPath, "keras_model",i)) - }},error=function(e) NULL - ) - } - if(attr(plpModel, 'type')=='deep'){ - keras::save_model_hdf5(plpModel$model, filepath = file.path(dirPath, "keras_model")) - } - if(attr(plpModel, 'type')=='deepMulti'){ - saveRDS(attr(plpModel, 'inputs'), file = file.path(dirPath, "inputs_attr.rds")) - } - } else if(attr(plpModel, 'type') == "xgboost"){ - # fixing xgboost save/load issue - xgboost::xgb.save(model = plpModel$model, fname = file.path(dirPath, "model")) - } else { - saveRDS(plpModel$model, file = file.path(dirPath, "model.rds")) - } - #saveRDS(plpModel$predict, file = file.path(dirPath, "transform.rds")) - saveRDS(NULL, file = file.path(dirPath, "transform.rds")) - saveRDS(plpModel$index, file = file.path(dirPath, "index.rds")) - saveRDS(plpModel$trainCVAuc, file = file.path(dirPath, "trainCVAuc.rds")) - saveRDS(plpModel$hyperParamSearch, file = file.path(dirPath, "hyperParamSearch.rds")) - saveRDS(plpModel$modelSettings, file = file.path(dirPath, "modelSettings.rds")) - saveRDS(plpModel$metaData, file = file.path(dirPath, "metaData.rds")) - saveRDS(plpModel$populationSettings, file = file.path(dirPath, "populationSettings.rds")) - saveRDS(plpModel$trainingTime, file = file.path(dirPath, "trainingTime.rds")) - saveRDS(plpModel$varImp, file = file.path(dirPath, "varImp.rds")) - saveRDS(plpModel$dense, file = file.path(dirPath, "dense.rds")) - saveRDS(plpModel$cohortId, file = file.path(dirPath, "cohortId.rds")) - saveRDS(plpModel$outcomeId, file = file.path(dirPath, "outcomeId.rds")) - saveRDS(plpModel$analysisId, file = file.path(dirPath, "analysisId.rds")) - #if(!is.null(plpModel$covariateMap)) - saveRDS(plpModel$covariateMap, file = file.path(dirPath, "covariateMap.rds")) - - attributes <- list(type=attr(plpModel, 'type'), predictionType=attr(plpModel, 'predictionType') ) - saveRDS(attributes, file = file.path(dirPath, "attributes.rds")) - - -} - -moveHdModel <- function(plpModel, dirPath ){ - #================================================================== - # if python then move pickle - #================================================================== - if(attr(plpModel, 'type') %in% c('pythonOld','pythonReticulate', 'pythonAuto') ){ - if(!dir.exists(file.path(dirPath,'python_model'))) - dir.create(file.path(dirPath,'python_model')) - for(file in dir(plpModel$model)){ #DOES THIS CORRECTLY TRANSFER AUTOENCODER BITS? 
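# For python-based models plpModel$model is a directory of files written by
# python (e.g. pickles) rather than an R object, so saving the model means
# copying every file in that directory into dirPath/python_model.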
- file.copy(file.path(plpModel$model,file), - file.path(dirPath,'python_model'), overwrite=TRUE, recursive = FALSE, - copy.mode = TRUE, copy.date = FALSE) - } - } - - #================================================================== - # if sagemaker then move pickle - #================================================================== - if(attr(plpModel, 'type') =='sagemaker'){ - if(!dir.exists(file.path(dirPath,'sagemaker_model'))) - dir.create(file.path(dirPath,'sagemaker_model')) - for(file in dir(plpModel$model$loc)){ - file.copy(file.path(plpModel$model$loc,file), - file.path(dirPath,'sagemaker_model'), overwrite=TRUE, recursive = FALSE, - copy.mode = TRUE, copy.date = FALSE) - } - } - - #================================================================== - # if knn then move model - #================================================================== - if(attr(plpModel, 'type') =='knn'){ - if(!dir.exists(file.path(dirPath,'knn_model'))) - dir.create(file.path(dirPath,'knn_model')) - for(file in dir(plpModel$model)){ - file.copy(file.path(plpModel$model,file), - file.path(dirPath,'knn_model'), overwrite=TRUE, recursive = FALSE, - copy.mode = TRUE, copy.date = FALSE) - } - } - - return(TRUE) -} - -#' loads the plp model -#' -#' @details -#' Loads a plp model that was saved using \code{savePlpModel()} -#' -#' @param dirPath The location of the model -#' -#' @export -loadPlpModel <- function(dirPath) { - if (!file.exists(dirPath)) - stop(paste("Cannot find folder", dirPath)) - if (!file.info(dirPath)$isdir) - stop(paste("Not a folder", dirPath)) - - hyperParamSearch <- tryCatch(readRDS(file.path(dirPath, "hyperParamSearch.rds")), - error=function(e) NULL) - # add in these as they got dropped - outcomeId <- tryCatch(readRDS(file.path(dirPath, "outcomeId.rds")), - error=function(e) NULL) - cohortId <- tryCatch(readRDS(file.path(dirPath, "cohortId.rds")), - error=function(e) NULL) - dense <- tryCatch(readRDS(file.path(dirPath, "dense.rds")), - error=function(e) NULL) - covariateMap <- tryCatch(readRDS(file.path(dirPath, "covariateMap.rds")), - error=function(e) NULL) - analysisId <- tryCatch(readRDS(file.path(dirPath, "analysisId.rds")), - error=function(e) NULL) - - if(file.exists(file.path(dirPath, "keras_model"))){ - ensure_installed("keras") - model <- keras::load_model_hdf5(file.path(dirPath, "keras_model")) - } else if(readRDS(file.path(dirPath, "attributes.rds"))$type == "xgboost"){ - ensure_installed("xgboost") - # fixing xgboost save/load issue - model <- xgboost::xgb.load(file.path(dirPath, "model")) - } else { - model <- readRDS(file.path(dirPath, "model.rds")) - } - - result <- list(model = model, - modelSettings = readRDS(file.path(dirPath, "modelSettings.rds")), - hyperParamSearch = hyperParamSearch, - trainCVAuc = readRDS(file.path(dirPath, "trainCVAuc.rds")), - metaData = readRDS(file.path(dirPath, "metaData.rds")), - populationSettings= readRDS(file.path(dirPath, "populationSettings.rds")), - outcomeId = outcomeId, - cohortId = cohortId, - varImp = readRDS(file.path(dirPath, "varImp.rds")), - trainingTime = readRDS(file.path(dirPath, "trainingTime.rds")), - covariateMap =covariateMap, - predict = readRDS(file.path(dirPath, "transform.rds")), - index = readRDS(file.path(dirPath, "index.rds")), - dense = dense, - analysisId = analysisId) - - #attributes <- readRDS(file.path(dirPath, "attributes.rds")) - attributes <- readRDS(file.path(dirPath, "attributes.rds")) - attr(result, 'type') <- attributes$type - attr(result, 'predictionType') <- attributes$predictionType - 
class(result) <- "plpModel" - - # update the model location to the load dirPath - result <- updateModelLocation(result, dirPath) - - # make this backwrds compatible for ffdf: - result$predict <- createTransform(result) - - return(result) -} - -updateModelLocation <- function(plpModel, dirPath){ - type <- attr(plpModel, 'type') - # if python update the location - if( type %in% c('pythonOld','pythonReticulate', 'pythonAuto')){ - plpModel$model <- file.path(dirPath,'python_model') - plpModel$predict <- createTransform(plpModel) - } - if( type =='sagemaker'){ - plpModel$model$loc <- file.path(dirPath,'sagemaker_model') - plpModel$predict <- createTransform(plpModel) - } - # if knn update the locaiton - TODO !!!!!!!!!!!!!! - if( type =='knn'){ - plpModel$model <- file.path(dirPath,'knn_model') - plpModel$predict <- createTransform(plpModel) - } - if( type =='deep' ){ - plpModel$predict <- createTransform(plpModel) - } - if( type =='deepEnsemble' ){ - plpModel$predict <- createTransform(plpModel) - } - if( type =='deepMulti'){ - attr(plpModel, 'inputs') <- tryCatch(readRDS(file.path(dirPath, "inputs_attr.rds")), - error=function(e) NULL) - plpModel$predict <- createTransform(plpModel) - - } - - return(plpModel) -} - - -#' Saves the prediction dataframe to RDS -#' -#' @details -#' Saves the prediction data frame returned by predict.R to an RDS file and returns the fileLocation where the prediction is saved -#' -#' @param prediction The prediciton data.frame -#' @param dirPath The directory to save the prediction RDS -#' @param fileName The name of the RDS file that will be saved in dirPath -#' -#' @export -savePrediction <- function(prediction, dirPath, fileName='prediction.rds'){ - #TODO check inupts - saveRDS(prediction, file=file.path(dirPath,fileName)) - - return(file.path(dirPath,fileName)) -} - -#' Loads the prediciton dataframe to csv -#' -#' @details -#' Loads the prediciton RDS file -#' -#' @param fileLocation The location with the saved prediction -#' -#' @export -loadPrediction <- function(fileLocation){ - #TODO check inupts - prediction <- readRDS(file=fileLocation) - return(prediction) -} - -#' Saves the result from runPlp into the location directory -#' -#' @details -#' Saves the result from runPlp into the location directory -#' -#' @param result The result of running runPlp() -#' @param dirPath The directory to save the csv -#' -#' @export -savePlpResult <- function(result, dirPath){ - if (missing(result)) - stop("Must specify runPlp output") - if (missing(dirPath)) - stop("Must specify directory location") - #if (class(plpModel) != "plpModel") - # stop("Not a plpModel") - - if(!dir.exists(dirPath)) dir.create(dirPath, recursive = T) - - savePlpModel(result$model, dirPath=file.path(dirPath,'model') ) - saveRDS(result$analysisRef, file = file.path(dirPath, "analysisRef.rds")) - saveRDS(result$inputSetting, file = file.path(dirPath, "inputSetting.rds")) - saveRDS(result$executionSummary, file = file.path(dirPath, "executionSummary.rds")) - saveRDS(result$prediction, file = file.path(dirPath, "prediction.rds")) - saveRDS(result$performanceEvaluation, file = file.path(dirPath, "performanceEvaluation.rds")) - #saveRDS(result$performanceEvaluationTrain, file = file.path(dirPath, "performanceEvaluationTrain.rds")) - saveRDS(result$covariateSummary, file = file.path(dirPath, "covariateSummary.rds")) - - -} - -#' Loads the evalaution dataframe -#' -#' @details -#' Loads the evaluation -#' -#' @param dirPath The directory where the evaluation was saved -#' -#' @export -loadPlpResult <- 
function(dirPath){ - if (!file.exists(dirPath)) - stop(paste("Cannot find folder", dirPath)) - if (!file.info(dirPath)$isdir) - stop(paste("Not a folder", dirPath)) - - - result <- list(model = loadPlpModel(file.path(dirPath, "model")), - analysisRef = readRDS(file.path(dirPath, "analysisRef.rds")), - inputSetting = readRDS(file.path(dirPath, "inputSetting.rds")), - executionSummary = readRDS(file.path(dirPath, "executionSummary.rds")), - prediction = readRDS(file.path(dirPath, "prediction.rds")), - performanceEvaluation = readRDS(file.path(dirPath, "performanceEvaluation.rds")), - #performanceEvaluationTrain= readRDS(file.path(dirPath, "performanceEvaluationTrain.rds")), - covariateSummary = readRDS(file.path(dirPath, "covariateSummary.rds")) - ) - class(result) <- "runPlp" - - return(result) - -} - - -#result$inputSetting$dataExtrractionSettings$covariateSettings -formatCovariateSettings <- function(covariateSettings){ - - if(class(covariateSettings) == "covariateSettings"){ - return(list(cvs = data.frame(X = names(unlist(covariateSettings)), x= unlist(covariateSettings)), - fun = attr(covariateSettings,'fun'))) - - } else{ - return(list(cvs = do.call(rbind, lapply(1:length(covariateSettings), function(i){ - inds <- which(lapply(covariateSettings[[i]], class) == "function") - if(length(inds)>0){ - for(j in inds){ - covariateSettings[[i]][[j]] <- paste0(deparse(covariateSettings[[i]][[j]]), collapse = " ") - } - } - tempResult <- data.frame(names = names(unlist(covariateSettings[[i]])), - values = unlist(covariateSettings[[i]])) - tempResult$settingsId <- i - return(tempResult) - })), - fun = unlist(lapply(covariateSettings, function(x) attr(x,'fun'))) - ) - ) - } - -} - -reformatCovariateSettings <- function(covariateSettingsLocation){ - # adding this to stop warnings when files does not exist - if(!file.exists(covariateSettingsLocation)){ - return(NULL) - } - cs <- utils::read.csv(covariateSettingsLocation, stringsAsFactors=FALSE) - fun <- utils::read.csv(gsub('.csv','_fun.csv',covariateSettingsLocation), stringsAsFactors=FALSE) - - if(sum(colnames(cs)%in%c('X','x'))==2){ - covariateSettings <- cs$x - covariateSettings <- as.list(covariateSettings) - names(covariateSettings) <- cs$X - attr(covariateSettings,'fun') <- fun$x - class(covariateSettings) <- 'covariateSettings' - } else { - - covariateSettings <- list() - length(covariateSettings) <- max(cs$settingsId) - - for(i in 1:max(cs$settingsId)){ - covariateSettings[[i]] <- cs$values[cs$settingsId==i] - covariateSettings[[i]] <- as.list(covariateSettings[[i]]) - names(covariateSettings[[i]]) <- cs$names[cs$settingsId==i] - attr(covariateSettings[[i]],'fun') <- fun$x[i] - } - - } - -return(covariateSettings) -} - - -#' Save parts of the plp result as a csv for transparent sharing -#' -#' @details -#' Saves the main results as a csv (these files can be read by the shiny app) -#' -#' @param result An object of class runPlp with development or validation results -#' @param dirPath The directory the save the results as csv files -#' -#' @export -savePlpToCsv <- function(result, dirPath){ - - #inputSetting - if(!dir.exists(file.path(dirPath, 'inputSetting'))){dir.create(file.path(dirPath, 'inputSetting'), recursive = T)} - utils::write.csv(result$inputSetting$modelSettings$model, file = file.path(dirPath, 'inputSetting','modelSettings_model.csv'), row.names = F) - - if(!is.null(result$inputSetting$modelSettings$param)){ - utils::write.csv(as.data.frame(t(unlist(result$inputSetting$modelSettings$param))), file = file.path(dirPath, 
'inputSetting','modelSettings_param.csv'), row.names = F) - }else{ - utils::write.csv(NULL, file = file.path(dirPath, 'inputSetting','modelSettings_param.csv'), row.names = F) - } - utils::write.csv(result$inputSetting$modelSettings$name, file = file.path(dirPath, 'inputSetting','modelSettings_name.csv'), row.names = F) - if(!is.null(result$inputSetting$dataExtrractionSettings$covariateSettings)){ - utils::write.csv(formatCovariateSettings(result$inputSetting$dataExtrractionSettings$covariateSettings)$cvs, file = file.path(dirPath, 'inputSetting','dataExtrractionSettings_covariateSettings.csv'), row.names = F) - utils::write.csv(formatCovariateSettings(result$inputSetting$dataExtrractionSettings$covariateSettings)$fun, file = file.path(dirPath, 'inputSetting','dataExtrractionSettings_covariateSettings_fun.csv'), row.names = F) - } - utils::write.csv(result$inputSetting$populationSettings$attrition, file = file.path(dirPath, 'inputSetting','populationSettings_attrition.csv'), row.names = F) - result$inputSetting$populationSettings$attrition <- NULL - utils::write.csv(result$inputSetting$populationSettings, file = file.path(dirPath, 'inputSetting','populationSettings.csv'), row.names = F) - - #executionSummary - if(!dir.exists(file.path(dirPath, 'executionSummary'))){dir.create(file.path(dirPath, 'executionSummary'), recursive = T)} - utils::write.csv(result$executionSummary$PackageVersion, file = file.path(dirPath, 'executionSummary','PackageVersion.csv'), row.names = F) - utils::write.csv(unlist(result$executionSummary$PlatformDetails), file = file.path(dirPath, 'executionSummary','PlatformDetails.csv')) - utils::write.csv(result$executionSummary$TotalExecutionElapsedTime, file = file.path(dirPath, 'executionSummary','TotalExecutionElapsedTime.csv'), row.names = F) - utils::write.csv(result$executionSummary$ExecutionDateTime, file = file.path(dirPath, 'executionSummary','ExecutionDateTime.csv'), row.names = F) - - #performanceEvaluation - if(!dir.exists(file.path(dirPath, 'performanceEvaluation'))){dir.create(file.path(dirPath, 'performanceEvaluation'), recursive = T)} - utils::write.csv(result$performanceEvaluation$evaluationStatistics, file = file.path(dirPath, 'performanceEvaluation','evaluationStatistics.csv'), row.names = F) - utils::write.csv(result$performanceEvaluation$thresholdSummary, file = file.path(dirPath, 'performanceEvaluation','thresholdSummary.csv'), row.names = F) - utils::write.csv(result$performanceEvaluation$demographicSummary, file = file.path(dirPath, 'performanceEvaluation','demographicSummary.csv'), row.names = F) - utils::write.csv(result$performanceEvaluation$calibrationSummary, file = file.path(dirPath, 'performanceEvaluation','calibrationSummary.csv'), row.names = F) - utils::write.csv(result$performanceEvaluation$predictionDistribution, file = file.path(dirPath, 'performanceEvaluation','predictionDistribution.csv'), row.names = F) - - #covariateSummary - utils::write.csv(result$covariateSummary, file = file.path(dirPath,'covariateSummary.csv'), row.names = F) -} - -#' Loads parts of the plp result saved as csv files for transparent sharing -#' -#' @details -#' Load the main results from csv files into a runPlp object -#' -#' @param dirPath The directory with the results as csv files -#' -#' @export -loadPlpFromCsv <- function(dirPath){ - - result <- list() - objects <- gsub('.csv','',dir(dirPath)) - if(sum(!c('covariateSummary','executionSummary','inputSetting','performanceEvaluation')%in%objects)>0){ - stop('Incorrect csv results file') - } - - 
length(result) <- length(objects) - names(result) <- objects - - #covariateSummary - result$covariateSummary <- utils::read.csv(file = file.path(dirPath,'covariateSummary.csv')) - - #executionSummary - result$executionSummary <- list() - result$executionSummary$PackageVersion <- tryCatch({as.list(utils::read.csv(file = file.path(dirPath, 'executionSummary','PackageVersion.csv')))}, error = function(e){return(NULL)}) - result$executionSummary$PlatformDetails <- tryCatch({as.list(utils::read.csv(file = file.path(dirPath, 'executionSummary','PlatformDetails.csv'))$x)}, error = function(e){return(NULL)}) - names(result$executionSummary$PlatformDetails) <- tryCatch({utils::read.csv(file = file.path(dirPath, 'executionSummary','PlatformDetails.csv'))$X}, error = function(e){return(NULL)}) - result$executionSummary$TotalExecutionElapsedTime <- tryCatch({utils::read.csv(file = file.path(dirPath, 'executionSummary','TotalExecutionElapsedTime.csv'))$x}, error = function(e){return(NULL)}) - result$executionSummary$ExecutionDateTime <- tryCatch({utils::read.csv(file = file.path(dirPath, 'executionSummary','ExecutionDateTime.csv'))$x}, error = function(e){return(NULL)}) - - #inputSetting - result$inputSetting <- list() - result$inputSetting$modelSettings$model <- tryCatch({utils::read.csv(file = file.path(dirPath, 'inputSetting','modelSettings_model.csv'))$x}, error = function(e){return(NULL)}) - result$inputSetting$modelSettings$param <- tryCatch({as.list(utils::read.csv(file = file.path(dirPath, 'inputSetting','modelSettings_param.csv')))}, error = function(e){return(NULL)}) - result$inputSetting$modelSettings$name <- tryCatch({utils::read.csv(file = file.path(dirPath, 'inputSetting','modelSettings_name.csv'))$x}, error = function(e){return(NULL)}) - - result$inputSetting$dataExtrractionSettings$covariateSettings <- tryCatch({reformatCovariateSettings(file.path(dirPath, 'inputSetting','dataExtrractionSettings_covariateSettings.csv'))}, error = function(e){return(NULL)}) - - result$inputSetting$populationSettings <- tryCatch({as.list(utils::read.csv(file = file.path(dirPath, 'inputSetting','populationSettings.csv')))}, error = function(e){return(NULL)}) - result$inputSetting$populationSettings$attrition <- tryCatch({utils::read.csv(file = file.path(dirPath, 'inputSetting','populationSettings_attrition.csv'))}, error = function(e){return(NULL)}) - - #performanceEvaluation - result$performanceEvaluation <- list() - result$performanceEvaluation$evaluationStatistics <- tryCatch({utils::read.csv(file = file.path(dirPath, 'performanceEvaluation','evaluationStatistics.csv'))}, error = function(e){return(NULL)}) - result$performanceEvaluation$thresholdSummary <- tryCatch({utils::read.csv(file = file.path(dirPath, 'performanceEvaluation','thresholdSummary.csv'))}, error = function(e){return(NULL)}) - result$performanceEvaluation$demographicSummary <- tryCatch({utils::read.csv(file = file.path(dirPath, 'performanceEvaluation','demographicSummary.csv'))}, error = function(e){return(NULL)}) - result$performanceEvaluation$calibrationSummary <- tryCatch({utils::read.csv(file = file.path(dirPath, 'performanceEvaluation','calibrationSummary.csv'))}, error = function(e){return(NULL)}) - result$performanceEvaluation$predictionDistribution <- tryCatch({utils::read.csv(file = file.path(dirPath, 'performanceEvaluation','predictionDistribution.csv'))}, error = function(e){return(NULL)}) - - result$model$modelSettings <- result$inputSetting$modelSettings - result$model$populationSettings <- 
result$inputSetting$populationSettings - result$model$metaData$call$covariateSettings <- result$inputSetting$dataExtrractionSettings$covariateSettings - - class(result) <- "runPlp" - return(result) -} diff --git a/R/Transformer.R b/R/Transformer.R new file mode 100644 index 0000000..0ab7227 --- /dev/null +++ b/R/Transformer.R @@ -0,0 +1,303 @@ +#' create settings for training a non-temporal transformer +#' +#' @description A transformer model +#' @details from https://arxiv.org/abs/2106.11959 +#' +#' @param numBlocks number of transformer blocks +#' @param dimToken dimension of each token (embedding size) +#' @param dimOut dimension of output, usually 1 for binary problems +#' @param numHeads number of attention heads +#' @param attDropout dropout to use on attentions +#' @param ffnDropout dropout to use in feedforward block +#' @param resDropout dropout to use in residual connections +#' @param dimHidden dimension of the feedforward block +#' @param weightDecay weight decay to use +#' @param learningRate learning rate to use +#' @param batchSize batch size to use +#' @param epochs How many epochs to run the model for +#' @param device Which device to use, cpu or cuda +#' @param hyperParamSearch what kind of hyperparameter search to do, default 'random' +#' @param randomSamples How many samples to use in hyperparameter search if random +#' @param seed Random seed to use +#' +#' @export +setTransformer <- function(numBlocks = 3, dimToken = 96, dimOut = 1, + numHeads = 8, attDropout = 0.25, ffnDropout = 0.25, + resDropout = 0, dimHidden = 512, weightDecay = 1e-6, + learningRate = 3e-4, batchSize = 1024, + epochs = 10, device = "cpu", hyperParamSearch = "random", + randomSamples = 100, seed = NULL) { + if (is.null(seed)) { + seed <- as.integer(sample(1e5, 1)) + } + + if (dimToken %% numHeads != 0) { + stop(paste( + "dimToken needs to be divisible by numHeads. 
dimToken =", dimToken, + "is not divisible by numHeads =", numHeads + )) + } + + paramGrid <- list( + numBlocks = numBlocks, + dimToken = dimToken, + dimOut = dimOut, + numHeads = numHeads, + dimHidden = dimHidden, + attDropout = attDropout, + ffnDropout = ffnDropout, + resDropout = resDropout, + weightDecay = weightDecay, + learningRate = learningRate, + seed = list(as.integer(seed[[1]])) + ) + + param <- PatientLevelPrediction::listCartesian(paramGrid) + + if (hyperParamSearch == "random") { + param <- param[sample(length(param), randomSamples)] + } + + attr(param, "settings") <- list( + seed = seed[1], + device = device, + batchSize = batchSize, + epochs = epochs, + name = "Transformer", + saveType = "file", + modelParamNames = c( + "numBlocks", "dimToken", "dimOut", "numHeads", + "attDropout", "ffnDropout", "resDropout", "dimHidden" + ), + baseModel = "Transformer" + ) + + results <- list( + fitFunction = "fitEstimator", + param = param + ) + + class(results) <- "modelSettings" + return(results) +} + + +Transformer <- torch::nn_module( + name = "Transformer", + initialize = function(catFeatures, numFeatures, numBlocks, dimToken, dimOut = 1, + numHeads, attDropout, ffnDropout, resDropout, + headActivation = torch::nn_relu, + activation = NULL, + ffnNorm = torch::nn_layer_norm, + headNorm = torch::nn_layer_norm, + attNorm = torch::nn_layer_norm, + dimHidden) { + activation <- nn_reglu + self$Categoricalembedding <- Embedding(catFeatures + 1, dimToken) # + 1 for padding idx + self$numericalEmbedding <- numericalEmbedding(numFeatures, dimToken) + self$classToken <- ClassToken(dimToken) + + self$layers <- torch::nn_module_list(lapply( + 1:numBlocks, + function(x) { + layer <- torch::nn_module_list() + layer$add_module("attention", torch::nn_multihead_attention(dimToken, numHeads, + dropout = attDropout, + bias = TRUE + )) + layer$add_module("ffn", FeedForwardBlock(dimToken, dimHidden, + biasFirst = TRUE, + biasSecond = TRUE, + dropout = ffnDropout, + activation = activation + )) + layer$add_module("attentionResDropout", torch::nn_dropout(resDropout)) + layer$add_module("ffnResDropout", torch::nn_dropout(resDropout)) + layer$add_module("ffnNorm", ffnNorm(dimToken)) + + if (x != 1) { + layer$add_module("attentionNorm", attNorm(dimToken)) + } + return(layer) + } + )) + self$head <- Head(dimToken, + bias = TRUE, activation = headActivation, + headNorm, dimOut + ) + }, + forward = function(x) { + mask <- torch::torch_where(x$cat == 0, TRUE, FALSE) + input <- x + cat <- self$Categoricalembedding(x$cat) + if (!is.null(input$num)) { + num <- self$numericalEmbedding(input$num) + x <- torch::torch_cat(list(cat, num), dim = 2L) + mask <- torch::torch_cat(list(mask, torch::torch_zeros(c( + x$shape[1], + num$shape[2] + ), + device = mask$device, + dtype = mask$dtype + )), + dim = 2L + ) + } else { + x <- cat + } + x <- self$classToken(x) + mask <- torch::torch_cat(list(mask, torch::torch_zeros(c(x$shape[1], 1), + device = mask$device, + dtype = mask$dtype + )), + dim = 2L + ) + for (i in 1:length(self$layers)) { + layer <- self$layers[[i]] + xResidual <- self$startResidual(layer, "attention", x) + + if (i == length(self$layers)) { + dims <- xResidual$shape + # in final layer take only attention on CLS token + xResidual <- layer$attention( + xResidual[, -1]$view(c(dims[1], 1, dims[3]))$transpose(1, 2), + xResidual$transpose(1, 2), + xResidual$transpose(1, 2), mask + ) + attnWeights <- xResidual[[2]] + xResidual <- xResidual[[1]] + x <- x[, -1]$view(c(dims[1], 1, dims[3])) + } else { + # attention input 
is seq_length x batch_size x embedding_dim + xResidual <- layer$attention( + xResidual$transpose(1, 2), + xResidual$transpose(1, 2), + xResidual$transpose(1, 2), + mask, + )[[1]] + } + x <- self$endResidual(layer, "attention", x, xResidual$transpose(1, 2)) + + xResidual <- self$startResidual(layer, "ffn", x) + xResidual <- layer$ffn(xResidual) + x <- self$endResidual(layer, "ffn", x, xResidual) + } + x <- self$head(x)[, 1] # remove singleton dimension + return(x) + }, + startResidual = function(layer, stage, x) { + xResidual <- x + normKey <- paste0(stage, "Norm") + if (normKey %in% names(as.list(layer))) { + xResidual <- layer[[normKey]](xResidual) + } + return(xResidual) + }, + endResidual = function(layer, stage, x, xResidual) { + dropoutKey <- paste0(stage, "ResDropout") + xResidual <- layer[[dropoutKey]](xResidual) + x <- x + xResidual + return(x) + } +) + + +FeedForwardBlock <- torch::nn_module( + name = "FeedForwardBlock", + initialize = function(dimToken, dimHidden, biasFirst, biasSecond, + dropout, activation) { + self$linearFirst <- torch::nn_linear(dimToken, dimHidden * 2, biasFirst) + self$activation <- activation() + self$dropout <- torch::nn_dropout(dropout) + self$linearSecond <- torch::nn_linear(dimHidden, dimToken, biasSecond) + }, + forward = function(x) { + x <- self$linearFirst(x) + x <- self$activation(x) + x <- self$dropout(x) + x <- self$linearSecond(x) + return(x) + } +) + +Head <- torch::nn_module( + name = "Head", + initialize = function(dimIn, bias, activation, normalization, dimOut) { + self$normalization <- normalization(dimIn) + self$activation <- activation() + self$linear <- torch::nn_linear(dimIn, dimOut, bias) + }, + forward = function(x) { + x <- x[, -1] # ? + x <- self$normalization(x) + x <- self$activation(x) + x <- self$linear(x) + return(x) + } +) + +Embedding <- torch::nn_module( + name = "Embedding", + initialize = function(numEmbeddings, embeddingDim) { + self$embedding <- torch::nn_embedding(numEmbeddings, embeddingDim, padding_idx = 1) + }, + forward = function(x_cat) { + x <- self$embedding(x_cat + 1L) # padding idx is 1L + return(x) + } +) + +numericalEmbedding <- torch::nn_module( + name = "numericalEmbedding", + initialize = function(numEmbeddings, embeddingDim, bias = TRUE) { + self$weight <- torch::nn_parameter(torch::torch_empty(numEmbeddings, embeddingDim)) + if (bias) { + self$bias <- torch::nn_parameter(torch::torch_empty(numEmbeddings, embeddingDim)) + } else { + self$bias <- NULL + } + + for (parameter in list(self$weight, self$bias)) { + if (!is.null(parameter)) { + torch::nn_init_kaiming_uniform_(parameter, a = sqrt(5)) + } + } + }, + forward = function(x) { + x <- self$weight$unsqueeze(1) * x$unsqueeze(-1) + if (!is.null(self$bias)) { + x <- x + self$bias$unsqueeze(1) + } + return(x) + } +) + +# adds a class token embedding to embeddings +ClassToken <- torch::nn_module( + name = "ClassToken", + initialize = function(dimToken) { + self$weight <- torch::nn_parameter(torch::torch_empty(dimToken, 1)) + torch::nn_init_kaiming_uniform_(self$weight, a = sqrt(5)) + }, + expand = function(dims) { + newDims <- vector("integer", length(dims) - 1) + 1 + return(self$weight$view(c(newDims, -1))$expand(c(dims, -1))) + }, + forward = function(x) { + return(torch::torch_cat(c(x, self$expand(c(dim(x)[[1]], 1))), dim = 2)) + } +) + +nn_reglu <- torch::nn_module( + name = "ReGlu", + forward = function(x) { + return(reglu(x)) + } +) + + +reglu <- function(x) { + chunks <- x$chunk(2, dim = -1) + + return(chunks[[1]] * torch::nnf_relu(chunks[[2]])) +} 
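To make the ReGLU activation defined above concrete, here is a small illustrative check (assuming the torch R package is installed; this snippet is not part of the package code):

# reglu() splits the last dimension in half and gates the first half
# with the ReLU of the second half, so the output is half as wide.
x <- torch::torch_randn(2, 4)
out <- reglu(x)   # reglu() as defined in Transformer.R above
out$shape         # expected: 2 x 2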
diff --git a/README.md b/README.md index 21f8d67..ad0e3c6 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ DeepPatientLevelPrediction ====================== -[![Build Status](https://github.com/OHDSI/DeepPatientLevelPrediction/workflows/R-CMD-check/badge.svg)](https://github.com/OHDSI/DeepPatientLevelPrediction/actions?query=workflow%3AR-CMD-check) -[![codecov.io](https://codecov.io/github/OHDSI/DeepPatientLevelPrediction/coverage.svg?branch=master)](https://codecov.io/github/OHDSI/DeepPatientLevelPrediction?branch=master) +[![Build Status](https://github.com/OHDSI/DeepPatientLevelPrediction/workflows/R-CMD-check/badge.svg)](https://github.com/OHDSI/DeepPatientLevelPrediction/actions?query=workflow%3AR-CMD-check?branch=develop) +[![codecov.io](https://codecov.io/github/OHDSI/DeepPatientLevelPrediction/coverage.svg?branch=master)](https://codecov.io/github/OHDSI/DeepPatientLevelPrediction?branch=main) Introduction @@ -15,25 +15,27 @@ Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. [Design and implementati Features ======== -- add - +- Adds deep learning models to use in the OHDSI PatientLevelPrediction framework. +- Allows to add custom deep learning models. +- Includes an MLP, ResNet and a Transformer +- Allows to use all the features of [PatientLevelPrediction](https://github.com/OHDSI/PatientLevelPrediction/) to validate and explore your model performance. Technology ========== -DeepPatientLevelPrediction is an R package, with some functions implemented in C++ and python. +DeepPatientLevelPrediction is an R package. It uses [torch in R](https://torch.mlverse.org/) to build deep learning models without using python. System Requirements =================== -Requires R (version 3.3.0 or higher). Installation on Windows requires [RTools](http://cran.r-project.org/bin/windows/Rtools/). Libraries used in DeepPatientLevelPrediction require Java and Python. +Requires R (version 3.5.0 or higher). Installation on Windows requires [RTools](http://cran.r-project.org/bin/windows/Rtools/). For training deep learning models in most cases an nvidia GPU is required using either Windows or Linux. -The python installation is required for some of the machine learning algorithms. We advise to -install Python 3.7 using Anaconda (https://www.continuum.io/downloads). Getting Started =============== -- add +- To install the package please read the [Package installation guide]() +- Please read the main vignette for the package: +[Building Deep Learning Models](https://ohdsi.github.io/DeepPatientLevelPrediction/articles/BuildingDeepModels.html) User Documentation ================== diff --git a/_pkgdown.yml b/_pkgdown.yml new file mode 100644 index 0000000..a552c95 --- /dev/null +++ b/_pkgdown.yml @@ -0,0 +1,33 @@ +template: + params: + bootswatch: cosmo + +home: + links: + - text: Ask a question + href: http://forums.ohdsi.org + +navbar: + structure: + left: + - home + - intro + - reference + - articles + right: [hades, github] + components: + home: + icon: fa-home fa-lg + href: index.html + reference: + text: Reference + href: reference/index.html + intro: + text: Get started + href: articles/Installing.html + github: + icon: fa-github fa-lg + href: https://github.com/OHDSI/DeepPatientLevelPrediction + hades: + text: hadesLogo + href: https://ohdsi.github.io/Hades diff --git a/docs/404.html b/docs/404.html new file mode 100644 index 0000000..491064f --- /dev/null +++ b/docs/404.html @@ -0,0 +1,127 @@ + + + + + + + +Page not found (404) • DeepPatientLevelPrediction + + + + + + + + + + + +
+
+ + + + +
+
+ + +Content not found. Please use links in the navbar. + +
+ + + +
+ + + +
+ +
+

+

Site built with pkgdown 2.0.5.

+
+ +
+
+ + + + + + + + diff --git a/docs/articles/BuildingDeepModels.html b/docs/articles/BuildingDeepModels.html new file mode 100644 index 0000000..70c805a --- /dev/null +++ b/docs/articles/BuildingDeepModels.html @@ -0,0 +1,378 @@ + + + + + + + +Building Deep Learning Models • DeepPatientLevelPrediction + + + + + + + + + + + + +
+
+ + + + +
+
+ + + + + +
+

Introduction +

+
+

DeepPatientLevelPrediction +

+

Patient level prediction aims to use historic data to learn a function between an input (a patient’s features such as age/gender/comorbidities at index) and an output (whether the patient experienced an outcome during some time-at-risk). Deep learning is an example of the current state-of-the-art classifiers that can be implemented to learn the function between inputs and outputs.

+

Deep Learning models are widely used to automatically learn high-level feature representations from the data, and have achieved remarkable results in image processing, speech recognition and computational biology. Recently, interesting results have been shown using large observational healthcare data (e.g., electronic healthcare data or claims data), but more extensive research is needed to assess the power of Deep Learning in this domain.

+

This vignette describes how you can use the Observational Health Data Sciences and Informatics (OHDSI) PatientLevelPrediction package and DeepPatientLevelPrediction package to build Deep Learning models. This vignette assumes you have read and are comfortable with building patient level prediction models as described in the BuildingPredictiveModels vignette. Furthermore, this vignette assumes you are familiar with Deep Learning methods.

+
+
+

Background +

+

Deep Learning models are built by stacking an often large number of neural network layers that perform feature engineering steps, e.g. embedding, and are collapsed into a final softmax layer (basically a logistic regression layer). These algorithms need a lot of data to converge to a good representation, but the sizes of large observational healthcare databases are growing fast, which makes Deep Learning an interesting approach to test within OHDSI’s Patient-Level Prediction Framework. The current implementation allows us to perform research at scale on the value and limitations of Deep Learning using observational healthcare data.
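To make the "basically a logistic regression layer" remark concrete, for a binary outcome the final layer can be written as (illustrative notation, not taken from the package code):

p(outcome | patient) = 1 / (1 + exp(-(w · h + b)))

where h is the representation produced by the preceding layers; in other words, a logistic regression fitted on learned features rather than on the raw covariates.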

+

In the package we have used torch and tabnet but we invite the community to add other backends.

+

Many network architectures have recently been proposed, and we have implemented a number of them; this list will grow in the near future. It is important to understand that some of these architectures require a 2D data matrix, i.e. |patient|x|feature|, and others use a 3D data matrix |patient|x|feature|x|time|. The FeatureExtraction Package has been extended to enable the extraction of both data formats, as will be described with examples below.

+

Note that training Deep Learning models is computationally intensive; our implementation therefore supports both GPU and CPU. It will automatically check whether a GPU is available on your computer. A GPU is highly recommended for Deep Learning!

+
+
+

Requirements +

+

Full details about the package requirements and instructions on installing the package can be found here.

+
+
+

Integration with PatientLevelPrediction +

+

The DeepPatientLevelPrediction package provides additional model settings that can be used within the PatientLevelPrediction package's runPlp() function. To use both packages you first need to pick the deep learning architecture you wish to fit (see below) and then specify this as the modelSettings inside runPlp().

+
+# load the data
+plpData <- PatientLevelPrediction::loadPlpData('locationOfData')
+
+# pick the set<Model> from  DeepPatientLevelPrediction
+deepLearningModel <- DeepPatientLevelPrediction::setResNet()
+
+# use PatientLevelPrediction to fit model
+deepLearningResult <- PatientLevelPrediction::runPlp(
+    plpData = plpData, 
+    outcomeId = 1230, 
+    modelSettings = deepLearningModel,
+    analysisId = 'resNetTorch', 
+    ...
+  )
+
+
+
+

Non-Temporal Architectures +

+

We implemented the following non-temporal (2D data matrix) architectures:

+
+

Simple MLP +

+
+

Overall concept +

+

A multilayer perceptron (MLP) model is a directed graph consisting of an input layer, one or more hidden layers and an output layer. The model takes in the input feature values and feeds these forward through the graph to determine the output class. A process known as ‘backpropagation’ is used to train the model. Backpropagation requires labelled data and involves iteratively calculating the error between the MLP model’s predictions and ground truth to learn how to adjust the model.

+
+
+

Example +

+
+
Set Function +
+

To use the package to fit an MLP model you can use the setDeepNNTorch() function to specify the hyper-parameter settings for the MLP.

+
+
+
Inputs +
+

The units input defines the network topology via the number of nodes per layer in the network’s hidden layers. A list of different topologies can be investigated; for example, list(c(10,63), 128) means two different topologies will be fit: the first has two hidden layers with 10 nodes in the first hidden layer and 63 in the second hidden layer, and the second has just one hidden layer with 128 nodes.

+

The layer_dropout input specifies the probability that a layer randomly sets input units to 0 at each step during training time. A value of 0.2 means that 20% of the time the layer input units will be set to 0. This is used to reduce overfitting.

+

The lr input is the learning rate, which is a hyperparameter that controls how much to change the model in response to the estimated error each time the model weights are updated. The smaller the lr, the longer it will take to fit the model and the weights may get stuck in a poor solution; if the lr is too large, the weights may converge too quickly to a sub-optimal solution.

+

The decay input corresponds to the weight decay in the objective function. During model fitting the aim is to minimize the objective function. The objective function is made up of the prediction error (the difference between the prediction and the truth) plus the square of the weights multiplied by the weight decay. The larger the weight decay, the more you penalize having large weights. If you set the weight decay too large, the model will never fit well enough; if you set it too low, you need to be careful of overfitting (so try to stop model fitting earlier).
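Schematically (illustrative notation, not taken from the package code):

objective(weights) = predictionError + decay * sum(weights^2)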

+

The outcome_weight specifies whether to add more weight to misclassifying one class (e.g., with outcome during TAR) vs the other (e.g., without outcome during TAR). This can be useful if there is imbalance between the classes (e.g., the outcome rarely occurs during TAR).

+

The batch_size corresponds to the number of data points (patients) used per iteration to estimate the network error during model fitting.

+

The epochs input corresponds to how many times to run through the entire training data while fitting the model.

+

The seed lets the user reproduce the same network given the same training data and hyper-parameter settings if they use the same seed.

+
+
+
Example Code +
+

For example, the following code will try two different network topologies and pick the topology that obtains the greatest AUROC via cross validation in the training data and then fit the model with that topology using all the training data. The standard output of runPlp() will be returned - this contains the MLP model along with the performance details and settings.

+
+#singleLayerNN(inputN = 10, layer1 = 100, outputN = 2, layer_dropout = 0.1)
+deepset <- setDeepNNTorch(
+  units = list(c(10,63), 128), 
+  layer_dropout = c(0.2),
+  lr = c(1e-4), 
+  decay = c(1e-5), 
+  outcome_weight = c(1.0), 
+  batch_size = c(100), 
+  epochs = c(5),  
+  seed = 12  
+  )
+
+mlpResult <- PatientLevelPrediction::runPlp(
+    plpData = plpData, 
+    outcomeId = 3, 
+    modelSettings = deepset,
+    analysisId = 'DeepNNTorch', 
+    analysisName = 'Testing Deep Learning', 
+    populationSettings = populationSet, 
+    splitSettings = PatientLevelPrediction::createDefaultSplitSetting(), 
+    sampleSettings = PatientLevelPrediction::createSampleSettings(),  # none 
+    featureEngineeringSettings = PatientLevelPrediction::createFeatureEngineeringSettings(), # none 
+    preprocessSettings = PatientLevelPrediction::createPreprocessSettings(), 
+    executeSettings = PatientLevelPrediction::createExecuteSettings(
+      runSplitData = T, 
+      runSampleData = F, 
+      runfeatureEngineering = F, 
+      runPreprocessData = T, 
+      runModelDevelopment = T, 
+      runCovariateSummary = F
+    ), 
+    saveDirectory = file.path(testLoc, 'DeepNNTorch')
+  )
+
+
+
+
+

ResNet +

+
+

Overall concept +

+

Deep learning models are often trained via a process known as gradient descent during backpropagation. During this process the network weights are updated based on the gradient of the error function for the current weights. However, as the number of layers in the network increases, there is a greater chance of experiencing an issue known as the vanishing or exploding gradient during this process. The vanishing or exploding gradient is when the gradient goes to 0 or infinity, which negatively impacts the model fitting.

+

The residual network (ResNet) was introduced to address the vanishing or exploding gradient issue. It works by adding connections between non-adjacent layers, termed ‘skip connections’. Using some form of regularization with these ‘skip connections’ enables the network to ignore any problematic layer caused by gradient issues.
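Schematically, a residual block computes output = input + F(input) (illustrative notation): even if the learned transformation F contributes little, the identity path preserves the signal and keeps gradients flowing through deep networks.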

+
+
+

Example +

+
+
Set Function +
+

To use the package to fit a ResNet model you can use the setResNet() function to specify the hyper-parameter settings for the network.

+
+
+
Inputs +
+

[add info about each input here]

+
+
+
Example Code +
+

For example, the following code will …

+
+resset <- setResNet(
+  numLayers = c(2), 
+  sizeHidden = c(32),
+  hiddenFactor = c(2),
+  residualDropout = c(0.1), 
+  hiddenDropout = c(0.1),
+  normalization = c('BatchNorm'), 
+  activation = c('RelU'),
+  sizeEmbedding = c(32), 
+  weightDecay = c(1e-6),
+  learningRate = c(3e-4), 
+  seed = 42, 
+  hyperParamSearch = 'random',
+  randomSample = 1, 
+  #device='cuda:0', 
+  batchSize = 128, 
+  epochs = 3
+)
+
+resResult <- PatientLevelPrediction::runPlp(
+    plpData = plpData, 
+    outcomeId = 3, 
+    modelSettings = resset,
+    analysisId = 'ResNet', 
+    analysisName = 'Testing ResNet', 
+    populationSettings = populationSet, 
+    splitSettings = PatientLevelPrediction::createDefaultSplitSetting(), 
+    sampleSettings = PatientLevelPrediction::createSampleSettings(),  # none 
+    featureEngineeringSettings = PatientLevelPrediction::createFeatureEngineeringSettings(), # none 
+    preprocessSettings = PatientLevelPrediction::createPreprocessSettings(), 
+    executeSettings = PatientLevelPrediction::createExecuteSettings(
+      runSplitData = T, 
+      runSampleData = F, 
+      runfeatureEngineering = F, 
+      runPreprocessData = T, 
+      runModelDevelopment = T, 
+      runCovariateSummary = F
+    ), 
+    saveDirectory = file.path(testLoc, 'ResNet')
+  )
+
+
+
+
+

TabNet +

+
+

Overall concept +

+
+
+

Examples +

+
+
+
+

Transformer +

+
+

Overall concept +

+
+
+

Examples +

+
+
+
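Although this section is still to be written, the sketch below shows how a transformer could be specified and passed to runPlp(); the values simply mirror the setTransformer() defaults (randomSamples = 1 is used because only one hyper-parameter combination is listed) and are illustrative rather than tuned recommendations.

+transformer <- DeepPatientLevelPrediction::setTransformer(
+  numBlocks = 3, 
+  dimToken = 96, 
+  numHeads = 8, 
+  dimHidden = 512, 
+  batchSize = 1024, 
+  epochs = 10, 
+  device = 'cpu', 
+  randomSamples = 1
+)
+
+transformerResult <- PatientLevelPrediction::runPlp(
+    plpData = plpData, 
+    outcomeId = 3, 
+    modelSettings = transformer,
+    analysisId = 'Transformer', 
+    ...
+  )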
+
+

Acknowledgments +

+

Considerable work has been dedicated to providing the DeepPatientLevelPrediction package.

+
+citation("DeepPatientLevelPrediction")
+
## 
+## To cite package 'DeepPatientLevelPrediction' in publications use:
+## 
+##   Reps J, Fridgeirsson E, Chan You S, Kim C, John H (2021).
+##   _DeepPatientLevelPrediction: Deep learning function for patient level
+##   prediction using data in the OMOP Common Data Model_.
+##   https://ohdsi.github.io/PatientLevelPrediction,
+##   https://github.com/OHDSI/DeepPatientLevelPrediction.
+## 
+## A BibTeX entry for LaTeX users is
+## 
+##   @Manual{,
+##     title = {DeepPatientLevelPrediction: Deep learning function for patient level prediction using data in the OMOP Common Data Model},
+##     author = {Jenna Reps and Egill Fridgeirsson and Seng {Chan You} and Chungsoo Kim and Henrik John},
+##     year = {2021},
+##     note = {https://ohdsi.github.io/PatientLevelPrediction, https://github.com/OHDSI/DeepPatientLevelPrediction},
+##   }
+

Please reference this paper if you use the PLP Package in your work:

+

Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data. J Am Med Inform Assoc. 2018;25(8):969-975.

+
+
+ + + +
+ + + +
+ +
+

+

Site built with pkgdown 2.0.5.

+
+ +
+
+ + + + + + + + diff --git a/docs/articles/index.html b/docs/articles/index.html new file mode 100644 index 0000000..06f1bc3 --- /dev/null +++ b/docs/articles/index.html @@ -0,0 +1,100 @@ + +Articles • DeepPatientLevelPrediction + + +
+
+ + + +
+
+ + + +
+
+ + +
+ +
+

Site built with pkgdown 2.0.5.

+
+ +
+ + + + + + + + diff --git a/docs/authors.html b/docs/authors.html new file mode 100644 index 0000000..b203b7f --- /dev/null +++ b/docs/authors.html @@ -0,0 +1,135 @@ + +Authors and Citation • DeepPatientLevelPrediction + + +
+
+ + + +
+
+
+ + + +
  • +

    Jenna Reps. Author. +

    +
  • +
  • +

    Egill Fridgeirsson. Author, maintainer. +

    +
  • +
  • +

    Seng Chan You. Author. +

    +
  • +
  • +

    Chungsoo Kim. Author. +

    +
  • +
  • +

    Henrik John. Author. +

    +
  • +
+
+
+

Citation

+ Source: DESCRIPTION +
+
+ + +

Reps J, Fridgeirsson E, Chan You S, Kim C, John H (2022). +DeepPatientLevelPrediction: Deep learning function for patient level prediction using data in the OMOP Common Data Model. +https://ohdsi.github.io/PatientLevelPrediction, https://github.com/OHDSI/DeepPatientLevelPrediction. +

+
@Manual{,
+  title = {DeepPatientLevelPrediction: Deep learning function for patient level prediction using data in the OMOP Common Data Model},
+  author = {Jenna Reps and Egill Fridgeirsson and Seng {Chan You} and Chungsoo Kim and Henrik John},
+  year = {2022},
+  note = {https://ohdsi.github.io/PatientLevelPrediction, https://github.com/OHDSI/DeepPatientLevelPrediction},
+}
+ +
+ +
+ + + +
+ +
+

Site built with pkgdown 2.0.5.

+
+ +
+ + + + + + + + diff --git a/docs/bootstrap-toc.css b/docs/bootstrap-toc.css new file mode 100644 index 0000000..5a85941 --- /dev/null +++ b/docs/bootstrap-toc.css @@ -0,0 +1,60 @@ +/*! + * Bootstrap Table of Contents v0.4.1 (http://afeld.github.io/bootstrap-toc/) + * Copyright 2015 Aidan Feldman + * Licensed under MIT (https://github.com/afeld/bootstrap-toc/blob/gh-pages/LICENSE.md) */ + +/* modified from https://github.com/twbs/bootstrap/blob/94b4076dd2efba9af71f0b18d4ee4b163aa9e0dd/docs/assets/css/src/docs.css#L548-L601 */ + +/* All levels of nav */ +nav[data-toggle='toc'] .nav > li > a { + display: block; + padding: 4px 20px; + font-size: 13px; + font-weight: 500; + color: #767676; +} +nav[data-toggle='toc'] .nav > li > a:hover, +nav[data-toggle='toc'] .nav > li > a:focus { + padding-left: 19px; + color: #563d7c; + text-decoration: none; + background-color: transparent; + border-left: 1px solid #563d7c; +} +nav[data-toggle='toc'] .nav > .active > a, +nav[data-toggle='toc'] .nav > .active:hover > a, +nav[data-toggle='toc'] .nav > .active:focus > a { + padding-left: 18px; + font-weight: bold; + color: #563d7c; + background-color: transparent; + border-left: 2px solid #563d7c; +} + +/* Nav: second level (shown on .active) */ +nav[data-toggle='toc'] .nav .nav { + display: none; /* Hide by default, but at >768px, show it */ + padding-bottom: 10px; +} +nav[data-toggle='toc'] .nav .nav > li > a { + padding-top: 1px; + padding-bottom: 1px; + padding-left: 30px; + font-size: 12px; + font-weight: normal; +} +nav[data-toggle='toc'] .nav .nav > li > a:hover, +nav[data-toggle='toc'] .nav .nav > li > a:focus { + padding-left: 29px; +} +nav[data-toggle='toc'] .nav .nav > .active > a, +nav[data-toggle='toc'] .nav .nav > .active:hover > a, +nav[data-toggle='toc'] .nav .nav > .active:focus > a { + padding-left: 28px; + font-weight: 500; +} + +/* from https://github.com/twbs/bootstrap/blob/e38f066d8c203c3e032da0ff23cd2d6098ee2dd6/docs/assets/css/src/docs.css#L631-L634 */ +nav[data-toggle='toc'] .nav > .active > ul { + display: block; +} diff --git a/docs/bootstrap-toc.js b/docs/bootstrap-toc.js new file mode 100644 index 0000000..1cdd573 --- /dev/null +++ b/docs/bootstrap-toc.js @@ -0,0 +1,159 @@ +/*! 
+ * Bootstrap Table of Contents v0.4.1 (http://afeld.github.io/bootstrap-toc/) + * Copyright 2015 Aidan Feldman + * Licensed under MIT (https://github.com/afeld/bootstrap-toc/blob/gh-pages/LICENSE.md) */ +(function() { + 'use strict'; + + window.Toc = { + helpers: { + // return all matching elements in the set, or their descendants + findOrFilter: function($el, selector) { + // http://danielnouri.org/notes/2011/03/14/a-jquery-find-that-also-finds-the-root-element/ + // http://stackoverflow.com/a/12731439/358804 + var $descendants = $el.find(selector); + return $el.filter(selector).add($descendants).filter(':not([data-toc-skip])'); + }, + + generateUniqueIdBase: function(el) { + var text = $(el).text(); + var anchor = text.trim().toLowerCase().replace(/[^A-Za-z0-9]+/g, '-'); + return anchor || el.tagName.toLowerCase(); + }, + + generateUniqueId: function(el) { + var anchorBase = this.generateUniqueIdBase(el); + for (var i = 0; ; i++) { + var anchor = anchorBase; + if (i > 0) { + // add suffix + anchor += '-' + i; + } + // check if ID already exists + if (!document.getElementById(anchor)) { + return anchor; + } + } + }, + + generateAnchor: function(el) { + if (el.id) { + return el.id; + } else { + var anchor = this.generateUniqueId(el); + el.id = anchor; + return anchor; + } + }, + + createNavList: function() { + return $(''); + }, + + createChildNavList: function($parent) { + var $childList = this.createNavList(); + $parent.append($childList); + return $childList; + }, + + generateNavEl: function(anchor, text) { + var $a = $(''); + $a.attr('href', '#' + anchor); + $a.text(text); + var $li = $('
  • '); + $li.append($a); + return $li; + }, + + generateNavItem: function(headingEl) { + var anchor = this.generateAnchor(headingEl); + var $heading = $(headingEl); + var text = $heading.data('toc-text') || $heading.text(); + return this.generateNavEl(anchor, text); + }, + + // Find the first heading level (`

    `, then `

    `, etc.) that has more than one element. Defaults to 1 (for `

    `). + getTopLevel: function($scope) { + for (var i = 1; i <= 6; i++) { + var $headings = this.findOrFilter($scope, 'h' + i); + if ($headings.length > 1) { + return i; + } + } + + return 1; + }, + + // returns the elements for the top level, and the next below it + getHeadings: function($scope, topLevel) { + var topSelector = 'h' + topLevel; + + var secondaryLevel = topLevel + 1; + var secondarySelector = 'h' + secondaryLevel; + + return this.findOrFilter($scope, topSelector + ',' + secondarySelector); + }, + + getNavLevel: function(el) { + return parseInt(el.tagName.charAt(1), 10); + }, + + populateNav: function($topContext, topLevel, $headings) { + var $context = $topContext; + var $prevNav; + + var helpers = this; + $headings.each(function(i, el) { + var $newNav = helpers.generateNavItem(el); + var navLevel = helpers.getNavLevel(el); + + // determine the proper $context + if (navLevel === topLevel) { + // use top level + $context = $topContext; + } else if ($prevNav && $context === $topContext) { + // create a new level of the tree and switch to it + $context = helpers.createChildNavList($prevNav); + } // else use the current $context + + $context.append($newNav); + + $prevNav = $newNav; + }); + }, + + parseOps: function(arg) { + var opts; + if (arg.jquery) { + opts = { + $nav: arg + }; + } else { + opts = arg; + } + opts.$scope = opts.$scope || $(document.body); + return opts; + } + }, + + // accepts a jQuery object, or an options object + init: function(opts) { + opts = this.helpers.parseOps(opts); + + // ensure that the data attribute is in place for styling + opts.$nav.attr('data-toggle', 'toc'); + + var $topContext = this.helpers.createChildNavList(opts.$nav); + var topLevel = this.helpers.getTopLevel(opts.$scope); + var $headings = this.helpers.getHeadings(opts.$scope, topLevel); + this.helpers.populateNav($topContext, topLevel, $headings); + } + }; + + $(function() { + $('nav[data-toggle="toc"]').each(function(i, el) { + var $nav = $(el); + Toc.init($nav); + }); + }); +})(); diff --git a/docs/docsearch.css b/docs/docsearch.css new file mode 100644 index 0000000..e5f1fe1 --- /dev/null +++ b/docs/docsearch.css @@ -0,0 +1,148 @@ +/* Docsearch -------------------------------------------------------------- */ +/* + Source: https://github.com/algolia/docsearch/ + License: MIT +*/ + +.algolia-autocomplete { + display: block; + -webkit-box-flex: 1; + -ms-flex: 1; + flex: 1 +} + +.algolia-autocomplete .ds-dropdown-menu { + width: 100%; + min-width: none; + max-width: none; + padding: .75rem 0; + background-color: #fff; + background-clip: padding-box; + border: 1px solid rgba(0, 0, 0, .1); + box-shadow: 0 .5rem 1rem rgba(0, 0, 0, .175); +} + +@media (min-width:768px) { + .algolia-autocomplete .ds-dropdown-menu { + width: 175% + } +} + +.algolia-autocomplete .ds-dropdown-menu::before { + display: none +} + +.algolia-autocomplete .ds-dropdown-menu [class^=ds-dataset-] { + padding: 0; + background-color: rgb(255,255,255); + border: 0; + max-height: 80vh; +} + +.algolia-autocomplete .ds-dropdown-menu .ds-suggestions { + margin-top: 0 +} + +.algolia-autocomplete .algolia-docsearch-suggestion { + padding: 0; + overflow: visible +} + +.algolia-autocomplete .algolia-docsearch-suggestion--category-header { + padding: .125rem 1rem; + margin-top: 0; + font-size: 1.3em; + font-weight: 500; + color: #00008B; + border-bottom: 0 +} + +.algolia-autocomplete .algolia-docsearch-suggestion--wrapper { + float: none; + padding-top: 0 +} + +.algolia-autocomplete 
.algolia-docsearch-suggestion--subcategory-column { + float: none; + width: auto; + padding: 0; + text-align: left +} + +.algolia-autocomplete .algolia-docsearch-suggestion--content { + float: none; + width: auto; + padding: 0 +} + +.algolia-autocomplete .algolia-docsearch-suggestion--content::before { + display: none +} + +.algolia-autocomplete .ds-suggestion:not(:first-child) .algolia-docsearch-suggestion--category-header { + padding-top: .75rem; + margin-top: .75rem; + border-top: 1px solid rgba(0, 0, 0, .1) +} + +.algolia-autocomplete .ds-suggestion .algolia-docsearch-suggestion--subcategory-column { + display: block; + padding: .1rem 1rem; + margin-bottom: 0.1; + font-size: 1.0em; + font-weight: 400 + /* display: none */ +} + +.algolia-autocomplete .algolia-docsearch-suggestion--title { + display: block; + padding: .25rem 1rem; + margin-bottom: 0; + font-size: 0.9em; + font-weight: 400 +} + +.algolia-autocomplete .algolia-docsearch-suggestion--text { + padding: 0 1rem .5rem; + margin-top: -.25rem; + font-size: 0.8em; + font-weight: 400; + line-height: 1.25 +} + +.algolia-autocomplete .algolia-docsearch-footer { + width: 110px; + height: 20px; + z-index: 3; + margin-top: 10.66667px; + float: right; + font-size: 0; + line-height: 0; +} + +.algolia-autocomplete .algolia-docsearch-footer--logo { + background-image: url("data:image/svg+xml;utf8,"); + background-repeat: no-repeat; + background-position: 50%; + background-size: 100%; + overflow: hidden; + text-indent: -9000px; + width: 100%; + height: 100%; + display: block; + transform: translate(-8px); +} + +.algolia-autocomplete .algolia-docsearch-suggestion--highlight { + color: #FF8C00; + background: rgba(232, 189, 54, 0.1) +} + + +.algolia-autocomplete .algolia-docsearch-suggestion--text .algolia-docsearch-suggestion--highlight { + box-shadow: inset 0 -2px 0 0 rgba(105, 105, 105, .5) +} + +.algolia-autocomplete .ds-suggestion.ds-cursor .algolia-docsearch-suggestion--content { + background-color: rgba(192, 192, 192, .15) +} diff --git a/docs/docsearch.js b/docs/docsearch.js new file mode 100644 index 0000000..b35504c --- /dev/null +++ b/docs/docsearch.js @@ -0,0 +1,85 @@ +$(function() { + + // register a handler to move the focus to the search bar + // upon pressing shift + "/" (i.e. "?") + $(document).on('keydown', function(e) { + if (e.shiftKey && e.keyCode == 191) { + e.preventDefault(); + $("#search-input").focus(); + } + }); + + $(document).ready(function() { + // do keyword highlighting + /* modified from https://jsfiddle.net/julmot/bL6bb5oo/ */ + var mark = function() { + + var referrer = document.URL ; + var paramKey = "q" ; + + if (referrer.indexOf("?") !== -1) { + var qs = referrer.substr(referrer.indexOf('?') + 1); + var qs_noanchor = qs.split('#')[0]; + var qsa = qs_noanchor.split('&'); + var keyword = ""; + + for (var i = 0; i < qsa.length; i++) { + var currentParam = qsa[i].split('='); + + if (currentParam.length !== 2) { + continue; + } + + if (currentParam[0] == paramKey) { + keyword = decodeURIComponent(currentParam[1].replace(/\+/g, "%20")); + } + } + + if (keyword !== "") { + $(".contents").unmark({ + done: function() { + $(".contents").mark(keyword); + } + }); + } + } + }; + + mark(); + }); +}); + +/* Search term highlighting ------------------------------*/ + +function matchedWords(hit) { + var words = []; + + var hierarchy = hit._highlightResult.hierarchy; + // loop to fetch from lvl0, lvl1, etc. 
+ for (var idx in hierarchy) { + words = words.concat(hierarchy[idx].matchedWords); + } + + var content = hit._highlightResult.content; + if (content) { + words = words.concat(content.matchedWords); + } + + // return unique words + var words_uniq = [...new Set(words)]; + return words_uniq; +} + +function updateHitURL(hit) { + + var words = matchedWords(hit); + var url = ""; + + if (hit.anchor) { + url = hit.url_without_anchor + '?q=' + escape(words.join(" ")) + '#' + hit.anchor; + } else { + url = hit.url + '?q=' + escape(words.join(" ")); + } + + return url; +} diff --git a/docs/index.html b/docs/index.html new file mode 100644 index 0000000..020d123 --- /dev/null +++ b/docs/index.html @@ -0,0 +1,234 @@ + + + + + + + +Deep learning function for patient level prediction using data in the OMOP Common Data Model • DeepPatientLevelPrediction + + + + + + + + + + + + +
    +
    + + + + +
    +
    + +
    + + +
    +
    +

    Introduction +

    +

    DeepPatientLevelPrediction is an R package for building and validating deep learning patient-level predictive models using data in the OMOP Common Data Model format and OHDSI PatientLevelPrediction framework.

    +

    Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data. J Am Med Inform Assoc. 2018;25(8):969-975.

    +
    +
    +

    Features +

    +
      +
    • Adds deep learning models to use in the OHDSI PatientLevelPrediction framework.
    • +
    • Allows you to add custom deep learning models.
    • +
    • Includes an MLP, ResNet and a Transformer
    • +
    • Allows you to use all the features of PatientLevelPrediction to validate and explore your model performance.
    • +
    +
    +
    +

    Technology +

    +

    DeepPatientLevelPrediction is an R package. It uses torch in R to build deep learning models without using python.

    +
    +
    +

    System Requirements +

    +

    Requires R (version 3.5.0 or higher). Installation on Windows requires RTools. In most cases, training deep learning models requires an NVIDIA GPU on either Windows or Linux.

    +
    +
    +

    Getting Started +

    + +
    +
    +

    User Documentation +

    +

    Documentation can be found on the package website.

    +

    PDF versions of the documentation are also available, as mentioned above.

    +
    +
    +

    Support +

    + +
    +
    +

    Contributing +

    +

    Read here how you can contribute to this package.

    +
    +
    +

    License +

    +

    DeepPatientLevelPrediction is licensed under Apache License 2.0

    +
    +
    +

    Development +

    +

    DeepPatientLevelPrediction is being developed in RStudio.

    +
    + +
    + + +
    + + +
    + +
    +

    +

    Site built with pkgdown 2.0.5.

    +
    + +
    +
    + + + + + + + + diff --git a/docs/link.svg b/docs/link.svg new file mode 100644 index 0000000..88ad827 --- /dev/null +++ b/docs/link.svg @@ -0,0 +1,12 @@ + + + + + + diff --git a/docs/pkgdown.css b/docs/pkgdown.css new file mode 100644 index 0000000..80ea5b8 --- /dev/null +++ b/docs/pkgdown.css @@ -0,0 +1,384 @@ +/* Sticky footer */ + +/** + * Basic idea: https://philipwalton.github.io/solved-by-flexbox/demos/sticky-footer/ + * Details: https://github.com/philipwalton/solved-by-flexbox/blob/master/assets/css/components/site.css + * + * .Site -> body > .container + * .Site-content -> body > .container .row + * .footer -> footer + * + * Key idea seems to be to ensure that .container and __all its parents__ + * have height set to 100% + * + */ + +html, body { + height: 100%; +} + +body { + position: relative; +} + +body > .container { + display: flex; + height: 100%; + flex-direction: column; +} + +body > .container .row { + flex: 1 0 auto; +} + +footer { + margin-top: 45px; + padding: 35px 0 36px; + border-top: 1px solid #e5e5e5; + color: #666; + display: flex; + flex-shrink: 0; +} +footer p { + margin-bottom: 0; +} +footer div { + flex: 1; +} +footer .pkgdown { + text-align: right; +} +footer p { + margin-bottom: 0; +} + +img.icon { + float: right; +} + +/* Ensure in-page images don't run outside their container */ +.contents img { + max-width: 100%; + height: auto; +} + +/* Fix bug in bootstrap (only seen in firefox) */ +summary { + display: list-item; +} + +/* Typographic tweaking ---------------------------------*/ + +.contents .page-header { + margin-top: calc(-60px + 1em); +} + +dd { + margin-left: 3em; +} + +/* Section anchors ---------------------------------*/ + +a.anchor { + display: none; + margin-left: 5px; + width: 20px; + height: 20px; + + background-image: url(./link.svg); + background-repeat: no-repeat; + background-size: 20px 20px; + background-position: center center; +} + +h1:hover .anchor, +h2:hover .anchor, +h3:hover .anchor, +h4:hover .anchor, +h5:hover .anchor, +h6:hover .anchor { + display: inline-block; +} + +/* Fixes for fixed navbar --------------------------*/ + +.contents h1, .contents h2, .contents h3, .contents h4 { + padding-top: 60px; + margin-top: -40px; +} + +/* Navbar submenu --------------------------*/ + +.dropdown-submenu { + position: relative; +} + +.dropdown-submenu>.dropdown-menu { + top: 0; + left: 100%; + margin-top: -6px; + margin-left: -1px; + border-radius: 0 6px 6px 6px; +} + +.dropdown-submenu:hover>.dropdown-menu { + display: block; +} + +.dropdown-submenu>a:after { + display: block; + content: " "; + float: right; + width: 0; + height: 0; + border-color: transparent; + border-style: solid; + border-width: 5px 0 5px 5px; + border-left-color: #cccccc; + margin-top: 5px; + margin-right: -10px; +} + +.dropdown-submenu:hover>a:after { + border-left-color: #ffffff; +} + +.dropdown-submenu.pull-left { + float: none; +} + +.dropdown-submenu.pull-left>.dropdown-menu { + left: -100%; + margin-left: 10px; + border-radius: 6px 0 6px 6px; +} + +/* Sidebar --------------------------*/ + +#pkgdown-sidebar { + margin-top: 30px; + position: -webkit-sticky; + position: sticky; + top: 70px; +} + +#pkgdown-sidebar h2 { + font-size: 1.5em; + margin-top: 1em; +} + +#pkgdown-sidebar h2:first-child { + margin-top: 0; +} + +#pkgdown-sidebar .list-unstyled li { + margin-bottom: 0.5em; +} + +/* bootstrap-toc tweaks ------------------------------------------------------*/ + +/* All levels of nav */ + +nav[data-toggle='toc'] .nav > li > a { + padding: 4px 20px 4px 
6px; + font-size: 1.5rem; + font-weight: 400; + color: inherit; +} + +nav[data-toggle='toc'] .nav > li > a:hover, +nav[data-toggle='toc'] .nav > li > a:focus { + padding-left: 5px; + color: inherit; + border-left: 1px solid #878787; +} + +nav[data-toggle='toc'] .nav > .active > a, +nav[data-toggle='toc'] .nav > .active:hover > a, +nav[data-toggle='toc'] .nav > .active:focus > a { + padding-left: 5px; + font-size: 1.5rem; + font-weight: 400; + color: inherit; + border-left: 2px solid #878787; +} + +/* Nav: second level (shown on .active) */ + +nav[data-toggle='toc'] .nav .nav { + display: none; /* Hide by default, but at >768px, show it */ + padding-bottom: 10px; +} + +nav[data-toggle='toc'] .nav .nav > li > a { + padding-left: 16px; + font-size: 1.35rem; +} + +nav[data-toggle='toc'] .nav .nav > li > a:hover, +nav[data-toggle='toc'] .nav .nav > li > a:focus { + padding-left: 15px; +} + +nav[data-toggle='toc'] .nav .nav > .active > a, +nav[data-toggle='toc'] .nav .nav > .active:hover > a, +nav[data-toggle='toc'] .nav .nav > .active:focus > a { + padding-left: 15px; + font-weight: 500; + font-size: 1.35rem; +} + +/* orcid ------------------------------------------------------------------- */ + +.orcid { + font-size: 16px; + color: #A6CE39; + /* margins are required by official ORCID trademark and display guidelines */ + margin-left:4px; + margin-right:4px; + vertical-align: middle; +} + +/* Reference index & topics ----------------------------------------------- */ + +.ref-index th {font-weight: normal;} + +.ref-index td {vertical-align: top; min-width: 100px} +.ref-index .icon {width: 40px;} +.ref-index .alias {width: 40%;} +.ref-index-icons .alias {width: calc(40% - 40px);} +.ref-index .title {width: 60%;} + +.ref-arguments th {text-align: right; padding-right: 10px;} +.ref-arguments th, .ref-arguments td {vertical-align: top; min-width: 100px} +.ref-arguments .name {width: 20%;} +.ref-arguments .desc {width: 80%;} + +/* Nice scrolling for wide elements --------------------------------------- */ + +table { + display: block; + overflow: auto; +} + +/* Syntax highlighting ---------------------------------------------------- */ + +pre, code, pre code { + background-color: #f8f8f8; + color: #333; +} +pre, pre code { + white-space: pre-wrap; + word-break: break-all; + overflow-wrap: break-word; +} + +pre { + border: 1px solid #eee; +} + +pre .img, pre .r-plt { + margin: 5px 0; +} + +pre .img img, pre .r-plt img { + background-color: #fff; +} + +code a, pre a { + color: #375f84; +} + +a.sourceLine:hover { + text-decoration: none; +} + +.fl {color: #1514b5;} +.fu {color: #000000;} /* function */ +.ch,.st {color: #036a07;} /* string */ +.kw {color: #264D66;} /* keyword */ +.co {color: #888888;} /* comment */ + +.error {font-weight: bolder;} +.warning {font-weight: bolder;} + +/* Clipboard --------------------------*/ + +.hasCopyButton { + position: relative; +} + +.btn-copy-ex { + position: absolute; + right: 0; + top: 0; + visibility: hidden; +} + +.hasCopyButton:hover button.btn-copy-ex { + visibility: visible; +} + +/* headroom.js ------------------------ */ + +.headroom { + will-change: transform; + transition: transform 200ms linear; +} +.headroom--pinned { + transform: translateY(0%); +} +.headroom--unpinned { + transform: translateY(-100%); +} + +/* mark.js ----------------------------*/ + +mark { + background-color: rgba(255, 255, 51, 0.5); + border-bottom: 2px solid rgba(255, 153, 51, 0.3); + padding: 1px; +} + +/* vertical spacing after htmlwidgets */ +.html-widget { + margin-bottom: 
10px; +} + +/* fontawesome ------------------------ */ + +.fab { + font-family: "Font Awesome 5 Brands" !important; +} + +/* don't display links in code chunks when printing */ +/* source: https://stackoverflow.com/a/10781533 */ +@media print { + code a:link:after, code a:visited:after { + content: ""; + } +} + +/* Section anchors --------------------------------- + Added in pandoc 2.11: https://github.com/jgm/pandoc-templates/commit/9904bf71 +*/ + +div.csl-bib-body { } +div.csl-entry { + clear: both; +} +.hanging-indent div.csl-entry { + margin-left:2em; + text-indent:-2em; +} +div.csl-left-margin { + min-width:2em; + float:left; +} +div.csl-right-inline { + margin-left:2em; + padding-left:1em; +} +div.csl-indent { + margin-left: 2em; +} diff --git a/docs/pkgdown.js b/docs/pkgdown.js new file mode 100644 index 0000000..6f0eee4 --- /dev/null +++ b/docs/pkgdown.js @@ -0,0 +1,108 @@ +/* http://gregfranko.com/blog/jquery-best-practices/ */ +(function($) { + $(function() { + + $('.navbar-fixed-top').headroom(); + + $('body').css('padding-top', $('.navbar').height() + 10); + $(window).resize(function(){ + $('body').css('padding-top', $('.navbar').height() + 10); + }); + + $('[data-toggle="tooltip"]').tooltip(); + + var cur_path = paths(location.pathname); + var links = $("#navbar ul li a"); + var max_length = -1; + var pos = -1; + for (var i = 0; i < links.length; i++) { + if (links[i].getAttribute("href") === "#") + continue; + // Ignore external links + if (links[i].host !== location.host) + continue; + + var nav_path = paths(links[i].pathname); + + var length = prefix_length(nav_path, cur_path); + if (length > max_length) { + max_length = length; + pos = i; + } + } + + // Add class to parent
  • , and enclosing
  • if in dropdown + if (pos >= 0) { + var menu_anchor = $(links[pos]); + menu_anchor.parent().addClass("active"); + menu_anchor.closest("li.dropdown").addClass("active"); + } + }); + + function paths(pathname) { + var pieces = pathname.split("/"); + pieces.shift(); // always starts with / + + var end = pieces[pieces.length - 1]; + if (end === "index.html" || end === "") + pieces.pop(); + return(pieces); + } + + // Returns -1 if not found + function prefix_length(needle, haystack) { + if (needle.length > haystack.length) + return(-1); + + // Special case for length-0 haystack, since for loop won't run + if (haystack.length === 0) { + return(needle.length === 0 ? 0 : -1); + } + + for (var i = 0; i < haystack.length; i++) { + if (needle[i] != haystack[i]) + return(i); + } + + return(haystack.length); + } + + /* Clipboard --------------------------*/ + + function changeTooltipMessage(element, msg) { + var tooltipOriginalTitle=element.getAttribute('data-original-title'); + element.setAttribute('data-original-title', msg); + $(element).tooltip('show'); + element.setAttribute('data-original-title', tooltipOriginalTitle); + } + + if(ClipboardJS.isSupported()) { + $(document).ready(function() { + var copyButton = ""; + + $("div.sourceCode").addClass("hasCopyButton"); + + // Insert copy buttons: + $(copyButton).prependTo(".hasCopyButton"); + + // Initialize tooltips: + $('.btn-copy-ex').tooltip({container: 'body'}); + + // Initialize clipboard: + var clipboardBtnCopies = new ClipboardJS('[data-clipboard-copy]', { + text: function(trigger) { + return trigger.parentNode.textContent.replace(/\n#>[^\n]*/g, ""); + } + }); + + clipboardBtnCopies.on('success', function(e) { + changeTooltipMessage(e.trigger, 'Copied!'); + e.clearSelection(); + }); + + clipboardBtnCopies.on('error', function() { + changeTooltipMessage(e.trigger,'Press Ctrl+C or Command+C to copy'); + }); + }); + } +})(window.jQuery || window.$) diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml new file mode 100644 index 0000000..e9b1b73 --- /dev/null +++ b/docs/pkgdown.yml @@ -0,0 +1,8 @@ +pandoc: 2.14.0.3 +pkgdown: 2.0.5 +pkgdown_sha: ~ +articles: + BuildingDeepModels: BuildingDeepModels.html + Installing: Installing.html +last_built: 2022-08-09T16:38Z + diff --git a/docs/reference/Dataset.html b/docs/reference/Dataset.html new file mode 100644 index 0000000..cd4ccac --- /dev/null +++ b/docs/reference/Dataset.html @@ -0,0 +1,123 @@ + +A torch dataset — Dataset • DeepPatientLevelPrediction + + +
    +
    + + + +
    +
    + + +
    +

    A torch dataset

    +
    + +
    +
    Dataset(data, labels = NULL, numericalIndex = NULL, all = FALSE)
    +
    + +
    +

    Arguments

    +
    data
    +

    a dataframe like object with the covariates

    + + +
    labels
    +

    a dataframe with the labels

    + + +
    numericalIndex
    +

    which column the numeric data is in (if any)

    + + +
    all
    +

    if TRUE, returns all features instead of splitting them into numeric and categorical

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.5.

    +
    + +
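For orientation, a minimal sketch of how this constructor could be called; mappedCovariates and trainLabels are hypothetical stand-ins for data prepared by the package's fitting pipeline:

# Hypothetical usage sketch of the Dataset constructor
ds <- Dataset(
  data = mappedCovariates,   # dataframe-like object with the covariates
  labels = trainLabels,      # dataframe with the labels
  numericalIndex = NULL,     # let the function work out which columns are numeric
  all = FALSE                # split features into numeric and categorical parts
)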
    + + + + + + + + diff --git a/docs/reference/DeepPatientLevelPrediction.html b/docs/reference/DeepPatientLevelPrediction.html new file mode 100644 index 0000000..5e82fea --- /dev/null +++ b/docs/reference/DeepPatientLevelPrediction.html @@ -0,0 +1,102 @@ + +DeepPatientLevelPrediction — DeepPatientLevelPrediction • DeepPatientLevelPrediction + + +
    +
    + + + +
    +
    + + +
    +

    A package containing deep learning extensions for developing prediction models using data in the OMOP CDM

    +
    + + + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.5.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/EarlyStopping.html b/docs/reference/EarlyStopping.html new file mode 100644 index 0000000..049d2c9 --- /dev/null +++ b/docs/reference/EarlyStopping.html @@ -0,0 +1,175 @@ + +Earlystopping class — EarlyStopping • DeepPatientLevelPrediction + + +
    +
    + + + +
    +
    + + +
    +

    Stops training if a loss or metric has stopped improving

    +
    + + +
    +

    Methods

    + +


    +

    Method new()

    +

    Creates a new earlystopping object

    +

    Usage

    +

    EarlyStopping$new(patience = 3, delta = 0, verbose = TRUE)

    +
    + +
    +

    Arguments

    +

    patience
    +

    Stop after this number of epochs if loss doesn't improve

    + + +
    delta
    +

    How much the loss needs to improve to count as an improvement

    + + +
    verbose
    +
    + + +

    +
    +
    +

    Returns

    +

    a new earlystopping object

    +
    + +


    +

    Method call()

    +

    call the earlystopping object and increment a counter if loss is not +improving

    +

    Usage

    +

    EarlyStopping$call(metric)

    +
    + +
    +

    Arguments

    +

    metric
    +

    the current metric value

    + + +

    +
    + +


    +

    Method clone()

    +

    The objects of this class are cloneable with this method.

    +

    Usage

    +

    EarlyStopping$clone(deep = FALSE)

    +
    + +
    +

    Arguments

    +

    deep
    +

    Whether to make a deep clone.

    + + +

    +
    + +
    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.5.

    +
    + +
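A brief sketch of the intended use inside a validation loop; the loss values are made up and the name of the stopping flag is an assumption, not verified against the class source:

# Hypothetical sketch of EarlyStopping during training
earlyStopper <- EarlyStopping$new(patience = 3, delta = 0, verbose = TRUE)
for (valLoss in c(0.71, 0.69, 0.70, 0.70, 0.71)) {  # made-up validation losses
  earlyStopper$call(valLoss)
  if (isTRUE(earlyStopper$earlyStop)) {  # assumed name of the stop flag
    message("stopping early")
    break
  }
}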
    + + + + + + + + diff --git a/docs/reference/Estimator.html b/docs/reference/Estimator.html new file mode 100644 index 0000000..dc17110 --- /dev/null +++ b/docs/reference/Estimator.html @@ -0,0 +1,431 @@ + +Estimator — Estimator • DeepPatientLevelPrediction + + +
    +
    + + + +
    +
    + + +
    +

    A generic R6 class that wraps around a torch nn module and can be used to +fit and predict the model defined in that module.

    +
    + + +
    +

    Methods

    + +


    +

    Method new()

    +

    Creates a new estimator

    +

    Usage

    +

    Estimator$new(
    +  baseModel,
    +  modelParameters,
    +  fitParameters,
    +  optimizer = torch::optim_adam,
    +  criterion = torch::nn_bce_with_logits_loss,
    +  scheduler = torch::lr_reduce_on_plateau,
    +  device = "cpu",
    +  patience = 4
    +)

    +
    + +
    +

    Arguments

    +

    baseModel
    +

    The torch nn module to use as model

    + + +
    modelParameters
    +

    Parameters to initialize the baseModel

    + + +
    fitParameters
    +

    Parameters required for the estimator fitting

    + + +
    optimizer
    +

    A torch optimizer to use, default is Adam

    + + +
    criterion
    +

    The torch loss function to use, defaults to +binary cross entropy with logits

    + + +
    scheduler
    +

    learning rate scheduler to use

    + + +
    device
    +

    Which device to use for fitting, default is cpu

    + + +
    patience
    +

    Patience to use for early stopping

    + + +

    +
    + +


    +

    Method fit()

    +

    fits the estimator

    +

    Usage

    +

    Estimator$fit(dataset, testDataset)

    +
    + +
    +

    Arguments

    +

    dataset
    +

    a torch dataset to use for model fitting

    + + +
    testDataset
    +

    a torch dataset to use for early stopping

    + + +

    +
    + +


    +

    Method fitEpoch()

    +

    fits estimator for one epoch (one round through the data)

    +

    Usage

    +

    Estimator$fitEpoch(dataset, batchIndex)

    +
    + +
    +

    Arguments

    +

    dataset
    +

    torch dataset to use for fitting

    + + +
    batchIndex
    +

    indices of batches

    + + +

    +
    + +


    +

    Method score()

    +

    calculates loss and auc after training for one epoch

    +

    Usage

    +

    Estimator$score(dataset, batchIndex)

    +
    + +
    +

    Arguments

    +

    dataset
    +

    The torch dataset to use to evaluate loss and auc

    + + +
    batchIndex
    +

    Indices of batches in the dataset

    + + +

    +
    +
    +

    Returns

    +

    list with average loss and auc in the dataset

    +
    + +


    +

    Method finishFit()

    +

    operations that run when fitting is finished

    +

    Usage

    +

    Estimator$finishFit(valAUCs, modelStateDict, valLosses, epoch, learnRates)

    +
    + +
    +

    Arguments

    +

    valAUCs
    +

    validation AUC values

    + + +
    modelStateDict
    +

    fitted model parameters

    + + +
    valLosses
    +

    validation losses

    + + +
    epoch
    +

    list of epochs fit

    + + +
    learnRates
    +

    learning rate sequence used so far

    + + +

    +
    + +


    +

    Method fitWholeTrainingSet()

    +

    Fits the whole training set for a specific number of epochs. +TODO: what happens when the learning rate changes per epoch? +Ideally the learning rate strategy from cross-validation would be reused +and adjusted for the larger size, i.e. more iterations/updates.

    +

    Usage

    +

    Estimator$fitWholeTrainingSet(dataset, learnRates = NULL)

    +
    + +
    +

    Arguments

    +

    dataset
    +

    torch dataset

    + + +
    learnRates
    +

    learnRateSchedule from CV

    + + +

    +
    + +


    +

    Method save()

    +

    save model and those parameters needed to reconstruct it

    +

    Usage

    +

    Estimator$save(path, name)

    +
    + +
    +

    Arguments

    +

    path
    +

    where to save the model

    + + +
    name
    +

    name of file

    + + +

    +
    +
    +

    Returns

    +

    the path to saved model

    +
    + +


    +

    Method predictProba()

    +

    predicts and outputs the probabilities

    +

    Usage

    +

    Estimator$predictProba(dataset)

    +
    + +
    +

    Arguments

    +

    dataset
    +

    Torch dataset to create predictions for

    + + +

    +
    +
    +

    Returns

    +

    predictions as probabilities

    +
    + +


    +

    Method predict()

    +

    predicts and outputs the class

    +

    Usage

    +

    Estimator$predict(dataset, threshold = NULL)

    +
    + +
    +

    Arguments

    +

    dataset
    +

    A torch dataset to create predictions for

    + + +
    threshold
    +

    Which threshold to use for predictions

    + + +

    +
    +
    +

    Returns

    +

    The predicted class for the data in the dataset

    +
    + +


    +

    Method batchToDevice()

    +

    sends a batch of data to the device; +assumes the batch may include lists of tensors nested to arbitrary depth

    +

    Usage

    +

    Estimator$batchToDevice(batch)

    +
    + +
    +

    Arguments

    +

    batch
    +

    the batch to send, usually a list of torch tensors

    + + +

    +
    +
    +

    Returns

    +

    the batch on the required device

    +
    + +


    +

    Method itemOrDefaults()

    +

    selects an item from a list and, if it is NULL, returns a default

    +

    Usage

    +

    Estimator$itemOrDefaults(list, item, default = NULL)

    +
    + +
    +

    Arguments

    +

    list
    +

    A list with items

    + + +
    item
    +

    Which list item to retrieve

    + + +
    default
    +

    The value to return if list doesn't have item

    + + +

    +
    +
    +

    Returns

    +

    the list item or default

    +
    + +


    +

    Method clone()

    +

    The objects of this class are cloneable with this method.

    +

    Usage

    +

    Estimator$clone(deep = FALSE)

    +
    + +
    +

    Arguments

    +

    deep
    +

    Whether to make a deep clone.

    + + +

    +
    + +
    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.5.

    +
    + +
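For orientation, a hedged sketch of constructing and using an Estimator; myModule, the parameter lists, and the datasets are hypothetical placeholders:

# Hypothetical sketch; myModule stands in for a torch nn_module defined elsewhere
estimator <- Estimator$new(
  baseModel = myModule,
  modelParameters = list(sizeHidden = 64),            # passed to myModule when it is created
  fitParameters = list(epochs = 5, batchSize = 512),  # element names are assumptions
  optimizer = torch::optim_adam,
  device = "cpu"
)
estimator$fit(trainDataset, testDataset)   # both assumed to be torch datasets
probs <- estimator$predictProba(testDataset)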
    + + + + + + + + diff --git a/docs/reference/doubleLayerNN.html b/docs/reference/doubleLayerNN.html new file mode 100644 index 0000000..b3bc13f --- /dev/null +++ b/docs/reference/doubleLayerNN.html @@ -0,0 +1,127 @@ + +Double layer neural network — doubleLayerNN • DeepPatientLevelPrediction + + +
    +
    + + + +
    +
    + + +
    +

    Double layer neural network

    +
    + +
    +
    doubleLayerNN(inputN, layer1, layer2, outputN, layer_dropout)
    +
    + +
    +

    Arguments

    +
    inputN
    +

    Input neurons

    + + +
    layer1
    +

    Layer 1 neurons

    + + +
    layer2
    +

    Layer 2 neurons

    + + +
    outputN
    +

    output neurons

    + + +
    layer_dropout
    +

    layer_dropout to use

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.5.

    +
    + +
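A short hypothetical call for illustration, assuming 100 input features and a single output neuron:

# Hypothetical sketch of building a two-hidden-layer module
net <- doubleLayerNN(
  inputN = 100,        # input neurons
  layer1 = 64,         # layer 1 neurons
  layer2 = 32,         # layer 2 neurons
  outputN = 1,         # output neurons
  layer_dropout = 0.2  # dropout to use
)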
    + + + + + + + + diff --git a/docs/reference/fitDeepNNTorch.html b/docs/reference/fitDeepNNTorch.html new file mode 100644 index 0000000..fa8c2e3 --- /dev/null +++ b/docs/reference/fitDeepNNTorch.html @@ -0,0 +1,123 @@ + +Fits a deep neural network — fitDeepNNTorch • DeepPatientLevelPrediction + + +
    +
    + + + +
    +
    + + +
    +

    Fits a deep neural network

    +
    + +
    +
    fitDeepNNTorch(trainData, param, search = "grid", analysisId)
    +
    + +
    +

    Arguments

    +
    trainData
    +

    Training data object

    + + +
    param
    +

    Hyperparameters to search over

    + + +
    search
    +

    Which kind of search strategy to use

    + + +
    analysisId
    +

    Analysis Id

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.5.

    +
    + +
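This function is normally reached through PatientLevelPrediction::runPlp() via the model settings rather than called directly; a direct call might look roughly like the following, where trainData is hypothetical and the $param element is an assumption about how the settings object is structured:

# Hypothetical direct call; usually runPlp() does this for you
settings <- setDeepNNTorch(epochs = c(5))
fitted <- fitDeepNNTorch(
  trainData = trainData,    # a PLP training data object (hypothetical)
  param = settings$param,   # assumed location of the hyperparameter grid
  search = "grid",
  analysisId = "Analysis_1"
)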
    + + + + + + + + diff --git a/docs/reference/fitEstimator.html b/docs/reference/fitEstimator.html new file mode 100644 index 0000000..da8803b --- /dev/null +++ b/docs/reference/fitEstimator.html @@ -0,0 +1,123 @@ + +fitEstimator — fitEstimator • DeepPatientLevelPrediction + + +
    +
    + + + +
    +
    + + +
    +

    fits a deep learning estimator to data.

    +
    + +
    +
    fitEstimator(trainData, param, analysisId, ...)
    +
    + +
    +

    Arguments

    +
    trainData
    +

    the data to use

    + + +
    param
    +

    model parameters

    + + +
    analysisId
    +

    Id of the analysis

    + + +
    ...
    +

    Extra inputs

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.5.

    +
    + +
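Like fitDeepNNTorch(), this is normally invoked from runPlp(); a direct call would be along these lines, with all inputs hypothetical and the $param element again assumed:

# Hypothetical direct call to the estimator fitting routine
resNetSettings <- setResNet(randomSample = 1)
fitted <- fitEstimator(
  trainData = trainData,          # PLP training data (hypothetical)
  param = resNetSettings$param,   # assumed structure of the settings object
  analysisId = "Analysis_1"
)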
    + + + + + + + + diff --git a/docs/reference/gridCvDeep.html b/docs/reference/gridCvDeep.html new file mode 100644 index 0000000..9467482 --- /dev/null +++ b/docs/reference/gridCvDeep.html @@ -0,0 +1,127 @@ + +gridCvDeep — gridCvDeep • DeepPatientLevelPrediction + + +
    +
    + + + +
    +
    + + +
    +

    Performs grid search for a deep learning estimator

    +
    + +
    +
    gridCvDeep(mappedData, labels, settings, modelLocation, paramSearch)
    +
    + +
    +

    Arguments

    +
    mappedData
    +

    Mapped data with covariates

    + + +
    labels
    +

    Dataframe with the outcomes

    + + +
    settings
    +

    Settings of the model

    + + +
    modelLocation
    +

    Where to save the model

    + + +
    paramSearch
    +

    model parameters to perform search over

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.5.

    +
    + +
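A hedged sketch of what a call could look like; every input is a hypothetical placeholder for an object that fitEstimator() prepares internally:

# Hypothetical sketch; normally called from fitEstimator()
cvResult <- gridCvDeep(
  mappedData = mappedData,      # covariates mapped to the torch dataset format
  labels = trainLabels,         # dataframe with the outcomes
  settings = modelSettings,     # settings of the model
  modelLocation = file.path(tempdir(), "model"),
  paramSearch = paramGrid       # list of hyperparameter combinations to try
)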
    + + + + + + + + diff --git a/docs/reference/index.html b/docs/reference/index.html new file mode 100644 index 0000000..98aff88 --- /dev/null +++ b/docs/reference/index.html @@ -0,0 +1,159 @@ + +Function reference • DeepPatientLevelPrediction + + +
    +
    + + + +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +

    All functions

    +

    +
    +

    Dataset()

    +

    A torch dataset

    +

    DeepPatientLevelPrediction

    +

    DeepPatientLevelPrediction

    +

    EarlyStopping

    +

    Earlystopping class

    +

    Estimator

    +

    Estimator

    +

    doubleLayerNN()

    +

    Double layer neural network

    +

    fitDeepNNTorch()

    +

    Fits a deep neural network

    +

    fitEstimator()

    +

    fitEstimator

    +

    gridCvDeep()

    +

    gridCvDeep

    +

    predictDeepEstimator()

    +

    predictDeepEstimator

    +

    predictDeepNN()

    +

    Create predictions for a deep neural network

    +

    setDeepNNTorch()

    +

    settings for a Deep neural network

    +

    setResNet()

    +

    setResNet

    +

    setTransformer()

    +

    create settings for training a non-temporal transformer

    +

    singleLayerNN()

    +

    A single layer neural network

    +

    tripleLayerNN()

    +

    Triple layer neural network

    + + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.5.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/predictDeepEstimator.html b/docs/reference/predictDeepEstimator.html new file mode 100644 index 0000000..3e9eb58 --- /dev/null +++ b/docs/reference/predictDeepEstimator.html @@ -0,0 +1,119 @@ + +predictDeepEstimator — predictDeepEstimator • DeepPatientLevelPrediction + + +
    +
    + + + +
    +
    + + +
    +

    the prediction function for the estimator

    +
    + +
    +
    predictDeepEstimator(plpModel, data, cohort)
    +
    + +
    +

    Arguments

    +
    plpModel
    +

    the plpModel

    + + +
    data
    +

    plp data object or a torch dataset

    + + +
    cohort
    +

    data.frame with the rowIds of the people

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.5.

    +
    + +
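A minimal sketch; plpModel would be a fitted model returned by the package and population a cohort data.frame with the people to predict for, both hypothetical here:

# Hypothetical prediction call
prediction <- predictDeepEstimator(
  plpModel = plpModel,   # fitted deep learning plpModel
  data = plpData,        # plp data object or a torch dataset
  cohort = population    # data.frame with the rowIds of the people
)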
    + + + + + + + + diff --git a/docs/reference/predictDeepNN.html b/docs/reference/predictDeepNN.html new file mode 100644 index 0000000..75341ec --- /dev/null +++ b/docs/reference/predictDeepNN.html @@ -0,0 +1,119 @@ + +Create predictions for a deep neural network — predictDeepNN • DeepPatientLevelPrediction + + +
    +
    + + + +
    +
    + + +
    +

    Create predictions for a deep neural network

    +
    + +
    +
    predictDeepNN(plpModel, data, cohort)
    +
    + +
    +

    Arguments

    +
    plpModel
    +

    The plpModel to predict for

    + + +
    data
    +

    The data to make predictions for

    + + +
    cohort
    +

    The cohort to use

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.5.

    +
    + +
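An analogous sketch for the deep neural network predictor, again with hypothetical inputs:

# Hypothetical prediction call for a model fitted with fitDeepNNTorch()
prediction <- predictDeepNN(
  plpModel = plpModel,   # the plpModel to predict for
  data = plpData,        # the data to make predictions for
  cohort = population    # the cohort to use
)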
    + + + + + + + + diff --git a/docs/reference/setDeepNNTorch.html b/docs/reference/setDeepNNTorch.html new file mode 100644 index 0000000..41b51de --- /dev/null +++ b/docs/reference/setDeepNNTorch.html @@ -0,0 +1,153 @@ + +settings for a Deep neural network — setDeepNNTorch • DeepPatientLevelPrediction + + +
    +
    + + + +
    +
    + + +
    +

    settings for a Deep neural network

    +
    + +
    +
    setDeepNNTorch(
    +  units = list(c(128, 64), 128),
    +  layer_dropout = c(0.2),
    +  lr = c(1e-04),
    +  decay = c(1e-05),
    +  outcome_weight = c(1),
    +  batch_size = c(10000),
    +  epochs = c(100),
    +  device = "cpu",
    +  seed = NULL
    +)
    +
    + +
    +

    Arguments

    +
    units
    +

    A list of vectors for neurons per layer

    + + +
    layer_dropout
    +

    Dropout to use per layer

    + + +
    lr
    +

    Learning rate to use

    + + +
    decay
    +

    Weight decay to use

    + + +
    outcome_weight
    +

    Weight for minority outcome in cost function

    + + +
    batch_size
    +

    Batch size to use

    + + +
    epochs
    +

    How many epochs to use

    + + +
    device
    +

    Which device to use

    + + +
    seed
    +

    A seed to make experiments more reproducible

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.5.

    +
    + +
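A small example of creating the settings object and handing it to PatientLevelPrediction; the runPlp() call is left commented out because plpData and the other study inputs are placeholders:

# Settings for a single small architecture: two layers of 64 and 32 neurons
deepSettings <- setDeepNNTorch(
  units = list(c(64, 32)),
  layer_dropout = c(0.2),
  lr = c(1e-4),
  batch_size = c(512),
  epochs = c(5),
  device = "cpu",
  seed = 42
)
# result <- PatientLevelPrediction::runPlp(plpData = plpData,
#                                          modelSettings = deepSettings, ...)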
    + + + + + + + + diff --git a/docs/reference/setResNet.html b/docs/reference/setResNet.html new file mode 100644 index 0000000..e183455 --- /dev/null +++ b/docs/reference/setResNet.html @@ -0,0 +1,192 @@ + +setResNet — setResNet • DeepPatientLevelPrediction + + +
    +
    + + + +
    +
    + + +
    +

    Creates settings for a ResNet model

    +
    + +
    +
    setResNet(
    +  numLayers = c(1:8),
    +  sizeHidden = c(2^(6:9)),
    +  hiddenFactor = c(1:4),
    +  residualDropout = c(seq(0, 0.5, 0.05)),
    +  hiddenDropout = c(seq(0, 0.5, 0.05)),
    +  normalization = c("BatchNorm"),
    +  activation = c("RelU"),
    +  sizeEmbedding = c(2^(6:9)),
    +  weightDecay = c(1e-06, 0.001),
    +  learningRate = c(0.01, 3e-04, 1e-05),
    +  seed = NULL,
    +  hyperParamSearch = "random",
    +  randomSample = 100,
    +  device = "cpu",
    +  batchSize = 1024,
    +  epochs = 30
    +)
    +
    + +
    +

    Arguments

    +
    numLayers
    +

    Number of layers in network, default: 1:16

    + + +
    sizeHidden
    +

    Number of neurons in each layer, default: 2^(6:10) (64 to 1024)

    + + +
    hiddenFactor
    +

    How much to grow the number of neurons in each ResLayer, default: 1:4

    + + +
    residualDropout
    +

    How much dropout to apply after last linear layer in ResLayer, default: seq(0, 0.3, 0.05)

    + + +
    hiddenDropout
    +

    How much dropout to apply after first linear layer in ResLayer, default: seq(0, 0.3, 0.05)

    + + +
    normalization
    +

    Which type of normalization to use. Default: 'Batchnorm'

    + + +
    activation
    +

    What kind of activation to use. Default: 'RelU'

    + + +
    sizeEmbedding
    +

    Size of embedding layer, default: 2^(6:9) (64 to 512)

    + + +
    weightDecay
    +

    Weight decay to apply, default: c(1e-6, 1e-3)

    + + +
    learningRate
    +

    Learning rate to use. default: c(1e-2, 1e-5)

    + + +
    seed
    +

    Seed to use for sampling hyperparameter space

    + + +
    hyperParamSearch
    +

    Which kind of hyperparameter search to use: random sampling or exhaustive grid search. Default: 'random'

    + + +
    randomSample
    +

    How many random samples from hyperparameter space to use

    + + +
    device
    +

    Which device to run analysis on, either 'cpu' or 'cuda', default: 'cpu'

    + + +
    batchSize
    +

    Size of batch, default: 1024

    + + +
    epochs
    +

    Number of epochs to run, default: 10

    + +
    +
    +

    Details

    +

    Model architecture from https://arxiv.org/abs/2106.11959

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.5.

    +
    + +
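The extras/example.R script later in this diff shows a full runPlp() call with these settings; condensed, a single fixed ResNet configuration can be requested like this:

resNetSettings <- setResNet(
  numLayers = 2,
  sizeHidden = 64,
  hiddenFactor = 1,
  residualDropout = 0,
  hiddenDropout = 0.2,
  sizeEmbedding = 64,
  weightDecay = 1e-6,
  learningRate = 3e-4,
  hyperParamSearch = "random",
  randomSample = 1,   # a single combination, so effectively no search
  device = "cpu",
  batchSize = 1024,
  epochs = 10,
  seed = 42
)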
    + + + + + + + + diff --git a/docs/reference/setTransformer.html b/docs/reference/setTransformer.html new file mode 100644 index 0000000..fb2503d --- /dev/null +++ b/docs/reference/setTransformer.html @@ -0,0 +1,192 @@ + +create settings for training a non-temporal transformer — setTransformer • DeepPatientLevelPrediction + + +
    +
    + + + +
    +
    + + +
    +

    A transformer model

    +
    + +
    +
    setTransformer(
    +  numBlocks = 3,
    +  dimToken = 96,
    +  dimOut = 1,
    +  numHeads = 8,
    +  attDropout = 0.25,
    +  ffnDropout = 0.25,
    +  resDropout = 0,
    +  dimHidden = 512,
    +  weightDecay = 1e-06,
    +  learningRate = 3e-04,
    +  batchSize = 1024,
    +  epochs = 10,
    +  device = "cpu",
    +  hyperParamSearch = "random",
    +  randomSamples = 100,
    +  seed = NULL
    +)
    +
    + +
    +

    Arguments

    +
    numBlocks
    +

    number of transformer blocks

    + + +
    dimToken
    +

    dimension of each token (embedding size)

    + + +
    dimOut
    +

    dimension of output, usually 1 for binary problems

    + + +
    numHeads
    +

    number of attention heads

    + + +
    attDropout
    +

    dropout to use on attentions

    + + +
    ffnDropout
    +

    dropout to use in feedforward block

    + + +
    resDropout
    +

    dropout to use in residual connections

    + + +
    dimHidden
    +

    dimension of the feedforward block

    + + +
    weightDecay
    +

    weight decay to use

    + + +
    learningRate
    +

    learning rate to use

    + + +
    batchSize
    +

    batchSize to use

    + + +
    epochs
    +

    How many epochs to run the model for

    + + +
    device
    +

    Which device to use, cpu or cuda

    + + +
    hyperParamSearch
    +

    what kind of hyperparameter search to do, default 'random'

    + + +
    randomSamples
    +

    How many samples to use in hyperparameter search if random

    + + +
    seed
    +

    Random seed to use

    + +
    +
    +

    Details

    +

    from https://arxiv.org/abs/2106.11959

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.5.

    +
    + +
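A condensed sketch of a single transformer configuration, loosely mirroring the commented-out call in extras/example.R later in this diff; the sizes are illustrative only:

transformerSettings <- setTransformer(
  numBlocks = 1,
  dimToken = 32,       # token/embedding dimension (kept divisible by numHeads)
  numHeads = 4,
  attDropout = 0.2,
  ffnDropout = 0.2,
  dimHidden = 64,
  batchSize = 512,
  epochs = 10,
  device = "cpu",
  randomSamples = 1,
  seed = 42
)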
    + + + + + + + + diff --git a/docs/reference/singleLayerNN.html b/docs/reference/singleLayerNN.html new file mode 100644 index 0000000..40fb76e --- /dev/null +++ b/docs/reference/singleLayerNN.html @@ -0,0 +1,123 @@ + +A single layer neural network — singleLayerNN • DeepPatientLevelPrediction + + +
    +
    + + + +
    +
    + + +
    +

    A single layer neural network

    +
    + +
    +
    singleLayerNN(inputN, layer1, outputN = 2, layer_dropout)
    +
    + +
    +

    Arguments

    +
    inputN
    +

    Input neurons

    + + +
    layer1
    +

    Layer 1 neurons

    + + +
    outputN
    +

    Output neurons

    + + +
    layer_dropout
    +

    Layer dropout to use

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.5.

    +
    + +
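A hypothetical call, assuming 100 input features and the default two output neurons:

net <- singleLayerNN(
  inputN = 100,        # input neurons
  layer1 = 32,         # layer 1 neurons
  outputN = 2,         # output neurons (the default)
  layer_dropout = 0.1  # layer dropout to use
)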
    + + + + + + + + diff --git a/docs/reference/tripleLayerNN.html b/docs/reference/tripleLayerNN.html new file mode 100644 index 0000000..b5e0969 --- /dev/null +++ b/docs/reference/tripleLayerNN.html @@ -0,0 +1,131 @@ + +Triple layer neural network — tripleLayerNN • DeepPatientLevelPrediction + + +
    +
    + + + +
    +
    + + +
    +

    Triple layer neural network

    +
    + +
    +
    tripleLayerNN(inputN, layer1, layer2, layer3, outputN, layer_dropout)
    +
    + +
    +

    Arguments

    +
    inputN
    +

    Input neurons

    + + +
    layer1
    +

    amount of layer 1 neurons

    + + +
    layer2
    +

    amount of layer 2 neurons

    + + +
    layer3
    +

    amount of layer 3 neurons

    + + +
    outputN
    +

    Number of output neurons

    + + +
    layer_dropout
    +

    The dropout to use in the layers

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.5.

    +
    + +
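And the three-hidden-layer variant, again with hypothetical sizes:

net <- tripleLayerNN(
  inputN = 100,
  layer1 = 128,
  layer2 = 64,
  layer3 = 32,
  outputN = 1,
  layer_dropout = 0.3
)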
    + + + + + + + + diff --git a/docs/sitemap.xml b/docs/sitemap.xml new file mode 100644 index 0000000..1e9115f --- /dev/null +++ b/docs/sitemap.xml @@ -0,0 +1,69 @@ + + + + /404.html + + + /articles/BuildingDeepModels.html + + + /articles/Installing.html + + + /articles/index.html + + + /authors.html + + + /index.html + + + /reference/Dataset.html + + + /reference/DeepPatientLevelPrediction.html + + + /reference/EarlyStopping.html + + + /reference/Estimator.html + + + /reference/doubleLayerNN.html + + + /reference/fitDeepNNTorch.html + + + /reference/fitEstimator.html + + + /reference/gridCvDeep.html + + + /reference/index.html + + + /reference/predictDeepEstimator.html + + + /reference/predictDeepNN.html + + + /reference/setDeepNNTorch.html + + + /reference/setResNet.html + + + /reference/setTransformer.html + + + /reference/singleLayerNN.html + + + /reference/tripleLayerNN.html + + diff --git a/extras/DeepPatientLevelPrediction.pdf b/extras/DeepPatientLevelPrediction.pdf new file mode 100644 index 0000000..d7bde43 Binary files /dev/null and b/extras/DeepPatientLevelPrediction.pdf differ diff --git a/extras/PackageMaintenance.R b/extras/PackageMaintenance.R new file mode 100644 index 0000000..0b86eda --- /dev/null +++ b/extras/PackageMaintenance.R @@ -0,0 +1,44 @@ +# @file PackageMaintenance +# +# Copyright 2022 Observational Health Data Sciences and Informatics +# +# This file is part of DeepPatientLevelPrediction +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Format and check code +styler::style_pkg() +OhdsiRTools::checkUsagePackage("DeepPatientLevelPrediction") +OhdsiRTools::updateCopyrightYearFolder() +devtools::spell_check() + + +# Create manual and vignettes +unlink("extras/DeepPatientLevelPrediction.pdf") +system("R CMD Rd2pdf ./ --output=extras/DeepPatientLevelPrediction.pdf") + +rmarkdown::render("vignettes/BuildingDeepModels.Rmd", + output_file = "../inst/doc/BuildingDeepModels.pdf", + rmarkdown::pdf_document(latex_engine = "pdflatex", + toc = TRUE, + toc_depth = 3, + number_sections = TRUE)) + + +rmarkdown::render("vignettes/Installing.Rmd", + output_file = "../inst/doc/Installing.pdf", + rmarkdown::pdf_document(latex_engine = "pdflatex", + toc = TRUE, + toc_depth = 3, + number_sections = TRUE)) + diff --git a/extras/example.R b/extras/example.R new file mode 100644 index 0000000..fa989b1 --- /dev/null +++ b/extras/example.R @@ -0,0 +1,54 @@ +# testing code (requires sequential branch of FeatureExtraction): +# rm(list = ls()) +library(PatientLevelPrediction) +library(DeepPatientLevelPrediction) + +data(plpDataSimulationProfile) +sampleSize <- 1e4 +plpData <- simulatePlpData( + plpDataSimulationProfile, + n = sampleSize +) + +populationSet <- PatientLevelPrediction::createStudyPopulationSettings( + requireTimeAtRisk = F, + riskWindowStart = 1, + riskWindowEnd = 365) + + +modelSettings <- setResNet(numLayers = 2, sizeHidden = 64, hiddenFactor = 1, + residualDropout = 0, hiddenDropout = 0.2, normalization = 'BatchNorm', + activation = 'RelU', sizeEmbedding = 512, weightDecay = 1e-6, + learningRate = 3e-4, seed = 42, hyperParamSearch = 'random', + randomSample = 1, device = 'cuda:0',batchSize = 32,epochs = 10) + +# modelSettings <- setTransformer(numBlocks=1, dimToken = 33, dimOut = 1, numHeads = 3, +# attDropout = 0.2, ffnDropout = 0.2, resDropout = 0, +# dimHidden = 8, batchSize = 32, hyperParamSearch = 'random', +# weightDecay = 1e-6, learningRate = 3e-4, epochs = 10, +# device = 'cuda:0', randomSamples = 1, seed = 42) + +res2 <- PatientLevelPrediction::runPlp( +plpData = plpData, +outcomeId = 3, +modelSettings = modelSettings, +analysisId = 'Test', +analysisName = 'Testing DeepPlp', +populationSettings = populationSet, +splitSettings = createDefaultSplitSetting(), +sampleSettings = createSampleSettings(), # none +featureEngineeringSettings = createFeatureEngineeringSettings(), # none +preprocessSettings = createPreprocessSettings(), +logSettings = createLogSettings(verbosity='TRACE'), +executeSettings = createExecuteSettings( + runSplitData = T, + runSampleData = F, + runfeatureEngineering = F, + runPreprocessData = T, + runModelDevelopment = T, + runCovariateSummary = T +), +saveDirectory = '~/test/new_plp/' +) + + diff --git a/extras/example_plp5.R b/extras/example_plp5.R new file mode 100644 index 0000000..0e65ecb --- /dev/null +++ b/extras/example_plp5.R @@ -0,0 +1,142 @@ +# testing code (requires sequential branch of FeatureExtraction): +rm(list = ls()) +library(FeatureExtraction) +library(PatientLevelPrediction) +library(DeepPatientLevelPrediction) +connectionDetails <- Eunomia::getEunomiaConnectionDetails() +Eunomia::createCohorts(connectionDetails) + +temp <- F + +covSet <- createCovariateSettings(useDemographicsGender = T, + useDemographicsAge = T, + useDemographicsRace = T, + useDemographicsEthnicity = T, + useDemographicsAgeGroup = T, + useConditionGroupEraLongTerm = T, + useDrugEraStartLongTerm = T, + endDays = -1 +) + +if(temp){ +covSetT <- createTemporalSequenceCovariateSettings(useDemographicsGender = T, + 
useDemographicsAge = T, + useDemographicsRace = T, + useDemographicsEthnicity = T, + useDemographicsAgeGroup = T, + useConditionEraGroupStart = T, + useDrugEraStart = T, + timePart = 'month', + timeInterval = 1, + sequenceEndDay = -1, + sequenceStartDay = -365*5) +} + + +databaseDetails <- PatientLevelPrediction::createDatabaseDetails( + connectionDetails = connectionDetails, + cdmDatabaseSchema = "main", + cohortDatabaseSchema = "main", + cohortTable = "cohort", + cohortId = 4, + outcomeIds = 3, + outcomeDatabaseSchema = "main", + outcomeTable = "cohort", + cdmDatabaseName = 'eunomia' +) + +restrictPlpDataSettings <- PatientLevelPrediction::createRestrictPlpDataSettings( + firstExposureOnly = T, + washoutPeriod = 365 +) + +plpData <- PatientLevelPrediction::getPlpData( + databaseDetails = databaseDetails, + restrictPlpDataSettings = restrictPlpDataSettings, + covariateSettings = covSet +) + +if(temp){ + plpDataT <- PatientLevelPrediction::getPlpData( + databaseDetails = databaseDetails, + restrictPlpDataSettings = restrictPlpDataSettings, + covariateSettings = covSetT + ) +} + + +populationSet <- PatientLevelPrediction::createStudyPopulationSettings( + requireTimeAtRisk = F, + riskWindowStart = 1, + riskWindowEnd = 365 + ) + +# code to train models +deepset <- setDeepNNTorch(units=list(c(128, 64), 128), layer_dropout=c(0.2), + lr =c(1e-4), decay=c(1e-5), outcome_weight = c(1.0), batch_size = c(100), + epochs= c(5), seed=NULL ) + + +#debug(fitDeepNNTorch) +# res <- runPlp(population = population, +# plpData = plpData, +# nfold = 3, +# modelSettings = deepset, +# savePlpData = F, +# savePlpResult = F, +# savePlpPlots = F, +# saveEvaluation = F) +# + + +resSet <- setResNet_plp5( + numLayers = list(5), + sizeHidden = list(256), + hiddenFactor = list(2), + residualDropout = list(0.1), + hiddenDropout = list(0.1), + normalization = list('BatchNorm'), + activation = list('RelU'), + sizeEmbedding = list(64), + weightDecay = list(1e-6), + learningRate = list(3e-4), + seed = 42, + hyperParamSearch = 'random', + randomSample = 1, + #device='cuda:0', + batchSize = 128, + epochs = 10 + ) + + +res2 <- runPlp( + plpData = plpData, + outcomeId = 3, + modelSettings = resSet, + analysisId = 'ResNet', + analysisName = 'Testing Deep Learning', + populationSettings = populationSet, + splitSettings = PatientLevelPrediction::createDefaultSplitSetting(), + sampleSettings = PatientLevelPrediction::createSampleSettings(), # none + featureEngineeringSettings = PatientLevelPrediction::createFeatureEngineeringSettings(), # none + preprocessSettings = PatientLevelPrediction::createPreprocessSettings(), + executeSettings = PatientLevelPrediction::createExecuteSettings( + runSplitData = T, + runSampleData = F, + runfeatureEngineering = F, + runPreprocessData = T, + runModelDevelopment = T, + runCovariateSummary = F + ), + saveDirectory = 'D:/testing/Deep' + ) + + +##predict.customLibrary(libraryName, predictionFunction, inputList){ +## libraryName <- 'PatientLevelPrediction' +## predictionFunction <- "createStudyPopulation" +## predictFun <- get(predictionFunction, envir = rlang::search_envs()[grep(paste0('package:', libraryName), search())][[1]]) +## +## prediction <- do.call(predictFun, inputList) +## return(prediction) +##} diff --git a/inst/python/TorchMap.py b/inst/python/TorchMap.py deleted file mode 100644 index ef8feac..0000000 --- a/inst/python/TorchMap.py +++ /dev/null @@ -1,19 +0,0 @@ -import torch - -def map_python_initiate(maxCol,maxRow, maxT=None): - if maxT != None: - matrix = 
torch.sparse.FloatTensor(torch.LongTensor([[0,0],[0,1],[0,0]]), torch.FloatTensor([0.,0.]), torch.Size([maxRow,maxCol,maxT])) - else: - matrix = torch.sparse.FloatTensor(torch.LongTensor([[0,0],[0,1]]), torch.FloatTensor([0.,0.]), torch.Size([maxRow,maxCol])) - return matrix - -def map_python(matrix, datas, maxCol,maxRow, maxT=None): - if maxT != None: - indexes= datas[:,0:3]-1 - matrixt = torch.sparse.FloatTensor(torch.LongTensor(indexes.T), torch.FloatTensor(datas[:,3]), torch.Size([maxRow,maxCol, maxT])) - matrix = matrix.add(matrixt) - else: - indexes= datas[:,0:2]-1 - matrixt = torch.sparse.FloatTensor(torch.LongTensor(indexes.T), torch.FloatTensor(datas[:,2]), torch.Size([maxRow,maxCol])) - matrix = matrix.add(matrixt) - return matrix diff --git a/inst/python/TorchUtils.py b/inst/python/TorchUtils.py deleted file mode 100755 index ccb21b9..0000000 --- a/inst/python/TorchUtils.py +++ /dev/null @@ -1,695 +0,0 @@ -""" - deepUtils.py - - Copyright 2016 Observational Health Data Sciences and Informatics - - This file is part of PatientLevelPrediction - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -""" - -import sys -import os -#import _pickle as cPickle -import pdb -import random -import numpy as np -import torch -import torch.nn as nn -import torch.nn.functional as F -from torch.autograd import Variable -from torch.utils.data import DataLoader, TensorDataset -from sklearn.externals import joblib -from sklearn.metrics import roc_auc_score -from collections import OrderedDict -output_dir = 'data' - - -class FocalLoss(nn.Module): - """ - Method to handle data imbalance based on paper (arXiv:1708.02002) entitled - Focal loss for dense object detection. - Loss(x, class) = - (1-softmax(x)[class])^gamma \log(softmax(x)[class]) - - """ - - def __init__(self, gamma=5, eps=1e-7, size_average=False): - super(FocalLoss, self).__init__() - self.gamma = gamma - self.eps = eps - self.size_average = size_average - - def forward(self, input, target): - y = self.one_hot(target, input.size(-1)) - logit = F.softmax(input) - logit = logit.clamp(self.eps, 1. - self.eps) - - loss = -1 * y * torch.log(logit) # cross entropy - loss = loss * (1 - logit) ** self.gamma # focal loss - - if self.size_average: - loss = loss.mean() - else: - loss = loss.sum() - return loss - - def one_hot(self, index, classes): - """ - - :param index: is the labels - :param classes: number if classes - :return: - """ - size = index.size() + (classes,) - view = index.size() + (1,) - - mask = torch.Tensor(*size).fill_(0) - index = index.view(*view) - ones = 1. 
- - if isinstance(index, Variable): - ones = Variable(torch.Tensor(index.size()).fill_(1)) - mask = Variable(mask, volatile=index.volatile) - if torch.cuda.is_available(): - ones = ones.cuda() - mask = mask.cuda() - - return mask.scatter_(1, index, ones) - -def loss_function(recon_x, x, mu, logvar): - """Loss function for varational autoencoder VAE""" - BCE = F.binary_cross_entropy(recon_x, x, size_average=False) - - # 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2) - KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp()) - - return BCE + KLD - - -def mixup_data(x, y, alpha=1.0): - - '''Compute the mixup data. Return mixed inputs, pairs of targets, and lambda - Data Augmentation method based on paper (arXiv:1710.09412) entitled - mixup: Beyond empirical risk minimization. - ''' - if alpha > 0.: - lam = np.random.beta(alpha, alpha) - else: - lam = 1. - batch_size = x.size()[0] - if torch.cuda.is_available(): - index = torch.randperm(batch_size).cuda() - else: - index = torch.randperm(batch_size) - - mixed_x = lam * x + (1 - lam) * x[index,:] - y_a, y_b = y, y[index] - return mixed_x, y_a, y_b, lam - -def mixup_criterion(y_a, y_b, lam): - return lambda criterion, pred: lam * criterion(pred, y_a) + (1 - lam) * criterion(pred, y_b) - -def early_stop(metrics_hist, patience = 3): - if not np.all(np.isnan(metrics_hist)): - return np.nanargmin(metrics_hist) > len(metrics_hist) - patience - else: - #keep training if criterion results have all been nan so far - return False - - -class Estimator(object): - """ - It is used for training different deep models in the same interface. - """ - - def __init__(self, model): - self.model = model - - def compile(self, optimizer, loss): - self.optimizer = optimizer - self.loss_f = loss - - def _fit(self, train_loader, l1regularization=False, autoencoder=False, mixup=False, vae = False): - """ - train one epoch - - :param train_loader: The data loaded using DataLoader - :param l1regularization: default False - :return: the return fitted loss and accuracy - """ - loss_list = [] - acc_list = [] - for idx, (X, y) in enumerate(train_loader): - X_v = Variable(X) - y_v = Variable(y) - if torch.cuda.is_available(): - X_v = X_v.cuda() - y_v = y_v.cuda() - - if mixup: - X_v, y_v_a, y_v_b, lam = mixup_data(X_v, y_v) - X_v, y_v_a, y_v_b = Variable(X_v), Variable(y_v_a), Variable(y_v_b) - - # print 'GPU id', torch.cuda.current_device() - self.optimizer.zero_grad() - # the below comemnted lines are used for multiple GPU training - # if torch.cuda.device_count() > 1: - # net = torch.nn.DataParallel(self.model, device_ids = range(torch.cuda.device_count())) - # if cuda: - # net = net.cuda() - # y_pred = net(X_v) - if autoencoder: - if vae: - y_pred, mu, logvar = self.model(X_v) - loss = loss_function(y_pred, X_v, mu, logvar) - else: - y_pred = self.model(X_v) - loss = self.loss_f(y_pred, X_v) - else: - y_pred = self.model(X_v) - - loss = self.loss_f(y_pred, y_v) - - if mixup: - loss_func = mixup_criterion(y_v_a, y_v_b, lam) - loss = loss_func(self.loss_f, y_pred) - - if l1regularization: - l1_crit = nn.L1Loss(size_average=False) - reg_loss = 0 - for param in self.model.parameters(): - target = Variable(torch.from_numpy(np.zeros(param.size()).astype(np.float32))) - if torch.cuda.is_available(): - target = target.cuda() - reg_loss += l1_crit(param, target) - - factor = 0.0005 - loss += factor * reg_loss - - loss.backward() - self.optimizer.step() - loss_list.append(loss.item()) - if autoencoder: - acc_list.append(0) - else: - classes = torch.topk(y_pred, 
1)[1].data.cpu().numpy().flatten() - acc = self._accuracy(classes, y_v.data.cpu().numpy().flatten()) - acc_list.append(acc) - del loss - del y_pred - - return sum(loss_list) / len(loss_list), sum(acc_list) / len(acc_list) - - def fit(self, X, y, batch_size=32, nb_epoch=10, validation_data=(), l1regularization=False, autoencoder =False, vae = False): - train_set = TensorDataset(torch.from_numpy(X.astype(np.float32)), - torch.from_numpy(y.astype(np.float32)).long().view(-1)) - train_loader = DataLoader(dataset=train_set, batch_size=batch_size, shuffle=True) - self.model.train() - for t in range(nb_epoch): - loss, acc = self._fit(train_loader, l1regularization=l1regularization, autoencoder = autoencoder, vae = vae) - #print loss - val_log = '' - if validation_data and not autoencoder: - val_loss, auc = self.evaluate(validation_data[0], validation_data[1], batch_size) - - val_log = "- val_loss: %06.4f - auc: %6.4f" % (val_loss, auc) - print(val_log) - # print("Epoch %s/%s loss: %06.4f - acc: %06.4f %s" % (t, nb_epoch, loss, acc, val_log)) - - def evaluate(self, X, y, batch_size=32): - y_pred = self.predict(X) - y_v = Variable(torch.from_numpy(y).long(), requires_grad=False) - if torch.cuda.is_available(): - y_v = y_v.cuda() - loss = self.loss_f(y_pred, y_v) - predict = y_pred.data.cpu().numpy()[:, 1].flatten() - auc = roc_auc_score(y, predict) - return loss.item(), auc - - def _accuracy(self, y_pred, y): - return float(sum(y_pred == y)) / y.shape[0] - - def predict(self, X): - X = Variable(torch.from_numpy(X.astype(np.float32))) - if torch.cuda.is_available(): - X = X.cuda() - y_pred = self.model(X) - return y_pred - - def predict_proba(self, X): - self.model.eval() - return self.model.predict_proba(X) - -class EarlyStopping(object): # pylint: disable=R0902 - """ - Gives a criterion to stop training when a given metric is not - improving anymore - Args: - mode (str): One of `min`, `max`. In `min` mode, training will - be stopped when the quantity monitored has stopped - decreasing; in `max` mode it will be stopped when the - quantity monitored has stopped increasing. Default: 'min'. - patience (int): Number of epochs with no improvement after - which training is stopped. For example, if - `patience = 2`, then we will ignore the first 2 epochs - with no improvement, and will only stop learning after the - 3rd epoch if the loss still hasn't improved then. - Default: 10. - threshold (float): Threshold for measuring the new optimum, - to only focus on significant changes. Default: 1e-4. - threshold_mode (str): One of `rel`, `abs`. In `rel` mode, - dynamic_threshold = best * ( 1 + threshold ) in 'max' - mode or best * ( 1 - threshold ) in `min` mode. - In `abs` mode, dynamic_threshold = best + threshold in - `max` mode or best - threshold in `min` mode. Default: 'rel'. 
- """ - - def __init__(self, mode='min', patience=3, threshold=1e-4, threshold_mode='rel'): - self.patience = patience - self.mode = mode - self.threshold = threshold - self.threshold_mode = threshold_mode - self.best = None - self.num_bad_epochs = None - self.mode_worse = None # the worse value for the chosen mode - self.is_better = None - self.last_epoch = -1 - self._init_is_better(mode=mode, threshold=threshold, - threshold_mode=threshold_mode) - self._reset() - - def _reset(self): - """Resets num_bad_epochs counter and cooldown counter.""" - self.best = self.mode_worse - self.num_bad_epochs = 0 - - def step(self, metrics, epoch=None): - """ Updates early stopping state """ - current = metrics - if epoch is None: - epoch = self.last_epoch = self.last_epoch + 1 - self.last_epoch = epoch - - if self.is_better(current, self.best): - self.best = current - self.num_bad_epochs = 0 - else: - self.num_bad_epochs += 1 - - @property - def stop(self): - """ Should we stop learning? """ - return self.num_bad_epochs > self.patience - - def _cmp(self, mode, threshold_mode, threshold, a, best): # pylint: disable=R0913, R0201 - if mode == 'min' and threshold_mode == 'rel': - rel_epsilon = 1. - threshold - return a < best * rel_epsilon - - elif mode == 'min' and threshold_mode == 'abs': - return a < best - threshold - - elif mode == 'max' and threshold_mode == 'rel': - rel_epsilon = threshold + 1. - return a > best * rel_epsilon - - return a > best + threshold - - def _init_is_better(self, mode, threshold, threshold_mode): - if mode not in {'min', 'max'}: - raise ValueError('mode ' + mode + ' is unknown!') - if threshold_mode not in {'rel', 'abs'}: - raise ValueError('threshold mode ' + threshold_mode + ' is unknown!') - - if mode == 'min': - self.mode_worse = float('inf') - else: # mode == 'max': - self.mode_worse = (-float('inf')) - - self.is_better = partial(self._cmp, mode, threshold_mode, threshold) - - def state_dict(self): - """ Returns early stopping state """ - return {key: value for key, value in self.__dict__.items() if key != 'is_better'} - - def load_state_dict(self, state_dict): - """ Loads early stopping state """ - self.__dict__.update(state_dict) - self._init_is_better(mode=self.mode, threshold=self.threshold, - threshold_mode=self.threshold_mode) - -def adjust_learning_rate(learning_rate, optimizer, epoch): - """Sets the learning rate to the initial LR decayed by 10 every 30 epochs""" - - lr = learning_rate * (0.1 ** (epoch // 10)) - - for param_group in optimizer.param_groups: - param_group['lr'] = lr - return lr - -def batch(tensor, batch_size = 50): - """ It is used to create batch samples, each batch has batch_size samples""" - tensor_list = [] - length = tensor.shape[0] - i = 0 - while True: - if (i+1) * batch_size >= length: - tensor_list.append(tensor[i * batch_size: length]) - return tensor_list - tensor_list.append(tensor[i * batch_size: (i+1) * batch_size]) - i += 1 - - -class selu(nn.Module): - def __init__(self): - super(selu, self).__init__() - self.alpha = 1.6732632423543772848170429916717 - self.scale = 1.0507009873554804934193349852946 - - def forward(self, x): - temp1 = self.scale * F.relu(x) - temp2 = self.scale * self.alpha * (F.elu(-1 * F.relu(-1 * x))) - return temp1 + temp2 - - -class alpha_drop(nn.Module): - def __init__(self, p=0.05, alpha=-1.7580993408473766, fixedPointMean=0, fixedPointVar=1): - super(alpha_drop, self).__init__() - keep_prob = 1 - p - self.a = np.sqrt( - fixedPointVar / (keep_prob * ((1 - keep_prob) * pow(alpha - fixedPointMean, 2) + 
fixedPointVar))) - self.b = fixedPointMean - self.a * (keep_prob * fixedPointMean + (1 - keep_prob) * alpha) - self.alpha = alpha - self.keep_prob = 1 - p - self.drop_prob = p - - def forward(self, x): - if self.keep_prob == 1 or not self.training: - # print("testing mode, direct return") - return x - else: - random_tensor = self.keep_prob + torch.rand(x.size()) - - binary_tensor = Variable(torch.floor(random_tensor)) - - if torch.cuda.is_available(): - binary_tensor = binary_tensor.cuda() - - x = x.mul(binary_tensor) - ret = x + self.alpha * (1 - binary_tensor) - ret.mul_(self.a).add_(self.b) - return ret - -def convert_to_3d_matrix(covariate_ids, patient_dict, y_dict = None, timeid_len = 31, cov_mean_dict = None): - """ - create matrix for temporal models. - - :param covariate_ids: the covariate ids in the whole data - :param patient_dict: the dictionary contains the data for each patient - :param y_dict: if the output labels is known, it contains the labels for patients - :param timeid_len: the total number time window gaps when extracting temporal data - :return: return the raw data in 3-D format, patients x covariates x number of windows, and the patients ids - """ - D = len(covariate_ids) - N = len(patient_dict) - T = timeid_len - concept_list =list(covariate_ids) - concept_list.sort() - x_raw = np.zeros((N, D, T), dtype=float) - patient_ind = 0 - p_ids = [] - patient_keys = patient_dict.keys() - #print covariate_ids - for kk in patient_keys: - #print('-------------------') - vals = patient_dict[kk] - #sorted(vals) - p_ids.append(int(kk)) - for timeid, meas in vals.iteritems(): - int_time = int(timeid) - 1 - for val in meas: - if not len(val): - continue - cov_id, cov_val = val - if cov_id not in covariate_ids: - continue - lab_ind = concept_list.index(cov_id) - if cov_mean_dict is None: - x_raw[patient_ind][lab_ind][int_time] = float(cov_val) - else: - mean_std = cov_mean_dict[cov_id] - if mean_std[1]: - x_raw[patient_ind][lab_ind][int_time] = (float(cov_val) - mean_std[0])/mean_std[1] - else: - x_raw[patient_ind][lab_ind][int_time] = float(cov_val) - - - patient_ind = patient_ind + 1 - - #impute the data using the value of previous timestamp - #fw = open('patient_var.txt', 'w') - for i in xrange(N): - for j in xrange(D): - temp = x_raw[i][j] - nonzero_inds = np.nonzero(temp)[0] - count_nonzeros = len(nonzero_inds) - #fw.write(str(i) + '\t' + str(count_nonzeros) + '\n') - if count_nonzeros == 1: - ind = nonzero_inds[0] - for k in xrange(ind + 1, T): - x_raw[i][j][k] = x_raw[i][j][ind] - elif count_nonzeros > 1: - for ind in xrange(1, count_nonzeros): - for k in xrange(nonzero_inds[ind -1] + 1, nonzero_inds[ind]): - x_raw[i][j][k] = x_raw[i][j][nonzero_inds[ind - 1]] - # For last nonzeros. - for k in xrange(nonzero_inds[-1] + 1, T): - x_raw[i][j][k] = x_raw[i][j][nonzero_inds[-1]] - - #fw.close() - - return x_raw, patient_keys - -def forward_impute_missing_value(x_raw): - N = x_raw.shape[0] - D = x_raw.shape[1] - T = x_raw.shape[2] - for i in xrange(N): - for j in xrange(D): - temp = x_raw[i][j] - nonzero_inds = np.nonzero(temp)[0] - count_nonzeros = len(nonzero_inds) - #fw.write(str(i) + '\t' + str(count_nonzeros) + '\n') - if count_nonzeros == 1: - ind = nonzero_inds[0] - for k in xrange(ind + 1, T): - x_raw[i][j][k] = x_raw[i][j][ind] - elif count_nonzeros > 1: - for ind in xrange(1, count_nonzeros): - for k in xrange(nonzero_inds[ind -1] + 1, nonzero_inds[ind]): - x_raw[i][j][k] = x_raw[i][j][nonzero_inds[ind - 1]] - # For last nonzeros. 
- for k in xrange(nonzero_inds[-1] + 1, T): - x_raw[i][j][k] = x_raw[i][j][nonzero_inds[-1]] - - -def convert_to_temporal_format(covariates, timeid_len= 31, normalize = True, predict = False): - """ - It reads the data from covariates extracted by FeatureExtraction package and convert it to temporal data matrix - - :param covariates: covariates extracted by FeatureExtraction package - :param timeid_len: the total number of window gaps when extracting temporal data - :return: return the raw data in 3-D format, patients x covariates x number of windows, and the patients ids - """ - patient_dict = OrderedDict() - print('Loading temporal data') - cov_vals_dict = {} - for row in covariates: - p_id, cov_id, time_id, cov_val = row[0], row[1], row[2], row[3] - cov_id = np.int64(cov_id) - #time_id = int(time_id) - cov_vals_dict.setdefault(cov_id, []).append(float(cov_val)) - if p_id not in patient_dict: - patient_dict[p_id] = {time_id: [(cov_id, cov_val)]} - else: - if time_id not in patient_dict[p_id]: - patient_dict[p_id][time_id] = [(cov_id, cov_val)] - else: - patient_dict[p_id][time_id].append((cov_id, cov_val)) - #covariate_ids.add(cov_id) - #T = 365/time_window - covariate_ids = set() - cov_mean_dict = {} - if not predict: - fw = open('covariate_mean_std.csv', 'w') - for key, val in cov_vals_dict.iteritems(): - mean_val = np.mean(val) - std_val = np.std(val) - - # Remove those covariates with few occurrence (<5) - if len(val) >= 5: - covariate_ids.add(key) - cov_mean_dict[key] = (mean_val, std_val) - fw.write(str(key) + ',' + str(mean_val) + ',' + str(std_val) + '\n') - fw.close() - else: - fp = open('covariate_mean_std.csv', 'r') - for line in fp: - values = line.rstrip().split(',') - key = np.int64(values[0]) - covariate_ids.add(key) - cov_mean_dict[key] = (float(values[1]), float(values[2])) - fp.close() - - - if normalize: - x, patient_keys = convert_to_3d_matrix(covariate_ids, patient_dict, timeid_len = timeid_len, cov_mean_dict = cov_mean_dict) - else: - x, patient_keys = convert_to_3d_matrix(covariate_ids, patient_dict, timeid_len=timeid_len) - - return x, patient_keys - - -def read_covariates(covariate_file): - patient_dict = {} - head = True - with open(covariate_file, 'r') as fp: - for line in fp: - if head: - head = False - continue - values = line.rstrip().split(',') - patient_id = values[1] - cov_id = values[2] - #time_id = int(values[-1]) - # covariates in one patient has time order - patient_dict.setdefault(patient_id, []).append((cov_id)) - new_patient = [] - for key in patient_dict.keys(): - #patient_dict[key].sort() - sort_vals = [] - for val in patient_dict[key]: - if val[1] not in sort_vals: - sort_vals.append(val) - new_patient.append(sort_vals) - - return new_patient - - -def word_embeddings(covariate_file, embedding_size=50): - import gensim.models.word2vec as w2v - modelname = "processed_%s.w2v" % ('heartfailure') - sentences = read_covariates(covariate_file) - model = w2v.Word2Vec(size=embedding_size, min_count=3, workers=4, iter=10, sg=1) - print("building word2vec vocab on %s..." 
% (covariate_file)) - - model.build_vocab(sentences) - print("training...") - model.train(sentences, total_examples=model.corpus_count, epochs=model.iter) - out_file = modelname - print("writing embeddings to %s" % (out_file)) - model.save(out_file) - return out_file - -def read_data(filename): - covariate_ids = set() - patient_dict = {} - head = True - with open(filename) as fp: - for lines in fp: - if head: - head = False - continue - lines = lines.strip('\n').strip('\r').split(',') - try: - p_id, cov_id, time_id, cov_val = lines[1], lines[2], lines[3], lines[4] - except: - pdb.set_trace() - print(p_id, cov_id, time_id) - if p_id not in patient_dict: - patient_dict[p_id] = {} - else: - if time_id not in patient_dict[p_id]: - patient_dict[p_id][time_id] = [] - else: - patient_dict[p_id][time_id].append((cov_id, cov_val)) - covariate_ids.add(cov_id) - patient_dict = {k: v for k, v in patient_dict.iteritems() if v} #remove empty patients - #(15, 2000, 60) 20000 patients, 15 lab tests, - return covariate_ids, patient_dict - -def split_training_validation(classes, validation_size=0.2, shuffle=False): - """split sampels based on balnace classes""" - num_samples = len(classes) - classes = np.array(classes) - classes_unique = np.unique(classes) - num_classes = len(classes_unique) - indices = np.arange(num_samples) - # indices_folds=np.zeros([num_samples],dtype=int) - training_indice = [] - training_label = [] - validation_indice = [] - validation_label = [] - for cl in classes_unique: - indices_cl = indices[classes == cl] - num_samples_cl = len(indices_cl) - - # split this class into k parts - if shuffle: - random.shuffle(indices_cl) # in-place shuffle - - # module and residual - num_samples_each_split = int(num_samples_cl * validation_size) - res = num_samples_cl - num_samples_each_split - - training_indice = training_indice + [val for val in indices_cl[num_samples_each_split:]] - training_label = training_label + [cl] * res - - validation_indice = validation_indice + [val for val in indices_cl[:num_samples_each_split]] - validation_label = validation_label + [cl] * num_samples_each_split - - training_index = np.arange(len(training_label)) - random.shuffle(training_index) - training_indice = np.array(training_indice)[training_index] - training_label = np.array(training_label)[training_index] - - validation_index = np.arange(len(validation_label)) - random.shuffle(validation_index) - validation_indice = np.array(validation_indice)[validation_index] - validation_label = np.array(validation_label)[validation_index] - - return training_indice, training_label, validation_indice, validation_label - - -if __name__ == "__main__": - x_raw = np.array([[1, 1, 0], [0,1,0]]) - x = [] - x.append(x_raw) - x.append(x_raw) - x = np.array(x) - #pdb.set_trace() - forward_impute_missing_value(x) - #filename = sys.argv[1] - #word_embeddings(filename) - ''' - population = joblib.load('/data/share/plp/SYNPUF/population.pkl') - # y = population[:, 1] - covriate_ids, patient_dict = read_data(filename) - # y_ids = np.array([int(val) for val in patient_dict.keys()]) - # Y = [] - y_dict = dict(zip(population[:, 0], population[:, 1])) - #x, patient_keys = convert_2_cnn_format(covariates, timeid_len = 31) - # for val in y_ids: - # Y.append(y_dict[y_ids]) - x_train, x_valid, x_test, Y_train, Y_valid, Y_test = convert_to_temporal_format(covriate_ids, patient_dict, y_dict) - ''' diff --git a/inst/python/deepTorch.py b/inst/python/deepTorch.py deleted file mode 100644 index 7425413..0000000 --- a/inst/python/deepTorch.py +++ 
/dev/null @@ -1,1186 +0,0 @@ -""" - deepTorch.py: It implements different deep learning classifiers - - Copyright 2016 Observational Health Data Sciences and Informatics - - This file is part of PatientLevelPrediction - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -""" - -import sys -import os -import pdb -import torch -import torch.nn as nn -import torch.nn.functional as F -from torch.autograd import Variable -from torch.utils.data import DataLoader, TensorDataset -from torch.nn.utils.rnn import pack_padded_sequence -from torch.nn.utils.rnn import pad_packed_sequence -from collections import OrderedDict -import timeit -import joblib -from sklearn.model_selection import train_test_split -from sklearn.metrics import roc_auc_score -import numpy as np - -import warnings -warnings.filterwarnings("ignore") - -if "python_dir" in globals(): - #print python_dir - sys.path.insert(0, python_dir) - -import TorchUtils as tu - - -class LogisticRegression(nn.Module): - """ - Train a logistic regression model using pytorch - """ - def __init__(self, input_size, num_classes = 2): - super(LogisticRegression, self).__init__() - self.linear = nn.Linear(input_size, num_classes) - - def forward(self, x): - out = self.linear(x) - out = torch.sigmoid(out) - return out - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -class MLP(nn.Module): - """ - Train a multiple-layer perceptron with one hideen layer - """ - def __init__(self, input_dim, hidden_size, num_classes = 2): - super(MLP, self).__init__() - self.fc1 = nn.Linear(input_dim, hidden_size) - self.fc2 = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - x = F.relu(self.fc1(x)) - x = F.dropout(x, p =0.5, training=self.training) - x = self.fc2(x) - x = torch.sigmoid(x) - return x - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -class SNN(nn.Module): - """ - Train a multiple-layer self normalizing neural network, ref arXiv:1706.02515 - """ - - def __init__(self, input_dim, hidden_size, num_classes=2): - super(SNN, self).__init__() - self.fc1 = nn.Linear(input_dim, hidden_size) - self.fc2 = tu.selu() - self.ad1 = tu.alpha_drop() - self.fc4 = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - x = F.relu(self.fc1(x)) - x = F.dropout(x, p=0.5, training=self.training) - x = self.fc2(x) - x = self.ad1(x) - x = self.fc4(x) - x = torch.sigmoid(x) - return x - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - -class 
AutoEncoder(nn.Module): - """ - A stacked autoencoder with 2 hiddden layers and need be adapted for EHR data. - """ - def __init__(self, input_size, encoding_size): - super(AutoEncoder, self).__init__() - - self.encoder = nn.Sequential( - nn.Linear(input_size, input_size/2), - nn.ReLU(True), - nn.Linear(input_size/2, input_size/4), - nn.ReLU(True), - nn.Linear(input_size/4, encoding_size), - nn.ReLU(True) - ) - self.decoder = nn.Sequential( - nn.Linear(encoding_size, input_size/4), - nn.ReLU(True), - nn.Linear(input_size/4, input_size/2), - nn.ReLU(True), - nn.Linear(input_size/2, input_size) - ) - - def forward(self, x): - if torch.cuda.is_available(): - x = x.cuda() - encoded = self.encoder(x) - decoded = self.decoder(encoded) - return decoded - - def get_encode_features(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - encoded = self.encoder(x) - encoded = encoded.data.cpu().numpy() - return encoded - -class VAE(nn.Module): - """ - A stacked variational autoencoder with 2 hiddden layers and need be adapted for EHR data. - """ - def __init__(self, input_size, encoding_size): - super(VAE, self).__init__() - - self.fc1 = nn.Linear(input_size, input_size/2) - self.fc21 = nn.Linear(input_size/2, encoding_size) - self.fc22 = nn.Linear(input_size/2, encoding_size) - self.fc3 = nn.Linear(encoding_size, input_size/2) - self.fc4 = nn.Linear(input_size/2, input_size) - - self.relu = nn.ReLU() - self.sigmoid = nn.Sigmoid() - - def encode(self, x): - h1 = self.relu(self.fc1(x)) - return self.fc21(h1), self.fc22(h1) - - def reparameterize(self, mu, logvar): - if self.training: - std = logvar.mul(0.5).exp_() - eps = Variable(std.data.new(std.size()).normal_()) - return eps.mul(std).add_(mu) - else: - return mu - - def decode(self, z): - h3 = self.relu(self.fc3(z)) - return self.sigmoid(self.fc4(h3)) - - def forward(self, x): - if torch.cuda.is_available(): - x = x.cuda() - mu, logvar = self.encode(x) - z = self.reparameterize(mu, logvar) - return self.decode(z), mu, logvar - - def get_encode_features(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - mu, logvar = self.encode(x) - encoded = self.reparameterize(mu, logvar) - encoded = encoded.data.cpu().numpy() - return encoded - -class Decoder(nn.Module): - """ VAE decoder input_size = original inputsize/16*256""" - def __init__(self, latent_size, input_size, img_channels = 1, kernel_size=(1, 4), stride=(1, 2), padding=(0, 1)): - super(Decoder, self).__init__() - self.latent_size = latent_size - self.img_channels = img_channels - - self.fc1 = nn.Linear(latent_size, input_size) - self.deconv1 = nn.ConvTranspose2d(input_size, 128, kernel_size, stride=stride, padding = padding) - self.deconv2 = nn.ConvTranspose2d(128, 64, kernel_size, stride=stride, padding = padding) - self.deconv3 = nn.ConvTranspose2d(64, 32, kernel_size, stride=stride, padding = padding) - self.deconv4 = nn.ConvTranspose2d(32, img_channels, kernel_size, stride=stride, padding = padding) - - def forward(self, x): # pylint: disable=arguments-differ - x = F.relu(self.fc1(x)) - x = x.unsqueeze(-1).unsqueeze(-1) - x = F.relu(self.deconv1(x)) - x = F.relu(self.deconv2(x)) - x = F.relu(self.deconv3(x)) - reconstruction = torch.sigmoid(self.deconv4(x)) - return reconstruction - -class Encoder(nn.Module): # pylint: disable=too-many-instance-attributes 
- """ VAE encoder """ - def __init__(self, latent_size, input_size, img_channels = 1, kernel_size=(1, 4), stride=(1, 2), padding=(0, 1)): - super(Encoder, self).__init__() - self.latent_size = latent_size - #self.img_size = img_size - self.img_channels = img_channels - - self.conv1 = nn.Conv2d(img_channels, 32, kernel_size, stride=stride, padding = padding) - self.conv2 = nn.Conv2d(32, 64, kernel_size, stride=stride, padding = padding) - self.conv3 = nn.Conv2d(64, 128, kernel_size, stride=stride, padding = padding) - self.conv4 = nn.Conv2d(128, 256, kernel_size, stride=stride, padding = padding) - out_size = input_size / 16 - self.fc_mu = nn.Linear(out_size, latent_size) - self.fc_logsigma = nn.Linear(out_size, latent_size) - - - def forward(self, x): # pylint: disable=arguments-differ - x = F.relu(self.conv1(x)) - x = F.relu(self.conv2(x)) - x = F.relu(self.conv3(x)) - x = F.relu(self.conv4(x)) - x = x.view(x.size(0), -1) - - mu = self.fc_mu(x) - logsigma = self.fc_logsigma(x) - - return mu, logsigma - -class VAE_CNN(nn.Module): - """ Variational Autoencoder """ - def __init__(self, latent_size, input_size): - super(VAE, self).__init__() - self.encoder = Encoder(latent_size, input_size) - input_size = input_size/16 - self.decoder = Decoder(latent_size, input_size) - - def forward(self, x): # pylint: disable=arguments-differ - mu, logsigma = self.encoder(x) - sigma = logsigma.exp() - eps = torch.randn_like(sigma) - z = eps.mul(sigma).add_(mu) - - recon_x = self.decoder(z) - return recon_x, mu, logsigma - - def get_encode_features(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - mu, logvar = self.encoder(x) - encoded = mu .data.cpu().numpy() - return encoded - -class CNN(nn.Module): - def __init__(self, nb_filter, num_classes = 2, kernel_size = (1, 5), pool_size = (1, 3), labcounts = 32, window_size = 12, hidden_size = 200, stride = (1, 1), padding = 0): - super(CNN, self).__init__() - self.layer1 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size, stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU(), - nn.MaxPool2d(pool_size, stride = stride)) - out1_size = (window_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 - maxpool_size = (out1_size + 2*padding - (pool_size[1] - 1) - 1)/stride[1] + 1 - self.layer2 = nn.Sequential( - nn.Conv2d(nb_filter, nb_filter, kernel_size, stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU(), - nn.MaxPool2d(pool_size, stride = stride)) - out2_size = (maxpool_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 - maxpool_size = (out2_size + 2*padding - (pool_size[1] - 1) - 1)/stride[1] + 1 - self.drop1 = nn.Dropout(p=0.5) - self.fc1 = nn.Linear(int(maxpool_size*labcounts*nb_filter), hidden_size) - self.bn = nn.BatchNorm1d(hidden_size) - self.drop2 = nn.Dropout(p=0.5) - self.relu1 = nn.ReLU() - self.fc2 = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - x = x.view(x.size(0), 1, x.size(1), x.size(2)) - out = self.layer1(x) - out = self.layer2(out) - out = out.view(out.size(0), -1) - out = self.drop1(out) - out = self.fc1(out) - out = self.drop2(out) - out = self.bn(out) - out = self.relu1(out) - out = self.fc2(out) - out = torch.sigmoid(out) - return out - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = 
self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -#allow multiple kernel with differnt kernel size -class CNN_MLF(nn.Module): - """ - It is a deep CNNs with three different kernel size, the outputs from the three CNNs are concatenated to fed into two fully connected layers. - """ - def __init__(self, nb_filter, num_classes = 2, kernel_size = (1, 5), pool_size = (1, 3), labcounts = 32, window_size = 12, hidden_size = 200, stride = (1, 1), padding = 0): - super(CNN_MLF, self).__init__() - self.layer1 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size = (1, 3), stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU(), - nn.MaxPool2d(pool_size, stride = stride)) - out1_size = (window_size + 2*padding - (3 - 1) - 1)/stride[1] + 1 - maxpool1_size = (out1_size + 2*padding - (pool_size[1] - 1) - 1)/stride[1] + 1 - self.layer2 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size = (1, 4), stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU(), - nn.MaxPool2d(pool_size, stride = stride)) - out2_size = (window_size + 2*padding - (4 - 1) - 1)/stride[1] + 1 #4 is the convolve filter size - maxpool2_size = (out2_size + 2*padding - (pool_size[1] - 1) - 1)/stride[1] + 1 - self.layer3 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size = (1, 5), stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU(), - nn.MaxPool2d(pool_size, stride = stride)) - out3_size = (window_size + 2*padding - (5 - 1) - 1)/stride[1] + 1 - maxpool3_size = (out3_size + 2*padding - (pool_size[1] - 1) - 1)/stride[1] + 1 - conv_outsize = maxpool1_size + maxpool2_size +maxpool3_size - self.drop1 = nn.Dropout(p=0.5) - self.fc1 = nn.Linear(conv_outsize*labcounts*nb_filter, hidden_size) - self.drop2 = nn.Dropout(p=0.5) - self.relu1 = nn.ReLU() - self.fc2 = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - x = x.view(x.size(0), 1, x.size(1), x.size(2)) - out1 = self.layer1(x) - out2 = self.layer2(x) - out3 = self.layer3(x) - out = torch.cat((out1.view(out1.size(0), -1), out2.view(out2.size(0), -1), out3.view(out2.size(0), -1)), 1) - out = self.drop1(out) - out = self.fc1(out) - out = self.drop2(out) - out = self.relu1(out) - out = self.fc2(out) - out = torch.sigmoid(out) - return out - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -class CNN_LSTM(nn.Module): - """ - It is a deep network with two layer CNN, followed by LSTM layer, which further fed into two fully connected layers. 
- """ - def __init__(self, nb_filter, num_classes = 2, kernel_size = (1, 5), pool_size = (1, 3), labcounts = 32, window_size = 12, hidden_size = 100, stride = (1, 1), padding = 0, num_layers = 2): - super(CNN_LSTM, self).__init__() - self.layer1 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size, stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU(), - nn.MaxPool2d(pool_size, stride = stride)) - self.num_layers = num_layers - self.hidden_size = hidden_size - out1_size = (window_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 - maxpool_size = (out1_size + 2*padding - (pool_size[1] - 1) - 1)/stride[1] + 1 - self.downsample = nn.Conv2d(nb_filter, 1, kernel_size, stride = stride, padding = padding) - input_size = (maxpool_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 - self.layer2 = nn.LSTM(input_size, hidden_size, num_layers, batch_first = True) - self.drop1 = nn.Dropout(p=0.5) - self.fc1 = nn.Linear(hidden_size, hidden_size) - self.drop2 = nn.Dropout(p=0.5) - self.relu1 = nn.ReLU() - self.fc2 = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - x = x.view(x.size(0), 1, x.size(1), x.size(2)) - out = self.layer1(x) - out = self.downsample(out) - out = torch.squeeze(out, 1) - if torch.cuda.is_available(): - x = x.cuda() - h0 = Variable(torch.zeros(self.num_layers, out.size(0), self.hidden_size)).cuda() - c0 = Variable(torch.zeros(self.num_layers, out.size(0), self.hidden_size)).cuda() - else: - h0 = Variable(torch.zeros(self.num_layers, out.size(0), self.hidden_size)) - c0 = Variable(torch.zeros(self.num_layers, out.size(0), self.hidden_size)) - out, hn = self.layer2(out, (h0, c0)) - out = hn[0][-1] - out = self.drop1(out) - out = self.fc1(out) - out = self.drop2(out) - out = self.relu1(out) - out = self.fc2(out) - out = torch.sigmoid(out) - return out - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -class CNN_MIX(nn.Module): - """ - It is a deep network with 2 layers CNN, which works on input and time dimension, respectively, more details refer to deepDianosis in github. 
- """ - def __init__(self, nb_filter, num_classes = 2, kernel_size = (1, 5), pool_size = (1, 3), labcounts = 32, window_size = 12, hidden_size = 100, stride = (1, 1), padding = 0): - super(CNN_MIX, self).__init__() - self.layer1 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size = (labcounts, 1), stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU()) - self.layer2 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size = (nb_filter, 1), stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU(), - nn.MaxPool2d(pool_size)) - out1_size = int(np.ceil(float(window_size)/pool_size[1])) - self.layer3 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size = kernel_size, stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU()) - - out2_size = (out1_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 - self.drop1 = nn.Dropout(p=0.5) - self.fc1 = nn.Linear(out2_size*nb_filter*nb_filter, hidden_size) - self.drop2 = nn.Dropout(p=0.5) - self.relu1 = nn.ReLU() - self.fc2 = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - x = x.view(x.size(0), 1, x.size(1), x.size(2)) - out = self.layer1(x) - out = out.view(out.size(0), out.size(2), out.size(1), out.size(3)) - out = self.layer2(out) - out = out.view(out.size(0), out.size(2), out.size(1), out.size(3)) - out = self.layer3(out) - out = out.view(out.size(0), -1) - out = self.drop1(out) - out = self.fc1(out) - out = self.drop2(out) - out = self.relu1(out) - out = self.fc2(out) - out = torch.sigmoid(out) - return out - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -class CNN_MULTI(nn.Module): - """ - It is a deep network with multiple resolution, more details refer to multiresconvnet of deepDianosis in github. 
- """ - def __init__(self, nb_filter, num_classes = 2, kernel_size = (1, 5), pool_size = (1, 2), labcounts = 32, window_size = 12, hidden_size = 100, stride = (1, 1), padding = 0): - super(CNN_MULTI, self).__init__() - # resolution 1 - self.pool1_1 = nn.MaxPool2d(pool_size, stride = pool_size) - maxpool_size = (window_size + 2*padding - (pool_size[1] - 1) - 1)/pool_size[1] + 1 - self.pool1_2 = nn.MaxPool2d(pool_size, stride = pool_size) - maxpool1_2_size = (maxpool_size + 2*padding - (pool_size[1] - 1) - 1)/pool_size[1] + 1 - - self.layer1 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size = kernel_size, stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU()) - cnn1_size = (maxpool1_2_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 - #resolution 2 - self.pool2_1 = nn.MaxPool2d(pool_size, stride = pool_size) - maxpool2_1_size = (window_size + 2*padding - (pool_size[1] - 1) - 1)/pool_size[1] + 1 - - self.layer2 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size = kernel_size, stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU()) - cnn2_size = (maxpool2_1_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 - self.layer3 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size = kernel_size, stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU(), - nn.MaxPool2d(pool_size)) - cnn3_size = (window_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 - maxpool3_size = (cnn3_size + 2*padding - (pool_size[1] - 1) - 1)/pool_size[1] + 1 - self.layer4 = nn.Sequential( - nn.Conv2d(nb_filter, nb_filter, kernel_size = kernel_size, stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU()) - cnn4_size = (maxpool3_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 - merge_size = cnn1_size + cnn2_size + cnn4_size - self.drop1 = nn.Dropout(p=0.5) - self.fc1 = nn.Linear(labcounts*nb_filter*merge_size, hidden_size) - self.drop2 = nn.Dropout(p=0.5) - self.relu1 = nn.ReLU() - self.fc2 = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - x = x.view(x.size(0), 1, x.size(1), x.size(2)) - out = self.pool1_1(x) - out = self.pool1_2(out) - out1 = self.layer1(out) - out = self.pool2_1(x) - out2 = self.layer2(out) - out = self.layer3(x) - out3 = self.layer4(out) - out = torch.cat((out1.view(out1.size(0), -1), out2.view(out2.size(0), -1), out3.view(out3.size(0), -1)), 1) - out = self.drop1(out) - out = self.fc1(out) - out = self.drop2(out) - out = self.relu1(out) - out = self.fc2(out) - out = torch.sigmoid(out) - return out - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -# 1x3 Convolution -def convR(in_channels, out_channels, kernel_size, stride=1, padding = (0, 1)): - return nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, - padding=padding, stride=stride, bias=False) - - -# Residual Block -class ResidualBlock(nn.Module): - def __init__(self, in_channel, nb_filter = 16, kernel_size = (1, 3), stride=1, downsample=None): - super(ResidualBlock, self).__init__() - self.conv1 = convR(in_channel, nb_filter, kernel_size = kernel_size, stride = stride) - self.bn1 = nn.BatchNorm2d(nb_filter) - self.relu = nn.ReLU(inplace=True) - self.conv2 = convR(nb_filter, nb_filter, kernel_size = kernel_size, stride = stride) - self.bn2 = nn.BatchNorm2d(nb_filter) - 
self.downsample = downsample - - def forward(self, x): - residual = x - out = self.conv1(x) - out = self.bn1(out) - out = self.relu(out) - out = self.conv2(out) - out = self.bn2(out) - if self.downsample: - residual = self.downsample(x) - out += residual - out = self.relu(out) - return out - - -# ResNet Module -class ResNet(nn.Module): - def __init__(self, block, layers, nb_filter = 16, labcounts = 12, window_size = 36, kernel_size = (1, 3), pool_size = (1, 3), num_classes=2, hidden_size = 100): - super(ResNet, self).__init__() - self.in_channels = 1 - self.conv = convR(self.in_channels, nb_filter, kernel_size = kernel_size) - self.bn = nn.BatchNorm2d(nb_filter) - self.relu = nn.ReLU(inplace=True) - self.layer1 = self.make_layer(block, nb_filter, layers[0], kernel_size = kernel_size) - self.layer2 = self.make_layer(block, nb_filter*2, layers[1], 1, kernel_size = kernel_size, in_channels = nb_filter) - self.layer3 = self.make_layer(block, nb_filter*4, layers[2], 1, kernel_size = kernel_size, in_channels = 2*nb_filter) - self.avg_pool = nn.AvgPool2d(pool_size) - avgpool2_1_size = (window_size - (pool_size[1] - 1) - 1)/pool_size[1] + 1 - last_layer_size = nb_filter*4*labcounts*avgpool2_1_size - self.fc = nn.Linear(last_layer_size, hidden_size) - self.drop2 = nn.Dropout(p=0.5) - self.relu1 = nn.ReLU() - self.fc2 = nn.Linear(hidden_size, num_classes) - - def make_layer(self, block, out_channels, blocks, stride=1, kernel_size = (1, 3), in_channels = 16): - downsample = None - if (stride != 1) or (self.in_channels != out_channels): - downsample = nn.Sequential( - convR(in_channels, out_channels, kernel_size = kernel_size, stride=stride), - nn.BatchNorm2d(out_channels)) - layers = [] - layers.append(block(in_channels, out_channels, kernel_size = kernel_size, stride = stride, downsample = downsample)) - self.in_channels = out_channels - for i in range(1, blocks): - layers.append(block(out_channels, out_channels, kernel_size = kernel_size)) - return nn.Sequential(*layers) - - def forward(self, x): - x = x.view(x.size(0), 1, x.size(1), x.size(2)) - out = self.conv(x) - out = self.bn(out) - out = self.relu(out) - out = self.layer1(out) - out = self.layer2(out) - out = self.layer3(out) - out = self.avg_pool(out) - out = out.view(out.size(0), -1) - out = self.fc(out) - out = self.drop2(out) - out = self.relu1(out) - out = self.fc2(out) - out = torch.sigmoid(out) - return out - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -class GRU(nn.Module): - """ - It is a deep network with one GRU layer, which are further fed into one fully connected layers. 
- """ - def __init__(self, input_size, hidden_size, num_layers, num_classes = 2, dropout = 0.5): - super(GRU, self).__init__() - - self.hidden_size = hidden_size - self.num_layers = num_layers - self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first = True, dropout = dropout) - self.linear = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - if torch.cuda.is_available(): - x = x.cuda() - h0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)).cuda() # 2 for bidirection - else: - h0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)) # 2 for bidirection - self.gru.flatten_parameters() - out, hn = self.gru(x, h0) - - rearranged = hn[-1] - out = self.linear(rearranged) - out = torch.sigmoid(out) - return out - - def initHidden(self, N): - return Variable(torch.randn(1, N, self.hidden_size)) - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -class RNN(nn.Module): - """ - It is a deep network with one LSTM layer, which are further fed into one fully connected layer. - """ - def __init__(self, input_size, hidden_size, num_layers, num_classes = 2, dropout = 0.5): - super(RNN, self).__init__() - self.hidden_size = hidden_size - self.num_layers = num_layers - self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first = True, dropout = dropout) - self.fc = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - if torch.cuda.is_available(): - x = x.cuda() - h0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)).cuda() - c0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)).cuda() - else: - h0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)) - c0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)) - self.lstm.flatten_parameters() - out, hn = self.lstm(x, (h0, c0)) - rearranged = hn[0][-1] - # Decode hidden state of last time step - out = self.fc(rearranged) - out = torch.sigmoid(out) - return out - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -class BiRNN(nn.Module): - """ - It is a deep network with one bidirectional LSTM layer, which are further fed into one fully connected layer. 
- """ - def __init__(self, input_size, hidden_size, num_layers, num_classes = 2, dropout = 0.5): - super(BiRNN, self).__init__() - self.hidden_size = hidden_size - self.num_layers = num_layers - self.lstm = nn.LSTM(input_size, hidden_size, num_layers, - batch_first = True, dropout = dropout, bidirectional=True) - self.fc = nn.Linear(hidden_size*2, num_classes) # 2 for bidirection - - def forward(self, x): - if torch.cuda.is_available(): - x = x.cuda() - h0 = Variable(torch.zeros(self.num_layers*2, x.size(0), self.hidden_size)).cuda() # 2 for bidirection - c0 = Variable(torch.zeros(self.num_layers*2, x.size(0), self.hidden_size)).cuda() - else: - h0 = Variable(torch.zeros(self.num_layers*2, x.size(0), self.hidden_size)) # 2 for bidirection - c0 = Variable(torch.zeros(self.num_layers*2, x.size(0), self.hidden_size)) - self.lstm.flatten_parameters() - out, hn = self.lstm(x, (h0, c0)) - hn = hn[0] - - rearranged = hn[-2:].view(x.size(0), -1) - # Decode hidden state of last time step - out = self.fc(rearranged) - out = torch.sigmoid(out) - return out - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -# select model -if __name__ == "__main__": - if model_type in ['LogisticRegression', 'MLP', 'SNN']: - y = population[:, 1] - X = plpData[population[:, 0], :] - trainInds = population[:, population.shape[1] - 1] > 0 - if class_weight == -1: - loss = tu.FocalLoss(gamma = 5) - else: - if class_weight == 0: - weights = float(np.count_nonzero(y))/y.shape[0] - class_weight = [1 - weights, weights] - else: - class_weight = [class_weight, 1] - class_weight = 1/torch.Tensor(class_weight) - if torch.cuda.is_available(): - class_weight = class_weight.cuda() - loss=nn.CrossEntropyLoss(weight = class_weight) - - print("Dataset has %s rows and %s columns" % (X.shape[0], X.shape[1])) - print("population loaded- %s rows and %s columns" % (np.shape(population)[0], np.shape(population)[1])) - ########################################################################### - l1regularization = False - - if train: - pred_size = int(np.sum(population[:, population.shape[1] - 1] > 0)) - print("Calculating prediction for train set of size %s" % (pred_size)) - test_pred = np.zeros(pred_size) # zeros length sum(population[:,population.size[1]] ==i) - for i in range(1, int(np.max(population[:, population.shape[1] - 1]) + 1), 1): - testInd = population[population[:, population.shape[1] - 1] > 0, population.shape[1] - 1] == i - trainInd = (population[population[:, population.shape[1] - 1] > 0, population.shape[1] - 1] != i) - train_x = X[trainInds, :][trainInd, :] - train_y = y[trainInds][trainInd] - - test_x = X[trainInds, :][testInd, :] - print("Fold %s split %s in train set and %s in test set" % (i, train_x.shape[0], test_x.shape[0])) - print("Train set contains %s outcomes " % (np.sum(train_y))) - train_x = train_x.toarray() - test_x = test_x.toarray() - - if autoencoder: - print('first train stakced autoencoder') - encoding_size = 256 - if vae: - auto_model = VAE(input_size=train_x.shape[1], encoding_size=encoding_size) - else: - auto_model = AutoEncoder(input_size=train_x.shape[1], encoding_size=encoding_size) - if torch.cuda.is_available(): - auto_model = auto_model.cuda() - clf = tu.Estimator(auto_model) - clf.compile(optimizer=torch.optim.Adam(auto_model.parameters(), lr=1e-3, weight_decay = w_decay), - 
loss=nn.MSELoss()) - clf.fit(train_x, train_y, batch_size=32, nb_epoch=epochs, autoencoder = autoencoder, vae = vae) - #split to batch for large dataset - train_batch = tu.batch(train_x, batch_size=32) - train_x = np.array([]).reshape(0, encoding_size) - for train in train_batch: - encode_train = auto_model.get_encode_features(train) - train_x = np.concatenate((train_x, encode_train), axis=0) - #train_x = auto_model.get_encode_features(train_x.toarray()) - #test_x = auto_model.get_encode_features(test_x.toarray()) - test_batch = tu.batch(test_x, batch_size=32) - test_x = np.array([]).reshape(0, encoding_size) - for test in test_batch: - encode_Test = auto_model.get_encode_features(test) - test_x = np.concatenate((test_x, encode_Test), axis=0) - del auto_model - del clf - # train on fold - print("Training fold %s" % (i)) - start_time = timeit.default_timer() - if model_type == 'LogisticRegression': - model = LogisticRegression(train_x.shape[1]) - l1regularization = True - elif model_type == 'SNN': - model = SNN(train_x.shape[1], size) - else: - model = MLP(train_x.shape[1], size) - - if torch.cuda.is_available(): - model = model.cuda() - clf = tu.Estimator(model) - clf.compile(optimizer=torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay = w_decay), - loss=loss) - #if not autoencoder: - # train_x = train_x.toarray() - # test_x = test_x.toarray() - - clf.fit(train_x, train_y, batch_size=32, nb_epoch=epochs, l1regularization = l1regularization) - - ind = (population[:, population.shape[1] - 1] > 0) - ind = population[ind, population.shape[1] - 1] == i - - test_input_var = torch.from_numpy(test_x.astype(np.float32)) - - test_batch = tu.batch(test_x, batch_size = 32) - temp = [] - for test in test_batch: - pred_test1 = model.predict_proba(test)[:, 1] - temp = np.concatenate((temp, pred_test1), axis = 0) - #temp = model.predict_proba(test_input_var)[:, 1] - #temp = preds.data.cpu().numpy().flatten() - #print temp - test_pred[ind] = temp - print("Prediction complete: %s rows " % (np.shape(test_pred[ind])[0])) - print("Mean: %s prediction value" % (np.mean(test_pred[ind]))) - - # merge pred with indexes[testInd,:] - test_pred.shape = (population[population[:, population.shape[1] - 1] > 0, :].shape[0], 1) - prediction = np.append(population[population[:, population.shape[1] - 1] > 0, :], test_pred, axis=1) - - # train final: - else: - print("Training final neural network model on all train data...") - print("X- %s rows and Y %s length" % (X[trainInds, :].shape[0], y[trainInds].shape[0])) - - start_time = timeit.default_timer() - - train_x = X[trainInds, :] - train_x = train_x.toarray() - train_y = y[trainInds] - if not os.path.exists(modelOutput): - os.makedirs(modelOutput) - if autoencoder: - encoding_size = 256 - if vae: - auto_model = VAE(input_size=train_x.shape[1], encoding_size=encoding_size) - else: - auto_model = AutoEncoder(input_size=train_x.shape[1], encoding_size=encoding_size) - if torch.cuda.is_available(): - auto_model = auto_model.cuda() - clf = tu.Estimator(auto_model) - clf.compile(optimizer=torch.optim.Adam(auto_model.parameters(), lr=1e-3, weight_decay=w_decay), - loss=nn.MSELoss()) - - clf.fit(train_x, train_y, batch_size=32, nb_epoch=epochs, autoencoder=autoencoder, vae = vae) - #train_x = auto_model.get_encode_features(train_x.toarray()) - train_batch = tu.batch(train_x, batch_size=32) - train_x = np.array([]).reshape(0, encoding_size) - for train in train_batch: - encode_train = auto_model.get_encode_features(train) - train_x = np.concatenate((train_x, encode_train), 
axis=0) - joblib.dump(auto_model, os.path.join(modelOutput, 'autoencoder_model.pkl')) - del auto_model - del clf - - print('the final parameter epochs %.2f weight_decay %.2f' %(epochs,w_decay)) - if model_type == 'LogisticRegression': - model = LogisticRegression(train_x.shape[1]) - l1regularization = True - elif model_type == 'SNN': - model = SNN(train_x.shape[1], size) - else: - model = MLP(train_x.shape[1], size) - - #if not autoencoder: - # train_x = train_x.toarray() - - if torch.cuda.is_available(): - model = model.cuda() - clf = tu.Estimator(model) - clf.compile(optimizer=torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay = w_decay), - loss=loss) - clf.fit(train_x, train_y, batch_size=32, nb_epoch=epochs, l1regularization = l1regularization) - - end_time = timeit.default_timer() - print("Training final took: %.2f s" % (end_time - start_time)) - - # save the model: - print("Model saved to: %s" % (modelOutput)) - - joblib.dump(model, os.path.join(modelOutput,'model.pkl')) - - - elif model_type in ['CNN', 'RNN', 'CNN_LSTM', 'CNN_MLF', 'CNN_MIX', 'GRU', 'BiRNN', 'CNN_MULTI', 'ResNet']: - #print 'running model', model_type - y = population[:, 1] - #plpData = plpData[population[:, 0], :] - #config = tf.ConfigProto() - #config.gpu_options.allow_growth = True - #with tf.Session(config=config) as sess: - # X = tf.sparse_reorder(plpData) - # X = tf.sparse_tensor_to_dense(X) - # X = sess.run(X) - #tu.forward_impute_missing_value(X) - X = plpData.to_dense().numpy() - X = X[np.int64(population[:, 0]), :] - ''' - p_ids_in_cov = set(covariates[:, 0]) - full_covariates = np.array([]).reshape(0,4) - default_covid = covariates[0, 1] - timeid_len = len(set(covariates[:, -2])) - for p_id in population[:, 0]: - if p_id not in p_ids_in_cov: - tmp_x = np.array([p_id, default_covid, 1, 0]).reshape(1,4) #default cov id, timeid=1 - full_covariates = np.concatenate((full_covariates, tmp_x), axis=0) - else: - tmp_x = covariates[covariates[:, 0] == p_id, :] - full_covariates = np.concatenate((full_covariates, tmp_x), axis=0) - - trainInds = population[:, population.shape[1] - 1] > 0 - X, patient_keys = tu.convert_to_temporal_format(full_covariates, timeid_len= timeid_len) - full_covariates = [] - ''' - if class_weight == -1: - loss = tu.FocalLoss(gamma = 3) - else: - if class_weight == 0: - weights = float(np.count_nonzero(y))/y.shape[0] - class_weight = [1 - weights, weights] - else: - class_weight = [class_weight, 1] - class_weight = 1/torch.Tensor(class_weight) - if torch.cuda.is_available(): - class_weight = class_weight.cuda() - loss=nn.CrossEntropyLoss(weight = class_weight) - trainInds = population[:, population.shape[1] - 1] > 0 - if train: - pred_size = int(np.sum(population[:, population.shape[1] - 1] > 0)) - print("Calculating prediction for train set of size %s" % (pred_size)) - test_pred = np.zeros(pred_size) # zeros length sum(population[:,population.size[1]] ==i) - for i in range(1, int(np.max(population[:, population.shape[1] - 1]) + 1), 1): - testInd = population[population[:, population.shape[1] - 1] > 0, population.shape[1] - 1] == i - trainInd = (population[population[:, population.shape[1] - 1] > 0, population.shape[1] - 1] != i) - train_x = X[trainInds, :][trainInd, :] - train_y = y[trainInds][trainInd] - - test_x = X[trainInds, :][testInd, :] - print("Fold %s split %s in train set and %s in test set" % (i, train_x.shape[0], test_x.shape[0])) - print("Train set contains %s outcomes " % (np.sum(train_y))) - - # train on fold - learning_rate = 0.001 - print("Training fold %s" % (i)) 
- start_time = timeit.default_timer() - if model_type == 'CNN': - model = CNN(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) - elif model_type == 'CNN_LSTM': - model = CNN_LSTM(nb_filter = nbfilters, labcounts=train_x.shape[1], window_size=train_x.shape[2]) - elif model_type == 'CNN_MLF': # multiple kernels with different size - model = CNN_MLF(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) - elif model_type == 'CNN_MIX': # mixed model from deepDiagnosis - model = CNN_MIX(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) - elif model_type == 'CNN_MULTI': # multiple resolution model from deepDiagnosis - model = CNN_MULTI(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) - elif model_type == 'ResNet': - print('train ResNet') - model = ResNet(ResidualBlock, [3, 3, 3], nb_filter=nbfilters, labcounts=train_x.shape[1], window_size=train_x.shape[2]) - elif model_type == 'RNN': - model = RNN(train_x.shape[2], hidden_size, 2, 2) - elif model_type == 'BiRNN': - model = BiRNN(train_x.shape[2], hidden_size, 2, 2) - elif model_type == 'GRU': - model = GRU(train_x.shape[2], hidden_size, 2, 2) - else: - print('temproal data not supported by this model') - - if torch.cuda.is_available(): - model = model.cuda() - clf = tu.Estimator(model) - clf.compile(optimizer=torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay = 0.0001), - loss=loss) - - clf.fit(train_x, train_y, batch_size=32, nb_epoch=epochs) - - ind = (population[:, population.shape[1] - 1] > 0) - ind = population[ind, population.shape[1] - 1] == i - - test_batch = tu.batch(test_x, batch_size = 32) - temp = [] - for test in test_batch: - pred_test1 = model.predict_proba(test)[:, 1] - temp = np.concatenate((temp, pred_test1), axis = 0) - - test_pred[ind] = temp - del model - print("Prediction complete: %s rows " % (np.shape(test_pred[ind])[0])) - print("Mean: %s prediction value" % (np.mean(test_pred[ind]))) - - # merge pred with indexes[testInd,:] - test_pred.shape = (population[population[:, population.shape[1] - 1] > 0, :].shape[0], 1) - prediction = np.append(population[population[:, population.shape[1] - 1] > 0, :], test_pred, axis=1) - - # train final: - else: - print("Training final neural network model on all train data...") - print("X- %s rows and Y %s length" % (X[trainInds, :].shape[0], y[trainInds].shape[0])) - - start_time = timeit.default_timer() - - train_x = X[trainInds, :] - train_y = y[trainInds] - learning_rate = 0.001 - - if model_type == 'CNN': - model = CNN(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) - elif model_type == 'CNN_LSTM': - model = CNN_LSTM(nb_filter=nbfilters, labcounts=train_x.shape[1], window_size=train_x.shape[2]) - elif model_type == 'CNN_MLF': # multiple kernels with different size - model = CNN_MLF(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) - elif model_type == 'CNN_MIX': #mixed model from deepDiagnosis - model = CNN_MIX(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) - elif model_type == 'CNN_MULTI': # multi resolution model from deepDiagnosis - model = CNN_MULTI(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) - elif model_type == 'ResNet': - model = ResNet(ResidualBlock, [3, 3, 3], nb_filter=nbfilters, labcounts=train_x.shape[1], window_size=train_x.shape[2]) - elif model_type == 'RNN': - model = 
RNN(train_x.shape[2], hidden_size, 2, 2) - elif model_type == 'BiRNN': - model = BiRNN(train_x.shape[2], hidden_size, 2, 2) - elif model_type == 'GRU': - model = GRU(train_x.shape[2], hidden_size, 2, 2) - else: - print('temproal data not supported by this model') - - if torch.cuda.is_available(): - model = model.cuda() - clf = tu.Estimator(model) - clf.compile(optimizer=torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay = 0.0001), - loss=loss) - clf.fit(train_x, train_y, batch_size=32, nb_epoch=epochs) - - end_time = timeit.default_timer() - print("Training final took: %.2f s" % (end_time - start_time)) - - # save the model: - if not os.path.exists(modelOutput): - os.makedirs(modelOutput) - print("Model saved to: %s" % (modelOutput)) - - joblib.dump(model, os.path.join(modelOutput,'model.pkl')) - - # prediction on train: - test_batch = tu.batch(train_x, batch_size = 32) - test_pred = [] - for test in test_batch: - pred_test1 = model.predict_proba(test)[:, 1] - test_pred = np.concatenate((test_pred, pred_test1), axis = 0) - test_pred.shape = (population.shape[0], 1) - prediction = np.append(population, test_pred, axis=1) - -''' -if __name__ == "__main__": - DATA_SIZE = 1000 - INPUT_SIZE = 36 - HIDDEN_SIZE = 100 - class_size = 2 - #X = np.random.randn(DATA_SIZE * class_size, 18, INPUT_SIZE) - X = np.random.randn(DATA_SIZE * class_size, INPUT_SIZE) - y = np.array([i for i in range(class_size) for _ in range(DATA_SIZE)]) - - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2) - model = LogisticRegression(X_train.shape[1]) - l1regularization = True - #model = CNN_LSTM(nb_filter = 16, labcounts = X.shape[1], window_size = X.shape[2]) - #model = ResNet(ResidualBlock, [3, 3, 3], nb_filter = 16, labcounts = X.shape[1], window_size = X.shape[2]) - #model = RNN(INPUT_SIZE, HIDDEN_SIZE, 2, class_size) - #pdb.set_trace() - if cuda: - model = model.cuda() - clf = Estimator(model) - clf.compile(optimizer=torch.optim.Adam(model.parameters(), lr=1e-4), - loss=nn.CrossEntropyLoss()) - clf.fit(X_train, y_train, batch_size=64, nb_epoch=10, - validation_data=(X_test, y_test), l1regularization = l1regularization) - score, auc = clf.evaluate(X_test, y_test) - - print('Test score: %s' %(auc)) -''' diff --git a/inst/python/deepTorchFunctions.py b/inst/python/deepTorchFunctions.py deleted file mode 100644 index ff2f6f0..0000000 --- a/inst/python/deepTorchFunctions.py +++ /dev/null @@ -1,1732 +0,0 @@ -""" - deepTorch.py: It implements different deep learning classifiers - - Copyright 2016 Observational Health Data Sciences and Informatics - - This file is part of PatientLevelPrediction - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-""" - -import sys -import os -import pdb -import torch -import torch.nn as nn -import torch.nn.functional as F -from torch.autograd import Variable -from torch.utils.data import DataLoader, TensorDataset -from torch.nn.utils.rnn import pack_padded_sequence -from torch.nn.utils.rnn import pad_packed_sequence -from collections import OrderedDict -import timeit -import joblib -from sklearn.model_selection import train_test_split -from sklearn.metrics import roc_auc_score -import numpy as np - -import warnings -warnings.filterwarnings("ignore") - -class FocalLoss(nn.Module): - """ - Method to handle data imbalance based on paper (arXiv:1708.02002) entitled - Focal loss for dense object detection. - Loss(x, class) = - (1-softmax(x)[class])^gamma \log(softmax(x)[class]) - - """ - - def __init__(self, gamma=5, eps=1e-7, size_average=False): - super(FocalLoss, self).__init__() - self.gamma = gamma - self.eps = eps - self.size_average = size_average - - def forward(self, input, target): - y = self.one_hot(target, input.size(-1)) - logit = F.softmax(input) - logit = logit.clamp(self.eps, 1. - self.eps) - - loss = -1 * y * torch.log(logit) # cross entropy - loss = loss * (1 - logit) ** self.gamma # focal loss - - if self.size_average: - loss = loss.mean() - else: - loss = loss.sum() - return loss - - def one_hot(self, index, classes): - """ - - :param index: is the labels - :param classes: number if classes - :return: - """ - size = index.size() + (classes,) - view = index.size() + (1,) - - mask = torch.Tensor(*size).fill_(0) - index = index.view(*view) - ones = 1. - - if isinstance(index, Variable): - ones = Variable(torch.Tensor(index.size()).fill_(1)) - mask = Variable(mask, volatile=index.volatile) - if torch.cuda.is_available(): - ones = ones.cuda() - mask = mask.cuda() - - return mask.scatter_(1, index, ones) - -def loss_function(recon_x, x, mu, logvar): - """Loss function for varational autoencoder VAE""" - BCE = F.binary_cross_entropy(recon_x, x, size_average=False) - - # 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2) - KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp()) - - return BCE + KLD - - -def mixup_data(x, y, alpha=1.0): - - '''Compute the mixup data. Return mixed inputs, pairs of targets, and lambda - Data Augmentation method based on paper (arXiv:1710.09412) entitled - mixup: Beyond empirical risk minimization. - ''' - if alpha > 0.: - lam = np.random.beta(alpha, alpha) - else: - lam = 1. - batch_size = x.size()[0] - if torch.cuda.is_available(): - index = torch.randperm(batch_size).cuda() - else: - index = torch.randperm(batch_size) - - mixed_x = lam * x + (1 - lam) * x[index,:] - y_a, y_b = y, y[index] - return mixed_x, y_a, y_b, lam - -def mixup_criterion(y_a, y_b, lam): - return lambda criterion, pred: lam * criterion(pred, y_a) + (1 - lam) * criterion(pred, y_b) - -def early_stop(metrics_hist, patience = 3): - if not np.all(np.isnan(metrics_hist)): - return np.nanargmin(metrics_hist) > len(metrics_hist) - patience - else: - #keep training if criterion results have all been nan so far - return False - - -class Estimator(object): - """ - It is used for training different deep models in the same interface. 
- """ - - def __init__(self, model): - self.model = model - - def compile(self, optimizer, loss): - self.optimizer = optimizer - self.loss_f = loss - - def _fit(self, train_loader, l1regularization=False, autoencoder=False, mixup=False, vae = False): - """ - train one epoch - - :param train_loader: The data loaded using DataLoader - :param l1regularization: default False - :return: the return fitted loss and accuracy - """ - loss_list = [] - acc_list = [] - for idx, (X, y) in enumerate(train_loader): - X_v = Variable(X) - y_v = Variable(y) - if torch.cuda.is_available(): - X_v = X_v.cuda() - y_v = y_v.cuda() - - if mixup: - X_v, y_v_a, y_v_b, lam = mixup_data(X_v, y_v) - X_v, y_v_a, y_v_b = Variable(X_v), Variable(y_v_a), Variable(y_v_b) - - # print 'GPU id', torch.cuda.current_device() - self.optimizer.zero_grad() - # the below comemnted lines are used for multiple GPU training - # if torch.cuda.device_count() > 1: - # net = torch.nn.DataParallel(self.model, device_ids = range(torch.cuda.device_count())) - # if cuda: - # net = net.cuda() - # y_pred = net(X_v) - if autoencoder: - if vae: - y_pred, mu, logvar = self.model(X_v) - loss = loss_function(y_pred, X_v, mu, logvar) - else: - y_pred = self.model(X_v) - loss = self.loss_f(y_pred, X_v) - else: - y_pred = self.model(X_v) - - loss = self.loss_f(y_pred, y_v) - - if mixup: - loss_func = mixup_criterion(y_v_a, y_v_b, lam) - loss = loss_func(self.loss_f, y_pred) - - if l1regularization: - l1_crit = nn.L1Loss(size_average=False) - reg_loss = 0 - for param in self.model.parameters(): - target = Variable(torch.from_numpy(np.zeros(param.size()).astype(np.float32))) - if torch.cuda.is_available(): - target = target.cuda() - reg_loss += l1_crit(param, target) - - factor = 0.0005 - loss += factor * reg_loss - - loss.backward() - self.optimizer.step() - loss_list.append(loss.item()) - if autoencoder: - acc_list.append(0) - else: - classes = torch.topk(y_pred, 1)[1].data.cpu().numpy().flatten() - acc = self._accuracy(classes, y_v.data.cpu().numpy().flatten()) - acc_list.append(acc) - del loss - del y_pred - - return sum(loss_list) / len(loss_list), sum(acc_list) / len(acc_list) - - def fit(self, X, y, batch_size=32, nb_epoch=10, validation_data=(), l1regularization=False, autoencoder =False, vae = False): - train_set = TensorDataset(torch.from_numpy(X.astype(np.float32)), - torch.from_numpy(y.astype(np.float32)).long().view(-1)) - train_loader = DataLoader(dataset=train_set, batch_size=batch_size, shuffle=True) - self.model.train() - for t in range(nb_epoch): - loss, acc = self._fit(train_loader, l1regularization=l1regularization, autoencoder = autoencoder, vae = vae) - #print loss - val_log = '' - if validation_data and not autoencoder: - val_loss, auc = self.evaluate(validation_data[0], validation_data[1], batch_size) - - val_log = "- val_loss: %06.4f - auc: %6.4f" % (val_loss, auc) - print(val_log) - # print("Epoch %s/%s loss: %06.4f - acc: %06.4f %s" % (t, nb_epoch, loss, acc, val_log)) - - def evaluate(self, X, y, batch_size=32): - y_pred = self.predict(X) - y_v = Variable(torch.from_numpy(y).long(), requires_grad=False) - if torch.cuda.is_available(): - y_v = y_v.cuda() - loss = self.loss_f(y_pred, y_v) - predict = y_pred.data.cpu().numpy()[:, 1].flatten() - auc = roc_auc_score(y, predict) - return loss.item(), auc - - def _accuracy(self, y_pred, y): - return float(sum(y_pred == y)) / y.shape[0] - - def predict(self, X): - X = Variable(torch.from_numpy(X.astype(np.float32))) - if torch.cuda.is_available(): - X = X.cuda() - y_pred = 
self.model(X) - return y_pred - - def predict_proba(self, X): - self.model.eval() - return self.model.predict_proba(X) - -class EarlyStopping(object): # pylint: disable=R0902 - """ - Gives a criterion to stop training when a given metric is not - improving anymore - Args: - mode (str): One of `min`, `max`. In `min` mode, training will - be stopped when the quantity monitored has stopped - decreasing; in `max` mode it will be stopped when the - quantity monitored has stopped increasing. Default: 'min'. - patience (int): Number of epochs with no improvement after - which training is stopped. For example, if - `patience = 2`, then we will ignore the first 2 epochs - with no improvement, and will only stop learning after the - 3rd epoch if the loss still hasn't improved then. - Default: 10. - threshold (float): Threshold for measuring the new optimum, - to only focus on significant changes. Default: 1e-4. - threshold_mode (str): One of `rel`, `abs`. In `rel` mode, - dynamic_threshold = best * ( 1 + threshold ) in 'max' - mode or best * ( 1 - threshold ) in `min` mode. - In `abs` mode, dynamic_threshold = best + threshold in - `max` mode or best - threshold in `min` mode. Default: 'rel'. - """ - - def __init__(self, mode='min', patience=3, threshold=1e-4, threshold_mode='rel'): - self.patience = patience - self.mode = mode - self.threshold = threshold - self.threshold_mode = threshold_mode - self.best = None - self.num_bad_epochs = None - self.mode_worse = None # the worse value for the chosen mode - self.is_better = None - self.last_epoch = -1 - self._init_is_better(mode=mode, threshold=threshold, - threshold_mode=threshold_mode) - self._reset() - - def _reset(self): - """Resets num_bad_epochs counter and cooldown counter.""" - self.best = self.mode_worse - self.num_bad_epochs = 0 - - def step(self, metrics, epoch=None): - """ Updates early stopping state """ - current = metrics - if epoch is None: - epoch = self.last_epoch = self.last_epoch + 1 - self.last_epoch = epoch - - if self.is_better(current, self.best): - self.best = current - self.num_bad_epochs = 0 - else: - self.num_bad_epochs += 1 - - @property - def stop(self): - """ Should we stop learning? """ - return self.num_bad_epochs > self.patience - - def _cmp(self, mode, threshold_mode, threshold, a, best): # pylint: disable=R0913, R0201 - if mode == 'min' and threshold_mode == 'rel': - rel_epsilon = 1. - threshold - return a < best * rel_epsilon - - elif mode == 'min' and threshold_mode == 'abs': - return a < best - threshold - - elif mode == 'max' and threshold_mode == 'rel': - rel_epsilon = threshold + 1. 
- return a > best * rel_epsilon - - return a > best + threshold - - def _init_is_better(self, mode, threshold, threshold_mode): - if mode not in {'min', 'max'}: - raise ValueError('mode ' + mode + ' is unknown!') - if threshold_mode not in {'rel', 'abs'}: - raise ValueError('threshold mode ' + threshold_mode + ' is unknown!') - - if mode == 'min': - self.mode_worse = float('inf') - else: # mode == 'max': - self.mode_worse = (-float('inf')) - - self.is_better = partial(self._cmp, mode, threshold_mode, threshold) - - def state_dict(self): - """ Returns early stopping state """ - return {key: value for key, value in self.__dict__.items() if key != 'is_better'} - - def load_state_dict(self, state_dict): - """ Loads early stopping state """ - self.__dict__.update(state_dict) - self._init_is_better(mode=self.mode, threshold=self.threshold, - threshold_mode=self.threshold_mode) - -def adjust_learning_rate(learning_rate, optimizer, epoch): - """Sets the learning rate to the initial LR decayed by 10 every 30 epochs""" - - lr = learning_rate * (0.1 ** (epoch // 10)) - - for param_group in optimizer.param_groups: - param_group['lr'] = lr - return lr - -def batch(tensor, batch_size = 50): - """ It is used to create batch samples, each batch has batch_size samples""" - tensor_list = [] - length = tensor.shape[0] - i = 0 - while True: - if (i+1) * batch_size >= length: - tensor_list.append(tensor[i * batch_size: length]) - return tensor_list - tensor_list.append(tensor[i * batch_size: (i+1) * batch_size]) - i += 1 - - -class selu(nn.Module): - def __init__(self): - super(selu, self).__init__() - self.alpha = 1.6732632423543772848170429916717 - self.scale = 1.0507009873554804934193349852946 - - def forward(self, x): - temp1 = self.scale * F.relu(x) - temp2 = self.scale * self.alpha * (F.elu(-1 * F.relu(-1 * x))) - return temp1 + temp2 - - -class alpha_drop(nn.Module): - def __init__(self, p=0.05, alpha=-1.7580993408473766, fixedPointMean=0, fixedPointVar=1): - super(alpha_drop, self).__init__() - keep_prob = 1 - p - self.a = np.sqrt( - fixedPointVar / (keep_prob * ((1 - keep_prob) * pow(alpha - fixedPointMean, 2) + fixedPointVar))) - self.b = fixedPointMean - self.a * (keep_prob * fixedPointMean + (1 - keep_prob) * alpha) - self.alpha = alpha - self.keep_prob = 1 - p - self.drop_prob = p - - def forward(self, x): - if self.keep_prob == 1 or not self.training: - # print("testing mode, direct return") - return x - else: - random_tensor = self.keep_prob + torch.rand(x.size()) - - binary_tensor = Variable(torch.floor(random_tensor)) - - if torch.cuda.is_available(): - binary_tensor = binary_tensor.cuda() - - x = x.mul(binary_tensor) - ret = x + self.alpha * (1 - binary_tensor) - ret.mul_(self.a).add_(self.b) - return ret - -def convert_to_3d_matrix(covariate_ids, patient_dict, y_dict = None, timeid_len = 31, cov_mean_dict = None): - """ - create matrix for temporal models. 
- - :param covariate_ids: the covariate ids in the whole data - :param patient_dict: the dictionary contains the data for each patient - :param y_dict: if the output labels is known, it contains the labels for patients - :param timeid_len: the total number time window gaps when extracting temporal data - :return: return the raw data in 3-D format, patients x covariates x number of windows, and the patients ids - """ - D = len(covariate_ids) - N = len(patient_dict) - T = timeid_len - concept_list =list(covariate_ids) - concept_list.sort() - x_raw = np.zeros((N, D, T), dtype=float) - patient_ind = 0 - p_ids = [] - patient_keys = patient_dict.keys() - #print covariate_ids - for kk in patient_keys: - #print('-------------------') - vals = patient_dict[kk] - #sorted(vals) - p_ids.append(int(kk)) - for timeid, meas in vals.iteritems(): - int_time = int(timeid) - 1 - for val in meas: - if not len(val): - continue - cov_id, cov_val = val - if cov_id not in covariate_ids: - continue - lab_ind = concept_list.index(cov_id) - if cov_mean_dict is None: - x_raw[patient_ind][lab_ind][int_time] = float(cov_val) - else: - mean_std = cov_mean_dict[cov_id] - if mean_std[1]: - x_raw[patient_ind][lab_ind][int_time] = (float(cov_val) - mean_std[0])/mean_std[1] - else: - x_raw[patient_ind][lab_ind][int_time] = float(cov_val) - - - patient_ind = patient_ind + 1 - - #impute the data using the value of previous timestamp - #fw = open('patient_var.txt', 'w') - for i in xrange(N): - for j in xrange(D): - temp = x_raw[i][j] - nonzero_inds = np.nonzero(temp)[0] - count_nonzeros = len(nonzero_inds) - #fw.write(str(i) + '\t' + str(count_nonzeros) + '\n') - if count_nonzeros == 1: - ind = nonzero_inds[0] - for k in xrange(ind + 1, T): - x_raw[i][j][k] = x_raw[i][j][ind] - elif count_nonzeros > 1: - for ind in xrange(1, count_nonzeros): - for k in xrange(nonzero_inds[ind -1] + 1, nonzero_inds[ind]): - x_raw[i][j][k] = x_raw[i][j][nonzero_inds[ind - 1]] - # For last nonzeros. - for k in xrange(nonzero_inds[-1] + 1, T): - x_raw[i][j][k] = x_raw[i][j][nonzero_inds[-1]] - - #fw.close() - - return x_raw, patient_keys - -def forward_impute_missing_value(x_raw): - N = x_raw.shape[0] - D = x_raw.shape[1] - T = x_raw.shape[2] - for i in xrange(N): - for j in xrange(D): - temp = x_raw[i][j] - nonzero_inds = np.nonzero(temp)[0] - count_nonzeros = len(nonzero_inds) - #fw.write(str(i) + '\t' + str(count_nonzeros) + '\n') - if count_nonzeros == 1: - ind = nonzero_inds[0] - for k in xrange(ind + 1, T): - x_raw[i][j][k] = x_raw[i][j][ind] - elif count_nonzeros > 1: - for ind in xrange(1, count_nonzeros): - for k in xrange(nonzero_inds[ind -1] + 1, nonzero_inds[ind]): - x_raw[i][j][k] = x_raw[i][j][nonzero_inds[ind - 1]] - # For last nonzeros. 
- for k in xrange(nonzero_inds[-1] + 1, T): - x_raw[i][j][k] = x_raw[i][j][nonzero_inds[-1]] - - -def convert_to_temporal_format(covariates, timeid_len= 31, normalize = True, predict = False): - """ - It reads the data from covariates extracted by FeatureExtraction package and convert it to temporal data matrix - - :param covariates: covariates extracted by FeatureExtraction package - :param timeid_len: the total number of window gaps when extracting temporal data - :return: return the raw data in 3-D format, patients x covariates x number of windows, and the patients ids - """ - patient_dict = OrderedDict() - print('Loading temporal data') - cov_vals_dict = {} - for row in covariates: - p_id, cov_id, time_id, cov_val = row[0], row[1], row[2], row[3] - cov_id = np.int64(cov_id) - #time_id = int(time_id) - cov_vals_dict.setdefault(cov_id, []).append(float(cov_val)) - if p_id not in patient_dict: - patient_dict[p_id] = {time_id: [(cov_id, cov_val)]} - else: - if time_id not in patient_dict[p_id]: - patient_dict[p_id][time_id] = [(cov_id, cov_val)] - else: - patient_dict[p_id][time_id].append((cov_id, cov_val)) - #covariate_ids.add(cov_id) - #T = 365/time_window - covariate_ids = set() - cov_mean_dict = {} - if not predict: - fw = open('covariate_mean_std.csv', 'w') - for key, val in cov_vals_dict.iteritems(): - mean_val = np.mean(val) - std_val = np.std(val) - - # Remove those covariates with few occurrence (<5) - if len(val) >= 5: - covariate_ids.add(key) - cov_mean_dict[key] = (mean_val, std_val) - fw.write(str(key) + ',' + str(mean_val) + ',' + str(std_val) + '\n') - fw.close() - else: - fp = open('covariate_mean_std.csv', 'r') - for line in fp: - values = line.rstrip().split(',') - key = np.int64(values[0]) - covariate_ids.add(key) - cov_mean_dict[key] = (float(values[1]), float(values[2])) - fp.close() - - - if normalize: - x, patient_keys = convert_to_3d_matrix(covariate_ids, patient_dict, timeid_len = timeid_len, cov_mean_dict = cov_mean_dict) - else: - x, patient_keys = convert_to_3d_matrix(covariate_ids, patient_dict, timeid_len=timeid_len) - - return x, patient_keys - - -def read_covariates(covariate_file): - patient_dict = {} - head = True - with open(covariate_file, 'r') as fp: - for line in fp: - if head: - head = False - continue - values = line.rstrip().split(',') - patient_id = values[1] - cov_id = values[2] - #time_id = int(values[-1]) - # covariates in one patient has time order - patient_dict.setdefault(patient_id, []).append((cov_id)) - new_patient = [] - for key in patient_dict.keys(): - #patient_dict[key].sort() - sort_vals = [] - for val in patient_dict[key]: - if val[1] not in sort_vals: - sort_vals.append(val) - new_patient.append(sort_vals) - - return new_patient - - -def word_embeddings(covariate_file, embedding_size=50): - import gensim.models.word2vec as w2v - modelname = "processed_%s.w2v" % ('heartfailure') - sentences = read_covariates(covariate_file) - model = w2v.Word2Vec(size=embedding_size, min_count=3, workers=4, iter=10, sg=1) - print("building word2vec vocab on %s..." 
% (covariate_file)) - - model.build_vocab(sentences) - print("training...") - model.train(sentences, total_examples=model.corpus_count, epochs=model.iter) - out_file = modelname - print("writing embeddings to %s" % (out_file)) - model.save(out_file) - return out_file - -def read_data(filename): - covariate_ids = set() - patient_dict = {} - head = True - with open(filename) as fp: - for lines in fp: - if head: - head = False - continue - lines = lines.strip('\n').strip('\r').split(',') - try: - p_id, cov_id, time_id, cov_val = lines[1], lines[2], lines[3], lines[4] - except: - pdb.set_trace() - print(p_id, cov_id, time_id) - if p_id not in patient_dict: - patient_dict[p_id] = {} - else: - if time_id not in patient_dict[p_id]: - patient_dict[p_id][time_id] = [] - else: - patient_dict[p_id][time_id].append((cov_id, cov_val)) - covariate_ids.add(cov_id) - patient_dict = {k: v for k, v in patient_dict.iteritems() if v} #remove empty patients - #(15, 2000, 60) 20000 patients, 15 lab tests, - return covariate_ids, patient_dict - -def split_training_validation(classes, validation_size=0.2, shuffle=False): - """split sampels based on balnace classes""" - num_samples = len(classes) - classes = np.array(classes) - classes_unique = np.unique(classes) - num_classes = len(classes_unique) - indices = np.arange(num_samples) - # indices_folds=np.zeros([num_samples],dtype=int) - training_indice = [] - training_label = [] - validation_indice = [] - validation_label = [] - for cl in classes_unique: - indices_cl = indices[classes == cl] - num_samples_cl = len(indices_cl) - - # split this class into k parts - if shuffle: - random.shuffle(indices_cl) # in-place shuffle - - # module and residual - num_samples_each_split = int(num_samples_cl * validation_size) - res = num_samples_cl - num_samples_each_split - - training_indice = training_indice + [val for val in indices_cl[num_samples_each_split:]] - training_label = training_label + [cl] * res - - validation_indice = validation_indice + [val for val in indices_cl[:num_samples_each_split]] - validation_label = validation_label + [cl] * num_samples_each_split - - training_index = np.arange(len(training_label)) - random.shuffle(training_index) - training_indice = np.array(training_indice)[training_index] - training_label = np.array(training_label)[training_index] - - validation_index = np.arange(len(validation_label)) - random.shuffle(validation_index) - validation_indice = np.array(validation_indice)[validation_index] - validation_label = np.array(validation_label)[validation_index] - - return training_indice, training_label, validation_indice, validation_label - - -class LogisticRegression(nn.Module): - """ - Train a logistic regression model using pytorch - """ - def __init__(self, input_size, num_classes = 2): - super(LogisticRegression, self).__init__() - self.linear = nn.Linear(input_size, num_classes) - - def forward(self, x): - out = self.linear(x) - out = torch.sigmoid(out) - return out - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -class MLP(nn.Module): - """ - Train a multiple-layer perceptron with one hideen layer - """ - def __init__(self, input_dim, hidden_size, num_classes = 2): - super(MLP, self).__init__() - self.fc1 = nn.Linear(input_dim, hidden_size) - self.fc2 = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - x = 
F.relu(self.fc1(x)) - x = F.dropout(x, p =0.5, training=self.training) - x = self.fc2(x) - x = torch.sigmoid(x) - return x - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -class SNN(nn.Module): - """ - Train a multiple-layer self normalizing neural network, ref arXiv:1706.02515 - """ - - def __init__(self, input_dim, hidden_size, num_classes=2): - super(SNN, self).__init__() - self.fc1 = nn.Linear(input_dim, hidden_size) - self.fc2 = selu() - self.ad1 = alpha_drop() - self.fc4 = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - x = F.relu(self.fc1(x)) - x = F.dropout(x, p=0.5, training=self.training) - x = self.fc2(x) - x = self.ad1(x) - x = self.fc4(x) - x = torch.sigmoid(x) - return x - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - -class AutoEncoder(nn.Module): - """ - A stacked autoencoder with 2 hiddden layers and need be adapted for EHR data. - """ - def __init__(self, input_size, encoding_size): - super(AutoEncoder, self).__init__() - - self.encoder = nn.Sequential( - nn.Linear(input_size, input_size/2), - nn.ReLU(True), - nn.Linear(input_size/2, input_size/4), - nn.ReLU(True), - nn.Linear(input_size/4, encoding_size), - nn.ReLU(True) - ) - self.decoder = nn.Sequential( - nn.Linear(encoding_size, input_size/4), - nn.ReLU(True), - nn.Linear(input_size/4, input_size/2), - nn.ReLU(True), - nn.Linear(input_size/2, input_size) - ) - - def forward(self, x): - if torch.cuda.is_available(): - x = x.cuda() - encoded = self.encoder(x) - decoded = self.decoder(encoded) - return decoded - - def get_encode_features(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - encoded = self.encoder(x) - encoded = encoded.data.cpu().numpy() - return encoded - -class VAE(nn.Module): - """ - A stacked variational autoencoder with 2 hiddden layers and need be adapted for EHR data. 
- """ - def __init__(self, input_size, encoding_size): - super(VAE, self).__init__() - - self.fc1 = nn.Linear(input_size, input_size/2) - self.fc21 = nn.Linear(input_size/2, encoding_size) - self.fc22 = nn.Linear(input_size/2, encoding_size) - self.fc3 = nn.Linear(encoding_size, input_size/2) - self.fc4 = nn.Linear(input_size/2, input_size) - - self.relu = nn.ReLU() - self.sigmoid = nn.Sigmoid() - - def encode(self, x): - h1 = self.relu(self.fc1(x)) - return self.fc21(h1), self.fc22(h1) - - def reparameterize(self, mu, logvar): - if self.training: - std = logvar.mul(0.5).exp_() - eps = Variable(std.data.new(std.size()).normal_()) - return eps.mul(std).add_(mu) - else: - return mu - - def decode(self, z): - h3 = self.relu(self.fc3(z)) - return self.sigmoid(self.fc4(h3)) - - def forward(self, x): - if torch.cuda.is_available(): - x = x.cuda() - mu, logvar = self.encode(x) - z = self.reparameterize(mu, logvar) - return self.decode(z), mu, logvar - - def get_encode_features(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - mu, logvar = self.encode(x) - encoded = self.reparameterize(mu, logvar) - encoded = encoded.data.cpu().numpy() - return encoded - -class Decoder(nn.Module): - """ VAE decoder input_size = original inputsize/16*256""" - def __init__(self, latent_size, input_size, img_channels = 1, kernel_size=(1, 4), stride=(1, 2), padding=(0, 1)): - super(Decoder, self).__init__() - self.latent_size = latent_size - self.img_channels = img_channels - - self.fc1 = nn.Linear(latent_size, input_size) - self.deconv1 = nn.ConvTranspose2d(input_size, 128, kernel_size, stride=stride, padding = padding) - self.deconv2 = nn.ConvTranspose2d(128, 64, kernel_size, stride=stride, padding = padding) - self.deconv3 = nn.ConvTranspose2d(64, 32, kernel_size, stride=stride, padding = padding) - self.deconv4 = nn.ConvTranspose2d(32, img_channels, kernel_size, stride=stride, padding = padding) - - def forward(self, x): # pylint: disable=arguments-differ - x = F.relu(self.fc1(x)) - x = x.unsqueeze(-1).unsqueeze(-1) - x = F.relu(self.deconv1(x)) - x = F.relu(self.deconv2(x)) - x = F.relu(self.deconv3(x)) - reconstruction = torch.sigmoid(self.deconv4(x)) - return reconstruction - -class Encoder(nn.Module): # pylint: disable=too-many-instance-attributes - """ VAE encoder """ - def __init__(self, latent_size, input_size, img_channels = 1, kernel_size=(1, 4), stride=(1, 2), padding=(0, 1)): - super(Encoder, self).__init__() - self.latent_size = latent_size - #self.img_size = img_size - self.img_channels = img_channels - - self.conv1 = nn.Conv2d(img_channels, 32, kernel_size, stride=stride, padding = padding) - self.conv2 = nn.Conv2d(32, 64, kernel_size, stride=stride, padding = padding) - self.conv3 = nn.Conv2d(64, 128, kernel_size, stride=stride, padding = padding) - self.conv4 = nn.Conv2d(128, 256, kernel_size, stride=stride, padding = padding) - out_size = input_size / 16 - self.fc_mu = nn.Linear(out_size, latent_size) - self.fc_logsigma = nn.Linear(out_size, latent_size) - - - def forward(self, x): # pylint: disable=arguments-differ - x = F.relu(self.conv1(x)) - x = F.relu(self.conv2(x)) - x = F.relu(self.conv3(x)) - x = F.relu(self.conv4(x)) - x = x.view(x.size(0), -1) - - mu = self.fc_mu(x) - logsigma = self.fc_logsigma(x) - - return mu, logsigma - -class VAE_CNN(nn.Module): - """ Variational Autoencoder """ - def __init__(self, latent_size, input_size): - super(VAE, self).__init__() - 
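# Illustrative sketch, not part of the deleted file: the reparameterization trick used by the
# VAE class above (and again in VAE_CNN's forward below) draws z = mu + sigma * eps with
# eps ~ N(0, 1), so gradients flow through mu and the log-variance while the sampling noise
# stays outside the graph. A minimal stand-alone version in current PyTorch syntax:
import torch

def reparameterize(mu, logvar):
    std = torch.exp(0.5 * logvar)   # logvar = log(sigma^2), so this recovers sigma
    eps = torch.randn_like(std)     # unit Gaussian noise with the same shape as sigma
    return mu + eps * std

# z = reparameterize(torch.zeros(4, 8), torch.zeros(4, 8))  # 4 draws from an 8-d latent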
self.encoder = Encoder(latent_size, input_size) - input_size = input_size/16 - self.decoder = Decoder(latent_size, input_size) - - def forward(self, x): # pylint: disable=arguments-differ - mu, logsigma = self.encoder(x) - sigma = logsigma.exp() - eps = torch.randn_like(sigma) - z = eps.mul(sigma).add_(mu) - - recon_x = self.decoder(z) - return recon_x, mu, logsigma - - def get_encode_features(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - mu, logvar = self.encoder(x) - encoded = mu .data.cpu().numpy() - return encoded - -class CNN(nn.Module): - def __init__(self, nb_filter, num_classes = 2, kernel_size = (1, 5), pool_size = (1, 3), labcounts = 32, window_size = 12, hidden_size = 200, stride = (1, 1), padding = 0): - super(CNN, self).__init__() - self.layer1 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size, stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU(), - nn.MaxPool2d(pool_size, stride = stride)) - out1_size = (window_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 - maxpool_size = (out1_size + 2*padding - (pool_size[1] - 1) - 1)/stride[1] + 1 - self.layer2 = nn.Sequential( - nn.Conv2d(nb_filter, nb_filter, kernel_size, stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU(), - nn.MaxPool2d(pool_size, stride = stride)) - out2_size = (maxpool_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 - maxpool_size = (out2_size + 2*padding - (pool_size[1] - 1) - 1)/stride[1] + 1 - self.drop1 = nn.Dropout(p=0.5) - self.fc1 = nn.Linear(int(maxpool_size*labcounts*nb_filter), hidden_size) - self.bn = nn.BatchNorm1d(hidden_size) - self.drop2 = nn.Dropout(p=0.5) - self.relu1 = nn.ReLU() - self.fc2 = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - x = x.view(x.size(0), 1, x.size(1), x.size(2)) - out = self.layer1(x) - out = self.layer2(out) - out = out.view(out.size(0), -1) - out = self.drop1(out) - out = self.fc1(out) - out = self.drop2(out) - out = self.bn(out) - out = self.relu1(out) - out = self.fc2(out) - out = torch.sigmoid(out) - return out - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -#allow multiple kernel with differnt kernel size -class CNN_MLF(nn.Module): - """ - It is a deep CNNs with three different kernel size, the outputs from the three CNNs are concatenated to fed into two fully connected layers. 
- """ - def __init__(self, nb_filter, num_classes = 2, kernel_size = (1, 5), pool_size = (1, 3), labcounts = 32, window_size = 12, hidden_size = 200, stride = (1, 1), padding = 0): - super(CNN_MLF, self).__init__() - self.layer1 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size = (1, 3), stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU(), - nn.MaxPool2d(pool_size, stride = stride)) - out1_size = (window_size + 2*padding - (3 - 1) - 1)/stride[1] + 1 - maxpool1_size = (out1_size + 2*padding - (pool_size[1] - 1) - 1)/stride[1] + 1 - self.layer2 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size = (1, 4), stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU(), - nn.MaxPool2d(pool_size, stride = stride)) - out2_size = (window_size + 2*padding - (4 - 1) - 1)/stride[1] + 1 #4 is the convolve filter size - maxpool2_size = (out2_size + 2*padding - (pool_size[1] - 1) - 1)/stride[1] + 1 - self.layer3 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size = (1, 5), stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU(), - nn.MaxPool2d(pool_size, stride = stride)) - out3_size = (window_size + 2*padding - (5 - 1) - 1)/stride[1] + 1 - maxpool3_size = (out3_size + 2*padding - (pool_size[1] - 1) - 1)/stride[1] + 1 - conv_outsize = maxpool1_size + maxpool2_size +maxpool3_size - self.drop1 = nn.Dropout(p=0.5) - self.fc1 = nn.Linear(conv_outsize*labcounts*nb_filter, hidden_size) - self.drop2 = nn.Dropout(p=0.5) - self.relu1 = nn.ReLU() - self.fc2 = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - x = x.view(x.size(0), 1, x.size(1), x.size(2)) - out1 = self.layer1(x) - out2 = self.layer2(x) - out3 = self.layer3(x) - out = torch.cat((out1.view(out1.size(0), -1), out2.view(out2.size(0), -1), out3.view(out2.size(0), -1)), 1) - out = self.drop1(out) - out = self.fc1(out) - out = self.drop2(out) - out = self.relu1(out) - out = self.fc2(out) - out = torch.sigmoid(out) - return out - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -class CNN_LSTM(nn.Module): - """ - It is a deep network with two layer CNN, followed by LSTM layer, which further fed into two fully connected layers. 
- """ - def __init__(self, nb_filter, num_classes = 2, kernel_size = (1, 5), pool_size = (1, 3), labcounts = 32, window_size = 12, hidden_size = 100, stride = (1, 1), padding = 0, num_layers = 2): - super(CNN_LSTM, self).__init__() - self.layer1 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size, stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU(), - nn.MaxPool2d(pool_size, stride = stride)) - self.num_layers = num_layers - self.hidden_size = hidden_size - out1_size = (window_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 - maxpool_size = (out1_size + 2*padding - (pool_size[1] - 1) - 1)/stride[1] + 1 - self.downsample = nn.Conv2d(nb_filter, 1, kernel_size, stride = stride, padding = padding) - input_size = (maxpool_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 - self.layer2 = nn.LSTM(input_size, hidden_size, num_layers, batch_first = True) - self.drop1 = nn.Dropout(p=0.5) - self.fc1 = nn.Linear(hidden_size, hidden_size) - self.drop2 = nn.Dropout(p=0.5) - self.relu1 = nn.ReLU() - self.fc2 = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - x = x.view(x.size(0), 1, x.size(1), x.size(2)) - out = self.layer1(x) - out = self.downsample(out) - out = torch.squeeze(out, 1) - if torch.cuda.is_available(): - x = x.cuda() - h0 = Variable(torch.zeros(self.num_layers, out.size(0), self.hidden_size)).cuda() - c0 = Variable(torch.zeros(self.num_layers, out.size(0), self.hidden_size)).cuda() - else: - h0 = Variable(torch.zeros(self.num_layers, out.size(0), self.hidden_size)) - c0 = Variable(torch.zeros(self.num_layers, out.size(0), self.hidden_size)) - out, hn = self.layer2(out, (h0, c0)) - out = hn[0][-1] - out = self.drop1(out) - out = self.fc1(out) - out = self.drop2(out) - out = self.relu1(out) - out = self.fc2(out) - out = torch.sigmoid(out) - return out - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -class CNN_MIX(nn.Module): - """ - It is a deep network with 2 layers CNN, which works on input and time dimension, respectively, more details refer to deepDianosis in github. 
- """ - def __init__(self, nb_filter, num_classes = 2, kernel_size = (1, 5), pool_size = (1, 3), labcounts = 32, window_size = 12, hidden_size = 100, stride = (1, 1), padding = 0): - super(CNN_MIX, self).__init__() - self.layer1 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size = (labcounts, 1), stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU()) - self.layer2 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size = (nb_filter, 1), stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU(), - nn.MaxPool2d(pool_size)) - out1_size = int(np.ceil(float(window_size)/pool_size[1])) - self.layer3 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size = kernel_size, stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU()) - - out2_size = (out1_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 - self.drop1 = nn.Dropout(p=0.5) - self.fc1 = nn.Linear(out2_size*nb_filter*nb_filter, hidden_size) - self.drop2 = nn.Dropout(p=0.5) - self.relu1 = nn.ReLU() - self.fc2 = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - x = x.view(x.size(0), 1, x.size(1), x.size(2)) - out = self.layer1(x) - out = out.view(out.size(0), out.size(2), out.size(1), out.size(3)) - out = self.layer2(out) - out = out.view(out.size(0), out.size(2), out.size(1), out.size(3)) - out = self.layer3(out) - out = out.view(out.size(0), -1) - out = self.drop1(out) - out = self.fc1(out) - out = self.drop2(out) - out = self.relu1(out) - out = self.fc2(out) - out = torch.sigmoid(out) - return out - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -class CNN_MULTI(nn.Module): - """ - It is a deep network with multiple resolution, more details refer to multiresconvnet of deepDianosis in github. 
- """ - def __init__(self, nb_filter, num_classes = 2, kernel_size = (1, 5), pool_size = (1, 2), labcounts = 32, window_size = 12, hidden_size = 100, stride = (1, 1), padding = 0): - super(CNN_MULTI, self).__init__() - # resolution 1 - self.pool1_1 = nn.MaxPool2d(pool_size, stride = pool_size) - maxpool_size = (window_size + 2*padding - (pool_size[1] - 1) - 1)/pool_size[1] + 1 - self.pool1_2 = nn.MaxPool2d(pool_size, stride = pool_size) - maxpool1_2_size = (maxpool_size + 2*padding - (pool_size[1] - 1) - 1)/pool_size[1] + 1 - - self.layer1 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size = kernel_size, stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU()) - cnn1_size = (maxpool1_2_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 - #resolution 2 - self.pool2_1 = nn.MaxPool2d(pool_size, stride = pool_size) - maxpool2_1_size = (window_size + 2*padding - (pool_size[1] - 1) - 1)/pool_size[1] + 1 - - self.layer2 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size = kernel_size, stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU()) - cnn2_size = (maxpool2_1_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 - self.layer3 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size = kernel_size, stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU(), - nn.MaxPool2d(pool_size)) - cnn3_size = (window_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 - maxpool3_size = (cnn3_size + 2*padding - (pool_size[1] - 1) - 1)/pool_size[1] + 1 - self.layer4 = nn.Sequential( - nn.Conv2d(nb_filter, nb_filter, kernel_size = kernel_size, stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU()) - cnn4_size = (maxpool3_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 - merge_size = cnn1_size + cnn2_size + cnn4_size - self.drop1 = nn.Dropout(p=0.5) - self.fc1 = nn.Linear(labcounts*nb_filter*merge_size, hidden_size) - self.drop2 = nn.Dropout(p=0.5) - self.relu1 = nn.ReLU() - self.fc2 = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - x = x.view(x.size(0), 1, x.size(1), x.size(2)) - out = self.pool1_1(x) - out = self.pool1_2(out) - out1 = self.layer1(out) - out = self.pool2_1(x) - out2 = self.layer2(out) - out = self.layer3(x) - out3 = self.layer4(out) - out = torch.cat((out1.view(out1.size(0), -1), out2.view(out2.size(0), -1), out3.view(out3.size(0), -1)), 1) - out = self.drop1(out) - out = self.fc1(out) - out = self.drop2(out) - out = self.relu1(out) - out = self.fc2(out) - out = torch.sigmoid(out) - return out - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -# 1x3 Convolution -def convR(in_channels, out_channels, kernel_size, stride=1, padding = (0, 1)): - return nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, - padding=padding, stride=stride, bias=False) - - -# Residual Block -class ResidualBlock(nn.Module): - def __init__(self, in_channel, nb_filter = 16, kernel_size = (1, 3), stride=1, downsample=None): - super(ResidualBlock, self).__init__() - self.conv1 = convR(in_channel, nb_filter, kernel_size = kernel_size, stride = stride) - self.bn1 = nn.BatchNorm2d(nb_filter) - self.relu = nn.ReLU(inplace=True) - self.conv2 = convR(nb_filter, nb_filter, kernel_size = kernel_size, stride = stride) - self.bn2 = nn.BatchNorm2d(nb_filter) - 
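# Illustrative sketch, not part of the deleted file: the CNN/ResNet classes here size their
# fully connected layers with the standard convolution/pooling output formula
# out = floor((in + 2*padding - kernel) / stride) + 1, applied along the time dimension.
def conv_out_size(in_size, kernel, stride=1, padding=0):
    # Output length of a 1-D convolution or max-pool with the given hyper-parameters.
    return (in_size + 2 * padding - kernel) // stride + 1

# Example with the defaults used above: a window of 12 time steps, a (1, 5) convolution and
# a (1, 3) max-pool, both with stride 1 and no padding:
# conv_out_size(conv_out_size(12, 5), 3)  # -> 6 positions feeding the first linear layer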
self.downsample = downsample - - def forward(self, x): - residual = x - out = self.conv1(x) - out = self.bn1(out) - out = self.relu(out) - out = self.conv2(out) - out = self.bn2(out) - if self.downsample: - residual = self.downsample(x) - out += residual - out = self.relu(out) - return out - - -# ResNet Module -class ResNet(nn.Module): - def __init__(self, block, layers, nb_filter = 16, labcounts = 12, window_size = 36, kernel_size = (1, 3), pool_size = (1, 3), num_classes=2, hidden_size = 100): - super(ResNet, self).__init__() - self.in_channels = 1 - self.conv = convR(self.in_channels, nb_filter, kernel_size = kernel_size) - self.bn = nn.BatchNorm2d(nb_filter) - self.relu = nn.ReLU(inplace=True) - self.layer1 = self.make_layer(block, nb_filter, layers[0], kernel_size = kernel_size) - self.layer2 = self.make_layer(block, nb_filter*2, layers[1], 1, kernel_size = kernel_size, in_channels = nb_filter) - self.layer3 = self.make_layer(block, nb_filter*4, layers[2], 1, kernel_size = kernel_size, in_channels = 2*nb_filter) - self.avg_pool = nn.AvgPool2d(pool_size) - avgpool2_1_size = (window_size - (pool_size[1] - 1) - 1)/pool_size[1] + 1 - last_layer_size = nb_filter*4*labcounts*avgpool2_1_size - self.fc = nn.Linear(last_layer_size, hidden_size) - self.drop2 = nn.Dropout(p=0.5) - self.relu1 = nn.ReLU() - self.fc2 = nn.Linear(hidden_size, num_classes) - - def make_layer(self, block, out_channels, blocks, stride=1, kernel_size = (1, 3), in_channels = 16): - downsample = None - if (stride != 1) or (self.in_channels != out_channels): - downsample = nn.Sequential( - convR(in_channels, out_channels, kernel_size = kernel_size, stride=stride), - nn.BatchNorm2d(out_channels)) - layers = [] - layers.append(block(in_channels, out_channels, kernel_size = kernel_size, stride = stride, downsample = downsample)) - self.in_channels = out_channels - for i in range(1, blocks): - layers.append(block(out_channels, out_channels, kernel_size = kernel_size)) - return nn.Sequential(*layers) - - def forward(self, x): - x = x.view(x.size(0), 1, x.size(1), x.size(2)) - out = self.conv(x) - out = self.bn(out) - out = self.relu(out) - out = self.layer1(out) - out = self.layer2(out) - out = self.layer3(out) - out = self.avg_pool(out) - out = out.view(out.size(0), -1) - out = self.fc(out) - out = self.drop2(out) - out = self.relu1(out) - out = self.fc2(out) - out = torch.sigmoid(out) - return out - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -class GRU(nn.Module): - """ - It is a deep network with one GRU layer, which are further fed into one fully connected layers. 
- """ - def __init__(self, input_size, hidden_size, num_layers, num_classes = 2, dropout = 0.5): - super(GRU, self).__init__() - - self.hidden_size = hidden_size - self.num_layers = num_layers - self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first = True, dropout = dropout) - self.linear = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - if torch.cuda.is_available(): - x = x.cuda() - h0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)).cuda() # 2 for bidirection - else: - h0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)) # 2 for bidirection - self.gru.flatten_parameters() - out, hn = self.gru(x, h0) - - rearranged = hn[-1] - out = self.linear(rearranged) - out = torch.sigmoid(out) - return out - - def initHidden(self, N): - return Variable(torch.randn(1, N, self.hidden_size)) - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -class RNN(nn.Module): - """ - It is a deep network with one LSTM layer, which are further fed into one fully connected layer. - """ - def __init__(self, input_size, hidden_size, num_layers, num_classes = 2, dropout = 0.5): - super(RNN, self).__init__() - self.hidden_size = hidden_size - self.num_layers = num_layers - self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first = True, dropout = dropout) - self.fc = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - if torch.cuda.is_available(): - x = x.cuda() - h0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)).cuda() - c0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)).cuda() - else: - h0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)) - c0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)) - self.lstm.flatten_parameters() - out, hn = self.lstm(x, (h0, c0)) - rearranged = hn[0][-1] - # Decode hidden state of last time step - out = self.fc(rearranged) - out = torch.sigmoid(out) - return out - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -class BiRNN(nn.Module): - """ - It is a deep network with one bidirectional LSTM layer, which are further fed into one fully connected layer. 
- """ - def __init__(self, input_size, hidden_size, num_layers, num_classes = 2, dropout = 0.5): - super(BiRNN, self).__init__() - self.hidden_size = hidden_size - self.num_layers = num_layers - self.lstm = nn.LSTM(input_size, hidden_size, num_layers, - batch_first = True, dropout = dropout, bidirectional=True) - self.fc = nn.Linear(hidden_size*2, num_classes) # 2 for bidirection - - def forward(self, x): - if torch.cuda.is_available(): - x = x.cuda() - h0 = Variable(torch.zeros(self.num_layers*2, x.size(0), self.hidden_size)).cuda() # 2 for bidirection - c0 = Variable(torch.zeros(self.num_layers*2, x.size(0), self.hidden_size)).cuda() - else: - h0 = Variable(torch.zeros(self.num_layers*2, x.size(0), self.hidden_size)) # 2 for bidirection - c0 = Variable(torch.zeros(self.num_layers*2, x.size(0), self.hidden_size)) - self.lstm.flatten_parameters() - out, hn = self.lstm(x, (h0, c0)) - hn = hn[0] - - rearranged = hn[-2:].view(x.size(0), -1) - # Decode hidden state of last time step - out = self.fc(rearranged) - out = torch.sigmoid(out) - return out - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -# select model -def train_deeptorch(population, plpData, train = True, model_type = 'LogisticRegression', class_weight =0, autoencoder = True, w_decay =0.9, epochs = 1, vae = False, size = 100, loss = 'LogSoftmax', nbfilters = 4, learning_rate = 0.0001, hidden_size = 100, modelOutput = 'C:/deeptorch', seed = 1, quiet = False): - if model_type in ['LogisticRegression', 'MLP', 'SNN']: - y = population[:, 1] - X = plpData[population[:, 0], :] - trainInds = population[:, population.shape[1] - 1] > 0 - if class_weight == -1: - loss = FocalLoss(gamma = 5) - else: - if class_weight == 0: - weights = float(np.count_nonzero(y))/y.shape[0] - class_weight = [1 - weights, weights] - else: - class_weight = [class_weight, 1] - class_weight = 1/torch.Tensor(class_weight) - if torch.cuda.is_available(): - class_weight = class_weight.cuda() - loss=nn.CrossEntropyLoss(weight = class_weight) - - print("Dataset has %s rows and %s columns" % (X.shape[0], X.shape[1])) - print("population loaded- %s rows and %s columns" % (np.shape(population)[0], np.shape(population)[1])) - ########################################################################### - l1regularization = False - if train: - pred_size = int(np.sum(population[:, population.shape[1] - 1] > 0)) - print("Calculating prediction for train set of size %s" % (pred_size)) - test_pred = np.zeros(pred_size) # zeros length sum(population[:,population.size[1]] ==i) - for i in range(1, int(np.max(population[:, population.shape[1] - 1]) + 1), 1): - testInd = population[population[:, population.shape[1] - 1] > 0, population.shape[1] - 1] == i - trainInd = (population[population[:, population.shape[1] - 1] > 0, population.shape[1] - 1] != i) - train_x = X[trainInds, :][trainInd, :] - train_y = y[trainInds][trainInd] - test_x = X[trainInds, :][testInd, :] - print("Fold %s split %s in train set and %s in test set" % (i, train_x.shape[0], test_x.shape[0])) - print("Train set contains %s outcomes " % (np.sum(train_y))) - train_x = train_x.toarray() - test_x = test_x.toarray() - if autoencoder: - print('first train stakced autoencoder') - encoding_size = 256 - if vae: - auto_model = VAE(input_size=train_x.shape[1], encoding_size=encoding_size) - else: - auto_model = 
AutoEncoder(input_size=train_x.shape[1], encoding_size=encoding_size) - if torch.cuda.is_available(): - auto_model = auto_model.cuda() - clf = Estimator(auto_model) - clf.compile(optimizer=torch.optim.Adam(auto_model.parameters(), lr=1e-3, weight_decay = w_decay), - loss=nn.MSELoss()) - clf.fit(train_x, train_y, batch_size=32, nb_epoch=epochs, autoencoder = autoencoder, vae = vae) - #split to batch for large dataset - train_batch = batch(train_x, batch_size=32) - train_x = np.array([]).reshape(0, encoding_size) - for train in train_batch: - encode_train = auto_model.get_encode_features(train) - train_x = np.concatenate((train_x, encode_train), axis=0) - test_batch = batch(test_x, batch_size=32) - test_x = np.array([]).reshape(0, encoding_size) - for test in test_batch: - encode_Test = auto_model.get_encode_features(test) - test_x = np.concatenate((test_x, encode_Test), axis=0) - del auto_model - del clf - # train on fold - print("Training fold %s" % (i)) - start_time = timeit.default_timer() - if model_type == 'LogisticRegression': - model = LogisticRegression(train_x.shape[1]) - l1regularization = True - elif model_type == 'SNN': - model = SNN(train_x.shape[1], size) - else: - model = MLP(train_x.shape[1], size) - - if torch.cuda.is_available(): - model = model.cuda() - clf = Estimator(model) - clf.compile(optimizer=torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay = w_decay), - loss=loss) - clf.fit(train_x, train_y, batch_size=32, nb_epoch=epochs, l1regularization = l1regularization) - - ind = (population[:, population.shape[1] - 1] > 0) - ind = population[ind, population.shape[1] - 1] == i - test_input_var = torch.from_numpy(test_x.astype(np.float32)) - test_batch = batch(test_x, batch_size = 32) - temp = [] - for test in test_batch: - pred_test1 = model.predict_proba(test)[:, 1] - temp = np.concatenate((temp, pred_test1), axis = 0) - test_pred[ind] = temp - print("Prediction complete: %s rows " % (np.shape(test_pred[ind])[0])) - print("Mean: %s prediction value" % (np.mean(test_pred[ind]))) - - # RETURN CV PREDICTION WHEN TRAIN == T - test_pred.shape = (population[population[:, population.shape[1] - 1] > 0, :].shape[0], 1) - prediction = np.append(population[population[:, population.shape[1] - 1] > 0, :], test_pred, axis=1) - return prediction; - - # train final: - else: - print("Training final neural network model on all train data...") - print("X- %s rows and Y %s length" % (X[trainInds, :].shape[0], y[trainInds].shape[0])) - start_time = timeit.default_timer() - train_x = X[trainInds, :] - train_x = train_x.toarray() - train_y = y[trainInds] - if not os.path.exists(modelOutput): - os.makedirs(modelOutput) - if autoencoder: - encoding_size = 256 - if vae: - auto_model = VAE(input_size=train_x.shape[1], encoding_size=encoding_size) - else: - auto_model = AutoEncoder(input_size=train_x.shape[1], encoding_size=encoding_size) - if torch.cuda.is_available(): - auto_model = auto_model.cuda() - clf = Estimator(auto_model) - clf.compile(optimizer=torch.optim.Adam(auto_model.parameters(), lr=1e-3, weight_decay=w_decay), - loss=nn.MSELoss()) - clf.fit(train_x, train_y, batch_size=32, nb_epoch=epochs, autoencoder=autoencoder, vae = vae) - train_batch = batch(train_x, batch_size=32) - train_x = np.array([]).reshape(0, encoding_size) - for train in train_batch: - encode_train = auto_model.get_encode_features(train) - train_x = np.concatenate((train_x, encode_train), axis=0) - joblib.dump(auto_model, os.path.join(modelOutput, 'autoencoder_model.pkl')) - del auto_model - del clf - 
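# Illustrative sketch, not part of the deleted file: when class_weight == 0 the code above
# derives inverse-prevalence weights for the cross-entropy loss, so the rare outcome class
# contributes more per sample. The same computation on a toy label vector:
import numpy as np
import torch
import torch.nn as nn

y = np.array([0, 0, 0, 0, 1])                        # 20% outcome prevalence
prev = float(np.count_nonzero(y)) / y.shape[0]
weights = 1.0 / torch.tensor([1.0 - prev, prev])     # -> tensor([1.25, 5.00])
loss_fn = nn.CrossEntropyLoss(weight=weights)        # positive class weighted 4x heavier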
print('the final parameter epochs %.2f weight_decay %.2f' %(epochs,w_decay)) - if model_type == 'LogisticRegression': - model = LogisticRegression(train_x.shape[1]) - l1regularization = True - elif model_type == 'SNN': - model = SNN(train_x.shape[1], size) - else: - model = MLP(train_x.shape[1], size) - if torch.cuda.is_available(): - model = model.cuda() - clf = Estimator(model) - clf.compile(optimizer=torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay = w_decay), - loss=loss) - clf.fit(train_x, train_y, batch_size=32, nb_epoch=epochs, l1regularization = l1regularization) - end_time = timeit.default_timer() - print("Training final took: %.2f s" % (end_time - start_time)) - print("Model saved to: %s" % (modelOutput)) - joblib.dump(model, os.path.join(modelOutput,'model.pkl')) - # DO PREDICTION ON TRAIN: - train_batch = batch(train_x, batch_size = 32) - train_pred = [] - for train in train_batch: - preds = model.predict_proba(train)[:, 1] - train_pred = np.concatenate((train_pred, preds), axis = 0) - train_pred.shape = (population[population[:, population.shape[1] - 1] > 0, :].shape[0], 1) - prediction = np.append(population[population[:, population.shape[1] - 1] > 0, :], train_pred, axis=1) - # RETURN TRAIN PREDICTION WHEN TRAIN == F - return prediction; - - elif model_type in ['CNN', 'RNN', 'CNN_LSTM', 'CNN_MLF', 'CNN_MIX', 'GRU', 'BiRNN', 'CNN_MULTI', 'ResNet']: - y = population[:, 1] - X = plpData.to_dense().numpy() - X = X[np.int64(population[:, 0]), :] - trainInds = population[:, population.shape[1] - 1] > 0 - - if class_weight == -1: - loss = FocalLoss(gamma = 3) - else: - if class_weight == 0: - weights = float(np.count_nonzero(y))/y.shape[0] - class_weight = [1 - weights, weights] - else: - class_weight = [class_weight, 1] - class_weight = 1/torch.Tensor(class_weight) - if torch.cuda.is_available(): - class_weight = class_weight.cuda() - loss=nn.CrossEntropyLoss(weight = class_weight) - - if train: - test_pred = np.zeros(population[population[:, population.shape[1] - 1] > 0, :].shape[0]) # zeros length sum(population[:,population.size[1]] ==i) - for i in range(1, int(np.max(population[:, population.shape[1] - 1]) + 1), 1): - testInd = population[population[:, population.shape[1] - 1] > 0, population.shape[1] - 1] == i - trainInd = (population[population[:, population.shape[1] - 1] > 0, population.shape[1] - 1] != i) - train_x = X[trainInds, :][trainInd, :] - train_y = y[trainInds][trainInd] - test_x = X[trainInds, :][testInd, :] - print("Fold %s split %s in train set and %s in test set" % (i, train_x.shape[0], test_x.shape[0])) - print("Train set contains %s outcomes " % (np.sum(train_y))) - # train on fold - learning_rate = 0.001 - print("Training fold %s" % (i)) - start_time = timeit.default_timer() - if model_type == 'CNN': - model = CNN(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) - elif model_type == 'CNN_LSTM': - model = CNN_LSTM(nb_filter = nbfilters, labcounts=train_x.shape[1], window_size=train_x.shape[2]) - elif model_type == 'CNN_MLF': # multiple kernels with different size - model = CNN_MLF(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) - elif model_type == 'CNN_MIX': # mixed model from deepDiagnosis - model = CNN_MIX(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) - elif model_type == 'CNN_MULTI': # multiple resolution model from deepDiagnosis - model = CNN_MULTI(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) - 
elif model_type == 'ResNet': - print('train ResNet') - model = ResNet(ResidualBlock, [3, 3, 3], nb_filter=nbfilters, labcounts=train_x.shape[1], window_size=train_x.shape[2]) - elif model_type == 'RNN': - model = RNN(train_x.shape[2], hidden_size, 2, 2) - elif model_type == 'BiRNN': - model = BiRNN(train_x.shape[2], hidden_size, 2, 2) - elif model_type == 'GRU': - model = GRU(train_x.shape[2], hidden_size, 2, 2) - else: - print('temproal data not supported by this model') - - if torch.cuda.is_available(): - model = model.cuda() - clf = Estimator(model) - clf.compile(optimizer=torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay = 0.0001), - loss=loss) - clf.fit(train_x, train_y, batch_size=32, nb_epoch=epochs) - ind = (population[:, population.shape[1] - 1] > 0) - ind = population[ind, population.shape[1] - 1] == i - test_batch = batch(test_x, batch_size = 32) - temp = [] - for test in test_batch: - pred_test1 = model.predict_proba(test)[:, 1] - temp = np.concatenate((temp, pred_test1), axis = 0) - test_pred[ind] = temp - del model - print("Prediction complete: %s rows " % (np.shape(test_pred[ind])[0])) - print("Mean: %s prediction value" % (np.mean(test_pred[ind]))) - # RETURN CV PREDICTION - test_pred.shape = (population[population[:, population.shape[1] - 1] > 0, :].shape[0], 1) - prediction = np.append(population[population[:, population.shape[1] - 1] > 0, :], test_pred, axis=1) - return prediction; - - # train final: - else: - print("Training final neural network model on all train data...") - print("X- %s rows and Y %s length" % (X[trainInds, :].shape[0], y[trainInds].shape[0])) - start_time = timeit.default_timer() - train_x = X[trainInds, :] - train_y = y[trainInds] - learning_rate = 0.001 - if model_type == 'CNN': - model = CNN(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) - elif model_type == 'CNN_LSTM': - model = CNN_LSTM(nb_filter=nbfilters, labcounts=train_x.shape[1], window_size=train_x.shape[2]) - elif model_type == 'CNN_MLF': # multiple kernels with different size - model = CNN_MLF(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) - elif model_type == 'CNN_MIX': #mixed model from deepDiagnosis - model = CNN_MIX(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) - elif model_type == 'CNN_MULTI': # multi resolution model from deepDiagnosis - model = CNN_MULTI(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) - elif model_type == 'ResNet': - model = ResNet(ResidualBlock, [3, 3, 3], nb_filter=nbfilters, labcounts=train_x.shape[1], window_size=train_x.shape[2]) - elif model_type == 'RNN': - model = RNN(train_x.shape[2], hidden_size, 2, 2) - elif model_type == 'BiRNN': - model = BiRNN(train_x.shape[2], hidden_size, 2, 2) - elif model_type == 'GRU': - model = GRU(train_x.shape[2], hidden_size, 2, 2) - else: - print('temproal data not supported by this model') - - if torch.cuda.is_available(): - model = model.cuda() - clf = Estimator(model) - clf.compile(optimizer=torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay = 0.0001), - loss=loss) - clf.fit(train_x, train_y, batch_size=32, nb_epoch=epochs) - end_time = timeit.default_timer() - print("Training final took: %.2f s" % (end_time - start_time)) - # save the model: - if not os.path.exists(modelOutput): - os.makedirs(modelOutput) - print("Model saved to: %s" % (modelOutput)) - - joblib.dump(model, os.path.join(modelOutput,'model.pkl')) - # prediction on train: 
- test_batch = batch(train_x, batch_size = 32) - test_pred = [] - for test in test_batch: - pred_test1 = model.predict_proba(test)[:, 1] - test_pred = np.concatenate((test_pred, pred_test1), axis = 0) - test_pred.shape = (population[population[:, population.shape[1] - 1] > 0, :].shape[0], 1) - prediction = np.append(population[population[:, population.shape[1] - 1] > 0, :], test_pred, axis=1) - return prediction; diff --git a/inst/python/predictFunctions.py b/inst/python/predictFunctions.py deleted file mode 100644 index fcbb2e5..0000000 --- a/inst/python/predictFunctions.py +++ /dev/null @@ -1,153 +0,0 @@ -# apply random forest model on new data -#=============================================================== -# INPUT: -# 1) location of new data -# 2) location of model -# -# OUTPUT: -# it returns a file with indexes merged with prediction for test index - named new_pred -#================================================================ -import numpy as np -from collections import OrderedDict -import os -import sys -import timeit -import math -from scipy.sparse import coo_matrix,csr_matrix,vstack,hstack -from joblib import Memory -import joblib - -def batch(tensor, batch_size = 50): - """ It is used to create batch samples, each batch has batch_size samples""" - tensor_list = [] - length = tensor.shape[0] - i = 0 - while True: - if (i+1) * batch_size >= length: - tensor_list.append(tensor[i * batch_size: length]) - return tensor_list - tensor_list.append(tensor[i * batch_size: (i+1) * batch_size]) - i += 1 -#================================================================ -########################################################################### -def python_predict_temporal(population, plpData, model_loc, dense, autoencoder): - print("Applying Python Model") - print("Loading Data...") - # load data + train,test indexes + validation index - #y=population[:,1] - - ########################################################################### - # uf dense convert - if dense==1: - print("converting to dense data...") - X = plpData.to_dense().numpy() - if dense==0: - print("keeping data sparse...") - X = plpData.numpy() - ########################################################################### - - # order the data - X = X[np.int64(population[:, 0]), :] - # load index file - print("population loaded- %s rows and %s columns" %(np.shape(population)[0], np.shape(population)[1])) - print("Dataset has %s rows and %s columns" %(X.shape[0], X.shape[1])) - print("Data ready for model has %s features" %(np.shape(X)[1])) - - # load model - print("Loading model...") - if autoencoder: - autoencoder_model = joblib.load(os.path.join(model_loc, 'autoencoder_model.pkl')) - X = autoencoder_model.get_encode_features(X) - modelTrained = joblib.load(os.path.join(model_loc,"model.pkl")) - print("Calculating predictions on population...") - test_batch = batch(X, batch_size = 32) - test_pred = [] - for test in test_batch: - pred_test1 = modelTrained.predict_proba(test)[:, 1] - test_pred = np.concatenate((test_pred , pred_test1), axis = 0) - print("Prediction complete: %s rows" %(np.shape(test_pred)[0])) - print("Mean: %s prediction value" %(np.mean(test_pred))) - # merge pred with population - test_pred.shape = (population.shape[0], 1) - prediction = np.append(population,test_pred, axis=1) - return prediction - -def python_predict(population, plpData, model_loc, dense, autoencoder): - print("Applying Python Model") - print("Loading Data...") - # load data + train,test indexes + validation index - #y=population[:,1] 
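# Illustrative sketch, not part of the deleted file: the predict helpers in this file all
# return the population matrix with the model's risk estimate appended as a final column.
# The merge itself is just a reshape plus a column-wise append:
import numpy as np

population = np.array([[101, 0, 1], [102, 1, 1]])    # toy rowId, outcome, index columns
test_pred = np.array([0.12, 0.87])
test_pred.shape = (population.shape[0], 1)           # column vector, one row per person
prediction = np.append(population, test_pred, axis=1)
# prediction[:, -1] now holds the predicted probabilities for the population rows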
- X = plpData[population[:,0].astype(int),:] - # load index file - print("population loaded- %s rows and %s columns" %(np.shape(population)[0], np.shape(population)[1])) - print("Dataset has %s rows and %s columns" %(X.shape[0], X.shape[1])) - print("Data ready for model has %s features" %(np.shape(X)[1])) - ########################################################################### - # uf dense convert - if dense==1: - print("converting to dense data...") - X=X.toarray() - ########################################################################### - # load model - print("Loading model...") - if autoencoder: - autoencoder_model = joblib.load(os.path.join(model_loc, 'autoencoder_model.pkl')) - X = autoencoder_model.get_encode_features(X) - modelTrained = joblib.load(os.path.join(model_loc,"model.pkl")) - print("Calculating predictions on population...") - test_pred = modelTrained.predict_proba(X)[:, 1] - print("Prediction complete: %s rows" %(np.shape(test_pred)[0])) - print("Mean: %s prediction value" %(np.mean(test_pred))) - # merge pred with population - test_pred.shape = (population.shape[0], 1) - prediction = np.append(population,test_pred, axis=1) - return prediction - - -def python_predict_survival(population, plpData, model_loc): - print("Applying Python Model") - print("Loading Data...") - # load data + train,test indexes + validation index - X = plpData[population[:,0].astype(int),:] - # load index file - print("population loaded- %s rows and %s columns" %(np.shape(population)[0], np.shape(population)[1])) - print("Dataset has %s rows and %s columns" %(X.shape[0], X.shape[1])) - print("Data ready for model has %s features" %(np.shape(X)[1])) - ########################################################################### - # load model - print("Loading model...") - modelTrained = joblib.load(os.path.join(model_loc,"model.pkl")) - print("Calculating predictions on population...") - test_pred = modelTrained.predict(X.toarray()) - test_pred = test_pred.flatten() - rowCount = population.shape[0] - test_pred = test_pred[0:(rowCount)] - print("Prediction complete: %s rows" %(np.shape(test_pred)[0])) - print("Mean: %s prediction value" %(np.mean(test_pred))) - # merge pred with population - test_pred.shape = (population.shape[0], 1) - prediction = np.append(population,test_pred, axis=1) - return prediction - -def python_predict_garden(population, plpData, model_loc,quantile=None): - print("Applying Python Model") - print("Loading Data...") - # load data + train,test indexes + validation index - #y=population[:,1] - X = plpData[population[:,0].astype(int),:] - # load index file - print("population loaded- %s rows and %s columns" %(np.shape(population)[0], np.shape(population)[1])) - print("Dataset has %s rows and %s columns" %(X.shape[0], X.shape[1])) - print("Data ready for model has %s features" %(np.shape(X)[1])) - ########################################################################### - # load model - print("Loading model...") - modelTrained = joblib.load(os.path.join(model_loc,"model.pkl")) - print("Calculating predictions on population...") - test_pred = modelTrained.predict(X,quantile=quantile)[:, 0] - print("Prediction complete: %s rows" %(np.shape(test_pred)[0])) - print("Mean: %s prediction value" %(np.mean(test_pred))) - # merge pred with population - test_pred.shape = (population.shape[0], 1) - prediction = np.append(population,test_pred, axis=1) - return prediction diff --git a/inst/python/python_predict.py b/inst/python/python_predict.py deleted file mode 100644 index 
3a2f132..0000000 --- a/inst/python/python_predict.py +++ /dev/null @@ -1,111 +0,0 @@ -# apply random forest model on new data -#=============================================================== -# INPUT: -# 1) location of new data -# 2) location of model -# -# OUTPUT: -# it returns a file with indexes merged with prediction for test index - named new_pred -#================================================================ -import numpy as np -from collections import OrderedDict -import os -import sys -import timeit -import math -#from sklearn.ensemble import RandomForestClassifier -#from sklearn.naive_bayes import GaussianNB -from scipy.sparse import coo_matrix,csr_matrix,vstack,hstack -#from sklearn.feature_selection import SelectFromModel -#from sklearn.cross_validation import PredefinedSplit -from sklearn.externals.joblib import Memory -#from sklearn.datasets import load_svmlight_file -from sklearn.externals import joblib -if "python_dir" in globals(): - sys.path.insert(0, python_dir) - import TorchUtils as tu -#================================================================ - - -print("Applying Python Model") - -########################################################################### - -def get_temproal_data(covariates, population): - p_ids_in_cov = set(covariates[:, 0]) - timeid_len = len(set(covariates[:, -2])) - full_covariates = np.array([]).reshape(0,4) - default_covid = covariates[0, 1] - for p_id in population[:, 0]: - if p_id not in p_ids_in_cov: - tmp_x = np.array([p_id, default_covid, 1, 0]).reshape(1,4) #default cov id, timeid=1 - full_covariates = np.concatenate((full_covariates, tmp_x), axis=0) - else: - tmp_x = covariates[covariates[:, 0] == p_id, :] - #print tmp_x.shape, X.shape - full_covariates = np.concatenate((full_covariates, tmp_x), axis=0) - - X, patient_keys = tu.convert_to_temporal_format(full_covariates, timeid_len = timeid_len, predict = True) - return X - - -print("Loading Data...") -# load data + train,test indexes + validation index - -y=population[:,1] -#print covariates.shape - -if modeltype == 'temporal': - X = plpData.to_dense().numpy() - X = X[np.int64(population[:, 0]), :] - #X = get_temproal_data(covariates, population) - dense = 0 -else: - #print included - X = plpData[population[:,0],:] - X = X[:,included.flatten()] - -# load index file -print("population loaded- %s rows and %s columns" %(np.shape(population)[0], np.shape(population)[1])) -print("Dataset has %s rows and %s columns" %(X.shape[0], X.shape[1])) -print("Data ready for model has %s features" %(np.shape(X)[1])) - -########################################################################### -# uf dense convert -if dense==1: - print("converting to dense data...") - X=X.toarray() -########################################################################### - -# load model -print("Loading model...") - -modelTrained = joblib.load(os.path.join(model_loc,"model.pkl")) - -print(X.shape) -print("Calculating predictions on population...") - -if autoencoder: - autoencoder_model = joblib.load(os.path.join(model_loc, 'autoencoder_model.pkl')) - X = autoencoder_model.get_encode_features(X) - -if modeltype == 'temporal': - test_batch = tu.batch(X, batch_size = 32) - test_pred = [] - for test in test_batch: - pred_test1 = modelTrained.predict_proba(test)[:, 1] - test_pred = np.concatenate((test_pred , pred_test1), axis = 0) -else: - test_pred = modelTrained.predict_proba(X)[:, 1] - - -if test_pred.ndim != 1: - test_pred = test_pred[:,1] - - -print("Prediction complete: %s rows" %(np.shape(test_pred)[0])) 
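# Illustrative sketch, not part of the deleted file: for temporal models the script above
# scores the population in mini-batches (via tu.batch) and concatenates the per-batch
# probabilities, which keeps memory bounded for large cohorts. The chunking pattern in
# isolation, using a hypothetical chunk_rows helper that mirrors batch():
import numpy as np

def chunk_rows(matrix, batch_size=50):
    # Split the first axis into consecutive chunks of at most batch_size rows.
    return [matrix[i:i + batch_size] for i in range(0, matrix.shape[0], batch_size)]

X = np.random.rand(70, 5)
parts = chunk_rows(X, batch_size=32)                 # three chunks: 32, 32 and 6 rows
assert sum(p.shape[0] for p in parts) == X.shape[0]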
-print("Mean: %s prediction value" %(np.mean(test_pred))) - -# merge pred with population -test_pred.shape = (population.shape[0], 1) -prediction = np.append(population,test_pred, axis=1) diff --git a/inst/sql/sql_server/DomainConceptTemporal.sql b/inst/sql/sql_server/DomainConceptTemporal.sql deleted file mode 100644 index a3ebc2a..0000000 --- a/inst/sql/sql_server/DomainConceptTemporal.sql +++ /dev/null @@ -1,90 +0,0 @@ --- Feature construction -SELECT -CAST(@domain_concept_id AS BIGINT) * 1000 + @analysis_id AS covariate_id, - time_id, - duration, -- does aggregation make sense for this? -{@aggregated} ? { - cohort_definition_id, - COUNT(*) AS sum_value -} : { - row_id, - 1 AS covariate_value -} -INTO @covariate_table -FROM ( - SELECT DISTINCT @domain_concept_id, - FLOOR(DATEDIFF(@time_part, @cdm_database_schema.@domain_table.@domain_start_date, cohort.cohort_start_date)*1.0/@time_interval ) as time_id, - {@aggregated} ? { - max( - } - - CASE WHEN @cdm_database_schema.@domain_table.@domain_end_date <= cohort.cohort_start_date - THEN - DATEDIFF(@time_part, @cdm_database_schema.@domain_table.@domain_start_date, @cdm_database_schema.@domain_table.@domain_end_date)*1.0/@time_interval - ELSE - DATEDIFF(@time_part, @cdm_database_schema.@domain_table.@domain_start_date, cohort.cohort_start_date)*1.0/@time_interval - END - {@aggregated} ? { - ) - } - as duration, - {@aggregated} ? { - cohort_definition_id, - cohort.subject_id, - cohort.cohort_start_date - } : { - cohort.@row_id_field AS row_id - } - FROM @cohort_table cohort - INNER JOIN @cdm_database_schema.@domain_table - ON cohort.subject_id = @domain_table.person_id - - WHERE @domain_start_date <= DATEADD(DAY, @end_day, cohort.cohort_start_date) - AND @domain_concept_id != 0 - - {@sub_type == 'inpatient'} ? { AND condition_type_concept_id IN (38000183, 38000184, 38000199, 38000200)} - {@excluded_concept_table != ''} ? { AND @domain_concept_id NOT IN (SELECT id FROM @excluded_concept_table)} - {@included_concept_table != ''} ? { AND @domain_concept_id IN (SELECT id FROM @included_concept_table)} - {@included_cov_table != ''} ? { AND CAST(@domain_concept_id AS BIGINT) * 1000 + @analysis_id IN (SELECT id FROM @included_cov_table)} - {@cohort_definition_id != -1} ? { AND cohort.cohort_definition_id IN (@cohort_definition_id)} -) by_row_id -{@aggregated} ? { - GROUP BY - cohort_definition_id, - @domain_concept_id, - time_id -} -; - --- Reference construction -INSERT INTO #cov_ref ( -covariate_id, -covariate_name, -analysis_id, -concept_id -) -SELECT covariate_id, - -CAST(CONCAT('@domain_table: ', CASE WHEN concept_name IS NULL THEN 'Unknown concept' ELSE concept_name END {@sub_type == 'inpatient'} ? 
{, ' (inpatient)'}) AS VARCHAR(512)) AS covariate_name, - -@analysis_id AS analysis_id, -CAST((covariate_id - @analysis_id) / 1000 AS INT) AS concept_id -FROM ( - SELECT DISTINCT covariate_id - FROM @covariate_table -) t1 -LEFT JOIN @cdm_database_schema.concept -ON concept_id = CAST((covariate_id - @analysis_id) / 1000 AS INT); - -INSERT INTO #analysis_ref ( -analysis_id, -analysis_name, -domain_id, -is_binary, -missing_means_zero -) -SELECT @analysis_id AS analysis_id, -CAST('@analysis_name' AS VARCHAR(512)) AS analysis_name, -CAST('@domain_id' AS VARCHAR(20)) AS domain_id, -CAST('Y' AS VARCHAR(1)) AS is_binary, -CAST(NULL AS VARCHAR(1)) AS missing_means_zero; \ No newline at end of file diff --git a/man/Dataset.Rd b/man/Dataset.Rd new file mode 100644 index 0000000..eb12468 --- /dev/null +++ b/man/Dataset.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Dataset.R +\name{Dataset} +\alias{Dataset} +\title{A torch dataset} +\usage{ +Dataset(data, labels = NULL, numericalIndex = NULL) +} +\arguments{ +\item{data}{a dataframe like object with the covariates} + +\item{labels}{a dataframe with the labels} + +\item{numericalIndex}{in what column numeric data is in (if any)} +} +\description{ +A torch dataset +} diff --git a/man/DeepPatientLevelPrediction.Rd b/man/DeepPatientLevelPrediction.Rd new file mode 100644 index 0000000..c97f163 --- /dev/null +++ b/man/DeepPatientLevelPrediction.Rd @@ -0,0 +1,9 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/DeepPatientLevelPrediction.R +\docType{package} +\name{DeepPatientLevelPrediction} +\alias{DeepPatientLevelPrediction} +\title{DeepPatientLevelPrediction} +\description{ +A package containing deep learning extensions for developing prediction models using data in the OMOP CDM +} diff --git a/man/EarlyStopping.Rd b/man/EarlyStopping.Rd new file mode 100644 index 0000000..1087705 --- /dev/null +++ b/man/EarlyStopping.Rd @@ -0,0 +1,76 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Estimator.R +\name{EarlyStopping} +\alias{EarlyStopping} +\title{Earlystopping class} +\description{ +Stops training if a loss or metric has stopped improving +} +\section{Methods}{ +\subsection{Public methods}{ +\itemize{ +\item \href{#method-EarlyStopping-new}{\code{EarlyStopping$new()}} +\item \href{#method-EarlyStopping-call}{\code{EarlyStopping$call()}} +\item \href{#method-EarlyStopping-clone}{\code{EarlyStopping$clone()}} +} +} +\if{html}{\out{
    }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-EarlyStopping-new}{}}} +\subsection{Method \code{new()}}{ +Creates a new earlystopping object +\subsection{Usage}{ +\if{html}{\out{
    }}\preformatted{EarlyStopping$new(patience = 3, delta = 0, verbose = TRUE)}\if{html}{\out{
    }} +} + +\subsection{Arguments}{ +\if{html}{\out{
    }} +\describe{ +\item{\code{patience}}{Stop after this number of epochs if loss doesn't improve} + +\item{\code{delta}}{How much does the loss need to improve to count as improvement} + +\item{\code{verbose}}{If information should be printed out} +} +\if{html}{\out{
    }} +} +\subsection{Returns}{ +a new earlystopping object +} +} +\if{html}{\out{
    }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-EarlyStopping-call}{}}} +\subsection{Method \code{call()}}{ +call the earlystopping object and increment a counter if loss is not +improving +\subsection{Usage}{ +\if{html}{\out{
    }}\preformatted{EarlyStopping$call(metric)}\if{html}{\out{
    }} +} + +\subsection{Arguments}{ +\if{html}{\out{
    }} +\describe{ +\item{\code{metric}}{the current metric value} +} +\if{html}{\out{
    }} +} +} +\if{html}{\out{
    }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-EarlyStopping-clone}{}}} +\subsection{Method \code{clone()}}{ +The objects of this class are cloneable with this method. +\subsection{Usage}{ +\if{html}{\out{
    }}\preformatted{EarlyStopping$clone(deep = FALSE)}\if{html}{\out{
    }} +} + +\subsection{Arguments}{ +\if{html}{\out{
    }} +\describe{ +\item{\code{deep}}{Whether to make a deep clone.} +} +\if{html}{\out{
    }} +} +} +} diff --git a/man/Estimator.Rd b/man/Estimator.Rd new file mode 100644 index 0000000..a6fc4bc --- /dev/null +++ b/man/Estimator.Rd @@ -0,0 +1,301 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Estimator.R +\name{Estimator} +\alias{Estimator} +\title{Estimator} +\description{ +A generic R6 class that wraps around a torch nn module and can be used to +fit and predict the model defined in that module. +} +\section{Methods}{ +\subsection{Public methods}{ +\itemize{ +\item \href{#method-Estimator-new}{\code{Estimator$new()}} +\item \href{#method-Estimator-fit}{\code{Estimator$fit()}} +\item \href{#method-Estimator-fitEpoch}{\code{Estimator$fitEpoch()}} +\item \href{#method-Estimator-score}{\code{Estimator$score()}} +\item \href{#method-Estimator-finishFit}{\code{Estimator$finishFit()}} +\item \href{#method-Estimator-fitWholeTrainingSet}{\code{Estimator$fitWholeTrainingSet()}} +\item \href{#method-Estimator-save}{\code{Estimator$save()}} +\item \href{#method-Estimator-predictProba}{\code{Estimator$predictProba()}} +\item \href{#method-Estimator-predict}{\code{Estimator$predict()}} +\item \href{#method-Estimator-batchToDevice}{\code{Estimator$batchToDevice()}} +\item \href{#method-Estimator-itemOrDefaults}{\code{Estimator$itemOrDefaults()}} +\item \href{#method-Estimator-clone}{\code{Estimator$clone()}} +} +} +\if{html}{\out{
    }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Estimator-new}{}}} +\subsection{Method \code{new()}}{ +Creates a new estimator +\subsection{Usage}{ +\if{html}{\out{
    }}\preformatted{Estimator$new( + baseModel, + modelParameters, + fitParameters, + optimizer = torch::optim_adam, + criterion = torch::nn_bce_with_logits_loss, + scheduler = torch::lr_reduce_on_plateau, + device = "cpu", + patience = 4 +)}\if{html}{\out{
    }} +} + +\subsection{Arguments}{ +\if{html}{\out{
    }} +\describe{ +\item{\code{baseModel}}{The torch nn module to use as model} + +\item{\code{modelParameters}}{Parameters to initialize the baseModel} + +\item{\code{fitParameters}}{Parameters required for the estimator fitting} + +\item{\code{optimizer}}{A torch optimizer to use, default is Adam} + +\item{\code{criterion}}{The torch loss function to use, defaults to +binary cross entropy with logits} + +\item{\code{scheduler}}{learning rate scheduler to use} + +\item{\code{device}}{Which device to use for fitting, default is cpu} + +\item{\code{patience}}{Patience to use for early stopping} +} +\if{html}{\out{
    }} +} +} +\if{html}{\out{
    }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Estimator-fit}{}}} +\subsection{Method \code{fit()}}{ +fits the estimator +\subsection{Usage}{ +\if{html}{\out{
    }}\preformatted{Estimator$fit(dataset, testDataset)}\if{html}{\out{
    }} +} + +\subsection{Arguments}{ +\if{html}{\out{
    }} +\describe{ +\item{\code{dataset}}{a torch dataset to use for model fitting} + +\item{\code{testDataset}}{a torch dataset to use for early stopping} +} +\if{html}{\out{
    }} +} +} +\if{html}{\out{
    }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Estimator-fitEpoch}{}}} +\subsection{Method \code{fitEpoch()}}{ +fits estimator for one epoch (one round through the data) +\subsection{Usage}{ +\if{html}{\out{
    }}\preformatted{Estimator$fitEpoch(dataset, batchIndex)}\if{html}{\out{
    }} +} + +\subsection{Arguments}{ +\if{html}{\out{
    }} +\describe{ +\item{\code{dataset}}{torch dataset to use for fitting} + +\item{\code{batchIndex}}{indices of batches} +} +\if{html}{\out{
    }} +} +} +\if{html}{\out{
    }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Estimator-score}{}}} +\subsection{Method \code{score()}}{ +calculates loss and auc after training for one epoch +\subsection{Usage}{ +\if{html}{\out{
    }}\preformatted{Estimator$score(dataset, batchIndex)}\if{html}{\out{
    }} +} + +\subsection{Arguments}{ +\if{html}{\out{
    }} +\describe{ +\item{\code{dataset}}{The torch dataset to use to evaluate loss and auc} + +\item{\code{batchIndex}}{Indices of batches in the dataset} +} +\if{html}{\out{
    }} +} +\subsection{Returns}{ +list with average loss and auc in the dataset +} +} +\if{html}{\out{
    }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Estimator-finishFit}{}}} +\subsection{Method \code{finishFit()}}{ +operations that run when fitting is finished +\subsection{Usage}{ +\if{html}{\out{
    }}\preformatted{Estimator$finishFit(valAUCs, modelStateDict, valLosses, epoch, learnRates)}\if{html}{\out{
    }} +} + +\subsection{Arguments}{ +\if{html}{\out{
    }} +\describe{ +\item{\code{valAUCs}}{validation AUC values} + +\item{\code{modelStateDict}}{fitted model parameters} + +\item{\code{valLosses}}{validation losses} + +\item{\code{epoch}}{list of epochs fit} + +\item{\code{learnRates}}{learning rate sequence used so far} +} +\if{html}{\out{
    }} +} +} +\if{html}{\out{
    }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Estimator-fitWholeTrainingSet}{}}} +\subsection{Method \code{fitWholeTrainingSet()}}{ +Fits whole training set on a specific number of epochs +TODO What happens when learning rate changes per epochs? +Ideally I would copy the learning rate strategy from before +and adjust for different sizes ie more iterations/updates??? +\subsection{Usage}{ +\if{html}{\out{
    }}\preformatted{Estimator$fitWholeTrainingSet(dataset, learnRates = NULL)}\if{html}{\out{
    }} +} + +\subsection{Arguments}{ +\if{html}{\out{
    }} +\describe{ +\item{\code{dataset}}{torch dataset} + +\item{\code{learnRates}}{learnRateSchedule from CV} +} +\if{html}{\out{
    }} +} +} +\if{html}{\out{
    }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Estimator-save}{}}} +\subsection{Method \code{save()}}{ +save model and those parameters needed to reconstruct it +\subsection{Usage}{ +\if{html}{\out{
    }}\preformatted{Estimator$save(path, name)}\if{html}{\out{
    }} +} + +\subsection{Arguments}{ +\if{html}{\out{
    }} +\describe{ +\item{\code{path}}{where to save the model} + +\item{\code{name}}{name of file} +} +\if{html}{\out{
    }} +} +\subsection{Returns}{ +the path to saved model +} +} +\if{html}{\out{
    }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Estimator-predictProba}{}}} +\subsection{Method \code{predictProba()}}{ +predicts and outputs the probabilities +\subsection{Usage}{ +\if{html}{\out{
    }}\preformatted{Estimator$predictProba(dataset)}\if{html}{\out{
    }} +} + +\subsection{Arguments}{ +\if{html}{\out{
    }} +\describe{ +\item{\code{dataset}}{Torch dataset to create predictions for} +} +\if{html}{\out{
    }} +} +\subsection{Returns}{ +predictions as probabilities +} +} +\if{html}{\out{
    }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Estimator-predict}{}}} +\subsection{Method \code{predict()}}{ +predicts and outputs the class +\subsection{Usage}{ +\if{html}{\out{
    }}\preformatted{Estimator$predict(dataset, threshold = NULL)}\if{html}{\out{
    }} +} + +\subsection{Arguments}{ +\if{html}{\out{
    }} +\describe{ +\item{\code{dataset}}{A torch dataset to create predictions for} + +\item{\code{threshold}}{Which threshold to use for predictions} +} +\if{html}{\out{
    }} +} +\subsection{Returns}{ +The predicted class for the data in the dataset +} +} +\if{html}{\out{
    }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Estimator-batchToDevice}{}}} +\subsection{Method \code{batchToDevice()}}{ +sends a batch of data to device +assumes batch includes lists of tensors to arbitrary nested depths +\subsection{Usage}{ +\if{html}{\out{
    }}\preformatted{Estimator$batchToDevice(batch)}\if{html}{\out{
    }} +} + +\subsection{Arguments}{ +\if{html}{\out{
    }} +\describe{ +\item{\code{batch}}{the batch to send, usually a list of torch tensors} +} +\if{html}{\out{
    }} +} +\subsection{Returns}{ +the batch on the required device +} +} +\if{html}{\out{
    }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Estimator-itemOrDefaults}{}}} +\subsection{Method \code{itemOrDefaults()}}{ +select item from list, and if it's null sets a default +\subsection{Usage}{ +\if{html}{\out{
    }}\preformatted{Estimator$itemOrDefaults(list, item, default = NULL)}\if{html}{\out{
    }} +} + +\subsection{Arguments}{ +\if{html}{\out{
    }} +\describe{ +\item{\code{list}}{A list with items} + +\item{\code{item}}{Which list item to retrieve} + +\item{\code{default}}{The value to return if list doesn't have item} +} +\if{html}{\out{
    }} +} +\subsection{Returns}{ +the list item or default +} +} +\if{html}{\out{
    }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Estimator-clone}{}}} +\subsection{Method \code{clone()}}{ +The objects of this class are cloneable with this method. +\subsection{Usage}{ +\if{html}{\out{
    }}\preformatted{Estimator$clone(deep = FALSE)}\if{html}{\out{
    }} +} + +\subsection{Arguments}{ +\if{html}{\out{
    }} +\describe{ +\item{\code{deep}}{Whether to make a deep clone.} +} +\if{html}{\out{
    }} +} +} +} diff --git a/man/fitEstimator.Rd b/man/fitEstimator.Rd new file mode 100644 index 0000000..87b6e39 --- /dev/null +++ b/man/fitEstimator.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Estimator.R +\name{fitEstimator} +\alias{fitEstimator} +\title{fitEstimator} +\usage{ +fitEstimator(trainData, modelSettings, analysisId, ...) +} +\arguments{ +\item{trainData}{the data to use} + +\item{modelSettings}{modelSettings object} + +\item{analysisId}{Id of the analysis} + +\item{...}{Extra inputs} +} +\description{ +fits a deep learning estimator to data. +} diff --git a/man/gridCvDeep.Rd b/man/gridCvDeep.Rd new file mode 100644 index 0000000..4d52d4d --- /dev/null +++ b/man/gridCvDeep.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Estimator.R +\name{gridCvDeep} +\alias{gridCvDeep} +\title{gridCvDeep} +\usage{ +gridCvDeep(mappedData, labels, settings, modelLocation, paramSearch) +} +\arguments{ +\item{mappedData}{Mapped data with covariates} + +\item{labels}{Dataframe with the outcomes} + +\item{settings}{Settings of the model} + +\item{modelLocation}{Where to save the model} + +\item{paramSearch}{model parameters to perform search over} +} +\description{ +Performs grid search for a deep learning estimator +} diff --git a/man/predictDeepEstimator.Rd b/man/predictDeepEstimator.Rd new file mode 100644 index 0000000..31d9e8d --- /dev/null +++ b/man/predictDeepEstimator.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Estimator.R +\name{predictDeepEstimator} +\alias{predictDeepEstimator} +\title{predictDeepEstimator} +\usage{ +predictDeepEstimator(plpModel, data, cohort) +} +\arguments{ +\item{plpModel}{the plpModel} + +\item{data}{plp data object or a torch dataset} + +\item{cohort}{data.frame with the rowIds of the people} +} +\description{ +the prediction function for the estimator +} diff --git a/man/setEstimator.Rd b/man/setEstimator.Rd new file mode 100644 index 0000000..d3001e3 --- /dev/null +++ b/man/setEstimator.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Estimator.R +\name{setEstimator} +\alias{setEstimator} +\title{setEstimator} +\arguments{ +\item{learningRate}{what learning rate to use} + +\item{weightDecay}{what weight_decay to use} + +\item{optimizer}{which optimizer to use} + +\item{scheduler}{which learning rate scheduler to use} + +\item{criterion}{loss function to use} + +\item{posWeight}{If more weight should be added to positive labels during training - will result in miscalibrated models} + +\item{earlyStopping}{If earlyStopping should be used which stops the training of your metric is not improving} + +\item{earlyStoppingMetric}{Which parameter to use for early stopping} + +\item{patience}{patience for earlyStopper} + +\item{hyperparameterMetric}{which metric to use for hyperparameter, loss, auc, auprc or a custom function} +} +\description{ +creates settings for the Estimator, which takes a model and trains it +} diff --git a/man/setMultiLayerPerceptron.Rd b/man/setMultiLayerPerceptron.Rd new file mode 100644 index 0000000..a79ab49 --- /dev/null +++ b/man/setMultiLayerPerceptron.Rd @@ -0,0 +1,52 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/MLP.R +\name{setMultiLayerPerceptron} +\alias{setMultiLayerPerceptron} +\title{setMultiLayerPerceptron} +\usage{ +setMultiLayerPerceptron( + numLayers = c(1:8), + sizeHidden = 
c(2^(6:9)), + dropout = c(seq(0, 0.5, 0.05)), + sizeEmbedding = c(2^(6:9)), + weightDecay = c(1e-06, 0.001), + learningRate = c(0.01, 3e-04, 1e-05), + seed = NULL, + hyperParamSearch = "random", + randomSample = 100, + device = "cpu", + batchSize = 1024, + epochs = 30 +) +} +\arguments{ +\item{numLayers}{Number of layers in network, default: 1:16} + +\item{sizeHidden}{Amount of neurons in each default layer, default: 2^(6:10) (64 to 1024)} + +\item{dropout}{How much dropout to apply after first linear, default: seq(0, 0.3, 0.05)} + +\item{sizeEmbedding}{Size of embedding layer, default: 2^(6:9) (64 to 512)} + +\item{weightDecay}{Weight decay to apply, default: c(1e-6, 1e-3)} + +\item{learningRate}{Learning rate to use. default: c(1e-2, 1e-5)} + +\item{seed}{Seed to use for sampling hyperparameter space} + +\item{hyperParamSearch}{Which kind of hyperparameter search to use random sampling or exhaustive grid search. default: 'random'} + +\item{randomSample}{How many random samples from hyperparameter space to use} + +\item{device}{Which device to run analysis on, either 'cpu' or 'cuda', default: 'cpu'} + +\item{batchSize}{Size of batch, default: 1024} + +\item{epochs}{Number of epochs to run, default: 10} +} +\description{ +Creates settings for a Multilayer perceptron model +} +\details{ +Model architecture +} diff --git a/man/setResNet.Rd b/man/setResNet.Rd new file mode 100644 index 0000000..d0ced0d --- /dev/null +++ b/man/setResNet.Rd @@ -0,0 +1,58 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ResNet.R +\name{setResNet} +\alias{setResNet} +\title{setResNet} +\usage{ +setResNet( + numLayers = c(1:8), + sizeHidden = c(2^(6:9)), + hiddenFactor = c(1:4), + residualDropout = c(seq(0, 0.5, 0.05)), + hiddenDropout = c(seq(0, 0.5, 0.05)), + sizeEmbedding = c(2^(6:9)), + weightDecay = c(1e-06, 0.001), + learningRate = c(0.01, 3e-04, 1e-05), + seed = NULL, + hyperParamSearch = "random", + randomSample = 100, + device = "cpu", + batchSize = 1024, + epochs = 30 +) +} +\arguments{ +\item{numLayers}{Number of layers in network, default: 1:16} + +\item{sizeHidden}{Amount of neurons in each default layer, default: 2^(6:10) (64 to 1024)} + +\item{hiddenFactor}{How much to grow the amount of neurons in each ResLayer, default: 1:4} + +\item{residualDropout}{How much dropout to apply after last linear layer in ResLayer, default: seq(0, 0.3, 0.05)} + +\item{hiddenDropout}{How much dropout to apply after first linear layer in ResLayer, default: seq(0, 0.3, 0.05)} + +\item{sizeEmbedding}{Size of embedding layer, default: 2^(6:9) (64 to 512)} + +\item{weightDecay}{Weight decay to apply, default: c(1e-6, 1e-3)} + +\item{learningRate}{Learning rate to use. default: c(1e-2, 1e-5)} + +\item{seed}{Seed to use for sampling hyperparameter space} + +\item{hyperParamSearch}{Which kind of hyperparameter search to use random sampling or exhaustive grid search. 
default: 'random'} + +\item{randomSample}{How many random samples from hyperparameter space to use} + +\item{device}{Which device to run analysis on, either 'cpu' or 'cuda', default: 'cpu'} + +\item{batchSize}{Size of batch, default: 1024} + +\item{epochs}{Number of epochs to run, default: 10} +} +\description{ +Creates settings for a ResNet model +} +\details{ +Model architecture from by https://arxiv.org/abs/2106.11959 +} diff --git a/man/setTransformer.Rd b/man/setTransformer.Rd new file mode 100644 index 0000000..54927b0 --- /dev/null +++ b/man/setTransformer.Rd @@ -0,0 +1,64 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Transformer.R +\name{setTransformer} +\alias{setTransformer} +\title{create settings for training a non-temporal transformer} +\usage{ +setTransformer( + numBlocks = 3, + dimToken = 96, + dimOut = 1, + numHeads = 8, + attDropout = 0.25, + ffnDropout = 0.25, + resDropout = 0, + dimHidden = 512, + weightDecay = 1e-06, + learningRate = 3e-04, + batchSize = 1024, + epochs = 10, + device = "cpu", + hyperParamSearch = "random", + randomSamples = 100, + seed = NULL +) +} +\arguments{ +\item{numBlocks}{number of transformer blocks} + +\item{dimToken}{dimension of each token (embedding size)} + +\item{dimOut}{dimension of output, usually 1 for binary problems} + +\item{numHeads}{number of attention heads} + +\item{attDropout}{dropout to use on attentions} + +\item{ffnDropout}{dropout to use in feedforward block} + +\item{resDropout}{dropout to use in residual connections} + +\item{dimHidden}{dimension of the feedworward block} + +\item{weightDecay}{weightdecay to use} + +\item{learningRate}{learning rate to use} + +\item{batchSize}{batchSize to use} + +\item{epochs}{How many epochs to run the model for} + +\item{device}{Which device to use, cpu or cuda} + +\item{hyperParamSearch}{what kind of hyperparameter search to do, default 'random'} + +\item{randomSamples}{How many samples to use in hyperparameter search if random} + +\item{seed}{Random seed to use} +} +\description{ +A transformer model +} +\details{ +from https://arxiv.org/abs/2106.11959 +} diff --git a/tests/testthat.R b/tests/testthat.R index 59feab0..fc53f82 100644 --- a/tests/testthat.R +++ b/tests/testthat.R @@ -1,6 +1,4 @@ -#Sys.setenv("R_TESTS" = "") -#options(fftempdir = file.path(getwd(),'fftemp')) library(testthat) library(DeepPatientLevelPrediction) + test_check("DeepPatientLevelPrediction") -unlink('T:/Temp', recursive = T) diff --git a/tests/testthat/setup.R b/tests/testthat/setup.R new file mode 100644 index 0000000..93c9232 --- /dev/null +++ b/tests/testthat/setup.R @@ -0,0 +1,94 @@ +library(PatientLevelPrediction) + +testLoc <- tempdir() + +# get connection and data from Eunomia +connectionDetails <- Eunomia::getEunomiaConnectionDetails() +Eunomia::createCohorts(connectionDetails) + +covSet <- FeatureExtraction::createCovariateSettings( + useDemographicsGender = T, + useDemographicsAge = T, + useDemographicsRace = T, + useDemographicsEthnicity = T, + useDemographicsAgeGroup = T, + useConditionGroupEraLongTerm = T, + useDrugEraStartLongTerm = T, + endDays = -1 +) + + +covSetT <- FeatureExtraction::createTemporalSequenceCovariateSettings( + useDemographicsGender = T, + useDemographicsAge = T, + useDemographicsRace = T, + useDemographicsEthnicity = T, + useDemographicsAgeGroup = T, + useConditionEraGroupStart = T, + useDrugEraStart = T, + timePart = "month", + timeInterval = 1, + sequenceEndDay = -1, + sequenceStartDay = -365 * 5 +) + + +databaseDetails <- 
PatientLevelPrediction::createDatabaseDetails( + connectionDetails = connectionDetails, + cdmDatabaseSchema = "main", + cohortDatabaseSchema = "main", + cohortTable = "cohort", + targetId = 4, + outcomeIds = 3, + outcomeDatabaseSchema = "main", + outcomeTable = "cohort", + cdmDatabaseName = "eunomia" +) + +restrictPlpDataSettings <- PatientLevelPrediction::createRestrictPlpDataSettings( + firstExposureOnly = T, + washoutPeriod = 365 +) + +plpData <- PatientLevelPrediction::getPlpData( + databaseDetails = databaseDetails, + restrictPlpDataSettings = restrictPlpDataSettings, + covariateSettings = covSet +) + + +plpDataT <- PatientLevelPrediction::getPlpData( + databaseDetails = databaseDetails, + restrictPlpDataSettings = restrictPlpDataSettings, + covariateSettings = covSetT +) + + +populationSet <- PatientLevelPrediction::createStudyPopulationSettings( + requireTimeAtRisk = F, + riskWindowStart = 1, + riskWindowEnd = 365 +) + +population <- PatientLevelPrediction::createStudyPopulation( + plpData = plpData, + outcomeId = 3, + populationSettings = populationSet +) + +trainData <- PatientLevelPrediction::splitData( + plpData, + population = population, + splitSettings = PatientLevelPrediction::createDefaultSplitSetting() +) + +mappedData <- PatientLevelPrediction::MapIds( + covariateData = trainData$Train$covariateData, + cohort = trainData$Train$labels +) + +dataset <- Dataset( + data = mappedData$covariates, + labels = trainData$Train$labels$outcomeCount, + numericalIndex = NULL +) diff --git a/tests/testthat/test-Dataset.R b/tests/testthat/test-Dataset.R new file mode 100644 index 0000000..989f886 --- /dev/null +++ b/tests/testthat/test-Dataset.R @@ -0,0 +1,60 @@ +test_that("dataset correct class", { + testthat::expect_true("myDataset" %in% class(dataset)) +}) + +test_that("length of index correct", { + testthat::expect_equal( + length(dataset$getNumericalIndex()), + dplyr::n_distinct(mappedData$covariates %>% dplyr::pull(covariateId)) + ) +}) + +test_that("number of num and cat features sum correctly", { + testthat::expect_equal( + dataset$numNumFeatures() + dataset$numCatFeatures(), + dplyr::n_distinct(mappedData$covariates %>% dplyr::pull(covariateId)) + ) +}) + + +test_that("length of dataset correct", { + expect_equal(length(dataset), dataset$cat$shape[1]) + expect_equal(length(dataset), dataset$num$shape[1]) + expect_equal( + dataset$.length(), + dplyr::n_distinct(mappedData$covariates %>% dplyr::pull(rowId)) + ) +}) + +test_that(".getbatch works", { + batch_size <- 16 + # get one sample + out <- dataset[10] + + # output should be a list of two items, the batch and targets, + # the batch is what goes to the model + expect_equal(length(out), 2) + + # targets should be binary + expect_true(out$target$item() %in% c(0, 1)) + + # shape of batch is correct + expect_equal(length(out$batch), 2) + expect_equal(out$batch$cat$shape[1], 1) + expect_equal(out$batch$num$shape[1], 1) + + # shape of target + expect_equal(out$target$shape[1], 1) + + # get a whole batch + out <- dataset[10:(10 + batch_size - 1)] + + expect_equal(length(out), 2) + expect_true(all(torch::as_array(out$target) %in% c(0, 1))) + + expect_equal(length(out$batch), 2) + expect_equal(out$batch$cat$shape[1], 16) + expect_equal(out$batch$num$shape[1], 16) + + expect_equal(out$target$shape[1], 16) +}) diff --git a/tests/testthat/test-Estimator.R b/tests/testthat/test-Estimator.R new file mode 100644 index 0000000..2f110d3 --- /dev/null +++ b/tests/testthat/test-Estimator.R @@ -0,0 +1,155 @@ +catFeatures <- 
dataset$numCatFeatures() +numFeatures <- dataset$numNumFeatures() + +fitParams <- list() +baseModel <- ResNet + +modelParameters <- list( + catFeatures = catFeatures, + numFeatures = numFeatures, + sizeEmbedding = 16, + sizeHidden = 16, + numLayers = 2, + hiddenFactor = 2 +) + +estimator <- Estimator$new( + baseModel = baseModel, + modelParameters = modelParameters, + fitParameters = fitParams, + device = "cpu" +) + +test_that("Estimator initialization works", { + + # count parameters in both instances + testthat::expect_equal( + sum(sapply(estimator$model$parameters, function(x) prod(x$shape))), + sum(sapply(do.call(baseModel, modelParameters)$parameters, function(x) prod(x$shape))) + ) + + testthat::expect_equal( + estimator$modelParameters, + modelParameters + ) + + # check the function that results the value from a list + val <- estimator$itemOrDefaults(list(param = 1, test = 3), "param", default = NULL) + expect_equal(val, 1) + val <- estimator$itemOrDefaults(list(param = 1, test = 3), "paramater", default = NULL) + expect_true(is.null(val)) +}) +sink(nullfile()) +estimator$fit(dataset, dataset) +sink() + +test_that("estimator fitting works", { + + + # check the fitting + # estimator$fitEpoch(dataset, batchIndex) + # estimator$finishFit(valAUCs, modelStateDict, valLosses, epoch) + # estimator$score(dataset, batchIndex) + expect_true(!is.null(estimator$bestEpoch)) + expect_true(!is.null(estimator$bestScore$loss)) + expect_true(!is.null(estimator$bestScore$auc)) + + old_weights <- estimator$model$head$weight$mean()$item() + + sink(nullfile()) + estimator$fitWholeTrainingSet(dataset, estimator$learnRateSchedule) + sink() + + expect_equal(estimator$optimizer$param_groups[[1]]$lr, tail(estimator$learnRateSchedule, 1)[[1]]) + + new_weights <- estimator$model$head$weight$mean()$item() + + # model should be updated when refitting + expect_true(old_weights != new_weights) + + estimator$save(testLoc, "estimator.pt") + + expect_true(file.exists(file.path(testLoc, "estimator.pt"))) + + preds <- estimator$predictProba(dataset) + + expect_lt(max(preds), 1) + expect_gt(min(preds), 0) + + classes <- estimator$predict(dataset, threshold = 0.5) + expect_equal(all(unique(classes) %in% c(0, 1)), TRUE) + + # not sure how to test: batchToDevice(batch) +}) + +test_that("early stopping works", { + earlyStop <- EarlyStopping$new(patience = 3, delta = 0, verbose = FALSE) + testthat::expect_true(is.null(earlyStop$bestScore)) + testthat::expect_false(earlyStop$earlyStop) + earlyStop$call(0.5) + testthat::expect_equal(earlyStop$bestScore, 0.5) + earlyStop$call(0.4) + earlyStop$call(0.4) + earlyStop$call(0.4) + testthat::expect_true(earlyStop$earlyStop) +}) + +modelSettings <- setResNet( + numLayers = 1, sizeHidden = 16, hiddenFactor = 1, + residualDropout = 0, hiddenDropout = 0, + sizeEmbedding = 16, hyperParamSearch = "random", + randomSample = 1, epochs = 1 +) + +sink(nullfile()) +results <- fitEstimator(trainData$Train, modelSettings = modelSettings, analysisId = 1) +sink() + +test_that("Estimator fit function works", { + expect_true(!is.null(results$trainDetails$trainingTime)) + + expect_equal(class(results), "plpModel") + expect_equal(attr(results, "modelType"), "binary") + expect_equal(attr(results, "saveType"), "file") +}) + +test_that("predictDeepEstimator works", { + + # input is an estimator and a dataset + sink(nullfile()) + predictions <- predictDeepEstimator(estimator, dataset, cohort = trainData$Train$labels) + sink() + + expect_lt(max(predictions$value), 1) + expect_gt(min(predictions$value), 0) + 
expect_equal(nrow(predictions), nrow(trainData$Train$labels)) + + # input is a plpModel and data + sink(nullfile()) + predictions <- predictDeepEstimator( + plpModel = results, data = trainData$Test, + trainData$Test$labels + ) + sink() + expect_lt(max(predictions$value), 1) + expect_gt(min(predictions$value), 0) + expect_equal(nrow(predictions), nrow(trainData$Test$labels)) +}) + +test_that("batchToDevice works", { + # since we can't test moving to gpu there is a fake device called meta + # which we can use to test of the device is updated + estimator$device <- "meta" + b <- 1:10 + batch <- estimator$batchToDevice(dataset[b]) + + devices <- lapply( + lapply(unlist(batch, recursive = TRUE), function(x) x$device), + function(x) x == torch::torch_device(type = "meta") + ) + # test that all are meta + expect_true(all(devices == TRUE)) +}) + + +# cases to add, estimator with early stopping that stops, and estimator without earlystopping diff --git a/tests/testthat/test-MLP.R b/tests/testthat/test-MLP.R new file mode 100644 index 0000000..3bc0d8f --- /dev/null +++ b/tests/testthat/test-MLP.R @@ -0,0 +1,110 @@ + +modelSettings <- setMultiLayerPerceptron( + numLayers = c(2), + sizeHidden = c(32), + dropout = c(0.1), + sizeEmbedding = c(32), + weightDecay = c(1e-6), + learningRate = c(3e-4), + seed = 42, + hyperParamSearch = "random", + randomSample = 1, + batchSize = 128, + epochs = 3 +) + +test_that("setMultiLayerPerceptron works", { + testthat::expect_s3_class(object = modelSettings, class = "modelSettings") + + testthat::expect_equal(modelSettings$fitFunction, "fitEstimator") + + testthat::expect_true(length(modelSettings$param) > 0) +}) + +sink(nullfile()) +results <- tryCatch( + { + PatientLevelPrediction::runPlp( + plpData = plpData, + outcomeId = 3, + modelSettings = modelSettings, + analysisId = "MLP", + analysisName = "Testing Deep Learning", + populationSettings = populationSet, + splitSettings = PatientLevelPrediction::createDefaultSplitSetting(), + sampleSettings = PatientLevelPrediction::createSampleSettings(), # none + featureEngineeringSettings = PatientLevelPrediction::createFeatureEngineeringSettings(), # none + preprocessSettings = PatientLevelPrediction::createPreprocessSettings(), + executeSettings = PatientLevelPrediction::createExecuteSettings( + runSplitData = T, + runSampleData = F, + runfeatureEngineering = F, + runPreprocessData = T, + runModelDevelopment = T, + runCovariateSummary = F + ), + saveDirectory = file.path(testLoc, "MLP") + ) + }, + error = function(e) { + print(e) + return(NULL) + } +) +sink() + +test_that("MLP with runPlp working checks", { + testthat::expect_false(is.null(results)) + + # check structure + testthat::expect_true("prediction" %in% names(results)) + testthat::expect_true("model" %in% names(results)) + testthat::expect_true("covariateSummary" %in% names(results)) + testthat::expect_true("performanceEvaluation" %in% names(results)) + + # check prediction same size as pop + testthat::expect_equal(nrow(results$prediction %>% + dplyr::filter(evaluationType %in% c("Train", "Test"))), nrow(population)) + + # check prediction between 0 and 1 + testthat::expect_gte(min(results$prediction$value), 0) + testthat::expect_lte(max(results$prediction$value), 1) +}) + + +test_that("MLP nn-module works ", { + model <- MLP( + catFeatures = 5, numFeatures = 1, sizeEmbedding = 5, + sizeHidden = 16, numLayers = 1, + activation = torch::nn_relu, + normalization = torch::nn_batch_norm1d, dropout = 0.3, + d_out = 1 + ) + + pars <- sum(sapply(model$parameters, 
function(x) prod(x$shape))) + + # expected number of parameters + expect_equal(pars, 489) + + input <- list() + input$cat <- torch::torch_randint(0, 5, c(10, 5), dtype = torch::torch_long()) + input$num <- torch::torch_randn(10, 1, dtype = torch::torch_float32()) + + + output <- model(input) + + # output is correct shape + expect_equal(output$shape, 10) + + input$num <- NULL + model <- MLP( + catFeatures = 5, numFeatures = 0, sizeEmbedding = 5, + sizeHidden = 16, numLayers = 1, + activation = torch::nn_relu, + normalization = torch::nn_batch_norm1d, dropout = 0.3, + d_out = 1 + ) + output <- model(input) + # model works without numeric variables + expect_equal(output$shape, 10) +}) diff --git a/tests/testthat/test-ResNet.R b/tests/testthat/test-ResNet.R new file mode 100644 index 0000000..cb9a397 --- /dev/null +++ b/tests/testthat/test-ResNet.R @@ -0,0 +1,113 @@ + +resSet <- setResNet( + numLayers = c(2), + sizeHidden = c(32), + hiddenFactor = c(2), + residualDropout = c(0.1), + hiddenDropout = c(0.1), + sizeEmbedding = c(32), + weightDecay = c(1e-6), + learningRate = c(3e-4), + seed = 42, + hyperParamSearch = "random", + randomSample = 1, + # device='cuda:0', + batchSize = 128, + epochs = 1 +) + +test_that("setResNet works", { + testthat::expect_s3_class(object = resSet, class = "modelSettings") + + testthat::expect_equal(resSet$fitFunction, "fitEstimator") + + testthat::expect_true(length(resSet$param) > 0) +}) + +sink(nullfile()) +res2 <- tryCatch( + { + PatientLevelPrediction::runPlp( + plpData = plpData, + outcomeId = 3, + modelSettings = resSet, + analysisId = "ResNet", + analysisName = "Testing Deep Learning", + populationSettings = populationSet, + splitSettings = PatientLevelPrediction::createDefaultSplitSetting(), + sampleSettings = PatientLevelPrediction::createSampleSettings(), # none + featureEngineeringSettings = PatientLevelPrediction::createFeatureEngineeringSettings(), # none + preprocessSettings = PatientLevelPrediction::createPreprocessSettings(), + executeSettings = PatientLevelPrediction::createExecuteSettings( + runSplitData = T, + runSampleData = F, + runfeatureEngineering = F, + runPreprocessData = T, + runModelDevelopment = T, + runCovariateSummary = F + ), + saveDirectory = file.path(testLoc, "Deep") + ) + }, + error = function(e) { + print(e) + return(NULL) + } +) +sink() + +test_that("ResNet with runPlp working checks", { + testthat::expect_false(is.null(res2)) + + # check structure + testthat::expect_true("prediction" %in% names(res2)) + testthat::expect_true("model" %in% names(res2)) + testthat::expect_true("covariateSummary" %in% names(res2)) + testthat::expect_true("performanceEvaluation" %in% names(res2)) + + # check prediction same size as pop + testthat::expect_equal(nrow(res2$prediction %>% + dplyr::filter(evaluationType %in% c("Train", "Test"))), nrow(population)) + + # check prediction between 0 and 1 + testthat::expect_gte(min(res2$prediction$value), 0) + testthat::expect_lte(max(res2$prediction$value), 1) +}) + + +test_that("ResNet nn-module works ", { + model <- ResNet( + catFeatures = 5, numFeatures = 1, sizeEmbedding = 5, + sizeHidden = 16, numLayers = 1, hiddenFactor = 2, + activation = torch::nn_relu, + normalization = torch::nn_batch_norm1d, hiddenDropout = 0.3, + residualDropout = 0.3, d_out = 1 + ) + + pars <- sum(sapply(model$parameters, function(x) prod(x$shape))) + + # expected number of parameters + expect_equal(pars, 1289) + + input <- list() + input$cat <- torch::torch_randint(0, 5, c(10, 5), dtype = torch::torch_long()) + input$num <- 
torch::torch_randn(10, 1, dtype = torch::torch_float32()) + + + output <- model(input) + + # output is correct shape + expect_equal(output$shape, 10) + + input$num <- NULL + model <- ResNet( + catFeatures = 5, numFeatures = 0, sizeEmbedding = 5, + sizeHidden = 16, numLayers = 1, hiddenFactor = 2, + activation = torch::nn_relu, + normalization = torch::nn_batch_norm1d, hiddenDropout = 0.3, + residualDropout = 0.3, d_out = 1 + ) + output <- model(input) + # model works without numeric variables + expect_equal(output$shape, 10) +}) diff --git a/tests/testthat/test-Transformer.R b/tests/testthat/test-Transformer.R new file mode 100644 index 0000000..913e687 --- /dev/null +++ b/tests/testthat/test-Transformer.R @@ -0,0 +1,61 @@ +settings <- setTransformer( + numBlocks = 1, dimToken = 8, dimOut = 1, + numHeads = 2, attDropout = 0.0, ffnDropout = 0.2, + resDropout = 0.0, dimHidden = 32, batchSize = 64, + epochs = 1, randomSamples = 1 +) + +test_that("Transformer settings work", { + testthat::expect_s3_class(object = settings, class = "modelSettings") + testthat::expect_equal(settings$fitFunction, "fitEstimator") + testthat::expect_true(length(settings$param) > 0) + testthat::expect_error(setTransformer( + numBlocks = 1, dimToken = 50, + numHeads = 7 + )) +}) + +test_that("fitEstimator with Transformer works", { + results <- fitEstimator(trainData$Train, settings, analysisId = 1) + + expect_equal(class(results), "plpModel") + expect_equal(attr(results, "modelType"), "binary") + expect_equal(attr(results, "saveType"), "file") + + # check prediction between 0 and 1 + expect_gt(min(results$prediction$value), 0) + expect_lt(max(results$prediction$value), 1) +}) + +test_that("transformer nn-module works", { + model <- Transformer( + catFeatures = 5, numFeatures = 1, numBlocks = 2, + dimToken = 16, numHeads = 2, attDropout = 0, ffnDropout = 0, + resDropout = 0, dimHidden = 32 + ) + + pars <- sum(sapply(model$parameters, function(x) prod(x$shape))) + + # expected number of parameters + expect_equal(pars, 5697) + + input <- list() + input$cat <- torch::torch_randint(0, 5, c(10, 5), dtype = torch::torch_long()) + input$num <- torch::torch_randn(10, 1, dtype = torch::torch_float32()) + + + output <- model(input) + + # output is correct shape, size of batch + expect_equal(output$shape, 10) + + input$num <- NULL + + model <- Transformer( + catFeatures = 5, numFeatures = 0, numBlocks = 2, + dimToken = 16, numHeads = 2, attDropout = 0, ffnDropout = 0, + resDropout = 0, dimHidden = 32 + ) + output <- model(input) + expect_equal(output$shape, 10) +}) diff --git a/tests/testthat/test-keras.R b/tests/testthat/test-keras.R deleted file mode 100644 index 944f5bb..0000000 --- a/tests/testthat/test-keras.R +++ /dev/null @@ -1,137 +0,0 @@ -# Copyright 2020 Observational Health Data Sciences and Informatics -# -# This file is part of PatientLevelPrediction -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -library("testthat") -context("Keras") - -deepSet <- setDeepNN(epochs = 1) - -plpResultDeep <- runPlp(population = population, - plpData = plpData, - modelSettings = deepSet, - savePlpData = F, - savePlpResult = F, - saveEvaluation = F, - savePlpPlots = F, - analysisId = 'deepTest', - saveDirectory = saveLoc) - - - -#TODO: add input checks and test these... -#options(fftempdir = getwd()) - - -test_that("deepNN working checks", { - - # check same structure - testthat::expect_equal(names(plpResultDeep), - names(plpResult)) - - # check prediction same size as pop - testthat::expect_equal(nrow(plpResultDeep$prediction), nrow(population)) - - # check prediction between 0 and 1 - testthat::expect_gte(min(plpResultDeep$prediction$value), 0) - testthat::expect_lte(max(plpResultDeep$prediction$value), 1) - -}) - - -# add temporal data: -CNN1Set <- setCovNN(epochs = 1, kernelSize = 3, batchSize =30) -plpResultCNN1 <- runPlp(population = population2, - plpData = plpData3, - modelSettings = CNN1Set, - savePlpData = F, - savePlpResult = F, - saveEvaluation = F, - savePlpPlots = F, - analysisId = 'cnn1Test', - saveDirectory = saveLoc) - -test_that("covNN working checks", { - - # check same structure - testthat::expect_equal(names(plpResultCNN1), - names(plpResult)) - - # check prediction same size as pop - testthat::expect_equal(nrow(plpResultCNN1$prediction), nrow(population2)) - - # check prediction between 0 and 1 - testthat::expect_gte(min(plpResultCNN1$prediction$value), 0) - testthat::expect_lte(max(plpResultCNN1$prediction$value), 1) - -}) - -CNN2Set <- setCovNN2(epochs = 1, kernelSize = 4, filters=4) -plpResultCNN2 <- runPlp(population = population2, - plpData = plpData3, - modelSettings = CNN2Set, - savePlpData = F, - savePlpResult = F, - saveEvaluation = F, - savePlpPlots = F, - analysisId = 'cnn1Test', - saveDirectory = saveLoc) - -test_that("covNN2 working checks", { - - # check same structure - testthat::expect_equal(names(plpResultCNN2), - names(plpResult)) - - # check prediction same size as pop - testthat::expect_equal(nrow(plpResultCNN2$prediction), nrow(population2)) - - # check prediction between 0 and 1 - testthat::expect_gte(min(plpResultCNN2$prediction$value), 0) - testthat::expect_lte(max(plpResultCNN2$prediction$value), 1) - -}) - - -if(!travis){ -CIReNNSet <- setCIReNN(epochs = 1, useVae = F, units=c(10) ) -plpResultCIReNN <- runPlp(population = population2, plpData = plpData3, - minCovariateFraction = 0.001, normalizeData = F, - modelSettings = CIReNNSet, testSplit = 'person', - testFraction = 0.25, splitSeed = 1, - nfold = 3, savePlpData = F, savePlpResult = F, - savePlpPlots = F, - saveEvaluation = F, - analysisId = 'cireNNTest', - saveDirectory = saveLoc) -} - -test_that("CIReNN working checks", { - if(travis){ - skip("Too slow for travis") - } - # check same structure - testthat::expect_equal(names(plpResultCIReNN), - names(plpResult)) - - # check prediction same size as pop - testthat::expect_equal(nrow(plpResultCIReNN$prediction), nrow(population2)) - - # check prediction between 0 and 1 - testthat::expect_gte(min(plpResultCIReNN$prediction$value), 0) - testthat::expect_lte(max(plpResultCIReNN$prediction$value), 1) - -}) - diff --git a/tests/testthat/test-torch.R b/tests/testthat/test-torch.R deleted file mode 100644 index ab732d2..0000000 --- a/tests/testthat/test-torch.R +++ /dev/null @@ -1,123 +0,0 @@ -# Copyright 2020 Observational Health Data Sciences and Informatics -# -# This file is part of PatientLevelPrediction -# -# Licensed under the Apache License, Version 
2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -library("testthat") -context("Torch") - -lrtSet <- setLRTorch(w_decay=0.0005, - epochs=1, - class_weight = 0, - autoencoder = FALSE, - vae =FALSE) - - -plpResultLrt <- runPlp(population = population, - plpData = plpData, - modelSettings = lrtSet, - savePlpData = F, - savePlpResult = F, - saveEvaluation = F, - savePlpPlots = F, - analysisId = 'lrtTest', - saveDirectory = saveLoc) - - - -#TODO: add input checks and test these... -#options(fftempdir = getwd()) - - -test_that("torch LR working checks", { - - # check same structure - testthat::expect_equal(names(plpResultLrt), - names(plpResult)) - - # check prediction same size as pop - testthat::expect_equal(nrow(plpResultLrt$prediction), nrow(population)) - - # check prediction between 0 and 1 - testthat::expect_gte(min(plpResultLrt$prediction$value), 0) - testthat::expect_lte(max(plpResultLrt$prediction$value), 1) - -}) - -mlptSet <- setMLPTorch(size = 10, - w_decay = 0.001, - epochs = 1, - autoencode = F) - - -plpResultMlpt <- runPlp(population = population, - plpData = plpData, - modelSettings = mlptSet, - savePlpData = F, - savePlpResult = F, - saveEvaluation = F, - savePlpPlots = F, - analysisId = 'mlptTest', - saveDirectory = saveLoc) - -test_that("MLP LR working checks", { - - # check same structure - testthat::expect_equal(names(plpResultMlpt), - names(plpResult)) - - # check prediction same size as pop - testthat::expect_equal(nrow(plpResultMlpt$prediction), nrow(population)) - - # check prediction between 0 and 1 - testthat::expect_gte(min(plpResultMlpt$prediction$value), 0) - testthat::expect_lte(max(plpResultMlpt$prediction$value), 1) - -}) - - -# add temporal data: -if(!travis){ -RNNTSet <- setRNNTorch(hidden_size = 1, - epochs =1) -plpResultRNNT <- runPlp(population = population2, - plpData = plpData3, - modelSettings = RNNTSet, - savePlpData = F, - savePlpResult = F, - saveEvaluation = F, - savePlpPlots = F, - analysisId = 'rnntTest', - saveDirectory = saveLoc) -} - -test_that("RNN Torch working checks", { - if(travis){ - skip("Too slow for travis") - } - # check same structure - testthat::expect_equal(names(plpResultRNNT), - names(plpResult)) - - # check prediction same size as pop - testthat::expect_equal(nrow(plpResultRNNT$prediction), nrow(population2)) - - # check prediction between 0 and 1 - testthat::expect_gte(min(plpResultRNNT$prediction$value), 0) - testthat::expect_lte(max(plpResultRNNT$prediction$value), 1) - -}) - -# add CNN when it is fixed - diff --git a/vignettes/.gitignore b/vignettes/.gitignore new file mode 100644 index 0000000..397b4a7 --- /dev/null +++ b/vignettes/.gitignore @@ -0,0 +1 @@ +*.log diff --git a/vignettes/BuildingDeepModels.Rmd b/vignettes/BuildingDeepModels.Rmd new file mode 100644 index 0000000..8ac9925 --- /dev/null +++ b/vignettes/BuildingDeepModels.Rmd @@ -0,0 +1,498 @@ +--- +title: "Building Deep Learning Models" +author: "Jenna Reps, Egill Fridgeirsson, Chungsoo Kim, Henrik John, Seng Chan You, Xiaoyong Pan" +date: '`r Sys.Date()`' +header-includes: + - \usepackage{fancyhdr} + - 
\pagestyle{fancy}
+ - \fancyhead{}
+ - \fancyfoot[LE,RO]{\thepage}
+ - \renewcommand{\headrulewidth}{0.4pt}
+ - \renewcommand{\footrulewidth}{0.4pt}
+ - \fancyfoot[CO,CE]{PatientLevelPrediction Package Version `r utils::packageVersion("PatientLevelPrediction")`}
+ - \fancyfoot[CO,CE]{DeepPatientLevelPrediction Package Version `r utils::packageVersion("DeepPatientLevelPrediction")`}
+output:
+  pdf_document:
+    includes:
+      in_header: preamble.tex
+    number_sections: yes
+    toc: yes
+  word_document:
+    toc: yes
+  html_document:
+    number_sections: yes
+    toc: yes
+editor_options:
+  markdown:
+    wrap: 72
+---
+
+```{=html}
+
+```
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+```
+
+# Introduction
+
+## DeepPatientLevelPrediction
+
+Patient level prediction aims to use historic data to learn a function
+between an input (a patient's features such as age/gender/comorbidities
+at index) and an output (whether the patient experienced an outcome
+during some time-at-risk). Deep learning is an example of the current
+state-of-the-art classifiers that can be implemented to learn the
+function between inputs and outputs.
+
+Deep Learning models are widely used to automatically learn high-level
+feature representations from the data, and have achieved remarkable
+results in image processing, speech recognition and computational
+biology. Recently, interesting results have been shown using large
+observational healthcare data (e.g., electronic healthcare data or
+claims data), but more extensive research is needed to assess the power
+of Deep Learning in this domain.
+
+This vignette describes how you can use the Observational Health Data
+Sciences and Informatics (OHDSI)
+[`PatientLevelPrediction`](http://github.com/OHDSI/PatientLevelPrediction)
+package and
+[`DeepPatientLevelPrediction`](http://github.com/OHDSI/DeepPatientLevelPrediction)
+package to build Deep Learning models. This vignette assumes you have
+read and are comfortable with building patient level prediction models
+as described in the [`BuildingPredictiveModels`
+vignette](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/BuildingPredictiveModels.pdf).
+Furthermore, this vignette assumes you are familiar with Deep Learning
+methods.
+
+## Background
+
+Deep Learning models are built by stacking an often large number of
+neural network layers that perform feature engineering steps, e.g.
+embedding, and are collapsed in a final linear layer (equivalent to
+logistic regression). These algorithms need a lot of data to converge to
+a good representation, and the large observational healthcare databases
+are growing fast, which makes Deep Learning an interesting approach to
+test within OHDSI's
+[Patient-Level Prediction
+Framework](https://academic.oup.com/jamia/article/25/8/969/4989437). The
+current implementation allows us to perform research at scale on the
+value and limitations of Deep Learning using observational healthcare
+data.
+
+In the package we have used
+[torch](https://cran.r-project.org/web/packages/torch/index.html) but we
+invite the community to add other backends.
+
+Many network architectures have recently been proposed and we have
+implemented a number of them; this list will grow in the near
+future. It is important to understand that some of these architectures
+require a 2D data matrix, i.e. \|patient\|x\|feature\|, and others use a
+3D data matrix \|patient\|x\|feature\|x\|time\|. The
+[FeatureExtraction Package](https://github.com/OHDSI/FeatureExtraction)
+has been extended to enable the extraction of both data formats as will
+be described with examples below.
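+
+For illustration, the two formats can be extracted with covariate settings
+along the following lines (the argument values below are only an example,
+mirroring the settings used in this package's test setup):
+
+```{r, eval=FALSE}
+# 2D (non-temporal) data: one value per patient and feature
+covSettings <- FeatureExtraction::createCovariateSettings(
+  useDemographicsGender = TRUE,
+  useDemographicsAgeGroup = TRUE,
+  useConditionGroupEraLongTerm = TRUE,
+  useDrugEraStartLongTerm = TRUE,
+  endDays = -1
+)
+
+# 3D (temporal) data: one value per patient, feature and time interval
+covSettingsTemporal <- FeatureExtraction::createTemporalSequenceCovariateSettings(
+  useDemographicsGender = TRUE,
+  useDemographicsAgeGroup = TRUE,
+  useConditionEraGroupStart = TRUE,
+  useDrugEraStart = TRUE,
+  timePart = 'month',
+  timeInterval = 1,
+  sequenceStartDay = -365 * 5,
+  sequenceEndDay = -1
+)
+```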
+
+Note that training Deep Learning models is computationally intensive;
+our implementation therefore supports both GPU and CPU. The package will
+automatically check whether a GPU is available on your computer. A GPU
+is highly recommended for Deep Learning!
+
+## Requirements
+
+Full details about the package requirements and instructions on
+installing the package can be found
+[here](https://ohdsi.github.io/DeepPatientLevelPrediction/articles/Installing.html).
+
+## Integration with PatientLevelPrediction
+
+The `DeepPatientLevelPrediction` package provides additional model
+settings that can be used within the `PatientLevelPrediction` package
+`runPlp()` function. To use both packages you first need to pick the
+deep learning architecture you wish to fit (see below) and then you
+specify this as the `modelSettings` inside `runPlp()`.
+
+```{r, eval=FALSE}
+
+# load the data
+plpData <- PatientLevelPrediction::loadPlpData('locationOfData')
+
+# pick the set from DeepPatientLevelPrediction
+deepLearningModel <- DeepPatientLevelPrediction::setResNet()
+
+# use PatientLevelPrediction to fit model
+deepLearningResult <- PatientLevelPrediction::runPlp(
+  plpData = plpData,
+  outcomeId = 1230,
+  modelSettings = deepLearningModel,
+  analysisId = 'resNetTorch',
+  ...
+)
+
+```
+
+# Non-Temporal Architectures
+
+We implemented the following non-temporal (2D data matrix)
+architectures:
+
+## Simple MLP
+
+### Overall concept
+
+A multilayer perceptron (MLP) model is a directed graph consisting of an
+input layer, one or more hidden layers and an output layer. The model
+takes in the input feature values and feeds these forward through the
+graph to determine the output class. A process known as
+'backpropagation' is used to train the model. Backpropagation requires
+labelled data and involves automatically calculating the derivative of
+the model parameters with respect to the error between the model's
+predictions and the ground truth. The model then learns how to adjust
+its parameters to reduce the error.
+
+### Example
+
+#### Set Function
+
+To use the package to fit an MLP model you can use the `setMultiLayerPerceptron()`
+function to specify the hyperparameter settings for the MLP.
+
+#### Inputs
+
+The `numLayers` and `sizeHidden` inputs define the network topology via the number
+of layers and the number of neurons in each of the network's hidden layers.
+
+The `dropout` input specifies the probability that a layer
+randomly sets some of its inputs to 0 at each step during training time. A
+value of `0.2` means that 20% of the layer's inputs will be set to
+0. This is used to reduce overfitting.
+
+The `sizeEmbedding` input specifies the size of the embedding used. The first
+layer is an embedding layer which converts each sparse feature to a dense vector
+which it learns. An embedding is a lower dimensional projection of the features
+where distance between points is a measure of similarity.
+
+The `weightDecay` input corresponds to the weight decay in the objective
+function. During model fitting the aim is to minimize the objective
+function. The objective function is made up of the prediction error (the
+difference between the prediction and the truth) plus the square of the
+weights multiplied by the weight decay. The larger the weight decay, the
+more you penalize having large weights. If you set the weight decay too
+large, the model will never fit well enough; if you set it too low, you
+need to be careful of overfitting (so try to stop model fitting
+earlier).
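+
+Informally, the objective that is minimized during fitting can be written as
+
+$$\text{objective}(w) = \text{error}\big(\text{predictions}(w), \text{truth}\big) + \lambda \sum_j w_j^2,$$
+
+where $w$ are the network weights and $\lambda$ corresponds to the
+`weightDecay` input.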
+
+The `learningRate` input is the learning rate, a hyperparameter that
+controls how much to change the model in response to the estimated error
+each time the model weights are updated. The smaller the `learningRate`, the longer
+it will take to fit the model and the model weights may get stuck, but
+if the `learningRate` is too large, the weights may converge too fast to a
+sub-optimal solution.
+
+The `seed` lets the user use the same random initialization of the network's
+weights as a previous run.
+
+The `hyperParamSearch` chooses the strategy to find the best hyperparameters.
+Currently a random search and grid search are supported. Grid search searches
+every possible combination of hyperparameters while random search samples
+randomly from the combinations. Since neural networks can be very flexible and
+have many hyperparameter combinations, it's almost never feasible to do a full
+grid search unless the network is really small.
+
+The `randomSample` chooses how many random samples to use.
+
+The `device` specifies what device to use, either `cpu` or `cuda`; if you
+have multiple GPUs, use `cuda:x` where x is the GPU number as seen in `nvidia-smi`.
+
+The `batchSize` corresponds to the number of data points (patients)
+used per iteration to estimate the network error during model fitting.
+
+The `epochs` corresponds to how many times to run through the entire
+training data while fitting the model.
+
+#### Example Code
+
+For example, the following code will try 10 different network
+configurations sampled from the possible combinations given and pick the one
+that obtains the greatest AUROC via cross validation in the training data and
+then fit the model with that configuration using all the training data. The
+standard output of `runPlp()` will be returned - this contains the MLP model
+along with the performance details and settings. Note that there are
+2 x 2 x 2 x 2 = 16 possible combinations, but setting `randomSample = 10`
+means only 10 of them are tried.
+
+```{r, eval=FALSE}
+
+modelSettings <- setMultiLayerPerceptron(
+  numLayers = c(3, 5),
+  sizeHidden = c(64, 128),
+  dropout = c(0.2),
+  sizeEmbedding = c(32, 64),
+  learningRate = c(1e-3, 1e-4),
+  weightDecay = c(1e-5),
+  randomSample = 10,
+  batchSize = c(100),
+  epochs = c(5),
+  seed = 12
+)
+
+mlpResult <- PatientLevelPrediction::runPlp(
+  plpData = plpData,
+  outcomeId = 3,
+  modelSettings = modelSettings,
+  analysisId = 'MLP',
+  analysisName = 'Testing Deep Learning',
+  populationSettings = populationSet,
+  splitSettings = PatientLevelPrediction::createDefaultSplitSetting(),
+  sampleSettings = PatientLevelPrediction::createSampleSettings(), # none
+  featureEngineeringSettings = PatientLevelPrediction::createFeatureEngineeringSettings(), # none
+  preprocessSettings = PatientLevelPrediction::createPreprocessSettings(),
+  executeSettings = PatientLevelPrediction::createExecuteSettings(
+    runSplitData = T,
+    runSampleData = F,
+    runfeatureEngineering = F,
+    runPreprocessData = T,
+    runModelDevelopment = T,
+    runCovariateSummary = F
+  ),
+  saveDirectory = file.path(getwd(), 'MLP') # change to save elsewhere
+)
+
+```
+
+## ResNet
+
+### Overall concept
+
+Deep learning models are often trained via a process known as gradient
+descent during backpropagation. During this process the network weights
+are updated based on the gradient of the error function for the current
+weights. However, as the number of layers in the network increases, there
+is a greater chance of experiencing an issue known as the vanishing or
+exploding gradient during this process. The vanishing or exploding
+gradient is when the gradient goes to 0 or infinity, which negatively
+impacts the model fitting.
+
+The residual network (ResNet) was introduced to address the vanishing or
+exploding gradient issue. It works by adding connections between
+non-adjacent layers, termed 'skip connections'. Using some form of
+regularization with these 'skip connections' enables the network to
+ignore any problematic layer that results from gradient issues.
+
+The ResNet calculates embeddings for every feature and then averages
+them to compute an embedding per patient.
+
+This implementation of a ResNet for tabular data is based on [this
+paper](https://arxiv.org/abs/2106.11959).
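+
+To make the skip connection idea concrete, a single residual layer can be
+sketched with `torch` roughly as follows. This is a conceptual sketch only;
+the actual layer used by `setResNet()` may differ in details such as where
+normalization and dropout are applied:
+
+```{r, eval=FALSE}
+library(torch)
+
+# conceptual sketch of one residual layer, not the exact module in this package
+resLayer <- nn_module(
+  "ResLayer",
+  initialize = function(sizeHidden, hiddenFactor, hiddenDropout, residualDropout) {
+    self$norm <- nn_batch_norm1d(sizeHidden)
+    # the hidden part first grows and then shrinks the representation
+    self$linear1 <- nn_linear(sizeHidden, sizeHidden * hiddenFactor)
+    self$linear2 <- nn_linear(sizeHidden * hiddenFactor, sizeHidden)
+    self$hiddenDropout <- nn_dropout(hiddenDropout)
+    self$residualDropout <- nn_dropout(residualDropout)
+  },
+  forward = function(x) {
+    z <- self$norm(x)
+    z <- self$hiddenDropout(torch_relu(self$linear1(z)))
+    z <- self$residualDropout(self$linear2(z))
+    x + z # skip connection: the input is added back to the transformed output
+  }
+)
+
+# a layer like this keeps input and output dimensions equal, so several can be stacked
+layer <- resLayer(sizeHidden = 32, hiddenFactor = 2,
+                  hiddenDropout = 0.1, residualDropout = 0.1)
+out <- layer(torch_randn(10, 32))
+```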
However, as the number of layers in the network increases, there
+is a greater chance of experiencing an issue known as the vanishing or
+exploding gradient during this process. The vanishing or exploding
+gradient is when the gradient goes to 0 or infinity, which negatively
+impacts the model fitting.
+
+The residual network (ResNet) was introduced to address the vanishing or
+exploding gradient issue. It works by adding connections between
+non-adjacent layers, termed 'skip connections'. Using some form of
+regularization with these 'skip connections' enables the network to
+ignore any problematic layers that arise due to gradient issues.
+
+The ResNet calculates embeddings for every feature and then averages
+them to compute an embedding per patient.
+
+This implementation of a ResNet for tabular data is based on [this
+paper](https://arxiv.org/abs/2106.11959).
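+
+To make the idea of a skip connection concrete, below is a minimal,
+simplified sketch of a residual block written with torch for R. The module
+name `res_block` and its single linear layer are illustrative assumptions and
+do not reflect the package's internal implementation; the key point is that
+the block's input is added back to its output, giving gradients a direct path
+around the transformation.
+
+```{r, eval=FALSE}
+library(torch)
+
+res_block <- nn_module(
+  "ResBlockSketch",
+  initialize = function(sizeHidden, dropout = 0.1) {
+    self$norm <- nn_batch_norm1d(sizeHidden)       # normalize the layer inputs
+    self$linear <- nn_linear(sizeHidden, sizeHidden)
+    self$dropout <- nn_dropout(p = dropout)
+  },
+  forward = function(x) {
+    # 'x +' is the skip connection: the untouched input bypasses the layer
+    x + self$dropout(self$linear(nnf_relu(self$norm(x))))
+  }
+)
+
+block <- res_block(sizeHidden = 8)
+block(torch_randn(4, 8))   # 4 patients, 8 hidden features in and out
+```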
+
+### Example
+
+#### Set Function
+
+To use the package to fit a ResNet model you can use the `setResNet()`
+function to specify the hyperparameter settings for the network.
+
+#### Inputs
+
+##### Model inputs:
+
+`numLayers`: How many layers to use in the model.
+
+`sizeHidden`: How many neurons in each hidden layer
+
+`hiddenFactor`: By what factor to increase the number of neurons within each layer
+
+`residualDropout` and `hiddenDropout`: How much dropout to apply in the
+hidden layer or residual connection
+
+`sizeEmbedding`: The size of the initial embedding layer
+
+##### Training process inputs:
+
+`weightDecay`: How much weight decay to apply, which penalizes bigger
+weights
+
+`learningRate`: Which learning rate to use
+
+`seed`: Use a seed for reproducibility
+
+`device`: Which device to use, such as a cpu or a gpu
+
+`batchSize`: Size of the batch of data used per iteration during training
+
+`epochs`: How many passes through the training data
+
+##### Hyperparameter tuning inputs:
+
+`hyperParamSearch`: Which type of hyperparameter search to use, either
+random sampling or exhaustive (grid) search
+
+`randomSample`: If doing a random search for hyperparameters, how many
+random samples to use
+
+#### Example Code
+
+For example, the following code will fit a two-layer ResNet where each
+layer has 32 neurons, which increases by a factor of two before
+decreasing again (hiddenFactor). 10% of inputs to each layer and
+residual connection within the layer are randomly zeroed. The embedding
+layer has 32 neurons. A learning rate of 3e-4 with a weight decay of 1e-6
+is used for the optimizer. No hyperparameter search is done since each
+input only includes one option.
+
+```{r, eval=FALSE}
+
+resset <- setResNet(
+  numLayers = c(2),
+  sizeHidden = c(32),
+  hiddenFactor = c(2),
+  residualDropout = c(0.1),
+  hiddenDropout = c(0.1),
+  sizeEmbedding = c(32),
+  weightDecay = c(1e-6),
+  learningRate = c(3e-4),
+  seed = 42,
+  hyperParamSearch = 'random',
+  randomSample = 1,
+  #device='cuda:0', # uncomment to use GPU
+  batchSize = 128,
+  epochs = 3
+)
+
+resResult <- PatientLevelPrediction::runPlp(
+    plpData = plpData,
+    outcomeId = 3,
+    modelSettings = resset,
+    analysisId = 'ResNet',
+    analysisName = 'Testing ResNet',
+    populationSettings = populationSet,
+    splitSettings = PatientLevelPrediction::createDefaultSplitSetting(),
+    sampleSettings = PatientLevelPrediction::createSampleSettings(),
+    featureEngineeringSettings = PatientLevelPrediction::createFeatureEngineeringSettings(),
+    preprocessSettings = PatientLevelPrediction::createPreprocessSettings(),
+    executeSettings = PatientLevelPrediction::createExecuteSettings(
+      runSplitData = T,
+      runSampleData = F,
+      runfeatureEngineering = F,
+      runPreprocessData = T,
+      runModelDevelopment = T,
+      runCovariateSummary = F
+    ),
+    saveDirectory = file.path(getwd(), 'ResNet') # change to save elsewhere
+  )
+
+```
+
+## Transformer
+
+### Overall concept
+
+Recently there has been a surge of models in natural language processing
+and computer vision that utilize attention. This is a technique where
+the model learns where to look and what to focus on in the input data.
+This was first described in the 'Attention Is All You Need'
+[paper](https://arxiv.org/abs/1706.03762). Here we have used an
+implementation that has shown good performance on non-temporal tabular
+data from this [paper](https://arxiv.org/abs/2106.11959).
+
+This architecture is computationally expensive and scales badly with
+longer sequence lengths. In this case the sequence is the number of
+features each patient has. Users need to be aware of how many features
+they are feeding to the model, since this heavily affects the computation
+time. This is something you control in `FeatureExtraction` when
+you create your covariate settings.
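+
+To see where this cost comes from, below is a minimal sketch of scaled
+dot-product attention over one patient's features using torch for R. The
+variable names and sizes (`nFeatures`, `dimToken`) are illustrative
+assumptions, not the package's internals; the point is that the score matrix
+is `nFeatures` x `nFeatures`, so doubling the number of features per patient
+roughly quadruples this part of the computation.
+
+```{r, eval=FALSE}
+library(torch)
+
+nFeatures <- 50   # non-zero covariates for one patient
+dimToken <- 32    # embedding dimension per feature
+
+q <- torch_randn(nFeatures, dimToken)  # queries
+k <- torch_randn(nFeatures, dimToken)  # keys
+v <- torch_randn(nFeatures, dimToken)  # values
+
+scores <- torch_matmul(q, k$t()) / sqrt(dimToken)  # nFeatures x nFeatures
+weights <- nnf_softmax(scores, dim = 2)            # each row sums to 1
+out <- torch_matmul(weights, v)                    # attended representations
+```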
+
+### Examples
+
+#### Set Function
+
+To use the package to fit a Transformer model you can use the
+`setTransformer()` function to specify the hyperparameter settings for
+the network.
+
+#### Inputs
+
+The training and hyperparameter tuning inputs are the same as for the
+ResNet.
+
+##### Model inputs:
+
+`numBlocks`: How many Transformer blocks to use, each block includes a
+self-attention layer and a feedforward block with two linear layers.
+
+`dimToken`: Dimension of the embedding for each feature
+
+`dimOut`: Dimension of the output; for binary problems this is 1.
+
+`numHeads`: Number of attention heads for the self-attention
+
+`attDropout`, `ffnDropout` and `resDropout`: How much dropout to apply
+on the attention, in the feedforward block, or in the residual connections
+
+`dimHidden`: How many neurons in the linear layers inside the feedforward
+block
+
+#### Example Code
+
+```{r, eval=FALSE}
+
+modelSettings <- setTransformer(numBlocks = 3,
+                                dimToken = 32,
+                                dimOut = 1,
+                                numHeads = 4,
+                                attDropout = 0.25,
+                                ffnDropout = 0.25,
+                                resDropout = 0,
+                                dimHidden = 128,
+                                weightDecay = 1e-06,
+                                learningRate = 3e-04,
+                                batchSize = 128,
+                                epochs = 10,
+                                device = 'cpu', # or 'cuda' for GPU
+                                randomSample = 1)
+
+
+TransformerResult <- PatientLevelPrediction::runPlp(
+    plpData = plpData,
+    outcomeId = 3,
+    modelSettings = modelSettings,
+    analysisId = 'Transformer',
+    analysisName = 'Testing transformer',
+    populationSettings = populationSet,
+    splitSettings = PatientLevelPrediction::createDefaultSplitSetting(),
+    sampleSettings = PatientLevelPrediction::createSampleSettings(), # none
+    featureEngineeringSettings = PatientLevelPrediction::createFeatureEngineeringSettings(), # none
+    preprocessSettings = PatientLevelPrediction::createPreprocessSettings(),
+    executeSettings = PatientLevelPrediction::createExecuteSettings(
+      runSplitData = T,
+      runSampleData = F,
+      runfeatureEngineering = F,
+      runPreprocessData = T,
+      runModelDevelopment = T,
+      runCovariateSummary = F
+    ),
+    saveDirectory = file.path(getwd(), 'Transformer') # change to save elsewhere
+  )
+```
+
+# Acknowledgments
+
+Considerable work has been dedicated to providing the
+`DeepPatientLevelPrediction` package.
+
+```{r tidy=TRUE,eval=TRUE}
+citation("DeepPatientLevelPrediction")
+```
+
+**Please reference this paper if you use the PLP Package in your work:**
+
+[Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and
+implementation of a standardized framework to generate and evaluate
+patient-level prediction models using observational healthcare data. J
+Am Med Inform Assoc.
+2018;25(8):969-975.](http://dx.doi.org/10.1093/jamia/ocy032)
diff --git a/vignettes/Installing.Rmd b/vignettes/Installing.Rmd
new file mode 100644
index 0000000..7946f76
--- /dev/null
+++ b/vignettes/Installing.Rmd
@@ -0,0 +1,149 @@
+---
+title: "DeepPatientLevelPrediction Installation Guide"
+author: "Egill Fridgeirsson"
+date: '`r Sys.Date()`'
+header-includes:
+  - \usepackage{fancyhdr}
+  - \pagestyle{fancy}
+  - \fancyhead{}
+  - \fancyfoot[CO,CE]{PatientLevelPrediction Package Version `r utils::packageVersion("PatientLevelPrediction")`}
+  - \fancyfoot[CO,CE]{DeepPatientLevelPrediction Package Version `r utils::packageVersion("DeepPatientLevelPrediction")`}
+  - \fancyfoot[LE,RO]{\thepage}
+  - \renewcommand{\headrulewidth}{0.4pt}
+  - \renewcommand{\footrulewidth}{0.4pt}
+output:
+  pdf_document:
+    includes:
+      in_header: preamble.tex
+    number_sections: yes
+    toc: yes
+  word_document:
+    toc: yes
+  html_document:
+    number_sections: yes
+    toc: yes
+---
+
+```{=html}
+
+```
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+```
+
+# Introduction
+
+This vignette describes how to install the Observational Health Data Sciences and Informatics (OHDSI) DeepPatientLevelPrediction package under Windows, Mac and Linux.
+
+# Software Prerequisites
+
+## Windows Users
+
+Under Windows the OHDSI Deep Patient Level Prediction (DeepPLP) package requires installing:
+
+- R ( ) - (R \>= 3.5.0, but latest is recommended)
+- RStudio ( )
+- Java ( )
+- RTools ( )
+
+## Mac/Linux Users
+
+Under Mac and Linux the OHDSI DeepPLP package requires installing:
+
+- R ( ) - (R \>= 3.3.0, but latest is recommended)
+- RStudio ( )
+- Java ( )
+- Xcode command line tools (run in terminal: `xcode-select --install`) [MAC USERS ONLY]
+
+# Installing the Package
+
+The preferred way to install the package is by using `remotes`, which will automatically install the latest release and all the latest dependencies.
+
+If you do not want the official release, you can install the bleeding edge version of the package (latest develop branch).
+
+Note that the latest develop branch could contain bugs; please report them to us if you experience problems.
+
+## Installing DeepPatientLevelPrediction using remotes
+
+To install using `remotes` run:
+
+```{r, echo = TRUE, message = FALSE, warning = FALSE,tidy=FALSE,eval=FALSE}
+install.packages("remotes")
+remotes::install_github("OHDSI/FeatureExtraction")
+remotes::install_github("OHDSI/PatientLevelPrediction")
+remotes::install_github("OHDSI/DeepPatientLevelPrediction")
+```
+
+DeepPLP relies on [torch for R](https://torch.mlverse.org/). When torch is installed, the user
+will be prompted to download the libtorch and lantern binaries. These binaries are necessary
+for the package to run.
+
+If you are using DeepPLP in an offline environment, the function `torch::install_torch_from_file()` can be used. This first requires downloading the correct binaries and moving them to the offline environment. See the [torch installation guide](https://torch.mlverse.org/docs/articles/installation.html) for more detailed instructions.
+
+When installing, make sure to close any other RStudio sessions that are using `DeepPatientLevelPrediction` or any of its dependencies. Keeping RStudio sessions open can cause locks that prevent the package from installing.
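+
+After installation, you can quickly verify that torch and its binaries are set
+up correctly before fitting any DeepPLP models. This is an optional check (not
+part of the package itself) using functions from the torch package:
+
+```{r, echo = TRUE, message = FALSE, warning = FALSE,tidy=FALSE,eval=FALSE}
+# TRUE once the libtorch and lantern binaries have been downloaded
+torch::torch_is_installed()
+
+# TRUE if a CUDA-capable GPU is visible to torch (optional, CPU also works)
+torch::cuda_is_available()
+```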
+
+# Testing Installation
+
+```{r, echo = TRUE, message = FALSE, warning = FALSE,tidy=FALSE,eval=FALSE}
+library(PatientLevelPrediction)
+library(DeepPatientLevelPrediction)
+
+data(plpDataSimulationProfile)
+sampleSize <- 1e4
+plpData <- simulatePlpData(
+  plpDataSimulationProfile,
+  n = sampleSize
+)
+
+populationSettings <- PatientLevelPrediction::createStudyPopulationSettings(
+  requireTimeAtRisk = F,
+  riskWindowStart = 1,
+  riskWindowEnd = 365)
+# a very simple ResNet
+modelSettings <- setResNet(numLayers = 2,
+                           sizeHidden = 64,
+                           hiddenFactor = 1,
+                           residualDropout = 0,
+                           hiddenDropout = 0.2,
+                           sizeEmbedding = 64,
+                           weightDecay = 1e-6,
+                           learningRate = 3e-4,
+                           seed = 42,
+                           hyperParamSearch = 'random',
+                           randomSample = 1, device = 'cpu', batchSize = 128,
+                           epochs = 3)
+
+plpResults <- PatientLevelPrediction::runPlp(plpData = plpData,
+                                             outcomeId = 3,
+                                             modelSettings = modelSettings,
+                                             analysisId = 'Test',
+                                             analysisName = 'Testing DeepPlp',
+                                             populationSettings = populationSettings,
+                                             splitSettings = createDefaultSplitSetting(),
+                                             sampleSettings = createSampleSettings(),
+                                             featureEngineeringSettings = createFeatureEngineeringSettings(),
+                                             preprocessSettings = createPreprocessSettings(),
+                                             logSettings = createLogSettings(),
+                                             executeSettings = createExecuteSettings(runSplitData = T,
+                                                                                     runSampleData = F,
+                                                                                     runfeatureEngineering = F,
+                                                                                     runPreprocessData = T,
+                                                                                     runModelDevelopment = T,
+                                                                                     runCovariateSummary = T
+                                             ))
+```
+
+# Acknowledgments
+
+Considerable work has been dedicated to providing the `DeepPatientLevelPrediction` package.
+
+```{r tidy=TRUE,eval=TRUE}
+citation("DeepPatientLevelPrediction")
+```
+
+**Please reference this paper if you use the PLP Package in your work:**
+
+[Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data. J Am Med Inform Assoc. 2018;25(8):969-975.](http://dx.doi.org/10.1093/jamia/ocy032)
\ No newline at end of file
diff --git a/vignettes/preamble.tex b/vignettes/preamble.tex
new file mode 100644
index 0000000..2040267
--- /dev/null
+++ b/vignettes/preamble.tex
@@ -0,0 +1,8 @@
+\usepackage{float}
+\let\origfigure\figure
+\let\endorigfigure\endfigure
+\renewenvironment{figure}[1][2] {
+    \expandafter\origfigure\expandafter[H]
+} {
+    \endorigfigure
+}
\ No newline at end of file