From 27ee112060f92d6bd74a78bd7c5c46a1c7ed66bb Mon Sep 17 00:00:00 2001 From: Jenna Reps Date: Mon, 7 Jun 2021 13:32:09 -0400 Subject: [PATCH] initial commit with code from PLP initial commit with code from PLP --- .Rbuildignore | 2 + DESCRIPTION | 49 + DeepPatientLevelPrediction.Rproj | 20 + R/CIReNN.R | 1162 +++++++++++++++++++ R/CNNTorch.R | 178 +++ R/CovNN.R | 535 +++++++++ R/CovNN2.R | 461 ++++++++ R/DeepNN.R | 557 ++++++++++ R/Formatting.R | 457 ++++++++ R/Predict.R | 643 +++++++++++ R/RNNTorch.R | 173 +++ R/SaveLoadPlp.R | 891 +++++++++++++++ README.md | 60 + inst/python/TorchMap.py | 19 + inst/python/TorchUtils.py | 695 ++++++++++++ inst/python/deepTorch.py | 1186 ++++++++++++++++++++ inst/python/deepTorchFunctions.py | 1732 +++++++++++++++++++++++++++++ inst/python/predictFunctions.py | 153 +++ inst/python/python_predict.py | 111 ++ tests/testthat.R | 6 + tests/testthat/test-keras.R | 137 +++ tests/testthat/test-torch.R | 123 ++ 22 files changed, 9350 insertions(+) create mode 100644 .Rbuildignore create mode 100644 DESCRIPTION create mode 100644 DeepPatientLevelPrediction.Rproj create mode 100644 R/CIReNN.R create mode 100644 R/CNNTorch.R create mode 100644 R/CovNN.R create mode 100644 R/CovNN2.R create mode 100644 R/DeepNN.R create mode 100644 R/Formatting.R create mode 100644 R/Predict.R create mode 100644 R/RNNTorch.R create mode 100644 R/SaveLoadPlp.R create mode 100644 README.md create mode 100644 inst/python/TorchMap.py create mode 100755 inst/python/TorchUtils.py create mode 100644 inst/python/deepTorch.py create mode 100644 inst/python/deepTorchFunctions.py create mode 100644 inst/python/predictFunctions.py create mode 100644 inst/python/python_predict.py create mode 100644 tests/testthat.R create mode 100644 tests/testthat/test-keras.R create mode 100644 tests/testthat/test-torch.R diff --git a/.Rbuildignore b/.Rbuildignore new file mode 100644 index 0000000..91114bf --- /dev/null +++ b/.Rbuildignore @@ -0,0 +1,2 @@ +^.*\.Rproj$ +^\.Rproj\.user$ diff --git a/DESCRIPTION b/DESCRIPTION new file mode 100644 index 0000000..42e28a4 --- /dev/null +++ b/DESCRIPTION @@ -0,0 +1,49 @@ +Package: DeepPatientLevelPrediction +Type: Package +Title: Package for deep learning patient level prediction using data in the OMOP Common Data + Model +Version: 0.0.1 +Date: 2021-06-07 +Authors@R: c( + person("Jenna", "Reps", email = "jreps@its.jnj.com", role = c("aut", "cre")), + person("Seng", "Chan You", role = c("aut")), + person("Egill", "Friogeirsson", role = c("aut")) + ) + +Maintainer: Jenna Reps +Description: A package for creating deep learning patient level prediction models following +the OHDSI PatientLevelPrediction framework. 
+License: Apache License 2.0 +URL: https://ohdsi.github.io/PatientLevelPrediction, https://github.com/OHDSI/DeepPatientLevelPrediction +BugReports: https://github.com/OHDSI/DeepPatientLevelPrediction/issues +VignetteBuilder: knitr +Depends: + R (>= 3.3.0), + FeatureExtraction (>= 3.0.0) +Imports: + Andromeda, + DatabaseConnector (>= 3.0.0), + dplyr, + knitr, + magrittr, + Matrix, + ParallelLogger (>= 2.0.0), + reshape2, + reticulate (> 1.16), + RSQLite, + slam, + SqlRender (>= 1.1.3), + tibble, + tidyr, +Suggests: + devtools, + keras, + plyr, + tensorflow, + testthat +Remotes: + ohdsi/FeatureExtraction +LinkingTo: Rcpp +NeedsCompilation: yes +RoxygenNote: 7.1.1 +Encoding: UTF-8 diff --git a/DeepPatientLevelPrediction.Rproj b/DeepPatientLevelPrediction.Rproj new file mode 100644 index 0000000..6284fc1 --- /dev/null +++ b/DeepPatientLevelPrediction.Rproj @@ -0,0 +1,20 @@ +Version: 1.0 + +RestoreWorkspace: No +SaveWorkspace: No +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: knitr +LaTeX: pdfLaTeX + +BuildType: Package +PackageUseDevtools: Yes +PackageInstallArgs: --no-multiarch --with-keep.source +PackageBuildArgs: --compact-vignettes=both +PackageCheckArgs: --as-cran +PackageRoxygenize: rd,namespace diff --git a/R/CIReNN.R b/R/CIReNN.R new file mode 100644 index 0000000..4cdaa81 --- /dev/null +++ b/R/CIReNN.R @@ -0,0 +1,1162 @@ +# @file CIReNN.R +# Code edited from OHDSI contributor @chandryou CIReNN branch +# +# Copyright 2020 Observational Health Data Sciences and Informatics +# +# This file is part of PatientLevelPrediction +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# BuildCIReNN<-function(outcomes=ff::as.ffdf(population[,c('rowId','y')]), +# covariates = result$data, +# indexFolder=indexFolder){ +# +# } + +#' Create setting for CIReNN model +#' +#' @param numberOfRNNLayer The number of RNN layer, only 1, 2, or 3 layers available now. eg. 1, c(1,2), c(1,2,3) +#' @param units The number of units of RNN layer - as a list of vectors +#' @param recurrentDropout The reccurrent dropout rate (regularisation) +#' @param layerDropout The layer dropout rate (regularisation) +#' @param lr Learning rate +#' @param decay Learning rate decay over each update. +#' @param outcomeWeight The weight of the outcome class in the loss function. Default is 0, which will be replaced by balanced weight. +#' @param batchSize The number of data points to use per training batch +#' @param epochs Number of times to iterate over dataset +#' @param earlyStoppingMinDelta minimum change in the monitored quantity to qualify as an improvement for early stopping, i.e. an absolute change of less than min_delta in loss of validation data, will count as no improvement. +#' @param earlyStoppingPatience Number of epochs with no improvement after which training will be stopped. +#' @param useDeepEnsemble logical (either TRUE or FALSE) value for using Deep Ensemble +#' @param numberOfEnsembleNetwork Integer, Number of Ensemble. 
If you want to use Deep Ensemble, this number should be greater than 1 (Lakshminarayanan et al., 2017, recommend 5).
+#' @param bayes logical (either TRUE or FALSE) value for using a Bayesian dropout layer to measure uncertainty. If TRUE, both epistemic and aleatoric uncertainty are estimated through the Bayesian dropout layer. It cannot be used together with Deep Ensemble.
+#' @param useVae logical (either TRUE or FALSE) value for using a Variational AutoEncoder before the RNN
+#' @param vaeDataSamplingProportion Data sampling proportion for the VAE
+#' @param vaeValidationSplit Validation split proportion for the VAE
+#' @param vaeBatchSize Batch size for the VAE
+#' @param vaeLatentDim Number of latent dimensions for the VAE
+#' @param vaeIntermediateDim Number of intermediate dimensions for the VAE
+#' @param vaeEpoch Number of times to iterate over the dataset for the VAE
+#' @param vaeEpislonStd Standard deviation of the epsilon noise used by the VAE sampling layer
+#' @param useGPU logical (either TRUE or FALSE) value. Set this to TRUE if you have GPUs in your machine and want to use them for deep learning
+#' @param maxGPUs Integer (>= 2) or list of integers: the number of GPUs, or the GPU IDs, on which to create model replicas for the VAE. GPU parallelisation is only activated when useGPU is TRUE.
+#' @param seed Random seed used by the deep learning model
+#' @importFrom zeallot %<-%
+#' @examples
+#' \dontrun{
+#' model.CIReNN <- setCIReNN()
+#' }
+#' @export
+setCIReNN <- function(numberOfRNNLayer=c(1), units=c(128, 64), recurrentDropout=c(0.2), layerDropout=c(0.2),
+                      lr =c(1e-4), decay=c(1e-5), outcomeWeight = c(0), batchSize = c(100),
+                      epochs= c(100), earlyStoppingMinDelta = c(1e-4), earlyStoppingPatience = c(10),
+                      bayes = T, useDeepEnsemble = F, numberOfEnsembleNetwork = 5,
+                      useVae = T, vaeDataSamplingProportion = 0.1, vaeValidationSplit= 0.2,
+                      vaeBatchSize = 100L, vaeLatentDim = 10L, vaeIntermediateDim = 256L,
+                      vaeEpoch = 100L, vaeEpislonStd = 1.0, useGPU = FALSE, maxGPUs = 2,
+                      seed=1234 ){
+
+  ensure_installed("keras")
+  ensure_installed("tensorflow")
+  ensure_installed("plyr")
+
+  if( sum(!( numberOfRNNLayer %in% c(1,2,3)))!=0 ) stop ('Only 1, 2 or 3 RNN layers are supported at the moment.
') + if(!((all.equal(numberOfEnsembleNetwork, as.integer(numberOfEnsembleNetwork))) & (numberOfEnsembleNetwork>=1) )) { + stop ('Number of ensemble network should be a natural number') } + + # if(class(indexFolder)!='character') + # stop('IndexFolder must be a character') + # if(length(indexFolder)>1) + # stop('IndexFolder must be one') + # + # if(class(units)!='numeric') + # stop('units must be a numeric value >0 ') + # if(units<1) + # stop('units must be a numeric value >0 ') + # + # #if(length(units)>1) + # # stop('units can only be a single value') + # + # if(class(recurrent_dropout)!='numeric') + # stop('dropout must be a numeric value >=0 and <1') + # if( (recurrent_dropout<0) | (recurrent_dropout>=1)) + # stop('dropout must be a numeric value >=0 and <1') + # if(class(layer_dropout)!='numeric') + # stop('layer_dropout must be a numeric value >=0 and <1') + # if( (layer_dropout<0) | (layer_dropout>=1)) + # stop('layer_dropout must be a numeric value >=0 and <1') + # if(class(lr)!='numeric') + # stop('lr must be a numeric value >0') + # if(lr<=0) + # stop('lr must be a numeric value >0') + # if(class(decay)!='numeric') + # stop('decay must be a numeric value >=0') + # if(decay<=0) + # stop('decay must be a numeric value >=0') + # if(class(outcome_weight)!='numeric') + # stop('outcome_weight must be a numeric value >=0') + # if(outcome_weight<=0) + # stop('outcome_weight must be a numeric value >=0') + # if(class(batch_size)!='numeric') + # stop('batch_size must be an integer') + # if(batch_size%%1!=0) + # stop('batch_size must be an integer') + # if(class(epochs)!='numeric') + # stop('epochs must be an integer') + # if(epochs%%1!=0) + # stop('epochs must be an integer') + # if(!class(seed)%in%c('numeric','NULL')) + # stop('Invalid seed') + #if(class(UsetidyCovariateData)!='logical') + # stop('UsetidyCovariateData must be an TRUE or FALSE') + + result <- list(model='fitCIReNN', param=split(expand.grid( + numberOfRNNLayer=numberOfRNNLayer,units=units, recurrentDropout=recurrentDropout, + layerDropout=layerDropout, + lr =lr, decay=decay, outcomeWeight=outcomeWeight, epochs= epochs, + earlyStoppingMinDelta = earlyStoppingMinDelta, earlyStoppingPatience = earlyStoppingPatience, + bayes= bayes, useDeepEnsemble = useDeepEnsemble,numberOfEnsembleNetwork = numberOfEnsembleNetwork, + useVae= useVae,vaeDataSamplingProportion = vaeDataSamplingProportion, vaeValidationSplit= vaeValidationSplit, + vaeBatchSize = vaeBatchSize, vaeLatentDim = vaeLatentDim, vaeIntermediateDim = vaeIntermediateDim, + vaeEpoch = vaeEpoch, vaeEpislonStd = vaeEpislonStd, useGPU = useGPU, maxGPUs = maxGPUs, + seed=ifelse(is.null(seed),'NULL', seed)), + + 1:(length(numberOfRNNLayer)*length(units)*length(recurrentDropout)*length(layerDropout)*length(lr)*length(decay)*length(outcomeWeight)*length(earlyStoppingMinDelta)*length(earlyStoppingPatience)*length(epochs)*max(1,length(seed)))), + name='CIReNN' + ) + + class(result) <- 'modelSettings' + return(result) +} + + +fitCIReNN <- function(plpData,population, param, search='grid', quiet=F, + outcomeId, cohortId, ...){ + # check plpData is coo format: + if (!FeatureExtraction::isCovariateData(plpData$covariateData)) + stop("Needs correct covariateData") + + metaData <- attr(population, 'metaData') + if(!is.null(population$indexes)) + population <- population[population$indexes>0,] + attr(population, 'metaData') <- metaData + + start<-Sys.time() + + result<- toSparseM(plpData,population,map=NULL, temporal=T) + data <- result$data + covariateMap <- result$map + + #remove result to 
save memory + rm(result) + + if(param[[1]]$useVae){ + #Sampling the data for bulding VAE + vaeSampleData <- data[sample(seq(dim(data)[1]), floor(dim(data)[1]*param[[1]]$vaeDataSamplingProportion),replace=FALSE),,] + + #Build VAE + vae <- buildVae(vaeSampleData, vaeValidationSplit= param[[1]]$vaeValidationSplit, + vaeBatchSize = param[[1]]$vaeBatchSize, vaeLatentDim = param[[1]]$vaeLatentDim, vaeIntermediateDim = param[[1]]$vaeIntermediateDim, + vaeEpoch = param[[1]]$vaeEpoch, vaeEpislonStd = param[[1]]$vaeEpislonStd, useGPU= param[[1]]$useGPU, maxGPUs= param[[1]]$maxGPUs, temporal = TRUE) + #remove sample data for VAE to save memory + vaeSampleData <- NULL + + vaeEnDecoder<- vae[[1]] + vaeEncoder <- vae[[2]] + + #Embedding by using VAE encoder + data <- plyr::aaply(as.array(data), 2, function(x) stats::predict(vaeEncoder, x, batch_size = param$vaeBatchSize)) + data <- aperm(data, perm = c(2,1,3))#rearrange of dimension + + ##Check the performance of vae + # decodedVaeData<-plyr::aaply(as.array(data), 2, function(x) predict(vaeEnDecoder, x, batch_size = param$vaeBatchSzie)) + # decodedVaeData<-aperm(decodedVaeData, c(2,1,3)) + # a1=Epi::ROC(form=as.factor(as.vector(data))~as.vector(decodedVaeData),plot="ROC") + + }else { + vaeEnDecoder <- NULL + vaeEncoder <- NULL + } + + #one-hot encoding + population$y <- keras::to_categorical(population$outcomeCount, 2)#[,2] #population$outcomeCount + + # do cross validation to find hyperParameter + datas <- list(population=population, plpData=data) + + #remove data to save memory + data <- NULL + + #Selection of hyperparameters + hyperParamSel <- lapply(param, function(x) do.call(trainCIReNN, c(x,datas,train=TRUE) )) + hyperSummary <- cbind(do.call(rbind, lapply(hyperParamSel, function(x) x$hyperSum))) + hyperSummary <- as.data.frame(hyperSummary) + hyperSummary$auc <- unlist(lapply(hyperParamSel, function (x) x$auc)) + hyperParamSel <- unlist(lapply(hyperParamSel, function(x) x$auc)) + + #now train the final model and return coef + bestInd <- which.max(abs(unlist(hyperParamSel)-0.5))[1] + finalModel<-do.call(trainCIReNN, c(param[[bestInd]],datas, train=FALSE)) + + covariateRef <- as.data.frame(plpData$covariateData$covariateRef) + incs <- rep(1, nrow(covariateRef)) + covariateRef$included <- incs + covariateRef$covariateValue <- rep(0, nrow(covariateRef)) + + #modelTrained <- file.path(outLoc) + param.best <- param[[bestInd]] + + comp <- start-Sys.time() + + # train prediction + prediction <- finalModel$prediction + finalModel$prediction <- NULL + + # return model location + result <- list(model = finalModel$model, + trainCVAuc = -1, # ToDo decide on how to deal with this + hyperParamSearch = hyperSummary, + modelSettings = list(model='fitCIReNN',modelParameters=param.best), + metaData = plpData$metaData, + populationSettings = attr(population, 'metaData'), + outcomeId = outcomeId, + cohortId = cohortId, + varImp = covariateRef, + trainingTime = comp, + covariateMap = covariateMap, + useDeepEnsemble = param.best$useDeepEnsemble, + numberOfEnsembleNetwork = param.best$numberOfEnsembleNetwork, + useVae = param.best$useVae, + vaeBatchSize = param.best$vaeBatchSize, + vaeEnDecoder = vaeEnDecoder, + vaeEncoder = vaeEncoder, + predictionTrain = prediction + ) + class(result) <- 'plpModel' + attr(result, 'type') <- 'deep' + if(param.best$useDeepEnsemble){ + attr(result, 'type') <- 'deepEnsemble' + } + if(param.best$bayes){ + attr(result, 'type') <- 'BayesianDeep' + } + attr(result, 'predictionType') <- 'binary' + + return(result) +} + 
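fitCIReNN above scores every hyper-parameter combination by cross-validated AUC and then refits the combination whose AUC is furthest from 0.5; the same selection rule recurs in the other fit functions in this commit. A minimal standalone sketch of that rule, using made-up AUC values:

# hypothetical cross-validated AUCs for three hyper-parameter combinations
cvAuc <- c(0.64, 0.71, 0.58)

# keep the combination furthest from chance (0.5), mirroring
# bestInd <- which.max(abs(unlist(hyperParamSel) - 0.5))[1]
bestInd <- which.max(abs(cvAuc - 0.5))[1]
bestInd  # 2: the second combination is refit on the full training data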
+trainCIReNN<-function(plpData, population, + numberOfRNNLayer=1,units=128, recurrentDropout=0.2, layerDropout=0.2, + lr =1e-4, decay=1e-5, outcomeWeight = 0, batchSize = 100, + epochs= 100, earlyStoppingMinDelta = c(1e-4), earlyStoppingPatience = c(10), + bayes = T, useDeepEnsemble = F,numberOfEnsembleNetwork =3, + useVae = T, vaeDataSamplingProportion = 0.1,vaeValidationSplit= 0.2, + vaeBatchSize = 100L, vaeLatentDim = 10L, vaeIntermediateDim = 256L, + vaeEpoch = 100L, vaeEpislonStd = 1.0, useGPU = FALSE, maxGPUs = 2, + seed=NULL, train=TRUE){ + + mu <- function(){return(NULL)} + sigma <- function(){return(NULL)} + + output_dim = 2 #output dimension for outcomes + num_MC_samples = 100 #sample number for MC sampling in Bayesian Deep Learning Prediction + if(outcomeWeight == 0){ + outcomeWeight = round(sum(population$outcomeCount==0)/sum(population$outcomeCount>=1),1) #if outcome weight = 0, then it means balanced weight + } + #heteroscedatic loss function + heteroscedastic_loss = function(y_true, y_pred) { + mean = y_pred[, 1:output_dim] + log_var = y_pred[, (output_dim + 1):(output_dim * 2)] + precision = keras::k_exp(-log_var) + keras::k_sum(precision * (y_true - mean) ^ 2 + log_var, axis = 2) + } + + if(!is.null(population$indexes) && train==T){ + writeLines(paste('Training recurrent neural network with ',length(unique(population$indexes)),' fold CV')) + + index_vect <- unique(population$indexes) + perform <- c() + + # create prediction matrix to store all predictions + predictionMat <- population + predictionMat$value <- 0 + attr(predictionMat, "metaData") <- list(predictionType = "binary") + + for(index in 1:length(index_vect )){ + writeLines(paste('Fold ',index, ' -- with ', sum(population$indexes!=index),'train rows')) + + if(useDeepEnsemble){ + predList<-list() + for (i in seq(numberOfEnsembleNetwork)){ + #print(i) + ParallelLogger::logInfo(paste(i,'th process is started')) + pred <- createEnsembleNetwork(train = train, plpData=plpData,population=population,batchSize=batchSize,epochs = epochs, + earlyStoppingMinDelta=earlyStoppingMinDelta, earlyStoppingPatience=earlyStoppingPatience, + train_rows=train_rows,index=index,lr=lr,decay=decay, + units=units,recurrentDropout=recurrentDropout,numberOfRNNLayer=numberOfRNNLayer, + layerDropout=layerDropout, useGPU = useGPU, maxGPUs = maxGPUs) + ParallelLogger::logInfo(paste(i,'th process is ended started')) + predList <- append(predList,pred) + } + model <- predList + + # batch prediciton + maxVal <- sum(population$indexes==index) + batches <- lapply(1:ceiling(maxVal/batchSize), function(x) ((x-1)*batchSize+1):min((x*batchSize),maxVal)) + prediction <- population[population$indexes==index,] + prediction$value <- 0 + prediction$sigmas <- 0 + + for(batch in batches){ + + for (i in seq(numberOfEnsembleNetwork)){ + if(i==1){ + muMatrix <- data.frame() + sigmaMatrix <-data.frame() + } + c(mu,sigma) %<-% predList[[i]](inputs=list(as.array(plpData[population$rowId[population$indexes==index],,][batch,,]))) + muMatrix <- rbind(muMatrix,t(as.data.frame(mu[,2]))) + sigmaMatrix <- rbind(sigmaMatrix,t(as.data.frame(sigma[,2]))) + } + + muMean <- apply(muMatrix,2,mean) + muSq <- muMatrix^2 + sigmaSq <- sigmaMatrix^2 + sigmaMean <- apply(sigmaMatrix,2,mean) + sigmaResult = apply(muSq+sigmaSq,2, mean)- muMean^2 + + prediction$value[batch] <- c(muMean) + #if prediction$value is negative, make this positive + prediction$sigmas[batch] <- c(sigmaResult) + + } + prediction$value[prediction$value>1] <- 1 + prediction$value[prediction$value<0] <- 0 + 
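The ensemble aggregation above treats the networks as an equally weighted mixture: the reported risk is the mean of the per-network means and the reported spread is mean(mu^2 + sigma^2) - mean(mu)^2, following the Deep Ensemble approach of Lakshminarayanan et al. A small numeric sketch for one subject and two hypothetical ensemble members:

# hypothetical outputs of two ensemble members for a single subject
mu    <- c(0.20, 0.30)  # per-network predicted risks
sigma <- c(0.05, 0.10)  # per-network spread (squared below, as sigmaSq is above)

muMean      <- mean(mu)                          # 0.25    -> prediction$value
sigmaResult <- mean(mu^2 + sigma^2) - muMean^2   # 0.00875 -> prediction$sigmas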
#prediction$value[batch] <- mu[,2] + #prediction$sigmas[batch] <- sigma[,2] + + #writeLines(paste0(dim(pred[,2]), collapse='-')) + #writeLines(paste0(pred[1,2], collapse='-')) + + attr(prediction, "metaData") <- list(predictionType = "binary") + aucVal <- computeAuc(prediction) + perform <- c(perform,aucVal) + + # add the fold predictions and compute AUC after loop + predictionMat$value[population$indexes==index] <- prediction$value + + }else{ + layerInput <- keras::layer_input(shape = c(dim(plpData)[2],dim(plpData)[3])) + if(useGPU){ + ##GRU layer + if(numberOfRNNLayer==1){ + layerOutput <- layerInput %>% + keras::layer_cudnn_gru(units=units, #time step x number of features + return_sequences=FALSE) %>% + keras::layer_dropout(layerDropout) + } + if(numberOfRNNLayer==2){ + layerOutput <- layerInput %>% + keras::layer_cudnn_gru(units=units, #time step x number of features + return_sequences=TRUE) %>% + keras::layer_dropout(layerDropout) %>% + keras::layer_cudnn_gru(units=units, return_sequences=FALSE) %>% + keras::layer_dropout(layerDropout) + } + if(numberOfRNNLayer==3){ + layerOutput <- layerInput %>% + keras::layer_cudnn_gru(units=units, #time step x number of features + return_sequences=TRUE) %>% + keras::layer_dropout(layerDropout) %>% + keras::layer_cudnn_gru(units=units, return_sequences=TRUE) %>% + keras::layer_dropout(layerDropout) %>% + keras::layer_cudnn_gru(units=units, return_sequences=FALSE) %>% + keras::layer_dropout(layerDropout) + } + }else{ + ##GRU layer + if(numberOfRNNLayer == 1){ + layerOutput <- layerInput %>% + keras::layer_gru(units=units, recurrent_dropout = recurrentDropout, + return_sequences=FALSE) + } + if(numberOfRNNLayer > 1 ){ + layerInput %>% # !ISSUE : "missing layerOutput <- "? + keras::layer_gru(units=units, recurrent_dropout = recurrentDropout, + return_sequences=TRUE) + } + if(numberOfRNNLayer == 2){ + layerOutput <- layerInput %>% + keras::layer_gru(units=units, recurrent_dropout = recurrentDropout, + return_sequences=FALSE) + } + if(numberOfRNNLayer==3){ + layerOutput <- layerInput %>% + keras::layer_gru(units=units, recurrent_dropout = recurrentDropout, + return_sequences=TRUE) %>% + # ISSUE- I removed layerOutput <- layerInput %>% as this was after a pipe + keras::layer_gru(units=units, recurrent_dropout = recurrentDropout, + return_sequences=FALSE) + } + } + + earlyStopping = keras::callback_early_stopping(monitor = "val_loss", patience=earlyStoppingPatience, + mode="auto",min_delta = earlyStoppingMinDelta) + reduceLr = keras::callback_reduce_lr_on_plateau(monitor="val_loss", factor =0.1, + patience = 5,mode = "auto", + min_delta = 1e-5, cooldown = 0, min_lr = 0) + + class_weight=list("0"=1,"1"=outcomeWeight) + + if(bayes){ + mean = layerOutput %>% + layer_concrete_dropout(layer = keras::layer_dense(units = output_dim)) + + log_var = layerOutput %>% + layer_concrete_dropout(layer = keras::layer_dense(units = output_dim)) + + output = keras::layer_concatenate(list(mean, log_var)) + model = keras::keras_model(layerInput, output) + #model = keras::keras_model(keras::layer_input(shape = c(dim(plpData)[2],dim(plpData)[3])), output) + model %>% keras::compile( + optimizer = "adam", + loss = heteroscedastic_loss, + metrics = c(keras::custom_metric("heteroscedastic_loss", heteroscedastic_loss)) + ) + + }else{ + model<- layerInput %>% + keras::layer_dense(units=2, activation='softmax') + model %>% keras::compile( + loss = 'binary_crossentropy', + metrics = c('accuracy'), + optimizer = keras::optimizer_rmsprop(lr = lr,decay = decay) + ) + } + + data <- 
plpData[population$rowId[population$indexes!=index],,] + + #Extract validation set first - 10k people or 5% + valN <- min(10000,sum(population$indexes!=index)*0.05) + val_rows <- sample(1:sum(population$indexes!=index), valN, replace=FALSE) + train_rows <- c(1:sum(population$indexes!=index))[-val_rows] + + sampling_generator <- function(data, population, batchSize, train_rows, index){ + function(){ + gc() + rows<-sample(train_rows, batchSize, replace=FALSE) + + list(as.array(data[rows,,]), population$y[population$indexes!=index,][rows,]) + } + } + + + history <- model %>% keras::fit_generator(sampling_generator(data,population,batchSize,train_rows, index), + steps_per_epoch = sum(population$indexes!=index)/batchSize, + epochs=epochs, + validation_data=list(as.array(data[val_rows,,]), + population$y[population$indexes!=index,][val_rows,]), + callbacks=list(earlyStopping,reduceLr), + class_weight=class_weight) + + # batch prediciton + maxVal <- sum(population$indexes==index) + batches <- lapply(1:ceiling(maxVal/batchSize), function(x) ((x-1)*batchSize+1):min((x*batchSize),maxVal)) + prediction <- population[population$indexes==index,] + prediction$value <- 0 + + if(bayes){ + prediction$epistemicUncertainty <- 0 + prediction$aleatoricUncertainty <- 0 + for(batch in batches){ + MC_samples <- array(0, dim = c(num_MC_samples, length(batch), 2 * output_dim)) + for (k in 1:num_MC_samples){ + MC_samples[k,, ] = stats::predict(model, as.array(plpData[population$rowId[population$indexes==index],,][batch,,])) + #keras::predict_proba(model, as.array(plpData[population$rowId[population$indexes==index],,][batch,,])) + } + pred <- apply(MC_samples[,,output_dim], 2, mean) + epistemicUncertainty <- apply(MC_samples[,,output_dim], 2, stats::var) + logVar = MC_samples[, , output_dim * 2] + + if(length(dim(logVar))<=1){ + aleatoricUncertainty = exp(mean(logVar)) + }else{ + aleatoricUncertainty = exp(colMeans(logVar)) + } + + prediction$value[batch] <- pred + prediction$epistemicUncertainty[batch] = epistemicUncertainty + prediction$aleatoricUncertainty[batch] = aleatoricUncertainty + + } + + }else{ + for(batch in batches){ + pred <- keras::predict_proba(model, as.array(plpData[population$rowId[population$indexes==index],,][batch,,])) + prediction$value[batch] <- pred[,2] + #writeLines(paste0(dim(pred[,2]), collapse='-')) + #writeLines(paste0(pred[1,2], collapse='-')) + } + } + prediction$value[prediction$value>1] <- 1 + prediction$value[prediction$value<0] <- 0 + attr(prediction, "metaData") <- list(predictionType = "binary") + aucVal <- computeAuc(prediction) + perform <- c(perform,aucVal) + + # add the fold predictions and compute AUC after loop + predictionMat$value[population$indexes==index] <- prediction$value + # add uncertainty + predictionMat$aleatoricUncertainty[population$indexes==index] <- prediction$aleatoricUncertainty + predictionMat$epistemicUncertainty[population$indexes==index] <- prediction$epistemicUncertainty + } + + } + + auc <- computeAuc(predictionMat) + foldPerm <- perform + + # Output ---------------------------------------------------------------- + param.val <- paste0('RNNlayer Number: ', numberOfRNNLayer, '-- units: ',units,'-- recurrentDropout: ', recurrentDropout, + 'layerDropout: ',layerDropout,'-- lr: ', lr, + '-- decay: ', decay,'-- outcomeWeight',outcomeWeight, '-- batchSize: ',batchSize, '-- epochs: ', epochs) + writeLines('==========================================') + writeLines(paste0('CIReNN with parameters:', param.val,' obtained an AUC of ',auc)) + 
writeLines('==========================================') + + } else { + if(useDeepEnsemble){ + predList<-list() + for (i in seq(numberOfEnsembleNetwork)){ + #print(i) + pred <- createEnsembleNetwork(train = train, plpData=plpData,population=population,batchSize=batchSize,epochs = epochs, + earlyStoppingMinDelta=earlyStoppingMinDelta, earlyStoppingPatience=earlyStoppingPatience, + train_rows=train_rows,index=index,lr=lr,decay=decay, + units=units,recurrentDropout=recurrentDropout,numberOfRNNLayer=numberOfRNNLayer, + layerDropout=layerDropout, useGPU = useGPU, maxGPUs = maxGPUs) + + predList <- append(predList,pred) + } + model <- predList + + # batch prediciton + maxVal <- nrow(population) + batches <- lapply(1:ceiling(maxVal/batchSize), function(x) ((x-1)*batchSize+1):min((x*batchSize),maxVal)) + prediction <- population + prediction$value <- 0 + prediction$sigmas <- 0 + + for(batch in batches){ + + for (i in seq(numberOfEnsembleNetwork)){ + if(i == 1){ + muMatrix <- data.frame() + sigmaMatrix <-data.frame() + } + c(mu,sigma) %<-% predList[[i]](inputs=list(as.array(plpData[batch,,]))) + muMatrix <- rbind(muMatrix,t(as.data.frame(mu[,2]))) + sigmaMatrix <- rbind(sigmaMatrix,t(as.data.frame(sigma[,2]))) + } + + muMean <- apply(muMatrix,2,mean) + muSq <- muMatrix^2 + sigmaSq <- sigmaMatrix^2 + sigmaMean <- apply(sigmaMatrix,2,mean) + sigmaResult = apply(muSq+sigmaSq,2, mean)- muMean^2 + + prediction$value[batch] <- c(muMean) + prediction$sigmas[batch] <- c(sigmaResult) + } + prediction$value[prediction$value>1] <- 1 + prediction$value[prediction$value<0] <- 0 + + + }else{ + layerInput <- keras::layer_input(shape = c(dim(plpData)[2],dim(plpData)[3])) + if(useGPU){ + ##GRU layer + if(numberOfRNNLayer==1){ + layerOutput <- layerInput %>% + keras::layer_cudnn_gru(units=units, #time step x number of features + return_sequences=FALSE) %>% + keras::layer_dropout(layerDropout) + } + if(numberOfRNNLayer==2){ + layerOutput <- layerInput %>% + keras::layer_cudnn_gru(units=units, #time step x number of features + return_sequences=TRUE) %>% + keras::layer_dropout(layerDropout) %>% + keras::layer_cudnn_gru(units=units, return_sequences=FALSE) %>% + keras::layer_dropout(layerDropout) + } + if(numberOfRNNLayer==3){ + layerOutput <- layerInput %>% + keras::layer_cudnn_gru(units=units, #time step x number of features + return_sequences=TRUE) %>% + keras::layer_dropout(layerDropout) %>% + keras::layer_cudnn_gru(units=units, return_sequences=TRUE) %>% + keras::layer_dropout(layerDropout) %>% + keras::layer_cudnn_gru(units=units, return_sequences=FALSE) %>% + keras::layer_dropout(layerDropout) + } + }else{ + ##GRU layer + if(numberOfRNNLayer==1){ + layerOutput <- layerInput %>% + keras::layer_gru(units=units, recurrent_dropout = recurrentDropout, + return_sequences=FALSE) + } + if(numberOfRNNLayer>1 ){ + layerInput %>% # ISSUE - "layerInput <- " missing? + keras::layer_gru(units=units, recurrent_dropout = recurrentDropout, + return_sequences=TRUE) + } + if(numberOfRNNLayer==2){ + layerOutput <- layerInput %>% + keras::layer_gru(units=units, recurrent_dropout = recurrentDropout, + return_sequences=FALSE) + } + if(numberOfRNNLayer==3){ + layerOutput <- layerInput %>% + keras::layer_gru(units=units, recurrent_dropout = recurrentDropout, + return_sequences=TRUE) %>% + #layerOutput <- layerInput %>% ISSUE - pipe above? 
+ keras::layer_gru(units=units, recurrent_dropout = recurrentDropout, + return_sequences=FALSE) + } + } + + earlyStopping = keras::callback_early_stopping(monitor = "val_loss", patience=earlyStoppingPatience, + mode="auto",min_delta = earlyStoppingMinDelta) + reduceLr = keras::callback_reduce_lr_on_plateau(monitor="val_loss", factor =0.1, + patience = 5,mode = "auto", + min_delta = 1e-5, cooldown = 0, min_lr = 0) + + class_weight = list("0" = 1, + "1" = outcomeWeight) + + if(bayes){ + mean = layerOutput %>% + layer_concrete_dropout(layer = keras::layer_dense(units = output_dim)) + + log_var = layerOutput %>% + layer_concrete_dropout(layer = keras::layer_dense(units = output_dim)) + + output = keras::layer_concatenate(list(mean, log_var)) + model = keras::keras_model(layerInput, output) + #model = keras::keras_model(keras::layer_input(shape = c(dim(plpData)[2],dim(plpData)[3])), output) + model %>% keras::compile( + optimizer = "adam", + loss = heteroscedastic_loss, + metrics = c(keras::custom_metric("heteroscedastic_loss", heteroscedastic_loss)) + ) + + }else{ + model <- layerInput %>% + keras::layer_dense(units=2, activation='softmax') + model %>% keras::compile( + loss = 'binary_crossentropy', + metrics = c('accuracy'), + optimizer = keras::optimizer_rmsprop(lr = lr,decay = decay) + ) + } + + data <- plpData[population$rowId,,] + + #Extract validation set first - 10k people or 5% + valN <- min(10000,length(population$indexes)*0.05) + val_rows <- sample(1:length(population$indexes), valN, replace=FALSE) + train_rows <- c(1:length(population$indexes))[-val_rows] + + sampling_generator2 <- function(data, population, batchSize, train_rows){ + function(){ + gc() + rows<-sample(train_rows, batchSize, replace=FALSE) + list(as.array(data[rows,,]), population$y[rows,]) + } + } + + + history <- model %>% keras::fit_generator(sampling_generator2(data,population,batchSize,train_rows), + steps_per_epoch = nrow(population[-val_rows,])/batchSize, + epochs=epochs, + validation_data=list(as.array(data[val_rows,,]), + population$y[val_rows,]), + callbacks=list(earlyStopping,reduceLr), + class_weight=class_weight, + view_metrics=F) + + + # batched prediciton + maxVal <- nrow(population) + batches <- lapply(1:ceiling(maxVal/batchSize), function(x) ((x-1)*batchSize+1):min((x*batchSize),maxVal)) + prediction <- population + prediction$value <- 0 + + if(bayes){ + prediction$epistemicUncertainty <- 0 + prediction$aleatoricUncertainty <- 0 + for(batch in batches){ + MC_samples <- array(0, dim = c(num_MC_samples, length(batch), 2 * output_dim)) + for (k in 1:num_MC_samples){ + MC_samples[k,, ] = stats::predict(model, as.array(plpData[batch,,])) + #keras::predict_proba(model, as.array(plpData[population$rowId[population$indexes==index],,][batch,,])) + } + pred <- apply(MC_samples[,,output_dim], 2, mean) + epistemicUncertainty <- apply(MC_samples[,,output_dim], 2, stats::var) + logVar = MC_samples[, , output_dim * 2] + if(length(dim(logVar)) <= 1){ + aleatoricUncertainty = exp(mean(logVar)) + }else{ + aleatoricUncertainty = exp(colMeans(logVar)) + + } + prediction$value[batch] <- pred + prediction$epistemicUncertainty[batch] = epistemicUncertainty + prediction$aleatoricUncertainty[batch] = aleatoricUncertainty + } + + }else{ + for(batch in batches){ + pred <- keras::predict_on_batch(model, as.array(plpData[batch,,])) + prediction$value[batch] <- pred[,2] + } + + } + prediction$value[prediction$value>1] <- 1 + prediction$value[prediction$value<0] <- 0 + + attr(prediction, "metaData") <- list(predictionType = 
"binary") + auc <- computeAuc(prediction) + foldPerm <- auc + predictionMat <- prediction + + } + + } + result <- list(model=model, + auc=auc, + prediction = predictionMat, + hyperSum = unlist(list(numberOfRNNLayer=numberOfRNNLayer, + units=units, recurrentDropout=recurrentDropout, + layerDropout=layerDropout,lr =lr, decay=decay,outcomeWeight=outcomeWeight, + batchSize = batchSize, epochs= epochs, earlyStoppingMinDelta = earlyStoppingMinDelta, + earlyStoppingPatience=earlyStoppingPatience, + useDeepEnsemble = useDeepEnsemble, + numberOfEnsembleNetwork =numberOfEnsembleNetwork, + useVae = useVae, vaeDataSamplingProportion =vaeDataSamplingProportion ,vaeValidationSplit = vaeValidationSplit, + vaeBatchSize =vaeBatchSize, + vaeLatentDim = vaeLatentDim, + vaeIntermediateDim = vaeIntermediateDim, + vaeEpoch = vaeEpoch, vaeEpislonStd = vaeEpislonStd)) + ) + return(result) + +} + +#function for building vae +buildVae<-function(data, vaeValidationSplit= 0.2, vaeBatchSize = 100L, vaeLatentDim = 10L, vaeIntermediateDim = 256L, + vaeEpoch = 100L, vaeEpislonStd = 1.0, useGPU= FALSE, maxGPUs = NULL, temporal = TRUE){ + if (temporal){ + dataSample <- data %>% + apply(3, as.numeric) + } else{ + dataSample <- data + } + originalDim <- dim(dataSample)[2] + K <- keras::backend() + x <- keras::layer_input (shape =originalDim) + h <- keras::layer_dense (x, vaeIntermediateDim, activation = 'relu') + z_mean <- keras::layer_dense(h, vaeLatentDim) + z_log_var <- keras::layer_dense(h, vaeLatentDim) + + sampling<- function(arg){ + z_mean <- arg[,1:vaeLatentDim] + z_log_var <- arg[, (vaeLatentDim+1):(2*vaeLatentDim)] + + epsilon <- keras::k_random_normal( + shape = c(keras::k_shape(z_mean)[[1]]), + mean = 0., + stddev = vaeEpislonStd + ) + + z_mean + keras::k_exp(z_log_var/2)*epsilon + } + + z <- keras::layer_concatenate(list(z_mean, z_log_var)) %>% + keras::layer_lambda(sampling) + + #we instantiate these layers separately so as to reuse them later + decoder_h <- keras::layer_dense(units = vaeIntermediateDim, activation = 'relu') + decoder_mean <- keras::layer_dense (units = originalDim, activation = 'sigmoid') + h_decoded <- decoder_h (z) + x_decoded_mean <- decoder_mean(h_decoded) + + #end-to-end autoencoder + vae <- keras::keras_model (x,x_decoded_mean) + #encoder, from inputs to latent space + encoder <- keras::keras_model(x, z_mean) + + #generator, from latent space to reconstruted inputs + decoder_input <- keras::layer_input (shape = vaeLatentDim) + h_decoded_2 <- decoder_h(decoder_input) + x_decoded_mean_2 <- decoder_mean(h_decoded_2) + generator <- keras::keras_model (decoder_input, x_decoded_mean_2) + + vae_loss <- function(x, x_decoded_mean){ + xent_loss <- (originalDim/1.0)* keras::loss_binary_crossentropy(x, x_decoded_mean) + k1_loss <- -0.5 * keras::k_mean(1 + z_log_var - keras::k_square(z_mean) - keras::k_exp(z_log_var), axis = -1L) + xent_loss + k1_loss + } + #Activating parallelisation of GPU in encoder + if(useGPU & (maxGPUs>1) ){ + vae <- keras::multi_gpu_model(vae,gpus = maxGPUs) + } + + vae %>% keras::compile (optimizer = "rmsprop", loss = vae_loss) + #if (!is.null(dataValidation)) dataValidation<-list(dataValidation,dataValidation) + vaeEarlyStopping=keras::callback_early_stopping(monitor = "val_loss", patience=5,mode="auto",min_delta = 1e-3) + naanStopping = keras::callback_terminate_on_naan() + csvLogging = keras::callback_csv_logger (filename="./vae.csv",separator = ",", append =TRUE ) + + vae %>% keras::fit ( + dataSample,dataSample + ,shuffle = TRUE + ,epochs = vaeEpoch + ,batch_size = 
vaeBatchSize + #,validation_data = dataValidation + ,validation_split = vaeValidationSplit + ,callbacks = list(vaeEarlyStopping, + csvLogging, + naanStopping) + ) + return (list (vae,encoder)) +} + +#Defining Gaussian Layer for Deep Ensemble +GaussianLayer <- R6::R6Class("GaussianLayer", + inherit = keras::KerasLayer, + + public = list( + output_dim = NULL, + kernel_1 = NULL, + kernel_2 = NULL, + bias_1 = NULL, + bias_2 = NULL, + + initialize = function(output_dim){ + self$output_dim <- output_dim + }, + build = function(input_shape){ + super$build(input_shape) + + self$kernel_1 = self$add_weight(name = 'kernel_1', + shape = list(as.integer(input_shape[[2]]), self$output_dim), #list(30, self$output_dim),#shape = keras::shape(30, self$output_dim), + initializer = keras::initializer_glorot_normal(), + trainable = TRUE) + self$kernel_2 = self$add_weight(name = 'kernel_2', + shape = list(as.integer(input_shape[[2]]), self$output_dim),#list(30, self$output_dim), #shape = keras::shape(30, self$output_dim), + initializer = keras::initializer_glorot_normal(), + trainable = TRUE) + self$bias_1 = self$add_weight(name = 'bias_1', + shape = list(self$output_dim), #shape = keras::shape(self$output_dim), + initializer = keras::initializer_glorot_normal(), + trainable = TRUE) + self$bias_2 = self$add_weight(name = 'bias_2', + shape = list(self$output_dim), #shape = keras::shape(self$output_dim), + initializer = keras::initializer_glorot_normal(), + trainable = TRUE) + }, + + call = function(x, mask = NULL){ + output_mu = keras::k_dot(x, self$kernel_1) + self$bias_1 + output_sig = keras::k_dot(x, self$kernel_2) + self$bias_2 + output_sig_pos = keras::k_log(1 + keras::k_exp(output_sig)) + 1e-06 + return (list(output_mu, output_sig_pos)) + }, + + + compute_output_shape = function(input_shape){ + return (list ( + list(input_shape[[1]], self$output_dim), + list(input_shape[[1]], self$output_dim) ) + ) + } + ) +) + +#define layer wrapper function for Deep Ensemble +layer_custom <- function(object, output_dim, name = NULL, trainable = TRUE) { + keras::create_layer(GaussianLayer, object, list( + output_dim = as.integer(output_dim), + name = name, + trainable = trainable + )) +} + +#Custom loss function for Deep Ensemble +custom_loss <- function(sigma){ + gaussian_loss <- function(y_true,y_pred){ + tensorflow::tf$reduce_mean(0.5*tensorflow::tf$log(sigma) + 0.5*tensorflow::tf$div(tensorflow::tf$square(y_true - y_pred), sigma)) + 1e-6 + } + return(gaussian_loss) +} + +#Create Deep Ensemble Network function +createEnsembleNetwork<-function(train, plpData,population,batchSize,epochs, earlyStoppingPatience, earlyStoppingMinDelta, + train_rows=NULL,index=NULL,lr,decay, + units,recurrentDropout,numberOfRNNLayer,layerDropout, useGPU = useGPU, maxGPUs = maxGPUs){ + + mu <- function(){return(NULL)} + sigma <- function(){return(NULL)} + + if(useGPU){ + ##GRU layer + layerInput <- keras::layer_input(shape = c(dim(plpData)[2],dim(plpData)[3])) + + if(numberOfRNNLayer==1){ + layers <- layerInput %>% keras::layer_cudnn_gru(units=units, #time step x number of features + return_sequences=FALSE) %>% + keras::layer_dropout(layerDropout) %>% + keras::layer_dense(units=2, activation='softmax') + } + if(numberOfRNNLayer==2){ + layers <- layerInput %>% keras::layer_cudnn_gru(units=units, #time step x number of features + return_sequences=TRUE) %>% + keras::layer_dropout(layerDropout) %>% + keras::layer_gru(units=units, return_sequences=FALSE) %>% + keras::layer_dropout(layerDropout) %>% + keras::layer_dense(units=2, 
activation='softmax') + } + if(numberOfRNNLayer==3){ + layers <- layerInput %>% keras::layer_cudnn_gru(units=units, #time step x number of features + return_sequences=TRUE) %>% + keras::layer_dropout(layerDropout) %>% + keras::layer_cudnn_gru(units=units, return_sequences=TRUE) %>% + keras::layer_dropout(layerDropout) %>% + keras::layer_cudnn_gru(units=units, return_sequences=FALSE) %>% + keras::layer_dropout(layerDropout) %>% + keras::layer_dense(units=2, activation='softmax') + } + }else{ + ##GRU layer + layerInput <- keras::layer_input(shape = c(dim(plpData)[2],dim(plpData)[3])) + + if(numberOfRNNLayer==1){ + layers <- layerInput %>% keras::layer_gru(units=units, recurrent_dropout = recurrentDropout, #time step x number of features + return_sequences=FALSE) %>% keras::layer_dropout(layerDropout) %>% + keras::layer_dense(units=2, activation='softmax') + } + if(numberOfRNNLayer==2){ + layers <- layerInput %>% keras::layer_gru(units=units, recurrent_dropout = recurrentDropout, #time step x number of features + return_sequences=TRUE) %>% + keras::layer_gru(units=units, recurrent_dropout = recurrentDropout,return_sequences=FALSE)%>% keras::layer_dropout(layerDropout) %>% + keras::layer_dense(units=2, activation='softmax') + } + if(numberOfRNNLayer==3){ + layers <- layerInput %>% keras::layer_gru(units=units, recurrent_dropout = recurrentDropout, #time step x number of features + return_sequences=TRUE) %>% + keras::layer_gru(units=units, recurrent_dropout = recurrentDropout,return_sequences=TRUE) %>% + keras::layer_gru(units=units, recurrent_dropout = recurrentDropout,return_sequences=FALSE) %>% keras::layer_dropout(layerDropout) %>% + keras::layer_dense(units=2, activation='softmax') + } + } + + c(mu,sigma) %<-% layer_custom(layers, 2, name = 'main_output') + + model <- keras::keras_model(inputs = layerInput,outputs=mu) + + earlyStopping=keras::callback_early_stopping(monitor = "val_loss", patience=earlyStoppingPatience, + mode="auto",min_delta = earlyStoppingMinDelta) + #Currently using Parallel GPU makes errors in Deep Ensemble + #if(useGPU & (maxGPUs>1) ) model <- keras::multi_gpu_model(model,gpus = maxGPUs) + + model %>% keras::compile( + loss = custom_loss(!!sigma), + optimizer = keras::optimizer_rmsprop(lr = lr,decay = decay) + ) + + + if(!is.null(population$indexes) && train==T){ + data <- plpData[population$rowId[population$indexes!=index],,] + #Extract validation set first - 10k people or 5% + valN <- min(10000,sum(population$indexes!=index)*0.05) + val_rows<-sample(1:sum(population$indexes!=index), valN, replace=FALSE) + train_rows <- c(1:sum(population$indexes!=index))[-val_rows] + + sampling_generator<-function(data, population, batchSize, train_rows, index){ + function(){ + gc() + rows<-sample(train_rows, batchSize, replace=FALSE) + + list(as.array(data[rows,,]), population$y[population$indexes!=index,][rows,]) + } + } + + history <- model %>% keras::fit_generator(sampling_generator(data,population,batchSize,train_rows, index), + steps_per_epoch = sum(population$indexes!=index)/batchSize, + epochs=epochs, + validation_data=list(as.array(data[val_rows,,]), + population$y[population$indexes!=index,][val_rows,])#, + ,callbacks=list(earlyStopping) + #callbacks=list(earlyStopping,reduceLr), + ) + + }else{ + data <- plpData[population$rowId,,] + + #Extract validation set first - 10k people or 5% + valN <- min(10000,length(population$indexes)*0.05) + val_rows<-sample(1:length(population$indexes), valN, replace=FALSE) + train_rows <- c(1:length(population$indexes))[-val_rows] + + 
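Throughout this commit the validation set is the smaller of 10,000 rows or 5% of the available rows, and the remaining rows feed the batch generator defined next. A tiny illustration of that split with a hypothetical row count:

n <- 50000                             # hypothetical number of available rows
valN <- min(10000, n * 0.05)           # 2500 rows held out for validation
val_rows <- sample(seq_len(n), valN, replace = FALSE)
train_rows <- seq_len(n)[-val_rows]    # 47500 rows left for the batch generator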
sampling_generator2<-function(data, population, batchSize, train_rows){ + function(){ + gc() + rows<-sample(train_rows, batchSize, replace=FALSE) + list(as.array(data[rows,,]), population$y[rows,]) + } + } + + + history <- model %>% keras::fit_generator(sampling_generator2(data,population,batchSize,train_rows), + steps_per_epoch = nrow(population[-val_rows,])/batchSize, + epochs=epochs, + validation_data=list(as.array(data[val_rows,,]), + population$y[val_rows,]), + callbacks=list(earlyStopping), + view_metrics=F) + + } + #ParallelLogger::logInfo('right before get_intermediate') + layer_name = 'main_output' + get_intermediate = keras::k_function(inputs=list(model$input), + outputs=model$get_layer(layer_name)$output) + #ParallelLogger::logInfo('right after get_intermediate') + return(get_intermediate) +} + +#Custom layer for Bayesian Drop Out Layer +ConcreteDropout <- R6::R6Class("ConcreteDropout", + + inherit = keras::KerasWrapper, + + public = list( + weight_regularizer = NULL, + dropout_regularizer = NULL, + init_min = NULL, + init_max = NULL, + is_mc_dropout = NULL, + supports_masking = TRUE, + p_logit = NULL, + p = NULL, + + initialize = function(weight_regularizer, + dropout_regularizer, + init_min, + init_max, + is_mc_dropout) { + self$weight_regularizer <- weight_regularizer + self$dropout_regularizer <- dropout_regularizer + self$is_mc_dropout <- is_mc_dropout + self$init_min <- keras::k_log(init_min) - keras::k_log(1 - init_min) + self$init_max <- keras::k_log(init_max) - keras::k_log(1 - init_max) + }, + + build = function(input_shape) { + super$build(input_shape) + + self$p_logit <- super$add_weight( + name = "p_logit", + shape = keras::shape(1), + initializer = keras::initializer_random_uniform(self$init_min, self$init_max), + trainable = TRUE + ) + + self$p <- keras::k_sigmoid(self$p_logit) + + input_dim <- input_shape[[2]] + + weight <- private$py_wrapper$layer$kernel + + kernel_regularizer <- self$weight_regularizer * + keras::k_sum(keras::k_square(weight)) / + (1 - self$p) + + dropout_regularizer <- self$p * keras::k_log(self$p) + dropout_regularizer <- dropout_regularizer + + (1 - self$p) * keras::k_log(1 - self$p) + dropout_regularizer <- dropout_regularizer * + self$dropout_regularizer * + keras::k_cast(input_dim, keras::k_floatx()) + + regularizer <- keras::k_sum(kernel_regularizer + dropout_regularizer) + super$add_loss(regularizer) + }, + + concrete_dropout = function(x) { + eps <- keras::k_cast_to_floatx(keras::k_epsilon()) + temp <- 0.1 + + unif_noise <- keras::k_random_uniform(shape = keras::k_shape(x)) + + drop_prob <- keras::k_log(self$p + eps) - + keras::k_log(1 - self$p + eps) + + keras::k_log(unif_noise + eps) - + keras::k_log(1 - unif_noise + eps) + drop_prob <- keras::k_sigmoid(drop_prob / temp) + + random_tensor <- 1 - drop_prob + + retain_prob <- 1 - self$p + x <- x * random_tensor + x <- x / retain_prob + x + }, + + call = function(x, mask = NULL, training = NULL) { + if (self$is_mc_dropout) { + super$call(self$concrete_dropout(x)) + } else { + k_in_train_phase( + function() + super$call(self$concrete_dropout(x)), + super$call(x), + training = training + ) + } + } + ) +) +#define layer wrapper for Bayesian Drop-out layer +layer_concrete_dropout <- function(object, + layer, + weight_regularizer = 1e-6, + dropout_regularizer = 1e-5, + init_min = 0.1, + init_max = 0.1, + is_mc_dropout = TRUE, + name = NULL, + trainable = TRUE) { + keras::create_wrapper(ConcreteDropout, object, list( + layer = layer, + weight_regularizer = weight_regularizer, + 
dropout_regularizer = dropout_regularizer, + init_min = init_min, + init_max = init_max, + is_mc_dropout = is_mc_dropout, + name = name, + trainable = trainable + )) +} diff --git a/R/CNNTorch.R b/R/CNNTorch.R new file mode 100644 index 0000000..b2d021d --- /dev/null +++ b/R/CNNTorch.R @@ -0,0 +1,178 @@ +# @file CNNTorch.R +# +# Copyright 2020 Observational Health Data Sciences and Informatics +# +# This file is part of PatientLevelPrediction +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#' Create setting for CNN model with python +#' @param nbfilters The number of filters +#' @param epochs The number of epochs +#' @param seed A seed for the model +#' @param class_weight The class weight used for imbalanced data: +#' 0: Inverse ratio between positives and negatives +#' -1: Focal loss +#' @param type It can be normal 'CNN', 'CNN_LSTM', CNN_MLF' with multiple kernels with different kernel size, +#' 'CNN_MIX', 'ResNet' and 'CNN_MULTI' +#' +#' @examples +#' \dontrun{ +#' model.cnnTorch <- setCNNTorch() +#' } +#' @export +setCNNTorch <- function(nbfilters=c(16, 32), epochs=c(20, 50), seed=0, class_weight = 0, type = 'CNN'){ + + ParallelLogger::logWarn('This model has broken - please use setCNN() or setCNN2() instead ') + + # set seed + if(is.null(seed[1])){ + seed <- as.integer(sample(100000000,1)) + } + + result <- list(model='fitCNNTorch', param=split(expand.grid(nbfilters=nbfilters, + epochs=epochs, seed=seed[1], + class_weight = class_weight, type = type), + 1:(length(nbfilters)*length(epochs)) ), + name='CNN Torch') + + class(result) <- 'modelSettings' + + return(result) +} + + +fitCNNTorch <- function(population, plpData, param, search='grid', quiet=F, + outcomeId, cohortId, ...){ + + # check plpData is libsvm format or convert if needed + if (!FeatureExtraction::isCovariateData(plpData$covariateData)) + stop("Needs correct covariateData") + + if(colnames(population)[ncol(population)]!='indexes'){ + warning('indexes column not present as last column - setting all index to 1') + population$indexes <- rep(1, nrow(population)) + } + + + start <- Sys.time() + + population$rowIdPython <- population$rowId-1 #to account for python/r index difference #subjectId + pPopulation <- as.matrix(population[,c('rowIdPython','outcomeCount','indexes')]) + + result <- toSparseTorchPython(plpData,population, map=NULL, temporal=T) + + outLoc <- createTempModelLoc() + # clear the existing model pickles + for(file in dir(outLoc)) + file.remove(file.path(outLoc,file)) + + # do cross validation to find hyperParameter + hyperParamSel <- lapply(param, function(x) do.call(trainCNNTorch, listAppend(x, + list(plpData = result$data, + population = pPopulation, + train=TRUE, + modelOutput=outLoc)) )) + + hyperSummary <- cbind(do.call(rbind, param), unlist(hyperParamSel)) + + #now train the final model and return coef + bestInd <- which.max(abs(unlist(hyperParamSel)-0.5))[1] + finalModel <- do.call(trainCNNTorch, listAppend(param[[bestInd]], + list(plpData = result$data, + population = pPopulation, + train=FALSE, + 
modelOutput=outLoc))) + + + covariateRef <- as.data.frame(plpData$covariateData$covariateRef) + incs <- rep(1, nrow(covariateRef)) + covariateRef$included <- incs + covariateRef$covariateValue <- rep(0, nrow(covariateRef)) + + modelTrained <- file.path(outLoc) + param.best <- param[[bestInd]] + + comp <- start-Sys.time() + + # train prediction + pred <- as.matrix(finalModel) + pred[,1] <- pred[,1] + 1 # adding one to convert from python to r indexes + colnames(pred) <- c('rowId','outcomeCount','indexes', 'value') + pred <- as.data.frame(pred) + attr(pred, "metaData") <- list(predictionType="binary") + + pred$value <- 1-pred$value + prediction <- merge(population, pred[,c('rowId','value')], by='rowId') + + + # return model location + result <- list(model = modelTrained, + trainCVAuc = -1, # ToDo decide on how to deal with this + hyperParamSearch = hyperSummary, + modelSettings = list(model='fitCNNTorch',modelParameters=param.best), + metaData = plpData$metaData, + populationSettings = attr(population, 'metaData'), + outcomeId=outcomeId, + cohortId=cohortId, + varImp = covariateRef, + trainingTime =comp, + dense=1, + covariateMap=result$map, # I think this is need for new data to map the same? + predictionTrain = prediction + ) + class(result) <- 'plpModel' + attr(result, 'type') <- 'pythonReticulate' + attr(result, 'predictionType') <- 'binary' + + return(result) +} + + +trainCNNTorch <- function(plpData, population, epochs=50, nbfilters = 16, seed=0, class_weight= 0, type = 'CNN', train=TRUE, modelOutput, quiet=F){ + + train_deeptorch <- function(){return(NULL)} + + python_dir <- system.file(package='PatientLevelPrediction','python') + e <- environment() + reticulate::source_python(system.file(package='PatientLevelPrediction','python','deepTorchFunctions.py'), envir = e) + + + result <- train_deeptorch(population = population, + plpData = plpData, + epochs = as.integer(epochs), + nbfilters = as.integer(nbfilters), + seed = as.integer(seed), + class_weight = as.double(class_weight), + model_type = as.character(type), + train = train, + modelOutput = modelOutput, + quiet = quiet + ) + + if(train){ + # then get the prediction + pred <- as.matrix(result) + colnames(pred) <- c('rowId','outcomeCount','indexes', 'value') + pred <- as.data.frame(pred) + attr(pred, "metaData") <- list(predictionType="binary") + + pred$value <- 1-pred$value + auc <- computeAuc(pred) + writeLines(paste0('Model obtained CV AUC of ', auc)) + return(auc) + } + + return(result) + +} diff --git a/R/CovNN.R b/R/CovNN.R new file mode 100644 index 0000000..4c05ffb --- /dev/null +++ b/R/CovNN.R @@ -0,0 +1,535 @@ +# @file CovNN.R +# +# Copyright 2020 Observational Health Data Sciences and Informatics +# +# This file is part of PatientLevelPrediction +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
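The Torch wrappers above (setCNNTorch/fitCNNTorch/trainCNNTorch) hand the model fitting to Python via reticulate: the .py file is sourced into an environment and its functions are then called like R functions, with explicit as.integer()/as.double() coercions because R numerics are doubles by default. A minimal sketch of that pattern, assuming a hypothetical helper.py that defines train_model(epochs, seed):

library(reticulate)

e <- new.env()
source_python("helper.py", envir = e)   # exposes train_model() inside e

# coerce anything Python expects as int; plain R numbers would arrive as floats
result <- e$train_model(epochs = as.integer(50), seed = as.integer(0))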
+
+#' Create setting for multi-resolution CovNN model (structure based on https://arxiv.org/pdf/1608.00647.pdf CNN1)
+#'
+#' @param batchSize    The number of samples used in each batch during model training
+#' @param outcomeWeight   The weight assigned to the outcome class (make greater than 1 to reduce the unbalanced label issue)
+#' @param lr  The learning rate
+#' @param decay  The decay of the learning rate
+#' @param dropout  [currently not used] the dropout rate for regularisation
+#' @param epochs  The number of times the data are used to train the model (e.g., epochs=1 means the data are only used once to train)
+#' @param filters  The number of columns output by each convolution
+#' @param kernelSize  The number of time dimensions used for each convolution
+#' @param loss  The loss function implemented
+#' @param seed  The random seed
+#'
+#' @examples
+#' \dontrun{
+#' model.CovNN <- setCovNN()
+#' }
+#' @export
+setCovNN <- function(batchSize = 1000,
+                     outcomeWeight=1,
+                     lr=0.00001,
+                     decay=0.000001,
+                     dropout=0,
+                     epochs = 10,
+                     filters = 3, kernelSize = 10,
+                     loss = "binary_crossentropy",
+                     seed=NULL ){
+  #[TODO: add input checks...]
+
+  ensure_installed("keras")
+
+  if(!is.null(seed)){
+    warning('seed currently not implemented in CovNN')
+  }
+
+
+  result <- list(model='fitCovNN', param=split(expand.grid(
+    batchSize=batchSize,
+    outcomeWeight=outcomeWeight,
+    lr=lr,
+    decay=decay,
+    dropout=dropout,
+    epochs= epochs,filters=filters,
+    kernelSize=kernelSize,loss =loss,
+    seed=ifelse(is.null(seed),'NULL', seed)),
+    1:(length(batchSize)*length(outcomeWeight)*length(epochs)*
+         length(filters)*length(lr)*length(decay)*
+         length(kernelSize)*length(loss)*max(1,length(seed)))),
+    name='CovNN'
+  )
+
+  class(result) <- 'modelSettings'
+  return(result)
+}
+
+
+fitCovNN <- function(plpData,population, param, search='grid', quiet=F,
+                     outcomeId, cohortId, ...){
+  # check plpData is coo format:
+  if (!FeatureExtraction::isCovariateData(plpData$covariateData))
+    stop("Needs correct covariateData")
+  if(is.null(plpData$timeRef)){
+    stop('Data not temporal...')
+  }
+
+  metaData <- attr(population, 'metaData')
+  if(!is.null(population$indexes))
+    population <- population[population$indexes>0,]
+  attr(population, 'metaData') <- metaData
+
+  start<-Sys.time()
+
+  result<- toSparseM(plpData,population,map=NULL, temporal=T)
+
+  data <- result$data#[population$rowId,,]
+  #data<-as.array(data) -- can't make dense on big data!
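As in fitCIReNN, the outcome column is one-hot encoded with keras::to_categorical before the network is fitted (next statement). A minimal sketch of what that call produces, assuming keras is installed:

library(keras)
y <- to_categorical(c(0, 1, 0), num_classes = 2)
# y is an N x 2 indicator matrix: column 1 flags non-outcomes, column 2 flags outcomes
#      [,1] [,2]
# [1,]    1    0
# [2,]    0    1
# [3,]    1    0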
+ + #one-hot encoding + population$y <- keras::to_categorical(population$outcomeCount, 2) + #colnames(population$y) <- c('0','1') + + # do cross validation to find hyperParameter + datas <- list(population=population, plpData=data) + hyperParamSel <- lapply(param, function(x) do.call(trainCovNN, c(x,datas,train=TRUE) )) + + hyperSummary <- cbind(do.call(rbind, lapply(hyperParamSel, function(x) x$hyperSum))) + hyperSummary <- as.data.frame(hyperSummary) + hyperSummary$auc <- unlist(lapply(hyperParamSel, function (x) x$auc)) + hyperParamSel<-unlist(lapply(hyperParamSel, function(x) x$auc)) + + #now train the final model and return coef + bestInd <- which.max(abs(unlist(hyperParamSel)-0.5))[1] + finalModel<-do.call(trainCovNN, c(param[[bestInd]],datas, train=FALSE)) + + covariateRef <- as.data.frame(plpData$covariateData$covariateRef) + incs <- rep(1, nrow(covariateRef)) + covariateRef$included <- incs + covariateRef$covariateValue <- rep(0, nrow(covariateRef)) + + #modelTrained <- file.path(outLoc) + param.best <- param[[bestInd]] + + comp <- start-Sys.time() + + # train prediction + prediction <- finalModel$prediction + finalModel$prediction <- NULL + + # return model location + result <- list(model = finalModel$model, + trainCVAuc = -1, # ToDo decide on how to deal with this + hyperParamSearch = hyperSummary, + modelSettings = list(model='fitCovNN',modelParameters=param.best), + metaData = plpData$metaData, + populationSettings = attr(population, 'metaData'), + outcomeId=outcomeId, + cohortId=cohortId, + varImp = covariateRef, + trainingTime =comp, + covariateMap=result$map, + predictionTrain = prediction + ) + class(result) <- 'plpModel' + attr(result, 'type') <- 'deepMulti' + attr(result, 'inputs') <- '3' + attr(result, 'predictionType') <- 'binary' + + return(result) +} + +trainCovNN<-function(plpData, population, + outcomeWeight=1, lr=0.0001, decay=0.9, + dropout=0.5, filters=3, + kernelSize = dim(plpData)[3], + batchSize, epochs, loss= "binary_crossentropy", + seed=NULL, train=TRUE){ + + if(!is.null(population$indexes) && train==T){ + writeLines(paste('Training covolutional multi-resolution neural network with ',length(unique(population$indexes)),' fold CV')) + + index_vect <- unique(population$indexes) + perform <- c() + + # create prediction matrix to store all predictions + predictionMat <- population + predictionMat$value <- 0 + attr(predictionMat, "metaData") <- list(predictionType = "binary") + + if(kernelSize>dim(plpData)[3]){ + kernelSize <- dim(plpData)[3] -1 + warning('kernelsize reduced') + } + + for(index in 1:length(index_vect )){ + writeLines(paste('Fold ',index, ' -- with ', sum(population$indexes!=index),'train rows')) + + #submodel1 <- keras::keras_model_sequential() + + submodel1_input <- keras::layer_input(shape=c(dim(plpData)[2], dim(plpData)[3]), + name='submodel1_input') + submodel1_output <- submodel1_input %>% + # Begin with 1D convolutional layer + keras::layer_max_pooling_1d(pool_size = kernelSize) %>% + keras::layer_conv_1d( + #input_shape = c(dim(plpData)[2], dim(plpData)[3]+1-kernelSize), + filters = filters, + kernel_size = floor(dim(plpData)[2]/kernelSize), + padding = "valid" + ) %>% + keras::layer_batch_normalization() %>% + keras::layer_activation(activation = 'relu') %>% + keras::layer_flatten() + + #submodel2 <- keras::keras_model_sequential() + #submodel2 %>% + # keras::layer_reshape(input_shape=c(dim(plpData)[2], dim(plpData)[3]), + # target_shape = c(dim(plpData)[2], dim(plpData)[3])) %>% + submodel2_input <- 
keras::layer_input(shape=c(dim(plpData)[2], dim(plpData)[3]), + name='submodel2_input') + submodel2_output <- submodel2_input %>% + # Begin with 1D convolutional layer + keras::layer_max_pooling_1d(pool_size = floor(sqrt(kernelSize))) %>% + keras::layer_conv_1d( + #input_shape = c(dim(plpData)[2], dim(plpData)[3]+1-floor(sqrt(kernelSize))), + filters = filters, + kernel_size = floor(dim(plpData)[2]/floor(sqrt(kernelSize))), + padding = "valid" + ) %>% + keras::layer_batch_normalization() %>% + keras::layer_activation(activation = 'relu') %>% + keras::layer_flatten() + + + #submodel3 <- keras::keras_model_sequential() + #submodel3 %>% + submodel3_input <- keras::layer_input(shape=c(dim(plpData)[2], dim(plpData)[3]), + name='submodel3_input') + submodel3_output <- submodel3_input %>% + # Begin with 1D convolutional layer + keras::layer_conv_1d( + input_shape = c(dim(plpData)[2], dim(plpData)[3]), + filters = filters, + kernel_size = kernelSize, + padding = "valid" + ) %>% + # Normalize the activations of the previous layer + keras::layer_batch_normalization() %>% + keras::layer_activation(activation = 'relu') %>% + keras::layer_max_pooling_1d(pool_size = floor(sqrt(kernelSize))) %>% + keras::layer_conv_1d( + filters = filters, + kernel_size = floor((dim(plpData)[2]+1-kernelSize)/floor(sqrt(kernelSize))), + padding = "valid", + use_bias = T + ) %>% + keras::layer_flatten() + + #model <- keras::keras_model_sequential() + + deep_output <- keras::layer_concatenate(list(submodel1_output, + submodel2_output, + submodel3_output)) %>% + keras::layer_dropout(rate=dropout) %>% + # add fully connected layer 2 + keras::layer_dense( + units = 100, + activation = "linear", use_bias=T + ) %>% + keras::layer_batch_normalization() %>% + keras::layer_activation(activation = 'relu') %>% + + # =========== FULLY CONNECTED LAYER 2 + # add drop out of 0.5 + keras::layer_dropout(rate=dropout) %>% + # add fully connected layer 2 + keras::layer_dense( + units = 100, + activation = "linear", use_bias=T + ) %>% + keras::layer_batch_normalization() %>% + keras::layer_activation(activation = 'relu') %>% + + # =========== FINAL LAYER + keras::layer_dropout(rate=dropout) %>% + keras::layer_dense(name = 'final', + units = 2, + activation = "linear", use_bias=T + ) %>% + keras::layer_batch_normalization() %>% + keras::layer_activation(activation = 'sigmoid', name='deep_output') + + model <- keras::keras_model( + inputs = c(submodel1_input, submodel2_input, submodel3_input), + outputs = c(deep_output) + ) + + + # Prepare model for training + model %>% keras::compile( + loss = "binary_crossentropy", + metrics = c('accuracy'), + optimizer = keras::optimizer_rmsprop(lr = lr,decay = decay) + ) + earlyStopping=keras::callback_early_stopping(monitor = "val_loss", patience=10,mode="auto",min_delta = 1e-4) + reduceLr=keras::callback_reduce_lr_on_plateau(monitor="val_loss", factor =0.1, + patience = 5,mode = "auto", min_delta = 1e-5, cooldown = 0, min_lr = 0) + + class_weight=list("0"=1,"1"=outcomeWeight) + + data <- plpData[population$rowId[population$indexes!=index],,] + + #Extract validation set first - 10k people or 5% + valN <- min(10000,sum(population$indexes!=index)*0.05) + val_rows<-sample(1:sum(population$indexes!=index), valN, replace=FALSE) + train_rows <- c(1:sum(population$indexes!=index))[-val_rows] + + sampling_generator<-function(data, population, batchSize, train_rows, index){ + function(){ + gc() + rows<-sample(train_rows, batchSize, replace=FALSE) + + list(list(as.array(data[rows,,]), + as.array(data[rows,,]), + 
as.array(data[rows,,])), + population$y[population$indexes!=index,1:2][rows,]) + } + } + + + #print(table(population$y)) + + if(length(train_rows) < batchSize){ + # checking if this fixes issue with batchsize too big + batchSize <- length(train_rows) + ParallelLogger::logInfo('Reduce batchSize to training size') + } + + history <- model %>% keras::fit_generator(sampling_generator(data,population,batchSize,train_rows, index), + steps_per_epoch = length(train_rows)/batchSize, + epochs=epochs, + validation_data=list(list(as.array(data[val_rows,,]), + as.array(data[val_rows,,]), + as.array(data[val_rows,,])), + population$y[population$indexes!=index,1:2][val_rows,]), + callbacks=list(earlyStopping,reduceLr), + class_weight=class_weight) + + + + + # batch prediciton + maxVal <- sum(population$indexes==index) + batches <- lapply(1:ceiling(maxVal/batchSize), function(x) ((x-1)*batchSize+1):min((x*batchSize),maxVal)) + prediction <- population[population$indexes==index,] + prediction$value <- 0 + for(batch in batches){ + pred <- keras::predict_on_batch(model, list(as.array(plpData[population$rowId[population$indexes==index],,][batch,,]), + as.array(plpData[population$rowId[population$indexes==index],,][batch,,]), + as.array(plpData[population$rowId[population$indexes==index],,][batch,,]))) + prediction$value[batch] <- as.double(pred[,2]) + } + + attr(prediction, "metaData") <- list(predictionType = "binary") + aucVal <- computeAuc(prediction) + perform <- c(perform,aucVal) + + # add the fold predictions and compute AUC after loop + predictionMat$value[population$indexes==index] <- prediction$value + + } + + auc <- computeAuc(predictionMat) + foldPerm <- perform + + # Output ---------------------------------------------------------------- + param.val <- paste0('outcomeWeight: ', outcomeWeight, + '-- kernelSize: ',paste0(kernelSize,collapse ='-'), + '-- filters: ', filters, '--loss: ', loss, '-- lr: ', lr, '-- decay: ', decay, + '-- dropout: ', dropout, '-- batchSize: ',batchSize, '-- epochs: ', epochs) + writeLines('==========================================') + writeLines(paste0('CovNN with parameters:', param.val,' obtained an AUC of ',auc)) + writeLines('==========================================') + } else { + + #Initialize model + submodel1_input <- keras::layer_input(shape=c(dim(plpData)[2], dim(plpData)[3]), + name='submodel1_input') + submodel1_output <- submodel1_input %>% + # Begin with 1D convolutional layer + keras::layer_max_pooling_1d(pool_size = kernelSize) %>% + keras::layer_conv_1d( + #input_shape = c(dim(plpData)[2], dim(plpData)[3]+1-kernelSize), + filters = filters, + kernel_size = floor(dim(plpData)[2]/kernelSize), + padding = "valid" + ) %>% + keras::layer_batch_normalization() %>% + keras::layer_activation(activation = 'relu') %>% + keras::layer_flatten() + + submodel2_input <- keras::layer_input(shape=c(dim(plpData)[2], dim(plpData)[3]), + name='submodel2_input') + submodel2_output <- submodel2_input %>% + # Begin with 1D convolutional layer + keras::layer_max_pooling_1d(pool_size = floor(sqrt(kernelSize))) %>% + keras::layer_conv_1d( + #input_shape = c(dim(plpData)[2], dim(plpData)[3]+1-floor(sqrt(kernelSize))), + filters = filters, + kernel_size = floor(dim(plpData)[2]/floor(sqrt(kernelSize))), + padding = "valid" + ) %>% + keras::layer_batch_normalization() %>% + keras::layer_activation(activation = 'relu') %>% + keras::layer_flatten() + + + submodel3_input <- keras::layer_input(shape=c(dim(plpData)[2], dim(plpData)[3]), + name='submodel3_input') + submodel3_output <- 
submodel3_input %>% + # Begin with 1D convolutional layer + keras::layer_conv_1d( + input_shape = c(dim(plpData)[2], dim(plpData)[3]), + filters = filters, + kernel_size = kernelSize, + padding = "valid" + ) %>% + # Normalize the activations of the previous layer + keras::layer_batch_normalization() %>% + keras::layer_activation(activation = 'relu') %>% + keras::layer_max_pooling_1d(pool_size = floor(sqrt(kernelSize))) %>% + keras::layer_conv_1d( + filters = filters, + kernel_size = floor((dim(plpData)[2]+1-kernelSize)/floor(sqrt(kernelSize))), + padding = "valid", + use_bias = T + ) %>% + keras::layer_flatten() + + deep_output <- keras::layer_concatenate(list(submodel1_output, + submodel2_output, + submodel3_output)) %>% + keras::layer_dropout(rate=dropout) %>% + # add fully connected layer 2 + keras::layer_dense( + units = 100, + activation = "linear", use_bias=T + ) %>% + keras::layer_batch_normalization() %>% + keras::layer_activation(activation = 'relu') %>% + + # =========== FULLY CONNECTED LAYER 2 + # add drop out of 0.5 + keras::layer_dropout(rate=dropout) %>% + # add fully connected layer 2 + keras::layer_dense( + units = 100, + activation = "linear", use_bias=T + ) %>% + keras::layer_batch_normalization() %>% + keras::layer_activation(activation = 'relu') %>% + + # =========== FINAL LAYER + keras::layer_dropout(rate=dropout) %>% + keras::layer_dense(name = 'final', + units = 2, + activation = "linear", use_bias=T + ) %>% + keras::layer_batch_normalization() %>% + keras::layer_activation(activation = 'sigmoid', name='deep_output') + + model <- keras::keras_model( + inputs = c(submodel1_input, submodel2_input, submodel3_input), + outputs = c(deep_output) + ) + + + # Prepare model for training + model %>% keras::compile( + loss = "binary_crossentropy",#loss, + metrics = c('accuracy'), + optimizer = keras::optimizer_rmsprop(lr = lr,decay = decay) + ) + earlyStopping=keras::callback_early_stopping(monitor = "val_loss", patience=10,mode="auto",min_delta = 1e-4) + reduceLr=keras::callback_reduce_lr_on_plateau(monitor="val_loss", factor =0.1, + patience = 5,mode = "auto", min_delta = 1e-5, cooldown = 0, min_lr = 0) + + + class_weight=list("0"=1,"1"=outcomeWeight) + + data <- plpData[population$rowId,,] + + #Extract validation set first - 10k people or 5% + valN <- min(10000,length(population$indexes)*0.05) + val_rows<-sample(1:length(population$indexes), valN, replace=FALSE) + train_rows <- c(1:length(population$indexes))[-val_rows] + + + sampling_generator2<-function(data, population, batchSize, train_rows){ + function(){ + gc() + rows<-sample(train_rows, batchSize, replace=FALSE) + list(list(as.array(data[rows,,]), + as.array(data[rows,,]), + as.array(data[rows,,])), population$y[rows,]) + } + } + + + if(length(train_rows) < batchSize){ + # checking if this fixes issue with batchsize too big + batchSize <- length(train_rows) + ParallelLogger::logInfo('Reduce batchSize to training size') + } + + history <- model %>% keras::fit_generator(sampling_generator2(data,population,batchSize,train_rows), + steps_per_epoch = length(train_rows)/batchSize, + epochs=epochs, + validation_data=list(list(as.array(data[val_rows,,]), + as.array(data[val_rows,,]), + as.array(data[val_rows,,])), + population$y[val_rows,]), + callbacks=list(earlyStopping,reduceLr), + class_weight=class_weight, + view_metrics=F) + + # batched prediciton + maxVal <- nrow(population) + batches <- lapply(1:ceiling(maxVal/batchSize), function(x) ((x-1)*batchSize+1):min((x*batchSize),maxVal)) + prediction <- population + 
prediction$value <- 0 + for(batch in batches){ + pred <- keras::predict_on_batch(model, list(as.array(plpData[batch,,]), + as.array(plpData[batch,,]), + as.array(plpData[batch,,]))) + prediction$value[batch] <- as.double(pred[,2]) + } + + attr(prediction, "metaData") <- list(predictionType = "binary") + auc <- computeAuc(prediction) + foldPerm <- auc + predictionMat <- prediction + } + + result <- list(model=model, + auc=auc, + prediction = predictionMat, + hyperSum = unlist(list(batchSize = batchSize, lr=lr, decay=decay, + outcomeWeight=outcomeWeight, + dropout=dropout,filters=filters, + kernelSize=paste0(kernelSize,collapse ='-'), + epochs=epochs, loss=loss, + fold_auc=foldPerm)) + ) + + return(result) + +} diff --git a/R/CovNN2.R b/R/CovNN2.R new file mode 100644 index 0000000..de7bd45 --- /dev/null +++ b/R/CovNN2.R @@ -0,0 +1,461 @@ +# @file CovNN2.R +# +# Copyright 2020 Observational Health Data Sciences and Informatics +# +# This file is part of PatientLevelPrediction +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#' Create setting for CovNN2 model - convolution across input and time - https://arxiv.org/pdf/1608.00647.pdf +#' +#' @param batchSize The number of samples to used in each batch during model training +#' @param outcomeWeight The weight assined to the outcome (make greater than 1 to reduce unballanced label issue) +#' @param lr The learning rate +#' @param decay The decay of the learning rate +#' @param dropout [currently not used] the dropout rate for regularisation +#' @param epochs The number of times data is used to train the model (e.g., epoches=1 means data only used once to train) +#' @param filters The number of columns output by each convolution +#' @param kernelSize The number of time dimensions used for each convolution +#' @param loss The loss function implemented +#' @param seed The random seed +#' +#' @examples +#' \dontrun{ +#' model.CovNN <- setCovNN() +#' } +#' @export +setCovNN2 <- function(batchSize = 1000, + outcomeWeight=1, + lr=0.00001, + decay=0.000001, + dropout=0, + epochs = 10, + filters = 3, kernelSize = 10, + loss = "binary_crossentropy", + seed=NULL ){ + #[TODO: add input checks...] 
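+  # One possible sketch for the checks flagged in the TODO above, kept
+  # commented out in the same spirit as the commented-out checks in
+  # setDeepNN() (conditions and messages are illustrative only):
+  # if(!is.numeric(batchSize) || any(batchSize < 1))
+  #   stop('batchSize must be a positive integer')
+  # if(!is.numeric(lr) || any(lr <= 0))
+  #   stop('lr must be a numeric value > 0')
+  # if(!is.numeric(dropout) || any(dropout < 0) || any(dropout >= 1))
+  #   stop('dropout must be >= 0 and < 1')
+  # As elsewhere in this file, the hyper-parameter vectors are expanded with
+  # expand.grid() below, e.g. lr = c(1e-4, 1e-5) and filters = c(3, 5) with
+  # scalar defaults for the rest yields a param list of 4 candidate settings.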
+ + ensure_installed("keras") + + if(!is.null(seed)){ + warning('seed currently not implemented in CovNN') + } + + result <- list(model='fitCovNN2', param=split(expand.grid( + batchSize=batchSize, + outcomeWeight=outcomeWeight, + lr=lr, + decay=decay, + dropout=dropout, + epochs= epochs,filters=filters, + kernelSize=kernelSize,loss =loss, + seed=ifelse(is.null(seed),'NULL', seed)), + 1:(length(batchSize)*length(outcomeWeight)*length(epochs)* + length(filters)*length(lr)*length(decay)* + length(kernelSize)*length(loss)*max(1,length(seed)))), + name='CovNN2' + ) + + class(result) <- 'modelSettings' + return(result) +} + + +fitCovNN2 <- function(plpData,population, param, search='grid', quiet=F, + outcomeId, cohortId, ...){ + # check plpData is coo format: + if (!FeatureExtraction::isCovariateData(plpData$covariateData)) + stop("Needs correct covariateData") + if(is.null(plpData$timeRef)){ + stop('Data not temporal...') + } + + metaData <- attr(population, 'metaData') + if(!is.null(population$indexes)) + population <- population[population$indexes>0,] + attr(population, 'metaData') <- metaData + + start<-Sys.time() + + result<- toSparseM(plpData,population,map=NULL, temporal=T) + + data <- result$data#[population$rowId,,] + #data<-as.array(data) -- cant make dense on big data! + + #one-hot encoding + population$y <- keras::to_categorical(population$outcomeCount, 2) + #colnames(population$y) <- c('0','1') + + # do cross validation to find hyperParameter + datas <- list(population=population, plpData=data) + hyperParamSel <- lapply(param, function(x) do.call(trainCovNN2, c(x,datas,train=TRUE) )) + + hyperSummary <- cbind(do.call(rbind, lapply(hyperParamSel, function(x) x$hyperSum))) + hyperSummary <- as.data.frame(hyperSummary) + hyperSummary$auc <- unlist(lapply(hyperParamSel, function (x) x$auc)) + hyperParamSel<-unlist(lapply(hyperParamSel, function(x) x$auc)) + + #now train the final model and return coef + bestInd <- which.max(abs(unlist(hyperParamSel)-0.5))[1] + finalModel<-do.call(trainCovNN2, c(param[[bestInd]],datas, train=FALSE)) + + covariateRef <- as.data.frame(plpData$covariateData$covariateRef) + incs <- rep(1, nrow(covariateRef)) + covariateRef$included <- incs + covariateRef$covariateValue <- rep(0, nrow(covariateRef)) + + #modelTrained <- file.path(outLoc) + param.best <- param[[bestInd]] + + comp <- start-Sys.time() + + # train prediction + prediction <- finalModel$prediction + finalModel$prediction <- NULL + + # return model location + result <- list(model = finalModel$model, + trainCVAuc = -1, # ToDo decide on how to deal with this + hyperParamSearch = hyperSummary, + modelSettings = list(model='fitCovNN2',modelParameters=param.best), + metaData = plpData$metaData, + populationSettings = attr(population, 'metaData'), + outcomeId=outcomeId, + cohortId=cohortId, + varImp = covariateRef, + trainingTime =comp, + covariateMap=result$map, + predictionTrain = prediction + ) + class(result) <- 'plpModel' + attr(result, 'type') <- 'deep' + attr(result, 'predictionType') <- 'binary' + + return(result) +} + +trainCovNN2<-function(plpData, population, + outcomeWeight=1, lr=0.0001, decay=0.9, + dropout=0.5, filters=3, + kernelSize = dim(plpData)[3], + batchSize, epochs, loss= "binary_crossentropy", + seed=NULL, train=TRUE){ + + if(!is.null(population$indexes) && train==T){ + writeLines(paste('Training covolutional neural network (input and time) with ',length(unique(population$indexes)),' fold CV')) + + index_vect <- unique(population$indexes) + perform <- c() + + # create prediction 
matrix to store all predictions + predictionMat <- population + predictionMat$value <- 0 + attr(predictionMat, "metaData") <- list(predictionType = "binary") + + for(index in 1:length(index_vect )){ + writeLines(paste('Fold ',index, ' -- with ', sum(population$indexes!=index),'train rows')) + + model <- keras::keras_model_sequential() %>% + + ##model_input <- keras::layer_input(shape=c(dim(plpData)[2], dim(plpData)[3]), + ## name='initial_input') %>% + keras::layer_permute(dims = c(2,1), + input_shape = c(dim(plpData)[2], dim(plpData)[3])) %>% + + # Begin with 1D convolutional layer across input - hidden layer 1 + ##model_output <- keras::layer_input(shape=c(dim(plpData)[3], dim(plpData)[2]), + ## name='second_input') %>% + keras::layer_conv_1d( + filters = filters, + kernel_size = dim(plpData)[3]-2, + padding = "valid" + ) %>% + keras::layer_batch_normalization() %>% + keras::layer_activation(activation = 'relu') %>% + # second layer across input - hidden layer 2 time x filters + keras::layer_conv_1d( + filters = filters, + kernel_size = 3,#filters, + padding = "valid" + ) %>% + keras::layer_batch_normalization() %>% + keras::layer_activation(activation = 'relu') %>% + + # permute bact to time x var + keras::layer_permute(dims = c(2,1)) %>% + #max pool over time and conv + keras::layer_max_pooling_1d(pool_size = 2) %>% + keras::layer_conv_1d( + filters = filters, + kernel_size = 2, + padding = "valid" + ) %>% + keras::layer_batch_normalization() %>% + keras::layer_activation(activation = 'relu') %>% + keras::layer_flatten() %>% + # final 2 deep layers with dropout and batchnorm/ relu activation + keras::layer_dropout(rate=dropout) %>% + # add fully connected layer 2 + keras::layer_dense( + units = 100, + activation = "linear", use_bias=T + ) %>% + keras::layer_batch_normalization() %>% + keras::layer_activation(activation = 'relu') %>% + + # =========== FULLY CONNECTED LAYER 2 + # add drop out of 0.5 + keras::layer_dropout(rate=dropout) %>% + # add fully connected layer 2 + keras::layer_dense( + units = 100, + activation = "linear", use_bias=T + ) %>% + keras::layer_batch_normalization() %>% + keras::layer_activation(activation = 'relu') %>% + + # =========== FINAL LAYER + keras::layer_dropout(rate=dropout) %>% + keras::layer_dense(name = 'final', + units = 2, + activation = "linear", use_bias=T + ) %>% + keras::layer_batch_normalization() %>% + keras::layer_activation(activation = 'sigmoid', name='main_output') + + ##model <- keras::keras_model( + ## inputs = c(model_input), + ## outputs = c(model_output) + ##) + + # Prepare model for training + model %>% keras::compile( + loss = "binary_crossentropy", + metrics = c('accuracy'), + optimizer = keras::optimizer_rmsprop(lr = lr,decay = decay) + ) + earlyStopping=keras::callback_early_stopping(monitor = "val_loss", patience=10,mode="auto",min_delta = 1e-4) + reduceLr=keras::callback_reduce_lr_on_plateau(monitor="val_loss", factor =0.1, + patience = 5,mode = "auto", min_delta = 1e-5, cooldown = 0, min_lr = 0) + + class_weight=list("0"=1,"1"=outcomeWeight) + + data <- plpData[population$rowId[population$indexes!=index],,] + + #Extract validation set first - 10k people or 5% + valN <- min(10000,sum(population$indexes!=index)*0.05) + val_rows<-sample(1:sum(population$indexes!=index), valN, replace=FALSE) + train_rows <- c(1:sum(population$indexes!=index))[-val_rows] + + sampling_generator<-function(data, population, batchSize, train_rows, index){ + function(){ + gc() + rows<-sample(train_rows, batchSize, replace=FALSE) + + 
list(as.array(data[rows,,]), + population$y[population$indexes!=index,1:2][rows,]) + } + } + + + #print(table(population$y)) + if(length(train_rows) < batchSize){ + # checking if this fixes issue with batchsize too big + batchSize <- length(train_rows) + ParallelLogger::logInfo('Reducing batchSize to training size') + } + + history <- model %>% keras::fit_generator(sampling_generator(data,population,batchSize,train_rows, index), + steps_per_epoch = length(train_rows)/batchSize, + epochs=epochs, + validation_data=list(as.array(data[val_rows,,]), + population$y[population$indexes!=index,1:2][val_rows,]), + callbacks=list(earlyStopping,reduceLr), + class_weight=class_weight) + + + + + # batch prediciton + maxVal <- sum(population$indexes==index) + batches <- lapply(1:ceiling(maxVal/batchSize), function(x) ((x-1)*batchSize+1):min((x*batchSize),maxVal)) + prediction <- population[population$indexes==index,] + prediction$value <- 0 + for(batch in batches){ + pred <- keras::predict_on_batch(model, as.array(plpData[population$rowId[population$indexes==index],,][batch,,])) + prediction$value[batch] <- as.double(pred[,2]) + } + + attr(prediction, "metaData") <- list(predictionType = "binary") + aucVal <- computeAuc(prediction) + perform <- c(perform,aucVal) + + # add the fold predictions and compute AUC after loop + predictionMat$value[population$indexes==index] <- prediction$value + + } + + auc <- computeAuc(predictionMat) + foldPerm <- perform + + # Output ---------------------------------------------------------------- + param.val <- paste0('outcomeWeight: ', outcomeWeight, + '-- kernelSize: ',paste0(kernelSize,collapse ='-'), + '-- filters: ', filters, '--loss: ', loss, '-- lr: ', lr, '-- decay: ', decay, + '-- dropout: ', dropout, '-- batchSize: ',batchSize, '-- epochs: ', epochs) + writeLines('==========================================') + writeLines(paste0('CovNN with parameters:', param.val,' obtained an AUC of ',auc)) + writeLines('==========================================') + } else { + + model <- keras::keras_model_sequential() %>% + + keras::layer_permute(dims = c(2,1), + input_shape = c(dim(plpData)[2], dim(plpData)[3])) %>% + + # Begin with 1D convolutional layer across input - hidden layer 1 + keras::layer_conv_1d( + filters = filters, + kernel_size = dim(plpData)[3]-2, + padding = "valid" + ) %>% + keras::layer_batch_normalization() %>% + keras::layer_activation(activation = 'relu') %>% + # second layer across input - hidden layer 2 time x filters + keras::layer_conv_1d( + filters = filters, + kernel_size = 3, + padding = "valid" + ) %>% + keras::layer_batch_normalization() %>% + keras::layer_activation(activation = 'relu') %>% + + # permute bact to time x var + keras::layer_permute(dims = c(2,1)) %>% + #max pool over time and conv + keras::layer_max_pooling_1d(pool_size = 2) %>% + keras::layer_conv_1d( + filters = filters, + kernel_size = 2, + padding = "valid" + ) %>% + keras::layer_batch_normalization() %>% + keras::layer_activation(activation = 'relu') %>% + keras::layer_flatten() %>% + # final 2 deep layers with dropout and batchnorm/ relu activation + keras::layer_dropout(rate=dropout) %>% + # add fully connected layer 2 + keras::layer_dense( + units = 100, + activation = "linear", use_bias=T + ) %>% + keras::layer_batch_normalization() %>% + keras::layer_activation(activation = 'relu') %>% + + # =========== FULLY CONNECTED LAYER 2 + # add drop out of 0.5 + keras::layer_dropout(rate=dropout) %>% + # add fully connected layer 2 + keras::layer_dense( + units = 100, + activation 
= "linear", use_bias=T + ) %>% + keras::layer_batch_normalization() %>% + keras::layer_activation(activation = 'relu') %>% + + # =========== FINAL LAYER + keras::layer_dropout(rate=dropout) %>% + keras::layer_dense(name = 'final', + units = 2, + activation = "linear", use_bias=T + ) %>% + keras::layer_batch_normalization() %>% + keras::layer_activation(activation = 'sigmoid', name='main_output') + + + + # Prepare model for training + model %>% keras::compile( + loss = "binary_crossentropy",#loss, + metrics = c('accuracy'), + optimizer = keras::optimizer_rmsprop(lr = lr,decay = decay) + ) + earlyStopping=keras::callback_early_stopping(monitor = "val_loss", patience=10,mode="auto",min_delta = 1e-4) + reduceLr=keras::callback_reduce_lr_on_plateau(monitor="val_loss", factor =0.1, + patience = 5,mode = "auto", min_delta = 1e-5, cooldown = 0, min_lr = 0) + + + class_weight=list("0"=1,"1"=outcomeWeight) + + data <- plpData[population$rowId,,] + + #Extract validation set first - 10k people or 5% + valN <- min(10000,length(population$indexes)*0.05) + val_rows<-sample(1:length(population$indexes), valN, replace=FALSE) + train_rows <- c(1:length(population$indexes))[-val_rows] + + + sampling_generator2<-function(data, population, batchSize, train_rows){ + function(){ + gc() + rows<-sample(train_rows, batchSize, replace=FALSE) + list(as.array(data[rows,,]), population$y[rows,]) + } + } + + if(length(train_rows) < batchSize){ + # checking if this fixes issue with batchsize too big + batchSize <- length(train_rows) + ParallelLogger::logInfo('Reducing batchSize to training size') + } + + + history <- model %>% keras::fit_generator(sampling_generator2(data,population,batchSize,train_rows), + steps_per_epoch = length(train_rows)/batchSize, + epochs=epochs, + validation_data=list(as.array(data[val_rows,,]), + population$y[val_rows,]), + callbacks=list(earlyStopping,reduceLr), + class_weight=class_weight, + view_metrics=F) + + # batched prediciton + maxVal <- nrow(population) + batches <- lapply(1:ceiling(maxVal/batchSize), function(x) ((x-1)*batchSize+1):min((x*batchSize),maxVal)) + prediction <- population + prediction$value <- 0 + for(batch in batches){ + pred <- keras::predict_on_batch(model, as.array(plpData[batch,,])) + prediction$value[batch] <- as.double(pred[,2]) + } + + attr(prediction, "metaData") <- list(predictionType = "binary") + auc <- computeAuc(prediction) + foldPerm <- auc + predictionMat <- prediction + } + + result <- list(model=model, + auc=auc, + prediction = predictionMat, + hyperSum = unlist(list(batchSize = batchSize, lr=lr, decay=decay, + outcomeWeight=outcomeWeight, + dropout=dropout,filters=filters, + kernelSize=paste0(kernelSize,collapse ='-'), + epochs=epochs, loss=loss, + fold_auc=foldPerm)) + ) + + return(result) + +} diff --git a/R/DeepNN.R b/R/DeepNN.R new file mode 100644 index 0000000..14c57b0 --- /dev/null +++ b/R/DeepNN.R @@ -0,0 +1,557 @@ +# @file DeepNN.R +# +# Copyright 2020 Observational Health Data Sciences and Informatics +# +# This file is part of PatientLevelPrediction +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +#' Create setting for DeepNN model +#' +#' @param units The number of units of the deep network - as a list of vectors +#' @param layer_dropout The layer dropout rate (regularisation) +#' @param lr Learning rate +#' @param decay Learning rate decay over each update. +#' @param outcome_weight The weight of the outcome class in the loss function +#' @param batch_size The number of data points to use per training batch +#' @param epochs Number of times to iterate over dataset +#' @param seed Random seed used by deep learning model +#' +#' @examples +#' \dontrun{ +#' model <- setDeepNN() +#' } +#' @export +setDeepNN <- function(units=list(c(128, 64), 128), layer_dropout=c(0.2), + lr =c(1e-4), decay=c(1e-5), outcome_weight = c(1.0), batch_size = c(100), + epochs= c(100), seed=NULL ){ + + ensure_installed("keras") + + # if(class(indexFolder)!='character') + # stop('IndexFolder must be a character') + # if(length(indexFolder)>1) + # stop('IndexFolder must be one') + # + # if(class(units)!='numeric') + # stop('units must be a numeric value >0 ') + # if(units<1) + # stop('units must be a numeric value >0 ') + # + # #if(length(units)>1) + # # stop('units can only be a single value') + # + # if(class(recurrent_dropout)!='numeric') + # stop('dropout must be a numeric value >=0 and <1') + # if( (recurrent_dropout<0) | (recurrent_dropout>=1)) + # stop('dropout must be a numeric value >=0 and <1') + # if(class(layer_dropout)!='numeric') + # stop('layer_dropout must be a numeric value >=0 and <1') + # if( (layer_dropout<0) | (layer_dropout>=1)) + # stop('layer_dropout must be a numeric value >=0 and <1') + # if(class(lr)!='numeric') + # stop('lr must be a numeric value >0') + # if(lr<=0) + # stop('lr must be a numeric value >0') + # if(class(decay)!='numeric') + # stop('decay must be a numeric value >=0') + # if(decay<=0) + # stop('decay must be a numeric value >=0') + # if(class(outcome_weight)!='numeric') + # stop('outcome_weight must be a numeric value >=0') + # if(outcome_weight<=0) + # stop('outcome_weight must be a numeric value >=0') + # if(class(batch_size)!='numeric') + # stop('batch_size must be an integer') + # if(batch_size%%1!=0) + # stop('batch_size must be an integer') + # if(class(epochs)!='numeric') + # stop('epochs must be an integer') + # if(epochs%%1!=0) + # stop('epochs must be an integer') + # if(!class(seed)%in%c('numeric','NULL')) + # stop('Invalid seed') + #if(class(UsetidyCovariateData)!='logical') + # stop('UsetidyCovariateData must be an TRUE or FALSE') + + param <- expand.grid(units=units, + layer_dropout=layer_dropout, + lr =lr, decay=decay, outcome_weight=outcome_weight,epochs= epochs, + seed=ifelse(is.null(seed),'NULL', seed)) + param$units1=unlist(lapply(param$units, function(x) x[1])) + param$units2=unlist(lapply(param$units, function(x) x[2])) + param$units3=unlist(lapply(param$units, function(x) x[3])) + + result <- list(model='fitDeepNN', param=split(param, + 1:(length(units)*length(layer_dropout)*length(lr)*length(decay)*length(outcome_weight)*length(epochs)*max(1,length(seed)))), + name='DeepNN' + ) + + class(result) <- 'modelSettings' + return(result) +} + + +fitDeepNN <- function(plpData,population, param, search='grid', quiet=F, + outcomeId, cohortId, ...){ + # check plpData is coo format: + if (!FeatureExtraction::isCovariateData(plpData$covariateData)){ + stop('DeepNN requires correct covariateData') + } + if(!is.null(plpData$timeRef)){ + warning('Data 
temporal but deepNN uses non-temporal data...') + } + + metaData <- attr(population, 'metaData') + if(!is.null(population$indexes)) + population <- population[population$indexes>0,] + attr(population, 'metaData') <- metaData + + start<-Sys.time() + + result<- toSparseM(plpData,population,map=NULL, temporal=F) + data <- result$data + + #one-hot encoding + population$y <- keras::to_categorical(population$outcomeCount, length(unique(population$outcomeCount))) + + # do cross validation to find hyperParameter + datas <- list(population=population, plpData=data) + hyperParamSel <- lapply(param, function(x) do.call(trainDeepNN, c(x,datas,train=TRUE) )) + + hyperSummary <- cbind(do.call(rbind, lapply(hyperParamSel, function(x) x$hyperSum))) + hyperSummary <- as.data.frame(hyperSummary) + hyperSummary$auc <- unlist(lapply(hyperParamSel, function (x) x$auc)) + hyperParamSel<-unlist(lapply(hyperParamSel, function(x) x$auc)) + + #now train the final model and return coef + bestInd <- which.max(abs(unlist(hyperParamSel)-0.5))[1] + finalModel<-do.call(trainDeepNN, c(param[[bestInd]],datas, train=FALSE)) + + covariateRef <- as.data.frame(plpData$covariateData$covariateRef) + incs <- rep(1, nrow(covariateRef)) + covariateRef$included <- incs + covariateRef$covariateValue <- rep(0, nrow(covariateRef)) + + #modelTrained <- file.path(outLoc) + param.best <- param[[bestInd]] + + comp <- start-Sys.time() + + # train prediction + prediction <- finalModel$prediction + finalModel$prediction <- NULL + + # return model location + result <- list(model = finalModel$model, + trainCVAuc = -1, # ToDo decide on how to deal with this + hyperParamSearch = hyperSummary, + modelSettings = list(model='fitDeepNN',modelParameters=param.best), + metaData = plpData$metaData, + populationSettings = attr(population, 'metaData'), + outcomeId=outcomeId, + cohortId=cohortId, + varImp = covariateRef, + trainingTime =comp, + covariateMap=result$map, + predictionTrain = prediction + ) + class(result) <- 'plpModel' + attr(result, 'type') <- 'deep' + attr(result, 'predictionType') <- 'binary' + + return(result) +} + +trainDeepNN<-function(plpData, population, + units1=128, units2= NULL, units3=NULL, + layer_dropout=0.2, + lr =1e-4, decay=1e-5, outcome_weight = 1.0, batch_size = 100, + epochs= 100, seed=NULL, train=TRUE, ...){ + + ParallelLogger::logInfo(paste('Training deep neural network with ',length(unique(population$indexes)),' fold CV')) + if(!is.null(population$indexes) && train==T){ + index_vect <- unique(population$indexes) + perform <- c() + + # create prediction matrix to store all predictions + predictionMat <- population + predictionMat$value <- 0 + attr(predictionMat, "metaData") <- list(predictionType = "binary") + + for(index in 1:length(index_vect )){ + ParallelLogger::logInfo(paste('Fold ',index, ' -- with ', sum(population$indexes!=index),'train rows')) + + model <- keras::keras_model_sequential() + + if(is.na(units2)){ + model %>% + keras::layer_dense(units=units1, #activation='identify', + input_shape=ncol(plpData)) %>% + keras::layer_dropout(layer_dropout) %>% + keras::layer_dense(units=2, activation='sigmoid', use_bias = T) + } else if(is.na(units3)){ + model %>% + keras::layer_dense(units=units1, #activation='identify', + input_shape=ncol(plpData)) %>% + keras::layer_dropout(layer_dropout) %>% + keras::layer_dense(units=units2 #,activation='identify' + ) %>% + keras::layer_dropout(layer_dropout) %>% + keras::layer_dense(units=2, activation='sigmoid', use_bias = T) + } else{ + model %>% + 
keras::layer_dense(units=units1, #activation='identify', + input_shape=ncol(plpData)) %>% + keras::layer_dropout(layer_dropout) %>% + keras::layer_dense(units=units2 #,activation='identify' + ) %>% + keras::layer_dropout(layer_dropout) %>% + keras::layer_dense(units=units3 #,activation='identify' + ) %>% + keras::layer_dropout(layer_dropout) %>% + keras::layer_dense(units=2, activation='sigmoid', use_bias = T) + } + + + # Prepare model for training + model %>% keras::compile( + loss = "binary_crossentropy", + metrics = c('accuracy'), + optimizer = keras::optimizer_rmsprop(lr = lr,decay = decay) + ) + earlyStopping=keras::callback_early_stopping(monitor = "val_loss", patience=10,mode="auto",min_delta = 1e-4) + reduceLr=keras::callback_reduce_lr_on_plateau(monitor="val_loss", factor =0.1, + patience = 5,mode = "auto", min_delta = 1e-5, cooldown = 0, min_lr = 0) + + class_weight=list("0"=1,"1"=outcome_weight) + + data <- plpData[population$rowId[population$indexes!=index],] + + #Extract validation set first - 10k people or 5% + valN <- min(10000,sum(population$indexes!=index)*0.05) + val_rows<-sample(1:sum(population$indexes!=index), valN, replace=FALSE) + train_rows <- c(1:sum(population$indexes!=index))[-val_rows] + + sampling_generator<-function(data, population, batch_size, train_rows, index){ + function(){ + gc() + rows<-sample(train_rows, batch_size, replace=FALSE) + + list(as.array(data[rows,]), + population$y[population$indexes!=index,1:2][rows,]) + } + } + + + #print(table(population$y)) + + history <- model %>% keras::fit_generator(sampling_generator(data,population,batch_size,train_rows, index), + steps_per_epoch = floor(sum(population$indexes!=index)/batch_size), + epochs=epochs, + validation_data=list(as.array(data[val_rows,]), + population$y[population$indexes!=index,1:2][val_rows,]), + callbacks=list(earlyStopping,reduceLr), + class_weight=class_weight) + + + # batch prediciton + maxVal <- sum(population$indexes==index) + batches <- lapply(1:ceiling(maxVal/batch_size), function(x) ((x-1)*batch_size+1):min((x*batch_size),maxVal)) + prediction <- population[population$indexes==index,] + prediction$value <- 0 + for(batch in batches){ + pred <- keras::predict_proba(model, as.array(plpData[population$rowId[population$indexes==index],][batch,,drop=FALSE])) + prediction$value[batch] <- pred[,2] + } + + attr(prediction, "metaData") <- list(predictionType = "binary") + aucVal <- computeAuc(prediction) + perform <- c(perform,aucVal) + + # add the fold predictions and compute AUC after loop + predictionMat$value[population$indexes==index] <- prediction$value + + } + + auc <- computeAuc(predictionMat) + foldPerm <- perform + + # Output ---------------------------------------------------------------- + param.val <- paste0('units1: ',units1,'units2: ',units2,'units3: ',units3, + 'layer_dropout: ',layer_dropout,'-- lr: ', lr, + '-- decay: ', decay, '-- batch_size: ',batch_size, '-- epochs: ', epochs) + ParallelLogger::logInfo('==========================================') + ParallelLogger::logInfo(paste0('DeepNN with parameters:', param.val,' obtained an AUC of ',auc)) + ParallelLogger::logInfo('==========================================') + + } else { + + model <- keras::keras_model_sequential() + if(is.na(units2)){ + model %>% + keras::layer_dense(units=units1, #activation='identify', + input_shape=ncol(plpData)) %>% + keras::layer_dropout(layer_dropout) %>% + keras::layer_dense(units=2, activation='sigmoid', use_bias = T) + } else if(is.na(units3)){ + model %>% + 
keras::layer_dense(units=units1, #activation='identify', + input_shape=ncol(plpData)) %>% + keras::layer_dropout(layer_dropout) %>% + keras::layer_dense(units=units2 #,activation='identify' + ) %>% + keras::layer_dropout(layer_dropout) %>% + keras::layer_dense(units=2, activation='sigmoid', use_bias = T) + } else{ + model %>% + keras::layer_dense(units=units1, #activation='identify', + input_shape=ncol(plpData)) %>% + keras::layer_dropout(layer_dropout) %>% + keras::layer_dense(units=units2 #,activation='identify' + ) %>% + keras::layer_dropout(layer_dropout) %>% + keras::layer_dense(units=units3 #,activation='identify' + ) %>% + keras::layer_dropout(layer_dropout) %>% + keras::layer_dense(units=2, activation='sigmoid', use_bias = T) + } + + # Prepare model for training + model %>% keras::compile( + loss = "binary_crossentropy", + metrics = c('accuracy'), + optimizer = keras::optimizer_rmsprop(lr = lr,decay = decay) + ) + earlyStopping=keras::callback_early_stopping(monitor = "val_loss", patience=10,mode="auto",min_delta = 1e-4) + reduceLr=keras::callback_reduce_lr_on_plateau(monitor="val_loss", factor =0.1, + patience = 5,mode = "auto", min_delta = 1e-5, cooldown = 0, min_lr = 0) + + class_weight=list("0"=1,"1"=outcome_weight) + + #Extract validation set first - 10k people or 5% + valN <- min(10000,nrow(population)*0.05) + val_rows<-sample(1:nrow(population), valN, replace=FALSE) + train_rows <- c(1:nrow(population))[-val_rows] + + sampling_generator2<-function(data, population, batch_size, train_rows){ + function(){ + gc() + rows<-sample(train_rows, batch_size, replace=FALSE) + + list(as.array(data[rows,,]), + population$y[,1:2][rows,]) + } + } + + history <- model %>% keras::fit_generator(sampling_generator2(plpData,population,batch_size,train_rows), + steps_per_epoch = nrow(population)/batch_size, + epochs=epochs, + validation_data=list(as.array(plpData[val_rows,,]), + population$y[val_rows,1:2]), + callbacks=list(earlyStopping,reduceLr), + class_weight=class_weight) + + + # batch prediciton + maxVal <- nrow(population) + batches <- lapply(1:ceiling(maxVal/batch_size), function(x) ((x-1)*batch_size+1):min((x*batch_size),maxVal)) + prediction <- population + prediction$value <- 0 + for(batch in batches){ + pred <- keras::predict_proba(model, as.array(plpData[batch,,drop=FALSE])) + prediction$value[batch] <- pred[,2] + } + + attr(prediction, "metaData") <- list(predictionType = "binary") + auc <- computeAuc(prediction) + foldPerm <- auc + predictionMat <- prediction + } + + result <- list(model=model, + auc=auc, + prediction = predictionMat, + hyperSum = unlist(list(units1=units1,units2=units2,units3=units3, + layer_dropout=layer_dropout,lr =lr, decay=decay, + batch_size = batch_size, epochs= epochs)) + ) + return(result) + +} + + + +#' [Under development] Transfer learning +#' +#' @param plpResult The plp result when training a kersa deep learning model on big data +#' @param plpData The new data to fine tune the model on +#' @param population The population for the new data +#' @param fixLayers boolean specificying whether to fix weights in model being transferred +#' @param includeTop If TRUE the final layer of the model being transferred is removed +#' @param addLayers vector specifying nodes in each layer to add e.g. 
c(100,10) will add another layer with 100 nodels and then a final layer with 10 +#' @param layerDropout Add dropout to each new layer (binary vector length of addLayers) +#' @param layerActivation Activation function for each new layer (string vector length of addLayers) +#' @param outcomeWeight The weight to assign the class 1 when training the model +#' @param batchSize Size of each batch for updating layers +#' @param epochs Number of epoches to run +#' @examples +#' \dontrun{ +#' modelSet <- setDeepNN() +#' plpResult <- runPlp(plpData, population, modelSettings = modelSet, ...) +#' +#' transferLearning(...) +#' } +#' @export +transferLearning <- function(plpResult, + plpData, + population, + fixLayers = T, + includeTop= F, + addLayers = c(100,10), + layerDropout = c(T,T), + layerActivation = c('relu','softmax'), + outcomeWeight = 1, + batchSize = 10000, + epochs=20){ + + # checks + if(!is.null(addLayers)){ + if(length(addLayers)!=length(layerDropout)){ + stop('Layer vector not same length as layer dropout vector') + } + if(length(addLayers)!=length(layerActivation)){ + stop('Layer vector not same length as layer activation vector') + } + } + if(batchSize > nrow(population)){ + warning('batchSize is too big for your data...') + batchSize = nrow(population)/10 + } + + if(!includeTop){ + # remove last layer if not dropout + if(length(grep('Dropout',as.character(plpResult$model$model$layers[[length(plpResult$model$model$layers)]])))==0){ + keras::pop_layer(plpResult$model$model) + } + } + + # create the base pre-trained + base_model <- plpResult$model$model + + # add our custom layers + predictions <- base_model$output + + + # fix the older layers + if(fixLayers){ + for (i in 1:length(base_model$layers)){ + try({base_model$layers[[i]]$trainable <- F}, silent = TRUE) + } + } + + ## add loop over settings here - move code to new function and call it + ##==== + # !!check this looping logic works + if(!is.null(addLayers)){ + for(i in 1:length(addLayers)){ + predictions <- keras::layer_dense(predictions,units = addLayers[i], activation = layerActivation[i]) + if(layerDropout[i]){ + predictions <- keras::layer_dropout(predictions, rate = 0.5) + } + } + } + + # add find layer for binary outcome + predictions <- keras::layer_dense(predictions,units = 2, activation = 'sigmoid') + + + # this is the model we will train + model <- keras::keras_model(inputs = base_model$input, outputs = predictions) + + # compile the model (should be done *after* setting layers to non-trainable) + model %>% keras::compile(optimizer = 'rmsprop', loss = 'binary_crossentropy', + metrics = c('accuracy')) + + + # make this input... 
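+  # For reference, a hypothetical call (argument values are assumptions, not
+  # taken from the original code) fine-tuning a previously trained keras
+  # DeepNN model on data from a new database:
+  # updated <- transferLearning(plpResult = plpResult,
+  #                             plpData = newPlpData,
+  #                             population = newPopulation,
+  #                             fixLayers = TRUE, includeTop = FALSE,
+  #                             addLayers = c(100, 10),
+  #                             layerDropout = c(TRUE, TRUE),
+  #                             layerActivation = c('relu', 'softmax'),
+  #                             outcomeWeight = 1, batchSize = 5000, epochs = 10)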
+ earlyStopping=keras::callback_early_stopping(monitor = "val_loss", patience=10,mode="auto",min_delta = 1e-4) + reduceLr=keras::callback_reduce_lr_on_plateau(monitor="val_loss", factor =0.1, + patience = 5,mode = "auto", min_delta = 1e-5, cooldown = 0, + min_lr = 0) + + class_weight=list("0"=1,"1"=outcomeWeight) + + + sampling_generator<-function(data, population, batchSize, train_rows){ + function(){ + gc() + rows<-sample(train_rows, batchSize, replace=FALSE) + + list(as.array(data[rows,]), + population$y[rows,1:2]) + } + } + + # convert plpdata to matrix: + metaData <- attr(population, 'metaData') + if(!is.null(population$indexes)) + population <- population[population$indexes>0,] + attr(population, 'metaData') <- metaData + + result<- toSparseM(plpData,population,map=plpResult$model$covariateMap, temporal=F) + data <- result$data + population$y <- keras::to_categorical(population$outcomeCount, length(unique(population$outcomeCount))) + + #Extract validation set first - 10k people or 5% + valN <- min(10000,nrow(population)*0.05) + val_rows<-sample(1:nrow(population), valN, replace=FALSE) + train_rows <- c(1:nrow(population))[-val_rows] + + history <- model %>% keras::fit_generator(sampling_generator(data,population,batchSize,train_rows), + steps_per_epoch = nrow(population)/batchSize, + epochs=epochs, + validation_data=list(as.array(data[val_rows,,]), + population$y[val_rows,1:2]), + callbacks=list(earlyStopping,reduceLr), + class_weight=class_weight) + + + # batch prediciton + maxVal <- nrow(population) + batches <- lapply(1:ceiling(maxVal/batchSize), function(x) ((x-1)*batchSize+1):min((x*batchSize),maxVal)) + prediction <- population + prediction$value <- 0 + for(batch in batches){ + pred <- model$predict(as.array(data[batch,,drop=FALSE])) # added drop=FALSE + prediction$value[batch] <- pred[,2] + } + + ##=== + + attr(prediction, "metaData") <- list(predictionType = "binary") + auc <- computeAuc(prediction) + foldPerm <- auc + predictionMat <- prediction + +result <- list(model=model, + auc=auc, + prediction = predictionMat, + hyperSum = list(fixLayers = fixLayers, + addLayers = addLayers, + layerDropout = layerDropout, + layerActivation = layerActivation)) + + return(result) + +} diff --git a/R/Formatting.R b/R/Formatting.R new file mode 100644 index 0000000..600c24b --- /dev/null +++ b/R/Formatting.R @@ -0,0 +1,457 @@ +# @file formatting.R +# +# Copyright 2020 Observational Health Data Sciences and Informatics +# +# This file is part of PatientLevelPrediction +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#' Convert the plpData in COO format into a sparse R matrix +#' +#' @description +#' Converts the standard plpData to a sparse matrix +#' +#' @details +#' This function converts the covariate file from ffdf in COO format into a sparse matrix from +#' the package Matrix +#' @param plpData An object of type \code{plpData} with covariate in coo format - the patient level prediction +#' data extracted from the CDM. 
+#' @param population The population to include in the matrix +#' @param map A covariate map (telling us the column number for covariates) +#' @param temporal Whether you want to convert temporal data +#' @examples +#' #TODO +#' +#' @return +#' Returns a list, containing the data as a sparse matrix, the plpData covariateRef +#' and a data.frame named map that tells us what covariate corresponds to each column +#' This object is a list with the following components: \describe{ +#' \item{data}{A sparse matrix with the rows corresponding to each person in the plpData and the columns corresponding to the covariates.} +#' \item{covariateRef}{The plpData covariateRef.} +#' \item{map}{A data.frame containing the data column ids and the corresponding covariateId from covariateRef.} +#' } +#' +#' @export +toSparseM <- function(plpData,population, map=NULL, temporal=F){ + # check logger + if(length(ParallelLogger::getLoggers())==0){ + logger <- ParallelLogger::createLogger(name = "SIMPLE", + threshold = "INFO", + appenders = list(ParallelLogger::createConsoleAppender(layout = ParallelLogger::layoutTimestamp))) + ParallelLogger::registerLogger(logger) + } + + ParallelLogger::logInfo(paste0('starting toSparseM')) + + + ParallelLogger::logDebug(paste0('covariates nrow: ', nrow(plpData$covariateData$covariates))) + ParallelLogger::logDebug(paste0('covariateRef nrow: ', nrow(plpData$covariateData$covariateRef))) + + + #assign newIds to covariateRef + newcovariateData <- MapCovariates(plpData$covariateData, + population, + mapping=map) + + ParallelLogger::logDebug(paste0('Max covariateId in covariates: ',as.data.frame(newcovariateData$covariates %>% dplyr::summarise(max = max(.data$covariateId, na.rm=T))))) + ParallelLogger::logDebug(paste0('# covariates in covariateRef: ', nrow(newcovariateData$covariateRef))) + ParallelLogger::logDebug(paste0('Max rowId in covariates: ', as.data.frame(newcovariateData$covariates %>% dplyr::summarise(max = max(.data$rowId, na.rm=T))))) + + maxY <- as.data.frame(newcovariateData$mapping %>% dplyr::summarise(max=max(.data$newCovariateId, na.rm = TRUE)))$max + ParallelLogger::logDebug(paste0('Max newCovariateId in mapping: ',maxY)) + maxX <- max(population$rowId) + ParallelLogger::logDebug(paste0('Max rowId in population: ',maxX)) + + # chunk then add + if(!temporal){ + ParallelLogger::logInfo(paste0('toSparseM non temporal used')) + data <- Matrix::sparseMatrix(i=1, + j=1, + x=0, + dims=c(maxX,maxY)) + + dataEnv <- environment() + convertData1 <- function(batch,dataEnv) { + data <- get("data", envir = dataEnv) + data <- data + Matrix::sparseMatrix(i=as.data.frame(batch %>% dplyr::select(.data$rowId))$rowId, + j=as.data.frame(batch %>% dplyr::select(.data$covariateId))$covariateId, + x=as.data.frame(batch %>% dplyr::select(.data$covariateValue))$covariateValue, + dims=c(maxX,maxY)) + assign("data", data, envir = dataEnv) + return(NULL) + } + Andromeda::batchApply(newcovariateData$covariates, convertData1, batchSize = 100000, dataEnv = dataEnv) + + } else { + ParallelLogger::logInfo(paste0('toSparseM temporal used')) + + ParallelLogger::logTrace(paste0('Min time:', min(plpData$timeRef$timeId))) + ParallelLogger::logTrace(paste0('Max time:', max(plpData$timeRef$timeId))) + + # do we want to use for(i in sort(plpData$timeRef$timeId)){ ? 
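+    # Each pass of this loop builds one sparse [person, 1, covariate] slice
+    # for a single timeId (age is shifted to covariateValue*365 + startDay via
+    # addAgeTemp, and non-temporal covariates are repeated at every time
+    # point), and the slices are then bound along the second margin, giving
+    # the [person, time, covariate] array consumed by the temporal models.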
+ for(i in min(plpData$timeRef$timeId):max(plpData$timeRef$timeId)){ + + if(nrow(newcovariateData$covariates %>% dplyr::filter(.data$timeId==i))>0){ + ParallelLogger::logTrace(paste0('Found covariates for timeId ', i)) + + + # initiate the sparse matrix + dataPlp <- Matrix::sparseMatrix(i=1, + j=1, + x=0, + dims=c(maxX, maxY)) + dataEnv <- environment() + ParallelLogger::logTrace(paste0('Initiated Mapping covariates for timeId ', i)) + + + # add function to batch creating matrix from Andromeda data + convertData <- function(batch, dataEnv) { + dataPlp <- get("dataPlp", envir = dataEnv) + dataPlp <- dataPlp + Matrix::sparseMatrix(i=as.double(as.character(batch$rowId)), + j=as.double(as.character(batch$covariateId)), + x=batch$covariateValue, + dims=c(maxX,maxY)) + + assign("dataPlp", dataPlp, envir = dataEnv) + return(NULL) + } + + # add age for each time + tempCovs <- addAgeTemp(timeId = i,newcovariateData, plpData$timeRef) # EDITED adding newCov + if(!is.null(tempCovs)){ + Andromeda::batchApply(tempCovs, convertData, batchSize = 100000, dataEnv=dataEnv) + ParallelLogger::logTrace(paste0('Added any age covariates for timeId ', i)) + } + + # add non age temporal covs for each time + tempCovs <- addNonAgeTemp(timeId = i,newcovariateData) + if(!is.null(tempCovs)){ + Andromeda::batchApply(tempCovs, convertData, batchSize = 100000, dataEnv=dataEnv) + ParallelLogger::logTrace(paste0('Added non-age non-temporal covariates for timeId ', i)) + } + + # add non temporal covs + tempCovs <- newcovariateData$covariates %>% + dplyr::filter(!is.na(.data$timeId)) %>% + dplyr::filter(.data$timeId == i) + Andromeda::batchApply(tempCovs, convertData, batchSize = 100000, dataEnv=dataEnv) + + data_array <- slam::as.simple_sparse_array(dataPlp) + # remove dataPlp + #dataPlp <<- NULL + ParallelLogger::logTrace(paste0('Dim of data_array: ', paste0(dim(data_array), collapse='-'))) + + #extending one more dimesion to the array + data_array<-slam::extend_simple_sparse_array(data_array, MARGIN =c(1L)) + ParallelLogger::logTrace(paste0('Finished Mapping covariates for timeId ', i)) + } else { + data_array <- tryCatch(slam::simple_sparse_array(i=matrix(c(1,1,1), ncol = 3), + v=0, + dim=c(maxX,1, maxY)) + ) + + } + #binding arrays along the dimesion + if(i==min(plpData$timeRef$timeId)) { + result_array <- data_array + }else{ + result_array <- slam::abind_simple_sparse_array(result_array,data_array,MARGIN=2L) + } + } + data <- result_array + } + + ParallelLogger::logDebug(paste0('Sparse matrix with dimensionality: ', paste(dim(data), collapse=',') )) + + ParallelLogger::logInfo(paste0('finishing toSparseM')) + + result <- list(data=data, + covariateRef=as.data.frame(newcovariateData$covariateRef), + map=as.data.frame(newcovariateData$mapping)) + return(result) + +} + +# restricts to pop and saves/creates mapping +MapCovariates <- function(covariateData,population, mapping){ + + # to remove check notes + #covariateId <- oldCovariateId <- newCovariateId <- NULL + ParallelLogger::logInfo(paste0('starting MapCovariates')) + + newCovariateData <- Andromeda::andromeda(covariateRef = covariateData$covariateRef, + analysisRef = covariateData$analysisRef) + + # restrict to population for speed + ParallelLogger::logTrace('restricting to population for speed and mapping') + if(is.null(mapping)){ + mapping <- data.frame(oldCovariateId = as.data.frame(covariateData$covariateRef %>% dplyr::select(.data$covariateId)), + newCovariateId = 1:nrow(covariateData$covariateRef)) + } + 
if(sum(colnames(mapping)%in%c('oldCovariateId','newCovariateId'))!=2){ + colnames(mapping) <- c('oldCovariateId','newCovariateId') + } + covariateData$mapping <- mapping + covariateData$population <- data.frame(rowId = population[,'rowId']) + # assign new ids : + newCovariateData$covariates <- covariateData$covariates %>% + dplyr::inner_join(covariateData$population) %>% + dplyr::rename(oldCovariateId = .data$covariateId) %>% + dplyr::inner_join(covariateData$mapping) %>% + dplyr::select(- .data$oldCovariateId) %>% + dplyr::rename(covariateId = .data$newCovariateId) + covariateData$population <- NULL + covariateData$mapping <- NULL + + newCovariateData$mapping <- mapping + + ParallelLogger::logInfo(paste0('finished MapCovariates')) + + return(newCovariateData) +} + + + +#' Convert the plpData in COO format into a sparse python matrix using torch.sparse +#' +#' @description +#' Converts the standard plpData to a sparse matrix firectly into python +#' +#' @details +#' This function converts the covariate file from ffdf in COO format into a sparse matrix from +#' the package Matrix +#' @param plpData An object of type \code{plpData} with covariate in coo format - the patient level prediction +#' data extracted from the CDM. +#' @param population The population to include in the matrix +#' @param map A covariate map (telling us the column number for covariates) +#' @param temporal Whether to include timeId into tensor +#' @param pythonExePath Location of python exe you want to use +#' @examples +#' #TODO +#' +#' @return +#' Returns a list, containing the python object name of the sparse matrix, the plpData covariateRef +#' and a data.frame named map that tells us what covariate corresponds to each column +#' This object is a list with the following components: \describe{ +#' \item{data}{The python object name containing a sparse matrix with the rows corresponding to each person in the plpData and the columns corresponding to the covariates.} +#' \item{covariateRef}{The plpData covariateRef.} +#' \item{map}{A data.frame containing the data column ids and the corresponding covariateId from covariateRef.} +#' } +#' +#' @export +toSparseTorchPython <- function(plpData,population, map=NULL, temporal=F, pythonExePath=NULL){ + + map_python_initiate <- map_python <- function(){return(NULL)} + + # check logger + if(length(ParallelLogger::getLoggers())==0){ + logger <- ParallelLogger::createLogger(name = "SIMPLE", + threshold = "INFO", + appenders = list(ParallelLogger::createConsoleAppender(layout = 'layoutTimestamp'))) + ParallelLogger::registerLogger(logger) + } + + newcovariateData <- MapCovariates(plpData$covariateData, + population, + mapping=map) + + ParallelLogger::logDebug(paste0('Max ',as.data.frame(newcovariateData$covariates %>% dplyr::summarise(max = max(.data$covariateId, na.rm=T))))) + ParallelLogger::logDebug(paste0('# cols: ', nrow(newcovariateData$covariateRef))) + ParallelLogger::logDebug(paste0('Max rowId: ', as.data.frame(newcovariateData$covariates %>% dplyr::summarise(max = max(.data$rowId, na.rm=T))))) + + ParallelLogger::logTrace(paste0('Converting data into python sparse matrix...')) + + maxT <- NULL + if(temporal){ + maxT <- as.data.frame(newcovariateData$covariates$timeId %>% dplyr::summarise(max = max(.data$id, na.rm=T))) + ParallelLogger::logDebug(paste0('Max timeId: ', maxT)) + } + + maxCol <- as.data.frame(newcovariateData$mapping %>% dplyr::summarise(max=max(.data$newCovariateId,na.rm = TRUE)))$max + maxRow <- max(population$rowId) + + # source the python fucntion + e <- 
environment() + reticulate::source_python(system.file(package='PatientLevelPrediction','python','TorchMap.py'), envir = e) + + dataEnv <- e # adding to remove <<- + #dataPlp <<- map_python_initiate(maxCol = as.integer(maxCol), + dataPlp <- map_python_initiate(maxCol = as.integer(maxCol), + maxRow = as.integer(maxRow), + maxT= as.integer(maxT)) + + convertData <- function(batch, temporal=T, dataEnv) { + if(temporal){ + #dataPlp <<- map_python(matrix = dataPlp , + dataEnv$dataPlp <- map_python(matrix = dataEnv$dataPlp, + datas = as.matrix(as.data.frame(batch %>% dplyr::select(.data$rowId,.data$covariateId,.data$timeId,.data$covariateValue))), + maxCol = as.integer(maxCol), + maxRow = as.integer(maxRow), + maxT = as.integer(maxT)) + }else{ + # dataPlp <<- map_python(matrix = dataPlp , + dataEnv$dataPlp <- map_python(matrix = dataEnv$dataPlp, + datas = as.matrix(as.data.frame(batch %>% dplyr::select(.data$rowId,.data$covariateId,.data$covariateValue))), + maxCol = as.integer(maxCol), + maxRow = as.integer(maxRow), + maxT = NULL) + } + return(NULL) + } + + if(temporal==T){ + # add the age and non-temporal data + timeIds <- unique(plpData$timeRef$timeId) + for(timeId in timeIds){ + tempData <- addAgeTemp(timeId, newcovariateData) + if(!is.null(tempData)){ + Andromeda::batchApply(tempData, convertData,temporal =T, batchSize = 100000, dataEnv=dataEnv) + } + #tempData <- addNonAgeTemp(timeId,plpData.mapped) - what is plpData.mapped? + tempData <- addNonAgeTemp(timeId, newcovariateData) + if(!is.null(tempData)){ + Andromeda::batchApply(tempData, convertData,temporal =T, batchSize = 100000, dataEnv=dataEnv) + } + tempData <- NULL + } + + # add the rest + tempData <- newcovariateData$covariates %>% + dplyr::filter(.data$timeId!=0) %>% + dplyr::filter(!is.na(.data$timeId)) + Andromeda::batchApply(tempData, convertData,temporal =T, batchSize = 100000, dataEnv=dataEnv) + tempData <- NULL + } else { + Andromeda::batchApply(newcovariateData$covariates, convertData, + temporal =F, batchSize = 100000, dataEnv=dataEnv) + } + ##result <- dataEnv$dataPlp + ##dataPlp <<- NULL + ##dataEnv$dataPlp <- NULL + ParallelLogger::logTrace(paste0('Sparse python tensor converted')) + + result <- list(data=dataPlp, + covariateRef=as.data.frame(newcovariateData$covariateRef), + map=as.data.frame(newcovariateData$mapping)) + return(result) +} + + +# reformat the evaluation +reformatPerformance <- function(train, test, analysisId){ + + ParallelLogger::logInfo(paste0('starting reformatPerformance')) + + nr1 <- length(unlist(train$evaluationStatistics[-1])) + nr2 <- length(unlist(test$evaluationStatistics[-1])) + evaluationStatistics <- cbind(analysisId=rep(analysisId,nr1+nr2), + Eval=c(rep('train', nr1),rep('test', nr2)), + Metric = names(c(unlist(train$evaluationStatistics[-1]), + unlist(test$evaluationStatistics[-1]))), + Value = c(unlist(train$evaluationStatistics[-1]), + unlist(test$evaluationStatistics[-1])) + ) + + + if(!is.null(test$thresholdSummary) & !is.null(train$thresholdSummary)){ + nr1 <- nrow(train$thresholdSummary) + nr2 <- nrow(test$thresholdSummary) + thresholdSummary <- rbind(cbind(analysisId=rep(analysisId,nr1),Eval=rep('train', nr1), + train$thresholdSummary), + cbind(analysisId=rep(analysisId,nr2),Eval=rep('test', nr2), + test$thresholdSummary)) + } else{ + thresholdSummary <- NULL + } + + + if(!is.null(train$demographicSummary) & !is.null(test$demographicSummary)){ + nr1 <- nrow(train$demographicSummary) + nr2 <- nrow(test$demographicSummary) + demographicSummary <- 
rbind(cbind(analysisId=rep(analysisId,nr1),Eval=rep('train', nr1), + train$demographicSummary), + cbind(analysisId=rep(analysisId,nr2),Eval=rep('test', nr2), + test$demographicSummary)) + } else{ + demographicSummary <- NULL + } + + nr1 <- nrow(train$calibrationSummary) + nr2 <- nrow(test$calibrationSummary) + calibrationSummary <- rbind(cbind(analysisId=rep(analysisId,nr1),Eval=rep('train', nr1), + train$calibrationSummary), + cbind(analysisId=rep(analysisId,nr2),Eval=rep('test', nr2), + test$calibrationSummary)) + + if(!is.null(train$predictionDistribution) & !is.null(test$predictionDistribution)){ + nr1 <- nrow(train$predictionDistribution) + nr2 <- nrow(test$predictionDistribution) + predictionDistribution <- rbind(cbind(analysisId=rep(analysisId,nr1),Eval=rep('train', nr1), + train$predictionDistribution), + cbind(analysisId=rep(analysisId,nr2),Eval=rep('test', nr2), + test$predictionDistribution)) + } else { + predictionDistribution <- NULL + } + + result <- list(evaluationStatistics=evaluationStatistics, + thresholdSummary=thresholdSummary, + demographicSummary =demographicSummary, + calibrationSummary=calibrationSummary, + predictionDistribution=predictionDistribution) + + return(result) +} + + +# helpers for converting temporal PLP data to matrix/tensor +addAgeTemp <- function(timeId, newcovariateData, timeRef){ + + startDay <- as.data.frame(timeRef[timeRef$timeId==timeId,])$startDay + + ageId <- as.data.frame(newcovariateData$mapping %>% + dplyr::filter(.data$oldCovariateId == 1002) %>% + dplyr::select(.data$newCovariateId))$newCovariateId + + ageData <- newcovariateData$covariates%>% # changed from plpData$covariateData + dplyr::filter(.data$covariateId == ageId) %>% + dplyr::mutate(covariateValueNew = .data$covariateValue*365 + startDay, + timeId = timeId) %>% + dplyr::select(- .data$covariateValue) %>% + dplyr::rename(covariateValue = .data$covariateValueNew) %>% + dplyr::select(.data$rowId,.data$covariateId,.data$covariateValue, .data$timeId) + + if(nrow(ageData)==0){ + return(NULL) + } + return(ageData) +} + + +addNonAgeTemp <- function(timeId, newcovariateData){ + + ageId <- as.data.frame(newcovariateData$mapping %>% + dplyr::filter(.data$oldCovariateId == 1002) %>% + dplyr::select(.data$newCovariateId))$newCovariateId + + otherTempCovs <- newcovariateData$covariates%>% + dplyr::filter(is.na(.data$timeId)) %>% + dplyr::filter(.data$covariateId != ageId) %>% + dplyr::mutate(timeId = timeId) %>% + dplyr::select(.data$rowId,.data$covariateId,.data$covariateValue,.data$timeId) + + if(nrow(otherTempCovs)==0){ + return(NULL) + } + return(otherTempCovs) +} diff --git a/R/Predict.R b/R/Predict.R new file mode 100644 index 0000000..caad604 --- /dev/null +++ b/R/Predict.R @@ -0,0 +1,643 @@ +# @file predict.R +# +# Copyright 2020 Observational Health Data Sciences and Informatics +# +# This file is part of PatientLevelPrediction +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
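+# The predict.* functions below are the model-type specific implementations used by this
+# package; predictPlp() is the entry point and delegates to them through the predict
+# function that is attached to the plpModel object when a model is trained or loaded
+# (see the createTransform() calls in SaveLoadPlp.R).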
+ +#' predictPlp +#' +#' @description +#' Predict the risk of the outcome using the input plpModel for the input plpData +#' @details +#' The function applied the trained model on the plpData to make predictions +#' @param plpModel An object of type \code{plpModel} - a patient level prediction model +#' @param population The population created using createStudyPopulation() who will have their risks predicted +#' @param plpData An object of type \code{plpData} - the patient level prediction +#' data extracted from the CDM. +#' @param index A data frame containing rowId: a vector of rowids and index: a vector of doubles the same length as the rowIds. If used, only the rowIds with a negative index value are used to calculate the prediction. +#' +#' @return +#' A dataframe containing the prediction for each person in the population with an attribute metaData containing prediction details. +#' + +# parent predict that calls the others +#' @export +predictPlp <- function(plpModel, population, plpData, index=NULL){ + + if(is.null(plpModel)) + stop('No model input') + if(is.null(population)) + stop('No population input') + if(is.null(plpData)) + stop('No plpData input') + + # check logger + if(length(ParallelLogger::getLoggers())==0){ + logger <- ParallelLogger::createLogger(name = "SIMPLE", + threshold = "INFO", + appenders = list(ParallelLogger::createConsoleAppender(layout = ParallelLogger::layoutTimestamp))) + ParallelLogger::registerLogger(logger) + } + + # apply the feature transformations + if(!is.null(index)){ + ParallelLogger::logTrace(paste0('Calculating prediction for ',sum(index$index<0),' in test set')) + ind <- population$rowId%in%index$rowId[index$index<0] + } else{ + ParallelLogger::logTrace(paste0('Calculating prediction for ',nrow(population),' in dataset')) + ind <- rep(T, nrow(population)) + } + + # do the predction on the new data + if(class(plpModel)=='plpModel'){ + # extract the classifier type + prediction <- plpModel$predict(plpData=plpData,population=population[ind,]) + + if(nrow(prediction)!=nrow(population[ind,])) + ParallelLogger::logWarn(paste0('Dimension mismatch between prediction and population test cases. 
Population test: ',nrow(population[ind, ]), '-- Prediction:', nrow(prediction) )) + } else{ + ParallelLogger::logError('Non plpModel input') + stop() + } + + metaData <- list(predictionType = attr(plpModel, 'predictionType'), #"binary", + cohortId = attr(population,'metaData')$cohortId, + outcomeId = attr(population,'metaData')$outcomeId, + timepoint = attr(population,'metaData')$riskWindowEnd) + + attr(prediction, "metaData") <- metaData + return(prediction) +} + +# default patient level prediction prediction +predict.plp <- function(plpModel,population, plpData, ...){ + ## done in transform covariateData <- limitCovariatesToPopulation(plpData$covariateData, population$rowId) + ParallelLogger::logTrace('predict.plp - predictingProbabilities start') + prediction <- predictProbabilities(plpModel$model, population, + plpData$covariateData) + ParallelLogger::logTrace('predict.plp - predictingProbabilities end') + + # add baselineHazard function + if(!is.null(plpModel$model$baselineHazard)){ + time <- attr(population,'metaData')$riskWindowEnd + bhind <- which.min(abs(plpModel$model$baselineHazard$time-time)) + prediction$value <- 1-plpModel$model$baselineHazard$surv[bhind]^prediction$value + + attr(prediction, "baselineHazard") <- plpModel$model$baselineHazard$surv[bhind] + attr(prediction, "timePoint") <- time + attr(prediction, "offset") <- 0 + } + + return(prediction) +} + +# for gxboost +predict.xgboost <- function(plpModel,population, plpData, ...){ + result <- toSparseM(plpData, population, map=plpModel$covariateMap) + data <- result$data[population$rowId,, drop = F] + prediction <- data.frame(rowId=population$rowId, + value=stats::predict(plpModel$model, data) + ) + + prediction <- merge(population, prediction, by='rowId', all.x=T, fill=0) + prediction <- prediction[,colnames(prediction)%in%c('rowId','subjectId','cohortStartDate','outcomeCount','indexes', 'value','ageYear', 'gender')] # need to fix no index issue + attr(prediction, "metaData") <- list(predictionType = "binary") + return(prediction) + +} + +predict.pythonReticulate <- function(plpModel, population, plpData){ + + + python_predict <- python_predict_temporal <- function(){return(NULL)} + + e <- environment() + reticulate::source_python(system.file(package='PatientLevelPrediction','python','predictFunctions.py'), envir = e) + + + ParallelLogger::logInfo('Mapping covariates...') + if(!is.null(plpData$timeRef)){ + pdata <- toSparseTorchPython(plpData,population, map=plpModel$covariateMap, temporal=T) + pdata <- pdata$data + fun_predict <- python_predict_temporal + } else { + newData <- toSparseM(plpData, population, map=plpModel$covariateMap) + included <- plpModel$varImp$covariateId[plpModel$varImp$included>0] # does this include map? 
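+    # 'included' currently holds the original covariateIds that the trained model flagged
+    # in its variable importance table; the next line translates them through the covariate
+    # map to the new column ids of the sparse matrix, so the columns handed to python line
+    # up with the layout used at training time.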
+ included <- newData$map$newCovariateId[newData$map$oldCovariateId%in%included] + pdata <- reticulate::r_to_py(newData$data[,included, drop = F]) + fun_predict <- python_predict + } + + population$rowIdPython <- population$rowId-1 # -1 to account for python/r index difference + namesInd <- c('rowIdPython','rowId')%in%colnames(population) + namesInd <- c('rowIdPython','rowId')[namesInd] + pPopulation <- as.matrix(population[,namesInd]) + + # run the python predict code: + ParallelLogger::logInfo('Executing prediction...') + result <- fun_predict(population = pPopulation, + plpData = pdata, + model_loc = plpModel$model, + dense = ifelse(is.null(plpModel$dense),0,plpModel$dense), + autoencoder = F) + + #get the prediction from python and reformat: + ParallelLogger::logInfo('Returning results...') + prediction <- result + prediction <- as.data.frame(prediction) + attr(prediction, "metaData") <- list(predictionType="binary") + colnames(prediction) <- c(namesInd, 'value') + + # add 1 to rowId from python: + ##prediction$rowId <- prediction$rowId+1 + + # add subjectId and date: + prediction <- merge(prediction[,colnames(prediction)!='rowIdPython'], + population, + by='rowId') + prediction <- prediction[,colnames(prediction)%in%c('rowId','subjectId','cohortStartDate','outcomeCount','indexes', 'value','ageYear', 'gender')] + return(prediction) +} + +predict.pythonAuto <- function(plpModel, population, plpData){ + + python_predict <- function(){return(NULL)} + + ParallelLogger::logInfo('Mapping covariates...') + if(!is.null(plpData$timeRef)){ + pdata <- toSparseTorchPython(plpData,population, map=plpModel$covariateMap, temporal=T) + pdata <- pdata$data + } else { + newData <- toSparseM(plpData, population, map=plpModel$covariateMap) + included <- plpModel$varImp$covariateId[plpModel$varImp$included>0] # does this include map? 
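+    # same translation as in predict.pythonReticulate: convert the model's original
+    # covariateIds to the new column ids before subsetting the matrix for the autoencoder model.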
+ included <- newData$map$newCovariateId[newData$map$oldCovariateId%in%included] + pdata <- reticulate::r_to_py(newData$data[,included]) + } + + population$rowIdPython <- population$rowId-1 # -1 to account for python/r index difference + namesInd <- c('rowIdPython','rowId')%in%colnames(population) + namesInd <- c('rowIdPython','rowId')[namesInd] + pPopulation <- as.matrix(population[,namesInd]) + + # run the python predict code: + ParallelLogger::logInfo('Executing prediction...') + e <- environment() + reticulate::source_python(system.file(package='PatientLevelPrediction','python','predictFunctions.py'), envir = e) + + result <- python_predict(population = pPopulation, + plpData = pdata, + model_loc = plpModel$model, + dense = ifelse(is.null(plpModel$dense),0,plpModel$dense), + autoencoder = T) + #get the prediction from python and reformat: + ParallelLogger::logInfo('Returning results...') + prediction <- result + prediction <- as.data.frame(prediction) + attr(prediction, "metaData") <- list(predictionType="binary") + colnames(prediction) <- c(namesInd, 'value') + + prediction <- merge(prediction[,colnames(prediction)!='rowIdPython'], + population, + by='rowId') + prediction <- prediction[,colnames(prediction)%in%c('rowId','subjectId','cohortStartDate','outcomeCount','indexes', 'value','ageYear', 'gender')] + return(prediction) +} + + + +predict.knn <- function(plpData, population, plpModel, ...){ + ##covariateData <- limitCovariatesToPopulation(plpData$covariateData, population$rowId) + prediction <- BigKnn::predictKnn(covariates = plpData$covariateData$covariates, + cohorts= population[,!colnames(population)%in%'cohortStartDate'], + indexFolder = plpModel$model, + k = plpModel$modelSettings$modelParameters$k, + weighted = TRUE, + threads = plpModel$modelSettings$threads) + + # can add: threads = 1 in the future + + # return the cohorts as a data frame with the prediction added as + # a new column with the column name 'value' + prediction <- merge(population, prediction[,c('rowId','value')], by='rowId', + all.x=T, fill=0) + prediction$value[is.na(prediction$value)] <- 0 + + return(prediction) +} + + +predict.deep <- function(plpModel, population, plpData, ...){ + ensure_installed("plyr") + + temporal <- !is.null(plpData$timeRef) + ParallelLogger::logDebug(paste0('timeRef null: ',is.null(plpData$timeRef))) + if(temporal){ + ParallelLogger::logTrace('temporal') + result<-toSparseM(plpData,population,map=plpModel$covariateMap, temporal=T) + + data <-result$data[population$rowId,,] + if(!is.null(plpModel$useVae)){ + if(plpModel$useVae==TRUE){ + data<- plyr::aaply(as.array(data), 2, function(x) stats::predict(plpModel$vaeEncoder, x, batch_size = plpModel$vaeBatchSize)) + data<-aperm(data, perm = c(2,1,3))#rearrange of dimension + }} + + batch_size <- min(2000, length(population$rowId)) + maxVal <- length(population$rowId) + batches <- lapply(1:ceiling(maxVal/batch_size), function(x) ((x-1)*batch_size+1):min((x*batch_size),maxVal)) + prediction <- population + prediction$value <- 0 + for(batch in batches){ + pred <- keras::predict_on_batch(plpModel$model, as.array(data[batch,,])) + if(is.null(dim(pred))){ + prediction$value[batch] <- as.double(pred) + } else{ + prediction$value[batch] <- as.double(pred[,2]) + } + } + + prediction <- prediction[,colnames(prediction)%in%c('rowId','subjectId','cohortStartDate','outcomeCount','indexes', 'value','ageYear', 'gender')] # need to fix no index issue + return(prediction) + } else{ + result<-toSparseM(plpData,population,map=plpModel$covariateMap, 
temporal=F) + data <-result$data[population$rowId,] + + batch_size <- min(2000, length(population$rowId)) + maxVal <- length(population$rowId) + batches <- lapply(1:ceiling(maxVal/batch_size), function(x) ((x-1)*batch_size+1):min((x*batch_size),maxVal)) + prediction <- population + prediction$value <- 0 + for(batch in batches){ + pred <- keras::predict_on_batch(plpModel$model, as.array(data[batch,])) + prediction$value[batch] <- as.double(pred[,2]) + } + + prediction <- prediction[,colnames(prediction)%in%c('rowId','subjectId','cohortStartDate','outcomeCount','indexes', 'value','ageYear', 'gender')] # need to fix no index issue + return(prediction) + + } +} + +predict.BayesianDeep <- function(plpModel, population, plpData, ...){ + ensure_installed("plyr") + + temporal <- !is.null(plpData$timeRef) + ParallelLogger::logDebug(paste0('timeRef null: ',is.null(plpData$timeRef))) + if(temporal){ + ParallelLogger::logTrace('temporal') + result<-toSparseM(plpData,population,map=plpModel$covariateMap, temporal=T) + + data <-result$data[population$rowId,,] + if(!is.null(plpModel$useVae)){ + if(plpModel$useVae==TRUE){ + data<- plyr::aaply(as.array(data), 2, function(x) stats::predict(plpModel$vaeEncoder, x, batch_size = plpModel$vaeBatchSize)) + data<-aperm(data, perm = c(2,1,3))#rearrange of dimension + }} + + batch_size <- min(2000, length(population$rowId)) + maxVal <- length(population$rowId) + batches <- lapply(1:ceiling(maxVal/batch_size), function(x) ((x-1)*batch_size+1):min((x*batch_size),maxVal)) + prediction <- population + prediction$value <- 0 + prediction$epistemicUncertainty <- 0 + prediction$aleatoricUncertainty <- 0 + for(batch in batches){ + num_MC_samples = 100 + output_dim = 2 + pred <- keras::predict_on_batch(plpModel$model, as.array(data[batch,,])) + MC_samples <- array(0, dim = c(num_MC_samples, length(batch), 2 * output_dim)) + for (k in 1:num_MC_samples){ + MC_samples[k,, ] = stats::predict(plpModel$model, as.array(data[batch,,])) + #keras::predict_proba(model, as.array(plpData[population$rowId[population$indexes==index],,][batch,,])) + } + pred <- apply(MC_samples[,,output_dim], 2, mean) + epistemicUncertainty <- apply(MC_samples[,,output_dim], 2, stats::var) + logVar = MC_samples[, , output_dim * 2] + if(length(dim(logVar))<=1){ + aleatoricUncertainty = exp(mean(logVar)) + }else{ + aleatoricUncertainty = exp(colMeans(logVar)) + + } + prediction$value[batch] <- pred + prediction$epistemicUncertainty[batch] = epistemicUncertainty + prediction$aleatoricUncertainty[batch] = aleatoricUncertainty + #writeLines(paste0(dim(pred[,2]), collapse='-')) + #writeLines(paste0(pred[1,2], collapse='-')) + + } + prediction$value[prediction$value>1] <- 1 + prediction$value[prediction$value<0] <- 0 + prediction <- prediction[,colnames(prediction)%in%c('rowId','subjectId','cohortStartDate','outcomeCount','indexes', 'value','ageYear', 'gender', + 'epistemicUncertainty', 'aleatoricUncertainty')] # need to fix no index issue + return(prediction) + } else{ + result<-toSparseM(plpData,population,map=plpModel$covariateMap, temporal=F) + data <-result$data[population$rowId,] + + batch_size <- min(2000, length(population$rowId)) + maxVal <- length(population$rowId) + batches <- lapply(1:ceiling(maxVal/batch_size), function(x) ((x-1)*batch_size+1):min((x*batch_size),maxVal)) + prediction <- population + prediction$value <- 0 + for(batch in batches){ + num_MC_samples = 100 + output_dim =2 + MC_samples <- array(0, dim = c(num_MC_samples, length(batch), 2 * output_dim)) + for (k in 1:num_MC_samples){ + 
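+        # each Monte Carlo pass is a stochastic forward pass through the network (presumably
+        # with dropout kept active at prediction time); per batch the model returns
+        # 2 * output_dim columns, of which column output_dim is used as the outcome score
+        # and column 2 * output_dim as its predicted log-variance.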
MC_samples[k,, ] = stats::predict(plpModel$model, as.array(data[batch,,])) + #keras::predict_proba(model, as.array(plpData[population$rowId[population$indexes==index],,][batch,,])) + } + pred <- apply(MC_samples[,,output_dim], 2, mean) + epistemicUncertainty <- apply(MC_samples[,,output_dim], 2, stats::var) + logVar = MC_samples[, , output_dim * 2] + if(length(dim(logVar))<=1){ + aleatoricUncertainty = exp(mean(logVar)) + }else{ + aleatoricUncertainty = exp(colMeans(logVar)) + + } + prediction$value[batch] <- pred + prediction$epistemicUncertainty[batch] = epistemicUncertainty + prediction$aleatoricUncertainty[batch] = aleatoricUncertainty + #writeLines(paste0(dim(pred[,2]), collapse='-')) + #writeLines(paste0(pred[1,2], collapse='-')) + } + prediction$value[prediction$value>1] <- 1 + prediction$value[prediction$value<0] <- 0 + prediction <- prediction[,colnames(prediction)%in%c('rowId','subjectId','cohortStartDate','outcomeCount','indexes', 'value','ageYear', 'gender', + 'epistemicUncertainty', 'aleatoricUncertainty')] # need to fix no index issue + return(prediction) + + } +} + +predict.deepEnsemble <- function(plpModel, population, plpData, ...){ + ensure_installed("plyr") + + mu <- function(){return(NULL)} + sigma <- function(){return(NULL)} + + temporal <- !is.null(plpData$timeRef) + ParallelLogger::logDebug(paste0('timeRef null: ',is.null(plpData$timeRef))) + if(temporal){ + ParallelLogger::logTrace('temporal') + result<-toSparseM(plpData,population,map=plpModel$covariateMap, temporal=T) + + data <-result$data[population$rowId,,] + if(!is.null(plpModel$useVae)){ + if(plpModel$useVae==TRUE){ + data<- plyr::aaply(as.array(data), 2, function(x) stats::predict(plpModel$vaeEncoder, x, batch_size = plpModel$vaeBatchSize)) + data<-aperm(data, perm = c(2,1,3))#rearrange of dimension + }} + + batch_size <- min(2000, length(population$rowId)) + maxVal <- length(population$rowId) + batches <- lapply(1:ceiling(maxVal/batch_size), function(x) ((x-1)*batch_size+1):min((x*batch_size),maxVal)) + prediction <- population + prediction$value <- 0 + + prediction$sigmas <- 0 + for(batch in batches){ + for (i in seq(plpModel$modelSettings$modelParameters$numberOfEnsembleNetwork)){ + if(i==1){ + muMatrix <- data.frame() + sigmaMatrix <-data.frame() + } + c(mu,sigma) %<-% plpModel$model[[i]](inputs=list(as.array(data[batch,,]))) + muMatrix<-rbind(muMatrix,t(as.data.frame(mu[,2]))) + sigmaMatrix<-rbind(sigmaMatrix,t(as.data.frame(sigma[,2]))) + } + + muMean <- apply(muMatrix,2,mean) + muSq <- muMatrix^2 + sigmaSq <- sigmaMatrix^2 + sigmaMean <- apply(sigmaMatrix,2,mean) + sigmaResult=apply(muSq+sigmaSq,2, mean)- muMean^2 + + prediction$value[batch] <- c(muMean) + prediction$sigmas[batch] <- c(sigmaResult) + } + prediction <- prediction[,colnames(prediction)%in%c('rowId','subjectId','cohortStartDate','outcomeCount','indexes', 'value', 'sigmas')] # need to fix no index issue + #If the prediction value is negative, please add values to make all the values to positive. 
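+    # The loop above combines the ensemble members as an equally weighted Gaussian mixture:
+    # value  = mean of the per-network means, and
+    # sigmas = mean(mu^2 + sigma^2) - mean(mu)^2 (the mixture variance).
+    # After the shift below the values are non-negative, rank-preserving risk scores rather
+    # than calibrated probabilities.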
+ if(min(prediction$value)<0){prediction$value = prediction$value+ (min(prediction$value)* (-1)) } + return(prediction) + + } else{ + result<-toSparseM(plpData,population,map=plpModel$covariateMap, temporal=F) + data <-result$data[population$rowId,] + + batch_size <- min(2000, length(population$rowId)) + maxVal <- length(population$rowId) + batches <- lapply(1:ceiling(maxVal/batch_size), function(x) ((x-1)*batch_size+1):min((x*batch_size),maxVal)) + prediction <- population + prediction$value <- 0 + + prediction$sigmas <- 0 + for(batch in batches){ + for (i in seq(plpModel$modelSettings$modelParameters$numberOfEnsembleNetwork)){ + if(i==1){ + muMatrix <- data.frame() + sigmaMatrix <-data.frame() + } + c(mu,sigma) %<-% plpModel$model[[i]](inputs=list(as.array(data[batch,,]))) + muMatrix<-rbind(muMatrix,t(as.data.frame(mu[,2]))) + sigmaMatrix<-rbind(sigmaMatrix,t(as.data.frame(sigma[,2]))) + } + + muMean <- apply(muMatrix,2,mean) + muSq <- muMatrix^2 + sigmaSq <- sigmaMatrix^2 + sigmaMean <- apply(sigmaMatrix,2,mean) + sigmaResult=apply(muSq+sigmaSq,2, mean)- muMean^2 + + prediction$value[batch] <- c(muMean) + prediction$sigmas[batch] <- c(sigmaResult) + } + + prediction <- prediction[,colnames(prediction)%in%c('rowId','subjectId','cohortStartDate','outcomeCount','indexes', 'value', 'sigmas','ageYear', 'gender')] # need to fix no index issue + if(min(prediction$value)<0){prediction$value = prediction$value+ (min(prediction$value)* (-1)) } + return(prediction) + + } +} + +predict.deepMulti <- function(plpModel, population, plpData, ...){ + + repeats <- attr(plpModel, 'inputs') + + temporal <- !is.null(plpData$timeRef) + ParallelLogger::logDebug('timeRef null: ',paste0(is.null(plpData$timeRef))) + if(temporal){ + ParallelLogger::logTrace('temporal') + result<-toSparseM(plpData,population,map=plpModel$covariateMap, temporal=T) + ParallelLogger::logDebug('result$data dim: ',paste0(dim(result$data), collapse = '-')) + ParallelLogger::logDebug('population dim: ',paste0(dim(population), collapse = '-')) + data <-result$data[population$rowId,,] + + ParallelLogger::logInfo('running batch prediction') + batch_size <- min(2000, length(population$rowId)) + maxVal <- length(population$rowId) + batches <- lapply(1:ceiling(maxVal/batch_size), function(x) ((x-1)*batch_size+1):min((x*batch_size),maxVal)) + prediction <- population + prediction$value <- 0 + for(batch in batches){ + ParallelLogger::logDebug('batch length: ', length(batch)) + dat <- list() + length(dat) <- repeats + for( i in 1:repeats) {dat[[i]] <- as.array(data[batch,,])} + + pred <- keras::predict_on_batch(plpModel$model, dat) + if(is.null(dim(pred))){ + ParallelLogger::logDebug('Pred length: ', length(pred)) + prediction$value[batch] <- as.double(pred) + } else{ + ParallelLogger::logDebug('Pred dim: ', paste0(dim(pred), collapse = '-')) + prediction$value[batch] <- as.double(pred[,2]) + } + } + + prediction <- prediction[,colnames(prediction)%in%c('rowId','subjectId','cohortStartDate','outcomeCount','indexes', 'value','ageYear', 'gender')] # need to fix no index issue + return(prediction) + } else{ + result<-toSparseM(plpData,population,map=plpModel$covariateMap, temporal=F) + data <-result$data[population$rowId,] + + batch_size <- min(2000, length(population$rowId)) + maxVal <- length(population$rowId) + batches <- lapply(1:ceiling(maxVal/batch_size), function(x) ((x-1)*batch_size+1):min((x*batch_size),maxVal)) + prediction <- population + prediction$value <- 0 + for(batch in batches){ + dat <- list() + length(dat) <- repeats + for( i in 
1:repeats) {dat[[i]] <- as.array(data[batch,,])} + + pred <- keras::predict_on_batch(plpModel$model, dat) + prediction$value[batch] <- as.double(pred) + } + + prediction <- prediction[,colnames(prediction)%in%c('rowId','subjectId','cohortStartDate','outcomeCount','indexes', 'value','ageYear', 'gender')] # need to fix no index issue + return(prediction) + + } +} + + +#' Create predictive probabilities +#' +#' @details +#' Generates predictions for the population specified in plpData given the model. +#' +#' @return +#' The value column in the result data.frame is: logistic: probabilities of the outcome, poisson: +#' Poisson rate (per day) of the outome, survival: hazard rate (per day) of the outcome. +#' +#' @param predictiveModel An object of type \code{predictiveModel} as generated using +#' \code{\link{fitPlp}}. +#' @param population The population to calculate the prediction for +#' @param covariateData The covariateData containing the covariates for the population +#' @export +predictProbabilities <- function(predictiveModel, population, covariateData) { + start <- Sys.time() + + ParallelLogger::logTrace('predictProbabilities - predictAndromeda start') + prediction <- predictAndromeda(predictiveModel$coefficients, + population, + covariateData, + predictiveModel$modelType) + ParallelLogger::logTrace('predictProbabilities - predictAndromeda end') + prediction$time <- NULL + attr(prediction, "modelType") <- predictiveModel$modelType + attr(prediction, "cohortId") <- attr(population, "metadata")$cohortId + attr(prediction, "outcomeId") <- attr(population, "metadata")$outcomeId + + if (predictiveModel$modelType %in% c("survival", "cox")) { + attr(prediction, "timepoint") <- attr(population, "metadata")$riskWindowEnd + } + + delta <- Sys.time() - start + ParallelLogger::logInfo("Prediction took ", signif(delta, 3), " ", attr(delta, "units")) + return(prediction) +} + +#' Generated predictions from a regression model +#' +#' @param coefficients A names numeric vector where the names are the covariateIds, except for the +#' first value which is expected to be the intercept. +#' @param population A data frame containing the population to do the prediction for +#' @param covariateData An andromeda object containing the covariateData with predefined columns +#' (see below). +#' @param modelType Current supported types are "logistic", "poisson", "cox" or "survival". +#' +#' @details +#' These columns are expected in the outcome object: \tabular{lll}{ \verb{rowId} \tab(integer) \tab +#' Row ID is used to link multiple covariates (x) to a single outcome (y) \cr \verb{time} \tab(real) +#' \tab For models that use time (e.g. Poisson or Cox regression) this contains time \cr \tab +#' \tab(e.g. 
number of days) \cr } These columns are expected in the covariates object: \tabular{lll}{ +#' \verb{rowId} \tab(integer) \tab Row ID is used to link multiple covariates (x) to a single outcome +#' (y) \cr \verb{covariateId} \tab(integer) \tab A numeric identifier of a covariate \cr +#' \verb{covariateValue} \tab(real) \tab The value of the specified covariate \cr } +#' +#' @export +predictAndromeda <- function(coefficients, population, covariateData, modelType = "logistic") { + if (!(modelType %in% c("logistic", "poisson", "survival","cox"))) { + stop(paste("Unknown modelType:", modelType)) + } + if (!FeatureExtraction::isCovariateData(covariateData)){ + stop("Needs correct covariateData") + } + + intercept <- coefficients[names(coefficients)%in%'(Intercept)'] + if(length(intercept)==0) intercept <- 0 + coefficients <- coefficients[!names(coefficients)%in%'(Intercept)'] + coefficients <- data.frame(beta = as.numeric(coefficients), + covariateId = bit64::as.integer64(names(coefficients)) #!@ modified + ) + coefficients <- coefficients[coefficients$beta != 0, ] + if(sum(coefficients$beta != 0)>0){ + covariateData$coefficients <- coefficients + on.exit(covariateData$coefficients <- NULL, add = TRUE) + + prediction <- covariateData$covariates %>% + dplyr::inner_join(covariateData$coefficients, by= 'covariateId') %>% + dplyr::mutate(values = .data$covariateValue*.data$beta) %>% + dplyr::group_by(.data$rowId) %>% + dplyr::summarise(value = sum(.data$values, na.rm = TRUE)) %>% + dplyr::select(.data$rowId, .data$value) + + prediction <- as.data.frame(prediction) + prediction <- merge(population, prediction, by ="rowId", all.x = TRUE, fill = 0) + prediction$value[is.na(prediction$value)] <- 0 + prediction$value <- prediction$value + intercept + } else{ + warning('Model had no non-zero coefficients so predicted same for all population...') + prediction <- population + prediction$value <- rep(0, nrow(population)) + intercept + } + if (modelType == "logistic") { + link <- function(x) { + return(1/(1 + exp(0 - x))) + } + prediction$value <- link(prediction$value) + } else if (modelType == "poisson" || modelType == "survival" || modelType == "cox") { + prediction$value <- exp(prediction$value) + #if(max(prediction$value)>1){ + # prediction$value <- prediction$value/max(prediction$value) + #} + } + return(prediction) +} + diff --git a/R/RNNTorch.R b/R/RNNTorch.R new file mode 100644 index 0000000..ad86b0e --- /dev/null +++ b/R/RNNTorch.R @@ -0,0 +1,173 @@ +# @file RNNTorch.R +# +# Copyright 2020 Observational Health Data Sciences and Informatics +# +# This file is part of PatientLevelPrediction +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
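+# Note on the hyperparameter handling in this file: hidden_size and epochs accept vectors,
+# and fitRNNTorch() expands them into a grid that is assessed by cross-validation before the
+# best combination is refit on the full training data. For example (illustrative values):
+#   modelSettings <- setRNNTorch(hidden_size = c(50, 100), epochs = c(20, 50), type = 'GRU')
+#   # -> 4 candidate settings; the winning one is used for the final model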
+ +#' Create setting for RNN model with python +#' @param hidden_size The hidden size +#' @param epochs The number of epochs +#' @param seed A seed for the model +#' @param class_weight The class weight used for imbalanced data: +#' 0: Inverse ratio between positives and negatives +#' -1: Focal loss +#' @param type It can be normal 'RNN', 'BiRNN' (bidirectional RNN) and 'GRU' +#' +#' @examples +#' \dontrun{ +#' model.rnnTorch <- setRNNTorch() +#' } +#' @export +setRNNTorch <- function(hidden_size=c(50, 100), epochs=c(20, 50), seed=0, class_weight = 0, type = 'RNN'){ + # check inputs + + # add warning about dense data.. + + + # set seed + if(is.null(seed[1])){ + seed <- as.integer(sample(100000000,1)) + } + + result <- list(model='fitRNNTorch', param=split(expand.grid(hidden_size=hidden_size, + epochs=epochs, seed=seed[1], + class_weight = class_weight, type = type), + 1:(length(hidden_size)*length(epochs)) ), + name='RNN Torch') + + class(result) <- 'modelSettings' + + return(result) +} + +fitRNNTorch <- function(population, plpData, param, search='grid', quiet=F, + outcomeId, cohortId, ...){ + + # check plpData is libsvm format or convert if needed + if (!FeatureExtraction::isCovariateData(plpData$covariateData)) + stop("Needs correct covariateData") + + if(colnames(population)[ncol(population)]!='indexes'){ + warning('indexes column not present as last column - setting all index to 1') + population$indexes <- rep(1, nrow(population)) + } + + start <- Sys.time() + + population$rowIdPython <- population$rowId-1 #to account for python/r index difference #subjectId + pPopulation <- as.matrix(population[,c('rowIdPython','outcomeCount','indexes')]) + + result <- toSparseTorchPython(plpData,population, map=NULL, temporal=T) + + outLoc <- createTempModelLoc() + # clear the existing model pickles + for(file in dir(outLoc)) + file.remove(file.path(outLoc,file)) + + # do cross validation to find hyperParameter + hyperParamSel <- lapply(param, function(x) do.call(trainRNNTorch, listAppend(x, + list(plpData = result$data, + population = pPopulation, + train=TRUE, + modelOutput=outLoc)) )) + + hyperSummary <- cbind(do.call(rbind, param), unlist(hyperParamSel)) + + #now train the final model and return coef + bestInd <- which.max(abs(unlist(hyperParamSel)-0.5))[1] + finalModel <- do.call(trainRNNTorch, listAppend(param[[bestInd]], + list(plpData = result$data, + population = pPopulation, + train=FALSE, + modelOutput=outLoc))) + + covariateRef <- as.data.frame(plpData$covariateData$covariateRef) + incs <- rep(1, nrow(covariateRef)) + covariateRef$included <- incs + covariateRef$covariateValue <- rep(0, nrow(covariateRef)) + + modelTrained <- file.path(outLoc) + param.best <- param[[bestInd]] + + comp <- start-Sys.time() + + # prediction on train set: + pred <- finalModel + pred[,1] <- pred[,1] + 1 # adding one to convert from python to r indexes + colnames(pred) <- c('rowId','outcomeCount','indexes', 'value') + pred <- as.data.frame(pred) + attr(pred, "metaData") <- list(predictionType="binary") + + pred$value <- 1-pred$value + prediction <- merge(population, pred[,c('rowId','value')], by='rowId') + + # return model location + result <- list(model = modelTrained, + trainCVAuc = -1, # ToDo decide on how to deal with this + hyperParamSearch = hyperSummary, + modelSettings = list(model='fitRNNTorch',modelParameters=param.best), + metaData = plpData$metaData, + populationSettings = attr(population, 'metaData'), + outcomeId=outcomeId, + cohortId=cohortId, + varImp = covariateRef, + trainingTime =comp, + 
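+                 # dense = 1 appears to tell the python predict helpers (see the 'dense'
+                 # argument passed to python_predict in Predict.R) to treat the input as a
+                 # dense matrix rather than a sparse one.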
dense=1, + covariateMap=result$map, # I think this is need for new data to map the same? + predictionTrain = prediction + ) + class(result) <- 'plpModel' + attr(result, 'type') <- 'pythonReticulate' + attr(result, 'predictionType') <- 'binary' + + return(result) +} + + +trainRNNTorch <- function(plpData, population, epochs=50, hidden_size = 100, seed=0, class_weight= 0, type = 'RNN', train=TRUE, modelOutput, quiet=F){ + + train_deeptorch <- function(){return(NULL)} + python_dir <- system.file(package='PatientLevelPrediction','python') + + e <- environment() + reticulate::source_python(system.file(package='PatientLevelPrediction','python','deepTorchFunctions.py'), envir = e) + + result <- train_deeptorch(population = population, + train = train, + plpData = plpData, + model_type = as.character(type), + epochs = as.integer(epochs), + hidden_size = as.integer(hidden_size), + class_weight = class_weight, + seed = seed, + quiet = quiet, + modelOutput = modelOutput) + + if(train){ + # then get the prediction + pred <- result + colnames(pred) <- c('rowId','outcomeCount','indexes', 'value') + pred <- as.data.frame(pred) + attr(pred, "metaData") <- list(predictionType="binary") + + pred$value <- 1-pred$value + auc <- computeAuc(pred) + writeLines(paste0('Model obtained CV AUC of ', auc)) + return(auc) + } + + return(result) + +} diff --git a/R/SaveLoadPlp.R b/R/SaveLoadPlp.R new file mode 100644 index 0000000..4b7e6d6 --- /dev/null +++ b/R/SaveLoadPlp.R @@ -0,0 +1,891 @@ +# @file PlpSaveLoad.R +# +# Copyright 2020 Observational Health Data Sciences and Informatics +# +# This file is part of CohortMethod +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#' Get the patient level prediction data from the server +#' @description +#' This function executes a large set of SQL statements against the database in OMOP CDM format to +#' extract the data needed to perform the analysis. +#' +#' @details +#' Based on the arguments, the at risk cohort data is retrieved, as well as outcomes +#' occurring in these subjects. The at risk cohort is identified through +#' user-defined cohorts in a cohort table either inside the CDM instance or in a separate schema. +#' Similarly, outcomes are identified +#' through user-defined cohorts in a cohort table either inside the CDM instance or in a separate +#' schema. Covariates are automatically extracted from the appropriate tables within the CDM. +#' If you wish to exclude concepts from covariates you will need to +#' manually add the concept_ids and descendants to the \code{excludedCovariateConceptIds} of the +#' \code{covariateSettings} argument. +#' +#' @param connectionDetails An R object of type\cr\code{connectionDetails} created using the +#' function \code{createConnectionDetails} in the +#' \code{DatabaseConnector} package. +#' @param cdmDatabaseSchema The name of the database schema that contains the OMOP CDM +#' instance. Requires read permissions to this database. 
On SQL +#' Server, this should specifiy both the database and the schema, +#' so for example 'cdm_instance.dbo'. +#' @param oracleTempSchema For Oracle only: the name of the database schema where you want +#' all temporary tables to be managed. Requires create/insert +#' permissions to this database. +#' @param cohortId A unique identifier to define the at risk cohort. CohortId is +#' used to select the cohort_concept_id in the cohort-like table. +#' @param outcomeIds A list of cohort_definition_ids used to define outcomes (-999 mean no outcome gets downloaded). +#' @param studyStartDate A calendar date specifying the minimum date that a cohort index +#' date can appear. Date format is 'yyyymmdd'. +#' @param studyEndDate A calendar date specifying the maximum date that a cohort index +#' date can appear. Date format is 'yyyymmdd'. Important: the study +#' end data is also used to truncate risk windows, meaning no outcomes +#' beyond the study end date will be considered. +#' @param cohortDatabaseSchema The name of the database schema that is the location where the +#' cohort data used to define the at risk cohort is available. +#' Requires read permissions to this database. +#' @param cohortTable The tablename that contains the at risk cohort. cohortTable has +#' format of COHORT table: cohort_concept_id, SUBJECT_ID, +#' COHORT_START_DATE, COHORT_END_DATE. +#' @param outcomeDatabaseSchema The name of the database schema that is the location where +#' the data used to define the outcome cohorts is available. +#' Requires read permissions to this database. +#' @param outcomeTable The tablename that contains the outcome cohorts. Expectation is +#' outcomeTable has format of COHORT table: +#' COHORT_DEFINITION_ID, SUBJECT_ID, COHORT_START_DATE, +#' COHORT_END_DATE. +#' @param cdmVersion Define the OMOP CDM version used: currently support "4", "5" and "6". +#' @param firstExposureOnly Should only the first exposure per subject be included? Note that +#' this is typically done in the \code{createStudyPopulation} function, +#' but can already be done here for efficiency reasons. +#' @param washoutPeriod The mininum required continuous observation time prior to index +#' date for a person to be included in the at risk cohort. Note that +#' this is typically done in the \code{createStudyPopulation} function, +#' but can already be done here for efficiency reasons. +#' @param sampleSize If not NULL, only this number of people will be sampled from the target population (Default NULL) +#' +#' @param covariateSettings An object of type \code{covariateSettings} as created using the +#' \code{createCovariateSettings} function in the +#' \code{FeatureExtraction} package. +#' @param excludeDrugsFromCovariates A redundant option +#' +#' @return +#' Returns an object of type \code{plpData}, containing information on the cohorts, their +#' outcomes, and baseline covariates. Information about multiple outcomes can be captured at once for +#' efficiency reasons. This object is a list with the following components: \describe{ +#' \item{outcomes}{A data frame listing the outcomes per person, including the time to event, and +#' the outcome id. 
Outcomes are not yet filtered based on risk window, since this is done at +#' a later stage.} \item{cohorts}{A data frame listing the persons in each cohort, listing their +#' exposure status as well as the time to the end of the observation period and time to the end of the +#' cohort (usually the end of the exposure era).} \item{covariates}{An ffdf object listing the +#' baseline covariates per person in the two cohorts. This is done using a sparse representation: +#' covariates with a value of 0 are omitted to save space.} \item{covariateRef}{An ffdf object describing the covariates that have been extracted.} +#' \item{metaData}{A list of objects with information on how the cohortMethodData object was +#' constructed.} } The generic \code{()} and \code{summary()} functions have been implemented for this object. +#' +#' @export +getPlpData <- function(connectionDetails, + cdmDatabaseSchema, + oracleTempSchema = cdmDatabaseSchema, + cohortId, + outcomeIds, + studyStartDate = "", + studyEndDate = "", + cohortDatabaseSchema = cdmDatabaseSchema, + cohortTable = "cohort", + outcomeDatabaseSchema = cdmDatabaseSchema, + outcomeTable = "cohort", + cdmVersion = "5", + firstExposureOnly = FALSE, + washoutPeriod = 0, + sampleSize = NULL, + covariateSettings, + excludeDrugsFromCovariates = FALSE) { + if (studyStartDate != "" && regexpr("^[12][0-9]{3}[01][0-9][0-3][0-9]$", studyStartDate) == -1) + stop("Study start date must have format YYYYMMDD") + if (studyEndDate != "" && regexpr("^[12][0-9]{3}[01][0-9][0-3][0-9]$", studyEndDate) == -1) + stop("Study end date must have format YYYYMMDD") + if(!is.null(sampleSize)){ + if(!class(sampleSize) %in% c('numeric', 'integer')) + stop("sampleSize must be numeric") + } + + if(is.null(cohortId)) + stop('User must input cohortId') + if(length(cohortId)>1) + stop('Currently only supports one cohortId at a time') + if(is.null(outcomeIds)) + stop('User must input outcomeIds') + #ToDo: add other checks the inputs are valid + + connection <- DatabaseConnector::connect(connectionDetails) + on.exit(DatabaseConnector::disconnect(connection)) + dbms <- connectionDetails$dbms + + writeLines("\nConstructing the at risk cohort") + if(!is.null(sampleSize)) writeLines(paste("\n Sampling ",sampleSize, " people")) + renderedSql <- SqlRender::loadRenderTranslateSql("CreateCohorts.sql", + packageName = "PatientLevelPrediction", + dbms = dbms, + oracleTempSchema = oracleTempSchema, + cdm_database_schema = cdmDatabaseSchema, + cohort_database_schema = cohortDatabaseSchema, + cohort_table = cohortTable, + cdm_version = cdmVersion, + cohort_id = cohortId, + study_start_date = studyStartDate, + study_end_date = studyEndDate, + first_only = firstExposureOnly, + washout_period = washoutPeriod, + use_sample = !is.null(sampleSize), + sample_number=sampleSize + ) + DatabaseConnector::executeSql(connection, renderedSql) + + writeLines("Fetching cohorts from server") + start <- Sys.time() + cohortSql <- SqlRender::loadRenderTranslateSql("GetCohorts.sql", + packageName = "PatientLevelPrediction", + dbms = dbms, + oracleTempSchema = oracleTempSchema, + cdm_version = cdmVersion) + cohorts <- DatabaseConnector::querySql(connection, cohortSql) + colnames(cohorts) <- SqlRender::snakeCaseToCamelCase(colnames(cohorts)) + metaData.cohort <- list(cohortId = cohortId, + studyStartDate = studyStartDate, + studyEndDate = studyEndDate) + + if(nrow(cohorts)==0) + stop('Target population is empty') + + delta <- Sys.time() - start + writeLines(paste("Loading cohorts took", signif(delta, 3), attr(delta, 
"units"))) + + #covariateSettings$useCovariateCohortIdIs1 <- TRUE + covariateData <- FeatureExtraction::getDbCovariateData(connection = connection, + oracleTempSchema = oracleTempSchema, + cdmDatabaseSchema = cdmDatabaseSchema, + cdmVersion = cdmVersion, + cohortTable = "#cohort_person", + cohortTableIsTemp = TRUE, + rowIdField = "row_id", + covariateSettings = covariateSettings) + # add indexes for tidyCov + covariate summary + Andromeda::createIndex(covariateData$covariates, c('rowId'), + indexName = 'covariates_rowId') + Andromeda::createIndex(covariateData$covariates, c('covariateId'), + indexName = 'covariates_covariateId') + Andromeda::createIndex(covariateData$covariates, c('covariateId', 'covariateValue'), + indexName = 'covariates_covariateId_value') + + if(max(outcomeIds)!=-999){ + writeLines("Fetching outcomes from server") + start <- Sys.time() + outcomeSql <- SqlRender::loadRenderTranslateSql("GetOutcomes.sql", + packageName = "PatientLevelPrediction", + dbms = dbms, + oracleTempSchema = oracleTempSchema, + cdm_database_schema = cdmDatabaseSchema, + outcome_database_schema = outcomeDatabaseSchema, + outcome_table = outcomeTable, + outcome_ids = outcomeIds, + cdm_version = cdmVersion) + outcomes <- DatabaseConnector::querySql(connection, outcomeSql) + colnames(outcomes) <- SqlRender::snakeCaseToCamelCase(colnames(outcomes)) + metaData.outcome <- data.frame(outcomeIds =outcomeIds) + attr(outcomes, "metaData") <- metaData.outcome + if(nrow(outcomes)==0) + stop('No Outcomes') + + metaData.cohort$attrition <- getCounts2(cohorts,outcomes, "Original cohorts") + attr(cohorts, "metaData") <- metaData.cohort + + delta <- Sys.time() - start + writeLines(paste("Loading outcomes took", signif(delta, 3), attr(delta, "units"))) + } else { + outcomes <- NULL + } + + + + + # Remove temp tables: + renderedSql <- SqlRender::loadRenderTranslateSql("RemoveCohortTempTables.sql", + packageName = "PatientLevelPrediction", + dbms = dbms, + oracleTempSchema = oracleTempSchema) + DatabaseConnector::executeSql(connection, renderedSql, progressBar = FALSE, reportOverallTime = FALSE) + #DatabaseConnector::disconnect(connection) + + metaData <- covariateData$metaData + metaData$call <- match.call() + metaData$call$connectionDetails = connectionDetails + metaData$call$connection = NULL + metaData$call$cdmDatabaseSchema = cdmDatabaseSchema + metaData$call$oracleTempSchema = oracleTempSchema + metaData$call$cohortId = cohortId + metaData$call$outcomeIds = outcomeIds + metaData$call$studyStartDate = studyStartDate + metaData$call$studyEndDate = studyEndDate + metaData$call$cohortDatabaseSchema = cohortDatabaseSchema + metaData$call$cohortTable = cohortTable + metaData$call$outcomeDatabaseSchema = outcomeDatabaseSchema + metaData$call$outcomeTable = outcomeTable + metaData$call$cdmVersion = cdmVersion + metaData$call$firstExposureOnly = firstExposureOnly + metaData$call$washoutPeriod = washoutPeriod + metaData$call$covariateSettings= covariateSettings + metaData$call$sampleSize = sampleSize + + # create the temporal settings (if temporal use) + timeReference <- NULL + if(!is.null(covariateSettings$temporal)){ + if(covariateSettings$temporal){ + # make sure time days populated + if(length(covariateSettings$temporalStartDays)>0){ + timeReference = data.frame(timeId=1:length(covariateSettings$temporalStartDays), + startDay = covariateSettings$temporalStartDays, + endDay = covariateSettings$temporalEndDays) + } + }} + + + result <- list(cohorts = cohorts, + outcomes = outcomes, + covariateData = covariateData, + 
timeRef = timeReference, + metaData = metaData) + + class(result) <- "plpData" + return(result) +} + + +#' Save the cohort data to folder +#' +#' @description +#' \code{savePlpData} saves an object of type plpData to folder. +#' +#' @param plpData An object of type \code{plpData} as generated using +#' \code{getPlpData}. +#' @param file The name of the folder where the data will be written. The folder should +#' not yet exist. +#' @param envir The environment for to evaluate variables when saving +#' @param overwrite Whether to force overwrite an existing file +#' @details +#' The data will be written to a set of files in the folder specified by the user. +#' +#' @examples +#' # todo +#' +#' @export +savePlpData <- function(plpData, file, envir=NULL, overwrite=F) { + if (missing(plpData)) + stop("Must specify plpData") + if (missing(file)) + stop("Must specify file") + if (!class(plpData) %in% c("plpData","plpData.libsvm" )) + stop("Data not of class plpData") + if(dir.exists(file.path(file, "covariates"))){ + stop('Folder to save covariates already exists...') + } + + if(!dir.exists(file)){ + dir.create(file) + } + + # save the actual values in the metaData + # TODO - only do this if exists in parent or environ + if(is.null(plpData$metaData$call$sampleSize)){ # fixed a bug when sampleSize is NULL + plpData$metaData$call$sampleSize <- 'NULL' + } + for(i in 2:length(plpData$metaData$call)){ + if(!is.null(plpData$metaData$call[[i]])) + plpData$metaData$call[[i]] <- eval(plpData$metaData$call[[i]], envir = envir) + } + + #FeatureExtraction::saveCovariateData(covariateData = plpData$covariateData, file = file.path(file, "covariates")) + Andromeda::saveAndromeda(plpData$covariateData, file = file.path(file, "covariates"), maintainConnection = T) + saveRDS(plpData$timeRef, file = file.path(file, "timeRef.rds")) + saveRDS(plpData$cohorts, file = file.path(file, "cohorts.rds")) + saveRDS(plpData$outcomes, file = file.path(file, "outcomes.rds")) + saveRDS(plpData$metaData, file = file.path(file, "metaData.rds")) +} + +#' Load the cohort data from a folder +#' +#' @description +#' \code{loadPlpData} loads an object of type plpData from a folder in the file +#' system. +#' +#' @param file The name of the folder containing the data. +#' @param readOnly If true, the data is opened read only. +#' +#' @details +#' The data will be written to a set of files in the folder specified by the user. +#' +#' @return +#' An object of class plpData. +#' +#' @examples +#' # todo +#' +#' @export +loadPlpData <- function(file, readOnly = TRUE) { + if (!file.exists(file)) + stop(paste("Cannot find folder", file)) + if (!file.info(file)$isdir) + stop(paste("Not a folder", file)) + + result <- list(covariateData = FeatureExtraction::loadCovariateData(file = file.path(file, "covariates")), + timeRef = readRDS(file.path(file, "timeRef.rds")), + cohorts = readRDS(file.path(file, "cohorts.rds")), + outcomes = readRDS(file.path(file, "outcomes.rds")), + metaData = readRDS(file.path(file, "metaData.rds"))) + + class(result) <- "plpData" + + return(result) +} + + +#' @export +print.plpData <- function(x, ...) 
{ + writeLines("plpData object") + writeLines("") + writeLines(paste("At risk concept ID:", attr(x$cohorts, "metaData")$cohortId)) + writeLines(paste("Outcome concept ID(s):", paste(attr(x$outcomes, "metaData")$outcomeIds, collapse = ","))) +} + +#' @method summary plpData +#' @export +summary.plpData <- function(object,...){ + people <- length(unique(object$cohorts$subjectId)) + outcomeCounts <- data.frame(outcomeId = attr(object$outcomes, "metaData")$outcomeIds, + eventCount = 0, + personCount = 0) + for (i in 1:nrow(outcomeCounts)) { + outcomeCounts$eventCount[i] <- sum(object$outcomes$outcomeId == attr(object$outcomes, "metaData")$outcomeIds[i]) + outcomeCounts$personCount[i] <- length(unique(object$outcomes$rowId[object$outcomes$outcomeId == attr(object$outcomes, "metaData")$outcomeIds[i]])) + } + + covDetails <- FeatureExtraction::summary(object$covariateData) + result <- list(metaData = append(append(object$metaData, attr(object$cohorts, "metaData")), attr(object$outcomes, "metaData")), + people = people, + outcomeCounts = outcomeCounts, + covariateCount = covDetails$covariateCount, + covariateValueCount = covDetails$covariateValueCount) + class(result) <- "summary.plpData" + return(result) +} + +#' @export +print.summary.plpData <- function(x, ...) { + writeLines("plpData object summary") + writeLines("") + writeLines(paste("At risk cohort concept ID:", x$metaData$cohortId)) + writeLines(paste("Outcome concept ID(s):", x$metaData$outcomeIds, collapse = ",")) + writeLines("") + writeLines(paste("People:", paste(x$people))) + writeLines("") + writeLines("Outcome counts:") + outcomeCounts <- x$outcomeCounts + rownames(outcomeCounts) <- outcomeCounts$outcomeId + outcomeCounts$outcomeId <- NULL + colnames(outcomeCounts) <- c("Event count", "Person count") + stats::printCoefmat(outcomeCounts) + writeLines("") + writeLines("Covariates:") + writeLines(paste("Number of covariates:", x$covariateCount)) + writeLines(paste("Number of non-zero covariate values:", x$covariateValueCount)) +} + + +#' Saves the plp model +#' +#' @details +#' Saves the plp model to a user specificed folder +#' +#' @param plpModel A trained classifier returned by running \code{runPlp()$model} +#' @param dirPath A location to save the model to +#' +#' @export +savePlpModel <- function(plpModel, dirPath){ + if (missing(plpModel)) + stop("Must specify plpModel") + if (missing(dirPath)) + stop("Must specify directory path") + if (class(plpModel) != "plpModel") + stop("Not a plpModel") + + if(!dir.exists(dirPath)) dir.create(dirPath) + + + # If model is saved on hard drive move it... 
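+  # moveHdModel() (defined below) covers models that live on disk rather than in memory:
+  # python pickles, sagemaker artefacts and knn indexes are copied into the
+  # 'python_model', 'sagemaker_model' and 'knn_model' subfolders of dirPath respectively.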
+ #============================================================ + moveFile <- moveHdModel(plpModel, dirPath ) + if(!moveFile){ + ParallelLogger::logError('Moving model files error') + } + #============================================================ + + + # if deep (keras) then save hdfs + if(attr(plpModel, 'type')%in%c('deep', 'deepMulti','deepEnsemble')){ + + if(attr(plpModel, 'type')=='deepEnsemble'){ + tryCatch( + {#saveRDS(plpModel, file = file.path(dirPath, "deepEnsemble_model.rds")) + for (i in seq(plpModel$modelSettings$modelParameters$numberOfEnsembleNetwork)){ + model<-keras::serialize_model(plpModel$model[[i]], include_optimizer = TRUE) + keras::save_model_hdf5(model, filepath = file.path(dirPath, "keras_model",i)) + }},error=function(e) NULL + ) + } + if(attr(plpModel, 'type')=='deep'){ + keras::save_model_hdf5(plpModel$model, filepath = file.path(dirPath, "keras_model")) + } + if(attr(plpModel, 'type')=='deepMulti'){ + saveRDS(attr(plpModel, 'inputs'), file = file.path(dirPath, "inputs_attr.rds")) + } + } else if(attr(plpModel, 'type') == "xgboost"){ + # fixing xgboost save/load issue + xgboost::xgb.save(model = plpModel$model, fname = file.path(dirPath, "model")) + } else { + saveRDS(plpModel$model, file = file.path(dirPath, "model.rds")) + } + #saveRDS(plpModel$predict, file = file.path(dirPath, "transform.rds")) + saveRDS(NULL, file = file.path(dirPath, "transform.rds")) + saveRDS(plpModel$index, file = file.path(dirPath, "index.rds")) + saveRDS(plpModel$trainCVAuc, file = file.path(dirPath, "trainCVAuc.rds")) + saveRDS(plpModel$hyperParamSearch, file = file.path(dirPath, "hyperParamSearch.rds")) + saveRDS(plpModel$modelSettings, file = file.path(dirPath, "modelSettings.rds")) + saveRDS(plpModel$metaData, file = file.path(dirPath, "metaData.rds")) + saveRDS(plpModel$populationSettings, file = file.path(dirPath, "populationSettings.rds")) + saveRDS(plpModel$trainingTime, file = file.path(dirPath, "trainingTime.rds")) + saveRDS(plpModel$varImp, file = file.path(dirPath, "varImp.rds")) + saveRDS(plpModel$dense, file = file.path(dirPath, "dense.rds")) + saveRDS(plpModel$cohortId, file = file.path(dirPath, "cohortId.rds")) + saveRDS(plpModel$outcomeId, file = file.path(dirPath, "outcomeId.rds")) + saveRDS(plpModel$analysisId, file = file.path(dirPath, "analysisId.rds")) + #if(!is.null(plpModel$covariateMap)) + saveRDS(plpModel$covariateMap, file = file.path(dirPath, "covariateMap.rds")) + + attributes <- list(type=attr(plpModel, 'type'), predictionType=attr(plpModel, 'predictionType') ) + saveRDS(attributes, file = file.path(dirPath, "attributes.rds")) + + +} + +moveHdModel <- function(plpModel, dirPath ){ + #================================================================== + # if python then move pickle + #================================================================== + if(attr(plpModel, 'type') %in% c('pythonOld','pythonReticulate', 'pythonAuto') ){ + if(!dir.exists(file.path(dirPath,'python_model'))) + dir.create(file.path(dirPath,'python_model')) + for(file in dir(plpModel$model)){ #DOES THIS CORRECTLY TRANSFER AUTOENCODER BITS? 
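+      # dir() only lists the top level, and recursive = FALSE below does not descend into
+      # subfolders, so autoencoder artefacts are only transferred if they sit directly in
+      # the plpModel$model folder.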
+ file.copy(file.path(plpModel$model,file), + file.path(dirPath,'python_model'), overwrite=TRUE, recursive = FALSE, + copy.mode = TRUE, copy.date = FALSE) + } + } + + #================================================================== + # if sagemaker then move pickle + #================================================================== + if(attr(plpModel, 'type') =='sagemaker'){ + if(!dir.exists(file.path(dirPath,'sagemaker_model'))) + dir.create(file.path(dirPath,'sagemaker_model')) + for(file in dir(plpModel$model$loc)){ + file.copy(file.path(plpModel$model$loc,file), + file.path(dirPath,'sagemaker_model'), overwrite=TRUE, recursive = FALSE, + copy.mode = TRUE, copy.date = FALSE) + } + } + + #================================================================== + # if knn then move model + #================================================================== + if(attr(plpModel, 'type') =='knn'){ + if(!dir.exists(file.path(dirPath,'knn_model'))) + dir.create(file.path(dirPath,'knn_model')) + for(file in dir(plpModel$model)){ + file.copy(file.path(plpModel$model,file), + file.path(dirPath,'knn_model'), overwrite=TRUE, recursive = FALSE, + copy.mode = TRUE, copy.date = FALSE) + } + } + + return(TRUE) +} + +#' loads the plp model +#' +#' @details +#' Loads a plp model that was saved using \code{savePlpModel()} +#' +#' @param dirPath The location of the model +#' +#' @export +loadPlpModel <- function(dirPath) { + if (!file.exists(dirPath)) + stop(paste("Cannot find folder", dirPath)) + if (!file.info(dirPath)$isdir) + stop(paste("Not a folder", dirPath)) + + hyperParamSearch <- tryCatch(readRDS(file.path(dirPath, "hyperParamSearch.rds")), + error=function(e) NULL) + # add in these as they got dropped + outcomeId <- tryCatch(readRDS(file.path(dirPath, "outcomeId.rds")), + error=function(e) NULL) + cohortId <- tryCatch(readRDS(file.path(dirPath, "cohortId.rds")), + error=function(e) NULL) + dense <- tryCatch(readRDS(file.path(dirPath, "dense.rds")), + error=function(e) NULL) + covariateMap <- tryCatch(readRDS(file.path(dirPath, "covariateMap.rds")), + error=function(e) NULL) + analysisId <- tryCatch(readRDS(file.path(dirPath, "analysisId.rds")), + error=function(e) NULL) + + if(file.exists(file.path(dirPath, "keras_model"))){ + ensure_installed("keras") + model <- keras::load_model_hdf5(file.path(dirPath, "keras_model")) + } else if(readRDS(file.path(dirPath, "attributes.rds"))$type == "xgboost"){ + ensure_installed("xgboost") + # fixing xgboost save/load issue + model <- xgboost::xgb.load(file.path(dirPath, "model")) + } else { + model <- readRDS(file.path(dirPath, "model.rds")) + } + + result <- list(model = model, + modelSettings = readRDS(file.path(dirPath, "modelSettings.rds")), + hyperParamSearch = hyperParamSearch, + trainCVAuc = readRDS(file.path(dirPath, "trainCVAuc.rds")), + metaData = readRDS(file.path(dirPath, "metaData.rds")), + populationSettings= readRDS(file.path(dirPath, "populationSettings.rds")), + outcomeId = outcomeId, + cohortId = cohortId, + varImp = readRDS(file.path(dirPath, "varImp.rds")), + trainingTime = readRDS(file.path(dirPath, "trainingTime.rds")), + covariateMap =covariateMap, + predict = readRDS(file.path(dirPath, "transform.rds")), + index = readRDS(file.path(dirPath, "index.rds")), + dense = dense, + analysisId = analysisId) + + #attributes <- readRDS(file.path(dirPath, "attributes.rds")) + attributes <- readRDS(file.path(dirPath, "attributes.rds")) + attr(result, 'type') <- attributes$type + attr(result, 'predictionType') <- attributes$predictionType + 
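+  # the saved type/predictionType attributes are restored above; below, the
+  # model location and prediction function are rebuilt relative to dirPath via
+  # updateModelLocation() and createTransform()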
class(result) <- "plpModel" + + # update the model location to the load dirPath + result <- updateModelLocation(result, dirPath) + + # make this backwrds compatible for ffdf: + result$predict <- createTransform(result) + + return(result) +} + +updateModelLocation <- function(plpModel, dirPath){ + type <- attr(plpModel, 'type') + # if python update the location + if( type %in% c('pythonOld','pythonReticulate', 'pythonAuto')){ + plpModel$model <- file.path(dirPath,'python_model') + plpModel$predict <- createTransform(plpModel) + } + if( type =='sagemaker'){ + plpModel$model$loc <- file.path(dirPath,'sagemaker_model') + plpModel$predict <- createTransform(plpModel) + } + # if knn update the locaiton - TODO !!!!!!!!!!!!!! + if( type =='knn'){ + plpModel$model <- file.path(dirPath,'knn_model') + plpModel$predict <- createTransform(plpModel) + } + if( type =='deep' ){ + plpModel$predict <- createTransform(plpModel) + } + if( type =='deepEnsemble' ){ + plpModel$predict <- createTransform(plpModel) + } + if( type =='deepMulti'){ + attr(plpModel, 'inputs') <- tryCatch(readRDS(file.path(dirPath, "inputs_attr.rds")), + error=function(e) NULL) + plpModel$predict <- createTransform(plpModel) + + } + + return(plpModel) +} + + +#' Saves the prediction dataframe to RDS +#' +#' @details +#' Saves the prediction data frame returned by predict.R to an RDS file and returns the fileLocation where the prediction is saved +#' +#' @param prediction The prediciton data.frame +#' @param dirPath The directory to save the prediction RDS +#' @param fileName The name of the RDS file that will be saved in dirPath +#' +#' @export +savePrediction <- function(prediction, dirPath, fileName='prediction.rds'){ + #TODO check inupts + saveRDS(prediction, file=file.path(dirPath,fileName)) + + return(file.path(dirPath,fileName)) +} + +#' Loads the prediciton dataframe to csv +#' +#' @details +#' Loads the prediciton RDS file +#' +#' @param fileLocation The location with the saved prediction +#' +#' @export +loadPrediction <- function(fileLocation){ + #TODO check inupts + prediction <- readRDS(file=fileLocation) + return(prediction) +} + +#' Saves the result from runPlp into the location directory +#' +#' @details +#' Saves the result from runPlp into the location directory +#' +#' @param result The result of running runPlp() +#' @param dirPath The directory to save the csv +#' +#' @export +savePlpResult <- function(result, dirPath){ + if (missing(result)) + stop("Must specify runPlp output") + if (missing(dirPath)) + stop("Must specify directory location") + #if (class(plpModel) != "plpModel") + # stop("Not a plpModel") + + if(!dir.exists(dirPath)) dir.create(dirPath, recursive = T) + + savePlpModel(result$model, dirPath=file.path(dirPath,'model') ) + saveRDS(result$analysisRef, file = file.path(dirPath, "analysisRef.rds")) + saveRDS(result$inputSetting, file = file.path(dirPath, "inputSetting.rds")) + saveRDS(result$executionSummary, file = file.path(dirPath, "executionSummary.rds")) + saveRDS(result$prediction, file = file.path(dirPath, "prediction.rds")) + saveRDS(result$performanceEvaluation, file = file.path(dirPath, "performanceEvaluation.rds")) + #saveRDS(result$performanceEvaluationTrain, file = file.path(dirPath, "performanceEvaluationTrain.rds")) + saveRDS(result$covariateSummary, file = file.path(dirPath, "covariateSummary.rds")) + + +} + +#' Loads the evalaution dataframe +#' +#' @details +#' Loads the evaluation +#' +#' @param dirPath The directory where the evaluation was saved +#' +#' @export +loadPlpResult <- 
function(dirPath){ + if (!file.exists(dirPath)) + stop(paste("Cannot find folder", dirPath)) + if (!file.info(dirPath)$isdir) + stop(paste("Not a folder", dirPath)) + + + result <- list(model = loadPlpModel(file.path(dirPath, "model")), + analysisRef = readRDS(file.path(dirPath, "analysisRef.rds")), + inputSetting = readRDS(file.path(dirPath, "inputSetting.rds")), + executionSummary = readRDS(file.path(dirPath, "executionSummary.rds")), + prediction = readRDS(file.path(dirPath, "prediction.rds")), + performanceEvaluation = readRDS(file.path(dirPath, "performanceEvaluation.rds")), + #performanceEvaluationTrain= readRDS(file.path(dirPath, "performanceEvaluationTrain.rds")), + covariateSummary = readRDS(file.path(dirPath, "covariateSummary.rds")) + ) + class(result) <- "runPlp" + + return(result) + +} + + +#result$inputSetting$dataExtrractionSettings$covariateSettings +formatCovariateSettings <- function(covariateSettings){ + + if(class(covariateSettings) == "covariateSettings"){ + return(list(cvs = data.frame(X = names(unlist(covariateSettings)), x= unlist(covariateSettings)), + fun = attr(covariateSettings,'fun'))) + + } else{ + return(list(cvs = do.call(rbind, lapply(1:length(covariateSettings), function(i){ + inds <- which(lapply(covariateSettings[[i]], class) == "function") + if(length(inds)>0){ + for(j in inds){ + covariateSettings[[i]][[j]] <- paste0(deparse(covariateSettings[[i]][[j]]), collapse = " ") + } + } + tempResult <- data.frame(names = names(unlist(covariateSettings[[i]])), + values = unlist(covariateSettings[[i]])) + tempResult$settingsId <- i + return(tempResult) + })), + fun = unlist(lapply(covariateSettings, function(x) attr(x,'fun'))) + ) + ) + } + +} + +reformatCovariateSettings <- function(covariateSettingsLocation){ + # adding this to stop warnings when files does not exist + if(!file.exists(covariateSettingsLocation)){ + return(NULL) + } + cs <- utils::read.csv(covariateSettingsLocation, stringsAsFactors=FALSE) + fun <- utils::read.csv(gsub('.csv','_fun.csv',covariateSettingsLocation), stringsAsFactors=FALSE) + + if(sum(colnames(cs)%in%c('X','x'))==2){ + covariateSettings <- cs$x + covariateSettings <- as.list(covariateSettings) + names(covariateSettings) <- cs$X + attr(covariateSettings,'fun') <- fun$x + class(covariateSettings) <- 'covariateSettings' + } else { + + covariateSettings <- list() + length(covariateSettings) <- max(cs$settingsId) + + for(i in 1:max(cs$settingsId)){ + covariateSettings[[i]] <- cs$values[cs$settingsId==i] + covariateSettings[[i]] <- as.list(covariateSettings[[i]]) + names(covariateSettings[[i]]) <- cs$names[cs$settingsId==i] + attr(covariateSettings[[i]],'fun') <- fun$x[i] + } + + } + +return(covariateSettings) +} + + +#' Save parts of the plp result as a csv for transparent sharing +#' +#' @details +#' Saves the main results as a csv (these files can be read by the shiny app) +#' +#' @param result An object of class runPlp with development or validation results +#' @param dirPath The directory the save the results as csv files +#' +#' @export +savePlpToCsv <- function(result, dirPath){ + + #inputSetting + if(!dir.exists(file.path(dirPath, 'inputSetting'))){dir.create(file.path(dirPath, 'inputSetting'), recursive = T)} + utils::write.csv(result$inputSetting$modelSettings$model, file = file.path(dirPath, 'inputSetting','modelSettings_model.csv'), row.names = F) + + if(!is.null(result$inputSetting$modelSettings$param)){ + utils::write.csv(as.data.frame(t(unlist(result$inputSetting$modelSettings$param))), file = file.path(dirPath, 
'inputSetting','modelSettings_param.csv'), row.names = F) + }else{ + utils::write.csv(NULL, file = file.path(dirPath, 'inputSetting','modelSettings_param.csv'), row.names = F) + } + utils::write.csv(result$inputSetting$modelSettings$name, file = file.path(dirPath, 'inputSetting','modelSettings_name.csv'), row.names = F) + if(!is.null(result$inputSetting$dataExtrractionSettings$covariateSettings)){ + utils::write.csv(formatCovariateSettings(result$inputSetting$dataExtrractionSettings$covariateSettings)$cvs, file = file.path(dirPath, 'inputSetting','dataExtrractionSettings_covariateSettings.csv'), row.names = F) + utils::write.csv(formatCovariateSettings(result$inputSetting$dataExtrractionSettings$covariateSettings)$fun, file = file.path(dirPath, 'inputSetting','dataExtrractionSettings_covariateSettings_fun.csv'), row.names = F) + } + utils::write.csv(result$inputSetting$populationSettings$attrition, file = file.path(dirPath, 'inputSetting','populationSettings_attrition.csv'), row.names = F) + result$inputSetting$populationSettings$attrition <- NULL + utils::write.csv(result$inputSetting$populationSettings, file = file.path(dirPath, 'inputSetting','populationSettings.csv'), row.names = F) + + #executionSummary + if(!dir.exists(file.path(dirPath, 'executionSummary'))){dir.create(file.path(dirPath, 'executionSummary'), recursive = T)} + utils::write.csv(result$executionSummary$PackageVersion, file = file.path(dirPath, 'executionSummary','PackageVersion.csv'), row.names = F) + utils::write.csv(unlist(result$executionSummary$PlatformDetails), file = file.path(dirPath, 'executionSummary','PlatformDetails.csv')) + utils::write.csv(result$executionSummary$TotalExecutionElapsedTime, file = file.path(dirPath, 'executionSummary','TotalExecutionElapsedTime.csv'), row.names = F) + utils::write.csv(result$executionSummary$ExecutionDateTime, file = file.path(dirPath, 'executionSummary','ExecutionDateTime.csv'), row.names = F) + + #performanceEvaluation + if(!dir.exists(file.path(dirPath, 'performanceEvaluation'))){dir.create(file.path(dirPath, 'performanceEvaluation'), recursive = T)} + utils::write.csv(result$performanceEvaluation$evaluationStatistics, file = file.path(dirPath, 'performanceEvaluation','evaluationStatistics.csv'), row.names = F) + utils::write.csv(result$performanceEvaluation$thresholdSummary, file = file.path(dirPath, 'performanceEvaluation','thresholdSummary.csv'), row.names = F) + utils::write.csv(result$performanceEvaluation$demographicSummary, file = file.path(dirPath, 'performanceEvaluation','demographicSummary.csv'), row.names = F) + utils::write.csv(result$performanceEvaluation$calibrationSummary, file = file.path(dirPath, 'performanceEvaluation','calibrationSummary.csv'), row.names = F) + utils::write.csv(result$performanceEvaluation$predictionDistribution, file = file.path(dirPath, 'performanceEvaluation','predictionDistribution.csv'), row.names = F) + + #covariateSummary + utils::write.csv(result$covariateSummary, file = file.path(dirPath,'covariateSummary.csv'), row.names = F) +} + +#' Loads parts of the plp result saved as csv files for transparent sharing +#' +#' @details +#' Load the main results from csv files into a runPlp object +#' +#' @param dirPath The directory with the results as csv files +#' +#' @export +loadPlpFromCsv <- function(dirPath){ + + result <- list() + objects <- gsub('.csv','',dir(dirPath)) + if(sum(!c('covariateSummary','executionSummary','inputSetting','performanceEvaluation')%in%objects)>0){ + stop('Incorrect csv results file') + } + + 
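+  # each component is read back individually; tryCatch() returns NULL for any
+  # csv that is missing, so partially shared results can still be loaded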
length(result) <- length(objects) + names(result) <- objects + + #covariateSummary + result$covariateSummary <- utils::read.csv(file = file.path(dirPath,'covariateSummary.csv')) + + #executionSummary + result$executionSummary <- list() + result$executionSummary$PackageVersion <- tryCatch({as.list(utils::read.csv(file = file.path(dirPath, 'executionSummary','PackageVersion.csv')))}, error = function(e){return(NULL)}) + result$executionSummary$PlatformDetails <- tryCatch({as.list(utils::read.csv(file = file.path(dirPath, 'executionSummary','PlatformDetails.csv'))$x)}, error = function(e){return(NULL)}) + names(result$executionSummary$PlatformDetails) <- tryCatch({utils::read.csv(file = file.path(dirPath, 'executionSummary','PlatformDetails.csv'))$X}, error = function(e){return(NULL)}) + result$executionSummary$TotalExecutionElapsedTime <- tryCatch({utils::read.csv(file = file.path(dirPath, 'executionSummary','TotalExecutionElapsedTime.csv'))$x}, error = function(e){return(NULL)}) + result$executionSummary$ExecutionDateTime <- tryCatch({utils::read.csv(file = file.path(dirPath, 'executionSummary','ExecutionDateTime.csv'))$x}, error = function(e){return(NULL)}) + + #inputSetting + result$inputSetting <- list() + result$inputSetting$modelSettings$model <- tryCatch({utils::read.csv(file = file.path(dirPath, 'inputSetting','modelSettings_model.csv'))$x}, error = function(e){return(NULL)}) + result$inputSetting$modelSettings$param <- tryCatch({as.list(utils::read.csv(file = file.path(dirPath, 'inputSetting','modelSettings_param.csv')))}, error = function(e){return(NULL)}) + result$inputSetting$modelSettings$name <- tryCatch({utils::read.csv(file = file.path(dirPath, 'inputSetting','modelSettings_name.csv'))$x}, error = function(e){return(NULL)}) + + result$inputSetting$dataExtrractionSettings$covariateSettings <- tryCatch({reformatCovariateSettings(file.path(dirPath, 'inputSetting','dataExtrractionSettings_covariateSettings.csv'))}, error = function(e){return(NULL)}) + + result$inputSetting$populationSettings <- tryCatch({as.list(utils::read.csv(file = file.path(dirPath, 'inputSetting','populationSettings.csv')))}, error = function(e){return(NULL)}) + result$inputSetting$populationSettings$attrition <- tryCatch({utils::read.csv(file = file.path(dirPath, 'inputSetting','populationSettings_attrition.csv'))}, error = function(e){return(NULL)}) + + #performanceEvaluation + result$performanceEvaluation <- list() + result$performanceEvaluation$evaluationStatistics <- tryCatch({utils::read.csv(file = file.path(dirPath, 'performanceEvaluation','evaluationStatistics.csv'))}, error = function(e){return(NULL)}) + result$performanceEvaluation$thresholdSummary <- tryCatch({utils::read.csv(file = file.path(dirPath, 'performanceEvaluation','thresholdSummary.csv'))}, error = function(e){return(NULL)}) + result$performanceEvaluation$demographicSummary <- tryCatch({utils::read.csv(file = file.path(dirPath, 'performanceEvaluation','demographicSummary.csv'))}, error = function(e){return(NULL)}) + result$performanceEvaluation$calibrationSummary <- tryCatch({utils::read.csv(file = file.path(dirPath, 'performanceEvaluation','calibrationSummary.csv'))}, error = function(e){return(NULL)}) + result$performanceEvaluation$predictionDistribution <- tryCatch({utils::read.csv(file = file.path(dirPath, 'performanceEvaluation','predictionDistribution.csv'))}, error = function(e){return(NULL)}) + + result$model$modelSettings <- result$inputSetting$modelSettings + result$model$populationSettings <- 
result$inputSetting$populationSettings
+  result$model$metaData$call$covariateSettings <- result$inputSetting$dataExtrractionSettings$covariateSettings
+  
+  class(result) <- "runPlp"
+  return(result)
+}
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..21f8d67
--- /dev/null
+++ b/README.md
@@ -0,0 +1,60 @@
+DeepPatientLevelPrediction
+======================
+
+[![Build Status](https://github.com/OHDSI/DeepPatientLevelPrediction/workflows/R-CMD-check/badge.svg)](https://github.com/OHDSI/DeepPatientLevelPrediction/actions?query=workflow%3AR-CMD-check)
+[![codecov.io](https://codecov.io/github/OHDSI/DeepPatientLevelPrediction/coverage.svg?branch=master)](https://codecov.io/github/OHDSI/DeepPatientLevelPrediction?branch=master)
+
+
+Introduction
+============
+
+DeepPatientLevelPrediction is an R package for building and validating deep learning patient-level predictive models using data in the OMOP Common Data Model format and the OHDSI PatientLevelPrediction framework.
+
+Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. [Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data.](https://academic.oup.com/jamia/article/25/8/969/4989437) J Am Med Inform Assoc. 2018;25(8):969-975.
+
+
+Features
+========
+- add
+
+
+
+Technology
+==========
+DeepPatientLevelPrediction is an R package, with some functions implemented in C++ and Python.
+
+System Requirements
+===================
+Requires R (version 3.3.0 or higher). Installation on Windows requires [RTools](http://cran.r-project.org/bin/windows/Rtools/). Libraries used in DeepPatientLevelPrediction require Java and Python.
+
+A Python installation is required for some of the machine learning algorithms. We advise installing
+Python 3.7 using Anaconda (https://www.continuum.io/downloads).
+
+Getting Started
+===============
+
+- add (a minimal installation sketch is given at the end of this README)
+
+User Documentation
+==================
+Documentation can be found on the [package website](https://ohdsi.github.io/DeepPatientLevelPrediction).
+
+PDF versions of the documentation are also available.
+
+Support
+=======
+* Developer questions/comments/feedback: OHDSI Forum
+* We use the GitHub issue tracker for all bugs/issues/enhancements
+
+Contributing
+============
+Read [here](https://ohdsi.github.io/Hades/contribute.html) how you can contribute to this package.
+
+License
+=======
+DeepPatientLevelPrediction is licensed under the Apache License 2.0.
+
+Development
+===========
+DeepPatientLevelPrediction is being developed in RStudio.
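+
+A minimal installation sketch (an illustrative assumption, installing from GitHub with the `remotes` package):
+
+```r
+# install.packages("remotes")
+remotes::install_github("OHDSI/DeepPatientLevelPrediction")
+library(DeepPatientLevelPrediction)
+```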
+ diff --git a/inst/python/TorchMap.py b/inst/python/TorchMap.py new file mode 100644 index 0000000..ef8feac --- /dev/null +++ b/inst/python/TorchMap.py @@ -0,0 +1,19 @@ +import torch + +def map_python_initiate(maxCol,maxRow, maxT=None): + if maxT != None: + matrix = torch.sparse.FloatTensor(torch.LongTensor([[0,0],[0,1],[0,0]]), torch.FloatTensor([0.,0.]), torch.Size([maxRow,maxCol,maxT])) + else: + matrix = torch.sparse.FloatTensor(torch.LongTensor([[0,0],[0,1]]), torch.FloatTensor([0.,0.]), torch.Size([maxRow,maxCol])) + return matrix + +def map_python(matrix, datas, maxCol,maxRow, maxT=None): + if maxT != None: + indexes= datas[:,0:3]-1 + matrixt = torch.sparse.FloatTensor(torch.LongTensor(indexes.T), torch.FloatTensor(datas[:,3]), torch.Size([maxRow,maxCol, maxT])) + matrix = matrix.add(matrixt) + else: + indexes= datas[:,0:2]-1 + matrixt = torch.sparse.FloatTensor(torch.LongTensor(indexes.T), torch.FloatTensor(datas[:,2]), torch.Size([maxRow,maxCol])) + matrix = matrix.add(matrixt) + return matrix diff --git a/inst/python/TorchUtils.py b/inst/python/TorchUtils.py new file mode 100755 index 0000000..ccb21b9 --- /dev/null +++ b/inst/python/TorchUtils.py @@ -0,0 +1,695 @@ +""" + deepUtils.py + + Copyright 2016 Observational Health Data Sciences and Informatics + + This file is part of PatientLevelPrediction + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" + +import sys +import os +#import _pickle as cPickle +import pdb +import random +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.autograd import Variable +from torch.utils.data import DataLoader, TensorDataset +from sklearn.externals import joblib +from sklearn.metrics import roc_auc_score +from collections import OrderedDict +output_dir = 'data' + + +class FocalLoss(nn.Module): + """ + Method to handle data imbalance based on paper (arXiv:1708.02002) entitled + Focal loss for dense object detection. + Loss(x, class) = - (1-softmax(x)[class])^gamma \log(softmax(x)[class]) + + """ + + def __init__(self, gamma=5, eps=1e-7, size_average=False): + super(FocalLoss, self).__init__() + self.gamma = gamma + self.eps = eps + self.size_average = size_average + + def forward(self, input, target): + y = self.one_hot(target, input.size(-1)) + logit = F.softmax(input) + logit = logit.clamp(self.eps, 1. - self.eps) + + loss = -1 * y * torch.log(logit) # cross entropy + loss = loss * (1 - logit) ** self.gamma # focal loss + + if self.size_average: + loss = loss.mean() + else: + loss = loss.sum() + return loss + + def one_hot(self, index, classes): + """ + + :param index: is the labels + :param classes: number if classes + :return: + """ + size = index.size() + (classes,) + view = index.size() + (1,) + + mask = torch.Tensor(*size).fill_(0) + index = index.view(*view) + ones = 1. 
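+        # build a zero mask of shape (batch, classes) and scatter ones at the
+        # target indices; the Variable/volatile branch below exists for older
+        # torch versions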
+ + if isinstance(index, Variable): + ones = Variable(torch.Tensor(index.size()).fill_(1)) + mask = Variable(mask, volatile=index.volatile) + if torch.cuda.is_available(): + ones = ones.cuda() + mask = mask.cuda() + + return mask.scatter_(1, index, ones) + +def loss_function(recon_x, x, mu, logvar): + """Loss function for varational autoencoder VAE""" + BCE = F.binary_cross_entropy(recon_x, x, size_average=False) + + # 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2) + KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp()) + + return BCE + KLD + + +def mixup_data(x, y, alpha=1.0): + + '''Compute the mixup data. Return mixed inputs, pairs of targets, and lambda + Data Augmentation method based on paper (arXiv:1710.09412) entitled + mixup: Beyond empirical risk minimization. + ''' + if alpha > 0.: + lam = np.random.beta(alpha, alpha) + else: + lam = 1. + batch_size = x.size()[0] + if torch.cuda.is_available(): + index = torch.randperm(batch_size).cuda() + else: + index = torch.randperm(batch_size) + + mixed_x = lam * x + (1 - lam) * x[index,:] + y_a, y_b = y, y[index] + return mixed_x, y_a, y_b, lam + +def mixup_criterion(y_a, y_b, lam): + return lambda criterion, pred: lam * criterion(pred, y_a) + (1 - lam) * criterion(pred, y_b) + +def early_stop(metrics_hist, patience = 3): + if not np.all(np.isnan(metrics_hist)): + return np.nanargmin(metrics_hist) > len(metrics_hist) - patience + else: + #keep training if criterion results have all been nan so far + return False + + +class Estimator(object): + """ + It is used for training different deep models in the same interface. + """ + + def __init__(self, model): + self.model = model + + def compile(self, optimizer, loss): + self.optimizer = optimizer + self.loss_f = loss + + def _fit(self, train_loader, l1regularization=False, autoencoder=False, mixup=False, vae = False): + """ + train one epoch + + :param train_loader: The data loaded using DataLoader + :param l1regularization: default False + :return: the return fitted loss and accuracy + """ + loss_list = [] + acc_list = [] + for idx, (X, y) in enumerate(train_loader): + X_v = Variable(X) + y_v = Variable(y) + if torch.cuda.is_available(): + X_v = X_v.cuda() + y_v = y_v.cuda() + + if mixup: + X_v, y_v_a, y_v_b, lam = mixup_data(X_v, y_v) + X_v, y_v_a, y_v_b = Variable(X_v), Variable(y_v_a), Variable(y_v_b) + + # print 'GPU id', torch.cuda.current_device() + self.optimizer.zero_grad() + # the below comemnted lines are used for multiple GPU training + # if torch.cuda.device_count() > 1: + # net = torch.nn.DataParallel(self.model, device_ids = range(torch.cuda.device_count())) + # if cuda: + # net = net.cuda() + # y_pred = net(X_v) + if autoencoder: + if vae: + y_pred, mu, logvar = self.model(X_v) + loss = loss_function(y_pred, X_v, mu, logvar) + else: + y_pred = self.model(X_v) + loss = self.loss_f(y_pred, X_v) + else: + y_pred = self.model(X_v) + + loss = self.loss_f(y_pred, y_v) + + if mixup: + loss_func = mixup_criterion(y_v_a, y_v_b, lam) + loss = loss_func(self.loss_f, y_pred) + + if l1regularization: + l1_crit = nn.L1Loss(size_average=False) + reg_loss = 0 + for param in self.model.parameters(): + target = Variable(torch.from_numpy(np.zeros(param.size()).astype(np.float32))) + if torch.cuda.is_available(): + target = target.cuda() + reg_loss += l1_crit(param, target) + + factor = 0.0005 + loss += factor * reg_loss + + loss.backward() + self.optimizer.step() + loss_list.append(loss.item()) + if autoencoder: + acc_list.append(0) + else: + classes = torch.topk(y_pred, 
1)[1].data.cpu().numpy().flatten() + acc = self._accuracy(classes, y_v.data.cpu().numpy().flatten()) + acc_list.append(acc) + del loss + del y_pred + + return sum(loss_list) / len(loss_list), sum(acc_list) / len(acc_list) + + def fit(self, X, y, batch_size=32, nb_epoch=10, validation_data=(), l1regularization=False, autoencoder =False, vae = False): + train_set = TensorDataset(torch.from_numpy(X.astype(np.float32)), + torch.from_numpy(y.astype(np.float32)).long().view(-1)) + train_loader = DataLoader(dataset=train_set, batch_size=batch_size, shuffle=True) + self.model.train() + for t in range(nb_epoch): + loss, acc = self._fit(train_loader, l1regularization=l1regularization, autoencoder = autoencoder, vae = vae) + #print loss + val_log = '' + if validation_data and not autoencoder: + val_loss, auc = self.evaluate(validation_data[0], validation_data[1], batch_size) + + val_log = "- val_loss: %06.4f - auc: %6.4f" % (val_loss, auc) + print(val_log) + # print("Epoch %s/%s loss: %06.4f - acc: %06.4f %s" % (t, nb_epoch, loss, acc, val_log)) + + def evaluate(self, X, y, batch_size=32): + y_pred = self.predict(X) + y_v = Variable(torch.from_numpy(y).long(), requires_grad=False) + if torch.cuda.is_available(): + y_v = y_v.cuda() + loss = self.loss_f(y_pred, y_v) + predict = y_pred.data.cpu().numpy()[:, 1].flatten() + auc = roc_auc_score(y, predict) + return loss.item(), auc + + def _accuracy(self, y_pred, y): + return float(sum(y_pred == y)) / y.shape[0] + + def predict(self, X): + X = Variable(torch.from_numpy(X.astype(np.float32))) + if torch.cuda.is_available(): + X = X.cuda() + y_pred = self.model(X) + return y_pred + + def predict_proba(self, X): + self.model.eval() + return self.model.predict_proba(X) + +class EarlyStopping(object): # pylint: disable=R0902 + """ + Gives a criterion to stop training when a given metric is not + improving anymore + Args: + mode (str): One of `min`, `max`. In `min` mode, training will + be stopped when the quantity monitored has stopped + decreasing; in `max` mode it will be stopped when the + quantity monitored has stopped increasing. Default: 'min'. + patience (int): Number of epochs with no improvement after + which training is stopped. For example, if + `patience = 2`, then we will ignore the first 2 epochs + with no improvement, and will only stop learning after the + 3rd epoch if the loss still hasn't improved then. + Default: 10. + threshold (float): Threshold for measuring the new optimum, + to only focus on significant changes. Default: 1e-4. + threshold_mode (str): One of `rel`, `abs`. In `rel` mode, + dynamic_threshold = best * ( 1 + threshold ) in 'max' + mode or best * ( 1 - threshold ) in `min` mode. + In `abs` mode, dynamic_threshold = best + threshold in + `max` mode or best - threshold in `min` mode. Default: 'rel'. 
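+
+    Example (illustrative sketch; ``val_losses`` is a hypothetical list of
+        per-epoch validation losses):
+
+            stopper = EarlyStopping(mode='min', patience=3)
+            for val_loss in val_losses:
+                stopper.step(val_loss)
+                if stopper.stop:
+                    break
+
+    Note: the comparison is built with ``functools.partial`` in
+        ``_init_is_better``, so ``partial`` must be imported in this module.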
+ """ + + def __init__(self, mode='min', patience=3, threshold=1e-4, threshold_mode='rel'): + self.patience = patience + self.mode = mode + self.threshold = threshold + self.threshold_mode = threshold_mode + self.best = None + self.num_bad_epochs = None + self.mode_worse = None # the worse value for the chosen mode + self.is_better = None + self.last_epoch = -1 + self._init_is_better(mode=mode, threshold=threshold, + threshold_mode=threshold_mode) + self._reset() + + def _reset(self): + """Resets num_bad_epochs counter and cooldown counter.""" + self.best = self.mode_worse + self.num_bad_epochs = 0 + + def step(self, metrics, epoch=None): + """ Updates early stopping state """ + current = metrics + if epoch is None: + epoch = self.last_epoch = self.last_epoch + 1 + self.last_epoch = epoch + + if self.is_better(current, self.best): + self.best = current + self.num_bad_epochs = 0 + else: + self.num_bad_epochs += 1 + + @property + def stop(self): + """ Should we stop learning? """ + return self.num_bad_epochs > self.patience + + def _cmp(self, mode, threshold_mode, threshold, a, best): # pylint: disable=R0913, R0201 + if mode == 'min' and threshold_mode == 'rel': + rel_epsilon = 1. - threshold + return a < best * rel_epsilon + + elif mode == 'min' and threshold_mode == 'abs': + return a < best - threshold + + elif mode == 'max' and threshold_mode == 'rel': + rel_epsilon = threshold + 1. + return a > best * rel_epsilon + + return a > best + threshold + + def _init_is_better(self, mode, threshold, threshold_mode): + if mode not in {'min', 'max'}: + raise ValueError('mode ' + mode + ' is unknown!') + if threshold_mode not in {'rel', 'abs'}: + raise ValueError('threshold mode ' + threshold_mode + ' is unknown!') + + if mode == 'min': + self.mode_worse = float('inf') + else: # mode == 'max': + self.mode_worse = (-float('inf')) + + self.is_better = partial(self._cmp, mode, threshold_mode, threshold) + + def state_dict(self): + """ Returns early stopping state """ + return {key: value for key, value in self.__dict__.items() if key != 'is_better'} + + def load_state_dict(self, state_dict): + """ Loads early stopping state """ + self.__dict__.update(state_dict) + self._init_is_better(mode=self.mode, threshold=self.threshold, + threshold_mode=self.threshold_mode) + +def adjust_learning_rate(learning_rate, optimizer, epoch): + """Sets the learning rate to the initial LR decayed by 10 every 30 epochs""" + + lr = learning_rate * (0.1 ** (epoch // 10)) + + for param_group in optimizer.param_groups: + param_group['lr'] = lr + return lr + +def batch(tensor, batch_size = 50): + """ It is used to create batch samples, each batch has batch_size samples""" + tensor_list = [] + length = tensor.shape[0] + i = 0 + while True: + if (i+1) * batch_size >= length: + tensor_list.append(tensor[i * batch_size: length]) + return tensor_list + tensor_list.append(tensor[i * batch_size: (i+1) * batch_size]) + i += 1 + + +class selu(nn.Module): + def __init__(self): + super(selu, self).__init__() + self.alpha = 1.6732632423543772848170429916717 + self.scale = 1.0507009873554804934193349852946 + + def forward(self, x): + temp1 = self.scale * F.relu(x) + temp2 = self.scale * self.alpha * (F.elu(-1 * F.relu(-1 * x))) + return temp1 + temp2 + + +class alpha_drop(nn.Module): + def __init__(self, p=0.05, alpha=-1.7580993408473766, fixedPointMean=0, fixedPointVar=1): + super(alpha_drop, self).__init__() + keep_prob = 1 - p + self.a = np.sqrt( + fixedPointVar / (keep_prob * ((1 - keep_prob) * pow(alpha - fixedPointMean, 2) + 
fixedPointVar))) + self.b = fixedPointMean - self.a * (keep_prob * fixedPointMean + (1 - keep_prob) * alpha) + self.alpha = alpha + self.keep_prob = 1 - p + self.drop_prob = p + + def forward(self, x): + if self.keep_prob == 1 or not self.training: + # print("testing mode, direct return") + return x + else: + random_tensor = self.keep_prob + torch.rand(x.size()) + + binary_tensor = Variable(torch.floor(random_tensor)) + + if torch.cuda.is_available(): + binary_tensor = binary_tensor.cuda() + + x = x.mul(binary_tensor) + ret = x + self.alpha * (1 - binary_tensor) + ret.mul_(self.a).add_(self.b) + return ret + +def convert_to_3d_matrix(covariate_ids, patient_dict, y_dict = None, timeid_len = 31, cov_mean_dict = None): + """ + create matrix for temporal models. + + :param covariate_ids: the covariate ids in the whole data + :param patient_dict: the dictionary contains the data for each patient + :param y_dict: if the output labels is known, it contains the labels for patients + :param timeid_len: the total number time window gaps when extracting temporal data + :return: return the raw data in 3-D format, patients x covariates x number of windows, and the patients ids + """ + D = len(covariate_ids) + N = len(patient_dict) + T = timeid_len + concept_list =list(covariate_ids) + concept_list.sort() + x_raw = np.zeros((N, D, T), dtype=float) + patient_ind = 0 + p_ids = [] + patient_keys = patient_dict.keys() + #print covariate_ids + for kk in patient_keys: + #print('-------------------') + vals = patient_dict[kk] + #sorted(vals) + p_ids.append(int(kk)) + for timeid, meas in vals.iteritems(): + int_time = int(timeid) - 1 + for val in meas: + if not len(val): + continue + cov_id, cov_val = val + if cov_id not in covariate_ids: + continue + lab_ind = concept_list.index(cov_id) + if cov_mean_dict is None: + x_raw[patient_ind][lab_ind][int_time] = float(cov_val) + else: + mean_std = cov_mean_dict[cov_id] + if mean_std[1]: + x_raw[patient_ind][lab_ind][int_time] = (float(cov_val) - mean_std[0])/mean_std[1] + else: + x_raw[patient_ind][lab_ind][int_time] = float(cov_val) + + + patient_ind = patient_ind + 1 + + #impute the data using the value of previous timestamp + #fw = open('patient_var.txt', 'w') + for i in xrange(N): + for j in xrange(D): + temp = x_raw[i][j] + nonzero_inds = np.nonzero(temp)[0] + count_nonzeros = len(nonzero_inds) + #fw.write(str(i) + '\t' + str(count_nonzeros) + '\n') + if count_nonzeros == 1: + ind = nonzero_inds[0] + for k in xrange(ind + 1, T): + x_raw[i][j][k] = x_raw[i][j][ind] + elif count_nonzeros > 1: + for ind in xrange(1, count_nonzeros): + for k in xrange(nonzero_inds[ind -1] + 1, nonzero_inds[ind]): + x_raw[i][j][k] = x_raw[i][j][nonzero_inds[ind - 1]] + # For last nonzeros. + for k in xrange(nonzero_inds[-1] + 1, T): + x_raw[i][j][k] = x_raw[i][j][nonzero_inds[-1]] + + #fw.close() + + return x_raw, patient_keys + +def forward_impute_missing_value(x_raw): + N = x_raw.shape[0] + D = x_raw.shape[1] + T = x_raw.shape[2] + for i in xrange(N): + for j in xrange(D): + temp = x_raw[i][j] + nonzero_inds = np.nonzero(temp)[0] + count_nonzeros = len(nonzero_inds) + #fw.write(str(i) + '\t' + str(count_nonzeros) + '\n') + if count_nonzeros == 1: + ind = nonzero_inds[0] + for k in xrange(ind + 1, T): + x_raw[i][j][k] = x_raw[i][j][ind] + elif count_nonzeros > 1: + for ind in xrange(1, count_nonzeros): + for k in xrange(nonzero_inds[ind -1] + 1, nonzero_inds[ind]): + x_raw[i][j][k] = x_raw[i][j][nonzero_inds[ind - 1]] + # For last nonzeros. 
+ for k in xrange(nonzero_inds[-1] + 1, T): + x_raw[i][j][k] = x_raw[i][j][nonzero_inds[-1]] + + +def convert_to_temporal_format(covariates, timeid_len= 31, normalize = True, predict = False): + """ + It reads the data from covariates extracted by FeatureExtraction package and convert it to temporal data matrix + + :param covariates: covariates extracted by FeatureExtraction package + :param timeid_len: the total number of window gaps when extracting temporal data + :return: return the raw data in 3-D format, patients x covariates x number of windows, and the patients ids + """ + patient_dict = OrderedDict() + print('Loading temporal data') + cov_vals_dict = {} + for row in covariates: + p_id, cov_id, time_id, cov_val = row[0], row[1], row[2], row[3] + cov_id = np.int64(cov_id) + #time_id = int(time_id) + cov_vals_dict.setdefault(cov_id, []).append(float(cov_val)) + if p_id not in patient_dict: + patient_dict[p_id] = {time_id: [(cov_id, cov_val)]} + else: + if time_id not in patient_dict[p_id]: + patient_dict[p_id][time_id] = [(cov_id, cov_val)] + else: + patient_dict[p_id][time_id].append((cov_id, cov_val)) + #covariate_ids.add(cov_id) + #T = 365/time_window + covariate_ids = set() + cov_mean_dict = {} + if not predict: + fw = open('covariate_mean_std.csv', 'w') + for key, val in cov_vals_dict.iteritems(): + mean_val = np.mean(val) + std_val = np.std(val) + + # Remove those covariates with few occurrence (<5) + if len(val) >= 5: + covariate_ids.add(key) + cov_mean_dict[key] = (mean_val, std_val) + fw.write(str(key) + ',' + str(mean_val) + ',' + str(std_val) + '\n') + fw.close() + else: + fp = open('covariate_mean_std.csv', 'r') + for line in fp: + values = line.rstrip().split(',') + key = np.int64(values[0]) + covariate_ids.add(key) + cov_mean_dict[key] = (float(values[1]), float(values[2])) + fp.close() + + + if normalize: + x, patient_keys = convert_to_3d_matrix(covariate_ids, patient_dict, timeid_len = timeid_len, cov_mean_dict = cov_mean_dict) + else: + x, patient_keys = convert_to_3d_matrix(covariate_ids, patient_dict, timeid_len=timeid_len) + + return x, patient_keys + + +def read_covariates(covariate_file): + patient_dict = {} + head = True + with open(covariate_file, 'r') as fp: + for line in fp: + if head: + head = False + continue + values = line.rstrip().split(',') + patient_id = values[1] + cov_id = values[2] + #time_id = int(values[-1]) + # covariates in one patient has time order + patient_dict.setdefault(patient_id, []).append((cov_id)) + new_patient = [] + for key in patient_dict.keys(): + #patient_dict[key].sort() + sort_vals = [] + for val in patient_dict[key]: + if val[1] not in sort_vals: + sort_vals.append(val) + new_patient.append(sort_vals) + + return new_patient + + +def word_embeddings(covariate_file, embedding_size=50): + import gensim.models.word2vec as w2v + modelname = "processed_%s.w2v" % ('heartfailure') + sentences = read_covariates(covariate_file) + model = w2v.Word2Vec(size=embedding_size, min_count=3, workers=4, iter=10, sg=1) + print("building word2vec vocab on %s..." 
% (covariate_file)) + + model.build_vocab(sentences) + print("training...") + model.train(sentences, total_examples=model.corpus_count, epochs=model.iter) + out_file = modelname + print("writing embeddings to %s" % (out_file)) + model.save(out_file) + return out_file + +def read_data(filename): + covariate_ids = set() + patient_dict = {} + head = True + with open(filename) as fp: + for lines in fp: + if head: + head = False + continue + lines = lines.strip('\n').strip('\r').split(',') + try: + p_id, cov_id, time_id, cov_val = lines[1], lines[2], lines[3], lines[4] + except: + pdb.set_trace() + print(p_id, cov_id, time_id) + if p_id not in patient_dict: + patient_dict[p_id] = {} + else: + if time_id not in patient_dict[p_id]: + patient_dict[p_id][time_id] = [] + else: + patient_dict[p_id][time_id].append((cov_id, cov_val)) + covariate_ids.add(cov_id) + patient_dict = {k: v for k, v in patient_dict.iteritems() if v} #remove empty patients + #(15, 2000, 60) 20000 patients, 15 lab tests, + return covariate_ids, patient_dict + +def split_training_validation(classes, validation_size=0.2, shuffle=False): + """split sampels based on balnace classes""" + num_samples = len(classes) + classes = np.array(classes) + classes_unique = np.unique(classes) + num_classes = len(classes_unique) + indices = np.arange(num_samples) + # indices_folds=np.zeros([num_samples],dtype=int) + training_indice = [] + training_label = [] + validation_indice = [] + validation_label = [] + for cl in classes_unique: + indices_cl = indices[classes == cl] + num_samples_cl = len(indices_cl) + + # split this class into k parts + if shuffle: + random.shuffle(indices_cl) # in-place shuffle + + # module and residual + num_samples_each_split = int(num_samples_cl * validation_size) + res = num_samples_cl - num_samples_each_split + + training_indice = training_indice + [val for val in indices_cl[num_samples_each_split:]] + training_label = training_label + [cl] * res + + validation_indice = validation_indice + [val for val in indices_cl[:num_samples_each_split]] + validation_label = validation_label + [cl] * num_samples_each_split + + training_index = np.arange(len(training_label)) + random.shuffle(training_index) + training_indice = np.array(training_indice)[training_index] + training_label = np.array(training_label)[training_index] + + validation_index = np.arange(len(validation_label)) + random.shuffle(validation_index) + validation_indice = np.array(validation_indice)[validation_index] + validation_label = np.array(validation_label)[validation_index] + + return training_indice, training_label, validation_indice, validation_label + + +if __name__ == "__main__": + x_raw = np.array([[1, 1, 0], [0,1,0]]) + x = [] + x.append(x_raw) + x.append(x_raw) + x = np.array(x) + #pdb.set_trace() + forward_impute_missing_value(x) + #filename = sys.argv[1] + #word_embeddings(filename) + ''' + population = joblib.load('/data/share/plp/SYNPUF/population.pkl') + # y = population[:, 1] + covriate_ids, patient_dict = read_data(filename) + # y_ids = np.array([int(val) for val in patient_dict.keys()]) + # Y = [] + y_dict = dict(zip(population[:, 0], population[:, 1])) + #x, patient_keys = convert_2_cnn_format(covariates, timeid_len = 31) + # for val in y_ids: + # Y.append(y_dict[y_ids]) + x_train, x_valid, x_test, Y_train, Y_valid, Y_test = convert_to_temporal_format(covriate_ids, patient_dict, y_dict) + ''' diff --git a/inst/python/deepTorch.py b/inst/python/deepTorch.py new file mode 100644 index 0000000..7425413 --- /dev/null +++ 
b/inst/python/deepTorch.py @@ -0,0 +1,1186 @@ +""" + deepTorch.py: It implements different deep learning classifiers + + Copyright 2016 Observational Health Data Sciences and Informatics + + This file is part of PatientLevelPrediction + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +""" + +import sys +import os +import pdb +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.autograd import Variable +from torch.utils.data import DataLoader, TensorDataset +from torch.nn.utils.rnn import pack_padded_sequence +from torch.nn.utils.rnn import pad_packed_sequence +from collections import OrderedDict +import timeit +import joblib +from sklearn.model_selection import train_test_split +from sklearn.metrics import roc_auc_score +import numpy as np + +import warnings +warnings.filterwarnings("ignore") + +if "python_dir" in globals(): + #print python_dir + sys.path.insert(0, python_dir) + +import TorchUtils as tu + + +class LogisticRegression(nn.Module): + """ + Train a logistic regression model using pytorch + """ + def __init__(self, input_size, num_classes = 2): + super(LogisticRegression, self).__init__() + self.linear = nn.Linear(input_size, num_classes) + + def forward(self, x): + out = self.linear(x) + out = torch.sigmoid(out) + return out + + def predict_proba(self, x): + if type(x) is np.ndarray: + x = torch.from_numpy(x.astype(np.float32)) + with torch.no_grad(): + x = Variable(x) + if torch.cuda.is_available(): + x = x.cuda() + y = self.forward(x) + temp = y.data.cpu().numpy() + return temp + + +class MLP(nn.Module): + """ + Train a multiple-layer perceptron with one hideen layer + """ + def __init__(self, input_dim, hidden_size, num_classes = 2): + super(MLP, self).__init__() + self.fc1 = nn.Linear(input_dim, hidden_size) + self.fc2 = nn.Linear(hidden_size, num_classes) + + def forward(self, x): + x = F.relu(self.fc1(x)) + x = F.dropout(x, p =0.5, training=self.training) + x = self.fc2(x) + x = torch.sigmoid(x) + return x + + def predict_proba(self, x): + if type(x) is np.ndarray: + x = torch.from_numpy(x.astype(np.float32)) + with torch.no_grad(): + x = Variable(x) + if torch.cuda.is_available(): + x = x.cuda() + y = self.forward(x) + temp = y.data.cpu().numpy() + return temp + + +class SNN(nn.Module): + """ + Train a multiple-layer self normalizing neural network, ref arXiv:1706.02515 + """ + + def __init__(self, input_dim, hidden_size, num_classes=2): + super(SNN, self).__init__() + self.fc1 = nn.Linear(input_dim, hidden_size) + self.fc2 = tu.selu() + self.ad1 = tu.alpha_drop() + self.fc4 = nn.Linear(hidden_size, num_classes) + + def forward(self, x): + x = F.relu(self.fc1(x)) + x = F.dropout(x, p=0.5, training=self.training) + x = self.fc2(x) + x = self.ad1(x) + x = self.fc4(x) + x = torch.sigmoid(x) + return x + + def predict_proba(self, x): + if type(x) is np.ndarray: + x = torch.from_numpy(x.astype(np.float32)) + with torch.no_grad(): + x = Variable(x) + if torch.cuda.is_available(): + x = x.cuda() + y = self.forward(x) + temp = y.data.cpu().numpy() + return temp + 
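+# Illustrative usage sketch (assumed names: `covariates` is a numpy feature
+# matrix, `labels` a numpy label vector). The simple classifiers above can be
+# trained through the TorchUtils Estimator wrapper, e.g.:
+#   model = MLP(input_dim=covariates.shape[1], hidden_size=100)
+#   clf = tu.Estimator(model)
+#   clf.compile(optimizer=torch.optim.Adam(model.parameters(), lr=1e-3),
+#               loss=nn.CrossEntropyLoss())
+#   clf.fit(covariates, labels, batch_size=32, nb_epoch=10)
+#   risk = model.predict_proba(covariates)[:, 1]
+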
+class AutoEncoder(nn.Module): + """ + A stacked autoencoder with 2 hiddden layers and need be adapted for EHR data. + """ + def __init__(self, input_size, encoding_size): + super(AutoEncoder, self).__init__() + + self.encoder = nn.Sequential( + nn.Linear(input_size, input_size/2), + nn.ReLU(True), + nn.Linear(input_size/2, input_size/4), + nn.ReLU(True), + nn.Linear(input_size/4, encoding_size), + nn.ReLU(True) + ) + self.decoder = nn.Sequential( + nn.Linear(encoding_size, input_size/4), + nn.ReLU(True), + nn.Linear(input_size/4, input_size/2), + nn.ReLU(True), + nn.Linear(input_size/2, input_size) + ) + + def forward(self, x): + if torch.cuda.is_available(): + x = x.cuda() + encoded = self.encoder(x) + decoded = self.decoder(encoded) + return decoded + + def get_encode_features(self, x): + if type(x) is np.ndarray: + x = torch.from_numpy(x.astype(np.float32)) + with torch.no_grad(): + x = Variable(x) + if torch.cuda.is_available(): + x = x.cuda() + encoded = self.encoder(x) + encoded = encoded.data.cpu().numpy() + return encoded + +class VAE(nn.Module): + """ + A stacked variational autoencoder with 2 hiddden layers and need be adapted for EHR data. + """ + def __init__(self, input_size, encoding_size): + super(VAE, self).__init__() + + self.fc1 = nn.Linear(input_size, input_size/2) + self.fc21 = nn.Linear(input_size/2, encoding_size) + self.fc22 = nn.Linear(input_size/2, encoding_size) + self.fc3 = nn.Linear(encoding_size, input_size/2) + self.fc4 = nn.Linear(input_size/2, input_size) + + self.relu = nn.ReLU() + self.sigmoid = nn.Sigmoid() + + def encode(self, x): + h1 = self.relu(self.fc1(x)) + return self.fc21(h1), self.fc22(h1) + + def reparameterize(self, mu, logvar): + if self.training: + std = logvar.mul(0.5).exp_() + eps = Variable(std.data.new(std.size()).normal_()) + return eps.mul(std).add_(mu) + else: + return mu + + def decode(self, z): + h3 = self.relu(self.fc3(z)) + return self.sigmoid(self.fc4(h3)) + + def forward(self, x): + if torch.cuda.is_available(): + x = x.cuda() + mu, logvar = self.encode(x) + z = self.reparameterize(mu, logvar) + return self.decode(z), mu, logvar + + def get_encode_features(self, x): + if type(x) is np.ndarray: + x = torch.from_numpy(x.astype(np.float32)) + with torch.no_grad(): + x = Variable(x) + if torch.cuda.is_available(): + x = x.cuda() + mu, logvar = self.encode(x) + encoded = self.reparameterize(mu, logvar) + encoded = encoded.data.cpu().numpy() + return encoded + +class Decoder(nn.Module): + """ VAE decoder input_size = original inputsize/16*256""" + def __init__(self, latent_size, input_size, img_channels = 1, kernel_size=(1, 4), stride=(1, 2), padding=(0, 1)): + super(Decoder, self).__init__() + self.latent_size = latent_size + self.img_channels = img_channels + + self.fc1 = nn.Linear(latent_size, input_size) + self.deconv1 = nn.ConvTranspose2d(input_size, 128, kernel_size, stride=stride, padding = padding) + self.deconv2 = nn.ConvTranspose2d(128, 64, kernel_size, stride=stride, padding = padding) + self.deconv3 = nn.ConvTranspose2d(64, 32, kernel_size, stride=stride, padding = padding) + self.deconv4 = nn.ConvTranspose2d(32, img_channels, kernel_size, stride=stride, padding = padding) + + def forward(self, x): # pylint: disable=arguments-differ + x = F.relu(self.fc1(x)) + x = x.unsqueeze(-1).unsqueeze(-1) + x = F.relu(self.deconv1(x)) + x = F.relu(self.deconv2(x)) + x = F.relu(self.deconv3(x)) + reconstruction = torch.sigmoid(self.deconv4(x)) + return reconstruction + +class Encoder(nn.Module): # pylint: 
disable=too-many-instance-attributes + """ VAE encoder """ + def __init__(self, latent_size, input_size, img_channels = 1, kernel_size=(1, 4), stride=(1, 2), padding=(0, 1)): + super(Encoder, self).__init__() + self.latent_size = latent_size + #self.img_size = img_size + self.img_channels = img_channels + + self.conv1 = nn.Conv2d(img_channels, 32, kernel_size, stride=stride, padding = padding) + self.conv2 = nn.Conv2d(32, 64, kernel_size, stride=stride, padding = padding) + self.conv3 = nn.Conv2d(64, 128, kernel_size, stride=stride, padding = padding) + self.conv4 = nn.Conv2d(128, 256, kernel_size, stride=stride, padding = padding) + out_size = input_size / 16 + self.fc_mu = nn.Linear(out_size, latent_size) + self.fc_logsigma = nn.Linear(out_size, latent_size) + + + def forward(self, x): # pylint: disable=arguments-differ + x = F.relu(self.conv1(x)) + x = F.relu(self.conv2(x)) + x = F.relu(self.conv3(x)) + x = F.relu(self.conv4(x)) + x = x.view(x.size(0), -1) + + mu = self.fc_mu(x) + logsigma = self.fc_logsigma(x) + + return mu, logsigma + +class VAE_CNN(nn.Module): + """ Variational Autoencoder """ + def __init__(self, latent_size, input_size): + super(VAE, self).__init__() + self.encoder = Encoder(latent_size, input_size) + input_size = input_size/16 + self.decoder = Decoder(latent_size, input_size) + + def forward(self, x): # pylint: disable=arguments-differ + mu, logsigma = self.encoder(x) + sigma = logsigma.exp() + eps = torch.randn_like(sigma) + z = eps.mul(sigma).add_(mu) + + recon_x = self.decoder(z) + return recon_x, mu, logsigma + + def get_encode_features(self, x): + if type(x) is np.ndarray: + x = torch.from_numpy(x.astype(np.float32)) + with torch.no_grad(): + x = Variable(x) + if torch.cuda.is_available(): + x = x.cuda() + mu, logvar = self.encoder(x) + encoded = mu .data.cpu().numpy() + return encoded + +class CNN(nn.Module): + def __init__(self, nb_filter, num_classes = 2, kernel_size = (1, 5), pool_size = (1, 3), labcounts = 32, window_size = 12, hidden_size = 200, stride = (1, 1), padding = 0): + super(CNN, self).__init__() + self.layer1 = nn.Sequential( + nn.Conv2d(1, nb_filter, kernel_size, stride = stride, padding = padding), + nn.BatchNorm2d(nb_filter), + nn.ReLU(), + nn.MaxPool2d(pool_size, stride = stride)) + out1_size = (window_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 + maxpool_size = (out1_size + 2*padding - (pool_size[1] - 1) - 1)/stride[1] + 1 + self.layer2 = nn.Sequential( + nn.Conv2d(nb_filter, nb_filter, kernel_size, stride = stride, padding = padding), + nn.BatchNorm2d(nb_filter), + nn.ReLU(), + nn.MaxPool2d(pool_size, stride = stride)) + out2_size = (maxpool_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 + maxpool_size = (out2_size + 2*padding - (pool_size[1] - 1) - 1)/stride[1] + 1 + self.drop1 = nn.Dropout(p=0.5) + self.fc1 = nn.Linear(int(maxpool_size*labcounts*nb_filter), hidden_size) + self.bn = nn.BatchNorm1d(hidden_size) + self.drop2 = nn.Dropout(p=0.5) + self.relu1 = nn.ReLU() + self.fc2 = nn.Linear(hidden_size, num_classes) + + def forward(self, x): + x = x.view(x.size(0), 1, x.size(1), x.size(2)) + out = self.layer1(x) + out = self.layer2(out) + out = out.view(out.size(0), -1) + out = self.drop1(out) + out = self.fc1(out) + out = self.drop2(out) + out = self.bn(out) + out = self.relu1(out) + out = self.fc2(out) + out = torch.sigmoid(out) + return out + + def predict_proba(self, x): + if type(x) is np.ndarray: + x = torch.from_numpy(x.astype(np.float32)) + with torch.no_grad(): + x = Variable(x) + if 
torch.cuda.is_available(): + x = x.cuda() + y = self.forward(x) + temp = y.data.cpu().numpy() + return temp + + +#allow multiple kernel with differnt kernel size +class CNN_MLF(nn.Module): + """ + It is a deep CNNs with three different kernel size, the outputs from the three CNNs are concatenated to fed into two fully connected layers. + """ + def __init__(self, nb_filter, num_classes = 2, kernel_size = (1, 5), pool_size = (1, 3), labcounts = 32, window_size = 12, hidden_size = 200, stride = (1, 1), padding = 0): + super(CNN_MLF, self).__init__() + self.layer1 = nn.Sequential( + nn.Conv2d(1, nb_filter, kernel_size = (1, 3), stride = stride, padding = padding), + nn.BatchNorm2d(nb_filter), + nn.ReLU(), + nn.MaxPool2d(pool_size, stride = stride)) + out1_size = (window_size + 2*padding - (3 - 1) - 1)/stride[1] + 1 + maxpool1_size = (out1_size + 2*padding - (pool_size[1] - 1) - 1)/stride[1] + 1 + self.layer2 = nn.Sequential( + nn.Conv2d(1, nb_filter, kernel_size = (1, 4), stride = stride, padding = padding), + nn.BatchNorm2d(nb_filter), + nn.ReLU(), + nn.MaxPool2d(pool_size, stride = stride)) + out2_size = (window_size + 2*padding - (4 - 1) - 1)/stride[1] + 1 #4 is the convolve filter size + maxpool2_size = (out2_size + 2*padding - (pool_size[1] - 1) - 1)/stride[1] + 1 + self.layer3 = nn.Sequential( + nn.Conv2d(1, nb_filter, kernel_size = (1, 5), stride = stride, padding = padding), + nn.BatchNorm2d(nb_filter), + nn.ReLU(), + nn.MaxPool2d(pool_size, stride = stride)) + out3_size = (window_size + 2*padding - (5 - 1) - 1)/stride[1] + 1 + maxpool3_size = (out3_size + 2*padding - (pool_size[1] - 1) - 1)/stride[1] + 1 + conv_outsize = maxpool1_size + maxpool2_size +maxpool3_size + self.drop1 = nn.Dropout(p=0.5) + self.fc1 = nn.Linear(conv_outsize*labcounts*nb_filter, hidden_size) + self.drop2 = nn.Dropout(p=0.5) + self.relu1 = nn.ReLU() + self.fc2 = nn.Linear(hidden_size, num_classes) + + def forward(self, x): + x = x.view(x.size(0), 1, x.size(1), x.size(2)) + out1 = self.layer1(x) + out2 = self.layer2(x) + out3 = self.layer3(x) + out = torch.cat((out1.view(out1.size(0), -1), out2.view(out2.size(0), -1), out3.view(out2.size(0), -1)), 1) + out = self.drop1(out) + out = self.fc1(out) + out = self.drop2(out) + out = self.relu1(out) + out = self.fc2(out) + out = torch.sigmoid(out) + return out + + def predict_proba(self, x): + if type(x) is np.ndarray: + x = torch.from_numpy(x.astype(np.float32)) + with torch.no_grad(): + x = Variable(x) + if torch.cuda.is_available(): + x = x.cuda() + y = self.forward(x) + temp = y.data.cpu().numpy() + return temp + + +class CNN_LSTM(nn.Module): + """ + It is a deep network with two layer CNN, followed by LSTM layer, which further fed into two fully connected layers. 
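+    The input is expected as a 3-D array of shape (batch, labcounts,
+    window_size); forward() adds the channel dimension before the first
+    convolutional layer.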
+ """ + def __init__(self, nb_filter, num_classes = 2, kernel_size = (1, 5), pool_size = (1, 3), labcounts = 32, window_size = 12, hidden_size = 100, stride = (1, 1), padding = 0, num_layers = 2): + super(CNN_LSTM, self).__init__() + self.layer1 = nn.Sequential( + nn.Conv2d(1, nb_filter, kernel_size, stride = stride, padding = padding), + nn.BatchNorm2d(nb_filter), + nn.ReLU(), + nn.MaxPool2d(pool_size, stride = stride)) + self.num_layers = num_layers + self.hidden_size = hidden_size + out1_size = (window_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 + maxpool_size = (out1_size + 2*padding - (pool_size[1] - 1) - 1)/stride[1] + 1 + self.downsample = nn.Conv2d(nb_filter, 1, kernel_size, stride = stride, padding = padding) + input_size = (maxpool_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 + self.layer2 = nn.LSTM(input_size, hidden_size, num_layers, batch_first = True) + self.drop1 = nn.Dropout(p=0.5) + self.fc1 = nn.Linear(hidden_size, hidden_size) + self.drop2 = nn.Dropout(p=0.5) + self.relu1 = nn.ReLU() + self.fc2 = nn.Linear(hidden_size, num_classes) + + def forward(self, x): + x = x.view(x.size(0), 1, x.size(1), x.size(2)) + out = self.layer1(x) + out = self.downsample(out) + out = torch.squeeze(out, 1) + if torch.cuda.is_available(): + x = x.cuda() + h0 = Variable(torch.zeros(self.num_layers, out.size(0), self.hidden_size)).cuda() + c0 = Variable(torch.zeros(self.num_layers, out.size(0), self.hidden_size)).cuda() + else: + h0 = Variable(torch.zeros(self.num_layers, out.size(0), self.hidden_size)) + c0 = Variable(torch.zeros(self.num_layers, out.size(0), self.hidden_size)) + out, hn = self.layer2(out, (h0, c0)) + out = hn[0][-1] + out = self.drop1(out) + out = self.fc1(out) + out = self.drop2(out) + out = self.relu1(out) + out = self.fc2(out) + out = torch.sigmoid(out) + return out + + def predict_proba(self, x): + if type(x) is np.ndarray: + x = torch.from_numpy(x.astype(np.float32)) + with torch.no_grad(): + x = Variable(x) + if torch.cuda.is_available(): + x = x.cuda() + y = self.forward(x) + temp = y.data.cpu().numpy() + return temp + + +class CNN_MIX(nn.Module): + """ + It is a deep network with 2 layers CNN, which works on input and time dimension, respectively, more details refer to deepDianosis in github. 
+ """ + def __init__(self, nb_filter, num_classes = 2, kernel_size = (1, 5), pool_size = (1, 3), labcounts = 32, window_size = 12, hidden_size = 100, stride = (1, 1), padding = 0): + super(CNN_MIX, self).__init__() + self.layer1 = nn.Sequential( + nn.Conv2d(1, nb_filter, kernel_size = (labcounts, 1), stride = stride, padding = padding), + nn.BatchNorm2d(nb_filter), + nn.ReLU()) + self.layer2 = nn.Sequential( + nn.Conv2d(1, nb_filter, kernel_size = (nb_filter, 1), stride = stride, padding = padding), + nn.BatchNorm2d(nb_filter), + nn.ReLU(), + nn.MaxPool2d(pool_size)) + out1_size = int(np.ceil(float(window_size)/pool_size[1])) + self.layer3 = nn.Sequential( + nn.Conv2d(1, nb_filter, kernel_size = kernel_size, stride = stride, padding = padding), + nn.BatchNorm2d(nb_filter), + nn.ReLU()) + + out2_size = (out1_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 + self.drop1 = nn.Dropout(p=0.5) + self.fc1 = nn.Linear(out2_size*nb_filter*nb_filter, hidden_size) + self.drop2 = nn.Dropout(p=0.5) + self.relu1 = nn.ReLU() + self.fc2 = nn.Linear(hidden_size, num_classes) + + def forward(self, x): + x = x.view(x.size(0), 1, x.size(1), x.size(2)) + out = self.layer1(x) + out = out.view(out.size(0), out.size(2), out.size(1), out.size(3)) + out = self.layer2(out) + out = out.view(out.size(0), out.size(2), out.size(1), out.size(3)) + out = self.layer3(out) + out = out.view(out.size(0), -1) + out = self.drop1(out) + out = self.fc1(out) + out = self.drop2(out) + out = self.relu1(out) + out = self.fc2(out) + out = torch.sigmoid(out) + return out + + def predict_proba(self, x): + if type(x) is np.ndarray: + x = torch.from_numpy(x.astype(np.float32)) + with torch.no_grad(): + x = Variable(x) + if torch.cuda.is_available(): + x = x.cuda() + y = self.forward(x) + temp = y.data.cpu().numpy() + return temp + + +class CNN_MULTI(nn.Module): + """ + It is a deep network with multiple resolution, more details refer to multiresconvnet of deepDianosis in github. 
+ """ + def __init__(self, nb_filter, num_classes = 2, kernel_size = (1, 5), pool_size = (1, 2), labcounts = 32, window_size = 12, hidden_size = 100, stride = (1, 1), padding = 0): + super(CNN_MULTI, self).__init__() + # resolution 1 + self.pool1_1 = nn.MaxPool2d(pool_size, stride = pool_size) + maxpool_size = (window_size + 2*padding - (pool_size[1] - 1) - 1)/pool_size[1] + 1 + self.pool1_2 = nn.MaxPool2d(pool_size, stride = pool_size) + maxpool1_2_size = (maxpool_size + 2*padding - (pool_size[1] - 1) - 1)/pool_size[1] + 1 + + self.layer1 = nn.Sequential( + nn.Conv2d(1, nb_filter, kernel_size = kernel_size, stride = stride, padding = padding), + nn.BatchNorm2d(nb_filter), + nn.ReLU()) + cnn1_size = (maxpool1_2_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 + #resolution 2 + self.pool2_1 = nn.MaxPool2d(pool_size, stride = pool_size) + maxpool2_1_size = (window_size + 2*padding - (pool_size[1] - 1) - 1)/pool_size[1] + 1 + + self.layer2 = nn.Sequential( + nn.Conv2d(1, nb_filter, kernel_size = kernel_size, stride = stride, padding = padding), + nn.BatchNorm2d(nb_filter), + nn.ReLU()) + cnn2_size = (maxpool2_1_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 + self.layer3 = nn.Sequential( + nn.Conv2d(1, nb_filter, kernel_size = kernel_size, stride = stride, padding = padding), + nn.BatchNorm2d(nb_filter), + nn.ReLU(), + nn.MaxPool2d(pool_size)) + cnn3_size = (window_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 + maxpool3_size = (cnn3_size + 2*padding - (pool_size[1] - 1) - 1)/pool_size[1] + 1 + self.layer4 = nn.Sequential( + nn.Conv2d(nb_filter, nb_filter, kernel_size = kernel_size, stride = stride, padding = padding), + nn.BatchNorm2d(nb_filter), + nn.ReLU()) + cnn4_size = (maxpool3_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 + merge_size = cnn1_size + cnn2_size + cnn4_size + self.drop1 = nn.Dropout(p=0.5) + self.fc1 = nn.Linear(labcounts*nb_filter*merge_size, hidden_size) + self.drop2 = nn.Dropout(p=0.5) + self.relu1 = nn.ReLU() + self.fc2 = nn.Linear(hidden_size, num_classes) + + def forward(self, x): + x = x.view(x.size(0), 1, x.size(1), x.size(2)) + out = self.pool1_1(x) + out = self.pool1_2(out) + out1 = self.layer1(out) + out = self.pool2_1(x) + out2 = self.layer2(out) + out = self.layer3(x) + out3 = self.layer4(out) + out = torch.cat((out1.view(out1.size(0), -1), out2.view(out2.size(0), -1), out3.view(out3.size(0), -1)), 1) + out = self.drop1(out) + out = self.fc1(out) + out = self.drop2(out) + out = self.relu1(out) + out = self.fc2(out) + out = torch.sigmoid(out) + return out + + def predict_proba(self, x): + if type(x) is np.ndarray: + x = torch.from_numpy(x.astype(np.float32)) + with torch.no_grad(): + x = Variable(x) + if torch.cuda.is_available(): + x = x.cuda() + y = self.forward(x) + temp = y.data.cpu().numpy() + return temp + + +# 1x3 Convolution +def convR(in_channels, out_channels, kernel_size, stride=1, padding = (0, 1)): + return nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, + padding=padding, stride=stride, bias=False) + + +# Residual Block +class ResidualBlock(nn.Module): + def __init__(self, in_channel, nb_filter = 16, kernel_size = (1, 3), stride=1, downsample=None): + super(ResidualBlock, self).__init__() + self.conv1 = convR(in_channel, nb_filter, kernel_size = kernel_size, stride = stride) + self.bn1 = nn.BatchNorm2d(nb_filter) + self.relu = nn.ReLU(inplace=True) + self.conv2 = convR(nb_filter, nb_filter, kernel_size = kernel_size, stride = stride) + self.bn2 = nn.BatchNorm2d(nb_filter) + 
self.downsample = downsample + + def forward(self, x): + residual = x + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + out = self.conv2(out) + out = self.bn2(out) + if self.downsample: + residual = self.downsample(x) + out += residual + out = self.relu(out) + return out + + +# ResNet Module +class ResNet(nn.Module): + def __init__(self, block, layers, nb_filter = 16, labcounts = 12, window_size = 36, kernel_size = (1, 3), pool_size = (1, 3), num_classes=2, hidden_size = 100): + super(ResNet, self).__init__() + self.in_channels = 1 + self.conv = convR(self.in_channels, nb_filter, kernel_size = kernel_size) + self.bn = nn.BatchNorm2d(nb_filter) + self.relu = nn.ReLU(inplace=True) + self.layer1 = self.make_layer(block, nb_filter, layers[0], kernel_size = kernel_size) + self.layer2 = self.make_layer(block, nb_filter*2, layers[1], 1, kernel_size = kernel_size, in_channels = nb_filter) + self.layer3 = self.make_layer(block, nb_filter*4, layers[2], 1, kernel_size = kernel_size, in_channels = 2*nb_filter) + self.avg_pool = nn.AvgPool2d(pool_size) + avgpool2_1_size = (window_size - (pool_size[1] - 1) - 1)/pool_size[1] + 1 + last_layer_size = nb_filter*4*labcounts*avgpool2_1_size + self.fc = nn.Linear(last_layer_size, hidden_size) + self.drop2 = nn.Dropout(p=0.5) + self.relu1 = nn.ReLU() + self.fc2 = nn.Linear(hidden_size, num_classes) + + def make_layer(self, block, out_channels, blocks, stride=1, kernel_size = (1, 3), in_channels = 16): + downsample = None + if (stride != 1) or (self.in_channels != out_channels): + downsample = nn.Sequential( + convR(in_channels, out_channels, kernel_size = kernel_size, stride=stride), + nn.BatchNorm2d(out_channels)) + layers = [] + layers.append(block(in_channels, out_channels, kernel_size = kernel_size, stride = stride, downsample = downsample)) + self.in_channels = out_channels + for i in range(1, blocks): + layers.append(block(out_channels, out_channels, kernel_size = kernel_size)) + return nn.Sequential(*layers) + + def forward(self, x): + x = x.view(x.size(0), 1, x.size(1), x.size(2)) + out = self.conv(x) + out = self.bn(out) + out = self.relu(out) + out = self.layer1(out) + out = self.layer2(out) + out = self.layer3(out) + out = self.avg_pool(out) + out = out.view(out.size(0), -1) + out = self.fc(out) + out = self.drop2(out) + out = self.relu1(out) + out = self.fc2(out) + out = torch.sigmoid(out) + return out + + def predict_proba(self, x): + if type(x) is np.ndarray: + x = torch.from_numpy(x.astype(np.float32)) + with torch.no_grad(): + x = Variable(x) + if torch.cuda.is_available(): + x = x.cuda() + y = self.forward(x) + temp = y.data.cpu().numpy() + return temp + + +class GRU(nn.Module): + """ + It is a deep network with one GRU layer, which are further fed into one fully connected layers. 
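# The core of ResidualBlock above, reduced to a self-contained sketch: the
# block's input is added back onto its convolved output before the final ReLU,
# which is the shortcut that keeps deep stacks trainable. The channel count,
# kernel and (batch, filters, labs, time) shape below are assumptions chosen
# for illustration.
import torch
import torch.nn as nn
import torch.nn.functional as F

conv = nn.Conv2d(16, 16, kernel_size=(1, 3), padding=(0, 1), bias=False)
bn = nn.BatchNorm2d(16)

def residual_step(x):
    # out = ReLU(BN(Conv(x)) + x); the identity shortcut preserves the input.
    return F.relu(bn(conv(x)) + x)

x = torch.randn(4, 16, 1, 36)
print(residual_step(x).shape)   # torch.Size([4, 16, 1, 36])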
+ """ + def __init__(self, input_size, hidden_size, num_layers, num_classes = 2, dropout = 0.5): + super(GRU, self).__init__() + + self.hidden_size = hidden_size + self.num_layers = num_layers + self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first = True, dropout = dropout) + self.linear = nn.Linear(hidden_size, num_classes) + + def forward(self, x): + if torch.cuda.is_available(): + x = x.cuda() + h0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)).cuda() # 2 for bidirection + else: + h0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)) # 2 for bidirection + self.gru.flatten_parameters() + out, hn = self.gru(x, h0) + + rearranged = hn[-1] + out = self.linear(rearranged) + out = torch.sigmoid(out) + return out + + def initHidden(self, N): + return Variable(torch.randn(1, N, self.hidden_size)) + + def predict_proba(self, x): + if type(x) is np.ndarray: + x = torch.from_numpy(x.astype(np.float32)) + with torch.no_grad(): + x = Variable(x) + if torch.cuda.is_available(): + x = x.cuda() + y = self.forward(x) + temp = y.data.cpu().numpy() + return temp + + +class RNN(nn.Module): + """ + It is a deep network with one LSTM layer, which are further fed into one fully connected layer. + """ + def __init__(self, input_size, hidden_size, num_layers, num_classes = 2, dropout = 0.5): + super(RNN, self).__init__() + self.hidden_size = hidden_size + self.num_layers = num_layers + self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first = True, dropout = dropout) + self.fc = nn.Linear(hidden_size, num_classes) + + def forward(self, x): + if torch.cuda.is_available(): + x = x.cuda() + h0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)).cuda() + c0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)).cuda() + else: + h0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)) + c0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)) + self.lstm.flatten_parameters() + out, hn = self.lstm(x, (h0, c0)) + rearranged = hn[0][-1] + # Decode hidden state of last time step + out = self.fc(rearranged) + out = torch.sigmoid(out) + return out + + def predict_proba(self, x): + if type(x) is np.ndarray: + x = torch.from_numpy(x.astype(np.float32)) + with torch.no_grad(): + x = Variable(x) + if torch.cuda.is_available(): + x = x.cuda() + y = self.forward(x) + temp = y.data.cpu().numpy() + return temp + + +class BiRNN(nn.Module): + """ + It is a deep network with one bidirectional LSTM layer, which are further fed into one fully connected layer. 
+ """ + def __init__(self, input_size, hidden_size, num_layers, num_classes = 2, dropout = 0.5): + super(BiRNN, self).__init__() + self.hidden_size = hidden_size + self.num_layers = num_layers + self.lstm = nn.LSTM(input_size, hidden_size, num_layers, + batch_first = True, dropout = dropout, bidirectional=True) + self.fc = nn.Linear(hidden_size*2, num_classes) # 2 for bidirection + + def forward(self, x): + if torch.cuda.is_available(): + x = x.cuda() + h0 = Variable(torch.zeros(self.num_layers*2, x.size(0), self.hidden_size)).cuda() # 2 for bidirection + c0 = Variable(torch.zeros(self.num_layers*2, x.size(0), self.hidden_size)).cuda() + else: + h0 = Variable(torch.zeros(self.num_layers*2, x.size(0), self.hidden_size)) # 2 for bidirection + c0 = Variable(torch.zeros(self.num_layers*2, x.size(0), self.hidden_size)) + self.lstm.flatten_parameters() + out, hn = self.lstm(x, (h0, c0)) + hn = hn[0] + + rearranged = hn[-2:].view(x.size(0), -1) + # Decode hidden state of last time step + out = self.fc(rearranged) + out = torch.sigmoid(out) + return out + + def predict_proba(self, x): + if type(x) is np.ndarray: + x = torch.from_numpy(x.astype(np.float32)) + with torch.no_grad(): + x = Variable(x) + if torch.cuda.is_available(): + x = x.cuda() + y = self.forward(x) + temp = y.data.cpu().numpy() + return temp + + +# select model +if __name__ == "__main__": + if model_type in ['LogisticRegression', 'MLP', 'SNN']: + y = population[:, 1] + X = plpData[population[:, 0], :] + trainInds = population[:, population.shape[1] - 1] > 0 + if class_weight == -1: + loss = tu.FocalLoss(gamma = 5) + else: + if class_weight == 0: + weights = float(np.count_nonzero(y))/y.shape[0] + class_weight = [1 - weights, weights] + else: + class_weight = [class_weight, 1] + class_weight = 1/torch.Tensor(class_weight) + if torch.cuda.is_available(): + class_weight = class_weight.cuda() + loss=nn.CrossEntropyLoss(weight = class_weight) + + print("Dataset has %s rows and %s columns" % (X.shape[0], X.shape[1])) + print("population loaded- %s rows and %s columns" % (np.shape(population)[0], np.shape(population)[1])) + ########################################################################### + l1regularization = False + + if train: + pred_size = int(np.sum(population[:, population.shape[1] - 1] > 0)) + print("Calculating prediction for train set of size %s" % (pred_size)) + test_pred = np.zeros(pred_size) # zeros length sum(population[:,population.size[1]] ==i) + for i in range(1, int(np.max(population[:, population.shape[1] - 1]) + 1), 1): + testInd = population[population[:, population.shape[1] - 1] > 0, population.shape[1] - 1] == i + trainInd = (population[population[:, population.shape[1] - 1] > 0, population.shape[1] - 1] != i) + train_x = X[trainInds, :][trainInd, :] + train_y = y[trainInds][trainInd] + + test_x = X[trainInds, :][testInd, :] + print("Fold %s split %s in train set and %s in test set" % (i, train_x.shape[0], test_x.shape[0])) + print("Train set contains %s outcomes " % (np.sum(train_y))) + train_x = train_x.toarray() + test_x = test_x.toarray() + + if autoencoder: + print('first train stakced autoencoder') + encoding_size = 256 + if vae: + auto_model = VAE(input_size=train_x.shape[1], encoding_size=encoding_size) + else: + auto_model = AutoEncoder(input_size=train_x.shape[1], encoding_size=encoding_size) + if torch.cuda.is_available(): + auto_model = auto_model.cuda() + clf = tu.Estimator(auto_model) + clf.compile(optimizer=torch.optim.Adam(auto_model.parameters(), lr=1e-3, weight_decay = w_decay), + 
loss=nn.MSELoss()) + clf.fit(train_x, train_y, batch_size=32, nb_epoch=epochs, autoencoder = autoencoder, vae = vae) + #split to batch for large dataset + train_batch = tu.batch(train_x, batch_size=32) + train_x = np.array([]).reshape(0, encoding_size) + for train in train_batch: + encode_train = auto_model.get_encode_features(train) + train_x = np.concatenate((train_x, encode_train), axis=0) + #train_x = auto_model.get_encode_features(train_x.toarray()) + #test_x = auto_model.get_encode_features(test_x.toarray()) + test_batch = tu.batch(test_x, batch_size=32) + test_x = np.array([]).reshape(0, encoding_size) + for test in test_batch: + encode_Test = auto_model.get_encode_features(test) + test_x = np.concatenate((test_x, encode_Test), axis=0) + del auto_model + del clf + # train on fold + print("Training fold %s" % (i)) + start_time = timeit.default_timer() + if model_type == 'LogisticRegression': + model = LogisticRegression(train_x.shape[1]) + l1regularization = True + elif model_type == 'SNN': + model = SNN(train_x.shape[1], size) + else: + model = MLP(train_x.shape[1], size) + + if torch.cuda.is_available(): + model = model.cuda() + clf = tu.Estimator(model) + clf.compile(optimizer=torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay = w_decay), + loss=loss) + #if not autoencoder: + # train_x = train_x.toarray() + # test_x = test_x.toarray() + + clf.fit(train_x, train_y, batch_size=32, nb_epoch=epochs, l1regularization = l1regularization) + + ind = (population[:, population.shape[1] - 1] > 0) + ind = population[ind, population.shape[1] - 1] == i + + test_input_var = torch.from_numpy(test_x.astype(np.float32)) + + test_batch = tu.batch(test_x, batch_size = 32) + temp = [] + for test in test_batch: + pred_test1 = model.predict_proba(test)[:, 1] + temp = np.concatenate((temp, pred_test1), axis = 0) + #temp = model.predict_proba(test_input_var)[:, 1] + #temp = preds.data.cpu().numpy().flatten() + #print temp + test_pred[ind] = temp + print("Prediction complete: %s rows " % (np.shape(test_pred[ind])[0])) + print("Mean: %s prediction value" % (np.mean(test_pred[ind]))) + + # merge pred with indexes[testInd,:] + test_pred.shape = (population[population[:, population.shape[1] - 1] > 0, :].shape[0], 1) + prediction = np.append(population[population[:, population.shape[1] - 1] > 0, :], test_pred, axis=1) + + # train final: + else: + print("Training final neural network model on all train data...") + print("X- %s rows and Y %s length" % (X[trainInds, :].shape[0], y[trainInds].shape[0])) + + start_time = timeit.default_timer() + + train_x = X[trainInds, :] + train_x = train_x.toarray() + train_y = y[trainInds] + if not os.path.exists(modelOutput): + os.makedirs(modelOutput) + if autoencoder: + encoding_size = 256 + if vae: + auto_model = VAE(input_size=train_x.shape[1], encoding_size=encoding_size) + else: + auto_model = AutoEncoder(input_size=train_x.shape[1], encoding_size=encoding_size) + if torch.cuda.is_available(): + auto_model = auto_model.cuda() + clf = tu.Estimator(auto_model) + clf.compile(optimizer=torch.optim.Adam(auto_model.parameters(), lr=1e-3, weight_decay=w_decay), + loss=nn.MSELoss()) + + clf.fit(train_x, train_y, batch_size=32, nb_epoch=epochs, autoencoder=autoencoder, vae = vae) + #train_x = auto_model.get_encode_features(train_x.toarray()) + train_batch = tu.batch(train_x, batch_size=32) + train_x = np.array([]).reshape(0, encoding_size) + for train in train_batch: + encode_train = auto_model.get_encode_features(train) + train_x = np.concatenate((train_x, encode_train), 
axis=0) + joblib.dump(auto_model, os.path.join(modelOutput, 'autoencoder_model.pkl')) + del auto_model + del clf + + print('the final parameter epochs %.2f weight_decay %.2f' %(epochs,w_decay)) + if model_type == 'LogisticRegression': + model = LogisticRegression(train_x.shape[1]) + l1regularization = True + elif model_type == 'SNN': + model = SNN(train_x.shape[1], size) + else: + model = MLP(train_x.shape[1], size) + + #if not autoencoder: + # train_x = train_x.toarray() + + if torch.cuda.is_available(): + model = model.cuda() + clf = tu.Estimator(model) + clf.compile(optimizer=torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay = w_decay), + loss=loss) + clf.fit(train_x, train_y, batch_size=32, nb_epoch=epochs, l1regularization = l1regularization) + + end_time = timeit.default_timer() + print("Training final took: %.2f s" % (end_time - start_time)) + + # save the model: + print("Model saved to: %s" % (modelOutput)) + + joblib.dump(model, os.path.join(modelOutput,'model.pkl')) + + + elif model_type in ['CNN', 'RNN', 'CNN_LSTM', 'CNN_MLF', 'CNN_MIX', 'GRU', 'BiRNN', 'CNN_MULTI', 'ResNet']: + #print 'running model', model_type + y = population[:, 1] + #plpData = plpData[population[:, 0], :] + #config = tf.ConfigProto() + #config.gpu_options.allow_growth = True + #with tf.Session(config=config) as sess: + # X = tf.sparse_reorder(plpData) + # X = tf.sparse_tensor_to_dense(X) + # X = sess.run(X) + #tu.forward_impute_missing_value(X) + X = plpData.to_dense().numpy() + X = X[np.int64(population[:, 0]), :] + ''' + p_ids_in_cov = set(covariates[:, 0]) + full_covariates = np.array([]).reshape(0,4) + default_covid = covariates[0, 1] + timeid_len = len(set(covariates[:, -2])) + for p_id in population[:, 0]: + if p_id not in p_ids_in_cov: + tmp_x = np.array([p_id, default_covid, 1, 0]).reshape(1,4) #default cov id, timeid=1 + full_covariates = np.concatenate((full_covariates, tmp_x), axis=0) + else: + tmp_x = covariates[covariates[:, 0] == p_id, :] + full_covariates = np.concatenate((full_covariates, tmp_x), axis=0) + + trainInds = population[:, population.shape[1] - 1] > 0 + X, patient_keys = tu.convert_to_temporal_format(full_covariates, timeid_len= timeid_len) + full_covariates = [] + ''' + if class_weight == -1: + loss = tu.FocalLoss(gamma = 3) + else: + if class_weight == 0: + weights = float(np.count_nonzero(y))/y.shape[0] + class_weight = [1 - weights, weights] + else: + class_weight = [class_weight, 1] + class_weight = 1/torch.Tensor(class_weight) + if torch.cuda.is_available(): + class_weight = class_weight.cuda() + loss=nn.CrossEntropyLoss(weight = class_weight) + trainInds = population[:, population.shape[1] - 1] > 0 + if train: + pred_size = int(np.sum(population[:, population.shape[1] - 1] > 0)) + print("Calculating prediction for train set of size %s" % (pred_size)) + test_pred = np.zeros(pred_size) # zeros length sum(population[:,population.size[1]] ==i) + for i in range(1, int(np.max(population[:, population.shape[1] - 1]) + 1), 1): + testInd = population[population[:, population.shape[1] - 1] > 0, population.shape[1] - 1] == i + trainInd = (population[population[:, population.shape[1] - 1] > 0, population.shape[1] - 1] != i) + train_x = X[trainInds, :][trainInd, :] + train_y = y[trainInds][trainInd] + + test_x = X[trainInds, :][testInd, :] + print("Fold %s split %s in train set and %s in test set" % (i, train_x.shape[0], test_x.shape[0])) + print("Train set contains %s outcomes " % (np.sum(train_y))) + + # train on fold + learning_rate = 0.001 + print("Training fold %s" % (i)) 
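# The class-weight heuristic used above (in both the non-temporal and temporal
# branches), written out on toy numbers: when class_weight == 0 the observed
# outcome rate defines the weights and CrossEntropyLoss receives their
# reciprocals, so the rarer outcome class contributes more to the loss.
import numpy as np
import torch
import torch.nn as nn

y_toy = np.array([0, 0, 0, 0, 1])                          # 20% outcomes
pos_rate = float(np.count_nonzero(y_toy)) / y_toy.shape[0]
weights = 1 / torch.Tensor([1 - pos_rate, pos_rate])       # tensor([1.25, 5.])
weighted_loss = nn.CrossEntropyLoss(weight=weights)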
+ start_time = timeit.default_timer() + if model_type == 'CNN': + model = CNN(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) + elif model_type == 'CNN_LSTM': + model = CNN_LSTM(nb_filter = nbfilters, labcounts=train_x.shape[1], window_size=train_x.shape[2]) + elif model_type == 'CNN_MLF': # multiple kernels with different size + model = CNN_MLF(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) + elif model_type == 'CNN_MIX': # mixed model from deepDiagnosis + model = CNN_MIX(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) + elif model_type == 'CNN_MULTI': # multiple resolution model from deepDiagnosis + model = CNN_MULTI(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) + elif model_type == 'ResNet': + print('train ResNet') + model = ResNet(ResidualBlock, [3, 3, 3], nb_filter=nbfilters, labcounts=train_x.shape[1], window_size=train_x.shape[2]) + elif model_type == 'RNN': + model = RNN(train_x.shape[2], hidden_size, 2, 2) + elif model_type == 'BiRNN': + model = BiRNN(train_x.shape[2], hidden_size, 2, 2) + elif model_type == 'GRU': + model = GRU(train_x.shape[2], hidden_size, 2, 2) + else: + print('temproal data not supported by this model') + + if torch.cuda.is_available(): + model = model.cuda() + clf = tu.Estimator(model) + clf.compile(optimizer=torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay = 0.0001), + loss=loss) + + clf.fit(train_x, train_y, batch_size=32, nb_epoch=epochs) + + ind = (population[:, population.shape[1] - 1] > 0) + ind = population[ind, population.shape[1] - 1] == i + + test_batch = tu.batch(test_x, batch_size = 32) + temp = [] + for test in test_batch: + pred_test1 = model.predict_proba(test)[:, 1] + temp = np.concatenate((temp, pred_test1), axis = 0) + + test_pred[ind] = temp + del model + print("Prediction complete: %s rows " % (np.shape(test_pred[ind])[0])) + print("Mean: %s prediction value" % (np.mean(test_pred[ind]))) + + # merge pred with indexes[testInd,:] + test_pred.shape = (population[population[:, population.shape[1] - 1] > 0, :].shape[0], 1) + prediction = np.append(population[population[:, population.shape[1] - 1] > 0, :], test_pred, axis=1) + + # train final: + else: + print("Training final neural network model on all train data...") + print("X- %s rows and Y %s length" % (X[trainInds, :].shape[0], y[trainInds].shape[0])) + + start_time = timeit.default_timer() + + train_x = X[trainInds, :] + train_y = y[trainInds] + learning_rate = 0.001 + + if model_type == 'CNN': + model = CNN(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) + elif model_type == 'CNN_LSTM': + model = CNN_LSTM(nb_filter=nbfilters, labcounts=train_x.shape[1], window_size=train_x.shape[2]) + elif model_type == 'CNN_MLF': # multiple kernels with different size + model = CNN_MLF(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) + elif model_type == 'CNN_MIX': #mixed model from deepDiagnosis + model = CNN_MIX(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) + elif model_type == 'CNN_MULTI': # multi resolution model from deepDiagnosis + model = CNN_MULTI(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) + elif model_type == 'ResNet': + model = ResNet(ResidualBlock, [3, 3, 3], nb_filter=nbfilters, labcounts=train_x.shape[1], window_size=train_x.shape[2]) + elif model_type == 'RNN': + model = 
RNN(train_x.shape[2], hidden_size, 2, 2) + elif model_type == 'BiRNN': + model = BiRNN(train_x.shape[2], hidden_size, 2, 2) + elif model_type == 'GRU': + model = GRU(train_x.shape[2], hidden_size, 2, 2) + else: + print('temproal data not supported by this model') + + if torch.cuda.is_available(): + model = model.cuda() + clf = tu.Estimator(model) + clf.compile(optimizer=torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay = 0.0001), + loss=loss) + clf.fit(train_x, train_y, batch_size=32, nb_epoch=epochs) + + end_time = timeit.default_timer() + print("Training final took: %.2f s" % (end_time - start_time)) + + # save the model: + if not os.path.exists(modelOutput): + os.makedirs(modelOutput) + print("Model saved to: %s" % (modelOutput)) + + joblib.dump(model, os.path.join(modelOutput,'model.pkl')) + + # prediction on train: + test_batch = tu.batch(train_x, batch_size = 32) + test_pred = [] + for test in test_batch: + pred_test1 = model.predict_proba(test)[:, 1] + test_pred = np.concatenate((test_pred, pred_test1), axis = 0) + test_pred.shape = (population.shape[0], 1) + prediction = np.append(population, test_pred, axis=1) + +''' +if __name__ == "__main__": + DATA_SIZE = 1000 + INPUT_SIZE = 36 + HIDDEN_SIZE = 100 + class_size = 2 + #X = np.random.randn(DATA_SIZE * class_size, 18, INPUT_SIZE) + X = np.random.randn(DATA_SIZE * class_size, INPUT_SIZE) + y = np.array([i for i in range(class_size) for _ in range(DATA_SIZE)]) + + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2) + model = LogisticRegression(X_train.shape[1]) + l1regularization = True + #model = CNN_LSTM(nb_filter = 16, labcounts = X.shape[1], window_size = X.shape[2]) + #model = ResNet(ResidualBlock, [3, 3, 3], nb_filter = 16, labcounts = X.shape[1], window_size = X.shape[2]) + #model = RNN(INPUT_SIZE, HIDDEN_SIZE, 2, class_size) + #pdb.set_trace() + if cuda: + model = model.cuda() + clf = Estimator(model) + clf.compile(optimizer=torch.optim.Adam(model.parameters(), lr=1e-4), + loss=nn.CrossEntropyLoss()) + clf.fit(X_train, y_train, batch_size=64, nb_epoch=10, + validation_data=(X_test, y_test), l1regularization = l1regularization) + score, auc = clf.evaluate(X_test, y_test) + + print('Test score: %s' %(auc)) +''' diff --git a/inst/python/deepTorchFunctions.py b/inst/python/deepTorchFunctions.py new file mode 100644 index 0000000..ff2f6f0 --- /dev/null +++ b/inst/python/deepTorchFunctions.py @@ -0,0 +1,1732 @@ +""" + deepTorch.py: It implements different deep learning classifiers + + Copyright 2016 Observational Health Data Sciences and Informatics + + This file is part of PatientLevelPrediction + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+""" + +import sys +import os +import pdb +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.autograd import Variable +from torch.utils.data import DataLoader, TensorDataset +from torch.nn.utils.rnn import pack_padded_sequence +from torch.nn.utils.rnn import pad_packed_sequence +from collections import OrderedDict +import timeit +import joblib +from sklearn.model_selection import train_test_split +from sklearn.metrics import roc_auc_score +import numpy as np + +import warnings +warnings.filterwarnings("ignore") + +class FocalLoss(nn.Module): + """ + Method to handle data imbalance based on paper (arXiv:1708.02002) entitled + Focal loss for dense object detection. + Loss(x, class) = - (1-softmax(x)[class])^gamma \log(softmax(x)[class]) + + """ + + def __init__(self, gamma=5, eps=1e-7, size_average=False): + super(FocalLoss, self).__init__() + self.gamma = gamma + self.eps = eps + self.size_average = size_average + + def forward(self, input, target): + y = self.one_hot(target, input.size(-1)) + logit = F.softmax(input) + logit = logit.clamp(self.eps, 1. - self.eps) + + loss = -1 * y * torch.log(logit) # cross entropy + loss = loss * (1 - logit) ** self.gamma # focal loss + + if self.size_average: + loss = loss.mean() + else: + loss = loss.sum() + return loss + + def one_hot(self, index, classes): + """ + + :param index: is the labels + :param classes: number if classes + :return: + """ + size = index.size() + (classes,) + view = index.size() + (1,) + + mask = torch.Tensor(*size).fill_(0) + index = index.view(*view) + ones = 1. + + if isinstance(index, Variable): + ones = Variable(torch.Tensor(index.size()).fill_(1)) + mask = Variable(mask, volatile=index.volatile) + if torch.cuda.is_available(): + ones = ones.cuda() + mask = mask.cuda() + + return mask.scatter_(1, index, ones) + +def loss_function(recon_x, x, mu, logvar): + """Loss function for varational autoencoder VAE""" + BCE = F.binary_cross_entropy(recon_x, x, size_average=False) + + # 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2) + KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp()) + + return BCE + KLD + + +def mixup_data(x, y, alpha=1.0): + + '''Compute the mixup data. Return mixed inputs, pairs of targets, and lambda + Data Augmentation method based on paper (arXiv:1710.09412) entitled + mixup: Beyond empirical risk minimization. + ''' + if alpha > 0.: + lam = np.random.beta(alpha, alpha) + else: + lam = 1. + batch_size = x.size()[0] + if torch.cuda.is_available(): + index = torch.randperm(batch_size).cuda() + else: + index = torch.randperm(batch_size) + + mixed_x = lam * x + (1 - lam) * x[index,:] + y_a, y_b = y, y[index] + return mixed_x, y_a, y_b, lam + +def mixup_criterion(y_a, y_b, lam): + return lambda criterion, pred: lam * criterion(pred, y_a) + (1 - lam) * criterion(pred, y_b) + +def early_stop(metrics_hist, patience = 3): + if not np.all(np.isnan(metrics_hist)): + return np.nanargmin(metrics_hist) > len(metrics_hist) - patience + else: + #keep training if criterion results have all been nan so far + return False + + +class Estimator(object): + """ + It is used for training different deep models in the same interface. 
+ """ + + def __init__(self, model): + self.model = model + + def compile(self, optimizer, loss): + self.optimizer = optimizer + self.loss_f = loss + + def _fit(self, train_loader, l1regularization=False, autoencoder=False, mixup=False, vae = False): + """ + train one epoch + + :param train_loader: The data loaded using DataLoader + :param l1regularization: default False + :return: the return fitted loss and accuracy + """ + loss_list = [] + acc_list = [] + for idx, (X, y) in enumerate(train_loader): + X_v = Variable(X) + y_v = Variable(y) + if torch.cuda.is_available(): + X_v = X_v.cuda() + y_v = y_v.cuda() + + if mixup: + X_v, y_v_a, y_v_b, lam = mixup_data(X_v, y_v) + X_v, y_v_a, y_v_b = Variable(X_v), Variable(y_v_a), Variable(y_v_b) + + # print 'GPU id', torch.cuda.current_device() + self.optimizer.zero_grad() + # the below comemnted lines are used for multiple GPU training + # if torch.cuda.device_count() > 1: + # net = torch.nn.DataParallel(self.model, device_ids = range(torch.cuda.device_count())) + # if cuda: + # net = net.cuda() + # y_pred = net(X_v) + if autoencoder: + if vae: + y_pred, mu, logvar = self.model(X_v) + loss = loss_function(y_pred, X_v, mu, logvar) + else: + y_pred = self.model(X_v) + loss = self.loss_f(y_pred, X_v) + else: + y_pred = self.model(X_v) + + loss = self.loss_f(y_pred, y_v) + + if mixup: + loss_func = mixup_criterion(y_v_a, y_v_b, lam) + loss = loss_func(self.loss_f, y_pred) + + if l1regularization: + l1_crit = nn.L1Loss(size_average=False) + reg_loss = 0 + for param in self.model.parameters(): + target = Variable(torch.from_numpy(np.zeros(param.size()).astype(np.float32))) + if torch.cuda.is_available(): + target = target.cuda() + reg_loss += l1_crit(param, target) + + factor = 0.0005 + loss += factor * reg_loss + + loss.backward() + self.optimizer.step() + loss_list.append(loss.item()) + if autoencoder: + acc_list.append(0) + else: + classes = torch.topk(y_pred, 1)[1].data.cpu().numpy().flatten() + acc = self._accuracy(classes, y_v.data.cpu().numpy().flatten()) + acc_list.append(acc) + del loss + del y_pred + + return sum(loss_list) / len(loss_list), sum(acc_list) / len(acc_list) + + def fit(self, X, y, batch_size=32, nb_epoch=10, validation_data=(), l1regularization=False, autoencoder =False, vae = False): + train_set = TensorDataset(torch.from_numpy(X.astype(np.float32)), + torch.from_numpy(y.astype(np.float32)).long().view(-1)) + train_loader = DataLoader(dataset=train_set, batch_size=batch_size, shuffle=True) + self.model.train() + for t in range(nb_epoch): + loss, acc = self._fit(train_loader, l1regularization=l1regularization, autoencoder = autoencoder, vae = vae) + #print loss + val_log = '' + if validation_data and not autoencoder: + val_loss, auc = self.evaluate(validation_data[0], validation_data[1], batch_size) + + val_log = "- val_loss: %06.4f - auc: %6.4f" % (val_loss, auc) + print(val_log) + # print("Epoch %s/%s loss: %06.4f - acc: %06.4f %s" % (t, nb_epoch, loss, acc, val_log)) + + def evaluate(self, X, y, batch_size=32): + y_pred = self.predict(X) + y_v = Variable(torch.from_numpy(y).long(), requires_grad=False) + if torch.cuda.is_available(): + y_v = y_v.cuda() + loss = self.loss_f(y_pred, y_v) + predict = y_pred.data.cpu().numpy()[:, 1].flatten() + auc = roc_auc_score(y, predict) + return loss.item(), auc + + def _accuracy(self, y_pred, y): + return float(sum(y_pred == y)) / y.shape[0] + + def predict(self, X): + X = Variable(torch.from_numpy(X.astype(np.float32))) + if torch.cuda.is_available(): + X = X.cuda() + y_pred = 
self.model(X) + return y_pred + + def predict_proba(self, X): + self.model.eval() + return self.model.predict_proba(X) + +class EarlyStopping(object): # pylint: disable=R0902 + """ + Gives a criterion to stop training when a given metric is not + improving anymore + Args: + mode (str): One of `min`, `max`. In `min` mode, training will + be stopped when the quantity monitored has stopped + decreasing; in `max` mode it will be stopped when the + quantity monitored has stopped increasing. Default: 'min'. + patience (int): Number of epochs with no improvement after + which training is stopped. For example, if + `patience = 2`, then we will ignore the first 2 epochs + with no improvement, and will only stop learning after the + 3rd epoch if the loss still hasn't improved then. + Default: 10. + threshold (float): Threshold for measuring the new optimum, + to only focus on significant changes. Default: 1e-4. + threshold_mode (str): One of `rel`, `abs`. In `rel` mode, + dynamic_threshold = best * ( 1 + threshold ) in 'max' + mode or best * ( 1 - threshold ) in `min` mode. + In `abs` mode, dynamic_threshold = best + threshold in + `max` mode or best - threshold in `min` mode. Default: 'rel'. + """ + + def __init__(self, mode='min', patience=3, threshold=1e-4, threshold_mode='rel'): + self.patience = patience + self.mode = mode + self.threshold = threshold + self.threshold_mode = threshold_mode + self.best = None + self.num_bad_epochs = None + self.mode_worse = None # the worse value for the chosen mode + self.is_better = None + self.last_epoch = -1 + self._init_is_better(mode=mode, threshold=threshold, + threshold_mode=threshold_mode) + self._reset() + + def _reset(self): + """Resets num_bad_epochs counter and cooldown counter.""" + self.best = self.mode_worse + self.num_bad_epochs = 0 + + def step(self, metrics, epoch=None): + """ Updates early stopping state """ + current = metrics + if epoch is None: + epoch = self.last_epoch = self.last_epoch + 1 + self.last_epoch = epoch + + if self.is_better(current, self.best): + self.best = current + self.num_bad_epochs = 0 + else: + self.num_bad_epochs += 1 + + @property + def stop(self): + """ Should we stop learning? """ + return self.num_bad_epochs > self.patience + + def _cmp(self, mode, threshold_mode, threshold, a, best): # pylint: disable=R0913, R0201 + if mode == 'min' and threshold_mode == 'rel': + rel_epsilon = 1. - threshold + return a < best * rel_epsilon + + elif mode == 'min' and threshold_mode == 'abs': + return a < best - threshold + + elif mode == 'max' and threshold_mode == 'rel': + rel_epsilon = threshold + 1. 
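# A minimal end-to-end use of the Estimator wrapper above on synthetic data,
# mirroring how the training script drives it: compile with an optimizer and a
# loss, then fit on numpy arrays. The network, sizes and data are assumptions
# for illustration; Estimator.predict_proba expects the wrapped model to define
# its own predict_proba (as the classes in this file do), so this plain
# nn.Sequential is scored with an explicit softmax instead.
import numpy as np
import torch
import torch.nn as nn

X_demo = np.random.randn(200, 10).astype(np.float32)
y_demo = (X_demo[:, 0] > 0).astype(np.int64)

net = nn.Sequential(nn.Linear(10, 16), nn.ReLU(), nn.Linear(16, 2))
est = Estimator(net)
est.compile(optimizer=torch.optim.Adam(net.parameters(), lr=1e-3),
            loss=nn.CrossEntropyLoss())
est.fit(X_demo, y_demo, batch_size=32, nb_epoch=2)

scores = torch.softmax(net(torch.from_numpy(X_demo)), dim=1).detach().numpy()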
+ return a > best * rel_epsilon + + return a > best + threshold + + def _init_is_better(self, mode, threshold, threshold_mode): + if mode not in {'min', 'max'}: + raise ValueError('mode ' + mode + ' is unknown!') + if threshold_mode not in {'rel', 'abs'}: + raise ValueError('threshold mode ' + threshold_mode + ' is unknown!') + + if mode == 'min': + self.mode_worse = float('inf') + else: # mode == 'max': + self.mode_worse = (-float('inf')) + + self.is_better = partial(self._cmp, mode, threshold_mode, threshold) + + def state_dict(self): + """ Returns early stopping state """ + return {key: value for key, value in self.__dict__.items() if key != 'is_better'} + + def load_state_dict(self, state_dict): + """ Loads early stopping state """ + self.__dict__.update(state_dict) + self._init_is_better(mode=self.mode, threshold=self.threshold, + threshold_mode=self.threshold_mode) + +def adjust_learning_rate(learning_rate, optimizer, epoch): + """Sets the learning rate to the initial LR decayed by 10 every 30 epochs""" + + lr = learning_rate * (0.1 ** (epoch // 10)) + + for param_group in optimizer.param_groups: + param_group['lr'] = lr + return lr + +def batch(tensor, batch_size = 50): + """ It is used to create batch samples, each batch has batch_size samples""" + tensor_list = [] + length = tensor.shape[0] + i = 0 + while True: + if (i+1) * batch_size >= length: + tensor_list.append(tensor[i * batch_size: length]) + return tensor_list + tensor_list.append(tensor[i * batch_size: (i+1) * batch_size]) + i += 1 + + +class selu(nn.Module): + def __init__(self): + super(selu, self).__init__() + self.alpha = 1.6732632423543772848170429916717 + self.scale = 1.0507009873554804934193349852946 + + def forward(self, x): + temp1 = self.scale * F.relu(x) + temp2 = self.scale * self.alpha * (F.elu(-1 * F.relu(-1 * x))) + return temp1 + temp2 + + +class alpha_drop(nn.Module): + def __init__(self, p=0.05, alpha=-1.7580993408473766, fixedPointMean=0, fixedPointVar=1): + super(alpha_drop, self).__init__() + keep_prob = 1 - p + self.a = np.sqrt( + fixedPointVar / (keep_prob * ((1 - keep_prob) * pow(alpha - fixedPointMean, 2) + fixedPointVar))) + self.b = fixedPointMean - self.a * (keep_prob * fixedPointMean + (1 - keep_prob) * alpha) + self.alpha = alpha + self.keep_prob = 1 - p + self.drop_prob = p + + def forward(self, x): + if self.keep_prob == 1 or not self.training: + # print("testing mode, direct return") + return x + else: + random_tensor = self.keep_prob + torch.rand(x.size()) + + binary_tensor = Variable(torch.floor(random_tensor)) + + if torch.cuda.is_available(): + binary_tensor = binary_tensor.cuda() + + x = x.mul(binary_tensor) + ret = x + self.alpha * (1 - binary_tensor) + ret.mul_(self.a).add_(self.b) + return ret + +def convert_to_3d_matrix(covariate_ids, patient_dict, y_dict = None, timeid_len = 31, cov_mean_dict = None): + """ + create matrix for temporal models. 
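# Typical use of the EarlyStopping helper above: feed it the validation loss
# once per epoch and stop once it has not improved for `patience` epochs. Note
# that _init_is_better calls functools.partial, which is not among this file's
# imports, so partial must be imported in this module for the class to be
# usable. The loss values below are made up for illustration.
from functools import partial  # required by EarlyStopping._init_is_better

stopper = EarlyStopping(mode='min', patience=3)
for epoch, val_loss in enumerate([0.70, 0.65, 0.64, 0.66, 0.67, 0.68, 0.69]):
    stopper.step(val_loss, epoch)
    if stopper.stop:
        print("validation loss stopped improving, stopping at epoch %d" % epoch)
        break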
+ + :param covariate_ids: the covariate ids in the whole data + :param patient_dict: the dictionary contains the data for each patient + :param y_dict: if the output labels is known, it contains the labels for patients + :param timeid_len: the total number time window gaps when extracting temporal data + :return: return the raw data in 3-D format, patients x covariates x number of windows, and the patients ids + """ + D = len(covariate_ids) + N = len(patient_dict) + T = timeid_len + concept_list =list(covariate_ids) + concept_list.sort() + x_raw = np.zeros((N, D, T), dtype=float) + patient_ind = 0 + p_ids = [] + patient_keys = patient_dict.keys() + #print covariate_ids + for kk in patient_keys: + #print('-------------------') + vals = patient_dict[kk] + #sorted(vals) + p_ids.append(int(kk)) + for timeid, meas in vals.iteritems(): + int_time = int(timeid) - 1 + for val in meas: + if not len(val): + continue + cov_id, cov_val = val + if cov_id not in covariate_ids: + continue + lab_ind = concept_list.index(cov_id) + if cov_mean_dict is None: + x_raw[patient_ind][lab_ind][int_time] = float(cov_val) + else: + mean_std = cov_mean_dict[cov_id] + if mean_std[1]: + x_raw[patient_ind][lab_ind][int_time] = (float(cov_val) - mean_std[0])/mean_std[1] + else: + x_raw[patient_ind][lab_ind][int_time] = float(cov_val) + + + patient_ind = patient_ind + 1 + + #impute the data using the value of previous timestamp + #fw = open('patient_var.txt', 'w') + for i in xrange(N): + for j in xrange(D): + temp = x_raw[i][j] + nonzero_inds = np.nonzero(temp)[0] + count_nonzeros = len(nonzero_inds) + #fw.write(str(i) + '\t' + str(count_nonzeros) + '\n') + if count_nonzeros == 1: + ind = nonzero_inds[0] + for k in xrange(ind + 1, T): + x_raw[i][j][k] = x_raw[i][j][ind] + elif count_nonzeros > 1: + for ind in xrange(1, count_nonzeros): + for k in xrange(nonzero_inds[ind -1] + 1, nonzero_inds[ind]): + x_raw[i][j][k] = x_raw[i][j][nonzero_inds[ind - 1]] + # For last nonzeros. + for k in xrange(nonzero_inds[-1] + 1, T): + x_raw[i][j][k] = x_raw[i][j][nonzero_inds[-1]] + + #fw.close() + + return x_raw, patient_keys + +def forward_impute_missing_value(x_raw): + N = x_raw.shape[0] + D = x_raw.shape[1] + T = x_raw.shape[2] + for i in xrange(N): + for j in xrange(D): + temp = x_raw[i][j] + nonzero_inds = np.nonzero(temp)[0] + count_nonzeros = len(nonzero_inds) + #fw.write(str(i) + '\t' + str(count_nonzeros) + '\n') + if count_nonzeros == 1: + ind = nonzero_inds[0] + for k in xrange(ind + 1, T): + x_raw[i][j][k] = x_raw[i][j][ind] + elif count_nonzeros > 1: + for ind in xrange(1, count_nonzeros): + for k in xrange(nonzero_inds[ind -1] + 1, nonzero_inds[ind]): + x_raw[i][j][k] = x_raw[i][j][nonzero_inds[ind - 1]] + # For last nonzeros. 
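# What the imputation loops above do for one covariate series: carry the last
# non-zero observation forward to later time steps, leaving leading zeros
# untouched (note that xrange/iteritems are Python 2; under Python 3 these
# loops need range and items). A standalone sketch of the same rule:
import numpy as np

def locf(series):
    """Carry the last non-zero value forward along a 1-D time series."""
    out = series.copy()
    last = 0.0
    for t in range(out.shape[0]):
        if out[t] != 0:
            last = out[t]
        elif last != 0:
            out[t] = last
    return out

print(locf(np.array([0., 2., 0., 0., 5., 0.])))  # [0. 2. 2. 2. 5. 5.]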
+ for k in xrange(nonzero_inds[-1] + 1, T): + x_raw[i][j][k] = x_raw[i][j][nonzero_inds[-1]] + + +def convert_to_temporal_format(covariates, timeid_len= 31, normalize = True, predict = False): + """ + It reads the data from covariates extracted by FeatureExtraction package and convert it to temporal data matrix + + :param covariates: covariates extracted by FeatureExtraction package + :param timeid_len: the total number of window gaps when extracting temporal data + :return: return the raw data in 3-D format, patients x covariates x number of windows, and the patients ids + """ + patient_dict = OrderedDict() + print('Loading temporal data') + cov_vals_dict = {} + for row in covariates: + p_id, cov_id, time_id, cov_val = row[0], row[1], row[2], row[3] + cov_id = np.int64(cov_id) + #time_id = int(time_id) + cov_vals_dict.setdefault(cov_id, []).append(float(cov_val)) + if p_id not in patient_dict: + patient_dict[p_id] = {time_id: [(cov_id, cov_val)]} + else: + if time_id not in patient_dict[p_id]: + patient_dict[p_id][time_id] = [(cov_id, cov_val)] + else: + patient_dict[p_id][time_id].append((cov_id, cov_val)) + #covariate_ids.add(cov_id) + #T = 365/time_window + covariate_ids = set() + cov_mean_dict = {} + if not predict: + fw = open('covariate_mean_std.csv', 'w') + for key, val in cov_vals_dict.iteritems(): + mean_val = np.mean(val) + std_val = np.std(val) + + # Remove those covariates with few occurrence (<5) + if len(val) >= 5: + covariate_ids.add(key) + cov_mean_dict[key] = (mean_val, std_val) + fw.write(str(key) + ',' + str(mean_val) + ',' + str(std_val) + '\n') + fw.close() + else: + fp = open('covariate_mean_std.csv', 'r') + for line in fp: + values = line.rstrip().split(',') + key = np.int64(values[0]) + covariate_ids.add(key) + cov_mean_dict[key] = (float(values[1]), float(values[2])) + fp.close() + + + if normalize: + x, patient_keys = convert_to_3d_matrix(covariate_ids, patient_dict, timeid_len = timeid_len, cov_mean_dict = cov_mean_dict) + else: + x, patient_keys = convert_to_3d_matrix(covariate_ids, patient_dict, timeid_len=timeid_len) + + return x, patient_keys + + +def read_covariates(covariate_file): + patient_dict = {} + head = True + with open(covariate_file, 'r') as fp: + for line in fp: + if head: + head = False + continue + values = line.rstrip().split(',') + patient_id = values[1] + cov_id = values[2] + #time_id = int(values[-1]) + # covariates in one patient has time order + patient_dict.setdefault(patient_id, []).append((cov_id)) + new_patient = [] + for key in patient_dict.keys(): + #patient_dict[key].sort() + sort_vals = [] + for val in patient_dict[key]: + if val[1] not in sort_vals: + sort_vals.append(val) + new_patient.append(sort_vals) + + return new_patient + + +def word_embeddings(covariate_file, embedding_size=50): + import gensim.models.word2vec as w2v + modelname = "processed_%s.w2v" % ('heartfailure') + sentences = read_covariates(covariate_file) + model = w2v.Word2Vec(size=embedding_size, min_count=3, workers=4, iter=10, sg=1) + print("building word2vec vocab on %s..." 
% (covariate_file)) + + model.build_vocab(sentences) + print("training...") + model.train(sentences, total_examples=model.corpus_count, epochs=model.iter) + out_file = modelname + print("writing embeddings to %s" % (out_file)) + model.save(out_file) + return out_file + +def read_data(filename): + covariate_ids = set() + patient_dict = {} + head = True + with open(filename) as fp: + for lines in fp: + if head: + head = False + continue + lines = lines.strip('\n').strip('\r').split(',') + try: + p_id, cov_id, time_id, cov_val = lines[1], lines[2], lines[3], lines[4] + except: + pdb.set_trace() + print(p_id, cov_id, time_id) + if p_id not in patient_dict: + patient_dict[p_id] = {} + else: + if time_id not in patient_dict[p_id]: + patient_dict[p_id][time_id] = [] + else: + patient_dict[p_id][time_id].append((cov_id, cov_val)) + covariate_ids.add(cov_id) + patient_dict = {k: v for k, v in patient_dict.iteritems() if v} #remove empty patients + #(15, 2000, 60) 20000 patients, 15 lab tests, + return covariate_ids, patient_dict + +def split_training_validation(classes, validation_size=0.2, shuffle=False): + """split sampels based on balnace classes""" + num_samples = len(classes) + classes = np.array(classes) + classes_unique = np.unique(classes) + num_classes = len(classes_unique) + indices = np.arange(num_samples) + # indices_folds=np.zeros([num_samples],dtype=int) + training_indice = [] + training_label = [] + validation_indice = [] + validation_label = [] + for cl in classes_unique: + indices_cl = indices[classes == cl] + num_samples_cl = len(indices_cl) + + # split this class into k parts + if shuffle: + random.shuffle(indices_cl) # in-place shuffle + + # module and residual + num_samples_each_split = int(num_samples_cl * validation_size) + res = num_samples_cl - num_samples_each_split + + training_indice = training_indice + [val for val in indices_cl[num_samples_each_split:]] + training_label = training_label + [cl] * res + + validation_indice = validation_indice + [val for val in indices_cl[:num_samples_each_split]] + validation_label = validation_label + [cl] * num_samples_each_split + + training_index = np.arange(len(training_label)) + random.shuffle(training_index) + training_indice = np.array(training_indice)[training_index] + training_label = np.array(training_label)[training_index] + + validation_index = np.arange(len(validation_label)) + random.shuffle(validation_index) + validation_indice = np.array(validation_indice)[validation_index] + validation_label = np.array(validation_label)[validation_index] + + return training_indice, training_label, validation_indice, validation_label + + +class LogisticRegression(nn.Module): + """ + Train a logistic regression model using pytorch + """ + def __init__(self, input_size, num_classes = 2): + super(LogisticRegression, self).__init__() + self.linear = nn.Linear(input_size, num_classes) + + def forward(self, x): + out = self.linear(x) + out = torch.sigmoid(out) + return out + + def predict_proba(self, x): + if type(x) is np.ndarray: + x = torch.from_numpy(x.astype(np.float32)) + with torch.no_grad(): + x = Variable(x) + if torch.cuda.is_available(): + x = x.cuda() + y = self.forward(x) + temp = y.data.cpu().numpy() + return temp + + +class MLP(nn.Module): + """ + Train a multiple-layer perceptron with one hideen layer + """ + def __init__(self, input_dim, hidden_size, num_classes = 2): + super(MLP, self).__init__() + self.fc1 = nn.Linear(input_dim, hidden_size) + self.fc2 = nn.Linear(hidden_size, num_classes) + + def forward(self, x): + x = 
F.relu(self.fc1(x)) + x = F.dropout(x, p =0.5, training=self.training) + x = self.fc2(x) + x = torch.sigmoid(x) + return x + + def predict_proba(self, x): + if type(x) is np.ndarray: + x = torch.from_numpy(x.astype(np.float32)) + with torch.no_grad(): + x = Variable(x) + if torch.cuda.is_available(): + x = x.cuda() + y = self.forward(x) + temp = y.data.cpu().numpy() + return temp + + +class SNN(nn.Module): + """ + Train a multiple-layer self normalizing neural network, ref arXiv:1706.02515 + """ + + def __init__(self, input_dim, hidden_size, num_classes=2): + super(SNN, self).__init__() + self.fc1 = nn.Linear(input_dim, hidden_size) + self.fc2 = selu() + self.ad1 = alpha_drop() + self.fc4 = nn.Linear(hidden_size, num_classes) + + def forward(self, x): + x = F.relu(self.fc1(x)) + x = F.dropout(x, p=0.5, training=self.training) + x = self.fc2(x) + x = self.ad1(x) + x = self.fc4(x) + x = torch.sigmoid(x) + return x + + def predict_proba(self, x): + if type(x) is np.ndarray: + x = torch.from_numpy(x.astype(np.float32)) + with torch.no_grad(): + x = Variable(x) + if torch.cuda.is_available(): + x = x.cuda() + y = self.forward(x) + temp = y.data.cpu().numpy() + return temp + +class AutoEncoder(nn.Module): + """ + A stacked autoencoder with 2 hiddden layers and need be adapted for EHR data. + """ + def __init__(self, input_size, encoding_size): + super(AutoEncoder, self).__init__() + + self.encoder = nn.Sequential( + nn.Linear(input_size, input_size/2), + nn.ReLU(True), + nn.Linear(input_size/2, input_size/4), + nn.ReLU(True), + nn.Linear(input_size/4, encoding_size), + nn.ReLU(True) + ) + self.decoder = nn.Sequential( + nn.Linear(encoding_size, input_size/4), + nn.ReLU(True), + nn.Linear(input_size/4, input_size/2), + nn.ReLU(True), + nn.Linear(input_size/2, input_size) + ) + + def forward(self, x): + if torch.cuda.is_available(): + x = x.cuda() + encoded = self.encoder(x) + decoded = self.decoder(encoded) + return decoded + + def get_encode_features(self, x): + if type(x) is np.ndarray: + x = torch.from_numpy(x.astype(np.float32)) + with torch.no_grad(): + x = Variable(x) + if torch.cuda.is_available(): + x = x.cuda() + encoded = self.encoder(x) + encoded = encoded.data.cpu().numpy() + return encoded + +class VAE(nn.Module): + """ + A stacked variational autoencoder with 2 hiddden layers and need be adapted for EHR data. 
+ """ + def __init__(self, input_size, encoding_size): + super(VAE, self).__init__() + + self.fc1 = nn.Linear(input_size, input_size/2) + self.fc21 = nn.Linear(input_size/2, encoding_size) + self.fc22 = nn.Linear(input_size/2, encoding_size) + self.fc3 = nn.Linear(encoding_size, input_size/2) + self.fc4 = nn.Linear(input_size/2, input_size) + + self.relu = nn.ReLU() + self.sigmoid = nn.Sigmoid() + + def encode(self, x): + h1 = self.relu(self.fc1(x)) + return self.fc21(h1), self.fc22(h1) + + def reparameterize(self, mu, logvar): + if self.training: + std = logvar.mul(0.5).exp_() + eps = Variable(std.data.new(std.size()).normal_()) + return eps.mul(std).add_(mu) + else: + return mu + + def decode(self, z): + h3 = self.relu(self.fc3(z)) + return self.sigmoid(self.fc4(h3)) + + def forward(self, x): + if torch.cuda.is_available(): + x = x.cuda() + mu, logvar = self.encode(x) + z = self.reparameterize(mu, logvar) + return self.decode(z), mu, logvar + + def get_encode_features(self, x): + if type(x) is np.ndarray: + x = torch.from_numpy(x.astype(np.float32)) + with torch.no_grad(): + x = Variable(x) + if torch.cuda.is_available(): + x = x.cuda() + mu, logvar = self.encode(x) + encoded = self.reparameterize(mu, logvar) + encoded = encoded.data.cpu().numpy() + return encoded + +class Decoder(nn.Module): + """ VAE decoder input_size = original inputsize/16*256""" + def __init__(self, latent_size, input_size, img_channels = 1, kernel_size=(1, 4), stride=(1, 2), padding=(0, 1)): + super(Decoder, self).__init__() + self.latent_size = latent_size + self.img_channels = img_channels + + self.fc1 = nn.Linear(latent_size, input_size) + self.deconv1 = nn.ConvTranspose2d(input_size, 128, kernel_size, stride=stride, padding = padding) + self.deconv2 = nn.ConvTranspose2d(128, 64, kernel_size, stride=stride, padding = padding) + self.deconv3 = nn.ConvTranspose2d(64, 32, kernel_size, stride=stride, padding = padding) + self.deconv4 = nn.ConvTranspose2d(32, img_channels, kernel_size, stride=stride, padding = padding) + + def forward(self, x): # pylint: disable=arguments-differ + x = F.relu(self.fc1(x)) + x = x.unsqueeze(-1).unsqueeze(-1) + x = F.relu(self.deconv1(x)) + x = F.relu(self.deconv2(x)) + x = F.relu(self.deconv3(x)) + reconstruction = torch.sigmoid(self.deconv4(x)) + return reconstruction + +class Encoder(nn.Module): # pylint: disable=too-many-instance-attributes + """ VAE encoder """ + def __init__(self, latent_size, input_size, img_channels = 1, kernel_size=(1, 4), stride=(1, 2), padding=(0, 1)): + super(Encoder, self).__init__() + self.latent_size = latent_size + #self.img_size = img_size + self.img_channels = img_channels + + self.conv1 = nn.Conv2d(img_channels, 32, kernel_size, stride=stride, padding = padding) + self.conv2 = nn.Conv2d(32, 64, kernel_size, stride=stride, padding = padding) + self.conv3 = nn.Conv2d(64, 128, kernel_size, stride=stride, padding = padding) + self.conv4 = nn.Conv2d(128, 256, kernel_size, stride=stride, padding = padding) + out_size = input_size / 16 + self.fc_mu = nn.Linear(out_size, latent_size) + self.fc_logsigma = nn.Linear(out_size, latent_size) + + + def forward(self, x): # pylint: disable=arguments-differ + x = F.relu(self.conv1(x)) + x = F.relu(self.conv2(x)) + x = F.relu(self.conv3(x)) + x = F.relu(self.conv4(x)) + x = x.view(x.size(0), -1) + + mu = self.fc_mu(x) + logsigma = self.fc_logsigma(x) + + return mu, logsigma + +class VAE_CNN(nn.Module): + """ Variational Autoencoder """ + def __init__(self, latent_size, input_size): + super(VAE, self).__init__() + 
self.encoder = Encoder(latent_size, input_size) + input_size = input_size/16 + self.decoder = Decoder(latent_size, input_size) + + def forward(self, x): # pylint: disable=arguments-differ + mu, logsigma = self.encoder(x) + sigma = logsigma.exp() + eps = torch.randn_like(sigma) + z = eps.mul(sigma).add_(mu) + + recon_x = self.decoder(z) + return recon_x, mu, logsigma + + def get_encode_features(self, x): + if type(x) is np.ndarray: + x = torch.from_numpy(x.astype(np.float32)) + with torch.no_grad(): + x = Variable(x) + if torch.cuda.is_available(): + x = x.cuda() + mu, logvar = self.encoder(x) + encoded = mu .data.cpu().numpy() + return encoded + +class CNN(nn.Module): + def __init__(self, nb_filter, num_classes = 2, kernel_size = (1, 5), pool_size = (1, 3), labcounts = 32, window_size = 12, hidden_size = 200, stride = (1, 1), padding = 0): + super(CNN, self).__init__() + self.layer1 = nn.Sequential( + nn.Conv2d(1, nb_filter, kernel_size, stride = stride, padding = padding), + nn.BatchNorm2d(nb_filter), + nn.ReLU(), + nn.MaxPool2d(pool_size, stride = stride)) + out1_size = (window_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 + maxpool_size = (out1_size + 2*padding - (pool_size[1] - 1) - 1)/stride[1] + 1 + self.layer2 = nn.Sequential( + nn.Conv2d(nb_filter, nb_filter, kernel_size, stride = stride, padding = padding), + nn.BatchNorm2d(nb_filter), + nn.ReLU(), + nn.MaxPool2d(pool_size, stride = stride)) + out2_size = (maxpool_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 + maxpool_size = (out2_size + 2*padding - (pool_size[1] - 1) - 1)/stride[1] + 1 + self.drop1 = nn.Dropout(p=0.5) + self.fc1 = nn.Linear(int(maxpool_size*labcounts*nb_filter), hidden_size) + self.bn = nn.BatchNorm1d(hidden_size) + self.drop2 = nn.Dropout(p=0.5) + self.relu1 = nn.ReLU() + self.fc2 = nn.Linear(hidden_size, num_classes) + + def forward(self, x): + x = x.view(x.size(0), 1, x.size(1), x.size(2)) + out = self.layer1(x) + out = self.layer2(out) + out = out.view(out.size(0), -1) + out = self.drop1(out) + out = self.fc1(out) + out = self.drop2(out) + out = self.bn(out) + out = self.relu1(out) + out = self.fc2(out) + out = torch.sigmoid(out) + return out + + def predict_proba(self, x): + if type(x) is np.ndarray: + x = torch.from_numpy(x.astype(np.float32)) + with torch.no_grad(): + x = Variable(x) + if torch.cuda.is_available(): + x = x.cuda() + y = self.forward(x) + temp = y.data.cpu().numpy() + return temp + + +#allow multiple kernel with differnt kernel size +class CNN_MLF(nn.Module): + """ + It is a deep CNNs with three different kernel size, the outputs from the three CNNs are concatenated to fed into two fully connected layers. 
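+    The three branches use (1, 3), (1, 4) and (1, 5) kernels, each followed by (1, 3) max pooling.
+    With the defaults (window_size = 12, padding = 0, stride = (1, 1)) the convolutions give
+    10, 9 and 8 time points, pooling reduces these to 8, 7 and 6, and the flattened outputs
+    (8 + 7 + 6 = 21 positions per covariate and filter) are concatenated before the first
+    fully connected layer.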
+ """ + def __init__(self, nb_filter, num_classes = 2, kernel_size = (1, 5), pool_size = (1, 3), labcounts = 32, window_size = 12, hidden_size = 200, stride = (1, 1), padding = 0): + super(CNN_MLF, self).__init__() + self.layer1 = nn.Sequential( + nn.Conv2d(1, nb_filter, kernel_size = (1, 3), stride = stride, padding = padding), + nn.BatchNorm2d(nb_filter), + nn.ReLU(), + nn.MaxPool2d(pool_size, stride = stride)) + out1_size = (window_size + 2*padding - (3 - 1) - 1)/stride[1] + 1 + maxpool1_size = (out1_size + 2*padding - (pool_size[1] - 1) - 1)/stride[1] + 1 + self.layer2 = nn.Sequential( + nn.Conv2d(1, nb_filter, kernel_size = (1, 4), stride = stride, padding = padding), + nn.BatchNorm2d(nb_filter), + nn.ReLU(), + nn.MaxPool2d(pool_size, stride = stride)) + out2_size = (window_size + 2*padding - (4 - 1) - 1)/stride[1] + 1 #4 is the convolve filter size + maxpool2_size = (out2_size + 2*padding - (pool_size[1] - 1) - 1)/stride[1] + 1 + self.layer3 = nn.Sequential( + nn.Conv2d(1, nb_filter, kernel_size = (1, 5), stride = stride, padding = padding), + nn.BatchNorm2d(nb_filter), + nn.ReLU(), + nn.MaxPool2d(pool_size, stride = stride)) + out3_size = (window_size + 2*padding - (5 - 1) - 1)/stride[1] + 1 + maxpool3_size = (out3_size + 2*padding - (pool_size[1] - 1) - 1)/stride[1] + 1 + conv_outsize = maxpool1_size + maxpool2_size +maxpool3_size + self.drop1 = nn.Dropout(p=0.5) + self.fc1 = nn.Linear(conv_outsize*labcounts*nb_filter, hidden_size) + self.drop2 = nn.Dropout(p=0.5) + self.relu1 = nn.ReLU() + self.fc2 = nn.Linear(hidden_size, num_classes) + + def forward(self, x): + x = x.view(x.size(0), 1, x.size(1), x.size(2)) + out1 = self.layer1(x) + out2 = self.layer2(x) + out3 = self.layer3(x) + out = torch.cat((out1.view(out1.size(0), -1), out2.view(out2.size(0), -1), out3.view(out2.size(0), -1)), 1) + out = self.drop1(out) + out = self.fc1(out) + out = self.drop2(out) + out = self.relu1(out) + out = self.fc2(out) + out = torch.sigmoid(out) + return out + + def predict_proba(self, x): + if type(x) is np.ndarray: + x = torch.from_numpy(x.astype(np.float32)) + with torch.no_grad(): + x = Variable(x) + if torch.cuda.is_available(): + x = x.cuda() + y = self.forward(x) + temp = y.data.cpu().numpy() + return temp + + +class CNN_LSTM(nn.Module): + """ + It is a deep network with two layer CNN, followed by LSTM layer, which further fed into two fully connected layers. 
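+    The convolution output is reduced back to a single channel by a down-sampling convolution
+    and squeezed to (batch, covariates, reduced time steps); the LSTM runs over the covariate
+    dimension and its last hidden state is passed through the fully connected layers.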
+ """ + def __init__(self, nb_filter, num_classes = 2, kernel_size = (1, 5), pool_size = (1, 3), labcounts = 32, window_size = 12, hidden_size = 100, stride = (1, 1), padding = 0, num_layers = 2): + super(CNN_LSTM, self).__init__() + self.layer1 = nn.Sequential( + nn.Conv2d(1, nb_filter, kernel_size, stride = stride, padding = padding), + nn.BatchNorm2d(nb_filter), + nn.ReLU(), + nn.MaxPool2d(pool_size, stride = stride)) + self.num_layers = num_layers + self.hidden_size = hidden_size + out1_size = (window_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 + maxpool_size = (out1_size + 2*padding - (pool_size[1] - 1) - 1)/stride[1] + 1 + self.downsample = nn.Conv2d(nb_filter, 1, kernel_size, stride = stride, padding = padding) + input_size = (maxpool_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 + self.layer2 = nn.LSTM(input_size, hidden_size, num_layers, batch_first = True) + self.drop1 = nn.Dropout(p=0.5) + self.fc1 = nn.Linear(hidden_size, hidden_size) + self.drop2 = nn.Dropout(p=0.5) + self.relu1 = nn.ReLU() + self.fc2 = nn.Linear(hidden_size, num_classes) + + def forward(self, x): + x = x.view(x.size(0), 1, x.size(1), x.size(2)) + out = self.layer1(x) + out = self.downsample(out) + out = torch.squeeze(out, 1) + if torch.cuda.is_available(): + x = x.cuda() + h0 = Variable(torch.zeros(self.num_layers, out.size(0), self.hidden_size)).cuda() + c0 = Variable(torch.zeros(self.num_layers, out.size(0), self.hidden_size)).cuda() + else: + h0 = Variable(torch.zeros(self.num_layers, out.size(0), self.hidden_size)) + c0 = Variable(torch.zeros(self.num_layers, out.size(0), self.hidden_size)) + out, hn = self.layer2(out, (h0, c0)) + out = hn[0][-1] + out = self.drop1(out) + out = self.fc1(out) + out = self.drop2(out) + out = self.relu1(out) + out = self.fc2(out) + out = torch.sigmoid(out) + return out + + def predict_proba(self, x): + if type(x) is np.ndarray: + x = torch.from_numpy(x.astype(np.float32)) + with torch.no_grad(): + x = Variable(x) + if torch.cuda.is_available(): + x = x.cuda() + y = self.forward(x) + temp = y.data.cpu().numpy() + return temp + + +class CNN_MIX(nn.Module): + """ + It is a deep network with 2 layers CNN, which works on input and time dimension, respectively, more details refer to deepDianosis in github. 
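+    The first convolution uses a (covariates, 1) kernel to combine all covariates at each time
+    step, the second a (nb_filter, 1) kernel across the resulting feature maps followed by max
+    pooling over time, and the third a (1, kernel_size) kernel over time before the fully
+    connected layers.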
+ """ + def __init__(self, nb_filter, num_classes = 2, kernel_size = (1, 5), pool_size = (1, 3), labcounts = 32, window_size = 12, hidden_size = 100, stride = (1, 1), padding = 0): + super(CNN_MIX, self).__init__() + self.layer1 = nn.Sequential( + nn.Conv2d(1, nb_filter, kernel_size = (labcounts, 1), stride = stride, padding = padding), + nn.BatchNorm2d(nb_filter), + nn.ReLU()) + self.layer2 = nn.Sequential( + nn.Conv2d(1, nb_filter, kernel_size = (nb_filter, 1), stride = stride, padding = padding), + nn.BatchNorm2d(nb_filter), + nn.ReLU(), + nn.MaxPool2d(pool_size)) + out1_size = int(np.ceil(float(window_size)/pool_size[1])) + self.layer3 = nn.Sequential( + nn.Conv2d(1, nb_filter, kernel_size = kernel_size, stride = stride, padding = padding), + nn.BatchNorm2d(nb_filter), + nn.ReLU()) + + out2_size = (out1_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 + self.drop1 = nn.Dropout(p=0.5) + self.fc1 = nn.Linear(out2_size*nb_filter*nb_filter, hidden_size) + self.drop2 = nn.Dropout(p=0.5) + self.relu1 = nn.ReLU() + self.fc2 = nn.Linear(hidden_size, num_classes) + + def forward(self, x): + x = x.view(x.size(0), 1, x.size(1), x.size(2)) + out = self.layer1(x) + out = out.view(out.size(0), out.size(2), out.size(1), out.size(3)) + out = self.layer2(out) + out = out.view(out.size(0), out.size(2), out.size(1), out.size(3)) + out = self.layer3(out) + out = out.view(out.size(0), -1) + out = self.drop1(out) + out = self.fc1(out) + out = self.drop2(out) + out = self.relu1(out) + out = self.fc2(out) + out = torch.sigmoid(out) + return out + + def predict_proba(self, x): + if type(x) is np.ndarray: + x = torch.from_numpy(x.astype(np.float32)) + with torch.no_grad(): + x = Variable(x) + if torch.cuda.is_available(): + x = x.cuda() + y = self.forward(x) + temp = y.data.cpu().numpy() + return temp + + +class CNN_MULTI(nn.Module): + """ + It is a deep network with multiple resolution, more details refer to multiresconvnet of deepDianosis in github. 
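+    Three branches are combined: the input max pooled twice and then convolved, the input
+    max pooled once and then convolved, and the input convolved, pooled and convolved again;
+    the flattened branch outputs are concatenated before the fully connected layers.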
+ """ + def __init__(self, nb_filter, num_classes = 2, kernel_size = (1, 5), pool_size = (1, 2), labcounts = 32, window_size = 12, hidden_size = 100, stride = (1, 1), padding = 0): + super(CNN_MULTI, self).__init__() + # resolution 1 + self.pool1_1 = nn.MaxPool2d(pool_size, stride = pool_size) + maxpool_size = (window_size + 2*padding - (pool_size[1] - 1) - 1)/pool_size[1] + 1 + self.pool1_2 = nn.MaxPool2d(pool_size, stride = pool_size) + maxpool1_2_size = (maxpool_size + 2*padding - (pool_size[1] - 1) - 1)/pool_size[1] + 1 + + self.layer1 = nn.Sequential( + nn.Conv2d(1, nb_filter, kernel_size = kernel_size, stride = stride, padding = padding), + nn.BatchNorm2d(nb_filter), + nn.ReLU()) + cnn1_size = (maxpool1_2_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 + #resolution 2 + self.pool2_1 = nn.MaxPool2d(pool_size, stride = pool_size) + maxpool2_1_size = (window_size + 2*padding - (pool_size[1] - 1) - 1)/pool_size[1] + 1 + + self.layer2 = nn.Sequential( + nn.Conv2d(1, nb_filter, kernel_size = kernel_size, stride = stride, padding = padding), + nn.BatchNorm2d(nb_filter), + nn.ReLU()) + cnn2_size = (maxpool2_1_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 + self.layer3 = nn.Sequential( + nn.Conv2d(1, nb_filter, kernel_size = kernel_size, stride = stride, padding = padding), + nn.BatchNorm2d(nb_filter), + nn.ReLU(), + nn.MaxPool2d(pool_size)) + cnn3_size = (window_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 + maxpool3_size = (cnn3_size + 2*padding - (pool_size[1] - 1) - 1)/pool_size[1] + 1 + self.layer4 = nn.Sequential( + nn.Conv2d(nb_filter, nb_filter, kernel_size = kernel_size, stride = stride, padding = padding), + nn.BatchNorm2d(nb_filter), + nn.ReLU()) + cnn4_size = (maxpool3_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 + merge_size = cnn1_size + cnn2_size + cnn4_size + self.drop1 = nn.Dropout(p=0.5) + self.fc1 = nn.Linear(labcounts*nb_filter*merge_size, hidden_size) + self.drop2 = nn.Dropout(p=0.5) + self.relu1 = nn.ReLU() + self.fc2 = nn.Linear(hidden_size, num_classes) + + def forward(self, x): + x = x.view(x.size(0), 1, x.size(1), x.size(2)) + out = self.pool1_1(x) + out = self.pool1_2(out) + out1 = self.layer1(out) + out = self.pool2_1(x) + out2 = self.layer2(out) + out = self.layer3(x) + out3 = self.layer4(out) + out = torch.cat((out1.view(out1.size(0), -1), out2.view(out2.size(0), -1), out3.view(out3.size(0), -1)), 1) + out = self.drop1(out) + out = self.fc1(out) + out = self.drop2(out) + out = self.relu1(out) + out = self.fc2(out) + out = torch.sigmoid(out) + return out + + def predict_proba(self, x): + if type(x) is np.ndarray: + x = torch.from_numpy(x.astype(np.float32)) + with torch.no_grad(): + x = Variable(x) + if torch.cuda.is_available(): + x = x.cuda() + y = self.forward(x) + temp = y.data.cpu().numpy() + return temp + + +# 1x3 Convolution +def convR(in_channels, out_channels, kernel_size, stride=1, padding = (0, 1)): + return nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, + padding=padding, stride=stride, bias=False) + + +# Residual Block +class ResidualBlock(nn.Module): + def __init__(self, in_channel, nb_filter = 16, kernel_size = (1, 3), stride=1, downsample=None): + super(ResidualBlock, self).__init__() + self.conv1 = convR(in_channel, nb_filter, kernel_size = kernel_size, stride = stride) + self.bn1 = nn.BatchNorm2d(nb_filter) + self.relu = nn.ReLU(inplace=True) + self.conv2 = convR(nb_filter, nb_filter, kernel_size = kernel_size, stride = stride) + self.bn2 = nn.BatchNorm2d(nb_filter) + 
self.downsample = downsample + + def forward(self, x): + residual = x + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + out = self.conv2(out) + out = self.bn2(out) + if self.downsample: + residual = self.downsample(x) + out += residual + out = self.relu(out) + return out + + +# ResNet Module +class ResNet(nn.Module): + def __init__(self, block, layers, nb_filter = 16, labcounts = 12, window_size = 36, kernel_size = (1, 3), pool_size = (1, 3), num_classes=2, hidden_size = 100): + super(ResNet, self).__init__() + self.in_channels = 1 + self.conv = convR(self.in_channels, nb_filter, kernel_size = kernel_size) + self.bn = nn.BatchNorm2d(nb_filter) + self.relu = nn.ReLU(inplace=True) + self.layer1 = self.make_layer(block, nb_filter, layers[0], kernel_size = kernel_size) + self.layer2 = self.make_layer(block, nb_filter*2, layers[1], 1, kernel_size = kernel_size, in_channels = nb_filter) + self.layer3 = self.make_layer(block, nb_filter*4, layers[2], 1, kernel_size = kernel_size, in_channels = 2*nb_filter) + self.avg_pool = nn.AvgPool2d(pool_size) + avgpool2_1_size = (window_size - (pool_size[1] - 1) - 1)/pool_size[1] + 1 + last_layer_size = nb_filter*4*labcounts*avgpool2_1_size + self.fc = nn.Linear(last_layer_size, hidden_size) + self.drop2 = nn.Dropout(p=0.5) + self.relu1 = nn.ReLU() + self.fc2 = nn.Linear(hidden_size, num_classes) + + def make_layer(self, block, out_channels, blocks, stride=1, kernel_size = (1, 3), in_channels = 16): + downsample = None + if (stride != 1) or (self.in_channels != out_channels): + downsample = nn.Sequential( + convR(in_channels, out_channels, kernel_size = kernel_size, stride=stride), + nn.BatchNorm2d(out_channels)) + layers = [] + layers.append(block(in_channels, out_channels, kernel_size = kernel_size, stride = stride, downsample = downsample)) + self.in_channels = out_channels + for i in range(1, blocks): + layers.append(block(out_channels, out_channels, kernel_size = kernel_size)) + return nn.Sequential(*layers) + + def forward(self, x): + x = x.view(x.size(0), 1, x.size(1), x.size(2)) + out = self.conv(x) + out = self.bn(out) + out = self.relu(out) + out = self.layer1(out) + out = self.layer2(out) + out = self.layer3(out) + out = self.avg_pool(out) + out = out.view(out.size(0), -1) + out = self.fc(out) + out = self.drop2(out) + out = self.relu1(out) + out = self.fc2(out) + out = torch.sigmoid(out) + return out + + def predict_proba(self, x): + if type(x) is np.ndarray: + x = torch.from_numpy(x.astype(np.float32)) + with torch.no_grad(): + x = Variable(x) + if torch.cuda.is_available(): + x = x.cuda() + y = self.forward(x) + temp = y.data.cpu().numpy() + return temp + + +class GRU(nn.Module): + """ + It is a deep network with one GRU layer, which are further fed into one fully connected layers. 
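+    Input is expected as (batch, sequence length, input_size); the final hidden state of the
+    last GRU layer is passed to a linear layer with a sigmoid output, giving one value per class.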
+ """ + def __init__(self, input_size, hidden_size, num_layers, num_classes = 2, dropout = 0.5): + super(GRU, self).__init__() + + self.hidden_size = hidden_size + self.num_layers = num_layers + self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first = True, dropout = dropout) + self.linear = nn.Linear(hidden_size, num_classes) + + def forward(self, x): + if torch.cuda.is_available(): + x = x.cuda() + h0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)).cuda() # 2 for bidirection + else: + h0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)) # 2 for bidirection + self.gru.flatten_parameters() + out, hn = self.gru(x, h0) + + rearranged = hn[-1] + out = self.linear(rearranged) + out = torch.sigmoid(out) + return out + + def initHidden(self, N): + return Variable(torch.randn(1, N, self.hidden_size)) + + def predict_proba(self, x): + if type(x) is np.ndarray: + x = torch.from_numpy(x.astype(np.float32)) + with torch.no_grad(): + x = Variable(x) + if torch.cuda.is_available(): + x = x.cuda() + y = self.forward(x) + temp = y.data.cpu().numpy() + return temp + + +class RNN(nn.Module): + """ + It is a deep network with one LSTM layer, which are further fed into one fully connected layer. + """ + def __init__(self, input_size, hidden_size, num_layers, num_classes = 2, dropout = 0.5): + super(RNN, self).__init__() + self.hidden_size = hidden_size + self.num_layers = num_layers + self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first = True, dropout = dropout) + self.fc = nn.Linear(hidden_size, num_classes) + + def forward(self, x): + if torch.cuda.is_available(): + x = x.cuda() + h0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)).cuda() + c0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)).cuda() + else: + h0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)) + c0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)) + self.lstm.flatten_parameters() + out, hn = self.lstm(x, (h0, c0)) + rearranged = hn[0][-1] + # Decode hidden state of last time step + out = self.fc(rearranged) + out = torch.sigmoid(out) + return out + + def predict_proba(self, x): + if type(x) is np.ndarray: + x = torch.from_numpy(x.astype(np.float32)) + with torch.no_grad(): + x = Variable(x) + if torch.cuda.is_available(): + x = x.cuda() + y = self.forward(x) + temp = y.data.cpu().numpy() + return temp + + +class BiRNN(nn.Module): + """ + It is a deep network with one bidirectional LSTM layer, which are further fed into one fully connected layer. 
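+    The final forward and backward hidden states are concatenated, so the linear layer sees
+    2 * hidden_size features before the sigmoid output.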
+ """ + def __init__(self, input_size, hidden_size, num_layers, num_classes = 2, dropout = 0.5): + super(BiRNN, self).__init__() + self.hidden_size = hidden_size + self.num_layers = num_layers + self.lstm = nn.LSTM(input_size, hidden_size, num_layers, + batch_first = True, dropout = dropout, bidirectional=True) + self.fc = nn.Linear(hidden_size*2, num_classes) # 2 for bidirection + + def forward(self, x): + if torch.cuda.is_available(): + x = x.cuda() + h0 = Variable(torch.zeros(self.num_layers*2, x.size(0), self.hidden_size)).cuda() # 2 for bidirection + c0 = Variable(torch.zeros(self.num_layers*2, x.size(0), self.hidden_size)).cuda() + else: + h0 = Variable(torch.zeros(self.num_layers*2, x.size(0), self.hidden_size)) # 2 for bidirection + c0 = Variable(torch.zeros(self.num_layers*2, x.size(0), self.hidden_size)) + self.lstm.flatten_parameters() + out, hn = self.lstm(x, (h0, c0)) + hn = hn[0] + + rearranged = hn[-2:].view(x.size(0), -1) + # Decode hidden state of last time step + out = self.fc(rearranged) + out = torch.sigmoid(out) + return out + + def predict_proba(self, x): + if type(x) is np.ndarray: + x = torch.from_numpy(x.astype(np.float32)) + with torch.no_grad(): + x = Variable(x) + if torch.cuda.is_available(): + x = x.cuda() + y = self.forward(x) + temp = y.data.cpu().numpy() + return temp + + +# select model +def train_deeptorch(population, plpData, train = True, model_type = 'LogisticRegression', class_weight =0, autoencoder = True, w_decay =0.9, epochs = 1, vae = False, size = 100, loss = 'LogSoftmax', nbfilters = 4, learning_rate = 0.0001, hidden_size = 100, modelOutput = 'C:/deeptorch', seed = 1, quiet = False): + if model_type in ['LogisticRegression', 'MLP', 'SNN']: + y = population[:, 1] + X = plpData[population[:, 0], :] + trainInds = population[:, population.shape[1] - 1] > 0 + if class_weight == -1: + loss = FocalLoss(gamma = 5) + else: + if class_weight == 0: + weights = float(np.count_nonzero(y))/y.shape[0] + class_weight = [1 - weights, weights] + else: + class_weight = [class_weight, 1] + class_weight = 1/torch.Tensor(class_weight) + if torch.cuda.is_available(): + class_weight = class_weight.cuda() + loss=nn.CrossEntropyLoss(weight = class_weight) + + print("Dataset has %s rows and %s columns" % (X.shape[0], X.shape[1])) + print("population loaded- %s rows and %s columns" % (np.shape(population)[0], np.shape(population)[1])) + ########################################################################### + l1regularization = False + if train: + pred_size = int(np.sum(population[:, population.shape[1] - 1] > 0)) + print("Calculating prediction for train set of size %s" % (pred_size)) + test_pred = np.zeros(pred_size) # zeros length sum(population[:,population.size[1]] ==i) + for i in range(1, int(np.max(population[:, population.shape[1] - 1]) + 1), 1): + testInd = population[population[:, population.shape[1] - 1] > 0, population.shape[1] - 1] == i + trainInd = (population[population[:, population.shape[1] - 1] > 0, population.shape[1] - 1] != i) + train_x = X[trainInds, :][trainInd, :] + train_y = y[trainInds][trainInd] + test_x = X[trainInds, :][testInd, :] + print("Fold %s split %s in train set and %s in test set" % (i, train_x.shape[0], test_x.shape[0])) + print("Train set contains %s outcomes " % (np.sum(train_y))) + train_x = train_x.toarray() + test_x = test_x.toarray() + if autoencoder: + print('first train stakced autoencoder') + encoding_size = 256 + if vae: + auto_model = VAE(input_size=train_x.shape[1], encoding_size=encoding_size) + else: + auto_model = 
AutoEncoder(input_size=train_x.shape[1], encoding_size=encoding_size) + if torch.cuda.is_available(): + auto_model = auto_model.cuda() + clf = Estimator(auto_model) + clf.compile(optimizer=torch.optim.Adam(auto_model.parameters(), lr=1e-3, weight_decay = w_decay), + loss=nn.MSELoss()) + clf.fit(train_x, train_y, batch_size=32, nb_epoch=epochs, autoencoder = autoencoder, vae = vae) + #split to batch for large dataset + train_batch = batch(train_x, batch_size=32) + train_x = np.array([]).reshape(0, encoding_size) + for train in train_batch: + encode_train = auto_model.get_encode_features(train) + train_x = np.concatenate((train_x, encode_train), axis=0) + test_batch = batch(test_x, batch_size=32) + test_x = np.array([]).reshape(0, encoding_size) + for test in test_batch: + encode_Test = auto_model.get_encode_features(test) + test_x = np.concatenate((test_x, encode_Test), axis=0) + del auto_model + del clf + # train on fold + print("Training fold %s" % (i)) + start_time = timeit.default_timer() + if model_type == 'LogisticRegression': + model = LogisticRegression(train_x.shape[1]) + l1regularization = True + elif model_type == 'SNN': + model = SNN(train_x.shape[1], size) + else: + model = MLP(train_x.shape[1], size) + + if torch.cuda.is_available(): + model = model.cuda() + clf = Estimator(model) + clf.compile(optimizer=torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay = w_decay), + loss=loss) + clf.fit(train_x, train_y, batch_size=32, nb_epoch=epochs, l1regularization = l1regularization) + + ind = (population[:, population.shape[1] - 1] > 0) + ind = population[ind, population.shape[1] - 1] == i + test_input_var = torch.from_numpy(test_x.astype(np.float32)) + test_batch = batch(test_x, batch_size = 32) + temp = [] + for test in test_batch: + pred_test1 = model.predict_proba(test)[:, 1] + temp = np.concatenate((temp, pred_test1), axis = 0) + test_pred[ind] = temp + print("Prediction complete: %s rows " % (np.shape(test_pred[ind])[0])) + print("Mean: %s prediction value" % (np.mean(test_pred[ind]))) + + # RETURN CV PREDICTION WHEN TRAIN == T + test_pred.shape = (population[population[:, population.shape[1] - 1] > 0, :].shape[0], 1) + prediction = np.append(population[population[:, population.shape[1] - 1] > 0, :], test_pred, axis=1) + return prediction; + + # train final: + else: + print("Training final neural network model on all train data...") + print("X- %s rows and Y %s length" % (X[trainInds, :].shape[0], y[trainInds].shape[0])) + start_time = timeit.default_timer() + train_x = X[trainInds, :] + train_x = train_x.toarray() + train_y = y[trainInds] + if not os.path.exists(modelOutput): + os.makedirs(modelOutput) + if autoencoder: + encoding_size = 256 + if vae: + auto_model = VAE(input_size=train_x.shape[1], encoding_size=encoding_size) + else: + auto_model = AutoEncoder(input_size=train_x.shape[1], encoding_size=encoding_size) + if torch.cuda.is_available(): + auto_model = auto_model.cuda() + clf = Estimator(auto_model) + clf.compile(optimizer=torch.optim.Adam(auto_model.parameters(), lr=1e-3, weight_decay=w_decay), + loss=nn.MSELoss()) + clf.fit(train_x, train_y, batch_size=32, nb_epoch=epochs, autoencoder=autoencoder, vae = vae) + train_batch = batch(train_x, batch_size=32) + train_x = np.array([]).reshape(0, encoding_size) + for train in train_batch: + encode_train = auto_model.get_encode_features(train) + train_x = np.concatenate((train_x, encode_train), axis=0) + joblib.dump(auto_model, os.path.join(modelOutput, 'autoencoder_model.pkl')) + del auto_model + del clf + 
print('the final parameter epochs %.2f weight_decay %.2f' %(epochs,w_decay)) + if model_type == 'LogisticRegression': + model = LogisticRegression(train_x.shape[1]) + l1regularization = True + elif model_type == 'SNN': + model = SNN(train_x.shape[1], size) + else: + model = MLP(train_x.shape[1], size) + if torch.cuda.is_available(): + model = model.cuda() + clf = Estimator(model) + clf.compile(optimizer=torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay = w_decay), + loss=loss) + clf.fit(train_x, train_y, batch_size=32, nb_epoch=epochs, l1regularization = l1regularization) + end_time = timeit.default_timer() + print("Training final took: %.2f s" % (end_time - start_time)) + print("Model saved to: %s" % (modelOutput)) + joblib.dump(model, os.path.join(modelOutput,'model.pkl')) + # DO PREDICTION ON TRAIN: + train_batch = batch(train_x, batch_size = 32) + train_pred = [] + for train in train_batch: + preds = model.predict_proba(train)[:, 1] + train_pred = np.concatenate((train_pred, preds), axis = 0) + train_pred.shape = (population[population[:, population.shape[1] - 1] > 0, :].shape[0], 1) + prediction = np.append(population[population[:, population.shape[1] - 1] > 0, :], train_pred, axis=1) + # RETURN TRAIN PREDICTION WHEN TRAIN == F + return prediction; + + elif model_type in ['CNN', 'RNN', 'CNN_LSTM', 'CNN_MLF', 'CNN_MIX', 'GRU', 'BiRNN', 'CNN_MULTI', 'ResNet']: + y = population[:, 1] + X = plpData.to_dense().numpy() + X = X[np.int64(population[:, 0]), :] + trainInds = population[:, population.shape[1] - 1] > 0 + + if class_weight == -1: + loss = FocalLoss(gamma = 3) + else: + if class_weight == 0: + weights = float(np.count_nonzero(y))/y.shape[0] + class_weight = [1 - weights, weights] + else: + class_weight = [class_weight, 1] + class_weight = 1/torch.Tensor(class_weight) + if torch.cuda.is_available(): + class_weight = class_weight.cuda() + loss=nn.CrossEntropyLoss(weight = class_weight) + + if train: + test_pred = np.zeros(population[population[:, population.shape[1] - 1] > 0, :].shape[0]) # zeros length sum(population[:,population.size[1]] ==i) + for i in range(1, int(np.max(population[:, population.shape[1] - 1]) + 1), 1): + testInd = population[population[:, population.shape[1] - 1] > 0, population.shape[1] - 1] == i + trainInd = (population[population[:, population.shape[1] - 1] > 0, population.shape[1] - 1] != i) + train_x = X[trainInds, :][trainInd, :] + train_y = y[trainInds][trainInd] + test_x = X[trainInds, :][testInd, :] + print("Fold %s split %s in train set and %s in test set" % (i, train_x.shape[0], test_x.shape[0])) + print("Train set contains %s outcomes " % (np.sum(train_y))) + # train on fold + learning_rate = 0.001 + print("Training fold %s" % (i)) + start_time = timeit.default_timer() + if model_type == 'CNN': + model = CNN(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) + elif model_type == 'CNN_LSTM': + model = CNN_LSTM(nb_filter = nbfilters, labcounts=train_x.shape[1], window_size=train_x.shape[2]) + elif model_type == 'CNN_MLF': # multiple kernels with different size + model = CNN_MLF(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) + elif model_type == 'CNN_MIX': # mixed model from deepDiagnosis + model = CNN_MIX(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) + elif model_type == 'CNN_MULTI': # multiple resolution model from deepDiagnosis + model = CNN_MULTI(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) + 
elif model_type == 'ResNet': + print('train ResNet') + model = ResNet(ResidualBlock, [3, 3, 3], nb_filter=nbfilters, labcounts=train_x.shape[1], window_size=train_x.shape[2]) + elif model_type == 'RNN': + model = RNN(train_x.shape[2], hidden_size, 2, 2) + elif model_type == 'BiRNN': + model = BiRNN(train_x.shape[2], hidden_size, 2, 2) + elif model_type == 'GRU': + model = GRU(train_x.shape[2], hidden_size, 2, 2) + else: + print('temproal data not supported by this model') + + if torch.cuda.is_available(): + model = model.cuda() + clf = Estimator(model) + clf.compile(optimizer=torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay = 0.0001), + loss=loss) + clf.fit(train_x, train_y, batch_size=32, nb_epoch=epochs) + ind = (population[:, population.shape[1] - 1] > 0) + ind = population[ind, population.shape[1] - 1] == i + test_batch = batch(test_x, batch_size = 32) + temp = [] + for test in test_batch: + pred_test1 = model.predict_proba(test)[:, 1] + temp = np.concatenate((temp, pred_test1), axis = 0) + test_pred[ind] = temp + del model + print("Prediction complete: %s rows " % (np.shape(test_pred[ind])[0])) + print("Mean: %s prediction value" % (np.mean(test_pred[ind]))) + # RETURN CV PREDICTION + test_pred.shape = (population[population[:, population.shape[1] - 1] > 0, :].shape[0], 1) + prediction = np.append(population[population[:, population.shape[1] - 1] > 0, :], test_pred, axis=1) + return prediction; + + # train final: + else: + print("Training final neural network model on all train data...") + print("X- %s rows and Y %s length" % (X[trainInds, :].shape[0], y[trainInds].shape[0])) + start_time = timeit.default_timer() + train_x = X[trainInds, :] + train_y = y[trainInds] + learning_rate = 0.001 + if model_type == 'CNN': + model = CNN(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) + elif model_type == 'CNN_LSTM': + model = CNN_LSTM(nb_filter=nbfilters, labcounts=train_x.shape[1], window_size=train_x.shape[2]) + elif model_type == 'CNN_MLF': # multiple kernels with different size + model = CNN_MLF(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) + elif model_type == 'CNN_MIX': #mixed model from deepDiagnosis + model = CNN_MIX(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) + elif model_type == 'CNN_MULTI': # multi resolution model from deepDiagnosis + model = CNN_MULTI(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) + elif model_type == 'ResNet': + model = ResNet(ResidualBlock, [3, 3, 3], nb_filter=nbfilters, labcounts=train_x.shape[1], window_size=train_x.shape[2]) + elif model_type == 'RNN': + model = RNN(train_x.shape[2], hidden_size, 2, 2) + elif model_type == 'BiRNN': + model = BiRNN(train_x.shape[2], hidden_size, 2, 2) + elif model_type == 'GRU': + model = GRU(train_x.shape[2], hidden_size, 2, 2) + else: + print('temproal data not supported by this model') + + if torch.cuda.is_available(): + model = model.cuda() + clf = Estimator(model) + clf.compile(optimizer=torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay = 0.0001), + loss=loss) + clf.fit(train_x, train_y, batch_size=32, nb_epoch=epochs) + end_time = timeit.default_timer() + print("Training final took: %.2f s" % (end_time - start_time)) + # save the model: + if not os.path.exists(modelOutput): + os.makedirs(modelOutput) + print("Model saved to: %s" % (modelOutput)) + + joblib.dump(model, os.path.join(modelOutput,'model.pkl')) + # prediction on train: 
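+        # model.pkl saved above is what python_predict_temporal() in predictFunctions.py
+        # reloads when the model is applied to new data; the block below scores the
+        # training data itself so a prediction object can be returned to the caller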
+ test_batch = batch(train_x, batch_size = 32) + test_pred = [] + for test in test_batch: + pred_test1 = model.predict_proba(test)[:, 1] + test_pred = np.concatenate((test_pred, pred_test1), axis = 0) + test_pred.shape = (population[population[:, population.shape[1] - 1] > 0, :].shape[0], 1) + prediction = np.append(population[population[:, population.shape[1] - 1] > 0, :], test_pred, axis=1) + return prediction; diff --git a/inst/python/predictFunctions.py b/inst/python/predictFunctions.py new file mode 100644 index 0000000..fcbb2e5 --- /dev/null +++ b/inst/python/predictFunctions.py @@ -0,0 +1,153 @@ +# apply random forest model on new data +#=============================================================== +# INPUT: +# 1) location of new data +# 2) location of model +# +# OUTPUT: +# it returns a file with indexes merged with prediction for test index - named new_pred +#================================================================ +import numpy as np +from collections import OrderedDict +import os +import sys +import timeit +import math +from scipy.sparse import coo_matrix,csr_matrix,vstack,hstack +from joblib import Memory +import joblib + +def batch(tensor, batch_size = 50): + """ It is used to create batch samples, each batch has batch_size samples""" + tensor_list = [] + length = tensor.shape[0] + i = 0 + while True: + if (i+1) * batch_size >= length: + tensor_list.append(tensor[i * batch_size: length]) + return tensor_list + tensor_list.append(tensor[i * batch_size: (i+1) * batch_size]) + i += 1 +#================================================================ +########################################################################### +def python_predict_temporal(population, plpData, model_loc, dense, autoencoder): + print("Applying Python Model") + print("Loading Data...") + # load data + train,test indexes + validation index + #y=population[:,1] + + ########################################################################### + # uf dense convert + if dense==1: + print("converting to dense data...") + X = plpData.to_dense().numpy() + if dense==0: + print("keeping data sparse...") + X = plpData.numpy() + ########################################################################### + + # order the data + X = X[np.int64(population[:, 0]), :] + # load index file + print("population loaded- %s rows and %s columns" %(np.shape(population)[0], np.shape(population)[1])) + print("Dataset has %s rows and %s columns" %(X.shape[0], X.shape[1])) + print("Data ready for model has %s features" %(np.shape(X)[1])) + + # load model + print("Loading model...") + if autoencoder: + autoencoder_model = joblib.load(os.path.join(model_loc, 'autoencoder_model.pkl')) + X = autoencoder_model.get_encode_features(X) + modelTrained = joblib.load(os.path.join(model_loc,"model.pkl")) + print("Calculating predictions on population...") + test_batch = batch(X, batch_size = 32) + test_pred = [] + for test in test_batch: + pred_test1 = modelTrained.predict_proba(test)[:, 1] + test_pred = np.concatenate((test_pred , pred_test1), axis = 0) + print("Prediction complete: %s rows" %(np.shape(test_pred)[0])) + print("Mean: %s prediction value" %(np.mean(test_pred))) + # merge pred with population + test_pred.shape = (population.shape[0], 1) + prediction = np.append(population,test_pred, axis=1) + return prediction + +def python_predict(population, plpData, model_loc, dense, autoencoder): + print("Applying Python Model") + print("Loading Data...") + # load data + train,test indexes + validation index + #y=population[:,1] + X 
= plpData[population[:,0].astype(int),:] + # load index file + print("population loaded- %s rows and %s columns" %(np.shape(population)[0], np.shape(population)[1])) + print("Dataset has %s rows and %s columns" %(X.shape[0], X.shape[1])) + print("Data ready for model has %s features" %(np.shape(X)[1])) + ########################################################################### + # uf dense convert + if dense==1: + print("converting to dense data...") + X=X.toarray() + ########################################################################### + # load model + print("Loading model...") + if autoencoder: + autoencoder_model = joblib.load(os.path.join(model_loc, 'autoencoder_model.pkl')) + X = autoencoder_model.get_encode_features(X) + modelTrained = joblib.load(os.path.join(model_loc,"model.pkl")) + print("Calculating predictions on population...") + test_pred = modelTrained.predict_proba(X)[:, 1] + print("Prediction complete: %s rows" %(np.shape(test_pred)[0])) + print("Mean: %s prediction value" %(np.mean(test_pred))) + # merge pred with population + test_pred.shape = (population.shape[0], 1) + prediction = np.append(population,test_pred, axis=1) + return prediction + + +def python_predict_survival(population, plpData, model_loc): + print("Applying Python Model") + print("Loading Data...") + # load data + train,test indexes + validation index + X = plpData[population[:,0].astype(int),:] + # load index file + print("population loaded- %s rows and %s columns" %(np.shape(population)[0], np.shape(population)[1])) + print("Dataset has %s rows and %s columns" %(X.shape[0], X.shape[1])) + print("Data ready for model has %s features" %(np.shape(X)[1])) + ########################################################################### + # load model + print("Loading model...") + modelTrained = joblib.load(os.path.join(model_loc,"model.pkl")) + print("Calculating predictions on population...") + test_pred = modelTrained.predict(X.toarray()) + test_pred = test_pred.flatten() + rowCount = population.shape[0] + test_pred = test_pred[0:(rowCount)] + print("Prediction complete: %s rows" %(np.shape(test_pred)[0])) + print("Mean: %s prediction value" %(np.mean(test_pred))) + # merge pred with population + test_pred.shape = (population.shape[0], 1) + prediction = np.append(population,test_pred, axis=1) + return prediction + +def python_predict_garden(population, plpData, model_loc,quantile=None): + print("Applying Python Model") + print("Loading Data...") + # load data + train,test indexes + validation index + #y=population[:,1] + X = plpData[population[:,0].astype(int),:] + # load index file + print("population loaded- %s rows and %s columns" %(np.shape(population)[0], np.shape(population)[1])) + print("Dataset has %s rows and %s columns" %(X.shape[0], X.shape[1])) + print("Data ready for model has %s features" %(np.shape(X)[1])) + ########################################################################### + # load model + print("Loading model...") + modelTrained = joblib.load(os.path.join(model_loc,"model.pkl")) + print("Calculating predictions on population...") + test_pred = modelTrained.predict(X,quantile=quantile)[:, 0] + print("Prediction complete: %s rows" %(np.shape(test_pred)[0])) + print("Mean: %s prediction value" %(np.mean(test_pred))) + # merge pred with population + test_pred.shape = (population.shape[0], 1) + prediction = np.append(population,test_pred, axis=1) + return prediction diff --git a/inst/python/python_predict.py b/inst/python/python_predict.py new file mode 100644 index 
0000000..3a2f132 --- /dev/null +++ b/inst/python/python_predict.py @@ -0,0 +1,111 @@ +# apply random forest model on new data +#=============================================================== +# INPUT: +# 1) location of new data +# 2) location of model +# +# OUTPUT: +# it returns a file with indexes merged with prediction for test index - named new_pred +#================================================================ +import numpy as np +from collections import OrderedDict +import os +import sys +import timeit +import math +#from sklearn.ensemble import RandomForestClassifier +#from sklearn.naive_bayes import GaussianNB +from scipy.sparse import coo_matrix,csr_matrix,vstack,hstack +#from sklearn.feature_selection import SelectFromModel +#from sklearn.cross_validation import PredefinedSplit +from sklearn.externals.joblib import Memory +#from sklearn.datasets import load_svmlight_file +from sklearn.externals import joblib +if "python_dir" in globals(): + sys.path.insert(0, python_dir) + import TorchUtils as tu +#================================================================ + + +print("Applying Python Model") + +########################################################################### + +def get_temproal_data(covariates, population): + p_ids_in_cov = set(covariates[:, 0]) + timeid_len = len(set(covariates[:, -2])) + full_covariates = np.array([]).reshape(0,4) + default_covid = covariates[0, 1] + for p_id in population[:, 0]: + if p_id not in p_ids_in_cov: + tmp_x = np.array([p_id, default_covid, 1, 0]).reshape(1,4) #default cov id, timeid=1 + full_covariates = np.concatenate((full_covariates, tmp_x), axis=0) + else: + tmp_x = covariates[covariates[:, 0] == p_id, :] + #print tmp_x.shape, X.shape + full_covariates = np.concatenate((full_covariates, tmp_x), axis=0) + + X, patient_keys = tu.convert_to_temporal_format(full_covariates, timeid_len = timeid_len, predict = True) + return X + + +print("Loading Data...") +# load data + train,test indexes + validation index + +y=population[:,1] +#print covariates.shape + +if modeltype == 'temporal': + X = plpData.to_dense().numpy() + X = X[np.int64(population[:, 0]), :] + #X = get_temproal_data(covariates, population) + dense = 0 +else: + #print included + X = plpData[population[:,0],:] + X = X[:,included.flatten()] + +# load index file +print("population loaded- %s rows and %s columns" %(np.shape(population)[0], np.shape(population)[1])) +print("Dataset has %s rows and %s columns" %(X.shape[0], X.shape[1])) +print("Data ready for model has %s features" %(np.shape(X)[1])) + +########################################################################### +# uf dense convert +if dense==1: + print("converting to dense data...") + X=X.toarray() +########################################################################### + +# load model +print("Loading model...") + +modelTrained = joblib.load(os.path.join(model_loc,"model.pkl")) + +print(X.shape) +print("Calculating predictions on population...") + +if autoencoder: + autoencoder_model = joblib.load(os.path.join(model_loc, 'autoencoder_model.pkl')) + X = autoencoder_model.get_encode_features(X) + +if modeltype == 'temporal': + test_batch = tu.batch(X, batch_size = 32) + test_pred = [] + for test in test_batch: + pred_test1 = modelTrained.predict_proba(test)[:, 1] + test_pred = np.concatenate((test_pred , pred_test1), axis = 0) +else: + test_pred = modelTrained.predict_proba(X)[:, 1] + + +if test_pred.ndim != 1: + test_pred = test_pred[:,1] + + +print("Prediction complete: %s rows" %(np.shape(test_pred)[0])) 
+print("Mean: %s prediction value" %(np.mean(test_pred))) + +# merge pred with population +test_pred.shape = (population.shape[0], 1) +prediction = np.append(population,test_pred, axis=1) diff --git a/tests/testthat.R b/tests/testthat.R new file mode 100644 index 0000000..59feab0 --- /dev/null +++ b/tests/testthat.R @@ -0,0 +1,6 @@ +#Sys.setenv("R_TESTS" = "") +#options(fftempdir = file.path(getwd(),'fftemp')) +library(testthat) +library(DeepPatientLevelPrediction) +test_check("DeepPatientLevelPrediction") +unlink('T:/Temp', recursive = T) diff --git a/tests/testthat/test-keras.R b/tests/testthat/test-keras.R new file mode 100644 index 0000000..944f5bb --- /dev/null +++ b/tests/testthat/test-keras.R @@ -0,0 +1,137 @@ +# Copyright 2020 Observational Health Data Sciences and Informatics +# +# This file is part of PatientLevelPrediction +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +library("testthat") +context("Keras") + +deepSet <- setDeepNN(epochs = 1) + +plpResultDeep <- runPlp(population = population, + plpData = plpData, + modelSettings = deepSet, + savePlpData = F, + savePlpResult = F, + saveEvaluation = F, + savePlpPlots = F, + analysisId = 'deepTest', + saveDirectory = saveLoc) + + + +#TODO: add input checks and test these... +#options(fftempdir = getwd()) + + +test_that("deepNN working checks", { + + # check same structure + testthat::expect_equal(names(plpResultDeep), + names(plpResult)) + + # check prediction same size as pop + testthat::expect_equal(nrow(plpResultDeep$prediction), nrow(population)) + + # check prediction between 0 and 1 + testthat::expect_gte(min(plpResultDeep$prediction$value), 0) + testthat::expect_lte(max(plpResultDeep$prediction$value), 1) + +}) + + +# add temporal data: +CNN1Set <- setCovNN(epochs = 1, kernelSize = 3, batchSize =30) +plpResultCNN1 <- runPlp(population = population2, + plpData = plpData3, + modelSettings = CNN1Set, + savePlpData = F, + savePlpResult = F, + saveEvaluation = F, + savePlpPlots = F, + analysisId = 'cnn1Test', + saveDirectory = saveLoc) + +test_that("covNN working checks", { + + # check same structure + testthat::expect_equal(names(plpResultCNN1), + names(plpResult)) + + # check prediction same size as pop + testthat::expect_equal(nrow(plpResultCNN1$prediction), nrow(population2)) + + # check prediction between 0 and 1 + testthat::expect_gte(min(plpResultCNN1$prediction$value), 0) + testthat::expect_lte(max(plpResultCNN1$prediction$value), 1) + +}) + +CNN2Set <- setCovNN2(epochs = 1, kernelSize = 4, filters=4) +plpResultCNN2 <- runPlp(population = population2, + plpData = plpData3, + modelSettings = CNN2Set, + savePlpData = F, + savePlpResult = F, + saveEvaluation = F, + savePlpPlots = F, + analysisId = 'cnn1Test', + saveDirectory = saveLoc) + +test_that("covNN2 working checks", { + + # check same structure + testthat::expect_equal(names(plpResultCNN2), + names(plpResult)) + + # check prediction same size as pop + testthat::expect_equal(nrow(plpResultCNN2$prediction), nrow(population2)) + + # check prediction between 0 and 1 + 
testthat::expect_gte(min(plpResultCNN2$prediction$value), 0) + testthat::expect_lte(max(plpResultCNN2$prediction$value), 1) + +}) + + +if(!travis){ +CIReNNSet <- setCIReNN(epochs = 1, useVae = F, units=c(10) ) +plpResultCIReNN <- runPlp(population = population2, plpData = plpData3, + minCovariateFraction = 0.001, normalizeData = F, + modelSettings = CIReNNSet, testSplit = 'person', + testFraction = 0.25, splitSeed = 1, + nfold = 3, savePlpData = F, savePlpResult = F, + savePlpPlots = F, + saveEvaluation = F, + analysisId = 'cireNNTest', + saveDirectory = saveLoc) +} + +test_that("CIReNN working checks", { + if(travis){ + skip("Too slow for travis") + } + # check same structure + testthat::expect_equal(names(plpResultCIReNN), + names(plpResult)) + + # check prediction same size as pop + testthat::expect_equal(nrow(plpResultCIReNN$prediction), nrow(population2)) + + # check prediction between 0 and 1 + testthat::expect_gte(min(plpResultCIReNN$prediction$value), 0) + testthat::expect_lte(max(plpResultCIReNN$prediction$value), 1) + +}) + diff --git a/tests/testthat/test-torch.R b/tests/testthat/test-torch.R new file mode 100644 index 0000000..ab732d2 --- /dev/null +++ b/tests/testthat/test-torch.R @@ -0,0 +1,123 @@ +# Copyright 2020 Observational Health Data Sciences and Informatics +# +# This file is part of PatientLevelPrediction +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +library("testthat") +context("Torch") + +lrtSet <- setLRTorch(w_decay=0.0005, + epochs=1, + class_weight = 0, + autoencoder = FALSE, + vae =FALSE) + + +plpResultLrt <- runPlp(population = population, + plpData = plpData, + modelSettings = lrtSet, + savePlpData = F, + savePlpResult = F, + saveEvaluation = F, + savePlpPlots = F, + analysisId = 'lrtTest', + saveDirectory = saveLoc) + + + +#TODO: add input checks and test these... 
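+# e.g. a sketch of what such a check test could look like once input checks exist
+# (assuming setLRTorch() errors on invalid arguments):
+# test_that("setLRTorch input checks", {
+#   testthat::expect_error(setLRTorch(epochs = -1))
+#   testthat::expect_error(setLRTorch(w_decay = 'text'))
+# })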
+#options(fftempdir = getwd()) + + +test_that("torch LR working checks", { + + # check same structure + testthat::expect_equal(names(plpResultLrt), + names(plpResult)) + + # check prediction same size as pop + testthat::expect_equal(nrow(plpResultLrt$prediction), nrow(population)) + + # check prediction between 0 and 1 + testthat::expect_gte(min(plpResultLrt$prediction$value), 0) + testthat::expect_lte(max(plpResultLrt$prediction$value), 1) + +}) + +mlptSet <- setMLPTorch(size = 10, + w_decay = 0.001, + epochs = 1, + autoencode = F) + + +plpResultMlpt <- runPlp(population = population, + plpData = plpData, + modelSettings = mlptSet, + savePlpData = F, + savePlpResult = F, + saveEvaluation = F, + savePlpPlots = F, + analysisId = 'mlptTest', + saveDirectory = saveLoc) + +test_that("MLP LR working checks", { + + # check same structure + testthat::expect_equal(names(plpResultMlpt), + names(plpResult)) + + # check prediction same size as pop + testthat::expect_equal(nrow(plpResultMlpt$prediction), nrow(population)) + + # check prediction between 0 and 1 + testthat::expect_gte(min(plpResultMlpt$prediction$value), 0) + testthat::expect_lte(max(plpResultMlpt$prediction$value), 1) + +}) + + +# add temporal data: +if(!travis){ +RNNTSet <- setRNNTorch(hidden_size = 1, + epochs =1) +plpResultRNNT <- runPlp(population = population2, + plpData = plpData3, + modelSettings = RNNTSet, + savePlpData = F, + savePlpResult = F, + saveEvaluation = F, + savePlpPlots = F, + analysisId = 'rnntTest', + saveDirectory = saveLoc) +} + +test_that("RNN Torch working checks", { + if(travis){ + skip("Too slow for travis") + } + # check same structure + testthat::expect_equal(names(plpResultRNNT), + names(plpResult)) + + # check prediction same size as pop + testthat::expect_equal(nrow(plpResultRNNT$prediction), nrow(population2)) + + # check prediction between 0 and 1 + testthat::expect_gte(min(plpResultRNNT$prediction$value), 0) + testthat::expect_lte(max(plpResultRNNT$prediction$value), 1) + +}) + +# add CNN when it is fixed +
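+# e.g. a sketch of the missing test, assuming setCNNTorch() in R/CNNTorch.R takes
+# epochs like setRNNTorch() does:
+# CNNTSet <- setCNNTorch(epochs = 1)
+# plpResultCNNT <- runPlp(population = population2,
+#                         plpData = plpData3,
+#                         modelSettings = CNNTSet,
+#                         savePlpData = F,
+#                         savePlpResult = F,
+#                         saveEvaluation = F,
+#                         savePlpPlots = F,
+#                         analysisId = 'cnntTest',
+#                         saveDirectory = saveLoc)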