From c9af100ddfae4e88f07d3e4436cc8c57b82c3aca Mon Sep 17 00:00:00 2001 From: Egill Fridgeirsson Date: Fri, 3 Jun 2022 13:03:09 +0200 Subject: [PATCH] remove unused files and unused dependencies in DESCRIPTION. --- DESCRIPTION | 15 +- NAMESPACE | 13 +- R/CIReNN.R | 1162 ------------------- R/CNNTorch.R | 178 --- R/CovNN.R | 535 --------- R/CovNN2.R | 461 -------- R/DeepNN.R | 557 ---------- R/DeepPatientLevelPrediction.R | 29 + R/Formatting.R | 173 --- R/RNNTorch.R | 173 --- R/helpers.R | 69 +- R/sparseRTorch.R | 63 -- inst/python/TorchMap.py | 17 - inst/python/TorchUtils.py | 695 ------------ inst/python/deepTorch.py | 1186 -------------------- inst/python/deepTorchFunctions.py | 1732 ----------------------------- inst/python/predictFunctions.py | 153 --- inst/python/python_predict.py | 111 -- man/setCIReNN.Rd | 93 -- man/setCNNTorch.Rd | 36 - man/setCovNN.Rd | 48 - man/setCovNN2.Rd | 48 - man/setDeepNN.Rd | 42 - man/setRNNTorch.Rd | 35 - man/toSparseMDeep.Rd | 40 - man/toSparseRTorch.Rd | 37 - man/transferLearning.Rd | 54 - tests/testthat/test-keras.R | 137 --- tests/testthat/test-torch.R | 123 -- 29 files changed, 35 insertions(+), 7980 deletions(-) delete mode 100644 R/CIReNN.R delete mode 100644 R/CNNTorch.R delete mode 100644 R/CovNN.R delete mode 100644 R/CovNN2.R delete mode 100644 R/DeepNN.R create mode 100644 R/DeepPatientLevelPrediction.R delete mode 100644 R/Formatting.R delete mode 100644 R/RNNTorch.R delete mode 100644 R/sparseRTorch.R delete mode 100644 inst/python/TorchMap.py delete mode 100755 inst/python/TorchUtils.py delete mode 100644 inst/python/deepTorch.py delete mode 100644 inst/python/deepTorchFunctions.py delete mode 100644 inst/python/predictFunctions.py delete mode 100644 inst/python/python_predict.py delete mode 100644 man/setCIReNN.Rd delete mode 100644 man/setCNNTorch.Rd delete mode 100644 man/setCovNN.Rd delete mode 100644 man/setCovNN2.Rd delete mode 100644 man/setDeepNN.Rd delete mode 100644 man/setRNNTorch.Rd delete mode 100644 man/toSparseMDeep.Rd delete mode 100644 man/toSparseRTorch.Rd delete mode 100644 man/transferLearning.Rd delete mode 100644 tests/testthat/test-keras.R delete mode 100644 tests/testthat/test-torch.R diff --git a/DESCRIPTION b/DESCRIPTION index 7d4fafb..c9608a5 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -18,31 +18,20 @@ BugReports: https://github.com/OHDSI/DeepPatientLevelPrediction/issues VignetteBuilder: knitr Depends: R (>= 3.3.0), - FeatureExtraction (>= 3.0.0) Imports: - Andromeda, - DatabaseConnector (>= 3.0.0), + PatientLevelPrediction, dplyr, data.table, knitr, - magrittr, Matrix, ParallelLogger (>= 2.0.0), - reshape2, - reticulate (> 1.16), - RSQLite, - slam, - SqlRender (>= 1.1.3), torch, tibble, - tidyr Suggests: devtools, - keras, plyr, - tensorflow, testthat Remotes: - ohdsi/FeatureExtraction + ohdsi/PatientLevelPrediction RoxygenNote: 7.1.2 Encoding: UTF-8 diff --git a/NAMESPACE b/NAMESPACE index a542ccd..b866710 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -10,20 +10,13 @@ export(gridCvDeep) export(predictDeepEstimator) export(predictDeepNN) export(predictTabNetTorch) -export(setCIReNN) -export(setCNNTorch) -export(setCovNN) -export(setCovNN2) -export(setDeepNN) export(setDeepNNTorch) -export(setRNNTorch) export(setResNet) export(setTabNetTorch) export(setTransformer) export(singleLayerNN) -export(toSparseMDeep) -export(toSparseRTorch) -export(transferLearning) export(tripleLayerNN) import(data.table) -importFrom(zeallot,"%<-%") +importFrom(data.table,":=") +importFrom(dplyr,"%>%") +importFrom(rlang,.data) diff --git 
a/R/CIReNN.R b/R/CIReNN.R deleted file mode 100644 index 4cdaa81..0000000 --- a/R/CIReNN.R +++ /dev/null @@ -1,1162 +0,0 @@ -# @file CIReNN.R -# Code edited from OHDSI contributor @chandryou CIReNN branch -# -# Copyright 2020 Observational Health Data Sciences and Informatics -# -# This file is part of PatientLevelPrediction -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# BuildCIReNN<-function(outcomes=ff::as.ffdf(population[,c('rowId','y')]), -# covariates = result$data, -# indexFolder=indexFolder){ -# -# } - -#' Create setting for CIReNN model -#' -#' @param numberOfRNNLayer The number of RNN layer, only 1, 2, or 3 layers available now. eg. 1, c(1,2), c(1,2,3) -#' @param units The number of units of RNN layer - as a list of vectors -#' @param recurrentDropout The reccurrent dropout rate (regularisation) -#' @param layerDropout The layer dropout rate (regularisation) -#' @param lr Learning rate -#' @param decay Learning rate decay over each update. -#' @param outcomeWeight The weight of the outcome class in the loss function. Default is 0, which will be replaced by balanced weight. -#' @param batchSize The number of data points to use per training batch -#' @param epochs Number of times to iterate over dataset -#' @param earlyStoppingMinDelta minimum change in the monitored quantity to qualify as an improvement for early stopping, i.e. an absolute change of less than min_delta in loss of validation data, will count as no improvement. -#' @param earlyStoppingPatience Number of epochs with no improvement after which training will be stopped. -#' @param useDeepEnsemble logical (either TRUE or FALSE) value for using Deep Ensemble -#' @param numberOfEnsembleNetwork Integer, Number of Ensemble. If you want to use Deep Ensemble, this number should be greater than 1. -#' @param bayes logical (either TRUE or FALSE) value for using Bayesian Drop Out Layer to measure uncertainty. If it is TRUE, both Epistemic and Aleatoric uncertainty will be measured through Bayesian Drop Out layer -#' @param useDeepEnsemble logical (either TRUE or FALSE) value for using Deep Ensemble (Lakshminarayanan et al., 2017) to measure uncertainty. It cannot be used together with Bayesian deep learing. -#' @param numberOfEnsembleNetwork Integer. Number of network used for Deep Ensemble (Lakshminarayanan et al recommended 5). -#' @param useVae logical (either TRUE or FALSE) value for using Variational AutoEncoder before RNN -#' @param vaeDataSamplingProportion Data sampling proportion for VAE -#' @param vaeValidationSplit Validation split proportion for VAE -#' @param vaeBatchSize batch size for VAE -#' @param vaeLatentDim Number of latent dimesion for VAE -#' @param vaeIntermediateDim Number of intermediate dimesion for VAE -#' @param vaeEpoch Number of times to interate over dataset for VAE -#' @param vaeEpislonStd Epsilon -#' @param useGPU logical (either TRUE or FALSE) value. 
If you have GPUs in your machine, and want to use multiple GPU for deep learning, set this value as TRUE -#' @param maxGPUs Integer, If you will use GPU, how many GPUs will be used for deep learning in VAE? GPU parallelisation for deep learning will be activated only when parallel vae is true. Integer >= 2 or list of integers, number of GPUs or list of GPU IDs on which to create model replicas. -#' @param seed Random seed used by deep learning model -#' @importFrom zeallot %<-% -#' @examples -#' \dontrun{ -#' model.CIReNN <- setCIReNN() -#' } -#' @export -setCIReNN <- function(numberOfRNNLayer=c(1),units=c(128, 64), recurrentDropout=c(0.2), layerDropout=c(0.2), - lr =c(1e-4), decay=c(1e-5), outcomeWeight = c(0), batchSize = c(100), - epochs= c(100), earlyStoppingMinDelta = c(1e-4), earlyStoppingPatience = c(10), - bayes = T, useDeepEnsemble = F, numberOfEnsembleNetwork = 5, - useVae = T, vaeDataSamplingProportion = 0.1,vaeValidationSplit= 0.2, - vaeBatchSize = 100L, vaeLatentDim = 10L, vaeIntermediateDim = 256L, - vaeEpoch = 100L, vaeEpislonStd = 1.0, useGPU = FALSE, maxGPUs = 2, - seed=1234 ){ - - ensure_installed("keras") - ensure_installed("tensorflow") - ensure_installed("plyr") - - if( sum(!( numberOfRNNLayer %in% c(1,2,3)))!=0 ) stop ('Only 1,2 or 3 is available now. ') - if(!((all.equal(numberOfEnsembleNetwork, as.integer(numberOfEnsembleNetwork))) & (numberOfEnsembleNetwork>=1) )) { - stop ('Number of ensemble network should be a natural number') } - - # if(class(indexFolder)!='character') - # stop('IndexFolder must be a character') - # if(length(indexFolder)>1) - # stop('IndexFolder must be one') - # - # if(class(units)!='numeric') - # stop('units must be a numeric value >0 ') - # if(units<1) - # stop('units must be a numeric value >0 ') - # - # #if(length(units)>1) - # # stop('units can only be a single value') - # - # if(class(recurrent_dropout)!='numeric') - # stop('dropout must be a numeric value >=0 and <1') - # if( (recurrent_dropout<0) | (recurrent_dropout>=1)) - # stop('dropout must be a numeric value >=0 and <1') - # if(class(layer_dropout)!='numeric') - # stop('layer_dropout must be a numeric value >=0 and <1') - # if( (layer_dropout<0) | (layer_dropout>=1)) - # stop('layer_dropout must be a numeric value >=0 and <1') - # if(class(lr)!='numeric') - # stop('lr must be a numeric value >0') - # if(lr<=0) - # stop('lr must be a numeric value >0') - # if(class(decay)!='numeric') - # stop('decay must be a numeric value >=0') - # if(decay<=0) - # stop('decay must be a numeric value >=0') - # if(class(outcome_weight)!='numeric') - # stop('outcome_weight must be a numeric value >=0') - # if(outcome_weight<=0) - # stop('outcome_weight must be a numeric value >=0') - # if(class(batch_size)!='numeric') - # stop('batch_size must be an integer') - # if(batch_size%%1!=0) - # stop('batch_size must be an integer') - # if(class(epochs)!='numeric') - # stop('epochs must be an integer') - # if(epochs%%1!=0) - # stop('epochs must be an integer') - # if(!class(seed)%in%c('numeric','NULL')) - # stop('Invalid seed') - #if(class(UsetidyCovariateData)!='logical') - # stop('UsetidyCovariateData must be an TRUE or FALSE') - - result <- list(model='fitCIReNN', param=split(expand.grid( - numberOfRNNLayer=numberOfRNNLayer,units=units, recurrentDropout=recurrentDropout, - layerDropout=layerDropout, - lr =lr, decay=decay, outcomeWeight=outcomeWeight, epochs= epochs, - earlyStoppingMinDelta = earlyStoppingMinDelta, earlyStoppingPatience = earlyStoppingPatience, - bayes= bayes, useDeepEnsemble = 
useDeepEnsemble,numberOfEnsembleNetwork = numberOfEnsembleNetwork, - useVae= useVae,vaeDataSamplingProportion = vaeDataSamplingProportion, vaeValidationSplit= vaeValidationSplit, - vaeBatchSize = vaeBatchSize, vaeLatentDim = vaeLatentDim, vaeIntermediateDim = vaeIntermediateDim, - vaeEpoch = vaeEpoch, vaeEpislonStd = vaeEpislonStd, useGPU = useGPU, maxGPUs = maxGPUs, - seed=ifelse(is.null(seed),'NULL', seed)), - - 1:(length(numberOfRNNLayer)*length(units)*length(recurrentDropout)*length(layerDropout)*length(lr)*length(decay)*length(outcomeWeight)*length(earlyStoppingMinDelta)*length(earlyStoppingPatience)*length(epochs)*max(1,length(seed)))), - name='CIReNN' - ) - - class(result) <- 'modelSettings' - return(result) -} - - -fitCIReNN <- function(plpData,population, param, search='grid', quiet=F, - outcomeId, cohortId, ...){ - # check plpData is coo format: - if (!FeatureExtraction::isCovariateData(plpData$covariateData)) - stop("Needs correct covariateData") - - metaData <- attr(population, 'metaData') - if(!is.null(population$indexes)) - population <- population[population$indexes>0,] - attr(population, 'metaData') <- metaData - - start<-Sys.time() - - result<- toSparseM(plpData,population,map=NULL, temporal=T) - data <- result$data - covariateMap <- result$map - - #remove result to save memory - rm(result) - - if(param[[1]]$useVae){ - #Sampling the data for bulding VAE - vaeSampleData <- data[sample(seq(dim(data)[1]), floor(dim(data)[1]*param[[1]]$vaeDataSamplingProportion),replace=FALSE),,] - - #Build VAE - vae <- buildVae(vaeSampleData, vaeValidationSplit= param[[1]]$vaeValidationSplit, - vaeBatchSize = param[[1]]$vaeBatchSize, vaeLatentDim = param[[1]]$vaeLatentDim, vaeIntermediateDim = param[[1]]$vaeIntermediateDim, - vaeEpoch = param[[1]]$vaeEpoch, vaeEpislonStd = param[[1]]$vaeEpislonStd, useGPU= param[[1]]$useGPU, maxGPUs= param[[1]]$maxGPUs, temporal = TRUE) - #remove sample data for VAE to save memory - vaeSampleData <- NULL - - vaeEnDecoder<- vae[[1]] - vaeEncoder <- vae[[2]] - - #Embedding by using VAE encoder - data <- plyr::aaply(as.array(data), 2, function(x) stats::predict(vaeEncoder, x, batch_size = param$vaeBatchSize)) - data <- aperm(data, perm = c(2,1,3))#rearrange of dimension - - ##Check the performance of vae - # decodedVaeData<-plyr::aaply(as.array(data), 2, function(x) predict(vaeEnDecoder, x, batch_size = param$vaeBatchSzie)) - # decodedVaeData<-aperm(decodedVaeData, c(2,1,3)) - # a1=Epi::ROC(form=as.factor(as.vector(data))~as.vector(decodedVaeData),plot="ROC") - - }else { - vaeEnDecoder <- NULL - vaeEncoder <- NULL - } - - #one-hot encoding - population$y <- keras::to_categorical(population$outcomeCount, 2)#[,2] #population$outcomeCount - - # do cross validation to find hyperParameter - datas <- list(population=population, plpData=data) - - #remove data to save memory - data <- NULL - - #Selection of hyperparameters - hyperParamSel <- lapply(param, function(x) do.call(trainCIReNN, c(x,datas,train=TRUE) )) - hyperSummary <- cbind(do.call(rbind, lapply(hyperParamSel, function(x) x$hyperSum))) - hyperSummary <- as.data.frame(hyperSummary) - hyperSummary$auc <- unlist(lapply(hyperParamSel, function (x) x$auc)) - hyperParamSel <- unlist(lapply(hyperParamSel, function(x) x$auc)) - - #now train the final model and return coef - bestInd <- which.max(abs(unlist(hyperParamSel)-0.5))[1] - finalModel<-do.call(trainCIReNN, c(param[[bestInd]],datas, train=FALSE)) - - covariateRef <- as.data.frame(plpData$covariateData$covariateRef) - incs <- rep(1, nrow(covariateRef)) - 
covariateRef$included <- incs - covariateRef$covariateValue <- rep(0, nrow(covariateRef)) - - #modelTrained <- file.path(outLoc) - param.best <- param[[bestInd]] - - comp <- start-Sys.time() - - # train prediction - prediction <- finalModel$prediction - finalModel$prediction <- NULL - - # return model location - result <- list(model = finalModel$model, - trainCVAuc = -1, # ToDo decide on how to deal with this - hyperParamSearch = hyperSummary, - modelSettings = list(model='fitCIReNN',modelParameters=param.best), - metaData = plpData$metaData, - populationSettings = attr(population, 'metaData'), - outcomeId = outcomeId, - cohortId = cohortId, - varImp = covariateRef, - trainingTime = comp, - covariateMap = covariateMap, - useDeepEnsemble = param.best$useDeepEnsemble, - numberOfEnsembleNetwork = param.best$numberOfEnsembleNetwork, - useVae = param.best$useVae, - vaeBatchSize = param.best$vaeBatchSize, - vaeEnDecoder = vaeEnDecoder, - vaeEncoder = vaeEncoder, - predictionTrain = prediction - ) - class(result) <- 'plpModel' - attr(result, 'type') <- 'deep' - if(param.best$useDeepEnsemble){ - attr(result, 'type') <- 'deepEnsemble' - } - if(param.best$bayes){ - attr(result, 'type') <- 'BayesianDeep' - } - attr(result, 'predictionType') <- 'binary' - - return(result) -} - -trainCIReNN<-function(plpData, population, - numberOfRNNLayer=1,units=128, recurrentDropout=0.2, layerDropout=0.2, - lr =1e-4, decay=1e-5, outcomeWeight = 0, batchSize = 100, - epochs= 100, earlyStoppingMinDelta = c(1e-4), earlyStoppingPatience = c(10), - bayes = T, useDeepEnsemble = F,numberOfEnsembleNetwork =3, - useVae = T, vaeDataSamplingProportion = 0.1,vaeValidationSplit= 0.2, - vaeBatchSize = 100L, vaeLatentDim = 10L, vaeIntermediateDim = 256L, - vaeEpoch = 100L, vaeEpislonStd = 1.0, useGPU = FALSE, maxGPUs = 2, - seed=NULL, train=TRUE){ - - mu <- function(){return(NULL)} - sigma <- function(){return(NULL)} - - output_dim = 2 #output dimension for outcomes - num_MC_samples = 100 #sample number for MC sampling in Bayesian Deep Learning Prediction - if(outcomeWeight == 0){ - outcomeWeight = round(sum(population$outcomeCount==0)/sum(population$outcomeCount>=1),1) #if outcome weight = 0, then it means balanced weight - } - #heteroscedatic loss function - heteroscedastic_loss = function(y_true, y_pred) { - mean = y_pred[, 1:output_dim] - log_var = y_pred[, (output_dim + 1):(output_dim * 2)] - precision = keras::k_exp(-log_var) - keras::k_sum(precision * (y_true - mean) ^ 2 + log_var, axis = 2) - } - - if(!is.null(population$indexes) && train==T){ - writeLines(paste('Training recurrent neural network with ',length(unique(population$indexes)),' fold CV')) - - index_vect <- unique(population$indexes) - perform <- c() - - # create prediction matrix to store all predictions - predictionMat <- population - predictionMat$value <- 0 - attr(predictionMat, "metaData") <- list(predictionType = "binary") - - for(index in 1:length(index_vect )){ - writeLines(paste('Fold ',index, ' -- with ', sum(population$indexes!=index),'train rows')) - - if(useDeepEnsemble){ - predList<-list() - for (i in seq(numberOfEnsembleNetwork)){ - #print(i) - ParallelLogger::logInfo(paste(i,'th process is started')) - pred <- createEnsembleNetwork(train = train, plpData=plpData,population=population,batchSize=batchSize,epochs = epochs, - earlyStoppingMinDelta=earlyStoppingMinDelta, earlyStoppingPatience=earlyStoppingPatience, - train_rows=train_rows,index=index,lr=lr,decay=decay, - units=units,recurrentDropout=recurrentDropout,numberOfRNNLayer=numberOfRNNLayer, 
- layerDropout=layerDropout, useGPU = useGPU, maxGPUs = maxGPUs) - ParallelLogger::logInfo(paste(i,'th process is ended started')) - predList <- append(predList,pred) - } - model <- predList - - # batch prediciton - maxVal <- sum(population$indexes==index) - batches <- lapply(1:ceiling(maxVal/batchSize), function(x) ((x-1)*batchSize+1):min((x*batchSize),maxVal)) - prediction <- population[population$indexes==index,] - prediction$value <- 0 - prediction$sigmas <- 0 - - for(batch in batches){ - - for (i in seq(numberOfEnsembleNetwork)){ - if(i==1){ - muMatrix <- data.frame() - sigmaMatrix <-data.frame() - } - c(mu,sigma) %<-% predList[[i]](inputs=list(as.array(plpData[population$rowId[population$indexes==index],,][batch,,]))) - muMatrix <- rbind(muMatrix,t(as.data.frame(mu[,2]))) - sigmaMatrix <- rbind(sigmaMatrix,t(as.data.frame(sigma[,2]))) - } - - muMean <- apply(muMatrix,2,mean) - muSq <- muMatrix^2 - sigmaSq <- sigmaMatrix^2 - sigmaMean <- apply(sigmaMatrix,2,mean) - sigmaResult = apply(muSq+sigmaSq,2, mean)- muMean^2 - - prediction$value[batch] <- c(muMean) - #if prediction$value is negative, make this positive - prediction$sigmas[batch] <- c(sigmaResult) - - } - prediction$value[prediction$value>1] <- 1 - prediction$value[prediction$value<0] <- 0 - #prediction$value[batch] <- mu[,2] - #prediction$sigmas[batch] <- sigma[,2] - - #writeLines(paste0(dim(pred[,2]), collapse='-')) - #writeLines(paste0(pred[1,2], collapse='-')) - - attr(prediction, "metaData") <- list(predictionType = "binary") - aucVal <- computeAuc(prediction) - perform <- c(perform,aucVal) - - # add the fold predictions and compute AUC after loop - predictionMat$value[population$indexes==index] <- prediction$value - - }else{ - layerInput <- keras::layer_input(shape = c(dim(plpData)[2],dim(plpData)[3])) - if(useGPU){ - ##GRU layer - if(numberOfRNNLayer==1){ - layerOutput <- layerInput %>% - keras::layer_cudnn_gru(units=units, #time step x number of features - return_sequences=FALSE) %>% - keras::layer_dropout(layerDropout) - } - if(numberOfRNNLayer==2){ - layerOutput <- layerInput %>% - keras::layer_cudnn_gru(units=units, #time step x number of features - return_sequences=TRUE) %>% - keras::layer_dropout(layerDropout) %>% - keras::layer_cudnn_gru(units=units, return_sequences=FALSE) %>% - keras::layer_dropout(layerDropout) - } - if(numberOfRNNLayer==3){ - layerOutput <- layerInput %>% - keras::layer_cudnn_gru(units=units, #time step x number of features - return_sequences=TRUE) %>% - keras::layer_dropout(layerDropout) %>% - keras::layer_cudnn_gru(units=units, return_sequences=TRUE) %>% - keras::layer_dropout(layerDropout) %>% - keras::layer_cudnn_gru(units=units, return_sequences=FALSE) %>% - keras::layer_dropout(layerDropout) - } - }else{ - ##GRU layer - if(numberOfRNNLayer == 1){ - layerOutput <- layerInput %>% - keras::layer_gru(units=units, recurrent_dropout = recurrentDropout, - return_sequences=FALSE) - } - if(numberOfRNNLayer > 1 ){ - layerInput %>% # !ISSUE : "missing layerOutput <- "? 
- keras::layer_gru(units=units, recurrent_dropout = recurrentDropout, - return_sequences=TRUE) - } - if(numberOfRNNLayer == 2){ - layerOutput <- layerInput %>% - keras::layer_gru(units=units, recurrent_dropout = recurrentDropout, - return_sequences=FALSE) - } - if(numberOfRNNLayer==3){ - layerOutput <- layerInput %>% - keras::layer_gru(units=units, recurrent_dropout = recurrentDropout, - return_sequences=TRUE) %>% - # ISSUE- I removed layerOutput <- layerInput %>% as this was after a pipe - keras::layer_gru(units=units, recurrent_dropout = recurrentDropout, - return_sequences=FALSE) - } - } - - earlyStopping = keras::callback_early_stopping(monitor = "val_loss", patience=earlyStoppingPatience, - mode="auto",min_delta = earlyStoppingMinDelta) - reduceLr = keras::callback_reduce_lr_on_plateau(monitor="val_loss", factor =0.1, - patience = 5,mode = "auto", - min_delta = 1e-5, cooldown = 0, min_lr = 0) - - class_weight=list("0"=1,"1"=outcomeWeight) - - if(bayes){ - mean = layerOutput %>% - layer_concrete_dropout(layer = keras::layer_dense(units = output_dim)) - - log_var = layerOutput %>% - layer_concrete_dropout(layer = keras::layer_dense(units = output_dim)) - - output = keras::layer_concatenate(list(mean, log_var)) - model = keras::keras_model(layerInput, output) - #model = keras::keras_model(keras::layer_input(shape = c(dim(plpData)[2],dim(plpData)[3])), output) - model %>% keras::compile( - optimizer = "adam", - loss = heteroscedastic_loss, - metrics = c(keras::custom_metric("heteroscedastic_loss", heteroscedastic_loss)) - ) - - }else{ - model<- layerInput %>% - keras::layer_dense(units=2, activation='softmax') - model %>% keras::compile( - loss = 'binary_crossentropy', - metrics = c('accuracy'), - optimizer = keras::optimizer_rmsprop(lr = lr,decay = decay) - ) - } - - data <- plpData[population$rowId[population$indexes!=index],,] - - #Extract validation set first - 10k people or 5% - valN <- min(10000,sum(population$indexes!=index)*0.05) - val_rows <- sample(1:sum(population$indexes!=index), valN, replace=FALSE) - train_rows <- c(1:sum(population$indexes!=index))[-val_rows] - - sampling_generator <- function(data, population, batchSize, train_rows, index){ - function(){ - gc() - rows<-sample(train_rows, batchSize, replace=FALSE) - - list(as.array(data[rows,,]), population$y[population$indexes!=index,][rows,]) - } - } - - - history <- model %>% keras::fit_generator(sampling_generator(data,population,batchSize,train_rows, index), - steps_per_epoch = sum(population$indexes!=index)/batchSize, - epochs=epochs, - validation_data=list(as.array(data[val_rows,,]), - population$y[population$indexes!=index,][val_rows,]), - callbacks=list(earlyStopping,reduceLr), - class_weight=class_weight) - - # batch prediciton - maxVal <- sum(population$indexes==index) - batches <- lapply(1:ceiling(maxVal/batchSize), function(x) ((x-1)*batchSize+1):min((x*batchSize),maxVal)) - prediction <- population[population$indexes==index,] - prediction$value <- 0 - - if(bayes){ - prediction$epistemicUncertainty <- 0 - prediction$aleatoricUncertainty <- 0 - for(batch in batches){ - MC_samples <- array(0, dim = c(num_MC_samples, length(batch), 2 * output_dim)) - for (k in 1:num_MC_samples){ - MC_samples[k,, ] = stats::predict(model, as.array(plpData[population$rowId[population$indexes==index],,][batch,,])) - #keras::predict_proba(model, as.array(plpData[population$rowId[population$indexes==index],,][batch,,])) - } - pred <- apply(MC_samples[,,output_dim], 2, mean) - epistemicUncertainty <- apply(MC_samples[,,output_dim], 2, 
stats::var) - logVar = MC_samples[, , output_dim * 2] - - if(length(dim(logVar))<=1){ - aleatoricUncertainty = exp(mean(logVar)) - }else{ - aleatoricUncertainty = exp(colMeans(logVar)) - } - - prediction$value[batch] <- pred - prediction$epistemicUncertainty[batch] = epistemicUncertainty - prediction$aleatoricUncertainty[batch] = aleatoricUncertainty - - } - - }else{ - for(batch in batches){ - pred <- keras::predict_proba(model, as.array(plpData[population$rowId[population$indexes==index],,][batch,,])) - prediction$value[batch] <- pred[,2] - #writeLines(paste0(dim(pred[,2]), collapse='-')) - #writeLines(paste0(pred[1,2], collapse='-')) - } - } - prediction$value[prediction$value>1] <- 1 - prediction$value[prediction$value<0] <- 0 - attr(prediction, "metaData") <- list(predictionType = "binary") - aucVal <- computeAuc(prediction) - perform <- c(perform,aucVal) - - # add the fold predictions and compute AUC after loop - predictionMat$value[population$indexes==index] <- prediction$value - # add uncertainty - predictionMat$aleatoricUncertainty[population$indexes==index] <- prediction$aleatoricUncertainty - predictionMat$epistemicUncertainty[population$indexes==index] <- prediction$epistemicUncertainty - } - - } - - auc <- computeAuc(predictionMat) - foldPerm <- perform - - # Output ---------------------------------------------------------------- - param.val <- paste0('RNNlayer Number: ', numberOfRNNLayer, '-- units: ',units,'-- recurrentDropout: ', recurrentDropout, - 'layerDropout: ',layerDropout,'-- lr: ', lr, - '-- decay: ', decay,'-- outcomeWeight',outcomeWeight, '-- batchSize: ',batchSize, '-- epochs: ', epochs) - writeLines('==========================================') - writeLines(paste0('CIReNN with parameters:', param.val,' obtained an AUC of ',auc)) - writeLines('==========================================') - - } else { - if(useDeepEnsemble){ - predList<-list() - for (i in seq(numberOfEnsembleNetwork)){ - #print(i) - pred <- createEnsembleNetwork(train = train, plpData=plpData,population=population,batchSize=batchSize,epochs = epochs, - earlyStoppingMinDelta=earlyStoppingMinDelta, earlyStoppingPatience=earlyStoppingPatience, - train_rows=train_rows,index=index,lr=lr,decay=decay, - units=units,recurrentDropout=recurrentDropout,numberOfRNNLayer=numberOfRNNLayer, - layerDropout=layerDropout, useGPU = useGPU, maxGPUs = maxGPUs) - - predList <- append(predList,pred) - } - model <- predList - - # batch prediciton - maxVal <- nrow(population) - batches <- lapply(1:ceiling(maxVal/batchSize), function(x) ((x-1)*batchSize+1):min((x*batchSize),maxVal)) - prediction <- population - prediction$value <- 0 - prediction$sigmas <- 0 - - for(batch in batches){ - - for (i in seq(numberOfEnsembleNetwork)){ - if(i == 1){ - muMatrix <- data.frame() - sigmaMatrix <-data.frame() - } - c(mu,sigma) %<-% predList[[i]](inputs=list(as.array(plpData[batch,,]))) - muMatrix <- rbind(muMatrix,t(as.data.frame(mu[,2]))) - sigmaMatrix <- rbind(sigmaMatrix,t(as.data.frame(sigma[,2]))) - } - - muMean <- apply(muMatrix,2,mean) - muSq <- muMatrix^2 - sigmaSq <- sigmaMatrix^2 - sigmaMean <- apply(sigmaMatrix,2,mean) - sigmaResult = apply(muSq+sigmaSq,2, mean)- muMean^2 - - prediction$value[batch] <- c(muMean) - prediction$sigmas[batch] <- c(sigmaResult) - } - prediction$value[prediction$value>1] <- 1 - prediction$value[prediction$value<0] <- 0 - - - }else{ - layerInput <- keras::layer_input(shape = c(dim(plpData)[2],dim(plpData)[3])) - if(useGPU){ - ##GRU layer - if(numberOfRNNLayer==1){ - layerOutput <- layerInput %>% - 
keras::layer_cudnn_gru(units=units, #time step x number of features - return_sequences=FALSE) %>% - keras::layer_dropout(layerDropout) - } - if(numberOfRNNLayer==2){ - layerOutput <- layerInput %>% - keras::layer_cudnn_gru(units=units, #time step x number of features - return_sequences=TRUE) %>% - keras::layer_dropout(layerDropout) %>% - keras::layer_cudnn_gru(units=units, return_sequences=FALSE) %>% - keras::layer_dropout(layerDropout) - } - if(numberOfRNNLayer==3){ - layerOutput <- layerInput %>% - keras::layer_cudnn_gru(units=units, #time step x number of features - return_sequences=TRUE) %>% - keras::layer_dropout(layerDropout) %>% - keras::layer_cudnn_gru(units=units, return_sequences=TRUE) %>% - keras::layer_dropout(layerDropout) %>% - keras::layer_cudnn_gru(units=units, return_sequences=FALSE) %>% - keras::layer_dropout(layerDropout) - } - }else{ - ##GRU layer - if(numberOfRNNLayer==1){ - layerOutput <- layerInput %>% - keras::layer_gru(units=units, recurrent_dropout = recurrentDropout, - return_sequences=FALSE) - } - if(numberOfRNNLayer>1 ){ - layerInput %>% # ISSUE - "layerInput <- " missing? - keras::layer_gru(units=units, recurrent_dropout = recurrentDropout, - return_sequences=TRUE) - } - if(numberOfRNNLayer==2){ - layerOutput <- layerInput %>% - keras::layer_gru(units=units, recurrent_dropout = recurrentDropout, - return_sequences=FALSE) - } - if(numberOfRNNLayer==3){ - layerOutput <- layerInput %>% - keras::layer_gru(units=units, recurrent_dropout = recurrentDropout, - return_sequences=TRUE) %>% - #layerOutput <- layerInput %>% ISSUE - pipe above? - keras::layer_gru(units=units, recurrent_dropout = recurrentDropout, - return_sequences=FALSE) - } - } - - earlyStopping = keras::callback_early_stopping(monitor = "val_loss", patience=earlyStoppingPatience, - mode="auto",min_delta = earlyStoppingMinDelta) - reduceLr = keras::callback_reduce_lr_on_plateau(monitor="val_loss", factor =0.1, - patience = 5,mode = "auto", - min_delta = 1e-5, cooldown = 0, min_lr = 0) - - class_weight = list("0" = 1, - "1" = outcomeWeight) - - if(bayes){ - mean = layerOutput %>% - layer_concrete_dropout(layer = keras::layer_dense(units = output_dim)) - - log_var = layerOutput %>% - layer_concrete_dropout(layer = keras::layer_dense(units = output_dim)) - - output = keras::layer_concatenate(list(mean, log_var)) - model = keras::keras_model(layerInput, output) - #model = keras::keras_model(keras::layer_input(shape = c(dim(plpData)[2],dim(plpData)[3])), output) - model %>% keras::compile( - optimizer = "adam", - loss = heteroscedastic_loss, - metrics = c(keras::custom_metric("heteroscedastic_loss", heteroscedastic_loss)) - ) - - }else{ - model <- layerInput %>% - keras::layer_dense(units=2, activation='softmax') - model %>% keras::compile( - loss = 'binary_crossentropy', - metrics = c('accuracy'), - optimizer = keras::optimizer_rmsprop(lr = lr,decay = decay) - ) - } - - data <- plpData[population$rowId,,] - - #Extract validation set first - 10k people or 5% - valN <- min(10000,length(population$indexes)*0.05) - val_rows <- sample(1:length(population$indexes), valN, replace=FALSE) - train_rows <- c(1:length(population$indexes))[-val_rows] - - sampling_generator2 <- function(data, population, batchSize, train_rows){ - function(){ - gc() - rows<-sample(train_rows, batchSize, replace=FALSE) - list(as.array(data[rows,,]), population$y[rows,]) - } - } - - - history <- model %>% keras::fit_generator(sampling_generator2(data,population,batchSize,train_rows), - steps_per_epoch = 
nrow(population[-val_rows,])/batchSize, - epochs=epochs, - validation_data=list(as.array(data[val_rows,,]), - population$y[val_rows,]), - callbacks=list(earlyStopping,reduceLr), - class_weight=class_weight, - view_metrics=F) - - - # batched prediciton - maxVal <- nrow(population) - batches <- lapply(1:ceiling(maxVal/batchSize), function(x) ((x-1)*batchSize+1):min((x*batchSize),maxVal)) - prediction <- population - prediction$value <- 0 - - if(bayes){ - prediction$epistemicUncertainty <- 0 - prediction$aleatoricUncertainty <- 0 - for(batch in batches){ - MC_samples <- array(0, dim = c(num_MC_samples, length(batch), 2 * output_dim)) - for (k in 1:num_MC_samples){ - MC_samples[k,, ] = stats::predict(model, as.array(plpData[batch,,])) - #keras::predict_proba(model, as.array(plpData[population$rowId[population$indexes==index],,][batch,,])) - } - pred <- apply(MC_samples[,,output_dim], 2, mean) - epistemicUncertainty <- apply(MC_samples[,,output_dim], 2, stats::var) - logVar = MC_samples[, , output_dim * 2] - if(length(dim(logVar)) <= 1){ - aleatoricUncertainty = exp(mean(logVar)) - }else{ - aleatoricUncertainty = exp(colMeans(logVar)) - - } - prediction$value[batch] <- pred - prediction$epistemicUncertainty[batch] = epistemicUncertainty - prediction$aleatoricUncertainty[batch] = aleatoricUncertainty - } - - }else{ - for(batch in batches){ - pred <- keras::predict_on_batch(model, as.array(plpData[batch,,])) - prediction$value[batch] <- pred[,2] - } - - } - prediction$value[prediction$value>1] <- 1 - prediction$value[prediction$value<0] <- 0 - - attr(prediction, "metaData") <- list(predictionType = "binary") - auc <- computeAuc(prediction) - foldPerm <- auc - predictionMat <- prediction - - } - - } - result <- list(model=model, - auc=auc, - prediction = predictionMat, - hyperSum = unlist(list(numberOfRNNLayer=numberOfRNNLayer, - units=units, recurrentDropout=recurrentDropout, - layerDropout=layerDropout,lr =lr, decay=decay,outcomeWeight=outcomeWeight, - batchSize = batchSize, epochs= epochs, earlyStoppingMinDelta = earlyStoppingMinDelta, - earlyStoppingPatience=earlyStoppingPatience, - useDeepEnsemble = useDeepEnsemble, - numberOfEnsembleNetwork =numberOfEnsembleNetwork, - useVae = useVae, vaeDataSamplingProportion =vaeDataSamplingProportion ,vaeValidationSplit = vaeValidationSplit, - vaeBatchSize =vaeBatchSize, - vaeLatentDim = vaeLatentDim, - vaeIntermediateDim = vaeIntermediateDim, - vaeEpoch = vaeEpoch, vaeEpislonStd = vaeEpislonStd)) - ) - return(result) - -} - -#function for building vae -buildVae<-function(data, vaeValidationSplit= 0.2, vaeBatchSize = 100L, vaeLatentDim = 10L, vaeIntermediateDim = 256L, - vaeEpoch = 100L, vaeEpislonStd = 1.0, useGPU= FALSE, maxGPUs = NULL, temporal = TRUE){ - if (temporal){ - dataSample <- data %>% - apply(3, as.numeric) - } else{ - dataSample <- data - } - originalDim <- dim(dataSample)[2] - K <- keras::backend() - x <- keras::layer_input (shape =originalDim) - h <- keras::layer_dense (x, vaeIntermediateDim, activation = 'relu') - z_mean <- keras::layer_dense(h, vaeLatentDim) - z_log_var <- keras::layer_dense(h, vaeLatentDim) - - sampling<- function(arg){ - z_mean <- arg[,1:vaeLatentDim] - z_log_var <- arg[, (vaeLatentDim+1):(2*vaeLatentDim)] - - epsilon <- keras::k_random_normal( - shape = c(keras::k_shape(z_mean)[[1]]), - mean = 0., - stddev = vaeEpislonStd - ) - - z_mean + keras::k_exp(z_log_var/2)*epsilon - } - - z <- keras::layer_concatenate(list(z_mean, z_log_var)) %>% - keras::layer_lambda(sampling) - - #we instantiate these layers separately so as 
to reuse them later - decoder_h <- keras::layer_dense(units = vaeIntermediateDim, activation = 'relu') - decoder_mean <- keras::layer_dense (units = originalDim, activation = 'sigmoid') - h_decoded <- decoder_h (z) - x_decoded_mean <- decoder_mean(h_decoded) - - #end-to-end autoencoder - vae <- keras::keras_model (x,x_decoded_mean) - #encoder, from inputs to latent space - encoder <- keras::keras_model(x, z_mean) - - #generator, from latent space to reconstruted inputs - decoder_input <- keras::layer_input (shape = vaeLatentDim) - h_decoded_2 <- decoder_h(decoder_input) - x_decoded_mean_2 <- decoder_mean(h_decoded_2) - generator <- keras::keras_model (decoder_input, x_decoded_mean_2) - - vae_loss <- function(x, x_decoded_mean){ - xent_loss <- (originalDim/1.0)* keras::loss_binary_crossentropy(x, x_decoded_mean) - k1_loss <- -0.5 * keras::k_mean(1 + z_log_var - keras::k_square(z_mean) - keras::k_exp(z_log_var), axis = -1L) - xent_loss + k1_loss - } - #Activating parallelisation of GPU in encoder - if(useGPU & (maxGPUs>1) ){ - vae <- keras::multi_gpu_model(vae,gpus = maxGPUs) - } - - vae %>% keras::compile (optimizer = "rmsprop", loss = vae_loss) - #if (!is.null(dataValidation)) dataValidation<-list(dataValidation,dataValidation) - vaeEarlyStopping=keras::callback_early_stopping(monitor = "val_loss", patience=5,mode="auto",min_delta = 1e-3) - naanStopping = keras::callback_terminate_on_naan() - csvLogging = keras::callback_csv_logger (filename="./vae.csv",separator = ",", append =TRUE ) - - vae %>% keras::fit ( - dataSample,dataSample - ,shuffle = TRUE - ,epochs = vaeEpoch - ,batch_size = vaeBatchSize - #,validation_data = dataValidation - ,validation_split = vaeValidationSplit - ,callbacks = list(vaeEarlyStopping, - csvLogging, - naanStopping) - ) - return (list (vae,encoder)) -} - -#Defining Gaussian Layer for Deep Ensemble -GaussianLayer <- R6::R6Class("GaussianLayer", - inherit = keras::KerasLayer, - - public = list( - output_dim = NULL, - kernel_1 = NULL, - kernel_2 = NULL, - bias_1 = NULL, - bias_2 = NULL, - - initialize = function(output_dim){ - self$output_dim <- output_dim - }, - build = function(input_shape){ - super$build(input_shape) - - self$kernel_1 = self$add_weight(name = 'kernel_1', - shape = list(as.integer(input_shape[[2]]), self$output_dim), #list(30, self$output_dim),#shape = keras::shape(30, self$output_dim), - initializer = keras::initializer_glorot_normal(), - trainable = TRUE) - self$kernel_2 = self$add_weight(name = 'kernel_2', - shape = list(as.integer(input_shape[[2]]), self$output_dim),#list(30, self$output_dim), #shape = keras::shape(30, self$output_dim), - initializer = keras::initializer_glorot_normal(), - trainable = TRUE) - self$bias_1 = self$add_weight(name = 'bias_1', - shape = list(self$output_dim), #shape = keras::shape(self$output_dim), - initializer = keras::initializer_glorot_normal(), - trainable = TRUE) - self$bias_2 = self$add_weight(name = 'bias_2', - shape = list(self$output_dim), #shape = keras::shape(self$output_dim), - initializer = keras::initializer_glorot_normal(), - trainable = TRUE) - }, - - call = function(x, mask = NULL){ - output_mu = keras::k_dot(x, self$kernel_1) + self$bias_1 - output_sig = keras::k_dot(x, self$kernel_2) + self$bias_2 - output_sig_pos = keras::k_log(1 + keras::k_exp(output_sig)) + 1e-06 - return (list(output_mu, output_sig_pos)) - }, - - - compute_output_shape = function(input_shape){ - return (list ( - list(input_shape[[1]], self$output_dim), - list(input_shape[[1]], self$output_dim) ) - ) - } - ) -) - -#define 
layer wrapper function for Deep Ensemble -layer_custom <- function(object, output_dim, name = NULL, trainable = TRUE) { - keras::create_layer(GaussianLayer, object, list( - output_dim = as.integer(output_dim), - name = name, - trainable = trainable - )) -} - -#Custom loss function for Deep Ensemble -custom_loss <- function(sigma){ - gaussian_loss <- function(y_true,y_pred){ - tensorflow::tf$reduce_mean(0.5*tensorflow::tf$log(sigma) + 0.5*tensorflow::tf$div(tensorflow::tf$square(y_true - y_pred), sigma)) + 1e-6 - } - return(gaussian_loss) -} - -#Create Deep Ensemble Network function -createEnsembleNetwork<-function(train, plpData,population,batchSize,epochs, earlyStoppingPatience, earlyStoppingMinDelta, - train_rows=NULL,index=NULL,lr,decay, - units,recurrentDropout,numberOfRNNLayer,layerDropout, useGPU = useGPU, maxGPUs = maxGPUs){ - - mu <- function(){return(NULL)} - sigma <- function(){return(NULL)} - - if(useGPU){ - ##GRU layer - layerInput <- keras::layer_input(shape = c(dim(plpData)[2],dim(plpData)[3])) - - if(numberOfRNNLayer==1){ - layers <- layerInput %>% keras::layer_cudnn_gru(units=units, #time step x number of features - return_sequences=FALSE) %>% - keras::layer_dropout(layerDropout) %>% - keras::layer_dense(units=2, activation='softmax') - } - if(numberOfRNNLayer==2){ - layers <- layerInput %>% keras::layer_cudnn_gru(units=units, #time step x number of features - return_sequences=TRUE) %>% - keras::layer_dropout(layerDropout) %>% - keras::layer_gru(units=units, return_sequences=FALSE) %>% - keras::layer_dropout(layerDropout) %>% - keras::layer_dense(units=2, activation='softmax') - } - if(numberOfRNNLayer==3){ - layers <- layerInput %>% keras::layer_cudnn_gru(units=units, #time step x number of features - return_sequences=TRUE) %>% - keras::layer_dropout(layerDropout) %>% - keras::layer_cudnn_gru(units=units, return_sequences=TRUE) %>% - keras::layer_dropout(layerDropout) %>% - keras::layer_cudnn_gru(units=units, return_sequences=FALSE) %>% - keras::layer_dropout(layerDropout) %>% - keras::layer_dense(units=2, activation='softmax') - } - }else{ - ##GRU layer - layerInput <- keras::layer_input(shape = c(dim(plpData)[2],dim(plpData)[3])) - - if(numberOfRNNLayer==1){ - layers <- layerInput %>% keras::layer_gru(units=units, recurrent_dropout = recurrentDropout, #time step x number of features - return_sequences=FALSE) %>% keras::layer_dropout(layerDropout) %>% - keras::layer_dense(units=2, activation='softmax') - } - if(numberOfRNNLayer==2){ - layers <- layerInput %>% keras::layer_gru(units=units, recurrent_dropout = recurrentDropout, #time step x number of features - return_sequences=TRUE) %>% - keras::layer_gru(units=units, recurrent_dropout = recurrentDropout,return_sequences=FALSE)%>% keras::layer_dropout(layerDropout) %>% - keras::layer_dense(units=2, activation='softmax') - } - if(numberOfRNNLayer==3){ - layers <- layerInput %>% keras::layer_gru(units=units, recurrent_dropout = recurrentDropout, #time step x number of features - return_sequences=TRUE) %>% - keras::layer_gru(units=units, recurrent_dropout = recurrentDropout,return_sequences=TRUE) %>% - keras::layer_gru(units=units, recurrent_dropout = recurrentDropout,return_sequences=FALSE) %>% keras::layer_dropout(layerDropout) %>% - keras::layer_dense(units=2, activation='softmax') - } - } - - c(mu,sigma) %<-% layer_custom(layers, 2, name = 'main_output') - - model <- keras::keras_model(inputs = layerInput,outputs=mu) - - earlyStopping=keras::callback_early_stopping(monitor = "val_loss", patience=earlyStoppingPatience, - 
mode="auto",min_delta = earlyStoppingMinDelta) - #Currently using Parallel GPU makes errors in Deep Ensemble - #if(useGPU & (maxGPUs>1) ) model <- keras::multi_gpu_model(model,gpus = maxGPUs) - - model %>% keras::compile( - loss = custom_loss(!!sigma), - optimizer = keras::optimizer_rmsprop(lr = lr,decay = decay) - ) - - - if(!is.null(population$indexes) && train==T){ - data <- plpData[population$rowId[population$indexes!=index],,] - #Extract validation set first - 10k people or 5% - valN <- min(10000,sum(population$indexes!=index)*0.05) - val_rows<-sample(1:sum(population$indexes!=index), valN, replace=FALSE) - train_rows <- c(1:sum(population$indexes!=index))[-val_rows] - - sampling_generator<-function(data, population, batchSize, train_rows, index){ - function(){ - gc() - rows<-sample(train_rows, batchSize, replace=FALSE) - - list(as.array(data[rows,,]), population$y[population$indexes!=index,][rows,]) - } - } - - history <- model %>% keras::fit_generator(sampling_generator(data,population,batchSize,train_rows, index), - steps_per_epoch = sum(population$indexes!=index)/batchSize, - epochs=epochs, - validation_data=list(as.array(data[val_rows,,]), - population$y[population$indexes!=index,][val_rows,])#, - ,callbacks=list(earlyStopping) - #callbacks=list(earlyStopping,reduceLr), - ) - - }else{ - data <- plpData[population$rowId,,] - - #Extract validation set first - 10k people or 5% - valN <- min(10000,length(population$indexes)*0.05) - val_rows<-sample(1:length(population$indexes), valN, replace=FALSE) - train_rows <- c(1:length(population$indexes))[-val_rows] - - sampling_generator2<-function(data, population, batchSize, train_rows){ - function(){ - gc() - rows<-sample(train_rows, batchSize, replace=FALSE) - list(as.array(data[rows,,]), population$y[rows,]) - } - } - - - history <- model %>% keras::fit_generator(sampling_generator2(data,population,batchSize,train_rows), - steps_per_epoch = nrow(population[-val_rows,])/batchSize, - epochs=epochs, - validation_data=list(as.array(data[val_rows,,]), - population$y[val_rows,]), - callbacks=list(earlyStopping), - view_metrics=F) - - } - #ParallelLogger::logInfo('right before get_intermediate') - layer_name = 'main_output' - get_intermediate = keras::k_function(inputs=list(model$input), - outputs=model$get_layer(layer_name)$output) - #ParallelLogger::logInfo('right after get_intermediate') - return(get_intermediate) -} - -#Custom layer for Bayesian Drop Out Layer -ConcreteDropout <- R6::R6Class("ConcreteDropout", - - inherit = keras::KerasWrapper, - - public = list( - weight_regularizer = NULL, - dropout_regularizer = NULL, - init_min = NULL, - init_max = NULL, - is_mc_dropout = NULL, - supports_masking = TRUE, - p_logit = NULL, - p = NULL, - - initialize = function(weight_regularizer, - dropout_regularizer, - init_min, - init_max, - is_mc_dropout) { - self$weight_regularizer <- weight_regularizer - self$dropout_regularizer <- dropout_regularizer - self$is_mc_dropout <- is_mc_dropout - self$init_min <- keras::k_log(init_min) - keras::k_log(1 - init_min) - self$init_max <- keras::k_log(init_max) - keras::k_log(1 - init_max) - }, - - build = function(input_shape) { - super$build(input_shape) - - self$p_logit <- super$add_weight( - name = "p_logit", - shape = keras::shape(1), - initializer = keras::initializer_random_uniform(self$init_min, self$init_max), - trainable = TRUE - ) - - self$p <- keras::k_sigmoid(self$p_logit) - - input_dim <- input_shape[[2]] - - weight <- private$py_wrapper$layer$kernel - - kernel_regularizer <- 
self$weight_regularizer * - keras::k_sum(keras::k_square(weight)) / - (1 - self$p) - - dropout_regularizer <- self$p * keras::k_log(self$p) - dropout_regularizer <- dropout_regularizer + - (1 - self$p) * keras::k_log(1 - self$p) - dropout_regularizer <- dropout_regularizer * - self$dropout_regularizer * - keras::k_cast(input_dim, keras::k_floatx()) - - regularizer <- keras::k_sum(kernel_regularizer + dropout_regularizer) - super$add_loss(regularizer) - }, - - concrete_dropout = function(x) { - eps <- keras::k_cast_to_floatx(keras::k_epsilon()) - temp <- 0.1 - - unif_noise <- keras::k_random_uniform(shape = keras::k_shape(x)) - - drop_prob <- keras::k_log(self$p + eps) - - keras::k_log(1 - self$p + eps) + - keras::k_log(unif_noise + eps) - - keras::k_log(1 - unif_noise + eps) - drop_prob <- keras::k_sigmoid(drop_prob / temp) - - random_tensor <- 1 - drop_prob - - retain_prob <- 1 - self$p - x <- x * random_tensor - x <- x / retain_prob - x - }, - - call = function(x, mask = NULL, training = NULL) { - if (self$is_mc_dropout) { - super$call(self$concrete_dropout(x)) - } else { - k_in_train_phase( - function() - super$call(self$concrete_dropout(x)), - super$call(x), - training = training - ) - } - } - ) -) -#define layer wrapper for Bayesian Drop-out layer -layer_concrete_dropout <- function(object, - layer, - weight_regularizer = 1e-6, - dropout_regularizer = 1e-5, - init_min = 0.1, - init_max = 0.1, - is_mc_dropout = TRUE, - name = NULL, - trainable = TRUE) { - keras::create_wrapper(ConcreteDropout, object, list( - layer = layer, - weight_regularizer = weight_regularizer, - dropout_regularizer = dropout_regularizer, - init_min = init_min, - init_max = init_max, - is_mc_dropout = is_mc_dropout, - name = name, - trainable = trainable - )) -} diff --git a/R/CNNTorch.R b/R/CNNTorch.R deleted file mode 100644 index b2d021d..0000000 --- a/R/CNNTorch.R +++ /dev/null @@ -1,178 +0,0 @@ -# @file CNNTorch.R -# -# Copyright 2020 Observational Health Data Sciences and Informatics -# -# This file is part of PatientLevelPrediction -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -#' Create setting for CNN model with python -#' @param nbfilters The number of filters -#' @param epochs The number of epochs -#' @param seed A seed for the model -#' @param class_weight The class weight used for imbalanced data: -#' 0: Inverse ratio between positives and negatives -#' -1: Focal loss -#' @param type It can be normal 'CNN', 'CNN_LSTM', CNN_MLF' with multiple kernels with different kernel size, -#' 'CNN_MIX', 'ResNet' and 'CNN_MULTI' -#' -#' @examples -#' \dontrun{ -#' model.cnnTorch <- setCNNTorch() -#' } -#' @export -setCNNTorch <- function(nbfilters=c(16, 32), epochs=c(20, 50), seed=0, class_weight = 0, type = 'CNN'){ - - ParallelLogger::logWarn('This model has broken - please use setCNN() or setCNN2() instead ') - - # set seed - if(is.null(seed[1])){ - seed <- as.integer(sample(100000000,1)) - } - - result <- list(model='fitCNNTorch', param=split(expand.grid(nbfilters=nbfilters, - epochs=epochs, seed=seed[1], - class_weight = class_weight, type = type), - 1:(length(nbfilters)*length(epochs)) ), - name='CNN Torch') - - class(result) <- 'modelSettings' - - return(result) -} - - -fitCNNTorch <- function(population, plpData, param, search='grid', quiet=F, - outcomeId, cohortId, ...){ - - # check plpData is libsvm format or convert if needed - if (!FeatureExtraction::isCovariateData(plpData$covariateData)) - stop("Needs correct covariateData") - - if(colnames(population)[ncol(population)]!='indexes'){ - warning('indexes column not present as last column - setting all index to 1') - population$indexes <- rep(1, nrow(population)) - } - - - start <- Sys.time() - - population$rowIdPython <- population$rowId-1 #to account for python/r index difference #subjectId - pPopulation <- as.matrix(population[,c('rowIdPython','outcomeCount','indexes')]) - - result <- toSparseTorchPython(plpData,population, map=NULL, temporal=T) - - outLoc <- createTempModelLoc() - # clear the existing model pickles - for(file in dir(outLoc)) - file.remove(file.path(outLoc,file)) - - # do cross validation to find hyperParameter - hyperParamSel <- lapply(param, function(x) do.call(trainCNNTorch, listAppend(x, - list(plpData = result$data, - population = pPopulation, - train=TRUE, - modelOutput=outLoc)) )) - - hyperSummary <- cbind(do.call(rbind, param), unlist(hyperParamSel)) - - #now train the final model and return coef - bestInd <- which.max(abs(unlist(hyperParamSel)-0.5))[1] - finalModel <- do.call(trainCNNTorch, listAppend(param[[bestInd]], - list(plpData = result$data, - population = pPopulation, - train=FALSE, - modelOutput=outLoc))) - - - covariateRef <- as.data.frame(plpData$covariateData$covariateRef) - incs <- rep(1, nrow(covariateRef)) - covariateRef$included <- incs - covariateRef$covariateValue <- rep(0, nrow(covariateRef)) - - modelTrained <- file.path(outLoc) - param.best <- param[[bestInd]] - - comp <- start-Sys.time() - - # train prediction - pred <- as.matrix(finalModel) - pred[,1] <- pred[,1] + 1 # adding one to convert from python to r indexes - colnames(pred) <- c('rowId','outcomeCount','indexes', 'value') - pred <- as.data.frame(pred) - attr(pred, "metaData") <- list(predictionType="binary") - - pred$value <- 1-pred$value - prediction <- merge(population, pred[,c('rowId','value')], by='rowId') - - - # return model location - result <- list(model = modelTrained, - trainCVAuc = -1, # ToDo decide on how to deal with this - hyperParamSearch = hyperSummary, - modelSettings = list(model='fitCNNTorch',modelParameters=param.best), - metaData = plpData$metaData, - populationSettings = 
attr(population, 'metaData'), - outcomeId=outcomeId, - cohortId=cohortId, - varImp = covariateRef, - trainingTime =comp, - dense=1, - covariateMap=result$map, # I think this is need for new data to map the same? - predictionTrain = prediction - ) - class(result) <- 'plpModel' - attr(result, 'type') <- 'pythonReticulate' - attr(result, 'predictionType') <- 'binary' - - return(result) -} - - -trainCNNTorch <- function(plpData, population, epochs=50, nbfilters = 16, seed=0, class_weight= 0, type = 'CNN', train=TRUE, modelOutput, quiet=F){ - - train_deeptorch <- function(){return(NULL)} - - python_dir <- system.file(package='PatientLevelPrediction','python') - e <- environment() - reticulate::source_python(system.file(package='PatientLevelPrediction','python','deepTorchFunctions.py'), envir = e) - - - result <- train_deeptorch(population = population, - plpData = plpData, - epochs = as.integer(epochs), - nbfilters = as.integer(nbfilters), - seed = as.integer(seed), - class_weight = as.double(class_weight), - model_type = as.character(type), - train = train, - modelOutput = modelOutput, - quiet = quiet - ) - - if(train){ - # then get the prediction - pred <- as.matrix(result) - colnames(pred) <- c('rowId','outcomeCount','indexes', 'value') - pred <- as.data.frame(pred) - attr(pred, "metaData") <- list(predictionType="binary") - - pred$value <- 1-pred$value - auc <- computeAuc(pred) - writeLines(paste0('Model obtained CV AUC of ', auc)) - return(auc) - } - - return(result) - -} diff --git a/R/CovNN.R b/R/CovNN.R deleted file mode 100644 index 4c05ffb..0000000 --- a/R/CovNN.R +++ /dev/null @@ -1,535 +0,0 @@ -# @file CovNN.R -# -# Copyright 2020 Observational Health Data Sciences and Informatics -# -# This file is part of PatientLevelPrediction -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#' Create setting for multi-resolution CovNN model (stucture based on https://arxiv.org/pdf/1608.00647.pdf CNN1) -#' -#' @param batchSize The number of samples to used in each batch during model training -#' @param outcomeWeight The weight assined to the outcome (make greater than 1 to reduce unballanced label issue) -#' @param lr The learning rate -#' @param decay The decay of the learning rate -#' @param dropout [currently not used] the dropout rate for regularisation -#' @param epochs The number of times data is used to train the model (e.g., epoches=1 means data only used once to train) -#' @param filters The number of columns output by each convolution -#' @param kernelSize The number of time dimensions used for each convolution -#' @param loss The loss function implemented -#' @param seed The random seed -#' -#' @examples -#' \dontrun{ -#' model.CovNN <- setCovNN() -#' } -#' @export -setCovNN <- function(batchSize = 1000, - outcomeWeight=1, - lr=0.00001, - decay=0.000001, - dropout=0, - epochs = 10, - filters = 3, kernelSize = 10, - loss = "binary_crossentropy", - seed=NULL ){ - #[TODO: add input checks...] 
- - ensure_installed("keras") - - if(!is.null(seed)){ - warning('seed currently not implemented in CovNN') - } - - - result <- list(model='fitCovNN', param=split(expand.grid( - batchSize=batchSize, - outcomeWeight=outcomeWeight, - lr=lr, - decay=decay, - dropout=dropout, - epochs= epochs,filters=filters, - kernelSize=kernelSize,loss =loss, - seed=ifelse(is.null(seed),'NULL', seed)), - 1:(length(batchSize)*length(outcomeWeight)*length(epochs)* - length(filters)*length(lr)*length(decay)* - length(kernelSize)*length(loss)*max(1,length(seed)))), - name='CovNN' - ) - - class(result) <- 'modelSettings' - return(result) -} - - -fitCovNN <- function(plpData,population, param, search='grid', quiet=F, - outcomeId, cohortId, ...){ - # check plpData is coo format: - if (!FeatureExtraction::isCovariateData(plpData$covariateData)) - stop("Needs correct covariateData") - if(is.null(plpData$timeRef)){ - stop('Data not temporal...') - } - - metaData <- attr(population, 'metaData') - if(!is.null(population$indexes)) - population <- population[population$indexes>0,] - attr(population, 'metaData') <- metaData - - start<-Sys.time() - - result<- toSparseM(plpData,population,map=NULL, temporal=T) - - data <- result$data#[population$rowId,,] - #data<-as.array(data) -- cant make dense on big data! - - #one-hot encoding - population$y <- keras::to_categorical(population$outcomeCount, 2) - #colnames(population$y) <- c('0','1') - - # do cross validation to find hyperParameter - datas <- list(population=population, plpData=data) - hyperParamSel <- lapply(param, function(x) do.call(trainCovNN, c(x,datas,train=TRUE) )) - - hyperSummary <- cbind(do.call(rbind, lapply(hyperParamSel, function(x) x$hyperSum))) - hyperSummary <- as.data.frame(hyperSummary) - hyperSummary$auc <- unlist(lapply(hyperParamSel, function (x) x$auc)) - hyperParamSel<-unlist(lapply(hyperParamSel, function(x) x$auc)) - - #now train the final model and return coef - bestInd <- which.max(abs(unlist(hyperParamSel)-0.5))[1] - finalModel<-do.call(trainCovNN, c(param[[bestInd]],datas, train=FALSE)) - - covariateRef <- as.data.frame(plpData$covariateData$covariateRef) - incs <- rep(1, nrow(covariateRef)) - covariateRef$included <- incs - covariateRef$covariateValue <- rep(0, nrow(covariateRef)) - - #modelTrained <- file.path(outLoc) - param.best <- param[[bestInd]] - - comp <- start-Sys.time() - - # train prediction - prediction <- finalModel$prediction - finalModel$prediction <- NULL - - # return model location - result <- list(model = finalModel$model, - trainCVAuc = -1, # ToDo decide on how to deal with this - hyperParamSearch = hyperSummary, - modelSettings = list(model='fitCovNN',modelParameters=param.best), - metaData = plpData$metaData, - populationSettings = attr(population, 'metaData'), - outcomeId=outcomeId, - cohortId=cohortId, - varImp = covariateRef, - trainingTime =comp, - covariateMap=result$map, - predictionTrain = prediction - ) - class(result) <- 'plpModel' - attr(result, 'type') <- 'deepMulti' - attr(result, 'inputs') <- '3' - attr(result, 'predictionType') <- 'binary' - - return(result) -} - -trainCovNN<-function(plpData, population, - outcomeWeight=1, lr=0.0001, decay=0.9, - dropout=0.5, filters=3, - kernelSize = dim(plpData)[3], - batchSize, epochs, loss= "binary_crossentropy", - seed=NULL, train=TRUE){ - - if(!is.null(population$indexes) && train==T){ - writeLines(paste('Training covolutional multi-resolution neural network with ',length(unique(population$indexes)),' fold CV')) - - index_vect <- unique(population$indexes) - perform <- 
c() - - # create prediction matrix to store all predictions - predictionMat <- population - predictionMat$value <- 0 - attr(predictionMat, "metaData") <- list(predictionType = "binary") - - if(kernelSize>dim(plpData)[3]){ - kernelSize <- dim(plpData)[3] -1 - warning('kernelsize reduced') - } - - for(index in 1:length(index_vect )){ - writeLines(paste('Fold ',index, ' -- with ', sum(population$indexes!=index),'train rows')) - - #submodel1 <- keras::keras_model_sequential() - - submodel1_input <- keras::layer_input(shape=c(dim(plpData)[2], dim(plpData)[3]), - name='submodel1_input') - submodel1_output <- submodel1_input %>% - # Begin with 1D convolutional layer - keras::layer_max_pooling_1d(pool_size = kernelSize) %>% - keras::layer_conv_1d( - #input_shape = c(dim(plpData)[2], dim(plpData)[3]+1-kernelSize), - filters = filters, - kernel_size = floor(dim(plpData)[2]/kernelSize), - padding = "valid" - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - keras::layer_flatten() - - #submodel2 <- keras::keras_model_sequential() - #submodel2 %>% - # keras::layer_reshape(input_shape=c(dim(plpData)[2], dim(plpData)[3]), - # target_shape = c(dim(plpData)[2], dim(plpData)[3])) %>% - submodel2_input <- keras::layer_input(shape=c(dim(plpData)[2], dim(plpData)[3]), - name='submodel2_input') - submodel2_output <- submodel2_input %>% - # Begin with 1D convolutional layer - keras::layer_max_pooling_1d(pool_size = floor(sqrt(kernelSize))) %>% - keras::layer_conv_1d( - #input_shape = c(dim(plpData)[2], dim(plpData)[3]+1-floor(sqrt(kernelSize))), - filters = filters, - kernel_size = floor(dim(plpData)[2]/floor(sqrt(kernelSize))), - padding = "valid" - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - keras::layer_flatten() - - - #submodel3 <- keras::keras_model_sequential() - #submodel3 %>% - submodel3_input <- keras::layer_input(shape=c(dim(plpData)[2], dim(plpData)[3]), - name='submodel3_input') - submodel3_output <- submodel3_input %>% - # Begin with 1D convolutional layer - keras::layer_conv_1d( - input_shape = c(dim(plpData)[2], dim(plpData)[3]), - filters = filters, - kernel_size = kernelSize, - padding = "valid" - ) %>% - # Normalize the activations of the previous layer - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - keras::layer_max_pooling_1d(pool_size = floor(sqrt(kernelSize))) %>% - keras::layer_conv_1d( - filters = filters, - kernel_size = floor((dim(plpData)[2]+1-kernelSize)/floor(sqrt(kernelSize))), - padding = "valid", - use_bias = T - ) %>% - keras::layer_flatten() - - #model <- keras::keras_model_sequential() - - deep_output <- keras::layer_concatenate(list(submodel1_output, - submodel2_output, - submodel3_output)) %>% - keras::layer_dropout(rate=dropout) %>% - # add fully connected layer 2 - keras::layer_dense( - units = 100, - activation = "linear", use_bias=T - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - - # =========== FULLY CONNECTED LAYER 2 - # add drop out of 0.5 - keras::layer_dropout(rate=dropout) %>% - # add fully connected layer 2 - keras::layer_dense( - units = 100, - activation = "linear", use_bias=T - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - - # =========== FINAL LAYER - keras::layer_dropout(rate=dropout) %>% - keras::layer_dense(name = 'final', - units = 2, - activation = "linear", use_bias=T - ) %>% - 
keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'sigmoid', name='deep_output') - - model <- keras::keras_model( - inputs = c(submodel1_input, submodel2_input, submodel3_input), - outputs = c(deep_output) - ) - - - # Prepare model for training - model %>% keras::compile( - loss = "binary_crossentropy", - metrics = c('accuracy'), - optimizer = keras::optimizer_rmsprop(lr = lr,decay = decay) - ) - earlyStopping=keras::callback_early_stopping(monitor = "val_loss", patience=10,mode="auto",min_delta = 1e-4) - reduceLr=keras::callback_reduce_lr_on_plateau(monitor="val_loss", factor =0.1, - patience = 5,mode = "auto", min_delta = 1e-5, cooldown = 0, min_lr = 0) - - class_weight=list("0"=1,"1"=outcomeWeight) - - data <- plpData[population$rowId[population$indexes!=index],,] - - #Extract validation set first - 10k people or 5% - valN <- min(10000,sum(population$indexes!=index)*0.05) - val_rows<-sample(1:sum(population$indexes!=index), valN, replace=FALSE) - train_rows <- c(1:sum(population$indexes!=index))[-val_rows] - - sampling_generator<-function(data, population, batchSize, train_rows, index){ - function(){ - gc() - rows<-sample(train_rows, batchSize, replace=FALSE) - - list(list(as.array(data[rows,,]), - as.array(data[rows,,]), - as.array(data[rows,,])), - population$y[population$indexes!=index,1:2][rows,]) - } - } - - - #print(table(population$y)) - - if(length(train_rows) < batchSize){ - # checking if this fixes issue with batchsize too big - batchSize <- length(train_rows) - ParallelLogger::logInfo('Reduce batchSize to training size') - } - - history <- model %>% keras::fit_generator(sampling_generator(data,population,batchSize,train_rows, index), - steps_per_epoch = length(train_rows)/batchSize, - epochs=epochs, - validation_data=list(list(as.array(data[val_rows,,]), - as.array(data[val_rows,,]), - as.array(data[val_rows,,])), - population$y[population$indexes!=index,1:2][val_rows,]), - callbacks=list(earlyStopping,reduceLr), - class_weight=class_weight) - - - - - # batch prediciton - maxVal <- sum(population$indexes==index) - batches <- lapply(1:ceiling(maxVal/batchSize), function(x) ((x-1)*batchSize+1):min((x*batchSize),maxVal)) - prediction <- population[population$indexes==index,] - prediction$value <- 0 - for(batch in batches){ - pred <- keras::predict_on_batch(model, list(as.array(plpData[population$rowId[population$indexes==index],,][batch,,]), - as.array(plpData[population$rowId[population$indexes==index],,][batch,,]), - as.array(plpData[population$rowId[population$indexes==index],,][batch,,]))) - prediction$value[batch] <- as.double(pred[,2]) - } - - attr(prediction, "metaData") <- list(predictionType = "binary") - aucVal <- computeAuc(prediction) - perform <- c(perform,aucVal) - - # add the fold predictions and compute AUC after loop - predictionMat$value[population$indexes==index] <- prediction$value - - } - - auc <- computeAuc(predictionMat) - foldPerm <- perform - - # Output ---------------------------------------------------------------- - param.val <- paste0('outcomeWeight: ', outcomeWeight, - '-- kernelSize: ',paste0(kernelSize,collapse ='-'), - '-- filters: ', filters, '--loss: ', loss, '-- lr: ', lr, '-- decay: ', decay, - '-- dropout: ', dropout, '-- batchSize: ',batchSize, '-- epochs: ', epochs) - writeLines('==========================================') - writeLines(paste0('CovNN with parameters:', param.val,' obtained an AUC of ',auc)) - writeLines('==========================================') - } else { - - #Initialize model - 
submodel1_input <- keras::layer_input(shape=c(dim(plpData)[2], dim(plpData)[3]), - name='submodel1_input') - submodel1_output <- submodel1_input %>% - # Begin with 1D convolutional layer - keras::layer_max_pooling_1d(pool_size = kernelSize) %>% - keras::layer_conv_1d( - #input_shape = c(dim(plpData)[2], dim(plpData)[3]+1-kernelSize), - filters = filters, - kernel_size = floor(dim(plpData)[2]/kernelSize), - padding = "valid" - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - keras::layer_flatten() - - submodel2_input <- keras::layer_input(shape=c(dim(plpData)[2], dim(plpData)[3]), - name='submodel2_input') - submodel2_output <- submodel2_input %>% - # Begin with 1D convolutional layer - keras::layer_max_pooling_1d(pool_size = floor(sqrt(kernelSize))) %>% - keras::layer_conv_1d( - #input_shape = c(dim(plpData)[2], dim(plpData)[3]+1-floor(sqrt(kernelSize))), - filters = filters, - kernel_size = floor(dim(plpData)[2]/floor(sqrt(kernelSize))), - padding = "valid" - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - keras::layer_flatten() - - - submodel3_input <- keras::layer_input(shape=c(dim(plpData)[2], dim(plpData)[3]), - name='submodel3_input') - submodel3_output <- submodel3_input %>% - # Begin with 1D convolutional layer - keras::layer_conv_1d( - input_shape = c(dim(plpData)[2], dim(plpData)[3]), - filters = filters, - kernel_size = kernelSize, - padding = "valid" - ) %>% - # Normalize the activations of the previous layer - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - keras::layer_max_pooling_1d(pool_size = floor(sqrt(kernelSize))) %>% - keras::layer_conv_1d( - filters = filters, - kernel_size = floor((dim(plpData)[2]+1-kernelSize)/floor(sqrt(kernelSize))), - padding = "valid", - use_bias = T - ) %>% - keras::layer_flatten() - - deep_output <- keras::layer_concatenate(list(submodel1_output, - submodel2_output, - submodel3_output)) %>% - keras::layer_dropout(rate=dropout) %>% - # add fully connected layer 2 - keras::layer_dense( - units = 100, - activation = "linear", use_bias=T - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - - # =========== FULLY CONNECTED LAYER 2 - # add drop out of 0.5 - keras::layer_dropout(rate=dropout) %>% - # add fully connected layer 2 - keras::layer_dense( - units = 100, - activation = "linear", use_bias=T - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - - # =========== FINAL LAYER - keras::layer_dropout(rate=dropout) %>% - keras::layer_dense(name = 'final', - units = 2, - activation = "linear", use_bias=T - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'sigmoid', name='deep_output') - - model <- keras::keras_model( - inputs = c(submodel1_input, submodel2_input, submodel3_input), - outputs = c(deep_output) - ) - - - # Prepare model for training - model %>% keras::compile( - loss = "binary_crossentropy",#loss, - metrics = c('accuracy'), - optimizer = keras::optimizer_rmsprop(lr = lr,decay = decay) - ) - earlyStopping=keras::callback_early_stopping(monitor = "val_loss", patience=10,mode="auto",min_delta = 1e-4) - reduceLr=keras::callback_reduce_lr_on_plateau(monitor="val_loss", factor =0.1, - patience = 5,mode = "auto", min_delta = 1e-5, cooldown = 0, min_lr = 0) - - - class_weight=list("0"=1,"1"=outcomeWeight) - - data <- plpData[population$rowId,,] - - #Extract validation 
set first - 10k people or 5% - valN <- min(10000,length(population$indexes)*0.05) - val_rows<-sample(1:length(population$indexes), valN, replace=FALSE) - train_rows <- c(1:length(population$indexes))[-val_rows] - - - sampling_generator2<-function(data, population, batchSize, train_rows){ - function(){ - gc() - rows<-sample(train_rows, batchSize, replace=FALSE) - list(list(as.array(data[rows,,]), - as.array(data[rows,,]), - as.array(data[rows,,])), population$y[rows,]) - } - } - - - if(length(train_rows) < batchSize){ - # checking if this fixes issue with batchsize too big - batchSize <- length(train_rows) - ParallelLogger::logInfo('Reduce batchSize to training size') - } - - history <- model %>% keras::fit_generator(sampling_generator2(data,population,batchSize,train_rows), - steps_per_epoch = length(train_rows)/batchSize, - epochs=epochs, - validation_data=list(list(as.array(data[val_rows,,]), - as.array(data[val_rows,,]), - as.array(data[val_rows,,])), - population$y[val_rows,]), - callbacks=list(earlyStopping,reduceLr), - class_weight=class_weight, - view_metrics=F) - - # batched prediciton - maxVal <- nrow(population) - batches <- lapply(1:ceiling(maxVal/batchSize), function(x) ((x-1)*batchSize+1):min((x*batchSize),maxVal)) - prediction <- population - prediction$value <- 0 - for(batch in batches){ - pred <- keras::predict_on_batch(model, list(as.array(plpData[batch,,]), - as.array(plpData[batch,,]), - as.array(plpData[batch,,]))) - prediction$value[batch] <- as.double(pred[,2]) - } - - attr(prediction, "metaData") <- list(predictionType = "binary") - auc <- computeAuc(prediction) - foldPerm <- auc - predictionMat <- prediction - } - - result <- list(model=model, - auc=auc, - prediction = predictionMat, - hyperSum = unlist(list(batchSize = batchSize, lr=lr, decay=decay, - outcomeWeight=outcomeWeight, - dropout=dropout,filters=filters, - kernelSize=paste0(kernelSize,collapse ='-'), - epochs=epochs, loss=loss, - fold_auc=foldPerm)) - ) - - return(result) - -} diff --git a/R/CovNN2.R b/R/CovNN2.R deleted file mode 100644 index de7bd45..0000000 --- a/R/CovNN2.R +++ /dev/null @@ -1,461 +0,0 @@ -# @file CovNN2.R -# -# Copyright 2020 Observational Health Data Sciences and Informatics -# -# This file is part of PatientLevelPrediction -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
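
The removed CovNN fitter above and the CovNN2 fitter that follows share one training pattern: keras::fit_generator() is fed a zero-argument closure that samples a fresh batch on every call, while the positive class is up-weighted through class_weight. A minimal, dependency-free sketch of that closure pattern follows; the data and the names x, outcome, gen are illustrative and not part of the package.

# Batch-sampling closure as used by the deleted trainCovNN()/trainCovNN2():
# keras::fit_generator() expects a function of no arguments that returns one
# (inputs, labels) batch per call.
set.seed(42)
x <- matrix(stats::runif(200 * 10), nrow = 200)            # 200 rows, 10 features
outcome <- stats::rbinom(200, 1, 0.2)
y <- cbind(1 - outcome, outcome)                           # one-hot labels, like population$y

sampling_generator <- function(x, y, batchSize, trainRows) {
  function() {
    rows <- sample(trainRows, batchSize, replace = FALSE)  # fresh random batch each call
    list(x[rows, , drop = FALSE], y[rows, , drop = FALSE])
  }
}

gen <- sampling_generator(x, y, batchSize = 32, trainRows = 1:150)
str(gen())                                                 # one batch: 32 x 10 inputs, 32 x 2 labels

Because sample(..., replace = FALSE) cannot draw more rows than exist, the deleted fitters also shrink batchSize to the number of training rows when it is set too large.
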
- -#' Create setting for CovNN2 model - convolution across input and time - https://arxiv.org/pdf/1608.00647.pdf -#' -#' @param batchSize The number of samples to used in each batch during model training -#' @param outcomeWeight The weight assined to the outcome (make greater than 1 to reduce unballanced label issue) -#' @param lr The learning rate -#' @param decay The decay of the learning rate -#' @param dropout [currently not used] the dropout rate for regularisation -#' @param epochs The number of times data is used to train the model (e.g., epoches=1 means data only used once to train) -#' @param filters The number of columns output by each convolution -#' @param kernelSize The number of time dimensions used for each convolution -#' @param loss The loss function implemented -#' @param seed The random seed -#' -#' @examples -#' \dontrun{ -#' model.CovNN <- setCovNN() -#' } -#' @export -setCovNN2 <- function(batchSize = 1000, - outcomeWeight=1, - lr=0.00001, - decay=0.000001, - dropout=0, - epochs = 10, - filters = 3, kernelSize = 10, - loss = "binary_crossentropy", - seed=NULL ){ - #[TODO: add input checks...] - - ensure_installed("keras") - - if(!is.null(seed)){ - warning('seed currently not implemented in CovNN') - } - - result <- list(model='fitCovNN2', param=split(expand.grid( - batchSize=batchSize, - outcomeWeight=outcomeWeight, - lr=lr, - decay=decay, - dropout=dropout, - epochs= epochs,filters=filters, - kernelSize=kernelSize,loss =loss, - seed=ifelse(is.null(seed),'NULL', seed)), - 1:(length(batchSize)*length(outcomeWeight)*length(epochs)* - length(filters)*length(lr)*length(decay)* - length(kernelSize)*length(loss)*max(1,length(seed)))), - name='CovNN2' - ) - - class(result) <- 'modelSettings' - return(result) -} - - -fitCovNN2 <- function(plpData,population, param, search='grid', quiet=F, - outcomeId, cohortId, ...){ - # check plpData is coo format: - if (!FeatureExtraction::isCovariateData(plpData$covariateData)) - stop("Needs correct covariateData") - if(is.null(plpData$timeRef)){ - stop('Data not temporal...') - } - - metaData <- attr(population, 'metaData') - if(!is.null(population$indexes)) - population <- population[population$indexes>0,] - attr(population, 'metaData') <- metaData - - start<-Sys.time() - - result<- toSparseM(plpData,population,map=NULL, temporal=T) - - data <- result$data#[population$rowId,,] - #data<-as.array(data) -- cant make dense on big data! 
- - #one-hot encoding - population$y <- keras::to_categorical(population$outcomeCount, 2) - #colnames(population$y) <- c('0','1') - - # do cross validation to find hyperParameter - datas <- list(population=population, plpData=data) - hyperParamSel <- lapply(param, function(x) do.call(trainCovNN2, c(x,datas,train=TRUE) )) - - hyperSummary <- cbind(do.call(rbind, lapply(hyperParamSel, function(x) x$hyperSum))) - hyperSummary <- as.data.frame(hyperSummary) - hyperSummary$auc <- unlist(lapply(hyperParamSel, function (x) x$auc)) - hyperParamSel<-unlist(lapply(hyperParamSel, function(x) x$auc)) - - #now train the final model and return coef - bestInd <- which.max(abs(unlist(hyperParamSel)-0.5))[1] - finalModel<-do.call(trainCovNN2, c(param[[bestInd]],datas, train=FALSE)) - - covariateRef <- as.data.frame(plpData$covariateData$covariateRef) - incs <- rep(1, nrow(covariateRef)) - covariateRef$included <- incs - covariateRef$covariateValue <- rep(0, nrow(covariateRef)) - - #modelTrained <- file.path(outLoc) - param.best <- param[[bestInd]] - - comp <- start-Sys.time() - - # train prediction - prediction <- finalModel$prediction - finalModel$prediction <- NULL - - # return model location - result <- list(model = finalModel$model, - trainCVAuc = -1, # ToDo decide on how to deal with this - hyperParamSearch = hyperSummary, - modelSettings = list(model='fitCovNN2',modelParameters=param.best), - metaData = plpData$metaData, - populationSettings = attr(population, 'metaData'), - outcomeId=outcomeId, - cohortId=cohortId, - varImp = covariateRef, - trainingTime =comp, - covariateMap=result$map, - predictionTrain = prediction - ) - class(result) <- 'plpModel' - attr(result, 'type') <- 'deep' - attr(result, 'predictionType') <- 'binary' - - return(result) -} - -trainCovNN2<-function(plpData, population, - outcomeWeight=1, lr=0.0001, decay=0.9, - dropout=0.5, filters=3, - kernelSize = dim(plpData)[3], - batchSize, epochs, loss= "binary_crossentropy", - seed=NULL, train=TRUE){ - - if(!is.null(population$indexes) && train==T){ - writeLines(paste('Training covolutional neural network (input and time) with ',length(unique(population$indexes)),' fold CV')) - - index_vect <- unique(population$indexes) - perform <- c() - - # create prediction matrix to store all predictions - predictionMat <- population - predictionMat$value <- 0 - attr(predictionMat, "metaData") <- list(predictionType = "binary") - - for(index in 1:length(index_vect )){ - writeLines(paste('Fold ',index, ' -- with ', sum(population$indexes!=index),'train rows')) - - model <- keras::keras_model_sequential() %>% - - ##model_input <- keras::layer_input(shape=c(dim(plpData)[2], dim(plpData)[3]), - ## name='initial_input') %>% - keras::layer_permute(dims = c(2,1), - input_shape = c(dim(plpData)[2], dim(plpData)[3])) %>% - - # Begin with 1D convolutional layer across input - hidden layer 1 - ##model_output <- keras::layer_input(shape=c(dim(plpData)[3], dim(plpData)[2]), - ## name='second_input') %>% - keras::layer_conv_1d( - filters = filters, - kernel_size = dim(plpData)[3]-2, - padding = "valid" - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - # second layer across input - hidden layer 2 time x filters - keras::layer_conv_1d( - filters = filters, - kernel_size = 3,#filters, - padding = "valid" - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - - # permute bact to time x var - keras::layer_permute(dims = c(2,1)) %>% - #max pool over time and conv - 
keras::layer_max_pooling_1d(pool_size = 2) %>% - keras::layer_conv_1d( - filters = filters, - kernel_size = 2, - padding = "valid" - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - keras::layer_flatten() %>% - # final 2 deep layers with dropout and batchnorm/ relu activation - keras::layer_dropout(rate=dropout) %>% - # add fully connected layer 2 - keras::layer_dense( - units = 100, - activation = "linear", use_bias=T - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - - # =========== FULLY CONNECTED LAYER 2 - # add drop out of 0.5 - keras::layer_dropout(rate=dropout) %>% - # add fully connected layer 2 - keras::layer_dense( - units = 100, - activation = "linear", use_bias=T - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - - # =========== FINAL LAYER - keras::layer_dropout(rate=dropout) %>% - keras::layer_dense(name = 'final', - units = 2, - activation = "linear", use_bias=T - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'sigmoid', name='main_output') - - ##model <- keras::keras_model( - ## inputs = c(model_input), - ## outputs = c(model_output) - ##) - - # Prepare model for training - model %>% keras::compile( - loss = "binary_crossentropy", - metrics = c('accuracy'), - optimizer = keras::optimizer_rmsprop(lr = lr,decay = decay) - ) - earlyStopping=keras::callback_early_stopping(monitor = "val_loss", patience=10,mode="auto",min_delta = 1e-4) - reduceLr=keras::callback_reduce_lr_on_plateau(monitor="val_loss", factor =0.1, - patience = 5,mode = "auto", min_delta = 1e-5, cooldown = 0, min_lr = 0) - - class_weight=list("0"=1,"1"=outcomeWeight) - - data <- plpData[population$rowId[population$indexes!=index],,] - - #Extract validation set first - 10k people or 5% - valN <- min(10000,sum(population$indexes!=index)*0.05) - val_rows<-sample(1:sum(population$indexes!=index), valN, replace=FALSE) - train_rows <- c(1:sum(population$indexes!=index))[-val_rows] - - sampling_generator<-function(data, population, batchSize, train_rows, index){ - function(){ - gc() - rows<-sample(train_rows, batchSize, replace=FALSE) - - list(as.array(data[rows,,]), - population$y[population$indexes!=index,1:2][rows,]) - } - } - - - #print(table(population$y)) - if(length(train_rows) < batchSize){ - # checking if this fixes issue with batchsize too big - batchSize <- length(train_rows) - ParallelLogger::logInfo('Reducing batchSize to training size') - } - - history <- model %>% keras::fit_generator(sampling_generator(data,population,batchSize,train_rows, index), - steps_per_epoch = length(train_rows)/batchSize, - epochs=epochs, - validation_data=list(as.array(data[val_rows,,]), - population$y[population$indexes!=index,1:2][val_rows,]), - callbacks=list(earlyStopping,reduceLr), - class_weight=class_weight) - - - - - # batch prediciton - maxVal <- sum(population$indexes==index) - batches <- lapply(1:ceiling(maxVal/batchSize), function(x) ((x-1)*batchSize+1):min((x*batchSize),maxVal)) - prediction <- population[population$indexes==index,] - prediction$value <- 0 - for(batch in batches){ - pred <- keras::predict_on_batch(model, as.array(plpData[population$rowId[population$indexes==index],,][batch,,])) - prediction$value[batch] <- as.double(pred[,2]) - } - - attr(prediction, "metaData") <- list(predictionType = "binary") - aucVal <- computeAuc(prediction) - perform <- c(perform,aucVal) - - # add the fold predictions and compute AUC 
after loop - predictionMat$value[population$indexes==index] <- prediction$value - - } - - auc <- computeAuc(predictionMat) - foldPerm <- perform - - # Output ---------------------------------------------------------------- - param.val <- paste0('outcomeWeight: ', outcomeWeight, - '-- kernelSize: ',paste0(kernelSize,collapse ='-'), - '-- filters: ', filters, '--loss: ', loss, '-- lr: ', lr, '-- decay: ', decay, - '-- dropout: ', dropout, '-- batchSize: ',batchSize, '-- epochs: ', epochs) - writeLines('==========================================') - writeLines(paste0('CovNN with parameters:', param.val,' obtained an AUC of ',auc)) - writeLines('==========================================') - } else { - - model <- keras::keras_model_sequential() %>% - - keras::layer_permute(dims = c(2,1), - input_shape = c(dim(plpData)[2], dim(plpData)[3])) %>% - - # Begin with 1D convolutional layer across input - hidden layer 1 - keras::layer_conv_1d( - filters = filters, - kernel_size = dim(plpData)[3]-2, - padding = "valid" - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - # second layer across input - hidden layer 2 time x filters - keras::layer_conv_1d( - filters = filters, - kernel_size = 3, - padding = "valid" - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - - # permute bact to time x var - keras::layer_permute(dims = c(2,1)) %>% - #max pool over time and conv - keras::layer_max_pooling_1d(pool_size = 2) %>% - keras::layer_conv_1d( - filters = filters, - kernel_size = 2, - padding = "valid" - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - keras::layer_flatten() %>% - # final 2 deep layers with dropout and batchnorm/ relu activation - keras::layer_dropout(rate=dropout) %>% - # add fully connected layer 2 - keras::layer_dense( - units = 100, - activation = "linear", use_bias=T - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - - # =========== FULLY CONNECTED LAYER 2 - # add drop out of 0.5 - keras::layer_dropout(rate=dropout) %>% - # add fully connected layer 2 - keras::layer_dense( - units = 100, - activation = "linear", use_bias=T - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - - # =========== FINAL LAYER - keras::layer_dropout(rate=dropout) %>% - keras::layer_dense(name = 'final', - units = 2, - activation = "linear", use_bias=T - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'sigmoid', name='main_output') - - - - # Prepare model for training - model %>% keras::compile( - loss = "binary_crossentropy",#loss, - metrics = c('accuracy'), - optimizer = keras::optimizer_rmsprop(lr = lr,decay = decay) - ) - earlyStopping=keras::callback_early_stopping(monitor = "val_loss", patience=10,mode="auto",min_delta = 1e-4) - reduceLr=keras::callback_reduce_lr_on_plateau(monitor="val_loss", factor =0.1, - patience = 5,mode = "auto", min_delta = 1e-5, cooldown = 0, min_lr = 0) - - - class_weight=list("0"=1,"1"=outcomeWeight) - - data <- plpData[population$rowId,,] - - #Extract validation set first - 10k people or 5% - valN <- min(10000,length(population$indexes)*0.05) - val_rows<-sample(1:length(population$indexes), valN, replace=FALSE) - train_rows <- c(1:length(population$indexes))[-val_rows] - - - sampling_generator2<-function(data, population, batchSize, train_rows){ - function(){ - gc() - rows<-sample(train_rows, 
batchSize, replace=FALSE) - list(as.array(data[rows,,]), population$y[rows,]) - } - } - - if(length(train_rows) < batchSize){ - # checking if this fixes issue with batchsize too big - batchSize <- length(train_rows) - ParallelLogger::logInfo('Reducing batchSize to training size') - } - - - history <- model %>% keras::fit_generator(sampling_generator2(data,population,batchSize,train_rows), - steps_per_epoch = length(train_rows)/batchSize, - epochs=epochs, - validation_data=list(as.array(data[val_rows,,]), - population$y[val_rows,]), - callbacks=list(earlyStopping,reduceLr), - class_weight=class_weight, - view_metrics=F) - - # batched prediciton - maxVal <- nrow(population) - batches <- lapply(1:ceiling(maxVal/batchSize), function(x) ((x-1)*batchSize+1):min((x*batchSize),maxVal)) - prediction <- population - prediction$value <- 0 - for(batch in batches){ - pred <- keras::predict_on_batch(model, as.array(plpData[batch,,])) - prediction$value[batch] <- as.double(pred[,2]) - } - - attr(prediction, "metaData") <- list(predictionType = "binary") - auc <- computeAuc(prediction) - foldPerm <- auc - predictionMat <- prediction - } - - result <- list(model=model, - auc=auc, - prediction = predictionMat, - hyperSum = unlist(list(batchSize = batchSize, lr=lr, decay=decay, - outcomeWeight=outcomeWeight, - dropout=dropout,filters=filters, - kernelSize=paste0(kernelSize,collapse ='-'), - epochs=epochs, loss=loss, - fold_auc=foldPerm)) - ) - - return(result) - -} diff --git a/R/DeepNN.R b/R/DeepNN.R deleted file mode 100644 index 19a5dbe..0000000 --- a/R/DeepNN.R +++ /dev/null @@ -1,557 +0,0 @@ -# @file DeepNN.R -# -# Copyright 2020 Observational Health Data Sciences and Informatics -# -# This file is part of PatientLevelPrediction -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#' Create setting for DeepNN model -#' -#' @param units The number of units of the deep network - as a list of vectors -#' @param layer_dropout The layer dropout rate (regularisation) -#' @param lr Learning rate -#' @param decay Learning rate decay over each update. 
-#' @param outcome_weight The weight of the outcome class in the loss function -#' @param batch_size The number of data points to use per training batch -#' @param epochs Number of times to iterate over dataset -#' @param seed Random seed used by deep learning model -#' -#' @examples -#' \dontrun{ -#' model <- setDeepNN() -#' } -#' @export -setDeepNN <- function(units=list(c(128, 64), 128), layer_dropout=c(0.2), - lr =c(1e-4), decay=c(1e-5), outcome_weight = c(1.0), batch_size = c(100), - epochs= c(100), seed=NULL ){ - - ensure_installed("keras") - - # if(class(indexFolder)!='character') - # stop('IndexFolder must be a character') - # if(length(indexFolder)>1) - # stop('IndexFolder must be one') - # - # if(class(units)!='numeric') - # stop('units must be a numeric value >0 ') - # if(units<1) - # stop('units must be a numeric value >0 ') - # - # #if(length(units)>1) - # # stop('units can only be a single value') - # - # if(class(recurrent_dropout)!='numeric') - # stop('dropout must be a numeric value >=0 and <1') - # if( (recurrent_dropout<0) | (recurrent_dropout>=1)) - # stop('dropout must be a numeric value >=0 and <1') - # if(class(layer_dropout)!='numeric') - # stop('layer_dropout must be a numeric value >=0 and <1') - # if( (layer_dropout<0) | (layer_dropout>=1)) - # stop('layer_dropout must be a numeric value >=0 and <1') - # if(class(lr)!='numeric') - # stop('lr must be a numeric value >0') - # if(lr<=0) - # stop('lr must be a numeric value >0') - # if(class(decay)!='numeric') - # stop('decay must be a numeric value >=0') - # if(decay<=0) - # stop('decay must be a numeric value >=0') - # if(class(outcome_weight)!='numeric') - # stop('outcome_weight must be a numeric value >=0') - # if(outcome_weight<=0) - # stop('outcome_weight must be a numeric value >=0') - # if(class(batch_size)!='numeric') - # stop('batch_size must be an integer') - # if(batch_size%%1!=0) - # stop('batch_size must be an integer') - # if(class(epochs)!='numeric') - # stop('epochs must be an integer') - # if(epochs%%1!=0) - # stop('epochs must be an integer') - # if(!class(seed)%in%c('numeric','NULL')) - # stop('Invalid seed') - #if(class(UsetidyCovariateData)!='logical') - # stop('UsetidyCovariateData must be an TRUE or FALSE') - - param <- expand.grid(units=units, - layer_dropout=layer_dropout, - lr =lr, decay=decay, outcome_weight=outcome_weight,epochs= epochs, - seed=ifelse(is.null(seed),'NULL', seed)) - param$units1=unlist(lapply(param$units, function(x) x[1])) - param$units2=unlist(lapply(param$units, function(x) x[2])) - param$units3=unlist(lapply(param$units, function(x) x[3])) - - result <- list(model='fitDeepNN', param=split(param, - 1:(length(units)*length(layer_dropout)*length(lr)*length(decay)*length(outcome_weight)*length(epochs)*max(1,length(seed)))), - name='DeepNN' - ) - - class(result) <- 'modelSettings' - return(result) -} - - -fitDeepNN <- function(plpData,population, param, search='grid', quiet=F, - outcomeId, cohortId, ...){ - # check plpData is coo format: - if (!FeatureExtraction::isCovariateData(plpData$covariateData)){ - stop('DeepNN requires correct covariateData') - } - if(!is.null(plpData$timeRef)){ - warning('Data temporal but deepNN uses non-temporal data...') - } - - metaData <- attr(population, 'metaData') - if(!is.null(population$indexes)) - population <- population[population$indexes>0,] - attr(population, 'metaData') <- metaData - - start<-Sys.time() - - result<- toSparseMDeep(plpData,population,map=NULL, temporal=F) - data <- result$data - - #one-hot encoding - population$y <- 
keras::to_categorical(population$outcomeCount, length(unique(population$outcomeCount))) - - # do cross validation to find hyperParameter - datas <- list(population=population, plpData=data) - hyperParamSel <- lapply(param, function(x) do.call(trainDeepNN, c(x,datas,train=TRUE) )) - - hyperSummary <- cbind(do.call(rbind, lapply(hyperParamSel, function(x) x$hyperSum))) - hyperSummary <- as.data.frame(hyperSummary) - hyperSummary$auc <- unlist(lapply(hyperParamSel, function (x) x$auc)) - hyperParamSel<-unlist(lapply(hyperParamSel, function(x) x$auc)) - - #now train the final model and return coef - bestInd <- which.max(abs(unlist(hyperParamSel)-0.5))[1] - finalModel<-do.call(trainDeepNN, c(param[[bestInd]],datas, train=FALSE)) - - covariateRef <- as.data.frame(plpData$covariateData$covariateRef) - incs <- rep(1, nrow(covariateRef)) - covariateRef$included <- incs - covariateRef$covariateValue <- rep(0, nrow(covariateRef)) - - #modelTrained <- file.path(outLoc) - param.best <- param[[bestInd]] - - comp <- start-Sys.time() - - # train prediction - prediction <- finalModel$prediction - finalModel$prediction <- NULL - - # return model location - result <- list(model = finalModel$model, - trainCVAuc = -1, # ToDo decide on how to deal with this - hyperParamSearch = hyperSummary, - modelSettings = list(model='fitDeepNN',modelParameters=param.best), - metaData = plpData$metaData, - populationSettings = attr(population, 'metaData'), - outcomeId=outcomeId, - cohortId=cohortId, - varImp = covariateRef, - trainingTime =comp, - covariateMap=result$map, - predictionTrain = prediction - ) - class(result) <- 'plpModel' - attr(result, 'type') <- 'deep' - attr(result, 'predictionType') <- 'binary' - - return(result) -} - -trainDeepNN<-function(plpData, population, - units1=128, units2= NULL, units3=NULL, - layer_dropout=0.2, - lr =1e-4, decay=1e-5, outcome_weight = 1.0, batch_size = 100, - epochs= 100, seed=NULL, train=TRUE, ...){ - - ParallelLogger::logInfo(paste('Training deep neural network with ',length(unique(population$indexes)),' fold CV')) - if(!is.null(population$indexes) && train==T){ - index_vect <- unique(population$indexes) - perform <- c() - - # create prediction matrix to store all predictions - predictionMat <- population - predictionMat$value <- 0 - attr(predictionMat, "metaData") <- list(predictionType = "binary") - - for(index in 1:length(index_vect )){ - ParallelLogger::logInfo(paste('Fold ',index, ' -- with ', sum(population$indexes!=index),'train rows')) - - model <- keras::keras_model_sequential() - - if(is.na(units2)){ - model %>% - keras::layer_dense(units=units1, #activation='identify', - input_shape=ncol(plpData)) %>% - keras::layer_dropout(layer_dropout) %>% - keras::layer_dense(units=2, activation='sigmoid', use_bias = T) - } else if(is.na(units3)){ - model %>% - keras::layer_dense(units=units1, #activation='identify', - input_shape=ncol(plpData)) %>% - keras::layer_dropout(layer_dropout) %>% - keras::layer_dense(units=units2 #,activation='identify' - ) %>% - keras::layer_dropout(layer_dropout) %>% - keras::layer_dense(units=2, activation='sigmoid', use_bias = T) - } else{ - model %>% - keras::layer_dense(units=units1, #activation='identify', - input_shape=ncol(plpData)) %>% - keras::layer_dropout(layer_dropout) %>% - keras::layer_dense(units=units2 #,activation='identify' - ) %>% - keras::layer_dropout(layer_dropout) %>% - keras::layer_dense(units=units3 #,activation='identify' - ) %>% - keras::layer_dropout(layer_dropout) %>% - keras::layer_dense(units=2, activation='sigmoid', 
use_bias = T) - } - - - # Prepare model for training - model %>% keras::compile( - loss = "binary_crossentropy", - metrics = c('accuracy'), - optimizer = keras::optimizer_rmsprop(lr = lr,decay = decay) - ) - earlyStopping=keras::callback_early_stopping(monitor = "val_loss", patience=10,mode="auto",min_delta = 1e-4) - reduceLr=keras::callback_reduce_lr_on_plateau(monitor="val_loss", factor =0.1, - patience = 5,mode = "auto", min_delta = 1e-5, cooldown = 0, min_lr = 0) - - class_weight=list("0"=1,"1"=outcome_weight) - - data <- plpData[population$rowId[population$indexes!=index],] - - #Extract validation set first - 10k people or 5% - valN <- min(10000,sum(population$indexes!=index)*0.05) - val_rows<-sample(1:sum(population$indexes!=index), valN, replace=FALSE) - train_rows <- c(1:sum(population$indexes!=index))[-val_rows] - - sampling_generator<-function(data, population, batch_size, train_rows, index){ - function(){ - gc() - rows<-sample(train_rows, batch_size, replace=FALSE) - - list(as.array(data[rows,]), - population$y[population$indexes!=index,1:2][rows,]) - } - } - - - #print(table(population$y)) - - history <- model %>% keras::fit_generator(sampling_generator(data,population,batch_size,train_rows, index), - steps_per_epoch = floor(sum(population$indexes!=index)/batch_size), - epochs=epochs, - validation_data=list(as.array(data[val_rows,]), - population$y[population$indexes!=index,1:2][val_rows,]), - callbacks=list(earlyStopping,reduceLr), - class_weight=class_weight) - - - # batch prediciton - maxVal <- sum(population$indexes==index) - batches <- lapply(1:ceiling(maxVal/batch_size), function(x) ((x-1)*batch_size+1):min((x*batch_size),maxVal)) - prediction <- population[population$indexes==index,] - prediction$value <- 0 - for(batch in batches){ - pred <- keras::predict_proba(model, as.array(plpData[population$rowId[population$indexes==index],][batch,,drop=FALSE])) - prediction$value[batch] <- pred[,2] - } - - attr(prediction, "metaData") <- list(predictionType = "binary") - aucVal <- computeAuc(prediction) - perform <- c(perform,aucVal) - - # add the fold predictions and compute AUC after loop - predictionMat$value[population$indexes==index] <- prediction$value - - } - - auc <- computeAuc(predictionMat) - foldPerm <- perform - - # Output ---------------------------------------------------------------- - param.val <- paste0('units1: ',units1,'units2: ',units2,'units3: ',units3, - 'layer_dropout: ',layer_dropout,'-- lr: ', lr, - '-- decay: ', decay, '-- batch_size: ',batch_size, '-- epochs: ', epochs) - ParallelLogger::logInfo('==========================================') - ParallelLogger::logInfo(paste0('DeepNN with parameters:', param.val,' obtained an AUC of ',auc)) - ParallelLogger::logInfo('==========================================') - - } else { - - model <- keras::keras_model_sequential() - if(is.na(units2)){ - model %>% - keras::layer_dense(units=units1, #activation='identify', - input_shape=ncol(plpData)) %>% - keras::layer_dropout(layer_dropout) %>% - keras::layer_dense(units=2, activation='sigmoid', use_bias = T) - } else if(is.na(units3)){ - model %>% - keras::layer_dense(units=units1, #activation='identify', - input_shape=ncol(plpData)) %>% - keras::layer_dropout(layer_dropout) %>% - keras::layer_dense(units=units2 #,activation='identify' - ) %>% - keras::layer_dropout(layer_dropout) %>% - keras::layer_dense(units=2, activation='sigmoid', use_bias = T) - } else{ - model %>% - keras::layer_dense(units=units1, #activation='identify', - input_shape=ncol(plpData)) %>% - 
keras::layer_dropout(layer_dropout) %>% - keras::layer_dense(units=units2 #,activation='identify' - ) %>% - keras::layer_dropout(layer_dropout) %>% - keras::layer_dense(units=units3 #,activation='identify' - ) %>% - keras::layer_dropout(layer_dropout) %>% - keras::layer_dense(units=2, activation='sigmoid', use_bias = T) - } - - # Prepare model for training - model %>% keras::compile( - loss = "binary_crossentropy", - metrics = c('accuracy'), - optimizer = keras::optimizer_rmsprop(lr = lr,decay = decay) - ) - earlyStopping=keras::callback_early_stopping(monitor = "val_loss", patience=10,mode="auto",min_delta = 1e-4) - reduceLr=keras::callback_reduce_lr_on_plateau(monitor="val_loss", factor =0.1, - patience = 5,mode = "auto", min_delta = 1e-5, cooldown = 0, min_lr = 0) - - class_weight=list("0"=1,"1"=outcome_weight) - - #Extract validation set first - 10k people or 5% - valN <- min(10000,nrow(population)*0.05) - val_rows<-sample(1:nrow(population), valN, replace=FALSE) - train_rows <- c(1:nrow(population))[-val_rows] - - sampling_generator2<-function(data, population, batch_size, train_rows){ - function(){ - gc() - rows<-sample(train_rows, batch_size, replace=FALSE) - - list(as.array(data[rows,,]), - population$y[,1:2][rows,]) - } - } - - history <- model %>% keras::fit_generator(sampling_generator2(plpData,population,batch_size,train_rows), - steps_per_epoch = nrow(population)/batch_size, - epochs=epochs, - validation_data=list(as.array(plpData[val_rows,,]), - population$y[val_rows,1:2]), - callbacks=list(earlyStopping,reduceLr), - class_weight=class_weight) - - - # batch prediciton - maxVal <- nrow(population) - batches <- lapply(1:ceiling(maxVal/batch_size), function(x) ((x-1)*batch_size+1):min((x*batch_size),maxVal)) - prediction <- population - prediction$value <- 0 - for(batch in batches){ - pred <- keras::predict_proba(model, as.array(plpData[batch,,drop=FALSE])) - prediction$value[batch] <- pred[,2] - } - - attr(prediction, "metaData") <- list(predictionType = "binary") - auc <- computeAuc(prediction) - foldPerm <- auc - predictionMat <- prediction - } - - result <- list(model=model, - auc=auc, - prediction = predictionMat, - hyperSum = unlist(list(units1=units1,units2=units2,units3=units3, - layer_dropout=layer_dropout,lr =lr, decay=decay, - batch_size = batch_size, epochs= epochs)) - ) - return(result) - -} - - - -#' [Under development] Transfer learning -#' -#' @param plpResult The plp result when training a kersa deep learning model on big data -#' @param plpData The new data to fine tune the model on -#' @param population The population for the new data -#' @param fixLayers boolean specificying whether to fix weights in model being transferred -#' @param includeTop If TRUE the final layer of the model being transferred is removed -#' @param addLayers vector specifying nodes in each layer to add e.g. c(100,10) will add another layer with 100 nodels and then a final layer with 10 -#' @param layerDropout Add dropout to each new layer (binary vector length of addLayers) -#' @param layerActivation Activation function for each new layer (string vector length of addLayers) -#' @param outcomeWeight The weight to assign the class 1 when training the model -#' @param batchSize Size of each batch for updating layers -#' @param epochs Number of epoches to run -#' @examples -#' \dontrun{ -#' modelSet <- setDeepNN() -#' plpResult <- runPlp(plpData, population, modelSettings = modelSet, ...) -#' -#' transferLearning(...) 
-#' } -#' @export -transferLearning <- function(plpResult, - plpData, - population, - fixLayers = T, - includeTop= F, - addLayers = c(100,10), - layerDropout = c(T,T), - layerActivation = c('relu','softmax'), - outcomeWeight = 1, - batchSize = 10000, - epochs=20){ - - # checks - if(!is.null(addLayers)){ - if(length(addLayers)!=length(layerDropout)){ - stop('Layer vector not same length as layer dropout vector') - } - if(length(addLayers)!=length(layerActivation)){ - stop('Layer vector not same length as layer activation vector') - } - } - if(batchSize > nrow(population)){ - warning('batchSize is too big for your data...') - batchSize = nrow(population)/10 - } - - if(!includeTop){ - # remove last layer if not dropout - if(length(grep('Dropout',as.character(plpResult$model$model$layers[[length(plpResult$model$model$layers)]])))==0){ - keras::pop_layer(plpResult$model$model) - } - } - - # create the base pre-trained - base_model <- plpResult$model$model - - # add our custom layers - predictions <- base_model$output - - - # fix the older layers - if(fixLayers){ - for (i in 1:length(base_model$layers)){ - try({base_model$layers[[i]]$trainable <- F}, silent = TRUE) - } - } - - ## add loop over settings here - move code to new function and call it - ##==== - # !!check this looping logic works - if(!is.null(addLayers)){ - for(i in 1:length(addLayers)){ - predictions <- keras::layer_dense(predictions,units = addLayers[i], activation = layerActivation[i]) - if(layerDropout[i]){ - predictions <- keras::layer_dropout(predictions, rate = 0.5) - } - } - } - - # add find layer for binary outcome - predictions <- keras::layer_dense(predictions,units = 2, activation = 'sigmoid') - - - # this is the model we will train - model <- keras::keras_model(inputs = base_model$input, outputs = predictions) - - # compile the model (should be done *after* setting layers to non-trainable) - model %>% keras::compile(optimizer = 'rmsprop', loss = 'binary_crossentropy', - metrics = c('accuracy')) - - - # make this input... 
- earlyStopping=keras::callback_early_stopping(monitor = "val_loss", patience=10,mode="auto",min_delta = 1e-4) - reduceLr=keras::callback_reduce_lr_on_plateau(monitor="val_loss", factor =0.1, - patience = 5,mode = "auto", min_delta = 1e-5, cooldown = 0, - min_lr = 0) - - class_weight=list("0"=1,"1"=outcomeWeight) - - - sampling_generator<-function(data, population, batchSize, train_rows){ - function(){ - gc() - rows<-sample(train_rows, batchSize, replace=FALSE) - - list(as.array(data[rows,]), - population$y[rows,1:2]) - } - } - - # convert plpdata to matrix: - metaData <- attr(population, 'metaData') - if(!is.null(population$indexes)) - population <- population[population$indexes>0,] - attr(population, 'metaData') <- metaData - - result<- toSparseM(plpData,population,map=plpResult$model$covariateMap, temporal=F) - data <- result$data - population$y <- keras::to_categorical(population$outcomeCount, length(unique(population$outcomeCount))) - - #Extract validation set first - 10k people or 5% - valN <- min(10000,nrow(population)*0.05) - val_rows<-sample(1:nrow(population), valN, replace=FALSE) - train_rows <- c(1:nrow(population))[-val_rows] - - history <- model %>% keras::fit_generator(sampling_generator(data,population,batchSize,train_rows), - steps_per_epoch = nrow(population)/batchSize, - epochs=epochs, - validation_data=list(as.array(data[val_rows,,]), - population$y[val_rows,1:2]), - callbacks=list(earlyStopping,reduceLr), - class_weight=class_weight) - - - # batch prediciton - maxVal <- nrow(population) - batches <- lapply(1:ceiling(maxVal/batchSize), function(x) ((x-1)*batchSize+1):min((x*batchSize),maxVal)) - prediction <- population - prediction$value <- 0 - for(batch in batches){ - pred <- model$predict(as.array(data[batch,,drop=FALSE])) # added drop=FALSE - prediction$value[batch] <- pred[,2] - } - - ##=== - - attr(prediction, "metaData") <- list(predictionType = "binary") - auc <- computeAuc(prediction) - foldPerm <- auc - predictionMat <- prediction - -result <- list(model=model, - auc=auc, - prediction = predictionMat, - hyperSum = list(fixLayers = fixLayers, - addLayers = addLayers, - layerDropout = layerDropout, - layerActivation = layerActivation)) - - return(result) - -} diff --git a/R/DeepPatientLevelPrediction.R b/R/DeepPatientLevelPrediction.R new file mode 100644 index 0000000..3dbf7f4 --- /dev/null +++ b/R/DeepPatientLevelPrediction.R @@ -0,0 +1,29 @@ +# @file DeepPatientLevelPrediction.R +# +# Copyright 2022 Observational Health Data Sciences and Informatics +# +# This file is part of DeepPatientLevelPrediction +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
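
The new file continues below with a package-level roxygen block that declares three imports for the remaining code: dplyr's %>%, data.table's :=, and rlang's .data pronoun. A hypothetical snippet showing what each import is for; the data frame, column names, and helper calls here are illustrative only and not part of this patch.

library(dplyr)        # attached here only so the standalone snippet runs;
library(data.table)   # in package code the roxygen importFrom tags do this job

covariates <- data.frame(rowId = 1:4, covariateId = c(1, 1, 2, 3))

# %>% pipes between verbs; .data$ marks covariateId as a column of the piped
# data rather than a global variable, which avoids R CMD check notes
covariates %>%
  dplyr::count(.data$covariateId)

# := is data.table's by-reference assignment: it adds the column in place
dt <- data.table::as.data.table(covariates)
dt[, nPerCovariate := .N, by = covariateId]
dt
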
+ +#' DeepPatientLevelPrediction +#' +#' @description A package for running predictions using deep learning on +#' data in the OMOP CDM +#' +#' @docType package +#' @name DeepPatientLevelPrediction +#' @importFrom dplyr %>% +#' @importFrom data.table := +#' @importFrom rlang .data +NULL \ No newline at end of file diff --git a/R/Formatting.R b/R/Formatting.R deleted file mode 100644 index 47feec8..0000000 --- a/R/Formatting.R +++ /dev/null @@ -1,173 +0,0 @@ -# @file Formatting.R -# -# Copyright 2020 Observational Health Data Sciences and Informatics -# -# This file is part of DeepPatientLevelPrediction -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#' Convert the plpData in COO format into a sparse R matrix -#' Converts the standard plpData to a sparse matrix -#' This function converts the covariate file from ffdf in COO format into a sparse matrix from -#' the package Matrix -#' @param plpData An object of type \code{plpData} with covariate in coo format - the patient level prediction -#' data extracted from the CDM. -#' @param population The population to include in the matrix -#' @param map A covariate map (telling us the column number for covariates) -#' @param temporal Whether you want to convert temporal data -#' @examples -#' #TODO -#' -#' @return -#' Returns a list, containing the data as a sparse matrix, the plpData covariateRef -#' and a data.frame named map that tells us what covariate corresponds to each column -#' This object is a list with the following components: \describe{ -#' \item{data}{A sparse matrix with the rows corresponding to each person in the plpData and the columns corresponding to the covariates.} -#' \item{covariateRef}{The plpData covariateRef.} -#' \item{map}{A data.frame containing the data column ids and the corresponding covariateId from covariateRef.} -#' } -#' -#' @export -toSparseMDeep <- function(plpData, - population, - map=NULL, - temporal=F){ - # check logger - ParallelLogger::logInfo(paste0('starting toSparseM')) - ParallelLogger::logDebug(paste0('covariates nrow: ', nrow(plpData$covariateData$covariates))) - ParallelLogger::logDebug(paste0('covariateRef nrow: ', nrow(plpData$covariateData$covariateRef))) - - - #assign newIds to covariateRef - newcovariateData <- MapCovariates(plpData$covariateData, - population, - mapping=map) - - ParallelLogger::logDebug(paste0('Max covariateId in covariates: ',as.data.frame(newcovariateData$covariates %>% dplyr::summarise(max = max(.data$covariateId, na.rm=T))))) - ParallelLogger::logDebug(paste0('# covariates in covariateRef: ', nrow(newcovariateData$covariateRef))) - ParallelLogger::logDebug(paste0('Max rowId in covariates: ', as.data.frame(newcovariateData$covariates %>% dplyr::summarise(max = max(.data$rowId, na.rm=T))))) - - maxY <- as.data.frame(newcovariateData$mapping %>% dplyr::summarise(max=max(.data$newCovariateId, na.rm = TRUE)))$max - ParallelLogger::logDebug(paste0('Max newCovariateId in mapping: ',maxY)) - maxX <- max(population$rowId) - ParallelLogger::logDebug(paste0('Max rowId in population: 
',maxX)) - - # chunk then add - if(!temporal){ - - ParallelLogger::logInfo(paste0('Casting plpData into vectors and creating sparseMatrix - you need suffciently large RAM for this ')) - ParallelLogger::logInfo(paste0('toSparseMDeep non temporal used')) - - data <- Matrix::sparseMatrix(i=as.data.frame(newcovariateData$covariates %>% dplyr::select(.data$rowId))$rowId, - j=as.data.frame(newcovariateData$covariates %>% dplyr::select(.data$covariateId))$covariateId, - x=as.data.frame(newcovariateData$covariates %>% dplyr::select(.data$covariateValue))$covariateValue, - dims=c(maxX,maxY)) - } else { - ParallelLogger::logInfo(paste0('toSparseMDeep temporal used')) - - minT <- min(newcovariateData$covariates %>% dplyr::select(.data$timeId) %>% collect(), na.rm = T) - maxT <- max(newcovariateData$covariates %>% dplyr::select(.data$timeId) %>% collect(), na.rm = T) - - ParallelLogger::logTrace(paste0('Min time:', minT)) - ParallelLogger::logTrace(paste0('Max time:', maxT)) - - # do we want to use for(i in sort(plpData$timeRef$timeId)){ ? - for(i in minT:maxT){ - - if(newcovariateData$covariates %>% dplyr::filter(.data$timeId==i) %>% dplyr::summarise(n=n()) %>% dplyr::collect() > 0 ){ - ParallelLogger::logInfo(paste0('Found covariates for timeId ', i)) - - dataPlp <- Matrix::sparseMatrix(i= as.integer(as.character(as.data.frame(newcovariateData$covariates %>% dplyr::filter(.data$timeId==i) %>% dplyr::select(.data$rowId))$rowId)), - j= as.integer(as.character(as.data.frame(newcovariateData$covariates %>% dplyr::filter(.data$timeId==i) %>% dplyr::select(.data$covariateId))$covariateId)), - x= as.double(as.character(as.data.frame(newcovariateData$covariates %>% dplyr::filter(.data$timeId==i) %>% dplyr::select(.data$covariateValue))$covariateValue)), - dims=c(maxX,maxY)) - - data_array <- slam::as.simple_sparse_array(dataPlp) - #extending one more dimesion to the array - data_array <- slam::extend_simple_sparse_array(data_array, MARGIN =c(1L)) - ParallelLogger::logInfo(paste0('Finished Mapping covariates for timeId ', i)) - - } else { - data_array <- tryCatch(slam::simple_sparse_array(i=matrix(c(1,1,1), ncol = 3), - v=0, - dim=c(maxX,1, maxY)) - ) - - } - - # add na timeIds - how? 
- - #binding arrays along the dimesion - if(i==minT) { - result_array <- data_array - }else{ - result_array <- slam::abind_simple_sparse_array(result_array,data_array,MARGIN=2L) - } - } - data <- result_array - } - - ParallelLogger::logDebug(paste0('Sparse matrix with dimensionality: ', paste(dim(data), collapse=',') )) - - ParallelLogger::logInfo(paste0('finishing toSparseM')) - - result <- list(data=data, - covariateRef=as.data.frame(newcovariateData$covariateRef), - map=as.data.frame(newcovariateData$mapping)) - return(result) - -} - - -MapCovariates <- function(covariateData,population, mapping=NULL){ - - # to remove check notes - #covariateId <- oldCovariateId <- newCovariateId <- NULL - ParallelLogger::logInfo(paste0('starting MapCovariates')) - - newCovariateData <- Andromeda::andromeda(covariateRef = covariateData$covariateRef, - analysisRef = covariateData$analysisRef) - - # restrict to population for speed - ParallelLogger::logTrace('restricting to population for speed and mapping') - if(is.null(mapping)){ - metaData <- attr(covariateData, 'metaData') - deletedCovariates <- c(metaData$deletedRedundantCovariateIds, metaData$deletedInfrequentCovariateIds) - mapping <- data.frame(oldCovariateId = as.data.frame(covariateData$covariateRef %>% - dplyr::filter(!(.data$covariateId %in% deletedCovariates)) %>% - dplyr::select(.data$covariateId)), - newCovariateId = 1:nrow(as.data.frame(covariateData$covariateRef %>% - filter(!(covariateId %in% deletedCovariates))))) - } - if(sum(colnames(mapping)%in%c('oldCovariateId','newCovariateId'))!=2){ - colnames(mapping) <- c('oldCovariateId','newCovariateId') - } - covariateData$mapping <- mapping - covariateData$population <- data.frame(rowId = population[,'rowId']) - # assign new ids : - newCovariateData$covariates <- covariateData$covariates %>% - dplyr::inner_join(covariateData$population) %>% - dplyr::rename(oldCovariateId = .data$covariateId) %>% - dplyr::inner_join(covariateData$mapping) %>% - dplyr::select(- .data$oldCovariateId) %>% - dplyr::rename(covariateId = .data$newCovariateId) - covariateData$population <- NULL - covariateData$mapping <- NULL - - newCovariateData$mapping <- mapping - - ParallelLogger::logInfo(paste0('finished MapCovariates')) - - return(newCovariateData) -} - diff --git a/R/RNNTorch.R b/R/RNNTorch.R deleted file mode 100644 index ad86b0e..0000000 --- a/R/RNNTorch.R +++ /dev/null @@ -1,173 +0,0 @@ -# @file RNNTorch.R -# -# Copyright 2020 Observational Health Data Sciences and Informatics -# -# This file is part of PatientLevelPrediction -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
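
The deleted Formatting.R above boils down to two steps: MapCovariates() remaps arbitrary covariateIds to consecutive column indices, and toSparseMDeep() passes the resulting rowId/columnId/value triplets to Matrix::sparseMatrix(). A self-contained sketch of those two steps with made-up triplets; the data frame and ids below are illustrative only.

library(Matrix)

covariates <- data.frame(
  rowId          = c(1, 1, 2, 3),
  covariateId    = c(1002, 8507001, 1002, 8532001),     # arbitrary, widely spaced ids
  covariateValue = c(1, 1, 1, 0.5)
)

# MapCovariates() idea: give every observed covariateId a consecutive column index
mapping <- data.frame(oldCovariateId = sort(unique(covariates$covariateId)))
mapping$newCovariateId <- seq_len(nrow(mapping))
covariates$columnId <- mapping$newCovariateId[match(covariates$covariateId,
                                                    mapping$oldCovariateId)]

# toSparseMDeep() idea: build the people x covariates matrix from COO triplets
m <- Matrix::sparseMatrix(
  i    = covariates$rowId,
  j    = covariates$columnId,
  x    = covariates$covariateValue,
  dims = c(max(covariates$rowId), nrow(mapping))
)
m                                                        # 3 x 3 sparse matrix

The temporal branch of the removed code repeats this per timeId and binds the resulting slices with slam::abind_simple_sparse_array() to obtain a 3-D people x time x covariates array.
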
- -#' Create setting for RNN model with python -#' @param hidden_size The hidden size -#' @param epochs The number of epochs -#' @param seed A seed for the model -#' @param class_weight The class weight used for imbalanced data: -#' 0: Inverse ratio between positives and negatives -#' -1: Focal loss -#' @param type It can be normal 'RNN', 'BiRNN' (bidirectional RNN) and 'GRU' -#' -#' @examples -#' \dontrun{ -#' model.rnnTorch <- setRNNTorch() -#' } -#' @export -setRNNTorch <- function(hidden_size=c(50, 100), epochs=c(20, 50), seed=0, class_weight = 0, type = 'RNN'){ - # check inputs - - # add warning about dense data.. - - - # set seed - if(is.null(seed[1])){ - seed <- as.integer(sample(100000000,1)) - } - - result <- list(model='fitRNNTorch', param=split(expand.grid(hidden_size=hidden_size, - epochs=epochs, seed=seed[1], - class_weight = class_weight, type = type), - 1:(length(hidden_size)*length(epochs)) ), - name='RNN Torch') - - class(result) <- 'modelSettings' - - return(result) -} - -fitRNNTorch <- function(population, plpData, param, search='grid', quiet=F, - outcomeId, cohortId, ...){ - - # check plpData is libsvm format or convert if needed - if (!FeatureExtraction::isCovariateData(plpData$covariateData)) - stop("Needs correct covariateData") - - if(colnames(population)[ncol(population)]!='indexes'){ - warning('indexes column not present as last column - setting all index to 1') - population$indexes <- rep(1, nrow(population)) - } - - start <- Sys.time() - - population$rowIdPython <- population$rowId-1 #to account for python/r index difference #subjectId - pPopulation <- as.matrix(population[,c('rowIdPython','outcomeCount','indexes')]) - - result <- toSparseTorchPython(plpData,population, map=NULL, temporal=T) - - outLoc <- createTempModelLoc() - # clear the existing model pickles - for(file in dir(outLoc)) - file.remove(file.path(outLoc,file)) - - # do cross validation to find hyperParameter - hyperParamSel <- lapply(param, function(x) do.call(trainRNNTorch, listAppend(x, - list(plpData = result$data, - population = pPopulation, - train=TRUE, - modelOutput=outLoc)) )) - - hyperSummary <- cbind(do.call(rbind, param), unlist(hyperParamSel)) - - #now train the final model and return coef - bestInd <- which.max(abs(unlist(hyperParamSel)-0.5))[1] - finalModel <- do.call(trainRNNTorch, listAppend(param[[bestInd]], - list(plpData = result$data, - population = pPopulation, - train=FALSE, - modelOutput=outLoc))) - - covariateRef <- as.data.frame(plpData$covariateData$covariateRef) - incs <- rep(1, nrow(covariateRef)) - covariateRef$included <- incs - covariateRef$covariateValue <- rep(0, nrow(covariateRef)) - - modelTrained <- file.path(outLoc) - param.best <- param[[bestInd]] - - comp <- start-Sys.time() - - # prediction on train set: - pred <- finalModel - pred[,1] <- pred[,1] + 1 # adding one to convert from python to r indexes - colnames(pred) <- c('rowId','outcomeCount','indexes', 'value') - pred <- as.data.frame(pred) - attr(pred, "metaData") <- list(predictionType="binary") - - pred$value <- 1-pred$value - prediction <- merge(population, pred[,c('rowId','value')], by='rowId') - - # return model location - result <- list(model = modelTrained, - trainCVAuc = -1, # ToDo decide on how to deal with this - hyperParamSearch = hyperSummary, - modelSettings = list(model='fitRNNTorch',modelParameters=param.best), - metaData = plpData$metaData, - populationSettings = attr(population, 'metaData'), - outcomeId=outcomeId, - cohortId=cohortId, - varImp = covariateRef, - trainingTime =comp, - 
dense=1, - covariateMap=result$map, # I think this is need for new data to map the same? - predictionTrain = prediction - ) - class(result) <- 'plpModel' - attr(result, 'type') <- 'pythonReticulate' - attr(result, 'predictionType') <- 'binary' - - return(result) -} - - -trainRNNTorch <- function(plpData, population, epochs=50, hidden_size = 100, seed=0, class_weight= 0, type = 'RNN', train=TRUE, modelOutput, quiet=F){ - - train_deeptorch <- function(){return(NULL)} - python_dir <- system.file(package='PatientLevelPrediction','python') - - e <- environment() - reticulate::source_python(system.file(package='PatientLevelPrediction','python','deepTorchFunctions.py'), envir = e) - - result <- train_deeptorch(population = population, - train = train, - plpData = plpData, - model_type = as.character(type), - epochs = as.integer(epochs), - hidden_size = as.integer(hidden_size), - class_weight = class_weight, - seed = seed, - quiet = quiet, - modelOutput = modelOutput) - - if(train){ - # then get the prediction - pred <- result - colnames(pred) <- c('rowId','outcomeCount','indexes', 'value') - pred <- as.data.frame(pred) - attr(pred, "metaData") <- list(predictionType="binary") - - pred$value <- 1-pred$value - auc <- computeAuc(pred) - writeLines(paste0('Model obtained CV AUC of ', auc)) - return(auc) - } - - return(result) - -} diff --git a/R/helpers.R b/R/helpers.R index 4524635..38ea07c 100644 --- a/R/helpers.R +++ b/R/helpers.R @@ -1,71 +1,4 @@ -rowIdSets <- function(population, - index){ - - if(!is.null(index)){ - testRowIds <- population$rowId[population$indexes==index] - trainRowIds <- population$rowId[population$indexes!=index] - - valN <- min(10000,length(trainRowIds)*0.05) - valSamp <- sample(1:length(trainRowIds), valN, replace=FALSE) - earlyStopRowIds <- trainRowIds[valSamp] - trainRowIds <- trainRowIds[-valSamp] - - datas <- list(testRowIds = sort(testRowIds), - trainRowIds = sort(trainRowIds), - earlyStopRowIds = sort(earlyStopRowIds) - ) - }else{ - trainRowIds <- population$rowId - - valN <- min(10000,length(trainRowIds)*0.05) - valSamp <- sample(1:length(trainRowIds), valN, replace=FALSE) - earlyStopRowIds <- trainRowIds[valSamp] - trainRowIds <- trainRowIds[-valSamp] - - datas <- list(trainRowIds = sort(trainRowIds), - earlyStopRowIds = sort(earlyStopRowIds) - ) - - } - - return(datas) -} - -convertToTorchData <- function(data, population, rowIds){ - x <- torch::torch_tensor(as.matrix(data[rowIds,]), dtype = torch::torch_float()) - - #one-hot encoding - y <- population$outcomeCount[population$rowId%in%rowIds] - y[y>0] <- 1 - y <- torch::torch_tensor(matrix(y), dtype = torch::torch_float()) - - return(list(x=x, - y=y)) -} - -batchPredict <- function(model, - plpData, - population, - predictRowIds, - batch_size ){ - ParallelLogger::logInfo('Predicting using batch') - maxVal <- length(predictRowIds) - batches <- lapply(1:ceiling(maxVal/batch_size), function(x) ((x-1)*batch_size+1):min((x*batch_size), maxVal)) - - ParallelLogger::logInfo('Pop') - prediction <- population[population$rowId%in%predictRowIds,] - prediction$value <- 0 - - for(batch in batches){ - b <- torch::torch_tensor(as.matrix(plpData[predictRowIds[batch],, drop = F]), dtype = torch::torch_float()) - pred <- model(b) - prediction$value[batch] <- as.array(pred$to())[,1] - } - attr(prediction, "metaData") <- list(predictionType = "binary") - return(prediction) -} - -#' @description converts a sparse Matrix into a list of its columns, +#' converts a sparse Matrix into a list of its columns, #' subsequently vapply can be 
used to apply functions over the list listCols<-function(m){ res<-split(m@x, findInterval(seq_len(length(m@x)), m@p, left.open=TRUE)) diff --git a/R/sparseRTorch.R b/R/sparseRTorch.R deleted file mode 100644 index 890052f..0000000 --- a/R/sparseRTorch.R +++ /dev/null @@ -1,63 +0,0 @@ -#' Convert the plpData in COO format into a sparse Torch tensor -#' -#' @description -#' Converts the standard plpData to a sparse tensor for Torch -#' -#' @details -#' This function converts the covariate file from COO format into a sparse Torch tensor -#' @param plpData An object of type \code{plpData} with covariate in coo format - the patient level prediction -#' data extracted from the CDM. -#' @param population The population to include in the matrix -#' @param map A covariate map (telling us the column number for covariates) -#' @param temporal Whether you want to convert temporal data -#' @examples -#' #TODO -#' -#' @return -#' Returns a list, containing the data as a sparse matrix, the plpData covariateRef -#' and a data.frame named map that tells us what covariate corresponds to each column -#' This object is a list with the following components: \describe{ -#' \item{data}{A sparse matrix with the rows corresponding to each person in the plpData and the columns corresponding to the covariates.} -#' \item{covariateRef}{The plpData covariateRef.} -#' \item{map}{A data.frame containing the data column ids and the corresponding covariateId from covariateRef.} -#' } -#' -#' @export -toSparseRTorch <- function(plpData, population, map=NULL, temporal=T){ - - newCovariateData <- MapCovariates(plpData$covariateData, - population, - mapping=map) - - if(temporal){ - indicesTemporal <- newCovariateData$covariates %>% filter(!is.na(.data$timeId)) %>% mutate(timeId = .data$timeId+1) %>% select(.data$rowId, .data$covariateId, .data$timeId) %>% collect() %>% as.matrix() - valuesTemporal <- newCovariateData$covariates %>% filter(!is.na(.data$timeId)) %>% select(.data$covariateValue) %>% collect() %>% as.matrix() - - indicesTensor <- torch::torch_tensor(indicesTemporal, dtype=torch::torch_long())$t() - valuesTensor <- torch::torch_tensor(valuesTemporal)$squeeze() - - sparseMatrixTemporal <- torch::torch_sparse_coo_tensor(indices=indicesTensor, - values=valuesTensor) - - indicesNonTemporal <- newCovariateData$covariates %>% filter(is.na(.data$timeId)) %>% select(.data$rowId, .data$covariateId) %>% collect() %>% as.matrix() - valuesNonTemporal <- newCovariateData$covariates %>% filter(is.na(.data$timeId)) %>% select(.data$covariateValue) %>% collect() %>% as.matrix() - - } else{ - sparseMatrixTemporal <- NULL - indicesNonTemporal <- newCovariateData$covariates %>% select(.data$rowId, .data$covariateId) %>% collect() %>% as.matrix() - valuesNonTemporal <- newCovariateData$covariates %>% select(.data$covariateValue) %>% collect() %>% as.matrix() - } - - indicesTensor <- torch::torch_tensor(indicesNonTemporal, dtype=torch::torch_long())$t() - valuesTensor <- torch::torch_tensor(valuesNonTemporal)$squeeze() - sparseMatrixNonTemporal <- torch::torch_sparse_coo_tensor(indices=indicesTensor, - values=valuesTensor) - results = list( - data = sparseMatrixNonTemporal, - dataTemporal = sparseMatrixTemporal, - covariateRef=as.data.frame(newCovariateData$covariateRef), - map=as.data.frame(newCovariateData$mapping)) - - return(results) - -} \ No newline at end of file diff --git a/inst/python/TorchMap.py b/inst/python/TorchMap.py deleted file mode 100644 index cf6cf0e..0000000 --- a/inst/python/TorchMap.py +++ /dev/null @@ -1,17 
+0,0 @@ -import torch - - -def map_python(datas, maxCol, maxRow, maxT=None, matrix=None): - if maxT is not None: - indexes = datas[:, 0:3] - 1 - matrixt = torch.sparse.FloatTensor(torch.LongTensor(indexes.T), torch.FloatTensor(datas[:, 3]), - torch.Size([maxRow, maxCol, maxT])) - else: - indexes = datas[:, 0:2] - 1 - matrixt = torch.sparse.FloatTensor(torch.LongTensor(indexes.T), torch.FloatTensor(datas[:, 2]), - torch.Size([maxRow, maxCol])) - if matrix is None: - return matrix - else: - matrix = matrix.add(matrixt) - return matrix diff --git a/inst/python/TorchUtils.py b/inst/python/TorchUtils.py deleted file mode 100755 index ccb21b9..0000000 --- a/inst/python/TorchUtils.py +++ /dev/null @@ -1,695 +0,0 @@ -""" - deepUtils.py - - Copyright 2016 Observational Health Data Sciences and Informatics - - This file is part of PatientLevelPrediction - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -""" - -import sys -import os -#import _pickle as cPickle -import pdb -import random -import numpy as np -import torch -import torch.nn as nn -import torch.nn.functional as F -from torch.autograd import Variable -from torch.utils.data import DataLoader, TensorDataset -from sklearn.externals import joblib -from sklearn.metrics import roc_auc_score -from collections import OrderedDict -output_dir = 'data' - - -class FocalLoss(nn.Module): - """ - Method to handle data imbalance based on paper (arXiv:1708.02002) entitled - Focal loss for dense object detection. - Loss(x, class) = - (1-softmax(x)[class])^gamma \log(softmax(x)[class]) - - """ - - def __init__(self, gamma=5, eps=1e-7, size_average=False): - super(FocalLoss, self).__init__() - self.gamma = gamma - self.eps = eps - self.size_average = size_average - - def forward(self, input, target): - y = self.one_hot(target, input.size(-1)) - logit = F.softmax(input) - logit = logit.clamp(self.eps, 1. - self.eps) - - loss = -1 * y * torch.log(logit) # cross entropy - loss = loss * (1 - logit) ** self.gamma # focal loss - - if self.size_average: - loss = loss.mean() - else: - loss = loss.sum() - return loss - - def one_hot(self, index, classes): - """ - - :param index: is the labels - :param classes: number if classes - :return: - """ - size = index.size() + (classes,) - view = index.size() + (1,) - - mask = torch.Tensor(*size).fill_(0) - index = index.view(*view) - ones = 1. - - if isinstance(index, Variable): - ones = Variable(torch.Tensor(index.size()).fill_(1)) - mask = Variable(mask, volatile=index.volatile) - if torch.cuda.is_available(): - ones = ones.cuda() - mask = mask.cuda() - - return mask.scatter_(1, index, ones) - -def loss_function(recon_x, x, mu, logvar): - """Loss function for varational autoencoder VAE""" - BCE = F.binary_cross_entropy(recon_x, x, size_average=False) - - # 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2) - KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp()) - - return BCE + KLD - - -def mixup_data(x, y, alpha=1.0): - - '''Compute the mixup data. 
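The FocalLoss class deleted above implements the focal loss of arXiv:1708.02002, Loss(x, class) = -(1 - softmax(x)[class])^gamma * log(softmax(x)[class]). As a stand-alone sketch of just that formula (plain PyTorch, hypothetical logits, and a different default gamma than the class uses):

import torch
import torch.nn.functional as F

def focal_loss(logits, target, gamma=2.0, eps=1e-7):
    # -(1 - p_t)^gamma * log(p_t), summed over the batch
    p = F.softmax(logits, dim=-1).clamp(eps, 1.0 - eps)    # class probabilities
    p_t = p.gather(1, target.unsqueeze(1)).squeeze(1)      # probability of the true class
    return (-(1.0 - p_t) ** gamma * torch.log(p_t)).sum()

logits = torch.tensor([[2.0, -1.0], [0.5, 0.3]])
target = torch.tensor([0, 1])
print(focal_loss(logits, target))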
Return mixed inputs, pairs of targets, and lambda - Data Augmentation method based on paper (arXiv:1710.09412) entitled - mixup: Beyond empirical risk minimization. - ''' - if alpha > 0.: - lam = np.random.beta(alpha, alpha) - else: - lam = 1. - batch_size = x.size()[0] - if torch.cuda.is_available(): - index = torch.randperm(batch_size).cuda() - else: - index = torch.randperm(batch_size) - - mixed_x = lam * x + (1 - lam) * x[index,:] - y_a, y_b = y, y[index] - return mixed_x, y_a, y_b, lam - -def mixup_criterion(y_a, y_b, lam): - return lambda criterion, pred: lam * criterion(pred, y_a) + (1 - lam) * criterion(pred, y_b) - -def early_stop(metrics_hist, patience = 3): - if not np.all(np.isnan(metrics_hist)): - return np.nanargmin(metrics_hist) > len(metrics_hist) - patience - else: - #keep training if criterion results have all been nan so far - return False - - -class Estimator(object): - """ - It is used for training different deep models in the same interface. - """ - - def __init__(self, model): - self.model = model - - def compile(self, optimizer, loss): - self.optimizer = optimizer - self.loss_f = loss - - def _fit(self, train_loader, l1regularization=False, autoencoder=False, mixup=False, vae = False): - """ - train one epoch - - :param train_loader: The data loaded using DataLoader - :param l1regularization: default False - :return: the return fitted loss and accuracy - """ - loss_list = [] - acc_list = [] - for idx, (X, y) in enumerate(train_loader): - X_v = Variable(X) - y_v = Variable(y) - if torch.cuda.is_available(): - X_v = X_v.cuda() - y_v = y_v.cuda() - - if mixup: - X_v, y_v_a, y_v_b, lam = mixup_data(X_v, y_v) - X_v, y_v_a, y_v_b = Variable(X_v), Variable(y_v_a), Variable(y_v_b) - - # print 'GPU id', torch.cuda.current_device() - self.optimizer.zero_grad() - # the below comemnted lines are used for multiple GPU training - # if torch.cuda.device_count() > 1: - # net = torch.nn.DataParallel(self.model, device_ids = range(torch.cuda.device_count())) - # if cuda: - # net = net.cuda() - # y_pred = net(X_v) - if autoencoder: - if vae: - y_pred, mu, logvar = self.model(X_v) - loss = loss_function(y_pred, X_v, mu, logvar) - else: - y_pred = self.model(X_v) - loss = self.loss_f(y_pred, X_v) - else: - y_pred = self.model(X_v) - - loss = self.loss_f(y_pred, y_v) - - if mixup: - loss_func = mixup_criterion(y_v_a, y_v_b, lam) - loss = loss_func(self.loss_f, y_pred) - - if l1regularization: - l1_crit = nn.L1Loss(size_average=False) - reg_loss = 0 - for param in self.model.parameters(): - target = Variable(torch.from_numpy(np.zeros(param.size()).astype(np.float32))) - if torch.cuda.is_available(): - target = target.cuda() - reg_loss += l1_crit(param, target) - - factor = 0.0005 - loss += factor * reg_loss - - loss.backward() - self.optimizer.step() - loss_list.append(loss.item()) - if autoencoder: - acc_list.append(0) - else: - classes = torch.topk(y_pred, 1)[1].data.cpu().numpy().flatten() - acc = self._accuracy(classes, y_v.data.cpu().numpy().flatten()) - acc_list.append(acc) - del loss - del y_pred - - return sum(loss_list) / len(loss_list), sum(acc_list) / len(acc_list) - - def fit(self, X, y, batch_size=32, nb_epoch=10, validation_data=(), l1regularization=False, autoencoder =False, vae = False): - train_set = TensorDataset(torch.from_numpy(X.astype(np.float32)), - torch.from_numpy(y.astype(np.float32)).long().view(-1)) - train_loader = DataLoader(dataset=train_set, batch_size=batch_size, shuffle=True) - self.model.train() - for t in range(nb_epoch): - loss, acc = 
self._fit(train_loader, l1regularization=l1regularization, autoencoder = autoencoder, vae = vae) - #print loss - val_log = '' - if validation_data and not autoencoder: - val_loss, auc = self.evaluate(validation_data[0], validation_data[1], batch_size) - - val_log = "- val_loss: %06.4f - auc: %6.4f" % (val_loss, auc) - print(val_log) - # print("Epoch %s/%s loss: %06.4f - acc: %06.4f %s" % (t, nb_epoch, loss, acc, val_log)) - - def evaluate(self, X, y, batch_size=32): - y_pred = self.predict(X) - y_v = Variable(torch.from_numpy(y).long(), requires_grad=False) - if torch.cuda.is_available(): - y_v = y_v.cuda() - loss = self.loss_f(y_pred, y_v) - predict = y_pred.data.cpu().numpy()[:, 1].flatten() - auc = roc_auc_score(y, predict) - return loss.item(), auc - - def _accuracy(self, y_pred, y): - return float(sum(y_pred == y)) / y.shape[0] - - def predict(self, X): - X = Variable(torch.from_numpy(X.astype(np.float32))) - if torch.cuda.is_available(): - X = X.cuda() - y_pred = self.model(X) - return y_pred - - def predict_proba(self, X): - self.model.eval() - return self.model.predict_proba(X) - -class EarlyStopping(object): # pylint: disable=R0902 - """ - Gives a criterion to stop training when a given metric is not - improving anymore - Args: - mode (str): One of `min`, `max`. In `min` mode, training will - be stopped when the quantity monitored has stopped - decreasing; in `max` mode it will be stopped when the - quantity monitored has stopped increasing. Default: 'min'. - patience (int): Number of epochs with no improvement after - which training is stopped. For example, if - `patience = 2`, then we will ignore the first 2 epochs - with no improvement, and will only stop learning after the - 3rd epoch if the loss still hasn't improved then. - Default: 10. - threshold (float): Threshold for measuring the new optimum, - to only focus on significant changes. Default: 1e-4. - threshold_mode (str): One of `rel`, `abs`. In `rel` mode, - dynamic_threshold = best * ( 1 + threshold ) in 'max' - mode or best * ( 1 - threshold ) in `min` mode. - In `abs` mode, dynamic_threshold = best + threshold in - `max` mode or best - threshold in `min` mode. Default: 'rel'. - """ - - def __init__(self, mode='min', patience=3, threshold=1e-4, threshold_mode='rel'): - self.patience = patience - self.mode = mode - self.threshold = threshold - self.threshold_mode = threshold_mode - self.best = None - self.num_bad_epochs = None - self.mode_worse = None # the worse value for the chosen mode - self.is_better = None - self.last_epoch = -1 - self._init_is_better(mode=mode, threshold=threshold, - threshold_mode=threshold_mode) - self._reset() - - def _reset(self): - """Resets num_bad_epochs counter and cooldown counter.""" - self.best = self.mode_worse - self.num_bad_epochs = 0 - - def step(self, metrics, epoch=None): - """ Updates early stopping state """ - current = metrics - if epoch is None: - epoch = self.last_epoch = self.last_epoch + 1 - self.last_epoch = epoch - - if self.is_better(current, self.best): - self.best = current - self.num_bad_epochs = 0 - else: - self.num_bad_epochs += 1 - - @property - def stop(self): - """ Should we stop learning? """ - return self.num_bad_epochs > self.patience - - def _cmp(self, mode, threshold_mode, threshold, a, best): # pylint: disable=R0913, R0201 - if mode == 'min' and threshold_mode == 'rel': - rel_epsilon = 1. 
- threshold - return a < best * rel_epsilon - - elif mode == 'min' and threshold_mode == 'abs': - return a < best - threshold - - elif mode == 'max' and threshold_mode == 'rel': - rel_epsilon = threshold + 1. - return a > best * rel_epsilon - - return a > best + threshold - - def _init_is_better(self, mode, threshold, threshold_mode): - if mode not in {'min', 'max'}: - raise ValueError('mode ' + mode + ' is unknown!') - if threshold_mode not in {'rel', 'abs'}: - raise ValueError('threshold mode ' + threshold_mode + ' is unknown!') - - if mode == 'min': - self.mode_worse = float('inf') - else: # mode == 'max': - self.mode_worse = (-float('inf')) - - self.is_better = partial(self._cmp, mode, threshold_mode, threshold) - - def state_dict(self): - """ Returns early stopping state """ - return {key: value for key, value in self.__dict__.items() if key != 'is_better'} - - def load_state_dict(self, state_dict): - """ Loads early stopping state """ - self.__dict__.update(state_dict) - self._init_is_better(mode=self.mode, threshold=self.threshold, - threshold_mode=self.threshold_mode) - -def adjust_learning_rate(learning_rate, optimizer, epoch): - """Sets the learning rate to the initial LR decayed by 10 every 30 epochs""" - - lr = learning_rate * (0.1 ** (epoch // 10)) - - for param_group in optimizer.param_groups: - param_group['lr'] = lr - return lr - -def batch(tensor, batch_size = 50): - """ It is used to create batch samples, each batch has batch_size samples""" - tensor_list = [] - length = tensor.shape[0] - i = 0 - while True: - if (i+1) * batch_size >= length: - tensor_list.append(tensor[i * batch_size: length]) - return tensor_list - tensor_list.append(tensor[i * batch_size: (i+1) * batch_size]) - i += 1 - - -class selu(nn.Module): - def __init__(self): - super(selu, self).__init__() - self.alpha = 1.6732632423543772848170429916717 - self.scale = 1.0507009873554804934193349852946 - - def forward(self, x): - temp1 = self.scale * F.relu(x) - temp2 = self.scale * self.alpha * (F.elu(-1 * F.relu(-1 * x))) - return temp1 + temp2 - - -class alpha_drop(nn.Module): - def __init__(self, p=0.05, alpha=-1.7580993408473766, fixedPointMean=0, fixedPointVar=1): - super(alpha_drop, self).__init__() - keep_prob = 1 - p - self.a = np.sqrt( - fixedPointVar / (keep_prob * ((1 - keep_prob) * pow(alpha - fixedPointMean, 2) + fixedPointVar))) - self.b = fixedPointMean - self.a * (keep_prob * fixedPointMean + (1 - keep_prob) * alpha) - self.alpha = alpha - self.keep_prob = 1 - p - self.drop_prob = p - - def forward(self, x): - if self.keep_prob == 1 or not self.training: - # print("testing mode, direct return") - return x - else: - random_tensor = self.keep_prob + torch.rand(x.size()) - - binary_tensor = Variable(torch.floor(random_tensor)) - - if torch.cuda.is_available(): - binary_tensor = binary_tensor.cuda() - - x = x.mul(binary_tensor) - ret = x + self.alpha * (1 - binary_tensor) - ret.mul_(self.a).add_(self.b) - return ret - -def convert_to_3d_matrix(covariate_ids, patient_dict, y_dict = None, timeid_len = 31, cov_mean_dict = None): - """ - create matrix for temporal models. 
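The EarlyStopping class deleted above stops training once the monitored quantity has failed to improve for more than `patience` epochs, where improvement in the default 'min'/'rel' mode means new < best * (1 - threshold). A toy, stand-alone rendering of that rule (not the class itself; the loss values are made up):

def should_stop(losses, patience=3, threshold=1e-4):
    # returns True once more than `patience` epochs in a row fail to beat the running best
    best = float("inf")
    bad_epochs = 0
    for loss in losses:
        if loss < best * (1.0 - threshold):   # 'min' mode, relative threshold
            best = loss
            bad_epochs = 0
        else:
            bad_epochs += 1
        if bad_epochs > patience:
            return True
    return False

print(should_stop([0.9, 0.7, 0.7, 0.7, 0.7, 0.7]))  # True: four epochs without improving on 0.7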
- - :param covariate_ids: the covariate ids in the whole data - :param patient_dict: the dictionary contains the data for each patient - :param y_dict: if the output labels is known, it contains the labels for patients - :param timeid_len: the total number time window gaps when extracting temporal data - :return: return the raw data in 3-D format, patients x covariates x number of windows, and the patients ids - """ - D = len(covariate_ids) - N = len(patient_dict) - T = timeid_len - concept_list =list(covariate_ids) - concept_list.sort() - x_raw = np.zeros((N, D, T), dtype=float) - patient_ind = 0 - p_ids = [] - patient_keys = patient_dict.keys() - #print covariate_ids - for kk in patient_keys: - #print('-------------------') - vals = patient_dict[kk] - #sorted(vals) - p_ids.append(int(kk)) - for timeid, meas in vals.iteritems(): - int_time = int(timeid) - 1 - for val in meas: - if not len(val): - continue - cov_id, cov_val = val - if cov_id not in covariate_ids: - continue - lab_ind = concept_list.index(cov_id) - if cov_mean_dict is None: - x_raw[patient_ind][lab_ind][int_time] = float(cov_val) - else: - mean_std = cov_mean_dict[cov_id] - if mean_std[1]: - x_raw[patient_ind][lab_ind][int_time] = (float(cov_val) - mean_std[0])/mean_std[1] - else: - x_raw[patient_ind][lab_ind][int_time] = float(cov_val) - - - patient_ind = patient_ind + 1 - - #impute the data using the value of previous timestamp - #fw = open('patient_var.txt', 'w') - for i in xrange(N): - for j in xrange(D): - temp = x_raw[i][j] - nonzero_inds = np.nonzero(temp)[0] - count_nonzeros = len(nonzero_inds) - #fw.write(str(i) + '\t' + str(count_nonzeros) + '\n') - if count_nonzeros == 1: - ind = nonzero_inds[0] - for k in xrange(ind + 1, T): - x_raw[i][j][k] = x_raw[i][j][ind] - elif count_nonzeros > 1: - for ind in xrange(1, count_nonzeros): - for k in xrange(nonzero_inds[ind -1] + 1, nonzero_inds[ind]): - x_raw[i][j][k] = x_raw[i][j][nonzero_inds[ind - 1]] - # For last nonzeros. - for k in xrange(nonzero_inds[-1] + 1, T): - x_raw[i][j][k] = x_raw[i][j][nonzero_inds[-1]] - - #fw.close() - - return x_raw, patient_keys - -def forward_impute_missing_value(x_raw): - N = x_raw.shape[0] - D = x_raw.shape[1] - T = x_raw.shape[2] - for i in xrange(N): - for j in xrange(D): - temp = x_raw[i][j] - nonzero_inds = np.nonzero(temp)[0] - count_nonzeros = len(nonzero_inds) - #fw.write(str(i) + '\t' + str(count_nonzeros) + '\n') - if count_nonzeros == 1: - ind = nonzero_inds[0] - for k in xrange(ind + 1, T): - x_raw[i][j][k] = x_raw[i][j][ind] - elif count_nonzeros > 1: - for ind in xrange(1, count_nonzeros): - for k in xrange(nonzero_inds[ind -1] + 1, nonzero_inds[ind]): - x_raw[i][j][k] = x_raw[i][j][nonzero_inds[ind - 1]] - # For last nonzeros. 
- for k in xrange(nonzero_inds[-1] + 1, T): - x_raw[i][j][k] = x_raw[i][j][nonzero_inds[-1]] - - -def convert_to_temporal_format(covariates, timeid_len= 31, normalize = True, predict = False): - """ - It reads the data from covariates extracted by FeatureExtraction package and convert it to temporal data matrix - - :param covariates: covariates extracted by FeatureExtraction package - :param timeid_len: the total number of window gaps when extracting temporal data - :return: return the raw data in 3-D format, patients x covariates x number of windows, and the patients ids - """ - patient_dict = OrderedDict() - print('Loading temporal data') - cov_vals_dict = {} - for row in covariates: - p_id, cov_id, time_id, cov_val = row[0], row[1], row[2], row[3] - cov_id = np.int64(cov_id) - #time_id = int(time_id) - cov_vals_dict.setdefault(cov_id, []).append(float(cov_val)) - if p_id not in patient_dict: - patient_dict[p_id] = {time_id: [(cov_id, cov_val)]} - else: - if time_id not in patient_dict[p_id]: - patient_dict[p_id][time_id] = [(cov_id, cov_val)] - else: - patient_dict[p_id][time_id].append((cov_id, cov_val)) - #covariate_ids.add(cov_id) - #T = 365/time_window - covariate_ids = set() - cov_mean_dict = {} - if not predict: - fw = open('covariate_mean_std.csv', 'w') - for key, val in cov_vals_dict.iteritems(): - mean_val = np.mean(val) - std_val = np.std(val) - - # Remove those covariates with few occurrence (<5) - if len(val) >= 5: - covariate_ids.add(key) - cov_mean_dict[key] = (mean_val, std_val) - fw.write(str(key) + ',' + str(mean_val) + ',' + str(std_val) + '\n') - fw.close() - else: - fp = open('covariate_mean_std.csv', 'r') - for line in fp: - values = line.rstrip().split(',') - key = np.int64(values[0]) - covariate_ids.add(key) - cov_mean_dict[key] = (float(values[1]), float(values[2])) - fp.close() - - - if normalize: - x, patient_keys = convert_to_3d_matrix(covariate_ids, patient_dict, timeid_len = timeid_len, cov_mean_dict = cov_mean_dict) - else: - x, patient_keys = convert_to_3d_matrix(covariate_ids, patient_dict, timeid_len=timeid_len) - - return x, patient_keys - - -def read_covariates(covariate_file): - patient_dict = {} - head = True - with open(covariate_file, 'r') as fp: - for line in fp: - if head: - head = False - continue - values = line.rstrip().split(',') - patient_id = values[1] - cov_id = values[2] - #time_id = int(values[-1]) - # covariates in one patient has time order - patient_dict.setdefault(patient_id, []).append((cov_id)) - new_patient = [] - for key in patient_dict.keys(): - #patient_dict[key].sort() - sort_vals = [] - for val in patient_dict[key]: - if val[1] not in sort_vals: - sort_vals.append(val) - new_patient.append(sort_vals) - - return new_patient - - -def word_embeddings(covariate_file, embedding_size=50): - import gensim.models.word2vec as w2v - modelname = "processed_%s.w2v" % ('heartfailure') - sentences = read_covariates(covariate_file) - model = w2v.Word2Vec(size=embedding_size, min_count=3, workers=4, iter=10, sg=1) - print("building word2vec vocab on %s..." 
% (covariate_file)) - - model.build_vocab(sentences) - print("training...") - model.train(sentences, total_examples=model.corpus_count, epochs=model.iter) - out_file = modelname - print("writing embeddings to %s" % (out_file)) - model.save(out_file) - return out_file - -def read_data(filename): - covariate_ids = set() - patient_dict = {} - head = True - with open(filename) as fp: - for lines in fp: - if head: - head = False - continue - lines = lines.strip('\n').strip('\r').split(',') - try: - p_id, cov_id, time_id, cov_val = lines[1], lines[2], lines[3], lines[4] - except: - pdb.set_trace() - print(p_id, cov_id, time_id) - if p_id not in patient_dict: - patient_dict[p_id] = {} - else: - if time_id not in patient_dict[p_id]: - patient_dict[p_id][time_id] = [] - else: - patient_dict[p_id][time_id].append((cov_id, cov_val)) - covariate_ids.add(cov_id) - patient_dict = {k: v for k, v in patient_dict.iteritems() if v} #remove empty patients - #(15, 2000, 60) 20000 patients, 15 lab tests, - return covariate_ids, patient_dict - -def split_training_validation(classes, validation_size=0.2, shuffle=False): - """split sampels based on balnace classes""" - num_samples = len(classes) - classes = np.array(classes) - classes_unique = np.unique(classes) - num_classes = len(classes_unique) - indices = np.arange(num_samples) - # indices_folds=np.zeros([num_samples],dtype=int) - training_indice = [] - training_label = [] - validation_indice = [] - validation_label = [] - for cl in classes_unique: - indices_cl = indices[classes == cl] - num_samples_cl = len(indices_cl) - - # split this class into k parts - if shuffle: - random.shuffle(indices_cl) # in-place shuffle - - # module and residual - num_samples_each_split = int(num_samples_cl * validation_size) - res = num_samples_cl - num_samples_each_split - - training_indice = training_indice + [val for val in indices_cl[num_samples_each_split:]] - training_label = training_label + [cl] * res - - validation_indice = validation_indice + [val for val in indices_cl[:num_samples_each_split]] - validation_label = validation_label + [cl] * num_samples_each_split - - training_index = np.arange(len(training_label)) - random.shuffle(training_index) - training_indice = np.array(training_indice)[training_index] - training_label = np.array(training_label)[training_index] - - validation_index = np.arange(len(validation_label)) - random.shuffle(validation_index) - validation_indice = np.array(validation_indice)[validation_index] - validation_label = np.array(validation_label)[validation_index] - - return training_indice, training_label, validation_indice, validation_label - - -if __name__ == "__main__": - x_raw = np.array([[1, 1, 0], [0,1,0]]) - x = [] - x.append(x_raw) - x.append(x_raw) - x = np.array(x) - #pdb.set_trace() - forward_impute_missing_value(x) - #filename = sys.argv[1] - #word_embeddings(filename) - ''' - population = joblib.load('/data/share/plp/SYNPUF/population.pkl') - # y = population[:, 1] - covriate_ids, patient_dict = read_data(filename) - # y_ids = np.array([int(val) for val in patient_dict.keys()]) - # Y = [] - y_dict = dict(zip(population[:, 0], population[:, 1])) - #x, patient_keys = convert_2_cnn_format(covariates, timeid_len = 31) - # for val in y_ids: - # Y.append(y_dict[y_ids]) - x_train, x_valid, x_test, Y_train, Y_valid, Y_test = convert_to_temporal_format(covriate_ids, patient_dict, y_dict) - ''' diff --git a/inst/python/deepTorch.py b/inst/python/deepTorch.py deleted file mode 100644 index 7425413..0000000 --- a/inst/python/deepTorch.py +++ 
/dev/null @@ -1,1186 +0,0 @@ -""" - deepTorch.py: It implements different deep learning classifiers - - Copyright 2016 Observational Health Data Sciences and Informatics - - This file is part of PatientLevelPrediction - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -""" - -import sys -import os -import pdb -import torch -import torch.nn as nn -import torch.nn.functional as F -from torch.autograd import Variable -from torch.utils.data import DataLoader, TensorDataset -from torch.nn.utils.rnn import pack_padded_sequence -from torch.nn.utils.rnn import pad_packed_sequence -from collections import OrderedDict -import timeit -import joblib -from sklearn.model_selection import train_test_split -from sklearn.metrics import roc_auc_score -import numpy as np - -import warnings -warnings.filterwarnings("ignore") - -if "python_dir" in globals(): - #print python_dir - sys.path.insert(0, python_dir) - -import TorchUtils as tu - - -class LogisticRegression(nn.Module): - """ - Train a logistic regression model using pytorch - """ - def __init__(self, input_size, num_classes = 2): - super(LogisticRegression, self).__init__() - self.linear = nn.Linear(input_size, num_classes) - - def forward(self, x): - out = self.linear(x) - out = torch.sigmoid(out) - return out - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -class MLP(nn.Module): - """ - Train a multiple-layer perceptron with one hideen layer - """ - def __init__(self, input_dim, hidden_size, num_classes = 2): - super(MLP, self).__init__() - self.fc1 = nn.Linear(input_dim, hidden_size) - self.fc2 = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - x = F.relu(self.fc1(x)) - x = F.dropout(x, p =0.5, training=self.training) - x = self.fc2(x) - x = torch.sigmoid(x) - return x - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -class SNN(nn.Module): - """ - Train a multiple-layer self normalizing neural network, ref arXiv:1706.02515 - """ - - def __init__(self, input_dim, hidden_size, num_classes=2): - super(SNN, self).__init__() - self.fc1 = nn.Linear(input_dim, hidden_size) - self.fc2 = tu.selu() - self.ad1 = tu.alpha_drop() - self.fc4 = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - x = F.relu(self.fc1(x)) - x = F.dropout(x, p=0.5, training=self.training) - x = self.fc2(x) - x = self.ad1(x) - x = self.fc4(x) - x = torch.sigmoid(x) - return x - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - -class 
AutoEncoder(nn.Module): - """ - A stacked autoencoder with 2 hiddden layers and need be adapted for EHR data. - """ - def __init__(self, input_size, encoding_size): - super(AutoEncoder, self).__init__() - - self.encoder = nn.Sequential( - nn.Linear(input_size, input_size/2), - nn.ReLU(True), - nn.Linear(input_size/2, input_size/4), - nn.ReLU(True), - nn.Linear(input_size/4, encoding_size), - nn.ReLU(True) - ) - self.decoder = nn.Sequential( - nn.Linear(encoding_size, input_size/4), - nn.ReLU(True), - nn.Linear(input_size/4, input_size/2), - nn.ReLU(True), - nn.Linear(input_size/2, input_size) - ) - - def forward(self, x): - if torch.cuda.is_available(): - x = x.cuda() - encoded = self.encoder(x) - decoded = self.decoder(encoded) - return decoded - - def get_encode_features(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - encoded = self.encoder(x) - encoded = encoded.data.cpu().numpy() - return encoded - -class VAE(nn.Module): - """ - A stacked variational autoencoder with 2 hiddden layers and need be adapted for EHR data. - """ - def __init__(self, input_size, encoding_size): - super(VAE, self).__init__() - - self.fc1 = nn.Linear(input_size, input_size/2) - self.fc21 = nn.Linear(input_size/2, encoding_size) - self.fc22 = nn.Linear(input_size/2, encoding_size) - self.fc3 = nn.Linear(encoding_size, input_size/2) - self.fc4 = nn.Linear(input_size/2, input_size) - - self.relu = nn.ReLU() - self.sigmoid = nn.Sigmoid() - - def encode(self, x): - h1 = self.relu(self.fc1(x)) - return self.fc21(h1), self.fc22(h1) - - def reparameterize(self, mu, logvar): - if self.training: - std = logvar.mul(0.5).exp_() - eps = Variable(std.data.new(std.size()).normal_()) - return eps.mul(std).add_(mu) - else: - return mu - - def decode(self, z): - h3 = self.relu(self.fc3(z)) - return self.sigmoid(self.fc4(h3)) - - def forward(self, x): - if torch.cuda.is_available(): - x = x.cuda() - mu, logvar = self.encode(x) - z = self.reparameterize(mu, logvar) - return self.decode(z), mu, logvar - - def get_encode_features(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - mu, logvar = self.encode(x) - encoded = self.reparameterize(mu, logvar) - encoded = encoded.data.cpu().numpy() - return encoded - -class Decoder(nn.Module): - """ VAE decoder input_size = original inputsize/16*256""" - def __init__(self, latent_size, input_size, img_channels = 1, kernel_size=(1, 4), stride=(1, 2), padding=(0, 1)): - super(Decoder, self).__init__() - self.latent_size = latent_size - self.img_channels = img_channels - - self.fc1 = nn.Linear(latent_size, input_size) - self.deconv1 = nn.ConvTranspose2d(input_size, 128, kernel_size, stride=stride, padding = padding) - self.deconv2 = nn.ConvTranspose2d(128, 64, kernel_size, stride=stride, padding = padding) - self.deconv3 = nn.ConvTranspose2d(64, 32, kernel_size, stride=stride, padding = padding) - self.deconv4 = nn.ConvTranspose2d(32, img_channels, kernel_size, stride=stride, padding = padding) - - def forward(self, x): # pylint: disable=arguments-differ - x = F.relu(self.fc1(x)) - x = x.unsqueeze(-1).unsqueeze(-1) - x = F.relu(self.deconv1(x)) - x = F.relu(self.deconv2(x)) - x = F.relu(self.deconv3(x)) - reconstruction = torch.sigmoid(self.deconv4(x)) - return reconstruction - -class Encoder(nn.Module): # pylint: disable=too-many-instance-attributes 
- """ VAE encoder """ - def __init__(self, latent_size, input_size, img_channels = 1, kernel_size=(1, 4), stride=(1, 2), padding=(0, 1)): - super(Encoder, self).__init__() - self.latent_size = latent_size - #self.img_size = img_size - self.img_channels = img_channels - - self.conv1 = nn.Conv2d(img_channels, 32, kernel_size, stride=stride, padding = padding) - self.conv2 = nn.Conv2d(32, 64, kernel_size, stride=stride, padding = padding) - self.conv3 = nn.Conv2d(64, 128, kernel_size, stride=stride, padding = padding) - self.conv4 = nn.Conv2d(128, 256, kernel_size, stride=stride, padding = padding) - out_size = input_size / 16 - self.fc_mu = nn.Linear(out_size, latent_size) - self.fc_logsigma = nn.Linear(out_size, latent_size) - - - def forward(self, x): # pylint: disable=arguments-differ - x = F.relu(self.conv1(x)) - x = F.relu(self.conv2(x)) - x = F.relu(self.conv3(x)) - x = F.relu(self.conv4(x)) - x = x.view(x.size(0), -1) - - mu = self.fc_mu(x) - logsigma = self.fc_logsigma(x) - - return mu, logsigma - -class VAE_CNN(nn.Module): - """ Variational Autoencoder """ - def __init__(self, latent_size, input_size): - super(VAE, self).__init__() - self.encoder = Encoder(latent_size, input_size) - input_size = input_size/16 - self.decoder = Decoder(latent_size, input_size) - - def forward(self, x): # pylint: disable=arguments-differ - mu, logsigma = self.encoder(x) - sigma = logsigma.exp() - eps = torch.randn_like(sigma) - z = eps.mul(sigma).add_(mu) - - recon_x = self.decoder(z) - return recon_x, mu, logsigma - - def get_encode_features(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - mu, logvar = self.encoder(x) - encoded = mu .data.cpu().numpy() - return encoded - -class CNN(nn.Module): - def __init__(self, nb_filter, num_classes = 2, kernel_size = (1, 5), pool_size = (1, 3), labcounts = 32, window_size = 12, hidden_size = 200, stride = (1, 1), padding = 0): - super(CNN, self).__init__() - self.layer1 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size, stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU(), - nn.MaxPool2d(pool_size, stride = stride)) - out1_size = (window_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 - maxpool_size = (out1_size + 2*padding - (pool_size[1] - 1) - 1)/stride[1] + 1 - self.layer2 = nn.Sequential( - nn.Conv2d(nb_filter, nb_filter, kernel_size, stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU(), - nn.MaxPool2d(pool_size, stride = stride)) - out2_size = (maxpool_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 - maxpool_size = (out2_size + 2*padding - (pool_size[1] - 1) - 1)/stride[1] + 1 - self.drop1 = nn.Dropout(p=0.5) - self.fc1 = nn.Linear(int(maxpool_size*labcounts*nb_filter), hidden_size) - self.bn = nn.BatchNorm1d(hidden_size) - self.drop2 = nn.Dropout(p=0.5) - self.relu1 = nn.ReLU() - self.fc2 = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - x = x.view(x.size(0), 1, x.size(1), x.size(2)) - out = self.layer1(x) - out = self.layer2(out) - out = out.view(out.size(0), -1) - out = self.drop1(out) - out = self.fc1(out) - out = self.drop2(out) - out = self.bn(out) - out = self.relu1(out) - out = self.fc2(out) - out = torch.sigmoid(out) - return out - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = 
self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -#allow multiple kernel with differnt kernel size -class CNN_MLF(nn.Module): - """ - It is a deep CNNs with three different kernel size, the outputs from the three CNNs are concatenated to fed into two fully connected layers. - """ - def __init__(self, nb_filter, num_classes = 2, kernel_size = (1, 5), pool_size = (1, 3), labcounts = 32, window_size = 12, hidden_size = 200, stride = (1, 1), padding = 0): - super(CNN_MLF, self).__init__() - self.layer1 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size = (1, 3), stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU(), - nn.MaxPool2d(pool_size, stride = stride)) - out1_size = (window_size + 2*padding - (3 - 1) - 1)/stride[1] + 1 - maxpool1_size = (out1_size + 2*padding - (pool_size[1] - 1) - 1)/stride[1] + 1 - self.layer2 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size = (1, 4), stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU(), - nn.MaxPool2d(pool_size, stride = stride)) - out2_size = (window_size + 2*padding - (4 - 1) - 1)/stride[1] + 1 #4 is the convolve filter size - maxpool2_size = (out2_size + 2*padding - (pool_size[1] - 1) - 1)/stride[1] + 1 - self.layer3 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size = (1, 5), stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU(), - nn.MaxPool2d(pool_size, stride = stride)) - out3_size = (window_size + 2*padding - (5 - 1) - 1)/stride[1] + 1 - maxpool3_size = (out3_size + 2*padding - (pool_size[1] - 1) - 1)/stride[1] + 1 - conv_outsize = maxpool1_size + maxpool2_size +maxpool3_size - self.drop1 = nn.Dropout(p=0.5) - self.fc1 = nn.Linear(conv_outsize*labcounts*nb_filter, hidden_size) - self.drop2 = nn.Dropout(p=0.5) - self.relu1 = nn.ReLU() - self.fc2 = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - x = x.view(x.size(0), 1, x.size(1), x.size(2)) - out1 = self.layer1(x) - out2 = self.layer2(x) - out3 = self.layer3(x) - out = torch.cat((out1.view(out1.size(0), -1), out2.view(out2.size(0), -1), out3.view(out2.size(0), -1)), 1) - out = self.drop1(out) - out = self.fc1(out) - out = self.drop2(out) - out = self.relu1(out) - out = self.fc2(out) - out = torch.sigmoid(out) - return out - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -class CNN_LSTM(nn.Module): - """ - It is a deep network with two layer CNN, followed by LSTM layer, which further fed into two fully connected layers. 
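The expressions of the form (window_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 that recur in the CNN classes above are the standard output-length formula for a 1-D convolution or pooling step along the time axis. A small check of that arithmetic against torch, with made-up sizes:

import torch
import torch.nn as nn

def out_len(w, kernel, padding=0, stride=1):
    # standard output-length formula, as used throughout the deleted CNN classes
    return (w + 2 * padding - (kernel - 1) - 1) // stride + 1

w = 12                                    # window_size (number of time steps)
print(out_len(w, 5))                      # 8
x = torch.zeros(1, 1, 32, w)              # batch x channels x labcounts x window_size
conv = nn.Conv2d(1, 4, kernel_size=(1, 5))
print(conv(x).shape)                      # torch.Size([1, 4, 32, 8])
print(out_len(8, 3))                      # 6, after a (1, 3) max-pool with stride 1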
- """ - def __init__(self, nb_filter, num_classes = 2, kernel_size = (1, 5), pool_size = (1, 3), labcounts = 32, window_size = 12, hidden_size = 100, stride = (1, 1), padding = 0, num_layers = 2): - super(CNN_LSTM, self).__init__() - self.layer1 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size, stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU(), - nn.MaxPool2d(pool_size, stride = stride)) - self.num_layers = num_layers - self.hidden_size = hidden_size - out1_size = (window_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 - maxpool_size = (out1_size + 2*padding - (pool_size[1] - 1) - 1)/stride[1] + 1 - self.downsample = nn.Conv2d(nb_filter, 1, kernel_size, stride = stride, padding = padding) - input_size = (maxpool_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 - self.layer2 = nn.LSTM(input_size, hidden_size, num_layers, batch_first = True) - self.drop1 = nn.Dropout(p=0.5) - self.fc1 = nn.Linear(hidden_size, hidden_size) - self.drop2 = nn.Dropout(p=0.5) - self.relu1 = nn.ReLU() - self.fc2 = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - x = x.view(x.size(0), 1, x.size(1), x.size(2)) - out = self.layer1(x) - out = self.downsample(out) - out = torch.squeeze(out, 1) - if torch.cuda.is_available(): - x = x.cuda() - h0 = Variable(torch.zeros(self.num_layers, out.size(0), self.hidden_size)).cuda() - c0 = Variable(torch.zeros(self.num_layers, out.size(0), self.hidden_size)).cuda() - else: - h0 = Variable(torch.zeros(self.num_layers, out.size(0), self.hidden_size)) - c0 = Variable(torch.zeros(self.num_layers, out.size(0), self.hidden_size)) - out, hn = self.layer2(out, (h0, c0)) - out = hn[0][-1] - out = self.drop1(out) - out = self.fc1(out) - out = self.drop2(out) - out = self.relu1(out) - out = self.fc2(out) - out = torch.sigmoid(out) - return out - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -class CNN_MIX(nn.Module): - """ - It is a deep network with 2 layers CNN, which works on input and time dimension, respectively, more details refer to deepDianosis in github. 
- """ - def __init__(self, nb_filter, num_classes = 2, kernel_size = (1, 5), pool_size = (1, 3), labcounts = 32, window_size = 12, hidden_size = 100, stride = (1, 1), padding = 0): - super(CNN_MIX, self).__init__() - self.layer1 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size = (labcounts, 1), stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU()) - self.layer2 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size = (nb_filter, 1), stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU(), - nn.MaxPool2d(pool_size)) - out1_size = int(np.ceil(float(window_size)/pool_size[1])) - self.layer3 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size = kernel_size, stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU()) - - out2_size = (out1_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 - self.drop1 = nn.Dropout(p=0.5) - self.fc1 = nn.Linear(out2_size*nb_filter*nb_filter, hidden_size) - self.drop2 = nn.Dropout(p=0.5) - self.relu1 = nn.ReLU() - self.fc2 = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - x = x.view(x.size(0), 1, x.size(1), x.size(2)) - out = self.layer1(x) - out = out.view(out.size(0), out.size(2), out.size(1), out.size(3)) - out = self.layer2(out) - out = out.view(out.size(0), out.size(2), out.size(1), out.size(3)) - out = self.layer3(out) - out = out.view(out.size(0), -1) - out = self.drop1(out) - out = self.fc1(out) - out = self.drop2(out) - out = self.relu1(out) - out = self.fc2(out) - out = torch.sigmoid(out) - return out - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -class CNN_MULTI(nn.Module): - """ - It is a deep network with multiple resolution, more details refer to multiresconvnet of deepDianosis in github. 
- """ - def __init__(self, nb_filter, num_classes = 2, kernel_size = (1, 5), pool_size = (1, 2), labcounts = 32, window_size = 12, hidden_size = 100, stride = (1, 1), padding = 0): - super(CNN_MULTI, self).__init__() - # resolution 1 - self.pool1_1 = nn.MaxPool2d(pool_size, stride = pool_size) - maxpool_size = (window_size + 2*padding - (pool_size[1] - 1) - 1)/pool_size[1] + 1 - self.pool1_2 = nn.MaxPool2d(pool_size, stride = pool_size) - maxpool1_2_size = (maxpool_size + 2*padding - (pool_size[1] - 1) - 1)/pool_size[1] + 1 - - self.layer1 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size = kernel_size, stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU()) - cnn1_size = (maxpool1_2_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 - #resolution 2 - self.pool2_1 = nn.MaxPool2d(pool_size, stride = pool_size) - maxpool2_1_size = (window_size + 2*padding - (pool_size[1] - 1) - 1)/pool_size[1] + 1 - - self.layer2 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size = kernel_size, stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU()) - cnn2_size = (maxpool2_1_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 - self.layer3 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size = kernel_size, stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU(), - nn.MaxPool2d(pool_size)) - cnn3_size = (window_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 - maxpool3_size = (cnn3_size + 2*padding - (pool_size[1] - 1) - 1)/pool_size[1] + 1 - self.layer4 = nn.Sequential( - nn.Conv2d(nb_filter, nb_filter, kernel_size = kernel_size, stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU()) - cnn4_size = (maxpool3_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 - merge_size = cnn1_size + cnn2_size + cnn4_size - self.drop1 = nn.Dropout(p=0.5) - self.fc1 = nn.Linear(labcounts*nb_filter*merge_size, hidden_size) - self.drop2 = nn.Dropout(p=0.5) - self.relu1 = nn.ReLU() - self.fc2 = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - x = x.view(x.size(0), 1, x.size(1), x.size(2)) - out = self.pool1_1(x) - out = self.pool1_2(out) - out1 = self.layer1(out) - out = self.pool2_1(x) - out2 = self.layer2(out) - out = self.layer3(x) - out3 = self.layer4(out) - out = torch.cat((out1.view(out1.size(0), -1), out2.view(out2.size(0), -1), out3.view(out3.size(0), -1)), 1) - out = self.drop1(out) - out = self.fc1(out) - out = self.drop2(out) - out = self.relu1(out) - out = self.fc2(out) - out = torch.sigmoid(out) - return out - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -# 1x3 Convolution -def convR(in_channels, out_channels, kernel_size, stride=1, padding = (0, 1)): - return nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, - padding=padding, stride=stride, bias=False) - - -# Residual Block -class ResidualBlock(nn.Module): - def __init__(self, in_channel, nb_filter = 16, kernel_size = (1, 3), stride=1, downsample=None): - super(ResidualBlock, self).__init__() - self.conv1 = convR(in_channel, nb_filter, kernel_size = kernel_size, stride = stride) - self.bn1 = nn.BatchNorm2d(nb_filter) - self.relu = nn.ReLU(inplace=True) - self.conv2 = convR(nb_filter, nb_filter, kernel_size = kernel_size, stride = stride) - self.bn2 = nn.BatchNorm2d(nb_filter) - 
self.downsample = downsample - - def forward(self, x): - residual = x - out = self.conv1(x) - out = self.bn1(out) - out = self.relu(out) - out = self.conv2(out) - out = self.bn2(out) - if self.downsample: - residual = self.downsample(x) - out += residual - out = self.relu(out) - return out - - -# ResNet Module -class ResNet(nn.Module): - def __init__(self, block, layers, nb_filter = 16, labcounts = 12, window_size = 36, kernel_size = (1, 3), pool_size = (1, 3), num_classes=2, hidden_size = 100): - super(ResNet, self).__init__() - self.in_channels = 1 - self.conv = convR(self.in_channels, nb_filter, kernel_size = kernel_size) - self.bn = nn.BatchNorm2d(nb_filter) - self.relu = nn.ReLU(inplace=True) - self.layer1 = self.make_layer(block, nb_filter, layers[0], kernel_size = kernel_size) - self.layer2 = self.make_layer(block, nb_filter*2, layers[1], 1, kernel_size = kernel_size, in_channels = nb_filter) - self.layer3 = self.make_layer(block, nb_filter*4, layers[2], 1, kernel_size = kernel_size, in_channels = 2*nb_filter) - self.avg_pool = nn.AvgPool2d(pool_size) - avgpool2_1_size = (window_size - (pool_size[1] - 1) - 1)/pool_size[1] + 1 - last_layer_size = nb_filter*4*labcounts*avgpool2_1_size - self.fc = nn.Linear(last_layer_size, hidden_size) - self.drop2 = nn.Dropout(p=0.5) - self.relu1 = nn.ReLU() - self.fc2 = nn.Linear(hidden_size, num_classes) - - def make_layer(self, block, out_channels, blocks, stride=1, kernel_size = (1, 3), in_channels = 16): - downsample = None - if (stride != 1) or (self.in_channels != out_channels): - downsample = nn.Sequential( - convR(in_channels, out_channels, kernel_size = kernel_size, stride=stride), - nn.BatchNorm2d(out_channels)) - layers = [] - layers.append(block(in_channels, out_channels, kernel_size = kernel_size, stride = stride, downsample = downsample)) - self.in_channels = out_channels - for i in range(1, blocks): - layers.append(block(out_channels, out_channels, kernel_size = kernel_size)) - return nn.Sequential(*layers) - - def forward(self, x): - x = x.view(x.size(0), 1, x.size(1), x.size(2)) - out = self.conv(x) - out = self.bn(out) - out = self.relu(out) - out = self.layer1(out) - out = self.layer2(out) - out = self.layer3(out) - out = self.avg_pool(out) - out = out.view(out.size(0), -1) - out = self.fc(out) - out = self.drop2(out) - out = self.relu1(out) - out = self.fc2(out) - out = torch.sigmoid(out) - return out - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -class GRU(nn.Module): - """ - It is a deep network with one GRU layer, which are further fed into one fully connected layers. 
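The GRU classifier defined here (like the RNN and BiRNN variants after it) is built with batch_first=True, so nn.GRU expects input shaped (batch, sequence length, features); a matrix laid out as patients x covariates x time windows, as convert_to_3d_matrix describes, would therefore need a transpose before being fed in. A small shape sketch with made-up sizes, separate from the class itself:

import torch
import torch.nn as nn

n_patients, n_covariates, n_windows = 4, 10, 31
x = torch.randn(n_patients, n_covariates, n_windows)    # patients x covariates x time

gru = nn.GRU(input_size=n_covariates, hidden_size=16, num_layers=1, batch_first=True)
out, hn = gru(x.transpose(1, 2))                        # -> (batch, time, covariates)
scores = torch.sigmoid(nn.Linear(16, 2)(hn[-1]))        # last hidden state -> class scores
print(out.shape, hn.shape, scores.shape)                # (4, 31, 16) (1, 4, 16) (4, 2)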
- """ - def __init__(self, input_size, hidden_size, num_layers, num_classes = 2, dropout = 0.5): - super(GRU, self).__init__() - - self.hidden_size = hidden_size - self.num_layers = num_layers - self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first = True, dropout = dropout) - self.linear = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - if torch.cuda.is_available(): - x = x.cuda() - h0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)).cuda() # 2 for bidirection - else: - h0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)) # 2 for bidirection - self.gru.flatten_parameters() - out, hn = self.gru(x, h0) - - rearranged = hn[-1] - out = self.linear(rearranged) - out = torch.sigmoid(out) - return out - - def initHidden(self, N): - return Variable(torch.randn(1, N, self.hidden_size)) - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -class RNN(nn.Module): - """ - It is a deep network with one LSTM layer, which are further fed into one fully connected layer. - """ - def __init__(self, input_size, hidden_size, num_layers, num_classes = 2, dropout = 0.5): - super(RNN, self).__init__() - self.hidden_size = hidden_size - self.num_layers = num_layers - self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first = True, dropout = dropout) - self.fc = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - if torch.cuda.is_available(): - x = x.cuda() - h0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)).cuda() - c0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)).cuda() - else: - h0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)) - c0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)) - self.lstm.flatten_parameters() - out, hn = self.lstm(x, (h0, c0)) - rearranged = hn[0][-1] - # Decode hidden state of last time step - out = self.fc(rearranged) - out = torch.sigmoid(out) - return out - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -class BiRNN(nn.Module): - """ - It is a deep network with one bidirectional LSTM layer, which are further fed into one fully connected layer. 
- """ - def __init__(self, input_size, hidden_size, num_layers, num_classes = 2, dropout = 0.5): - super(BiRNN, self).__init__() - self.hidden_size = hidden_size - self.num_layers = num_layers - self.lstm = nn.LSTM(input_size, hidden_size, num_layers, - batch_first = True, dropout = dropout, bidirectional=True) - self.fc = nn.Linear(hidden_size*2, num_classes) # 2 for bidirection - - def forward(self, x): - if torch.cuda.is_available(): - x = x.cuda() - h0 = Variable(torch.zeros(self.num_layers*2, x.size(0), self.hidden_size)).cuda() # 2 for bidirection - c0 = Variable(torch.zeros(self.num_layers*2, x.size(0), self.hidden_size)).cuda() - else: - h0 = Variable(torch.zeros(self.num_layers*2, x.size(0), self.hidden_size)) # 2 for bidirection - c0 = Variable(torch.zeros(self.num_layers*2, x.size(0), self.hidden_size)) - self.lstm.flatten_parameters() - out, hn = self.lstm(x, (h0, c0)) - hn = hn[0] - - rearranged = hn[-2:].view(x.size(0), -1) - # Decode hidden state of last time step - out = self.fc(rearranged) - out = torch.sigmoid(out) - return out - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -# select model -if __name__ == "__main__": - if model_type in ['LogisticRegression', 'MLP', 'SNN']: - y = population[:, 1] - X = plpData[population[:, 0], :] - trainInds = population[:, population.shape[1] - 1] > 0 - if class_weight == -1: - loss = tu.FocalLoss(gamma = 5) - else: - if class_weight == 0: - weights = float(np.count_nonzero(y))/y.shape[0] - class_weight = [1 - weights, weights] - else: - class_weight = [class_weight, 1] - class_weight = 1/torch.Tensor(class_weight) - if torch.cuda.is_available(): - class_weight = class_weight.cuda() - loss=nn.CrossEntropyLoss(weight = class_weight) - - print("Dataset has %s rows and %s columns" % (X.shape[0], X.shape[1])) - print("population loaded- %s rows and %s columns" % (np.shape(population)[0], np.shape(population)[1])) - ########################################################################### - l1regularization = False - - if train: - pred_size = int(np.sum(population[:, population.shape[1] - 1] > 0)) - print("Calculating prediction for train set of size %s" % (pred_size)) - test_pred = np.zeros(pred_size) # zeros length sum(population[:,population.size[1]] ==i) - for i in range(1, int(np.max(population[:, population.shape[1] - 1]) + 1), 1): - testInd = population[population[:, population.shape[1] - 1] > 0, population.shape[1] - 1] == i - trainInd = (population[population[:, population.shape[1] - 1] > 0, population.shape[1] - 1] != i) - train_x = X[trainInds, :][trainInd, :] - train_y = y[trainInds][trainInd] - - test_x = X[trainInds, :][testInd, :] - print("Fold %s split %s in train set and %s in test set" % (i, train_x.shape[0], test_x.shape[0])) - print("Train set contains %s outcomes " % (np.sum(train_y))) - train_x = train_x.toarray() - test_x = test_x.toarray() - - if autoencoder: - print('first train stakced autoencoder') - encoding_size = 256 - if vae: - auto_model = VAE(input_size=train_x.shape[1], encoding_size=encoding_size) - else: - auto_model = AutoEncoder(input_size=train_x.shape[1], encoding_size=encoding_size) - if torch.cuda.is_available(): - auto_model = auto_model.cuda() - clf = tu.Estimator(auto_model) - clf.compile(optimizer=torch.optim.Adam(auto_model.parameters(), lr=1e-3, weight_decay = w_decay), - 
loss=nn.MSELoss()) - clf.fit(train_x, train_y, batch_size=32, nb_epoch=epochs, autoencoder = autoencoder, vae = vae) - #split to batch for large dataset - train_batch = tu.batch(train_x, batch_size=32) - train_x = np.array([]).reshape(0, encoding_size) - for train in train_batch: - encode_train = auto_model.get_encode_features(train) - train_x = np.concatenate((train_x, encode_train), axis=0) - #train_x = auto_model.get_encode_features(train_x.toarray()) - #test_x = auto_model.get_encode_features(test_x.toarray()) - test_batch = tu.batch(test_x, batch_size=32) - test_x = np.array([]).reshape(0, encoding_size) - for test in test_batch: - encode_Test = auto_model.get_encode_features(test) - test_x = np.concatenate((test_x, encode_Test), axis=0) - del auto_model - del clf - # train on fold - print("Training fold %s" % (i)) - start_time = timeit.default_timer() - if model_type == 'LogisticRegression': - model = LogisticRegression(train_x.shape[1]) - l1regularization = True - elif model_type == 'SNN': - model = SNN(train_x.shape[1], size) - else: - model = MLP(train_x.shape[1], size) - - if torch.cuda.is_available(): - model = model.cuda() - clf = tu.Estimator(model) - clf.compile(optimizer=torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay = w_decay), - loss=loss) - #if not autoencoder: - # train_x = train_x.toarray() - # test_x = test_x.toarray() - - clf.fit(train_x, train_y, batch_size=32, nb_epoch=epochs, l1regularization = l1regularization) - - ind = (population[:, population.shape[1] - 1] > 0) - ind = population[ind, population.shape[1] - 1] == i - - test_input_var = torch.from_numpy(test_x.astype(np.float32)) - - test_batch = tu.batch(test_x, batch_size = 32) - temp = [] - for test in test_batch: - pred_test1 = model.predict_proba(test)[:, 1] - temp = np.concatenate((temp, pred_test1), axis = 0) - #temp = model.predict_proba(test_input_var)[:, 1] - #temp = preds.data.cpu().numpy().flatten() - #print temp - test_pred[ind] = temp - print("Prediction complete: %s rows " % (np.shape(test_pred[ind])[0])) - print("Mean: %s prediction value" % (np.mean(test_pred[ind]))) - - # merge pred with indexes[testInd,:] - test_pred.shape = (population[population[:, population.shape[1] - 1] > 0, :].shape[0], 1) - prediction = np.append(population[population[:, population.shape[1] - 1] > 0, :], test_pred, axis=1) - - # train final: - else: - print("Training final neural network model on all train data...") - print("X- %s rows and Y %s length" % (X[trainInds, :].shape[0], y[trainInds].shape[0])) - - start_time = timeit.default_timer() - - train_x = X[trainInds, :] - train_x = train_x.toarray() - train_y = y[trainInds] - if not os.path.exists(modelOutput): - os.makedirs(modelOutput) - if autoencoder: - encoding_size = 256 - if vae: - auto_model = VAE(input_size=train_x.shape[1], encoding_size=encoding_size) - else: - auto_model = AutoEncoder(input_size=train_x.shape[1], encoding_size=encoding_size) - if torch.cuda.is_available(): - auto_model = auto_model.cuda() - clf = tu.Estimator(auto_model) - clf.compile(optimizer=torch.optim.Adam(auto_model.parameters(), lr=1e-3, weight_decay=w_decay), - loss=nn.MSELoss()) - - clf.fit(train_x, train_y, batch_size=32, nb_epoch=epochs, autoencoder=autoencoder, vae = vae) - #train_x = auto_model.get_encode_features(train_x.toarray()) - train_batch = tu.batch(train_x, batch_size=32) - train_x = np.array([]).reshape(0, encoding_size) - for train in train_batch: - encode_train = auto_model.get_encode_features(train) - train_x = np.concatenate((train_x, encode_train), 
axis=0) - joblib.dump(auto_model, os.path.join(modelOutput, 'autoencoder_model.pkl')) - del auto_model - del clf - - print('the final parameter epochs %.2f weight_decay %.2f' %(epochs,w_decay)) - if model_type == 'LogisticRegression': - model = LogisticRegression(train_x.shape[1]) - l1regularization = True - elif model_type == 'SNN': - model = SNN(train_x.shape[1], size) - else: - model = MLP(train_x.shape[1], size) - - #if not autoencoder: - # train_x = train_x.toarray() - - if torch.cuda.is_available(): - model = model.cuda() - clf = tu.Estimator(model) - clf.compile(optimizer=torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay = w_decay), - loss=loss) - clf.fit(train_x, train_y, batch_size=32, nb_epoch=epochs, l1regularization = l1regularization) - - end_time = timeit.default_timer() - print("Training final took: %.2f s" % (end_time - start_time)) - - # save the model: - print("Model saved to: %s" % (modelOutput)) - - joblib.dump(model, os.path.join(modelOutput,'model.pkl')) - - - elif model_type in ['CNN', 'RNN', 'CNN_LSTM', 'CNN_MLF', 'CNN_MIX', 'GRU', 'BiRNN', 'CNN_MULTI', 'ResNet']: - #print 'running model', model_type - y = population[:, 1] - #plpData = plpData[population[:, 0], :] - #config = tf.ConfigProto() - #config.gpu_options.allow_growth = True - #with tf.Session(config=config) as sess: - # X = tf.sparse_reorder(plpData) - # X = tf.sparse_tensor_to_dense(X) - # X = sess.run(X) - #tu.forward_impute_missing_value(X) - X = plpData.to_dense().numpy() - X = X[np.int64(population[:, 0]), :] - ''' - p_ids_in_cov = set(covariates[:, 0]) - full_covariates = np.array([]).reshape(0,4) - default_covid = covariates[0, 1] - timeid_len = len(set(covariates[:, -2])) - for p_id in population[:, 0]: - if p_id not in p_ids_in_cov: - tmp_x = np.array([p_id, default_covid, 1, 0]).reshape(1,4) #default cov id, timeid=1 - full_covariates = np.concatenate((full_covariates, tmp_x), axis=0) - else: - tmp_x = covariates[covariates[:, 0] == p_id, :] - full_covariates = np.concatenate((full_covariates, tmp_x), axis=0) - - trainInds = population[:, population.shape[1] - 1] > 0 - X, patient_keys = tu.convert_to_temporal_format(full_covariates, timeid_len= timeid_len) - full_covariates = [] - ''' - if class_weight == -1: - loss = tu.FocalLoss(gamma = 3) - else: - if class_weight == 0: - weights = float(np.count_nonzero(y))/y.shape[0] - class_weight = [1 - weights, weights] - else: - class_weight = [class_weight, 1] - class_weight = 1/torch.Tensor(class_weight) - if torch.cuda.is_available(): - class_weight = class_weight.cuda() - loss=nn.CrossEntropyLoss(weight = class_weight) - trainInds = population[:, population.shape[1] - 1] > 0 - if train: - pred_size = int(np.sum(population[:, population.shape[1] - 1] > 0)) - print("Calculating prediction for train set of size %s" % (pred_size)) - test_pred = np.zeros(pred_size) # zeros length sum(population[:,population.size[1]] ==i) - for i in range(1, int(np.max(population[:, population.shape[1] - 1]) + 1), 1): - testInd = population[population[:, population.shape[1] - 1] > 0, population.shape[1] - 1] == i - trainInd = (population[population[:, population.shape[1] - 1] > 0, population.shape[1] - 1] != i) - train_x = X[trainInds, :][trainInd, :] - train_y = y[trainInds][trainInd] - - test_x = X[trainInds, :][testInd, :] - print("Fold %s split %s in train set and %s in test set" % (i, train_x.shape[0], test_x.shape[0])) - print("Train set contains %s outcomes " % (np.sum(train_y))) - - # train on fold - learning_rate = 0.001 - print("Training fold %s" % (i)) 
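            # Each fold uses the last column of `population` as the fold index:
            # rows with a positive index are training rows, and fold `i` is held
            # out for evaluation. A temporal model matching `model_type` is
            # instantiated below and trained with the class-weighted
            # CrossEntropyLoss (or FocalLoss) selected above.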
- start_time = timeit.default_timer() - if model_type == 'CNN': - model = CNN(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) - elif model_type == 'CNN_LSTM': - model = CNN_LSTM(nb_filter = nbfilters, labcounts=train_x.shape[1], window_size=train_x.shape[2]) - elif model_type == 'CNN_MLF': # multiple kernels with different size - model = CNN_MLF(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) - elif model_type == 'CNN_MIX': # mixed model from deepDiagnosis - model = CNN_MIX(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) - elif model_type == 'CNN_MULTI': # multiple resolution model from deepDiagnosis - model = CNN_MULTI(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) - elif model_type == 'ResNet': - print('train ResNet') - model = ResNet(ResidualBlock, [3, 3, 3], nb_filter=nbfilters, labcounts=train_x.shape[1], window_size=train_x.shape[2]) - elif model_type == 'RNN': - model = RNN(train_x.shape[2], hidden_size, 2, 2) - elif model_type == 'BiRNN': - model = BiRNN(train_x.shape[2], hidden_size, 2, 2) - elif model_type == 'GRU': - model = GRU(train_x.shape[2], hidden_size, 2, 2) - else: - print('temproal data not supported by this model') - - if torch.cuda.is_available(): - model = model.cuda() - clf = tu.Estimator(model) - clf.compile(optimizer=torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay = 0.0001), - loss=loss) - - clf.fit(train_x, train_y, batch_size=32, nb_epoch=epochs) - - ind = (population[:, population.shape[1] - 1] > 0) - ind = population[ind, population.shape[1] - 1] == i - - test_batch = tu.batch(test_x, batch_size = 32) - temp = [] - for test in test_batch: - pred_test1 = model.predict_proba(test)[:, 1] - temp = np.concatenate((temp, pred_test1), axis = 0) - - test_pred[ind] = temp - del model - print("Prediction complete: %s rows " % (np.shape(test_pred[ind])[0])) - print("Mean: %s prediction value" % (np.mean(test_pred[ind]))) - - # merge pred with indexes[testInd,:] - test_pred.shape = (population[population[:, population.shape[1] - 1] > 0, :].shape[0], 1) - prediction = np.append(population[population[:, population.shape[1] - 1] > 0, :], test_pred, axis=1) - - # train final: - else: - print("Training final neural network model on all train data...") - print("X- %s rows and Y %s length" % (X[trainInds, :].shape[0], y[trainInds].shape[0])) - - start_time = timeit.default_timer() - - train_x = X[trainInds, :] - train_y = y[trainInds] - learning_rate = 0.001 - - if model_type == 'CNN': - model = CNN(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) - elif model_type == 'CNN_LSTM': - model = CNN_LSTM(nb_filter=nbfilters, labcounts=train_x.shape[1], window_size=train_x.shape[2]) - elif model_type == 'CNN_MLF': # multiple kernels with different size - model = CNN_MLF(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) - elif model_type == 'CNN_MIX': #mixed model from deepDiagnosis - model = CNN_MIX(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) - elif model_type == 'CNN_MULTI': # multi resolution model from deepDiagnosis - model = CNN_MULTI(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) - elif model_type == 'ResNet': - model = ResNet(ResidualBlock, [3, 3, 3], nb_filter=nbfilters, labcounts=train_x.shape[1], window_size=train_x.shape[2]) - elif model_type == 'RNN': - model = 
RNN(train_x.shape[2], hidden_size, 2, 2) - elif model_type == 'BiRNN': - model = BiRNN(train_x.shape[2], hidden_size, 2, 2) - elif model_type == 'GRU': - model = GRU(train_x.shape[2], hidden_size, 2, 2) - else: - print('temproal data not supported by this model') - - if torch.cuda.is_available(): - model = model.cuda() - clf = tu.Estimator(model) - clf.compile(optimizer=torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay = 0.0001), - loss=loss) - clf.fit(train_x, train_y, batch_size=32, nb_epoch=epochs) - - end_time = timeit.default_timer() - print("Training final took: %.2f s" % (end_time - start_time)) - - # save the model: - if not os.path.exists(modelOutput): - os.makedirs(modelOutput) - print("Model saved to: %s" % (modelOutput)) - - joblib.dump(model, os.path.join(modelOutput,'model.pkl')) - - # prediction on train: - test_batch = tu.batch(train_x, batch_size = 32) - test_pred = [] - for test in test_batch: - pred_test1 = model.predict_proba(test)[:, 1] - test_pred = np.concatenate((test_pred, pred_test1), axis = 0) - test_pred.shape = (population.shape[0], 1) - prediction = np.append(population, test_pred, axis=1) - -''' -if __name__ == "__main__": - DATA_SIZE = 1000 - INPUT_SIZE = 36 - HIDDEN_SIZE = 100 - class_size = 2 - #X = np.random.randn(DATA_SIZE * class_size, 18, INPUT_SIZE) - X = np.random.randn(DATA_SIZE * class_size, INPUT_SIZE) - y = np.array([i for i in range(class_size) for _ in range(DATA_SIZE)]) - - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2) - model = LogisticRegression(X_train.shape[1]) - l1regularization = True - #model = CNN_LSTM(nb_filter = 16, labcounts = X.shape[1], window_size = X.shape[2]) - #model = ResNet(ResidualBlock, [3, 3, 3], nb_filter = 16, labcounts = X.shape[1], window_size = X.shape[2]) - #model = RNN(INPUT_SIZE, HIDDEN_SIZE, 2, class_size) - #pdb.set_trace() - if cuda: - model = model.cuda() - clf = Estimator(model) - clf.compile(optimizer=torch.optim.Adam(model.parameters(), lr=1e-4), - loss=nn.CrossEntropyLoss()) - clf.fit(X_train, y_train, batch_size=64, nb_epoch=10, - validation_data=(X_test, y_test), l1regularization = l1regularization) - score, auc = clf.evaluate(X_test, y_test) - - print('Test score: %s' %(auc)) -''' diff --git a/inst/python/deepTorchFunctions.py b/inst/python/deepTorchFunctions.py deleted file mode 100644 index ff2f6f0..0000000 --- a/inst/python/deepTorchFunctions.py +++ /dev/null @@ -1,1732 +0,0 @@ -""" - deepTorch.py: It implements different deep learning classifiers - - Copyright 2016 Observational Health Data Sciences and Informatics - - This file is part of PatientLevelPrediction - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-""" - -import sys -import os -import pdb -import torch -import torch.nn as nn -import torch.nn.functional as F -from torch.autograd import Variable -from torch.utils.data import DataLoader, TensorDataset -from torch.nn.utils.rnn import pack_padded_sequence -from torch.nn.utils.rnn import pad_packed_sequence -from collections import OrderedDict -import timeit -import joblib -from sklearn.model_selection import train_test_split -from sklearn.metrics import roc_auc_score -import numpy as np - -import warnings -warnings.filterwarnings("ignore") - -class FocalLoss(nn.Module): - """ - Method to handle data imbalance based on paper (arXiv:1708.02002) entitled - Focal loss for dense object detection. - Loss(x, class) = - (1-softmax(x)[class])^gamma \log(softmax(x)[class]) - - """ - - def __init__(self, gamma=5, eps=1e-7, size_average=False): - super(FocalLoss, self).__init__() - self.gamma = gamma - self.eps = eps - self.size_average = size_average - - def forward(self, input, target): - y = self.one_hot(target, input.size(-1)) - logit = F.softmax(input) - logit = logit.clamp(self.eps, 1. - self.eps) - - loss = -1 * y * torch.log(logit) # cross entropy - loss = loss * (1 - logit) ** self.gamma # focal loss - - if self.size_average: - loss = loss.mean() - else: - loss = loss.sum() - return loss - - def one_hot(self, index, classes): - """ - - :param index: is the labels - :param classes: number if classes - :return: - """ - size = index.size() + (classes,) - view = index.size() + (1,) - - mask = torch.Tensor(*size).fill_(0) - index = index.view(*view) - ones = 1. - - if isinstance(index, Variable): - ones = Variable(torch.Tensor(index.size()).fill_(1)) - mask = Variable(mask, volatile=index.volatile) - if torch.cuda.is_available(): - ones = ones.cuda() - mask = mask.cuda() - - return mask.scatter_(1, index, ones) - -def loss_function(recon_x, x, mu, logvar): - """Loss function for varational autoencoder VAE""" - BCE = F.binary_cross_entropy(recon_x, x, size_average=False) - - # 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2) - KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp()) - - return BCE + KLD - - -def mixup_data(x, y, alpha=1.0): - - '''Compute the mixup data. Return mixed inputs, pairs of targets, and lambda - Data Augmentation method based on paper (arXiv:1710.09412) entitled - mixup: Beyond empirical risk minimization. - ''' - if alpha > 0.: - lam = np.random.beta(alpha, alpha) - else: - lam = 1. - batch_size = x.size()[0] - if torch.cuda.is_available(): - index = torch.randperm(batch_size).cuda() - else: - index = torch.randperm(batch_size) - - mixed_x = lam * x + (1 - lam) * x[index,:] - y_a, y_b = y, y[index] - return mixed_x, y_a, y_b, lam - -def mixup_criterion(y_a, y_b, lam): - return lambda criterion, pred: lam * criterion(pred, y_a) + (1 - lam) * criterion(pred, y_b) - -def early_stop(metrics_hist, patience = 3): - if not np.all(np.isnan(metrics_hist)): - return np.nanargmin(metrics_hist) > len(metrics_hist) - patience - else: - #keep training if criterion results have all been nan so far - return False - - -class Estimator(object): - """ - It is used for training different deep models in the same interface. 
- """ - - def __init__(self, model): - self.model = model - - def compile(self, optimizer, loss): - self.optimizer = optimizer - self.loss_f = loss - - def _fit(self, train_loader, l1regularization=False, autoencoder=False, mixup=False, vae = False): - """ - train one epoch - - :param train_loader: The data loaded using DataLoader - :param l1regularization: default False - :return: the return fitted loss and accuracy - """ - loss_list = [] - acc_list = [] - for idx, (X, y) in enumerate(train_loader): - X_v = Variable(X) - y_v = Variable(y) - if torch.cuda.is_available(): - X_v = X_v.cuda() - y_v = y_v.cuda() - - if mixup: - X_v, y_v_a, y_v_b, lam = mixup_data(X_v, y_v) - X_v, y_v_a, y_v_b = Variable(X_v), Variable(y_v_a), Variable(y_v_b) - - # print 'GPU id', torch.cuda.current_device() - self.optimizer.zero_grad() - # the below comemnted lines are used for multiple GPU training - # if torch.cuda.device_count() > 1: - # net = torch.nn.DataParallel(self.model, device_ids = range(torch.cuda.device_count())) - # if cuda: - # net = net.cuda() - # y_pred = net(X_v) - if autoencoder: - if vae: - y_pred, mu, logvar = self.model(X_v) - loss = loss_function(y_pred, X_v, mu, logvar) - else: - y_pred = self.model(X_v) - loss = self.loss_f(y_pred, X_v) - else: - y_pred = self.model(X_v) - - loss = self.loss_f(y_pred, y_v) - - if mixup: - loss_func = mixup_criterion(y_v_a, y_v_b, lam) - loss = loss_func(self.loss_f, y_pred) - - if l1regularization: - l1_crit = nn.L1Loss(size_average=False) - reg_loss = 0 - for param in self.model.parameters(): - target = Variable(torch.from_numpy(np.zeros(param.size()).astype(np.float32))) - if torch.cuda.is_available(): - target = target.cuda() - reg_loss += l1_crit(param, target) - - factor = 0.0005 - loss += factor * reg_loss - - loss.backward() - self.optimizer.step() - loss_list.append(loss.item()) - if autoencoder: - acc_list.append(0) - else: - classes = torch.topk(y_pred, 1)[1].data.cpu().numpy().flatten() - acc = self._accuracy(classes, y_v.data.cpu().numpy().flatten()) - acc_list.append(acc) - del loss - del y_pred - - return sum(loss_list) / len(loss_list), sum(acc_list) / len(acc_list) - - def fit(self, X, y, batch_size=32, nb_epoch=10, validation_data=(), l1regularization=False, autoencoder =False, vae = False): - train_set = TensorDataset(torch.from_numpy(X.astype(np.float32)), - torch.from_numpy(y.astype(np.float32)).long().view(-1)) - train_loader = DataLoader(dataset=train_set, batch_size=batch_size, shuffle=True) - self.model.train() - for t in range(nb_epoch): - loss, acc = self._fit(train_loader, l1regularization=l1regularization, autoencoder = autoencoder, vae = vae) - #print loss - val_log = '' - if validation_data and not autoencoder: - val_loss, auc = self.evaluate(validation_data[0], validation_data[1], batch_size) - - val_log = "- val_loss: %06.4f - auc: %6.4f" % (val_loss, auc) - print(val_log) - # print("Epoch %s/%s loss: %06.4f - acc: %06.4f %s" % (t, nb_epoch, loss, acc, val_log)) - - def evaluate(self, X, y, batch_size=32): - y_pred = self.predict(X) - y_v = Variable(torch.from_numpy(y).long(), requires_grad=False) - if torch.cuda.is_available(): - y_v = y_v.cuda() - loss = self.loss_f(y_pred, y_v) - predict = y_pred.data.cpu().numpy()[:, 1].flatten() - auc = roc_auc_score(y, predict) - return loss.item(), auc - - def _accuracy(self, y_pred, y): - return float(sum(y_pred == y)) / y.shape[0] - - def predict(self, X): - X = Variable(torch.from_numpy(X.astype(np.float32))) - if torch.cuda.is_available(): - X = X.cuda() - y_pred = 
self.model(X) - return y_pred - - def predict_proba(self, X): - self.model.eval() - return self.model.predict_proba(X) - -class EarlyStopping(object): # pylint: disable=R0902 - """ - Gives a criterion to stop training when a given metric is not - improving anymore - Args: - mode (str): One of `min`, `max`. In `min` mode, training will - be stopped when the quantity monitored has stopped - decreasing; in `max` mode it will be stopped when the - quantity monitored has stopped increasing. Default: 'min'. - patience (int): Number of epochs with no improvement after - which training is stopped. For example, if - `patience = 2`, then we will ignore the first 2 epochs - with no improvement, and will only stop learning after the - 3rd epoch if the loss still hasn't improved then. - Default: 10. - threshold (float): Threshold for measuring the new optimum, - to only focus on significant changes. Default: 1e-4. - threshold_mode (str): One of `rel`, `abs`. In `rel` mode, - dynamic_threshold = best * ( 1 + threshold ) in 'max' - mode or best * ( 1 - threshold ) in `min` mode. - In `abs` mode, dynamic_threshold = best + threshold in - `max` mode or best - threshold in `min` mode. Default: 'rel'. - """ - - def __init__(self, mode='min', patience=3, threshold=1e-4, threshold_mode='rel'): - self.patience = patience - self.mode = mode - self.threshold = threshold - self.threshold_mode = threshold_mode - self.best = None - self.num_bad_epochs = None - self.mode_worse = None # the worse value for the chosen mode - self.is_better = None - self.last_epoch = -1 - self._init_is_better(mode=mode, threshold=threshold, - threshold_mode=threshold_mode) - self._reset() - - def _reset(self): - """Resets num_bad_epochs counter and cooldown counter.""" - self.best = self.mode_worse - self.num_bad_epochs = 0 - - def step(self, metrics, epoch=None): - """ Updates early stopping state """ - current = metrics - if epoch is None: - epoch = self.last_epoch = self.last_epoch + 1 - self.last_epoch = epoch - - if self.is_better(current, self.best): - self.best = current - self.num_bad_epochs = 0 - else: - self.num_bad_epochs += 1 - - @property - def stop(self): - """ Should we stop learning? """ - return self.num_bad_epochs > self.patience - - def _cmp(self, mode, threshold_mode, threshold, a, best): # pylint: disable=R0913, R0201 - if mode == 'min' and threshold_mode == 'rel': - rel_epsilon = 1. - threshold - return a < best * rel_epsilon - - elif mode == 'min' and threshold_mode == 'abs': - return a < best - threshold - - elif mode == 'max' and threshold_mode == 'rel': - rel_epsilon = threshold + 1. 
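            # In 'max' mode with a relative threshold, an improvement means the
            # metric exceeds best * (1 + threshold); the fallthrough below
            # handles 'max' mode with an absolute threshold.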
- return a > best * rel_epsilon - - return a > best + threshold - - def _init_is_better(self, mode, threshold, threshold_mode): - if mode not in {'min', 'max'}: - raise ValueError('mode ' + mode + ' is unknown!') - if threshold_mode not in {'rel', 'abs'}: - raise ValueError('threshold mode ' + threshold_mode + ' is unknown!') - - if mode == 'min': - self.mode_worse = float('inf') - else: # mode == 'max': - self.mode_worse = (-float('inf')) - - self.is_better = partial(self._cmp, mode, threshold_mode, threshold) - - def state_dict(self): - """ Returns early stopping state """ - return {key: value for key, value in self.__dict__.items() if key != 'is_better'} - - def load_state_dict(self, state_dict): - """ Loads early stopping state """ - self.__dict__.update(state_dict) - self._init_is_better(mode=self.mode, threshold=self.threshold, - threshold_mode=self.threshold_mode) - -def adjust_learning_rate(learning_rate, optimizer, epoch): - """Sets the learning rate to the initial LR decayed by 10 every 30 epochs""" - - lr = learning_rate * (0.1 ** (epoch // 10)) - - for param_group in optimizer.param_groups: - param_group['lr'] = lr - return lr - -def batch(tensor, batch_size = 50): - """ It is used to create batch samples, each batch has batch_size samples""" - tensor_list = [] - length = tensor.shape[0] - i = 0 - while True: - if (i+1) * batch_size >= length: - tensor_list.append(tensor[i * batch_size: length]) - return tensor_list - tensor_list.append(tensor[i * batch_size: (i+1) * batch_size]) - i += 1 - - -class selu(nn.Module): - def __init__(self): - super(selu, self).__init__() - self.alpha = 1.6732632423543772848170429916717 - self.scale = 1.0507009873554804934193349852946 - - def forward(self, x): - temp1 = self.scale * F.relu(x) - temp2 = self.scale * self.alpha * (F.elu(-1 * F.relu(-1 * x))) - return temp1 + temp2 - - -class alpha_drop(nn.Module): - def __init__(self, p=0.05, alpha=-1.7580993408473766, fixedPointMean=0, fixedPointVar=1): - super(alpha_drop, self).__init__() - keep_prob = 1 - p - self.a = np.sqrt( - fixedPointVar / (keep_prob * ((1 - keep_prob) * pow(alpha - fixedPointMean, 2) + fixedPointVar))) - self.b = fixedPointMean - self.a * (keep_prob * fixedPointMean + (1 - keep_prob) * alpha) - self.alpha = alpha - self.keep_prob = 1 - p - self.drop_prob = p - - def forward(self, x): - if self.keep_prob == 1 or not self.training: - # print("testing mode, direct return") - return x - else: - random_tensor = self.keep_prob + torch.rand(x.size()) - - binary_tensor = Variable(torch.floor(random_tensor)) - - if torch.cuda.is_available(): - binary_tensor = binary_tensor.cuda() - - x = x.mul(binary_tensor) - ret = x + self.alpha * (1 - binary_tensor) - ret.mul_(self.a).add_(self.b) - return ret - -def convert_to_3d_matrix(covariate_ids, patient_dict, y_dict = None, timeid_len = 31, cov_mean_dict = None): - """ - create matrix for temporal models. 
- - :param covariate_ids: the covariate ids in the whole data - :param patient_dict: the dictionary contains the data for each patient - :param y_dict: if the output labels is known, it contains the labels for patients - :param timeid_len: the total number time window gaps when extracting temporal data - :return: return the raw data in 3-D format, patients x covariates x number of windows, and the patients ids - """ - D = len(covariate_ids) - N = len(patient_dict) - T = timeid_len - concept_list =list(covariate_ids) - concept_list.sort() - x_raw = np.zeros((N, D, T), dtype=float) - patient_ind = 0 - p_ids = [] - patient_keys = patient_dict.keys() - #print covariate_ids - for kk in patient_keys: - #print('-------------------') - vals = patient_dict[kk] - #sorted(vals) - p_ids.append(int(kk)) - for timeid, meas in vals.iteritems(): - int_time = int(timeid) - 1 - for val in meas: - if not len(val): - continue - cov_id, cov_val = val - if cov_id not in covariate_ids: - continue - lab_ind = concept_list.index(cov_id) - if cov_mean_dict is None: - x_raw[patient_ind][lab_ind][int_time] = float(cov_val) - else: - mean_std = cov_mean_dict[cov_id] - if mean_std[1]: - x_raw[patient_ind][lab_ind][int_time] = (float(cov_val) - mean_std[0])/mean_std[1] - else: - x_raw[patient_ind][lab_ind][int_time] = float(cov_val) - - - patient_ind = patient_ind + 1 - - #impute the data using the value of previous timestamp - #fw = open('patient_var.txt', 'w') - for i in xrange(N): - for j in xrange(D): - temp = x_raw[i][j] - nonzero_inds = np.nonzero(temp)[0] - count_nonzeros = len(nonzero_inds) - #fw.write(str(i) + '\t' + str(count_nonzeros) + '\n') - if count_nonzeros == 1: - ind = nonzero_inds[0] - for k in xrange(ind + 1, T): - x_raw[i][j][k] = x_raw[i][j][ind] - elif count_nonzeros > 1: - for ind in xrange(1, count_nonzeros): - for k in xrange(nonzero_inds[ind -1] + 1, nonzero_inds[ind]): - x_raw[i][j][k] = x_raw[i][j][nonzero_inds[ind - 1]] - # For last nonzeros. - for k in xrange(nonzero_inds[-1] + 1, T): - x_raw[i][j][k] = x_raw[i][j][nonzero_inds[-1]] - - #fw.close() - - return x_raw, patient_keys - -def forward_impute_missing_value(x_raw): - N = x_raw.shape[0] - D = x_raw.shape[1] - T = x_raw.shape[2] - for i in xrange(N): - for j in xrange(D): - temp = x_raw[i][j] - nonzero_inds = np.nonzero(temp)[0] - count_nonzeros = len(nonzero_inds) - #fw.write(str(i) + '\t' + str(count_nonzeros) + '\n') - if count_nonzeros == 1: - ind = nonzero_inds[0] - for k in xrange(ind + 1, T): - x_raw[i][j][k] = x_raw[i][j][ind] - elif count_nonzeros > 1: - for ind in xrange(1, count_nonzeros): - for k in xrange(nonzero_inds[ind -1] + 1, nonzero_inds[ind]): - x_raw[i][j][k] = x_raw[i][j][nonzero_inds[ind - 1]] - # For last nonzeros. 
- for k in xrange(nonzero_inds[-1] + 1, T): - x_raw[i][j][k] = x_raw[i][j][nonzero_inds[-1]] - - -def convert_to_temporal_format(covariates, timeid_len= 31, normalize = True, predict = False): - """ - It reads the data from covariates extracted by FeatureExtraction package and convert it to temporal data matrix - - :param covariates: covariates extracted by FeatureExtraction package - :param timeid_len: the total number of window gaps when extracting temporal data - :return: return the raw data in 3-D format, patients x covariates x number of windows, and the patients ids - """ - patient_dict = OrderedDict() - print('Loading temporal data') - cov_vals_dict = {} - for row in covariates: - p_id, cov_id, time_id, cov_val = row[0], row[1], row[2], row[3] - cov_id = np.int64(cov_id) - #time_id = int(time_id) - cov_vals_dict.setdefault(cov_id, []).append(float(cov_val)) - if p_id not in patient_dict: - patient_dict[p_id] = {time_id: [(cov_id, cov_val)]} - else: - if time_id not in patient_dict[p_id]: - patient_dict[p_id][time_id] = [(cov_id, cov_val)] - else: - patient_dict[p_id][time_id].append((cov_id, cov_val)) - #covariate_ids.add(cov_id) - #T = 365/time_window - covariate_ids = set() - cov_mean_dict = {} - if not predict: - fw = open('covariate_mean_std.csv', 'w') - for key, val in cov_vals_dict.iteritems(): - mean_val = np.mean(val) - std_val = np.std(val) - - # Remove those covariates with few occurrence (<5) - if len(val) >= 5: - covariate_ids.add(key) - cov_mean_dict[key] = (mean_val, std_val) - fw.write(str(key) + ',' + str(mean_val) + ',' + str(std_val) + '\n') - fw.close() - else: - fp = open('covariate_mean_std.csv', 'r') - for line in fp: - values = line.rstrip().split(',') - key = np.int64(values[0]) - covariate_ids.add(key) - cov_mean_dict[key] = (float(values[1]), float(values[2])) - fp.close() - - - if normalize: - x, patient_keys = convert_to_3d_matrix(covariate_ids, patient_dict, timeid_len = timeid_len, cov_mean_dict = cov_mean_dict) - else: - x, patient_keys = convert_to_3d_matrix(covariate_ids, patient_dict, timeid_len=timeid_len) - - return x, patient_keys - - -def read_covariates(covariate_file): - patient_dict = {} - head = True - with open(covariate_file, 'r') as fp: - for line in fp: - if head: - head = False - continue - values = line.rstrip().split(',') - patient_id = values[1] - cov_id = values[2] - #time_id = int(values[-1]) - # covariates in one patient has time order - patient_dict.setdefault(patient_id, []).append((cov_id)) - new_patient = [] - for key in patient_dict.keys(): - #patient_dict[key].sort() - sort_vals = [] - for val in patient_dict[key]: - if val[1] not in sort_vals: - sort_vals.append(val) - new_patient.append(sort_vals) - - return new_patient - - -def word_embeddings(covariate_file, embedding_size=50): - import gensim.models.word2vec as w2v - modelname = "processed_%s.w2v" % ('heartfailure') - sentences = read_covariates(covariate_file) - model = w2v.Word2Vec(size=embedding_size, min_count=3, workers=4, iter=10, sg=1) - print("building word2vec vocab on %s..." 
% (covariate_file)) - - model.build_vocab(sentences) - print("training...") - model.train(sentences, total_examples=model.corpus_count, epochs=model.iter) - out_file = modelname - print("writing embeddings to %s" % (out_file)) - model.save(out_file) - return out_file - -def read_data(filename): - covariate_ids = set() - patient_dict = {} - head = True - with open(filename) as fp: - for lines in fp: - if head: - head = False - continue - lines = lines.strip('\n').strip('\r').split(',') - try: - p_id, cov_id, time_id, cov_val = lines[1], lines[2], lines[3], lines[4] - except: - pdb.set_trace() - print(p_id, cov_id, time_id) - if p_id not in patient_dict: - patient_dict[p_id] = {} - else: - if time_id not in patient_dict[p_id]: - patient_dict[p_id][time_id] = [] - else: - patient_dict[p_id][time_id].append((cov_id, cov_val)) - covariate_ids.add(cov_id) - patient_dict = {k: v for k, v in patient_dict.iteritems() if v} #remove empty patients - #(15, 2000, 60) 20000 patients, 15 lab tests, - return covariate_ids, patient_dict - -def split_training_validation(classes, validation_size=0.2, shuffle=False): - """split sampels based on balnace classes""" - num_samples = len(classes) - classes = np.array(classes) - classes_unique = np.unique(classes) - num_classes = len(classes_unique) - indices = np.arange(num_samples) - # indices_folds=np.zeros([num_samples],dtype=int) - training_indice = [] - training_label = [] - validation_indice = [] - validation_label = [] - for cl in classes_unique: - indices_cl = indices[classes == cl] - num_samples_cl = len(indices_cl) - - # split this class into k parts - if shuffle: - random.shuffle(indices_cl) # in-place shuffle - - # module and residual - num_samples_each_split = int(num_samples_cl * validation_size) - res = num_samples_cl - num_samples_each_split - - training_indice = training_indice + [val for val in indices_cl[num_samples_each_split:]] - training_label = training_label + [cl] * res - - validation_indice = validation_indice + [val for val in indices_cl[:num_samples_each_split]] - validation_label = validation_label + [cl] * num_samples_each_split - - training_index = np.arange(len(training_label)) - random.shuffle(training_index) - training_indice = np.array(training_indice)[training_index] - training_label = np.array(training_label)[training_index] - - validation_index = np.arange(len(validation_label)) - random.shuffle(validation_index) - validation_indice = np.array(validation_indice)[validation_index] - validation_label = np.array(validation_label)[validation_index] - - return training_indice, training_label, validation_indice, validation_label - - -class LogisticRegression(nn.Module): - """ - Train a logistic regression model using pytorch - """ - def __init__(self, input_size, num_classes = 2): - super(LogisticRegression, self).__init__() - self.linear = nn.Linear(input_size, num_classes) - - def forward(self, x): - out = self.linear(x) - out = torch.sigmoid(out) - return out - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -class MLP(nn.Module): - """ - Train a multiple-layer perceptron with one hideen layer - """ - def __init__(self, input_dim, hidden_size, num_classes = 2): - super(MLP, self).__init__() - self.fc1 = nn.Linear(input_dim, hidden_size) - self.fc2 = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - x = 
F.relu(self.fc1(x)) - x = F.dropout(x, p =0.5, training=self.training) - x = self.fc2(x) - x = torch.sigmoid(x) - return x - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -class SNN(nn.Module): - """ - Train a multiple-layer self normalizing neural network, ref arXiv:1706.02515 - """ - - def __init__(self, input_dim, hidden_size, num_classes=2): - super(SNN, self).__init__() - self.fc1 = nn.Linear(input_dim, hidden_size) - self.fc2 = selu() - self.ad1 = alpha_drop() - self.fc4 = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - x = F.relu(self.fc1(x)) - x = F.dropout(x, p=0.5, training=self.training) - x = self.fc2(x) - x = self.ad1(x) - x = self.fc4(x) - x = torch.sigmoid(x) - return x - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - -class AutoEncoder(nn.Module): - """ - A stacked autoencoder with 2 hiddden layers and need be adapted for EHR data. - """ - def __init__(self, input_size, encoding_size): - super(AutoEncoder, self).__init__() - - self.encoder = nn.Sequential( - nn.Linear(input_size, input_size/2), - nn.ReLU(True), - nn.Linear(input_size/2, input_size/4), - nn.ReLU(True), - nn.Linear(input_size/4, encoding_size), - nn.ReLU(True) - ) - self.decoder = nn.Sequential( - nn.Linear(encoding_size, input_size/4), - nn.ReLU(True), - nn.Linear(input_size/4, input_size/2), - nn.ReLU(True), - nn.Linear(input_size/2, input_size) - ) - - def forward(self, x): - if torch.cuda.is_available(): - x = x.cuda() - encoded = self.encoder(x) - decoded = self.decoder(encoded) - return decoded - - def get_encode_features(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - encoded = self.encoder(x) - encoded = encoded.data.cpu().numpy() - return encoded - -class VAE(nn.Module): - """ - A stacked variational autoencoder with 2 hiddden layers and need be adapted for EHR data. 
- """ - def __init__(self, input_size, encoding_size): - super(VAE, self).__init__() - - self.fc1 = nn.Linear(input_size, input_size/2) - self.fc21 = nn.Linear(input_size/2, encoding_size) - self.fc22 = nn.Linear(input_size/2, encoding_size) - self.fc3 = nn.Linear(encoding_size, input_size/2) - self.fc4 = nn.Linear(input_size/2, input_size) - - self.relu = nn.ReLU() - self.sigmoid = nn.Sigmoid() - - def encode(self, x): - h1 = self.relu(self.fc1(x)) - return self.fc21(h1), self.fc22(h1) - - def reparameterize(self, mu, logvar): - if self.training: - std = logvar.mul(0.5).exp_() - eps = Variable(std.data.new(std.size()).normal_()) - return eps.mul(std).add_(mu) - else: - return mu - - def decode(self, z): - h3 = self.relu(self.fc3(z)) - return self.sigmoid(self.fc4(h3)) - - def forward(self, x): - if torch.cuda.is_available(): - x = x.cuda() - mu, logvar = self.encode(x) - z = self.reparameterize(mu, logvar) - return self.decode(z), mu, logvar - - def get_encode_features(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - mu, logvar = self.encode(x) - encoded = self.reparameterize(mu, logvar) - encoded = encoded.data.cpu().numpy() - return encoded - -class Decoder(nn.Module): - """ VAE decoder input_size = original inputsize/16*256""" - def __init__(self, latent_size, input_size, img_channels = 1, kernel_size=(1, 4), stride=(1, 2), padding=(0, 1)): - super(Decoder, self).__init__() - self.latent_size = latent_size - self.img_channels = img_channels - - self.fc1 = nn.Linear(latent_size, input_size) - self.deconv1 = nn.ConvTranspose2d(input_size, 128, kernel_size, stride=stride, padding = padding) - self.deconv2 = nn.ConvTranspose2d(128, 64, kernel_size, stride=stride, padding = padding) - self.deconv3 = nn.ConvTranspose2d(64, 32, kernel_size, stride=stride, padding = padding) - self.deconv4 = nn.ConvTranspose2d(32, img_channels, kernel_size, stride=stride, padding = padding) - - def forward(self, x): # pylint: disable=arguments-differ - x = F.relu(self.fc1(x)) - x = x.unsqueeze(-1).unsqueeze(-1) - x = F.relu(self.deconv1(x)) - x = F.relu(self.deconv2(x)) - x = F.relu(self.deconv3(x)) - reconstruction = torch.sigmoid(self.deconv4(x)) - return reconstruction - -class Encoder(nn.Module): # pylint: disable=too-many-instance-attributes - """ VAE encoder """ - def __init__(self, latent_size, input_size, img_channels = 1, kernel_size=(1, 4), stride=(1, 2), padding=(0, 1)): - super(Encoder, self).__init__() - self.latent_size = latent_size - #self.img_size = img_size - self.img_channels = img_channels - - self.conv1 = nn.Conv2d(img_channels, 32, kernel_size, stride=stride, padding = padding) - self.conv2 = nn.Conv2d(32, 64, kernel_size, stride=stride, padding = padding) - self.conv3 = nn.Conv2d(64, 128, kernel_size, stride=stride, padding = padding) - self.conv4 = nn.Conv2d(128, 256, kernel_size, stride=stride, padding = padding) - out_size = input_size / 16 - self.fc_mu = nn.Linear(out_size, latent_size) - self.fc_logsigma = nn.Linear(out_size, latent_size) - - - def forward(self, x): # pylint: disable=arguments-differ - x = F.relu(self.conv1(x)) - x = F.relu(self.conv2(x)) - x = F.relu(self.conv3(x)) - x = F.relu(self.conv4(x)) - x = x.view(x.size(0), -1) - - mu = self.fc_mu(x) - logsigma = self.fc_logsigma(x) - - return mu, logsigma - -class VAE_CNN(nn.Module): - """ Variational Autoencoder """ - def __init__(self, latent_size, input_size): - super(VAE, self).__init__() - 
self.encoder = Encoder(latent_size, input_size) - input_size = input_size/16 - self.decoder = Decoder(latent_size, input_size) - - def forward(self, x): # pylint: disable=arguments-differ - mu, logsigma = self.encoder(x) - sigma = logsigma.exp() - eps = torch.randn_like(sigma) - z = eps.mul(sigma).add_(mu) - - recon_x = self.decoder(z) - return recon_x, mu, logsigma - - def get_encode_features(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - mu, logvar = self.encoder(x) - encoded = mu .data.cpu().numpy() - return encoded - -class CNN(nn.Module): - def __init__(self, nb_filter, num_classes = 2, kernel_size = (1, 5), pool_size = (1, 3), labcounts = 32, window_size = 12, hidden_size = 200, stride = (1, 1), padding = 0): - super(CNN, self).__init__() - self.layer1 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size, stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU(), - nn.MaxPool2d(pool_size, stride = stride)) - out1_size = (window_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 - maxpool_size = (out1_size + 2*padding - (pool_size[1] - 1) - 1)/stride[1] + 1 - self.layer2 = nn.Sequential( - nn.Conv2d(nb_filter, nb_filter, kernel_size, stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU(), - nn.MaxPool2d(pool_size, stride = stride)) - out2_size = (maxpool_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 - maxpool_size = (out2_size + 2*padding - (pool_size[1] - 1) - 1)/stride[1] + 1 - self.drop1 = nn.Dropout(p=0.5) - self.fc1 = nn.Linear(int(maxpool_size*labcounts*nb_filter), hidden_size) - self.bn = nn.BatchNorm1d(hidden_size) - self.drop2 = nn.Dropout(p=0.5) - self.relu1 = nn.ReLU() - self.fc2 = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - x = x.view(x.size(0), 1, x.size(1), x.size(2)) - out = self.layer1(x) - out = self.layer2(out) - out = out.view(out.size(0), -1) - out = self.drop1(out) - out = self.fc1(out) - out = self.drop2(out) - out = self.bn(out) - out = self.relu1(out) - out = self.fc2(out) - out = torch.sigmoid(out) - return out - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -#allow multiple kernel with differnt kernel size -class CNN_MLF(nn.Module): - """ - It is a deep CNNs with three different kernel size, the outputs from the three CNNs are concatenated to fed into two fully connected layers. 
- """ - def __init__(self, nb_filter, num_classes = 2, kernel_size = (1, 5), pool_size = (1, 3), labcounts = 32, window_size = 12, hidden_size = 200, stride = (1, 1), padding = 0): - super(CNN_MLF, self).__init__() - self.layer1 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size = (1, 3), stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU(), - nn.MaxPool2d(pool_size, stride = stride)) - out1_size = (window_size + 2*padding - (3 - 1) - 1)/stride[1] + 1 - maxpool1_size = (out1_size + 2*padding - (pool_size[1] - 1) - 1)/stride[1] + 1 - self.layer2 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size = (1, 4), stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU(), - nn.MaxPool2d(pool_size, stride = stride)) - out2_size = (window_size + 2*padding - (4 - 1) - 1)/stride[1] + 1 #4 is the convolve filter size - maxpool2_size = (out2_size + 2*padding - (pool_size[1] - 1) - 1)/stride[1] + 1 - self.layer3 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size = (1, 5), stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU(), - nn.MaxPool2d(pool_size, stride = stride)) - out3_size = (window_size + 2*padding - (5 - 1) - 1)/stride[1] + 1 - maxpool3_size = (out3_size + 2*padding - (pool_size[1] - 1) - 1)/stride[1] + 1 - conv_outsize = maxpool1_size + maxpool2_size +maxpool3_size - self.drop1 = nn.Dropout(p=0.5) - self.fc1 = nn.Linear(conv_outsize*labcounts*nb_filter, hidden_size) - self.drop2 = nn.Dropout(p=0.5) - self.relu1 = nn.ReLU() - self.fc2 = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - x = x.view(x.size(0), 1, x.size(1), x.size(2)) - out1 = self.layer1(x) - out2 = self.layer2(x) - out3 = self.layer3(x) - out = torch.cat((out1.view(out1.size(0), -1), out2.view(out2.size(0), -1), out3.view(out2.size(0), -1)), 1) - out = self.drop1(out) - out = self.fc1(out) - out = self.drop2(out) - out = self.relu1(out) - out = self.fc2(out) - out = torch.sigmoid(out) - return out - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -class CNN_LSTM(nn.Module): - """ - It is a deep network with two layer CNN, followed by LSTM layer, which further fed into two fully connected layers. 
- """ - def __init__(self, nb_filter, num_classes = 2, kernel_size = (1, 5), pool_size = (1, 3), labcounts = 32, window_size = 12, hidden_size = 100, stride = (1, 1), padding = 0, num_layers = 2): - super(CNN_LSTM, self).__init__() - self.layer1 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size, stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU(), - nn.MaxPool2d(pool_size, stride = stride)) - self.num_layers = num_layers - self.hidden_size = hidden_size - out1_size = (window_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 - maxpool_size = (out1_size + 2*padding - (pool_size[1] - 1) - 1)/stride[1] + 1 - self.downsample = nn.Conv2d(nb_filter, 1, kernel_size, stride = stride, padding = padding) - input_size = (maxpool_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 - self.layer2 = nn.LSTM(input_size, hidden_size, num_layers, batch_first = True) - self.drop1 = nn.Dropout(p=0.5) - self.fc1 = nn.Linear(hidden_size, hidden_size) - self.drop2 = nn.Dropout(p=0.5) - self.relu1 = nn.ReLU() - self.fc2 = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - x = x.view(x.size(0), 1, x.size(1), x.size(2)) - out = self.layer1(x) - out = self.downsample(out) - out = torch.squeeze(out, 1) - if torch.cuda.is_available(): - x = x.cuda() - h0 = Variable(torch.zeros(self.num_layers, out.size(0), self.hidden_size)).cuda() - c0 = Variable(torch.zeros(self.num_layers, out.size(0), self.hidden_size)).cuda() - else: - h0 = Variable(torch.zeros(self.num_layers, out.size(0), self.hidden_size)) - c0 = Variable(torch.zeros(self.num_layers, out.size(0), self.hidden_size)) - out, hn = self.layer2(out, (h0, c0)) - out = hn[0][-1] - out = self.drop1(out) - out = self.fc1(out) - out = self.drop2(out) - out = self.relu1(out) - out = self.fc2(out) - out = torch.sigmoid(out) - return out - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -class CNN_MIX(nn.Module): - """ - It is a deep network with 2 layers CNN, which works on input and time dimension, respectively, more details refer to deepDianosis in github. 
- """ - def __init__(self, nb_filter, num_classes = 2, kernel_size = (1, 5), pool_size = (1, 3), labcounts = 32, window_size = 12, hidden_size = 100, stride = (1, 1), padding = 0): - super(CNN_MIX, self).__init__() - self.layer1 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size = (labcounts, 1), stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU()) - self.layer2 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size = (nb_filter, 1), stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU(), - nn.MaxPool2d(pool_size)) - out1_size = int(np.ceil(float(window_size)/pool_size[1])) - self.layer3 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size = kernel_size, stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU()) - - out2_size = (out1_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 - self.drop1 = nn.Dropout(p=0.5) - self.fc1 = nn.Linear(out2_size*nb_filter*nb_filter, hidden_size) - self.drop2 = nn.Dropout(p=0.5) - self.relu1 = nn.ReLU() - self.fc2 = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - x = x.view(x.size(0), 1, x.size(1), x.size(2)) - out = self.layer1(x) - out = out.view(out.size(0), out.size(2), out.size(1), out.size(3)) - out = self.layer2(out) - out = out.view(out.size(0), out.size(2), out.size(1), out.size(3)) - out = self.layer3(out) - out = out.view(out.size(0), -1) - out = self.drop1(out) - out = self.fc1(out) - out = self.drop2(out) - out = self.relu1(out) - out = self.fc2(out) - out = torch.sigmoid(out) - return out - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -class CNN_MULTI(nn.Module): - """ - It is a deep network with multiple resolution, more details refer to multiresconvnet of deepDianosis in github. 
- """ - def __init__(self, nb_filter, num_classes = 2, kernel_size = (1, 5), pool_size = (1, 2), labcounts = 32, window_size = 12, hidden_size = 100, stride = (1, 1), padding = 0): - super(CNN_MULTI, self).__init__() - # resolution 1 - self.pool1_1 = nn.MaxPool2d(pool_size, stride = pool_size) - maxpool_size = (window_size + 2*padding - (pool_size[1] - 1) - 1)/pool_size[1] + 1 - self.pool1_2 = nn.MaxPool2d(pool_size, stride = pool_size) - maxpool1_2_size = (maxpool_size + 2*padding - (pool_size[1] - 1) - 1)/pool_size[1] + 1 - - self.layer1 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size = kernel_size, stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU()) - cnn1_size = (maxpool1_2_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 - #resolution 2 - self.pool2_1 = nn.MaxPool2d(pool_size, stride = pool_size) - maxpool2_1_size = (window_size + 2*padding - (pool_size[1] - 1) - 1)/pool_size[1] + 1 - - self.layer2 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size = kernel_size, stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU()) - cnn2_size = (maxpool2_1_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 - self.layer3 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size = kernel_size, stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU(), - nn.MaxPool2d(pool_size)) - cnn3_size = (window_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 - maxpool3_size = (cnn3_size + 2*padding - (pool_size[1] - 1) - 1)/pool_size[1] + 1 - self.layer4 = nn.Sequential( - nn.Conv2d(nb_filter, nb_filter, kernel_size = kernel_size, stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU()) - cnn4_size = (maxpool3_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 - merge_size = cnn1_size + cnn2_size + cnn4_size - self.drop1 = nn.Dropout(p=0.5) - self.fc1 = nn.Linear(labcounts*nb_filter*merge_size, hidden_size) - self.drop2 = nn.Dropout(p=0.5) - self.relu1 = nn.ReLU() - self.fc2 = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - x = x.view(x.size(0), 1, x.size(1), x.size(2)) - out = self.pool1_1(x) - out = self.pool1_2(out) - out1 = self.layer1(out) - out = self.pool2_1(x) - out2 = self.layer2(out) - out = self.layer3(x) - out3 = self.layer4(out) - out = torch.cat((out1.view(out1.size(0), -1), out2.view(out2.size(0), -1), out3.view(out3.size(0), -1)), 1) - out = self.drop1(out) - out = self.fc1(out) - out = self.drop2(out) - out = self.relu1(out) - out = self.fc2(out) - out = torch.sigmoid(out) - return out - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -# 1x3 Convolution -def convR(in_channels, out_channels, kernel_size, stride=1, padding = (0, 1)): - return nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, - padding=padding, stride=stride, bias=False) - - -# Residual Block -class ResidualBlock(nn.Module): - def __init__(self, in_channel, nb_filter = 16, kernel_size = (1, 3), stride=1, downsample=None): - super(ResidualBlock, self).__init__() - self.conv1 = convR(in_channel, nb_filter, kernel_size = kernel_size, stride = stride) - self.bn1 = nn.BatchNorm2d(nb_filter) - self.relu = nn.ReLU(inplace=True) - self.conv2 = convR(nb_filter, nb_filter, kernel_size = kernel_size, stride = stride) - self.bn2 = nn.BatchNorm2d(nb_filter) - 
self.downsample = downsample - - def forward(self, x): - residual = x - out = self.conv1(x) - out = self.bn1(out) - out = self.relu(out) - out = self.conv2(out) - out = self.bn2(out) - if self.downsample: - residual = self.downsample(x) - out += residual - out = self.relu(out) - return out - - -# ResNet Module -class ResNet(nn.Module): - def __init__(self, block, layers, nb_filter = 16, labcounts = 12, window_size = 36, kernel_size = (1, 3), pool_size = (1, 3), num_classes=2, hidden_size = 100): - super(ResNet, self).__init__() - self.in_channels = 1 - self.conv = convR(self.in_channels, nb_filter, kernel_size = kernel_size) - self.bn = nn.BatchNorm2d(nb_filter) - self.relu = nn.ReLU(inplace=True) - self.layer1 = self.make_layer(block, nb_filter, layers[0], kernel_size = kernel_size) - self.layer2 = self.make_layer(block, nb_filter*2, layers[1], 1, kernel_size = kernel_size, in_channels = nb_filter) - self.layer3 = self.make_layer(block, nb_filter*4, layers[2], 1, kernel_size = kernel_size, in_channels = 2*nb_filter) - self.avg_pool = nn.AvgPool2d(pool_size) - avgpool2_1_size = (window_size - (pool_size[1] - 1) - 1)/pool_size[1] + 1 - last_layer_size = nb_filter*4*labcounts*avgpool2_1_size - self.fc = nn.Linear(last_layer_size, hidden_size) - self.drop2 = nn.Dropout(p=0.5) - self.relu1 = nn.ReLU() - self.fc2 = nn.Linear(hidden_size, num_classes) - - def make_layer(self, block, out_channels, blocks, stride=1, kernel_size = (1, 3), in_channels = 16): - downsample = None - if (stride != 1) or (self.in_channels != out_channels): - downsample = nn.Sequential( - convR(in_channels, out_channels, kernel_size = kernel_size, stride=stride), - nn.BatchNorm2d(out_channels)) - layers = [] - layers.append(block(in_channels, out_channels, kernel_size = kernel_size, stride = stride, downsample = downsample)) - self.in_channels = out_channels - for i in range(1, blocks): - layers.append(block(out_channels, out_channels, kernel_size = kernel_size)) - return nn.Sequential(*layers) - - def forward(self, x): - x = x.view(x.size(0), 1, x.size(1), x.size(2)) - out = self.conv(x) - out = self.bn(out) - out = self.relu(out) - out = self.layer1(out) - out = self.layer2(out) - out = self.layer3(out) - out = self.avg_pool(out) - out = out.view(out.size(0), -1) - out = self.fc(out) - out = self.drop2(out) - out = self.relu1(out) - out = self.fc2(out) - out = torch.sigmoid(out) - return out - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -class GRU(nn.Module): - """ - It is a deep network with one GRU layer, which are further fed into one fully connected layers. 
- """ - def __init__(self, input_size, hidden_size, num_layers, num_classes = 2, dropout = 0.5): - super(GRU, self).__init__() - - self.hidden_size = hidden_size - self.num_layers = num_layers - self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first = True, dropout = dropout) - self.linear = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - if torch.cuda.is_available(): - x = x.cuda() - h0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)).cuda() # 2 for bidirection - else: - h0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)) # 2 for bidirection - self.gru.flatten_parameters() - out, hn = self.gru(x, h0) - - rearranged = hn[-1] - out = self.linear(rearranged) - out = torch.sigmoid(out) - return out - - def initHidden(self, N): - return Variable(torch.randn(1, N, self.hidden_size)) - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -class RNN(nn.Module): - """ - It is a deep network with one LSTM layer, which are further fed into one fully connected layer. - """ - def __init__(self, input_size, hidden_size, num_layers, num_classes = 2, dropout = 0.5): - super(RNN, self).__init__() - self.hidden_size = hidden_size - self.num_layers = num_layers - self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first = True, dropout = dropout) - self.fc = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - if torch.cuda.is_available(): - x = x.cuda() - h0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)).cuda() - c0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)).cuda() - else: - h0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)) - c0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)) - self.lstm.flatten_parameters() - out, hn = self.lstm(x, (h0, c0)) - rearranged = hn[0][-1] - # Decode hidden state of last time step - out = self.fc(rearranged) - out = torch.sigmoid(out) - return out - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -class BiRNN(nn.Module): - """ - It is a deep network with one bidirectional LSTM layer, which are further fed into one fully connected layer. 
- """ - def __init__(self, input_size, hidden_size, num_layers, num_classes = 2, dropout = 0.5): - super(BiRNN, self).__init__() - self.hidden_size = hidden_size - self.num_layers = num_layers - self.lstm = nn.LSTM(input_size, hidden_size, num_layers, - batch_first = True, dropout = dropout, bidirectional=True) - self.fc = nn.Linear(hidden_size*2, num_classes) # 2 for bidirection - - def forward(self, x): - if torch.cuda.is_available(): - x = x.cuda() - h0 = Variable(torch.zeros(self.num_layers*2, x.size(0), self.hidden_size)).cuda() # 2 for bidirection - c0 = Variable(torch.zeros(self.num_layers*2, x.size(0), self.hidden_size)).cuda() - else: - h0 = Variable(torch.zeros(self.num_layers*2, x.size(0), self.hidden_size)) # 2 for bidirection - c0 = Variable(torch.zeros(self.num_layers*2, x.size(0), self.hidden_size)) - self.lstm.flatten_parameters() - out, hn = self.lstm(x, (h0, c0)) - hn = hn[0] - - rearranged = hn[-2:].view(x.size(0), -1) - # Decode hidden state of last time step - out = self.fc(rearranged) - out = torch.sigmoid(out) - return out - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -# select model -def train_deeptorch(population, plpData, train = True, model_type = 'LogisticRegression', class_weight =0, autoencoder = True, w_decay =0.9, epochs = 1, vae = False, size = 100, loss = 'LogSoftmax', nbfilters = 4, learning_rate = 0.0001, hidden_size = 100, modelOutput = 'C:/deeptorch', seed = 1, quiet = False): - if model_type in ['LogisticRegression', 'MLP', 'SNN']: - y = population[:, 1] - X = plpData[population[:, 0], :] - trainInds = population[:, population.shape[1] - 1] > 0 - if class_weight == -1: - loss = FocalLoss(gamma = 5) - else: - if class_weight == 0: - weights = float(np.count_nonzero(y))/y.shape[0] - class_weight = [1 - weights, weights] - else: - class_weight = [class_weight, 1] - class_weight = 1/torch.Tensor(class_weight) - if torch.cuda.is_available(): - class_weight = class_weight.cuda() - loss=nn.CrossEntropyLoss(weight = class_weight) - - print("Dataset has %s rows and %s columns" % (X.shape[0], X.shape[1])) - print("population loaded- %s rows and %s columns" % (np.shape(population)[0], np.shape(population)[1])) - ########################################################################### - l1regularization = False - if train: - pred_size = int(np.sum(population[:, population.shape[1] - 1] > 0)) - print("Calculating prediction for train set of size %s" % (pred_size)) - test_pred = np.zeros(pred_size) # zeros length sum(population[:,population.size[1]] ==i) - for i in range(1, int(np.max(population[:, population.shape[1] - 1]) + 1), 1): - testInd = population[population[:, population.shape[1] - 1] > 0, population.shape[1] - 1] == i - trainInd = (population[population[:, population.shape[1] - 1] > 0, population.shape[1] - 1] != i) - train_x = X[trainInds, :][trainInd, :] - train_y = y[trainInds][trainInd] - test_x = X[trainInds, :][testInd, :] - print("Fold %s split %s in train set and %s in test set" % (i, train_x.shape[0], test_x.shape[0])) - print("Train set contains %s outcomes " % (np.sum(train_y))) - train_x = train_x.toarray() - test_x = test_x.toarray() - if autoencoder: - print('first train stakced autoencoder') - encoding_size = 256 - if vae: - auto_model = VAE(input_size=train_x.shape[1], encoding_size=encoding_size) - else: - auto_model = 
AutoEncoder(input_size=train_x.shape[1], encoding_size=encoding_size) - if torch.cuda.is_available(): - auto_model = auto_model.cuda() - clf = Estimator(auto_model) - clf.compile(optimizer=torch.optim.Adam(auto_model.parameters(), lr=1e-3, weight_decay = w_decay), - loss=nn.MSELoss()) - clf.fit(train_x, train_y, batch_size=32, nb_epoch=epochs, autoencoder = autoencoder, vae = vae) - #split to batch for large dataset - train_batch = batch(train_x, batch_size=32) - train_x = np.array([]).reshape(0, encoding_size) - for train in train_batch: - encode_train = auto_model.get_encode_features(train) - train_x = np.concatenate((train_x, encode_train), axis=0) - test_batch = batch(test_x, batch_size=32) - test_x = np.array([]).reshape(0, encoding_size) - for test in test_batch: - encode_Test = auto_model.get_encode_features(test) - test_x = np.concatenate((test_x, encode_Test), axis=0) - del auto_model - del clf - # train on fold - print("Training fold %s" % (i)) - start_time = timeit.default_timer() - if model_type == 'LogisticRegression': - model = LogisticRegression(train_x.shape[1]) - l1regularization = True - elif model_type == 'SNN': - model = SNN(train_x.shape[1], size) - else: - model = MLP(train_x.shape[1], size) - - if torch.cuda.is_available(): - model = model.cuda() - clf = Estimator(model) - clf.compile(optimizer=torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay = w_decay), - loss=loss) - clf.fit(train_x, train_y, batch_size=32, nb_epoch=epochs, l1regularization = l1regularization) - - ind = (population[:, population.shape[1] - 1] > 0) - ind = population[ind, population.shape[1] - 1] == i - test_input_var = torch.from_numpy(test_x.astype(np.float32)) - test_batch = batch(test_x, batch_size = 32) - temp = [] - for test in test_batch: - pred_test1 = model.predict_proba(test)[:, 1] - temp = np.concatenate((temp, pred_test1), axis = 0) - test_pred[ind] = temp - print("Prediction complete: %s rows " % (np.shape(test_pred[ind])[0])) - print("Mean: %s prediction value" % (np.mean(test_pred[ind]))) - - # RETURN CV PREDICTION WHEN TRAIN == T - test_pred.shape = (population[population[:, population.shape[1] - 1] > 0, :].shape[0], 1) - prediction = np.append(population[population[:, population.shape[1] - 1] > 0, :], test_pred, axis=1) - return prediction; - - # train final: - else: - print("Training final neural network model on all train data...") - print("X- %s rows and Y %s length" % (X[trainInds, :].shape[0], y[trainInds].shape[0])) - start_time = timeit.default_timer() - train_x = X[trainInds, :] - train_x = train_x.toarray() - train_y = y[trainInds] - if not os.path.exists(modelOutput): - os.makedirs(modelOutput) - if autoencoder: - encoding_size = 256 - if vae: - auto_model = VAE(input_size=train_x.shape[1], encoding_size=encoding_size) - else: - auto_model = AutoEncoder(input_size=train_x.shape[1], encoding_size=encoding_size) - if torch.cuda.is_available(): - auto_model = auto_model.cuda() - clf = Estimator(auto_model) - clf.compile(optimizer=torch.optim.Adam(auto_model.parameters(), lr=1e-3, weight_decay=w_decay), - loss=nn.MSELoss()) - clf.fit(train_x, train_y, batch_size=32, nb_epoch=epochs, autoencoder=autoencoder, vae = vae) - train_batch = batch(train_x, batch_size=32) - train_x = np.array([]).reshape(0, encoding_size) - for train in train_batch: - encode_train = auto_model.get_encode_features(train) - train_x = np.concatenate((train_x, encode_train), axis=0) - joblib.dump(auto_model, os.path.join(modelOutput, 'autoencoder_model.pkl')) - del auto_model - del clf - 
print('the final parameter epochs %.2f weight_decay %.2f' %(epochs,w_decay)) - if model_type == 'LogisticRegression': - model = LogisticRegression(train_x.shape[1]) - l1regularization = True - elif model_type == 'SNN': - model = SNN(train_x.shape[1], size) - else: - model = MLP(train_x.shape[1], size) - if torch.cuda.is_available(): - model = model.cuda() - clf = Estimator(model) - clf.compile(optimizer=torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay = w_decay), - loss=loss) - clf.fit(train_x, train_y, batch_size=32, nb_epoch=epochs, l1regularization = l1regularization) - end_time = timeit.default_timer() - print("Training final took: %.2f s" % (end_time - start_time)) - print("Model saved to: %s" % (modelOutput)) - joblib.dump(model, os.path.join(modelOutput,'model.pkl')) - # DO PREDICTION ON TRAIN: - train_batch = batch(train_x, batch_size = 32) - train_pred = [] - for train in train_batch: - preds = model.predict_proba(train)[:, 1] - train_pred = np.concatenate((train_pred, preds), axis = 0) - train_pred.shape = (population[population[:, population.shape[1] - 1] > 0, :].shape[0], 1) - prediction = np.append(population[population[:, population.shape[1] - 1] > 0, :], train_pred, axis=1) - # RETURN TRAIN PREDICTION WHEN TRAIN == F - return prediction; - - elif model_type in ['CNN', 'RNN', 'CNN_LSTM', 'CNN_MLF', 'CNN_MIX', 'GRU', 'BiRNN', 'CNN_MULTI', 'ResNet']: - y = population[:, 1] - X = plpData.to_dense().numpy() - X = X[np.int64(population[:, 0]), :] - trainInds = population[:, population.shape[1] - 1] > 0 - - if class_weight == -1: - loss = FocalLoss(gamma = 3) - else: - if class_weight == 0: - weights = float(np.count_nonzero(y))/y.shape[0] - class_weight = [1 - weights, weights] - else: - class_weight = [class_weight, 1] - class_weight = 1/torch.Tensor(class_weight) - if torch.cuda.is_available(): - class_weight = class_weight.cuda() - loss=nn.CrossEntropyLoss(weight = class_weight) - - if train: - test_pred = np.zeros(population[population[:, population.shape[1] - 1] > 0, :].shape[0]) # zeros length sum(population[:,population.size[1]] ==i) - for i in range(1, int(np.max(population[:, population.shape[1] - 1]) + 1), 1): - testInd = population[population[:, population.shape[1] - 1] > 0, population.shape[1] - 1] == i - trainInd = (population[population[:, population.shape[1] - 1] > 0, population.shape[1] - 1] != i) - train_x = X[trainInds, :][trainInd, :] - train_y = y[trainInds][trainInd] - test_x = X[trainInds, :][testInd, :] - print("Fold %s split %s in train set and %s in test set" % (i, train_x.shape[0], test_x.shape[0])) - print("Train set contains %s outcomes " % (np.sum(train_y))) - # train on fold - learning_rate = 0.001 - print("Training fold %s" % (i)) - start_time = timeit.default_timer() - if model_type == 'CNN': - model = CNN(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) - elif model_type == 'CNN_LSTM': - model = CNN_LSTM(nb_filter = nbfilters, labcounts=train_x.shape[1], window_size=train_x.shape[2]) - elif model_type == 'CNN_MLF': # multiple kernels with different size - model = CNN_MLF(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) - elif model_type == 'CNN_MIX': # mixed model from deepDiagnosis - model = CNN_MIX(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) - elif model_type == 'CNN_MULTI': # multiple resolution model from deepDiagnosis - model = CNN_MULTI(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) - 
elif model_type == 'ResNet': - print('train ResNet') - model = ResNet(ResidualBlock, [3, 3, 3], nb_filter=nbfilters, labcounts=train_x.shape[1], window_size=train_x.shape[2]) - elif model_type == 'RNN': - model = RNN(train_x.shape[2], hidden_size, 2, 2) - elif model_type == 'BiRNN': - model = BiRNN(train_x.shape[2], hidden_size, 2, 2) - elif model_type == 'GRU': - model = GRU(train_x.shape[2], hidden_size, 2, 2) - else: - print('temproal data not supported by this model') - - if torch.cuda.is_available(): - model = model.cuda() - clf = Estimator(model) - clf.compile(optimizer=torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay = 0.0001), - loss=loss) - clf.fit(train_x, train_y, batch_size=32, nb_epoch=epochs) - ind = (population[:, population.shape[1] - 1] > 0) - ind = population[ind, population.shape[1] - 1] == i - test_batch = batch(test_x, batch_size = 32) - temp = [] - for test in test_batch: - pred_test1 = model.predict_proba(test)[:, 1] - temp = np.concatenate((temp, pred_test1), axis = 0) - test_pred[ind] = temp - del model - print("Prediction complete: %s rows " % (np.shape(test_pred[ind])[0])) - print("Mean: %s prediction value" % (np.mean(test_pred[ind]))) - # RETURN CV PREDICTION - test_pred.shape = (population[population[:, population.shape[1] - 1] > 0, :].shape[0], 1) - prediction = np.append(population[population[:, population.shape[1] - 1] > 0, :], test_pred, axis=1) - return prediction; - - # train final: - else: - print("Training final neural network model on all train data...") - print("X- %s rows and Y %s length" % (X[trainInds, :].shape[0], y[trainInds].shape[0])) - start_time = timeit.default_timer() - train_x = X[trainInds, :] - train_y = y[trainInds] - learning_rate = 0.001 - if model_type == 'CNN': - model = CNN(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) - elif model_type == 'CNN_LSTM': - model = CNN_LSTM(nb_filter=nbfilters, labcounts=train_x.shape[1], window_size=train_x.shape[2]) - elif model_type == 'CNN_MLF': # multiple kernels with different size - model = CNN_MLF(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) - elif model_type == 'CNN_MIX': #mixed model from deepDiagnosis - model = CNN_MIX(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) - elif model_type == 'CNN_MULTI': # multi resolution model from deepDiagnosis - model = CNN_MULTI(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) - elif model_type == 'ResNet': - model = ResNet(ResidualBlock, [3, 3, 3], nb_filter=nbfilters, labcounts=train_x.shape[1], window_size=train_x.shape[2]) - elif model_type == 'RNN': - model = RNN(train_x.shape[2], hidden_size, 2, 2) - elif model_type == 'BiRNN': - model = BiRNN(train_x.shape[2], hidden_size, 2, 2) - elif model_type == 'GRU': - model = GRU(train_x.shape[2], hidden_size, 2, 2) - else: - print('temproal data not supported by this model') - - if torch.cuda.is_available(): - model = model.cuda() - clf = Estimator(model) - clf.compile(optimizer=torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay = 0.0001), - loss=loss) - clf.fit(train_x, train_y, batch_size=32, nb_epoch=epochs) - end_time = timeit.default_timer() - print("Training final took: %.2f s" % (end_time - start_time)) - # save the model: - if not os.path.exists(modelOutput): - os.makedirs(modelOutput) - print("Model saved to: %s" % (modelOutput)) - - joblib.dump(model, os.path.join(modelOutput,'model.pkl')) - # prediction on train: 
- test_batch = batch(train_x, batch_size = 32) - test_pred = [] - for test in test_batch: - pred_test1 = model.predict_proba(test)[:, 1] - test_pred = np.concatenate((test_pred, pred_test1), axis = 0) - test_pred.shape = (population[population[:, population.shape[1] - 1] > 0, :].shape[0], 1) - prediction = np.append(population[population[:, population.shape[1] - 1] > 0, :], test_pred, axis=1) - return prediction; diff --git a/inst/python/predictFunctions.py b/inst/python/predictFunctions.py deleted file mode 100644 index fcbb2e5..0000000 --- a/inst/python/predictFunctions.py +++ /dev/null @@ -1,153 +0,0 @@ -# apply random forest model on new data -#=============================================================== -# INPUT: -# 1) location of new data -# 2) location of model -# -# OUTPUT: -# it returns a file with indexes merged with prediction for test index - named new_pred -#================================================================ -import numpy as np -from collections import OrderedDict -import os -import sys -import timeit -import math -from scipy.sparse import coo_matrix,csr_matrix,vstack,hstack -from joblib import Memory -import joblib - -def batch(tensor, batch_size = 50): - """ It is used to create batch samples, each batch has batch_size samples""" - tensor_list = [] - length = tensor.shape[0] - i = 0 - while True: - if (i+1) * batch_size >= length: - tensor_list.append(tensor[i * batch_size: length]) - return tensor_list - tensor_list.append(tensor[i * batch_size: (i+1) * batch_size]) - i += 1 -#================================================================ -########################################################################### -def python_predict_temporal(population, plpData, model_loc, dense, autoencoder): - print("Applying Python Model") - print("Loading Data...") - # load data + train,test indexes + validation index - #y=population[:,1] - - ########################################################################### - # uf dense convert - if dense==1: - print("converting to dense data...") - X = plpData.to_dense().numpy() - if dense==0: - print("keeping data sparse...") - X = plpData.numpy() - ########################################################################### - - # order the data - X = X[np.int64(population[:, 0]), :] - # load index file - print("population loaded- %s rows and %s columns" %(np.shape(population)[0], np.shape(population)[1])) - print("Dataset has %s rows and %s columns" %(X.shape[0], X.shape[1])) - print("Data ready for model has %s features" %(np.shape(X)[1])) - - # load model - print("Loading model...") - if autoencoder: - autoencoder_model = joblib.load(os.path.join(model_loc, 'autoencoder_model.pkl')) - X = autoencoder_model.get_encode_features(X) - modelTrained = joblib.load(os.path.join(model_loc,"model.pkl")) - print("Calculating predictions on population...") - test_batch = batch(X, batch_size = 32) - test_pred = [] - for test in test_batch: - pred_test1 = modelTrained.predict_proba(test)[:, 1] - test_pred = np.concatenate((test_pred , pred_test1), axis = 0) - print("Prediction complete: %s rows" %(np.shape(test_pred)[0])) - print("Mean: %s prediction value" %(np.mean(test_pred))) - # merge pred with population - test_pred.shape = (population.shape[0], 1) - prediction = np.append(population,test_pred, axis=1) - return prediction - -def python_predict(population, plpData, model_loc, dense, autoencoder): - print("Applying Python Model") - print("Loading Data...") - # load data + train,test indexes + validation index - #y=population[:,1] 
- X = plpData[population[:,0].astype(int),:] - # load index file - print("population loaded- %s rows and %s columns" %(np.shape(population)[0], np.shape(population)[1])) - print("Dataset has %s rows and %s columns" %(X.shape[0], X.shape[1])) - print("Data ready for model has %s features" %(np.shape(X)[1])) - ########################################################################### - # uf dense convert - if dense==1: - print("converting to dense data...") - X=X.toarray() - ########################################################################### - # load model - print("Loading model...") - if autoencoder: - autoencoder_model = joblib.load(os.path.join(model_loc, 'autoencoder_model.pkl')) - X = autoencoder_model.get_encode_features(X) - modelTrained = joblib.load(os.path.join(model_loc,"model.pkl")) - print("Calculating predictions on population...") - test_pred = modelTrained.predict_proba(X)[:, 1] - print("Prediction complete: %s rows" %(np.shape(test_pred)[0])) - print("Mean: %s prediction value" %(np.mean(test_pred))) - # merge pred with population - test_pred.shape = (population.shape[0], 1) - prediction = np.append(population,test_pred, axis=1) - return prediction - - -def python_predict_survival(population, plpData, model_loc): - print("Applying Python Model") - print("Loading Data...") - # load data + train,test indexes + validation index - X = plpData[population[:,0].astype(int),:] - # load index file - print("population loaded- %s rows and %s columns" %(np.shape(population)[0], np.shape(population)[1])) - print("Dataset has %s rows and %s columns" %(X.shape[0], X.shape[1])) - print("Data ready for model has %s features" %(np.shape(X)[1])) - ########################################################################### - # load model - print("Loading model...") - modelTrained = joblib.load(os.path.join(model_loc,"model.pkl")) - print("Calculating predictions on population...") - test_pred = modelTrained.predict(X.toarray()) - test_pred = test_pred.flatten() - rowCount = population.shape[0] - test_pred = test_pred[0:(rowCount)] - print("Prediction complete: %s rows" %(np.shape(test_pred)[0])) - print("Mean: %s prediction value" %(np.mean(test_pred))) - # merge pred with population - test_pred.shape = (population.shape[0], 1) - prediction = np.append(population,test_pred, axis=1) - return prediction - -def python_predict_garden(population, plpData, model_loc,quantile=None): - print("Applying Python Model") - print("Loading Data...") - # load data + train,test indexes + validation index - #y=population[:,1] - X = plpData[population[:,0].astype(int),:] - # load index file - print("population loaded- %s rows and %s columns" %(np.shape(population)[0], np.shape(population)[1])) - print("Dataset has %s rows and %s columns" %(X.shape[0], X.shape[1])) - print("Data ready for model has %s features" %(np.shape(X)[1])) - ########################################################################### - # load model - print("Loading model...") - modelTrained = joblib.load(os.path.join(model_loc,"model.pkl")) - print("Calculating predictions on population...") - test_pred = modelTrained.predict(X,quantile=quantile)[:, 0] - print("Prediction complete: %s rows" %(np.shape(test_pred)[0])) - print("Mean: %s prediction value" %(np.mean(test_pred))) - # merge pred with population - test_pred.shape = (population.shape[0], 1) - prediction = np.append(population,test_pred, axis=1) - return prediction diff --git a/inst/python/python_predict.py b/inst/python/python_predict.py deleted file mode 100644 index 
3a2f132..0000000 --- a/inst/python/python_predict.py +++ /dev/null @@ -1,111 +0,0 @@ -# apply random forest model on new data -#=============================================================== -# INPUT: -# 1) location of new data -# 2) location of model -# -# OUTPUT: -# it returns a file with indexes merged with prediction for test index - named new_pred -#================================================================ -import numpy as np -from collections import OrderedDict -import os -import sys -import timeit -import math -#from sklearn.ensemble import RandomForestClassifier -#from sklearn.naive_bayes import GaussianNB -from scipy.sparse import coo_matrix,csr_matrix,vstack,hstack -#from sklearn.feature_selection import SelectFromModel -#from sklearn.cross_validation import PredefinedSplit -from sklearn.externals.joblib import Memory -#from sklearn.datasets import load_svmlight_file -from sklearn.externals import joblib -if "python_dir" in globals(): - sys.path.insert(0, python_dir) - import TorchUtils as tu -#================================================================ - - -print("Applying Python Model") - -########################################################################### - -def get_temproal_data(covariates, population): - p_ids_in_cov = set(covariates[:, 0]) - timeid_len = len(set(covariates[:, -2])) - full_covariates = np.array([]).reshape(0,4) - default_covid = covariates[0, 1] - for p_id in population[:, 0]: - if p_id not in p_ids_in_cov: - tmp_x = np.array([p_id, default_covid, 1, 0]).reshape(1,4) #default cov id, timeid=1 - full_covariates = np.concatenate((full_covariates, tmp_x), axis=0) - else: - tmp_x = covariates[covariates[:, 0] == p_id, :] - #print tmp_x.shape, X.shape - full_covariates = np.concatenate((full_covariates, tmp_x), axis=0) - - X, patient_keys = tu.convert_to_temporal_format(full_covariates, timeid_len = timeid_len, predict = True) - return X - - -print("Loading Data...") -# load data + train,test indexes + validation index - -y=population[:,1] -#print covariates.shape - -if modeltype == 'temporal': - X = plpData.to_dense().numpy() - X = X[np.int64(population[:, 0]), :] - #X = get_temproal_data(covariates, population) - dense = 0 -else: - #print included - X = plpData[population[:,0],:] - X = X[:,included.flatten()] - -# load index file -print("population loaded- %s rows and %s columns" %(np.shape(population)[0], np.shape(population)[1])) -print("Dataset has %s rows and %s columns" %(X.shape[0], X.shape[1])) -print("Data ready for model has %s features" %(np.shape(X)[1])) - -########################################################################### -# uf dense convert -if dense==1: - print("converting to dense data...") - X=X.toarray() -########################################################################### - -# load model -print("Loading model...") - -modelTrained = joblib.load(os.path.join(model_loc,"model.pkl")) - -print(X.shape) -print("Calculating predictions on population...") - -if autoencoder: - autoencoder_model = joblib.load(os.path.join(model_loc, 'autoencoder_model.pkl')) - X = autoencoder_model.get_encode_features(X) - -if modeltype == 'temporal': - test_batch = tu.batch(X, batch_size = 32) - test_pred = [] - for test in test_batch: - pred_test1 = modelTrained.predict_proba(test)[:, 1] - test_pred = np.concatenate((test_pred , pred_test1), axis = 0) -else: - test_pred = modelTrained.predict_proba(X)[:, 1] - - -if test_pred.ndim != 1: - test_pred = test_pred[:,1] - - -print("Prediction complete: %s rows" %(np.shape(test_pred)[0])) 
-print("Mean: %s prediction value" %(np.mean(test_pred))) - -# merge pred with population -test_pred.shape = (population.shape[0], 1) -prediction = np.append(population,test_pred, axis=1) diff --git a/man/setCIReNN.Rd b/man/setCIReNN.Rd deleted file mode 100644 index 4fa2654..0000000 --- a/man/setCIReNN.Rd +++ /dev/null @@ -1,93 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/CIReNN.R -\name{setCIReNN} -\alias{setCIReNN} -\title{Create setting for CIReNN model} -\usage{ -setCIReNN( - numberOfRNNLayer = c(1), - units = c(128, 64), - recurrentDropout = c(0.2), - layerDropout = c(0.2), - lr = c(1e-04), - decay = c(1e-05), - outcomeWeight = c(0), - batchSize = c(100), - epochs = c(100), - earlyStoppingMinDelta = c(1e-04), - earlyStoppingPatience = c(10), - bayes = T, - useDeepEnsemble = F, - numberOfEnsembleNetwork = 5, - useVae = T, - vaeDataSamplingProportion = 0.1, - vaeValidationSplit = 0.2, - vaeBatchSize = 100L, - vaeLatentDim = 10L, - vaeIntermediateDim = 256L, - vaeEpoch = 100L, - vaeEpislonStd = 1, - useGPU = FALSE, - maxGPUs = 2, - seed = 1234 -) -} -\arguments{ -\item{numberOfRNNLayer}{The number of RNN layer, only 1, 2, or 3 layers available now. eg. 1, c(1,2), c(1,2,3)} - -\item{units}{The number of units of RNN layer - as a list of vectors} - -\item{recurrentDropout}{The reccurrent dropout rate (regularisation)} - -\item{layerDropout}{The layer dropout rate (regularisation)} - -\item{lr}{Learning rate} - -\item{decay}{Learning rate decay over each update.} - -\item{outcomeWeight}{The weight of the outcome class in the loss function. Default is 0, which will be replaced by balanced weight.} - -\item{batchSize}{The number of data points to use per training batch} - -\item{epochs}{Number of times to iterate over dataset} - -\item{earlyStoppingMinDelta}{minimum change in the monitored quantity to qualify as an improvement for early stopping, i.e. an absolute change of less than min_delta in loss of validation data, will count as no improvement.} - -\item{earlyStoppingPatience}{Number of epochs with no improvement after which training will be stopped.} - -\item{bayes}{logical (either TRUE or FALSE) value for using Bayesian Drop Out Layer to measure uncertainty. If it is TRUE, both Epistemic and Aleatoric uncertainty will be measured through Bayesian Drop Out layer} - -\item{useDeepEnsemble}{logical (either TRUE or FALSE) value for using Deep Ensemble (Lakshminarayanan et al., 2017) to measure uncertainty. It cannot be used together with Bayesian deep learing.} - -\item{numberOfEnsembleNetwork}{Integer. Number of network used for Deep Ensemble (Lakshminarayanan et al recommended 5).} - -\item{useVae}{logical (either TRUE or FALSE) value for using Variational AutoEncoder before RNN} - -\item{vaeDataSamplingProportion}{Data sampling proportion for VAE} - -\item{vaeValidationSplit}{Validation split proportion for VAE} - -\item{vaeBatchSize}{batch size for VAE} - -\item{vaeLatentDim}{Number of latent dimesion for VAE} - -\item{vaeIntermediateDim}{Number of intermediate dimesion for VAE} - -\item{vaeEpoch}{Number of times to interate over dataset for VAE} - -\item{vaeEpislonStd}{Epsilon} - -\item{useGPU}{logical (either TRUE or FALSE) value. If you have GPUs in your machine, and want to use multiple GPU for deep learning, set this value as TRUE} - -\item{maxGPUs}{Integer, If you will use GPU, how many GPUs will be used for deep learning in VAE? GPU parallelisation for deep learning will be activated only when parallel vae is true. 
Integer >= 2 or list of integers, number of GPUs or list of GPU IDs on which to create model replicas.} - -\item{seed}{Random seed used by deep learning model} -} -\description{ -Create setting for CIReNN model -} -\examples{ -\dontrun{ -model.CIReNN <- setCIReNN() -} -} diff --git a/man/setCNNTorch.Rd b/man/setCNNTorch.Rd deleted file mode 100644 index 9f7309b..0000000 --- a/man/setCNNTorch.Rd +++ /dev/null @@ -1,36 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/CNNTorch.R -\name{setCNNTorch} -\alias{setCNNTorch} -\title{Create setting for CNN model with python} -\usage{ -setCNNTorch( - nbfilters = c(16, 32), - epochs = c(20, 50), - seed = 0, - class_weight = 0, - type = "CNN" -) -} -\arguments{ -\item{nbfilters}{The number of filters} - -\item{epochs}{The number of epochs} - -\item{seed}{A seed for the model} - -\item{class_weight}{The class weight used for imbalanced data: - 0: Inverse ratio between positives and negatives --1: Focal loss} - -\item{type}{It can be normal 'CNN', 'CNN_LSTM', CNN_MLF' with multiple kernels with different kernel size, -'CNN_MIX', 'ResNet' and 'CNN_MULTI'} -} -\description{ -Create setting for CNN model with python -} -\examples{ -\dontrun{ -model.cnnTorch <- setCNNTorch() -} -} diff --git a/man/setCovNN.Rd b/man/setCovNN.Rd deleted file mode 100644 index 4ebb5ec..0000000 --- a/man/setCovNN.Rd +++ /dev/null @@ -1,48 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/CovNN.R -\name{setCovNN} -\alias{setCovNN} -\title{Create setting for multi-resolution CovNN model (stucture based on https://arxiv.org/pdf/1608.00647.pdf CNN1)} -\usage{ -setCovNN( - batchSize = 1000, - outcomeWeight = 1, - lr = 1e-05, - decay = 1e-06, - dropout = 0, - epochs = 10, - filters = 3, - kernelSize = 10, - loss = "binary_crossentropy", - seed = NULL -) -} -\arguments{ -\item{batchSize}{The number of samples to used in each batch during model training} - -\item{outcomeWeight}{The weight assined to the outcome (make greater than 1 to reduce unballanced label issue)} - -\item{lr}{The learning rate} - -\item{decay}{The decay of the learning rate} - -\item{dropout}{[currently not used] the dropout rate for regularisation} - -\item{epochs}{The number of times data is used to train the model (e.g., epoches=1 means data only used once to train)} - -\item{filters}{The number of columns output by each convolution} - -\item{kernelSize}{The number of time dimensions used for each convolution} - -\item{loss}{The loss function implemented} - -\item{seed}{The random seed} -} -\description{ -Create setting for multi-resolution CovNN model (stucture based on https://arxiv.org/pdf/1608.00647.pdf CNN1) -} -\examples{ -\dontrun{ -model.CovNN <- setCovNN() -} -} diff --git a/man/setCovNN2.Rd b/man/setCovNN2.Rd deleted file mode 100644 index 76d4376..0000000 --- a/man/setCovNN2.Rd +++ /dev/null @@ -1,48 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/CovNN2.R -\name{setCovNN2} -\alias{setCovNN2} -\title{Create setting for CovNN2 model - convolution across input and time - https://arxiv.org/pdf/1608.00647.pdf} -\usage{ -setCovNN2( - batchSize = 1000, - outcomeWeight = 1, - lr = 1e-05, - decay = 1e-06, - dropout = 0, - epochs = 10, - filters = 3, - kernelSize = 10, - loss = "binary_crossentropy", - seed = NULL -) -} -\arguments{ -\item{batchSize}{The number of samples to used in each batch during model training} - -\item{outcomeWeight}{The weight assined to the outcome (make greater than 1 to 
reduce unballanced label issue)} - -\item{lr}{The learning rate} - -\item{decay}{The decay of the learning rate} - -\item{dropout}{[currently not used] the dropout rate for regularisation} - -\item{epochs}{The number of times data is used to train the model (e.g., epoches=1 means data only used once to train)} - -\item{filters}{The number of columns output by each convolution} - -\item{kernelSize}{The number of time dimensions used for each convolution} - -\item{loss}{The loss function implemented} - -\item{seed}{The random seed} -} -\description{ -Create setting for CovNN2 model - convolution across input and time - https://arxiv.org/pdf/1608.00647.pdf -} -\examples{ -\dontrun{ -model.CovNN <- setCovNN() -} -} diff --git a/man/setDeepNN.Rd b/man/setDeepNN.Rd deleted file mode 100644 index a676bc6..0000000 --- a/man/setDeepNN.Rd +++ /dev/null @@ -1,42 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/DeepNN.R -\name{setDeepNN} -\alias{setDeepNN} -\title{Create setting for DeepNN model} -\usage{ -setDeepNN( - units = list(c(128, 64), 128), - layer_dropout = c(0.2), - lr = c(1e-04), - decay = c(1e-05), - outcome_weight = c(1), - batch_size = c(100), - epochs = c(100), - seed = NULL -) -} -\arguments{ -\item{units}{The number of units of the deep network - as a list of vectors} - -\item{layer_dropout}{The layer dropout rate (regularisation)} - -\item{lr}{Learning rate} - -\item{decay}{Learning rate decay over each update.} - -\item{outcome_weight}{The weight of the outcome class in the loss function} - -\item{batch_size}{The number of data points to use per training batch} - -\item{epochs}{Number of times to iterate over dataset} - -\item{seed}{Random seed used by deep learning model} -} -\description{ -Create setting for DeepNN model -} -\examples{ -\dontrun{ -model <- setDeepNN() -} -} diff --git a/man/setRNNTorch.Rd b/man/setRNNTorch.Rd deleted file mode 100644 index 00550db..0000000 --- a/man/setRNNTorch.Rd +++ /dev/null @@ -1,35 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/RNNTorch.R -\name{setRNNTorch} -\alias{setRNNTorch} -\title{Create setting for RNN model with python} -\usage{ -setRNNTorch( - hidden_size = c(50, 100), - epochs = c(20, 50), - seed = 0, - class_weight = 0, - type = "RNN" -) -} -\arguments{ -\item{hidden_size}{The hidden size} - -\item{epochs}{The number of epochs} - -\item{seed}{A seed for the model} - -\item{class_weight}{The class weight used for imbalanced data: - 0: Inverse ratio between positives and negatives --1: Focal loss} - -\item{type}{It can be normal 'RNN', 'BiRNN' (bidirectional RNN) and 'GRU'} -} -\description{ -Create setting for RNN model with python -} -\examples{ -\dontrun{ -model.rnnTorch <- setRNNTorch() -} -} diff --git a/man/toSparseMDeep.Rd b/man/toSparseMDeep.Rd deleted file mode 100644 index f6ef1e0..0000000 --- a/man/toSparseMDeep.Rd +++ /dev/null @@ -1,40 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/Formatting.R -\name{toSparseMDeep} -\alias{toSparseMDeep} -\title{Convert the plpData in COO format into a sparse R matrix -Converts the standard plpData to a sparse matrix -This function converts the covariate file from ffdf in COO format into a sparse matrix from -the package Matrix} -\usage{ -toSparseMDeep(plpData, population, map = NULL, temporal = F) -} -\arguments{ -\item{plpData}{An object of type \code{plpData} with covariate in coo format - the patient level prediction -data extracted from the CDM.} - 
-\item{population}{The population to include in the matrix} - -\item{map}{A covariate map (telling us the column number for covariates)} - -\item{temporal}{Whether you want to convert temporal data} -} -\value{ -Returns a list, containing the data as a sparse matrix, the plpData covariateRef -and a data.frame named map that tells us what covariate corresponds to each column -This object is a list with the following components: \describe{ -\item{data}{A sparse matrix with the rows corresponding to each person in the plpData and the columns corresponding to the covariates.} -\item{covariateRef}{The plpData covariateRef.} -\item{map}{A data.frame containing the data column ids and the corresponding covariateId from covariateRef.} -} -} -\description{ -Convert the plpData in COO format into a sparse R matrix -Converts the standard plpData to a sparse matrix -This function converts the covariate file from ffdf in COO format into a sparse matrix from -the package Matrix -} -\examples{ -#TODO - -} diff --git a/man/toSparseRTorch.Rd b/man/toSparseRTorch.Rd deleted file mode 100644 index f112ea6..0000000 --- a/man/toSparseRTorch.Rd +++ /dev/null @@ -1,37 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/sparseRTorch.R -\name{toSparseRTorch} -\alias{toSparseRTorch} -\title{Convert the plpData in COO format into a sparse Torch tensor} -\usage{ -toSparseRTorch(plpData, population, map = NULL, temporal = T) -} -\arguments{ -\item{plpData}{An object of type \code{plpData} with covariate in coo format - the patient level prediction -data extracted from the CDM.} - -\item{population}{The population to include in the matrix} - -\item{map}{A covariate map (telling us the column number for covariates)} - -\item{temporal}{Whether you want to convert temporal data} -} -\value{ -Returns a list, containing the data as a sparse matrix, the plpData covariateRef -and a data.frame named map that tells us what covariate corresponds to each column -This object is a list with the following components: \describe{ -\item{data}{A sparse matrix with the rows corresponding to each person in the plpData and the columns corresponding to the covariates.} -\item{covariateRef}{The plpData covariateRef.} -\item{map}{A data.frame containing the data column ids and the corresponding covariateId from covariateRef.} -} -} -\description{ -Converts the standard plpData to a sparse tensor for Torch -} -\details{ -This function converts the covariate file from COO format into a sparse Torch tensor -} -\examples{ -#TODO - -} diff --git a/man/transferLearning.Rd b/man/transferLearning.Rd deleted file mode 100644 index 1842179..0000000 --- a/man/transferLearning.Rd +++ /dev/null @@ -1,54 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/DeepNN.R -\name{transferLearning} -\alias{transferLearning} -\title{[Under development] Transfer learning} -\usage{ -transferLearning( - plpResult, - plpData, - population, - fixLayers = T, - includeTop = F, - addLayers = c(100, 10), - layerDropout = c(T, T), - layerActivation = c("relu", "softmax"), - outcomeWeight = 1, - batchSize = 10000, - epochs = 20 -) -} -\arguments{ -\item{plpResult}{The plp result when training a kersa deep learning model on big data} - -\item{plpData}{The new data to fine tune the model on} - -\item{population}{The population for the new data} - -\item{fixLayers}{boolean specificying whether to fix weights in model being transferred} - -\item{includeTop}{If TRUE the final layer of the model being 
transferred is removed} - -\item{addLayers}{vector specifying nodes in each layer to add e.g. c(100,10) will add another layer with 100 nodels and then a final layer with 10} - -\item{layerDropout}{Add dropout to each new layer (binary vector length of addLayers)} - -\item{layerActivation}{Activation function for each new layer (string vector length of addLayers)} - -\item{outcomeWeight}{The weight to assign the class 1 when training the model} - -\item{batchSize}{Size of each batch for updating layers} - -\item{epochs}{Number of epoches to run} -} -\description{ -[Under development] Transfer learning -} -\examples{ -\dontrun{ -modelSet <- setDeepNN() -plpResult <- runPlp(plpData, population, modelSettings = modelSet, ...) - -transferLearning(...) -} -} diff --git a/tests/testthat/test-keras.R b/tests/testthat/test-keras.R deleted file mode 100644 index 944f5bb..0000000 --- a/tests/testthat/test-keras.R +++ /dev/null @@ -1,137 +0,0 @@ -# Copyright 2020 Observational Health Data Sciences and Informatics -# -# This file is part of PatientLevelPrediction -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -library("testthat") -context("Keras") - -deepSet <- setDeepNN(epochs = 1) - -plpResultDeep <- runPlp(population = population, - plpData = plpData, - modelSettings = deepSet, - savePlpData = F, - savePlpResult = F, - saveEvaluation = F, - savePlpPlots = F, - analysisId = 'deepTest', - saveDirectory = saveLoc) - - - -#TODO: add input checks and test these... 
-#options(fftempdir = getwd()) - - -test_that("deepNN working checks", { - - # check same structure - testthat::expect_equal(names(plpResultDeep), - names(plpResult)) - - # check prediction same size as pop - testthat::expect_equal(nrow(plpResultDeep$prediction), nrow(population)) - - # check prediction between 0 and 1 - testthat::expect_gte(min(plpResultDeep$prediction$value), 0) - testthat::expect_lte(max(plpResultDeep$prediction$value), 1) - -}) - - -# add temporal data: -CNN1Set <- setCovNN(epochs = 1, kernelSize = 3, batchSize =30) -plpResultCNN1 <- runPlp(population = population2, - plpData = plpData3, - modelSettings = CNN1Set, - savePlpData = F, - savePlpResult = F, - saveEvaluation = F, - savePlpPlots = F, - analysisId = 'cnn1Test', - saveDirectory = saveLoc) - -test_that("covNN working checks", { - - # check same structure - testthat::expect_equal(names(plpResultCNN1), - names(plpResult)) - - # check prediction same size as pop - testthat::expect_equal(nrow(plpResultCNN1$prediction), nrow(population2)) - - # check prediction between 0 and 1 - testthat::expect_gte(min(plpResultCNN1$prediction$value), 0) - testthat::expect_lte(max(plpResultCNN1$prediction$value), 1) - -}) - -CNN2Set <- setCovNN2(epochs = 1, kernelSize = 4, filters=4) -plpResultCNN2 <- runPlp(population = population2, - plpData = plpData3, - modelSettings = CNN2Set, - savePlpData = F, - savePlpResult = F, - saveEvaluation = F, - savePlpPlots = F, - analysisId = 'cnn1Test', - saveDirectory = saveLoc) - -test_that("covNN2 working checks", { - - # check same structure - testthat::expect_equal(names(plpResultCNN2), - names(plpResult)) - - # check prediction same size as pop - testthat::expect_equal(nrow(plpResultCNN2$prediction), nrow(population2)) - - # check prediction between 0 and 1 - testthat::expect_gte(min(plpResultCNN2$prediction$value), 0) - testthat::expect_lte(max(plpResultCNN2$prediction$value), 1) - -}) - - -if(!travis){ -CIReNNSet <- setCIReNN(epochs = 1, useVae = F, units=c(10) ) -plpResultCIReNN <- runPlp(population = population2, plpData = plpData3, - minCovariateFraction = 0.001, normalizeData = F, - modelSettings = CIReNNSet, testSplit = 'person', - testFraction = 0.25, splitSeed = 1, - nfold = 3, savePlpData = F, savePlpResult = F, - savePlpPlots = F, - saveEvaluation = F, - analysisId = 'cireNNTest', - saveDirectory = saveLoc) -} - -test_that("CIReNN working checks", { - if(travis){ - skip("Too slow for travis") - } - # check same structure - testthat::expect_equal(names(plpResultCIReNN), - names(plpResult)) - - # check prediction same size as pop - testthat::expect_equal(nrow(plpResultCIReNN$prediction), nrow(population2)) - - # check prediction between 0 and 1 - testthat::expect_gte(min(plpResultCIReNN$prediction$value), 0) - testthat::expect_lte(max(plpResultCIReNN$prediction$value), 1) - -}) - diff --git a/tests/testthat/test-torch.R b/tests/testthat/test-torch.R deleted file mode 100644 index ab732d2..0000000 --- a/tests/testthat/test-torch.R +++ /dev/null @@ -1,123 +0,0 @@ -# Copyright 2020 Observational Health Data Sciences and Informatics -# -# This file is part of PatientLevelPrediction -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -library("testthat") -context("Torch") - -lrtSet <- setLRTorch(w_decay=0.0005, - epochs=1, - class_weight = 0, - autoencoder = FALSE, - vae =FALSE) - - -plpResultLrt <- runPlp(population = population, - plpData = plpData, - modelSettings = lrtSet, - savePlpData = F, - savePlpResult = F, - saveEvaluation = F, - savePlpPlots = F, - analysisId = 'lrtTest', - saveDirectory = saveLoc) - - - -#TODO: add input checks and test these... -#options(fftempdir = getwd()) - - -test_that("torch LR working checks", { - - # check same structure - testthat::expect_equal(names(plpResultLrt), - names(plpResult)) - - # check prediction same size as pop - testthat::expect_equal(nrow(plpResultLrt$prediction), nrow(population)) - - # check prediction between 0 and 1 - testthat::expect_gte(min(plpResultLrt$prediction$value), 0) - testthat::expect_lte(max(plpResultLrt$prediction$value), 1) - -}) - -mlptSet <- setMLPTorch(size = 10, - w_decay = 0.001, - epochs = 1, - autoencode = F) - - -plpResultMlpt <- runPlp(population = population, - plpData = plpData, - modelSettings = mlptSet, - savePlpData = F, - savePlpResult = F, - saveEvaluation = F, - savePlpPlots = F, - analysisId = 'mlptTest', - saveDirectory = saveLoc) - -test_that("MLP LR working checks", { - - # check same structure - testthat::expect_equal(names(plpResultMlpt), - names(plpResult)) - - # check prediction same size as pop - testthat::expect_equal(nrow(plpResultMlpt$prediction), nrow(population)) - - # check prediction between 0 and 1 - testthat::expect_gte(min(plpResultMlpt$prediction$value), 0) - testthat::expect_lte(max(plpResultMlpt$prediction$value), 1) - -}) - - -# add temporal data: -if(!travis){ -RNNTSet <- setRNNTorch(hidden_size = 1, - epochs =1) -plpResultRNNT <- runPlp(population = population2, - plpData = plpData3, - modelSettings = RNNTSet, - savePlpData = F, - savePlpResult = F, - saveEvaluation = F, - savePlpPlots = F, - analysisId = 'rnntTest', - saveDirectory = saveLoc) -} - -test_that("RNN Torch working checks", { - if(travis){ - skip("Too slow for travis") - } - # check same structure - testthat::expect_equal(names(plpResultRNNT), - names(plpResult)) - - # check prediction same size as pop - testthat::expect_equal(nrow(plpResultRNNT$prediction), nrow(population2)) - - # check prediction between 0 and 1 - testthat::expect_gte(min(plpResultRNNT$prediction$value), 0) - testthat::expect_lte(max(plpResultRNNT$prediction$value), 1) - -}) - -# add CNN when it is fixed -
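Note (not part of the patch): the deleted `predictFunctions.py` / `python_predict.py` scripts all followed the same pattern — split the covariate matrix into mini-batches, call the model's `predict_proba`, take the positive-class column, and append it to the population matrix. The sketch below is a minimal, self-contained illustration of that pattern for readers reviewing the removal; `DummyModel` and its sizes are hypothetical stand-ins, not code from this repository.

```python
# Minimal sketch of the removed mini-batched prediction pattern.
# DummyModel is a hypothetical stand-in; the real removed models (GRU, RNN, CNN_*, ResNet)
# exposed the same predict_proba contract.
import numpy as np
import torch
import torch.nn as nn


def batch(tensor, batch_size=50):
    """Split the first axis of `tensor` into consecutive chunks of `batch_size`."""
    return [tensor[i:i + batch_size] for i in range(0, tensor.shape[0], batch_size)]


class DummyModel(nn.Module):
    """Two-output binary classifier mirroring the removed models' predict_proba interface."""

    def __init__(self, n_features):
        super().__init__()
        self.fc = nn.Linear(n_features, 2)

    def forward(self, x):
        # The removed models applied a sigmoid to the two-class output before returning it.
        return torch.sigmoid(self.fc(x))

    def predict_proba(self, x):
        if isinstance(x, np.ndarray):
            x = torch.from_numpy(x.astype(np.float32))
        with torch.no_grad():
            return self.forward(x).cpu().numpy()


if __name__ == "__main__":
    # Toy population matrix: column 0 = rowId, column 1 = outcome.
    population = np.column_stack([np.arange(10), np.random.binomial(1, 0.3, 10)])
    X = np.random.rand(10, 5).astype(np.float32)
    model = DummyModel(n_features=5)
    # Score in mini-batches and append the positive-class probability to the population,
    # mirroring how the deleted python_predict() assembled its `prediction` array.
    preds = np.concatenate([model.predict_proba(b)[:, 1] for b in batch(X, batch_size=4)])
    prediction = np.append(population, preds.reshape(-1, 1), axis=1)
    print(prediction.shape)  # (10, 3): rowId, outcome, predicted probability
```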