diff --git a/.Rbuildignore b/.Rbuildignore index 91114bf..ac00a31 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,2 +1,6 @@ ^.*\.Rproj$ ^\.Rproj\.user$ +^\.github$ +^_pkgdown\.yml$ +^docs$ +^pkgdown$ diff --git a/.github/.gitignore b/.github/.gitignore new file mode 100644 index 0000000..2d19fc7 --- /dev/null +++ b/.github/.gitignore @@ -0,0 +1 @@ +*.html diff --git a/.github/workflows/R_CDM_check_hades.yaml b/.github/workflows/R_CDM_check_hades.yaml new file mode 100644 index 0000000..7445265 --- /dev/null +++ b/.github/workflows/R_CDM_check_hades.yaml @@ -0,0 +1,164 @@ +# For help debugging build failures open an issue on the RStudio community with the 'github-actions' tag. +# https://community.rstudio.com/new-topic?category=Package%20development&tags=github-actions +on: + push: + branches: + - '**' + pull_request: + branches: + - '**' + +name: R-CMD-check + +jobs: + R-CMD-check: + runs-on: ${{ matrix.config.os }} + + name: ${{ matrix.config.os }} (${{ matrix.config.r }}) + + strategy: + fail-fast: false + matrix: + config: + - {os: windows-latest, r: 'release'} + - {os: macOS-latest, r: 'release'} + - {os: ubuntu-20.04, r: 'release', rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest"} + + env: + GITHUB_PAT: ${{ secrets.GH_TOKEN }} + R_REMOTES_NO_ERRORS_FROM_WARNINGS: true + RSPM: ${{ matrix.config.rspm }} + CDM5_ORACLE_CDM_SCHEMA: ${{ secrets.CDM5_ORACLE_CDM_SCHEMA }} + CDM5_ORACLE_OHDSI_SCHEMA: ${{ secrets.CDM5_ORACLE_OHDSI_SCHEMA }} + CDM5_ORACLE_PASSWORD: ${{ secrets.CDM5_ORACLE_PASSWORD }} + CDM5_ORACLE_SERVER: ${{ secrets.CDM5_ORACLE_SERVER }} + CDM5_ORACLE_USER: ${{ secrets.CDM5_ORACLE_USER }} + CDM5_POSTGRESQL_CDM_SCHEMA: ${{ secrets.CDM5_POSTGRESQL_CDM_SCHEMA }} + CDM5_POSTGRESQL_OHDSI_SCHEMA: ${{ secrets.CDM5_POSTGRESQL_OHDSI_SCHEMA }} + CDM5_POSTGRESQL_PASSWORD: ${{ secrets.CDM5_POSTGRESQL_PASSWORD }} + CDM5_POSTGRESQL_SERVER: ${{ secrets.CDM5_POSTGRESQL_SERVER }} + CDM5_POSTGRESQL_USER: ${{ secrets.CDM5_POSTGRESQL_USER }} + CDM5_SQL_SERVER_CDM_SCHEMA: ${{ secrets.CDM5_SQL_SERVER_CDM_SCHEMA }} + CDM5_SQL_SERVER_OHDSI_SCHEMA: ${{ secrets.CDM5_SQL_SERVER_OHDSI_SCHEMA }} + CDM5_SQL_SERVER_PASSWORD: ${{ secrets.CDM5_SQL_SERVER_PASSWORD }} + CDM5_SQL_SERVER_SERVER: ${{ secrets.CDM5_SQL_SERVER_SERVER }} + CDM5_SQL_SERVER_USER: ${{ secrets.CDM5_SQL_SERVER_USER }} + TORCH_INSTALL: 1 # so torch runs torch::install_torch() + + steps: + - uses: actions/checkout@v2 + + - uses: r-lib/actions/setup-r@v2 + with: + r-version: ${{ matrix.config.r }} + + - uses: r-lib/actions/setup-tinytex@v2 + + - uses: r-lib/actions/setup-pandoc@v1 + + - name: Install system requirements + if: runner.os == 'Linux' + run: | + sudo apt-get install -y libssh-dev + Rscript -e 'install.packages("remotes")' + while read -r cmd + do + eval sudo $cmd + done < <(Rscript -e 'writeLines(remotes::system_requirements("ubuntu", "20.04"))') + + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: any::rcmdcheck + needs: check + + - uses: r-lib/actions/check-r-package@v2 + with: + args: 'c("--no-manual", "--as-cran")' + error-on: '"warning"' + check-dir: '"check"' + + - name: Upload source package + if: success() && runner.os == 'macOS' && github.event_name != 'pull_request' && github.ref == 'refs/heads/main' + uses: actions/upload-artifact@v2 + with: + name: package_tarball + path: check/*.tar.gz + + - name: Install covr + if: runner.os == 'Windows' + run: | + remotes::install_cran("covr") + shell: Rscript {0} + + - name: Test coverage + if: runner.os == 'Windows' + run: 
covr::codecov() + shell: Rscript {0} + + Release: + needs: R-CMD-Check + + runs-on: macOS-latest + + env: + GH_TOKEN: ${{ secrets.GH_TOKEN }} + + if: ${{ github.event_name != 'pull_request' && github.ref == 'refs/heads/main' }} + + steps: + + - uses: actions/checkout@v2 + with: + fetch-depth: 0 + + - name: Check if version has increased + run: | + echo "new_version="$(perl compare_versions --tag) >> $GITHUB_ENV + + - name: Display new version number + if: ${{ env.new_version != '' }} + run: | + echo "${{ env.new_version }}" + + - name: Create release + if: ${{ env.new_version != '' }} + uses: actions/create-release@v1 + env: + GITHUB_TOKEN: ${{ secrets.GH_TOKEN }} + with: + tag_name: ${{ env.new_version }} + release_name: Release ${{ env.new_version }} + body: | + See NEWS.md for release notes. + draft: false + prerelease: false + + - uses: r-lib/actions/setup-r@v2 + if: ${{ env.new_version != '' }} + + - name: Install drat + if: ${{ env.new_version != '' }} + run: | + install.packages('drat') + shell: Rscript {0} + + - name: Remove any tarballs that already exists + if: ${{ env.new_version != '' }} + run: | + rm -f *.tar.gz + + - name: Download package tarball + if: ${{ env.new_version != '' }} + uses: actions/download-artifact@v2 + with: + name: package_tarball + + - name: Push to drat + if: ${{ env.new_version != '' }} + run: | + bash deploy.sh + - name: Push to BroadSea + if: ${{ env.new_version != '' }} + run: | + curl --data "build=true" -X POST https://registry.hub.docker.com/u/ohdsi/broadsea-methodslibrary/trigger/f0b51cec-4027-4781-9383-4b38b42dd4f5/ + diff --git a/.github/workflows/nightly_cleanup_Hades.yml b/.github/workflows/nightly_cleanup_Hades.yml new file mode 100644 index 0000000..3f9d7ce --- /dev/null +++ b/.github/workflows/nightly_cleanup_Hades.yml @@ -0,0 +1,19 @@ + +name: 'nightly artifacts cleanup' +on: + schedule: + - cron: '0 1 * * *' # every night at 1 am UTC + +jobs: + remove-old-artifacts: + runs-on: ubuntu-latest + timeout-minutes: 10 + + steps: + - name: Remove old artifacts + uses: c-hive/gha-remove-artifacts@v1 + with: + age: '7 days' + # Optional inputs + # skip-tags: true + skip-recent: 1 diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml new file mode 100644 index 0000000..7a5e8ac --- /dev/null +++ b/.github/workflows/pkgdown.yaml @@ -0,0 +1,46 @@ +# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples +# Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help
+on:
+  push:
+    branches: [main, develop]
+  release:
+    types: [published]
+  workflow_dispatch:
+
+name: pkgdown
+
+jobs:
+  pkgdown:
+    runs-on: ubuntu-latest
+    # Only restrict concurrency for non-PR jobs
+    concurrency:
+      group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }}
+    env:
+      GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
+    steps:
+      - uses: actions/checkout@v2
+
+      - uses: r-lib/actions/setup-pandoc@v2
+
+      - uses: r-lib/actions/setup-r@v2
+        with:
+          use-public-rspm: true
+
+      - uses: r-lib/actions/setup-r-dependencies@v2
+        with:
+          extra-packages: any::pkgdown, ohdsi/OhdsiRTools
+          needs: website
+
+      - name: Build site
+        run: Rscript -e 'pkgdown::build_site_github_pages(new_process = FALSE, install = TRUE)'
+
+      - name: Fix Hades Logo
+        run: Rscript -e 'OhdsiRTools::fixHadesLogo()'
+
+      - name: Deploy to GitHub pages 🚀
+        if: github.event_name != 'pull_request'
+        uses: JamesIves/github-pages-deploy-action@4.1.4
+        with:
+          clean: false
+          branch: gh-pages
+          folder: docs
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..c567877
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+.Rproj.user
+.Rhistory
+.RData
+.Ruserdata
+results
+config.yml
+docs
diff --git a/DESCRIPTION b/DESCRIPTION
index 42e28a4..8fa61a7 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,49 +1,42 @@
 Package: DeepPatientLevelPrediction
 Type: Package
-Title: Package for deep learning patient level prediction using data in the OMOP Common Data
-  Model
-Version: 0.0.1
-Date: 2021-06-07
+Title: Deep Learning For Patient Level Prediction Using Data In The OMOP Common Data Model
+Version: 1.0.0
+Date: 29-08-2022
 Authors@R: c(
-    person("Jenna", "Reps", email = "jreps@its.jnj.com", role = c("aut", "cre")),
+    person("Jenna", "Reps", email = "jreps@its.jnj.com", role = c("aut")),
+    person("Egill", "Fridgeirsson", email = "e.fridgeirsson@erasmusmc.nl", role = c("aut", "cre")),
     person("Seng", "Chan You", role = c("aut")),
-    person("Egill", "Friogeirsson", role = c("aut"))
+    person("Chungsoo", "Kim", role = c("aut")),
+    person("Henrik", "John", role = c("aut"))
 )
-
-Maintainer: Jenna Reps
-Description: A package for creating deep learning patient level prediction models following
-the OHDSI PatientLevelPrediction framework.
+Maintainer: Egill Fridgeirsson
+Description: A package for creating deep learning patient level prediction models following the OHDSI PatientLevelPrediction framework.
 License: Apache License 2.0
 URL: https://ohdsi.github.io/PatientLevelPrediction, https://github.com/OHDSI/DeepPatientLevelPrediction
 BugReports: https://github.com/OHDSI/DeepPatientLevelPrediction/issues
 VignetteBuilder: knitr
 Depends:
-  R (>= 3.3.0),
-  FeatureExtraction (>= 3.0.0)
+  R (>= 3.5.0)
 Imports:
-  Andromeda,
-  DatabaseConnector (>= 3.0.0),
   dplyr,
-  knitr,
-  magrittr,
-  Matrix,
+  data.table,
+  FeatureExtraction (>= 3.0.0),
   ParallelLogger (>= 2.0.0),
-  reshape2,
-  reticulate (> 1.16),
-  RSQLite,
-  slam,
-  SqlRender (>= 1.1.3),
-  tibble,
-  tidyr,
+  PatientLevelPrediction,
+  rlang,
+  torch (>= 0.8.0)
 Suggests:
   devtools,
-  keras,
+  Eunomia,
+  knitr,
+  markdown,
   plyr,
-  tensorflow,
   testthat
 Remotes:
-  ohdsi/FeatureExtraction
-LinkingTo: Rcpp
-NeedsCompilation: yes
-RoxygenNote: 7.1.1
+  ohdsi/PatientLevelPrediction@develop,
+  ohdsi/FeatureExtraction,
+  ohdsi/Eunomia
+RoxygenNote: 7.2.1
 Encoding: UTF-8
+Config/testthat/edition: 3
diff --git a/NAMESPACE b/NAMESPACE
new file mode 100644
index 0000000..a102062
--- /dev/null
+++ b/NAMESPACE
@@ -0,0 +1,14 @@
+# Generated by roxygen2: do not edit by hand
+
+export(Dataset)
+export(Estimator)
+export(fitEstimator)
+export(gridCvDeep)
+export(predictDeepEstimator)
+export(setMultiLayerPerceptron)
+export(setResNet)
+export(setTransformer)
+import(data.table)
+importFrom(data.table,":=")
+importFrom(dplyr,"%>%")
+importFrom(rlang,.data)
diff --git a/NEWS.md b/NEWS.md
new file mode 100644
index 0000000..c0bdb3a
--- /dev/null
+++ b/NEWS.md
@@ -0,0 +1,6 @@
+DeepPatientLevelPrediction 1.0.0
+==========================
+
+- created an Estimator R6 class to handle the model fitting
+- Added three non-temporal models. An MLP, a ResNet and a Transformer
+- Created tests and documentation for the package
diff --git a/R/CIReNN.R b/R/CIReNN.R
deleted file mode 100644
index 4cdaa81..0000000
--- a/R/CIReNN.R
+++ /dev/null
@@ -1,1162 +0,0 @@
-# @file CIReNN.R
-# Code edited from OHDSI contributor @chandryou CIReNN branch
-#
-# Copyright 2020 Observational Health Data Sciences and Informatics
-#
-# This file is part of PatientLevelPrediction
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# BuildCIReNN<-function(outcomes=ff::as.ffdf(population[,c('rowId','y')]),
-#                       covariates = result$data,
-#                       indexFolder=indexFolder){
-#
-# }
-
-#' Create setting for CIReNN model
-#'
-#' @param numberOfRNNLayer The number of RNN layer, only 1, 2, or 3 layers available now. eg. 1, c(1,2), c(1,2,3)
-#' @param units The number of units of RNN layer - as a list of vectors
-#' @param recurrentDropout The reccurrent dropout rate (regularisation)
-#' @param layerDropout The layer dropout rate (regularisation)
-#' @param lr Learning rate
-#' @param decay Learning rate decay over each update.
-#' @param outcomeWeight The weight of the outcome class in the loss function. Default is 0, which will be replaced by balanced weight.
-#' @param batchSize The number of data points to use per training batch -#' @param epochs Number of times to iterate over dataset -#' @param earlyStoppingMinDelta minimum change in the monitored quantity to qualify as an improvement for early stopping, i.e. an absolute change of less than min_delta in loss of validation data, will count as no improvement. -#' @param earlyStoppingPatience Number of epochs with no improvement after which training will be stopped. -#' @param useDeepEnsemble logical (either TRUE or FALSE) value for using Deep Ensemble -#' @param numberOfEnsembleNetwork Integer, Number of Ensemble. If you want to use Deep Ensemble, this number should be greater than 1. -#' @param bayes logical (either TRUE or FALSE) value for using Bayesian Drop Out Layer to measure uncertainty. If it is TRUE, both Epistemic and Aleatoric uncertainty will be measured through Bayesian Drop Out layer -#' @param useDeepEnsemble logical (either TRUE or FALSE) value for using Deep Ensemble (Lakshminarayanan et al., 2017) to measure uncertainty. It cannot be used together with Bayesian deep learing. -#' @param numberOfEnsembleNetwork Integer. Number of network used for Deep Ensemble (Lakshminarayanan et al recommended 5). -#' @param useVae logical (either TRUE or FALSE) value for using Variational AutoEncoder before RNN -#' @param vaeDataSamplingProportion Data sampling proportion for VAE -#' @param vaeValidationSplit Validation split proportion for VAE -#' @param vaeBatchSize batch size for VAE -#' @param vaeLatentDim Number of latent dimesion for VAE -#' @param vaeIntermediateDim Number of intermediate dimesion for VAE -#' @param vaeEpoch Number of times to interate over dataset for VAE -#' @param vaeEpislonStd Epsilon -#' @param useGPU logical (either TRUE or FALSE) value. If you have GPUs in your machine, and want to use multiple GPU for deep learning, set this value as TRUE -#' @param maxGPUs Integer, If you will use GPU, how many GPUs will be used for deep learning in VAE? GPU parallelisation for deep learning will be activated only when parallel vae is true. Integer >= 2 or list of integers, number of GPUs or list of GPU IDs on which to create model replicas. -#' @param seed Random seed used by deep learning model -#' @importFrom zeallot %<-% -#' @examples -#' \dontrun{ -#' model.CIReNN <- setCIReNN() -#' } -#' @export -setCIReNN <- function(numberOfRNNLayer=c(1),units=c(128, 64), recurrentDropout=c(0.2), layerDropout=c(0.2), - lr =c(1e-4), decay=c(1e-5), outcomeWeight = c(0), batchSize = c(100), - epochs= c(100), earlyStoppingMinDelta = c(1e-4), earlyStoppingPatience = c(10), - bayes = T, useDeepEnsemble = F, numberOfEnsembleNetwork = 5, - useVae = T, vaeDataSamplingProportion = 0.1,vaeValidationSplit= 0.2, - vaeBatchSize = 100L, vaeLatentDim = 10L, vaeIntermediateDim = 256L, - vaeEpoch = 100L, vaeEpislonStd = 1.0, useGPU = FALSE, maxGPUs = 2, - seed=1234 ){ - - ensure_installed("keras") - ensure_installed("tensorflow") - ensure_installed("plyr") - - if( sum(!( numberOfRNNLayer %in% c(1,2,3)))!=0 ) stop ('Only 1,2 or 3 is available now. 
') - if(!((all.equal(numberOfEnsembleNetwork, as.integer(numberOfEnsembleNetwork))) & (numberOfEnsembleNetwork>=1) )) { - stop ('Number of ensemble network should be a natural number') } - - # if(class(indexFolder)!='character') - # stop('IndexFolder must be a character') - # if(length(indexFolder)>1) - # stop('IndexFolder must be one') - # - # if(class(units)!='numeric') - # stop('units must be a numeric value >0 ') - # if(units<1) - # stop('units must be a numeric value >0 ') - # - # #if(length(units)>1) - # # stop('units can only be a single value') - # - # if(class(recurrent_dropout)!='numeric') - # stop('dropout must be a numeric value >=0 and <1') - # if( (recurrent_dropout<0) | (recurrent_dropout>=1)) - # stop('dropout must be a numeric value >=0 and <1') - # if(class(layer_dropout)!='numeric') - # stop('layer_dropout must be a numeric value >=0 and <1') - # if( (layer_dropout<0) | (layer_dropout>=1)) - # stop('layer_dropout must be a numeric value >=0 and <1') - # if(class(lr)!='numeric') - # stop('lr must be a numeric value >0') - # if(lr<=0) - # stop('lr must be a numeric value >0') - # if(class(decay)!='numeric') - # stop('decay must be a numeric value >=0') - # if(decay<=0) - # stop('decay must be a numeric value >=0') - # if(class(outcome_weight)!='numeric') - # stop('outcome_weight must be a numeric value >=0') - # if(outcome_weight<=0) - # stop('outcome_weight must be a numeric value >=0') - # if(class(batch_size)!='numeric') - # stop('batch_size must be an integer') - # if(batch_size%%1!=0) - # stop('batch_size must be an integer') - # if(class(epochs)!='numeric') - # stop('epochs must be an integer') - # if(epochs%%1!=0) - # stop('epochs must be an integer') - # if(!class(seed)%in%c('numeric','NULL')) - # stop('Invalid seed') - #if(class(UsetidyCovariateData)!='logical') - # stop('UsetidyCovariateData must be an TRUE or FALSE') - - result <- list(model='fitCIReNN', param=split(expand.grid( - numberOfRNNLayer=numberOfRNNLayer,units=units, recurrentDropout=recurrentDropout, - layerDropout=layerDropout, - lr =lr, decay=decay, outcomeWeight=outcomeWeight, epochs= epochs, - earlyStoppingMinDelta = earlyStoppingMinDelta, earlyStoppingPatience = earlyStoppingPatience, - bayes= bayes, useDeepEnsemble = useDeepEnsemble,numberOfEnsembleNetwork = numberOfEnsembleNetwork, - useVae= useVae,vaeDataSamplingProportion = vaeDataSamplingProportion, vaeValidationSplit= vaeValidationSplit, - vaeBatchSize = vaeBatchSize, vaeLatentDim = vaeLatentDim, vaeIntermediateDim = vaeIntermediateDim, - vaeEpoch = vaeEpoch, vaeEpislonStd = vaeEpislonStd, useGPU = useGPU, maxGPUs = maxGPUs, - seed=ifelse(is.null(seed),'NULL', seed)), - - 1:(length(numberOfRNNLayer)*length(units)*length(recurrentDropout)*length(layerDropout)*length(lr)*length(decay)*length(outcomeWeight)*length(earlyStoppingMinDelta)*length(earlyStoppingPatience)*length(epochs)*max(1,length(seed)))), - name='CIReNN' - ) - - class(result) <- 'modelSettings' - return(result) -} - - -fitCIReNN <- function(plpData,population, param, search='grid', quiet=F, - outcomeId, cohortId, ...){ - # check plpData is coo format: - if (!FeatureExtraction::isCovariateData(plpData$covariateData)) - stop("Needs correct covariateData") - - metaData <- attr(population, 'metaData') - if(!is.null(population$indexes)) - population <- population[population$indexes>0,] - attr(population, 'metaData') <- metaData - - start<-Sys.time() - - result<- toSparseM(plpData,population,map=NULL, temporal=T) - data <- result$data - covariateMap <- result$map - - #remove result to 
save memory - rm(result) - - if(param[[1]]$useVae){ - #Sampling the data for bulding VAE - vaeSampleData <- data[sample(seq(dim(data)[1]), floor(dim(data)[1]*param[[1]]$vaeDataSamplingProportion),replace=FALSE),,] - - #Build VAE - vae <- buildVae(vaeSampleData, vaeValidationSplit= param[[1]]$vaeValidationSplit, - vaeBatchSize = param[[1]]$vaeBatchSize, vaeLatentDim = param[[1]]$vaeLatentDim, vaeIntermediateDim = param[[1]]$vaeIntermediateDim, - vaeEpoch = param[[1]]$vaeEpoch, vaeEpislonStd = param[[1]]$vaeEpislonStd, useGPU= param[[1]]$useGPU, maxGPUs= param[[1]]$maxGPUs, temporal = TRUE) - #remove sample data for VAE to save memory - vaeSampleData <- NULL - - vaeEnDecoder<- vae[[1]] - vaeEncoder <- vae[[2]] - - #Embedding by using VAE encoder - data <- plyr::aaply(as.array(data), 2, function(x) stats::predict(vaeEncoder, x, batch_size = param$vaeBatchSize)) - data <- aperm(data, perm = c(2,1,3))#rearrange of dimension - - ##Check the performance of vae - # decodedVaeData<-plyr::aaply(as.array(data), 2, function(x) predict(vaeEnDecoder, x, batch_size = param$vaeBatchSzie)) - # decodedVaeData<-aperm(decodedVaeData, c(2,1,3)) - # a1=Epi::ROC(form=as.factor(as.vector(data))~as.vector(decodedVaeData),plot="ROC") - - }else { - vaeEnDecoder <- NULL - vaeEncoder <- NULL - } - - #one-hot encoding - population$y <- keras::to_categorical(population$outcomeCount, 2)#[,2] #population$outcomeCount - - # do cross validation to find hyperParameter - datas <- list(population=population, plpData=data) - - #remove data to save memory - data <- NULL - - #Selection of hyperparameters - hyperParamSel <- lapply(param, function(x) do.call(trainCIReNN, c(x,datas,train=TRUE) )) - hyperSummary <- cbind(do.call(rbind, lapply(hyperParamSel, function(x) x$hyperSum))) - hyperSummary <- as.data.frame(hyperSummary) - hyperSummary$auc <- unlist(lapply(hyperParamSel, function (x) x$auc)) - hyperParamSel <- unlist(lapply(hyperParamSel, function(x) x$auc)) - - #now train the final model and return coef - bestInd <- which.max(abs(unlist(hyperParamSel)-0.5))[1] - finalModel<-do.call(trainCIReNN, c(param[[bestInd]],datas, train=FALSE)) - - covariateRef <- as.data.frame(plpData$covariateData$covariateRef) - incs <- rep(1, nrow(covariateRef)) - covariateRef$included <- incs - covariateRef$covariateValue <- rep(0, nrow(covariateRef)) - - #modelTrained <- file.path(outLoc) - param.best <- param[[bestInd]] - - comp <- start-Sys.time() - - # train prediction - prediction <- finalModel$prediction - finalModel$prediction <- NULL - - # return model location - result <- list(model = finalModel$model, - trainCVAuc = -1, # ToDo decide on how to deal with this - hyperParamSearch = hyperSummary, - modelSettings = list(model='fitCIReNN',modelParameters=param.best), - metaData = plpData$metaData, - populationSettings = attr(population, 'metaData'), - outcomeId = outcomeId, - cohortId = cohortId, - varImp = covariateRef, - trainingTime = comp, - covariateMap = covariateMap, - useDeepEnsemble = param.best$useDeepEnsemble, - numberOfEnsembleNetwork = param.best$numberOfEnsembleNetwork, - useVae = param.best$useVae, - vaeBatchSize = param.best$vaeBatchSize, - vaeEnDecoder = vaeEnDecoder, - vaeEncoder = vaeEncoder, - predictionTrain = prediction - ) - class(result) <- 'plpModel' - attr(result, 'type') <- 'deep' - if(param.best$useDeepEnsemble){ - attr(result, 'type') <- 'deepEnsemble' - } - if(param.best$bayes){ - attr(result, 'type') <- 'BayesianDeep' - } - attr(result, 'predictionType') <- 'binary' - - return(result) -} - 
-trainCIReNN<-function(plpData, population, - numberOfRNNLayer=1,units=128, recurrentDropout=0.2, layerDropout=0.2, - lr =1e-4, decay=1e-5, outcomeWeight = 0, batchSize = 100, - epochs= 100, earlyStoppingMinDelta = c(1e-4), earlyStoppingPatience = c(10), - bayes = T, useDeepEnsemble = F,numberOfEnsembleNetwork =3, - useVae = T, vaeDataSamplingProportion = 0.1,vaeValidationSplit= 0.2, - vaeBatchSize = 100L, vaeLatentDim = 10L, vaeIntermediateDim = 256L, - vaeEpoch = 100L, vaeEpislonStd = 1.0, useGPU = FALSE, maxGPUs = 2, - seed=NULL, train=TRUE){ - - mu <- function(){return(NULL)} - sigma <- function(){return(NULL)} - - output_dim = 2 #output dimension for outcomes - num_MC_samples = 100 #sample number for MC sampling in Bayesian Deep Learning Prediction - if(outcomeWeight == 0){ - outcomeWeight = round(sum(population$outcomeCount==0)/sum(population$outcomeCount>=1),1) #if outcome weight = 0, then it means balanced weight - } - #heteroscedatic loss function - heteroscedastic_loss = function(y_true, y_pred) { - mean = y_pred[, 1:output_dim] - log_var = y_pred[, (output_dim + 1):(output_dim * 2)] - precision = keras::k_exp(-log_var) - keras::k_sum(precision * (y_true - mean) ^ 2 + log_var, axis = 2) - } - - if(!is.null(population$indexes) && train==T){ - writeLines(paste('Training recurrent neural network with ',length(unique(population$indexes)),' fold CV')) - - index_vect <- unique(population$indexes) - perform <- c() - - # create prediction matrix to store all predictions - predictionMat <- population - predictionMat$value <- 0 - attr(predictionMat, "metaData") <- list(predictionType = "binary") - - for(index in 1:length(index_vect )){ - writeLines(paste('Fold ',index, ' -- with ', sum(population$indexes!=index),'train rows')) - - if(useDeepEnsemble){ - predList<-list() - for (i in seq(numberOfEnsembleNetwork)){ - #print(i) - ParallelLogger::logInfo(paste(i,'th process is started')) - pred <- createEnsembleNetwork(train = train, plpData=plpData,population=population,batchSize=batchSize,epochs = epochs, - earlyStoppingMinDelta=earlyStoppingMinDelta, earlyStoppingPatience=earlyStoppingPatience, - train_rows=train_rows,index=index,lr=lr,decay=decay, - units=units,recurrentDropout=recurrentDropout,numberOfRNNLayer=numberOfRNNLayer, - layerDropout=layerDropout, useGPU = useGPU, maxGPUs = maxGPUs) - ParallelLogger::logInfo(paste(i,'th process is ended started')) - predList <- append(predList,pred) - } - model <- predList - - # batch prediciton - maxVal <- sum(population$indexes==index) - batches <- lapply(1:ceiling(maxVal/batchSize), function(x) ((x-1)*batchSize+1):min((x*batchSize),maxVal)) - prediction <- population[population$indexes==index,] - prediction$value <- 0 - prediction$sigmas <- 0 - - for(batch in batches){ - - for (i in seq(numberOfEnsembleNetwork)){ - if(i==1){ - muMatrix <- data.frame() - sigmaMatrix <-data.frame() - } - c(mu,sigma) %<-% predList[[i]](inputs=list(as.array(plpData[population$rowId[population$indexes==index],,][batch,,]))) - muMatrix <- rbind(muMatrix,t(as.data.frame(mu[,2]))) - sigmaMatrix <- rbind(sigmaMatrix,t(as.data.frame(sigma[,2]))) - } - - muMean <- apply(muMatrix,2,mean) - muSq <- muMatrix^2 - sigmaSq <- sigmaMatrix^2 - sigmaMean <- apply(sigmaMatrix,2,mean) - sigmaResult = apply(muSq+sigmaSq,2, mean)- muMean^2 - - prediction$value[batch] <- c(muMean) - #if prediction$value is negative, make this positive - prediction$sigmas[batch] <- c(sigmaResult) - - } - prediction$value[prediction$value>1] <- 1 - prediction$value[prediction$value<0] <- 0 - 
#prediction$value[batch] <- mu[,2] - #prediction$sigmas[batch] <- sigma[,2] - - #writeLines(paste0(dim(pred[,2]), collapse='-')) - #writeLines(paste0(pred[1,2], collapse='-')) - - attr(prediction, "metaData") <- list(predictionType = "binary") - aucVal <- computeAuc(prediction) - perform <- c(perform,aucVal) - - # add the fold predictions and compute AUC after loop - predictionMat$value[population$indexes==index] <- prediction$value - - }else{ - layerInput <- keras::layer_input(shape = c(dim(plpData)[2],dim(plpData)[3])) - if(useGPU){ - ##GRU layer - if(numberOfRNNLayer==1){ - layerOutput <- layerInput %>% - keras::layer_cudnn_gru(units=units, #time step x number of features - return_sequences=FALSE) %>% - keras::layer_dropout(layerDropout) - } - if(numberOfRNNLayer==2){ - layerOutput <- layerInput %>% - keras::layer_cudnn_gru(units=units, #time step x number of features - return_sequences=TRUE) %>% - keras::layer_dropout(layerDropout) %>% - keras::layer_cudnn_gru(units=units, return_sequences=FALSE) %>% - keras::layer_dropout(layerDropout) - } - if(numberOfRNNLayer==3){ - layerOutput <- layerInput %>% - keras::layer_cudnn_gru(units=units, #time step x number of features - return_sequences=TRUE) %>% - keras::layer_dropout(layerDropout) %>% - keras::layer_cudnn_gru(units=units, return_sequences=TRUE) %>% - keras::layer_dropout(layerDropout) %>% - keras::layer_cudnn_gru(units=units, return_sequences=FALSE) %>% - keras::layer_dropout(layerDropout) - } - }else{ - ##GRU layer - if(numberOfRNNLayer == 1){ - layerOutput <- layerInput %>% - keras::layer_gru(units=units, recurrent_dropout = recurrentDropout, - return_sequences=FALSE) - } - if(numberOfRNNLayer > 1 ){ - layerInput %>% # !ISSUE : "missing layerOutput <- "? - keras::layer_gru(units=units, recurrent_dropout = recurrentDropout, - return_sequences=TRUE) - } - if(numberOfRNNLayer == 2){ - layerOutput <- layerInput %>% - keras::layer_gru(units=units, recurrent_dropout = recurrentDropout, - return_sequences=FALSE) - } - if(numberOfRNNLayer==3){ - layerOutput <- layerInput %>% - keras::layer_gru(units=units, recurrent_dropout = recurrentDropout, - return_sequences=TRUE) %>% - # ISSUE- I removed layerOutput <- layerInput %>% as this was after a pipe - keras::layer_gru(units=units, recurrent_dropout = recurrentDropout, - return_sequences=FALSE) - } - } - - earlyStopping = keras::callback_early_stopping(monitor = "val_loss", patience=earlyStoppingPatience, - mode="auto",min_delta = earlyStoppingMinDelta) - reduceLr = keras::callback_reduce_lr_on_plateau(monitor="val_loss", factor =0.1, - patience = 5,mode = "auto", - min_delta = 1e-5, cooldown = 0, min_lr = 0) - - class_weight=list("0"=1,"1"=outcomeWeight) - - if(bayes){ - mean = layerOutput %>% - layer_concrete_dropout(layer = keras::layer_dense(units = output_dim)) - - log_var = layerOutput %>% - layer_concrete_dropout(layer = keras::layer_dense(units = output_dim)) - - output = keras::layer_concatenate(list(mean, log_var)) - model = keras::keras_model(layerInput, output) - #model = keras::keras_model(keras::layer_input(shape = c(dim(plpData)[2],dim(plpData)[3])), output) - model %>% keras::compile( - optimizer = "adam", - loss = heteroscedastic_loss, - metrics = c(keras::custom_metric("heteroscedastic_loss", heteroscedastic_loss)) - ) - - }else{ - model<- layerInput %>% - keras::layer_dense(units=2, activation='softmax') - model %>% keras::compile( - loss = 'binary_crossentropy', - metrics = c('accuracy'), - optimizer = keras::optimizer_rmsprop(lr = lr,decay = decay) - ) - } - - data <- 
plpData[population$rowId[population$indexes!=index],,] - - #Extract validation set first - 10k people or 5% - valN <- min(10000,sum(population$indexes!=index)*0.05) - val_rows <- sample(1:sum(population$indexes!=index), valN, replace=FALSE) - train_rows <- c(1:sum(population$indexes!=index))[-val_rows] - - sampling_generator <- function(data, population, batchSize, train_rows, index){ - function(){ - gc() - rows<-sample(train_rows, batchSize, replace=FALSE) - - list(as.array(data[rows,,]), population$y[population$indexes!=index,][rows,]) - } - } - - - history <- model %>% keras::fit_generator(sampling_generator(data,population,batchSize,train_rows, index), - steps_per_epoch = sum(population$indexes!=index)/batchSize, - epochs=epochs, - validation_data=list(as.array(data[val_rows,,]), - population$y[population$indexes!=index,][val_rows,]), - callbacks=list(earlyStopping,reduceLr), - class_weight=class_weight) - - # batch prediciton - maxVal <- sum(population$indexes==index) - batches <- lapply(1:ceiling(maxVal/batchSize), function(x) ((x-1)*batchSize+1):min((x*batchSize),maxVal)) - prediction <- population[population$indexes==index,] - prediction$value <- 0 - - if(bayes){ - prediction$epistemicUncertainty <- 0 - prediction$aleatoricUncertainty <- 0 - for(batch in batches){ - MC_samples <- array(0, dim = c(num_MC_samples, length(batch), 2 * output_dim)) - for (k in 1:num_MC_samples){ - MC_samples[k,, ] = stats::predict(model, as.array(plpData[population$rowId[population$indexes==index],,][batch,,])) - #keras::predict_proba(model, as.array(plpData[population$rowId[population$indexes==index],,][batch,,])) - } - pred <- apply(MC_samples[,,output_dim], 2, mean) - epistemicUncertainty <- apply(MC_samples[,,output_dim], 2, stats::var) - logVar = MC_samples[, , output_dim * 2] - - if(length(dim(logVar))<=1){ - aleatoricUncertainty = exp(mean(logVar)) - }else{ - aleatoricUncertainty = exp(colMeans(logVar)) - } - - prediction$value[batch] <- pred - prediction$epistemicUncertainty[batch] = epistemicUncertainty - prediction$aleatoricUncertainty[batch] = aleatoricUncertainty - - } - - }else{ - for(batch in batches){ - pred <- keras::predict_proba(model, as.array(plpData[population$rowId[population$indexes==index],,][batch,,])) - prediction$value[batch] <- pred[,2] - #writeLines(paste0(dim(pred[,2]), collapse='-')) - #writeLines(paste0(pred[1,2], collapse='-')) - } - } - prediction$value[prediction$value>1] <- 1 - prediction$value[prediction$value<0] <- 0 - attr(prediction, "metaData") <- list(predictionType = "binary") - aucVal <- computeAuc(prediction) - perform <- c(perform,aucVal) - - # add the fold predictions and compute AUC after loop - predictionMat$value[population$indexes==index] <- prediction$value - # add uncertainty - predictionMat$aleatoricUncertainty[population$indexes==index] <- prediction$aleatoricUncertainty - predictionMat$epistemicUncertainty[population$indexes==index] <- prediction$epistemicUncertainty - } - - } - - auc <- computeAuc(predictionMat) - foldPerm <- perform - - # Output ---------------------------------------------------------------- - param.val <- paste0('RNNlayer Number: ', numberOfRNNLayer, '-- units: ',units,'-- recurrentDropout: ', recurrentDropout, - 'layerDropout: ',layerDropout,'-- lr: ', lr, - '-- decay: ', decay,'-- outcomeWeight',outcomeWeight, '-- batchSize: ',batchSize, '-- epochs: ', epochs) - writeLines('==========================================') - writeLines(paste0('CIReNN with parameters:', param.val,' obtained an AUC of ',auc)) - 
writeLines('==========================================') - - } else { - if(useDeepEnsemble){ - predList<-list() - for (i in seq(numberOfEnsembleNetwork)){ - #print(i) - pred <- createEnsembleNetwork(train = train, plpData=plpData,population=population,batchSize=batchSize,epochs = epochs, - earlyStoppingMinDelta=earlyStoppingMinDelta, earlyStoppingPatience=earlyStoppingPatience, - train_rows=train_rows,index=index,lr=lr,decay=decay, - units=units,recurrentDropout=recurrentDropout,numberOfRNNLayer=numberOfRNNLayer, - layerDropout=layerDropout, useGPU = useGPU, maxGPUs = maxGPUs) - - predList <- append(predList,pred) - } - model <- predList - - # batch prediciton - maxVal <- nrow(population) - batches <- lapply(1:ceiling(maxVal/batchSize), function(x) ((x-1)*batchSize+1):min((x*batchSize),maxVal)) - prediction <- population - prediction$value <- 0 - prediction$sigmas <- 0 - - for(batch in batches){ - - for (i in seq(numberOfEnsembleNetwork)){ - if(i == 1){ - muMatrix <- data.frame() - sigmaMatrix <-data.frame() - } - c(mu,sigma) %<-% predList[[i]](inputs=list(as.array(plpData[batch,,]))) - muMatrix <- rbind(muMatrix,t(as.data.frame(mu[,2]))) - sigmaMatrix <- rbind(sigmaMatrix,t(as.data.frame(sigma[,2]))) - } - - muMean <- apply(muMatrix,2,mean) - muSq <- muMatrix^2 - sigmaSq <- sigmaMatrix^2 - sigmaMean <- apply(sigmaMatrix,2,mean) - sigmaResult = apply(muSq+sigmaSq,2, mean)- muMean^2 - - prediction$value[batch] <- c(muMean) - prediction$sigmas[batch] <- c(sigmaResult) - } - prediction$value[prediction$value>1] <- 1 - prediction$value[prediction$value<0] <- 0 - - - }else{ - layerInput <- keras::layer_input(shape = c(dim(plpData)[2],dim(plpData)[3])) - if(useGPU){ - ##GRU layer - if(numberOfRNNLayer==1){ - layerOutput <- layerInput %>% - keras::layer_cudnn_gru(units=units, #time step x number of features - return_sequences=FALSE) %>% - keras::layer_dropout(layerDropout) - } - if(numberOfRNNLayer==2){ - layerOutput <- layerInput %>% - keras::layer_cudnn_gru(units=units, #time step x number of features - return_sequences=TRUE) %>% - keras::layer_dropout(layerDropout) %>% - keras::layer_cudnn_gru(units=units, return_sequences=FALSE) %>% - keras::layer_dropout(layerDropout) - } - if(numberOfRNNLayer==3){ - layerOutput <- layerInput %>% - keras::layer_cudnn_gru(units=units, #time step x number of features - return_sequences=TRUE) %>% - keras::layer_dropout(layerDropout) %>% - keras::layer_cudnn_gru(units=units, return_sequences=TRUE) %>% - keras::layer_dropout(layerDropout) %>% - keras::layer_cudnn_gru(units=units, return_sequences=FALSE) %>% - keras::layer_dropout(layerDropout) - } - }else{ - ##GRU layer - if(numberOfRNNLayer==1){ - layerOutput <- layerInput %>% - keras::layer_gru(units=units, recurrent_dropout = recurrentDropout, - return_sequences=FALSE) - } - if(numberOfRNNLayer>1 ){ - layerInput %>% # ISSUE - "layerInput <- " missing? - keras::layer_gru(units=units, recurrent_dropout = recurrentDropout, - return_sequences=TRUE) - } - if(numberOfRNNLayer==2){ - layerOutput <- layerInput %>% - keras::layer_gru(units=units, recurrent_dropout = recurrentDropout, - return_sequences=FALSE) - } - if(numberOfRNNLayer==3){ - layerOutput <- layerInput %>% - keras::layer_gru(units=units, recurrent_dropout = recurrentDropout, - return_sequences=TRUE) %>% - #layerOutput <- layerInput %>% ISSUE - pipe above? 
- keras::layer_gru(units=units, recurrent_dropout = recurrentDropout, - return_sequences=FALSE) - } - } - - earlyStopping = keras::callback_early_stopping(monitor = "val_loss", patience=earlyStoppingPatience, - mode="auto",min_delta = earlyStoppingMinDelta) - reduceLr = keras::callback_reduce_lr_on_plateau(monitor="val_loss", factor =0.1, - patience = 5,mode = "auto", - min_delta = 1e-5, cooldown = 0, min_lr = 0) - - class_weight = list("0" = 1, - "1" = outcomeWeight) - - if(bayes){ - mean = layerOutput %>% - layer_concrete_dropout(layer = keras::layer_dense(units = output_dim)) - - log_var = layerOutput %>% - layer_concrete_dropout(layer = keras::layer_dense(units = output_dim)) - - output = keras::layer_concatenate(list(mean, log_var)) - model = keras::keras_model(layerInput, output) - #model = keras::keras_model(keras::layer_input(shape = c(dim(plpData)[2],dim(plpData)[3])), output) - model %>% keras::compile( - optimizer = "adam", - loss = heteroscedastic_loss, - metrics = c(keras::custom_metric("heteroscedastic_loss", heteroscedastic_loss)) - ) - - }else{ - model <- layerInput %>% - keras::layer_dense(units=2, activation='softmax') - model %>% keras::compile( - loss = 'binary_crossentropy', - metrics = c('accuracy'), - optimizer = keras::optimizer_rmsprop(lr = lr,decay = decay) - ) - } - - data <- plpData[population$rowId,,] - - #Extract validation set first - 10k people or 5% - valN <- min(10000,length(population$indexes)*0.05) - val_rows <- sample(1:length(population$indexes), valN, replace=FALSE) - train_rows <- c(1:length(population$indexes))[-val_rows] - - sampling_generator2 <- function(data, population, batchSize, train_rows){ - function(){ - gc() - rows<-sample(train_rows, batchSize, replace=FALSE) - list(as.array(data[rows,,]), population$y[rows,]) - } - } - - - history <- model %>% keras::fit_generator(sampling_generator2(data,population,batchSize,train_rows), - steps_per_epoch = nrow(population[-val_rows,])/batchSize, - epochs=epochs, - validation_data=list(as.array(data[val_rows,,]), - population$y[val_rows,]), - callbacks=list(earlyStopping,reduceLr), - class_weight=class_weight, - view_metrics=F) - - - # batched prediciton - maxVal <- nrow(population) - batches <- lapply(1:ceiling(maxVal/batchSize), function(x) ((x-1)*batchSize+1):min((x*batchSize),maxVal)) - prediction <- population - prediction$value <- 0 - - if(bayes){ - prediction$epistemicUncertainty <- 0 - prediction$aleatoricUncertainty <- 0 - for(batch in batches){ - MC_samples <- array(0, dim = c(num_MC_samples, length(batch), 2 * output_dim)) - for (k in 1:num_MC_samples){ - MC_samples[k,, ] = stats::predict(model, as.array(plpData[batch,,])) - #keras::predict_proba(model, as.array(plpData[population$rowId[population$indexes==index],,][batch,,])) - } - pred <- apply(MC_samples[,,output_dim], 2, mean) - epistemicUncertainty <- apply(MC_samples[,,output_dim], 2, stats::var) - logVar = MC_samples[, , output_dim * 2] - if(length(dim(logVar)) <= 1){ - aleatoricUncertainty = exp(mean(logVar)) - }else{ - aleatoricUncertainty = exp(colMeans(logVar)) - - } - prediction$value[batch] <- pred - prediction$epistemicUncertainty[batch] = epistemicUncertainty - prediction$aleatoricUncertainty[batch] = aleatoricUncertainty - } - - }else{ - for(batch in batches){ - pred <- keras::predict_on_batch(model, as.array(plpData[batch,,])) - prediction$value[batch] <- pred[,2] - } - - } - prediction$value[prediction$value>1] <- 1 - prediction$value[prediction$value<0] <- 0 - - attr(prediction, "metaData") <- list(predictionType = 
"binary") - auc <- computeAuc(prediction) - foldPerm <- auc - predictionMat <- prediction - - } - - } - result <- list(model=model, - auc=auc, - prediction = predictionMat, - hyperSum = unlist(list(numberOfRNNLayer=numberOfRNNLayer, - units=units, recurrentDropout=recurrentDropout, - layerDropout=layerDropout,lr =lr, decay=decay,outcomeWeight=outcomeWeight, - batchSize = batchSize, epochs= epochs, earlyStoppingMinDelta = earlyStoppingMinDelta, - earlyStoppingPatience=earlyStoppingPatience, - useDeepEnsemble = useDeepEnsemble, - numberOfEnsembleNetwork =numberOfEnsembleNetwork, - useVae = useVae, vaeDataSamplingProportion =vaeDataSamplingProportion ,vaeValidationSplit = vaeValidationSplit, - vaeBatchSize =vaeBatchSize, - vaeLatentDim = vaeLatentDim, - vaeIntermediateDim = vaeIntermediateDim, - vaeEpoch = vaeEpoch, vaeEpislonStd = vaeEpislonStd)) - ) - return(result) - -} - -#function for building vae -buildVae<-function(data, vaeValidationSplit= 0.2, vaeBatchSize = 100L, vaeLatentDim = 10L, vaeIntermediateDim = 256L, - vaeEpoch = 100L, vaeEpislonStd = 1.0, useGPU= FALSE, maxGPUs = NULL, temporal = TRUE){ - if (temporal){ - dataSample <- data %>% - apply(3, as.numeric) - } else{ - dataSample <- data - } - originalDim <- dim(dataSample)[2] - K <- keras::backend() - x <- keras::layer_input (shape =originalDim) - h <- keras::layer_dense (x, vaeIntermediateDim, activation = 'relu') - z_mean <- keras::layer_dense(h, vaeLatentDim) - z_log_var <- keras::layer_dense(h, vaeLatentDim) - - sampling<- function(arg){ - z_mean <- arg[,1:vaeLatentDim] - z_log_var <- arg[, (vaeLatentDim+1):(2*vaeLatentDim)] - - epsilon <- keras::k_random_normal( - shape = c(keras::k_shape(z_mean)[[1]]), - mean = 0., - stddev = vaeEpislonStd - ) - - z_mean + keras::k_exp(z_log_var/2)*epsilon - } - - z <- keras::layer_concatenate(list(z_mean, z_log_var)) %>% - keras::layer_lambda(sampling) - - #we instantiate these layers separately so as to reuse them later - decoder_h <- keras::layer_dense(units = vaeIntermediateDim, activation = 'relu') - decoder_mean <- keras::layer_dense (units = originalDim, activation = 'sigmoid') - h_decoded <- decoder_h (z) - x_decoded_mean <- decoder_mean(h_decoded) - - #end-to-end autoencoder - vae <- keras::keras_model (x,x_decoded_mean) - #encoder, from inputs to latent space - encoder <- keras::keras_model(x, z_mean) - - #generator, from latent space to reconstruted inputs - decoder_input <- keras::layer_input (shape = vaeLatentDim) - h_decoded_2 <- decoder_h(decoder_input) - x_decoded_mean_2 <- decoder_mean(h_decoded_2) - generator <- keras::keras_model (decoder_input, x_decoded_mean_2) - - vae_loss <- function(x, x_decoded_mean){ - xent_loss <- (originalDim/1.0)* keras::loss_binary_crossentropy(x, x_decoded_mean) - k1_loss <- -0.5 * keras::k_mean(1 + z_log_var - keras::k_square(z_mean) - keras::k_exp(z_log_var), axis = -1L) - xent_loss + k1_loss - } - #Activating parallelisation of GPU in encoder - if(useGPU & (maxGPUs>1) ){ - vae <- keras::multi_gpu_model(vae,gpus = maxGPUs) - } - - vae %>% keras::compile (optimizer = "rmsprop", loss = vae_loss) - #if (!is.null(dataValidation)) dataValidation<-list(dataValidation,dataValidation) - vaeEarlyStopping=keras::callback_early_stopping(monitor = "val_loss", patience=5,mode="auto",min_delta = 1e-3) - naanStopping = keras::callback_terminate_on_naan() - csvLogging = keras::callback_csv_logger (filename="./vae.csv",separator = ",", append =TRUE ) - - vae %>% keras::fit ( - dataSample,dataSample - ,shuffle = TRUE - ,epochs = vaeEpoch - ,batch_size = 
vaeBatchSize - #,validation_data = dataValidation - ,validation_split = vaeValidationSplit - ,callbacks = list(vaeEarlyStopping, - csvLogging, - naanStopping) - ) - return (list (vae,encoder)) -} - -#Defining Gaussian Layer for Deep Ensemble -GaussianLayer <- R6::R6Class("GaussianLayer", - inherit = keras::KerasLayer, - - public = list( - output_dim = NULL, - kernel_1 = NULL, - kernel_2 = NULL, - bias_1 = NULL, - bias_2 = NULL, - - initialize = function(output_dim){ - self$output_dim <- output_dim - }, - build = function(input_shape){ - super$build(input_shape) - - self$kernel_1 = self$add_weight(name = 'kernel_1', - shape = list(as.integer(input_shape[[2]]), self$output_dim), #list(30, self$output_dim),#shape = keras::shape(30, self$output_dim), - initializer = keras::initializer_glorot_normal(), - trainable = TRUE) - self$kernel_2 = self$add_weight(name = 'kernel_2', - shape = list(as.integer(input_shape[[2]]), self$output_dim),#list(30, self$output_dim), #shape = keras::shape(30, self$output_dim), - initializer = keras::initializer_glorot_normal(), - trainable = TRUE) - self$bias_1 = self$add_weight(name = 'bias_1', - shape = list(self$output_dim), #shape = keras::shape(self$output_dim), - initializer = keras::initializer_glorot_normal(), - trainable = TRUE) - self$bias_2 = self$add_weight(name = 'bias_2', - shape = list(self$output_dim), #shape = keras::shape(self$output_dim), - initializer = keras::initializer_glorot_normal(), - trainable = TRUE) - }, - - call = function(x, mask = NULL){ - output_mu = keras::k_dot(x, self$kernel_1) + self$bias_1 - output_sig = keras::k_dot(x, self$kernel_2) + self$bias_2 - output_sig_pos = keras::k_log(1 + keras::k_exp(output_sig)) + 1e-06 - return (list(output_mu, output_sig_pos)) - }, - - - compute_output_shape = function(input_shape){ - return (list ( - list(input_shape[[1]], self$output_dim), - list(input_shape[[1]], self$output_dim) ) - ) - } - ) -) - -#define layer wrapper function for Deep Ensemble -layer_custom <- function(object, output_dim, name = NULL, trainable = TRUE) { - keras::create_layer(GaussianLayer, object, list( - output_dim = as.integer(output_dim), - name = name, - trainable = trainable - )) -} - -#Custom loss function for Deep Ensemble -custom_loss <- function(sigma){ - gaussian_loss <- function(y_true,y_pred){ - tensorflow::tf$reduce_mean(0.5*tensorflow::tf$log(sigma) + 0.5*tensorflow::tf$div(tensorflow::tf$square(y_true - y_pred), sigma)) + 1e-6 - } - return(gaussian_loss) -} - -#Create Deep Ensemble Network function -createEnsembleNetwork<-function(train, plpData,population,batchSize,epochs, earlyStoppingPatience, earlyStoppingMinDelta, - train_rows=NULL,index=NULL,lr,decay, - units,recurrentDropout,numberOfRNNLayer,layerDropout, useGPU = useGPU, maxGPUs = maxGPUs){ - - mu <- function(){return(NULL)} - sigma <- function(){return(NULL)} - - if(useGPU){ - ##GRU layer - layerInput <- keras::layer_input(shape = c(dim(plpData)[2],dim(plpData)[3])) - - if(numberOfRNNLayer==1){ - layers <- layerInput %>% keras::layer_cudnn_gru(units=units, #time step x number of features - return_sequences=FALSE) %>% - keras::layer_dropout(layerDropout) %>% - keras::layer_dense(units=2, activation='softmax') - } - if(numberOfRNNLayer==2){ - layers <- layerInput %>% keras::layer_cudnn_gru(units=units, #time step x number of features - return_sequences=TRUE) %>% - keras::layer_dropout(layerDropout) %>% - keras::layer_gru(units=units, return_sequences=FALSE) %>% - keras::layer_dropout(layerDropout) %>% - keras::layer_dense(units=2, 
activation='softmax') - } - if(numberOfRNNLayer==3){ - layers <- layerInput %>% keras::layer_cudnn_gru(units=units, #time step x number of features - return_sequences=TRUE) %>% - keras::layer_dropout(layerDropout) %>% - keras::layer_cudnn_gru(units=units, return_sequences=TRUE) %>% - keras::layer_dropout(layerDropout) %>% - keras::layer_cudnn_gru(units=units, return_sequences=FALSE) %>% - keras::layer_dropout(layerDropout) %>% - keras::layer_dense(units=2, activation='softmax') - } - }else{ - ##GRU layer - layerInput <- keras::layer_input(shape = c(dim(plpData)[2],dim(plpData)[3])) - - if(numberOfRNNLayer==1){ - layers <- layerInput %>% keras::layer_gru(units=units, recurrent_dropout = recurrentDropout, #time step x number of features - return_sequences=FALSE) %>% keras::layer_dropout(layerDropout) %>% - keras::layer_dense(units=2, activation='softmax') - } - if(numberOfRNNLayer==2){ - layers <- layerInput %>% keras::layer_gru(units=units, recurrent_dropout = recurrentDropout, #time step x number of features - return_sequences=TRUE) %>% - keras::layer_gru(units=units, recurrent_dropout = recurrentDropout,return_sequences=FALSE)%>% keras::layer_dropout(layerDropout) %>% - keras::layer_dense(units=2, activation='softmax') - } - if(numberOfRNNLayer==3){ - layers <- layerInput %>% keras::layer_gru(units=units, recurrent_dropout = recurrentDropout, #time step x number of features - return_sequences=TRUE) %>% - keras::layer_gru(units=units, recurrent_dropout = recurrentDropout,return_sequences=TRUE) %>% - keras::layer_gru(units=units, recurrent_dropout = recurrentDropout,return_sequences=FALSE) %>% keras::layer_dropout(layerDropout) %>% - keras::layer_dense(units=2, activation='softmax') - } - } - - c(mu,sigma) %<-% layer_custom(layers, 2, name = 'main_output') - - model <- keras::keras_model(inputs = layerInput,outputs=mu) - - earlyStopping=keras::callback_early_stopping(monitor = "val_loss", patience=earlyStoppingPatience, - mode="auto",min_delta = earlyStoppingMinDelta) - #Currently using Parallel GPU makes errors in Deep Ensemble - #if(useGPU & (maxGPUs>1) ) model <- keras::multi_gpu_model(model,gpus = maxGPUs) - - model %>% keras::compile( - loss = custom_loss(!!sigma), - optimizer = keras::optimizer_rmsprop(lr = lr,decay = decay) - ) - - - if(!is.null(population$indexes) && train==T){ - data <- plpData[population$rowId[population$indexes!=index],,] - #Extract validation set first - 10k people or 5% - valN <- min(10000,sum(population$indexes!=index)*0.05) - val_rows<-sample(1:sum(population$indexes!=index), valN, replace=FALSE) - train_rows <- c(1:sum(population$indexes!=index))[-val_rows] - - sampling_generator<-function(data, population, batchSize, train_rows, index){ - function(){ - gc() - rows<-sample(train_rows, batchSize, replace=FALSE) - - list(as.array(data[rows,,]), population$y[population$indexes!=index,][rows,]) - } - } - - history <- model %>% keras::fit_generator(sampling_generator(data,population,batchSize,train_rows, index), - steps_per_epoch = sum(population$indexes!=index)/batchSize, - epochs=epochs, - validation_data=list(as.array(data[val_rows,,]), - population$y[population$indexes!=index,][val_rows,])#, - ,callbacks=list(earlyStopping) - #callbacks=list(earlyStopping,reduceLr), - ) - - }else{ - data <- plpData[population$rowId,,] - - #Extract validation set first - 10k people or 5% - valN <- min(10000,length(population$indexes)*0.05) - val_rows<-sample(1:length(population$indexes), valN, replace=FALSE) - train_rows <- c(1:length(population$indexes))[-val_rows] - - 
sampling_generator2<-function(data, population, batchSize, train_rows){ - function(){ - gc() - rows<-sample(train_rows, batchSize, replace=FALSE) - list(as.array(data[rows,,]), population$y[rows,]) - } - } - - - history <- model %>% keras::fit_generator(sampling_generator2(data,population,batchSize,train_rows), - steps_per_epoch = nrow(population[-val_rows,])/batchSize, - epochs=epochs, - validation_data=list(as.array(data[val_rows,,]), - population$y[val_rows,]), - callbacks=list(earlyStopping), - view_metrics=F) - - } - #ParallelLogger::logInfo('right before get_intermediate') - layer_name = 'main_output' - get_intermediate = keras::k_function(inputs=list(model$input), - outputs=model$get_layer(layer_name)$output) - #ParallelLogger::logInfo('right after get_intermediate') - return(get_intermediate) -} - -#Custom layer for Bayesian Drop Out Layer -ConcreteDropout <- R6::R6Class("ConcreteDropout", - - inherit = keras::KerasWrapper, - - public = list( - weight_regularizer = NULL, - dropout_regularizer = NULL, - init_min = NULL, - init_max = NULL, - is_mc_dropout = NULL, - supports_masking = TRUE, - p_logit = NULL, - p = NULL, - - initialize = function(weight_regularizer, - dropout_regularizer, - init_min, - init_max, - is_mc_dropout) { - self$weight_regularizer <- weight_regularizer - self$dropout_regularizer <- dropout_regularizer - self$is_mc_dropout <- is_mc_dropout - self$init_min <- keras::k_log(init_min) - keras::k_log(1 - init_min) - self$init_max <- keras::k_log(init_max) - keras::k_log(1 - init_max) - }, - - build = function(input_shape) { - super$build(input_shape) - - self$p_logit <- super$add_weight( - name = "p_logit", - shape = keras::shape(1), - initializer = keras::initializer_random_uniform(self$init_min, self$init_max), - trainable = TRUE - ) - - self$p <- keras::k_sigmoid(self$p_logit) - - input_dim <- input_shape[[2]] - - weight <- private$py_wrapper$layer$kernel - - kernel_regularizer <- self$weight_regularizer * - keras::k_sum(keras::k_square(weight)) / - (1 - self$p) - - dropout_regularizer <- self$p * keras::k_log(self$p) - dropout_regularizer <- dropout_regularizer + - (1 - self$p) * keras::k_log(1 - self$p) - dropout_regularizer <- dropout_regularizer * - self$dropout_regularizer * - keras::k_cast(input_dim, keras::k_floatx()) - - regularizer <- keras::k_sum(kernel_regularizer + dropout_regularizer) - super$add_loss(regularizer) - }, - - concrete_dropout = function(x) { - eps <- keras::k_cast_to_floatx(keras::k_epsilon()) - temp <- 0.1 - - unif_noise <- keras::k_random_uniform(shape = keras::k_shape(x)) - - drop_prob <- keras::k_log(self$p + eps) - - keras::k_log(1 - self$p + eps) + - keras::k_log(unif_noise + eps) - - keras::k_log(1 - unif_noise + eps) - drop_prob <- keras::k_sigmoid(drop_prob / temp) - - random_tensor <- 1 - drop_prob - - retain_prob <- 1 - self$p - x <- x * random_tensor - x <- x / retain_prob - x - }, - - call = function(x, mask = NULL, training = NULL) { - if (self$is_mc_dropout) { - super$call(self$concrete_dropout(x)) - } else { - k_in_train_phase( - function() - super$call(self$concrete_dropout(x)), - super$call(x), - training = training - ) - } - } - ) -) -#define layer wrapper for Bayesian Drop-out layer -layer_concrete_dropout <- function(object, - layer, - weight_regularizer = 1e-6, - dropout_regularizer = 1e-5, - init_min = 0.1, - init_max = 0.1, - is_mc_dropout = TRUE, - name = NULL, - trainable = TRUE) { - keras::create_wrapper(ConcreteDropout, object, list( - layer = layer, - weight_regularizer = weight_regularizer, - 
dropout_regularizer = dropout_regularizer, - init_min = init_min, - init_max = init_max, - is_mc_dropout = is_mc_dropout, - name = name, - trainable = trainable - )) -} diff --git a/R/CNNTorch.R b/R/CNNTorch.R deleted file mode 100644 index b2d021d..0000000 --- a/R/CNNTorch.R +++ /dev/null @@ -1,178 +0,0 @@ -# @file CNNTorch.R -# -# Copyright 2020 Observational Health Data Sciences and Informatics -# -# This file is part of PatientLevelPrediction -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#' Create setting for CNN model with python -#' @param nbfilters The number of filters -#' @param epochs The number of epochs -#' @param seed A seed for the model -#' @param class_weight The class weight used for imbalanced data: -#' 0: Inverse ratio between positives and negatives -#' -1: Focal loss -#' @param type It can be normal 'CNN', 'CNN_LSTM', CNN_MLF' with multiple kernels with different kernel size, -#' 'CNN_MIX', 'ResNet' and 'CNN_MULTI' -#' -#' @examples -#' \dontrun{ -#' model.cnnTorch <- setCNNTorch() -#' } -#' @export -setCNNTorch <- function(nbfilters=c(16, 32), epochs=c(20, 50), seed=0, class_weight = 0, type = 'CNN'){ - - ParallelLogger::logWarn('This model has broken - please use setCNN() or setCNN2() instead ') - - # set seed - if(is.null(seed[1])){ - seed <- as.integer(sample(100000000,1)) - } - - result <- list(model='fitCNNTorch', param=split(expand.grid(nbfilters=nbfilters, - epochs=epochs, seed=seed[1], - class_weight = class_weight, type = type), - 1:(length(nbfilters)*length(epochs)) ), - name='CNN Torch') - - class(result) <- 'modelSettings' - - return(result) -} - - -fitCNNTorch <- function(population, plpData, param, search='grid', quiet=F, - outcomeId, cohortId, ...){ - - # check plpData is libsvm format or convert if needed - if (!FeatureExtraction::isCovariateData(plpData$covariateData)) - stop("Needs correct covariateData") - - if(colnames(population)[ncol(population)]!='indexes'){ - warning('indexes column not present as last column - setting all index to 1') - population$indexes <- rep(1, nrow(population)) - } - - - start <- Sys.time() - - population$rowIdPython <- population$rowId-1 #to account for python/r index difference #subjectId - pPopulation <- as.matrix(population[,c('rowIdPython','outcomeCount','indexes')]) - - result <- toSparseTorchPython(plpData,population, map=NULL, temporal=T) - - outLoc <- createTempModelLoc() - # clear the existing model pickles - for(file in dir(outLoc)) - file.remove(file.path(outLoc,file)) - - # do cross validation to find hyperParameter - hyperParamSel <- lapply(param, function(x) do.call(trainCNNTorch, listAppend(x, - list(plpData = result$data, - population = pPopulation, - train=TRUE, - modelOutput=outLoc)) )) - - hyperSummary <- cbind(do.call(rbind, param), unlist(hyperParamSel)) - - #now train the final model and return coef - bestInd <- which.max(abs(unlist(hyperParamSel)-0.5))[1] - finalModel <- do.call(trainCNNTorch, listAppend(param[[bestInd]], - list(plpData = result$data, - population = pPopulation, - train=FALSE, - 
modelOutput=outLoc))) - - - covariateRef <- as.data.frame(plpData$covariateData$covariateRef) - incs <- rep(1, nrow(covariateRef)) - covariateRef$included <- incs - covariateRef$covariateValue <- rep(0, nrow(covariateRef)) - - modelTrained <- file.path(outLoc) - param.best <- param[[bestInd]] - - comp <- start-Sys.time() - - # train prediction - pred <- as.matrix(finalModel) - pred[,1] <- pred[,1] + 1 # adding one to convert from python to r indexes - colnames(pred) <- c('rowId','outcomeCount','indexes', 'value') - pred <- as.data.frame(pred) - attr(pred, "metaData") <- list(predictionType="binary") - - pred$value <- 1-pred$value - prediction <- merge(population, pred[,c('rowId','value')], by='rowId') - - - # return model location - result <- list(model = modelTrained, - trainCVAuc = -1, # ToDo decide on how to deal with this - hyperParamSearch = hyperSummary, - modelSettings = list(model='fitCNNTorch',modelParameters=param.best), - metaData = plpData$metaData, - populationSettings = attr(population, 'metaData'), - outcomeId=outcomeId, - cohortId=cohortId, - varImp = covariateRef, - trainingTime =comp, - dense=1, - covariateMap=result$map, # I think this is need for new data to map the same? - predictionTrain = prediction - ) - class(result) <- 'plpModel' - attr(result, 'type') <- 'pythonReticulate' - attr(result, 'predictionType') <- 'binary' - - return(result) -} - - -trainCNNTorch <- function(plpData, population, epochs=50, nbfilters = 16, seed=0, class_weight= 0, type = 'CNN', train=TRUE, modelOutput, quiet=F){ - - train_deeptorch <- function(){return(NULL)} - - python_dir <- system.file(package='PatientLevelPrediction','python') - e <- environment() - reticulate::source_python(system.file(package='PatientLevelPrediction','python','deepTorchFunctions.py'), envir = e) - - - result <- train_deeptorch(population = population, - plpData = plpData, - epochs = as.integer(epochs), - nbfilters = as.integer(nbfilters), - seed = as.integer(seed), - class_weight = as.double(class_weight), - model_type = as.character(type), - train = train, - modelOutput = modelOutput, - quiet = quiet - ) - - if(train){ - # then get the prediction - pred <- as.matrix(result) - colnames(pred) <- c('rowId','outcomeCount','indexes', 'value') - pred <- as.data.frame(pred) - attr(pred, "metaData") <- list(predictionType="binary") - - pred$value <- 1-pred$value - auc <- computeAuc(pred) - writeLines(paste0('Model obtained CV AUC of ', auc)) - return(auc) - } - - return(result) - -} diff --git a/R/CovNN.R b/R/CovNN.R deleted file mode 100644 index 4c05ffb..0000000 --- a/R/CovNN.R +++ /dev/null @@ -1,535 +0,0 @@ -# @file CovNN.R -# -# Copyright 2020 Observational Health Data Sciences and Informatics -# -# This file is part of PatientLevelPrediction -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -#' Create setting for multi-resolution CovNN model (stucture based on https://arxiv.org/pdf/1608.00647.pdf CNN1) -#' -#' @param batchSize The number of samples to used in each batch during model training -#' @param outcomeWeight The weight assined to the outcome (make greater than 1 to reduce unballanced label issue) -#' @param lr The learning rate -#' @param decay The decay of the learning rate -#' @param dropout [currently not used] the dropout rate for regularisation -#' @param epochs The number of times data is used to train the model (e.g., epoches=1 means data only used once to train) -#' @param filters The number of columns output by each convolution -#' @param kernelSize The number of time dimensions used for each convolution -#' @param loss The loss function implemented -#' @param seed The random seed -#' -#' @examples -#' \dontrun{ -#' model.CovNN <- setCovNN() -#' } -#' @export -setCovNN <- function(batchSize = 1000, - outcomeWeight=1, - lr=0.00001, - decay=0.000001, - dropout=0, - epochs = 10, - filters = 3, kernelSize = 10, - loss = "binary_crossentropy", - seed=NULL ){ - #[TODO: add input checks...] - - ensure_installed("keras") - - if(!is.null(seed)){ - warning('seed currently not implemented in CovNN') - } - - - result <- list(model='fitCovNN', param=split(expand.grid( - batchSize=batchSize, - outcomeWeight=outcomeWeight, - lr=lr, - decay=decay, - dropout=dropout, - epochs= epochs,filters=filters, - kernelSize=kernelSize,loss =loss, - seed=ifelse(is.null(seed),'NULL', seed)), - 1:(length(batchSize)*length(outcomeWeight)*length(epochs)* - length(filters)*length(lr)*length(decay)* - length(kernelSize)*length(loss)*max(1,length(seed)))), - name='CovNN' - ) - - class(result) <- 'modelSettings' - return(result) -} - - -fitCovNN <- function(plpData,population, param, search='grid', quiet=F, - outcomeId, cohortId, ...){ - # check plpData is coo format: - if (!FeatureExtraction::isCovariateData(plpData$covariateData)) - stop("Needs correct covariateData") - if(is.null(plpData$timeRef)){ - stop('Data not temporal...') - } - - metaData <- attr(population, 'metaData') - if(!is.null(population$indexes)) - population <- population[population$indexes>0,] - attr(population, 'metaData') <- metaData - - start<-Sys.time() - - result<- toSparseM(plpData,population,map=NULL, temporal=T) - - data <- result$data#[population$rowId,,] - #data<-as.array(data) -- cant make dense on big data! 
- - #one-hot encoding - population$y <- keras::to_categorical(population$outcomeCount, 2) - #colnames(population$y) <- c('0','1') - - # do cross validation to find hyperParameter - datas <- list(population=population, plpData=data) - hyperParamSel <- lapply(param, function(x) do.call(trainCovNN, c(x,datas,train=TRUE) )) - - hyperSummary <- cbind(do.call(rbind, lapply(hyperParamSel, function(x) x$hyperSum))) - hyperSummary <- as.data.frame(hyperSummary) - hyperSummary$auc <- unlist(lapply(hyperParamSel, function (x) x$auc)) - hyperParamSel<-unlist(lapply(hyperParamSel, function(x) x$auc)) - - #now train the final model and return coef - bestInd <- which.max(abs(unlist(hyperParamSel)-0.5))[1] - finalModel<-do.call(trainCovNN, c(param[[bestInd]],datas, train=FALSE)) - - covariateRef <- as.data.frame(plpData$covariateData$covariateRef) - incs <- rep(1, nrow(covariateRef)) - covariateRef$included <- incs - covariateRef$covariateValue <- rep(0, nrow(covariateRef)) - - #modelTrained <- file.path(outLoc) - param.best <- param[[bestInd]] - - comp <- start-Sys.time() - - # train prediction - prediction <- finalModel$prediction - finalModel$prediction <- NULL - - # return model location - result <- list(model = finalModel$model, - trainCVAuc = -1, # ToDo decide on how to deal with this - hyperParamSearch = hyperSummary, - modelSettings = list(model='fitCovNN',modelParameters=param.best), - metaData = plpData$metaData, - populationSettings = attr(population, 'metaData'), - outcomeId=outcomeId, - cohortId=cohortId, - varImp = covariateRef, - trainingTime =comp, - covariateMap=result$map, - predictionTrain = prediction - ) - class(result) <- 'plpModel' - attr(result, 'type') <- 'deepMulti' - attr(result, 'inputs') <- '3' - attr(result, 'predictionType') <- 'binary' - - return(result) -} - -trainCovNN<-function(plpData, population, - outcomeWeight=1, lr=0.0001, decay=0.9, - dropout=0.5, filters=3, - kernelSize = dim(plpData)[3], - batchSize, epochs, loss= "binary_crossentropy", - seed=NULL, train=TRUE){ - - if(!is.null(population$indexes) && train==T){ - writeLines(paste('Training covolutional multi-resolution neural network with ',length(unique(population$indexes)),' fold CV')) - - index_vect <- unique(population$indexes) - perform <- c() - - # create prediction matrix to store all predictions - predictionMat <- population - predictionMat$value <- 0 - attr(predictionMat, "metaData") <- list(predictionType = "binary") - - if(kernelSize>dim(plpData)[3]){ - kernelSize <- dim(plpData)[3] -1 - warning('kernelsize reduced') - } - - for(index in 1:length(index_vect )){ - writeLines(paste('Fold ',index, ' -- with ', sum(population$indexes!=index),'train rows')) - - #submodel1 <- keras::keras_model_sequential() - - submodel1_input <- keras::layer_input(shape=c(dim(plpData)[2], dim(plpData)[3]), - name='submodel1_input') - submodel1_output <- submodel1_input %>% - # Begin with 1D convolutional layer - keras::layer_max_pooling_1d(pool_size = kernelSize) %>% - keras::layer_conv_1d( - #input_shape = c(dim(plpData)[2], dim(plpData)[3]+1-kernelSize), - filters = filters, - kernel_size = floor(dim(plpData)[2]/kernelSize), - padding = "valid" - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - keras::layer_flatten() - - #submodel2 <- keras::keras_model_sequential() - #submodel2 %>% - # keras::layer_reshape(input_shape=c(dim(plpData)[2], dim(plpData)[3]), - # target_shape = c(dim(plpData)[2], dim(plpData)[3])) %>% - submodel2_input <- 
keras::layer_input(shape=c(dim(plpData)[2], dim(plpData)[3]), - name='submodel2_input') - submodel2_output <- submodel2_input %>% - # Begin with 1D convolutional layer - keras::layer_max_pooling_1d(pool_size = floor(sqrt(kernelSize))) %>% - keras::layer_conv_1d( - #input_shape = c(dim(plpData)[2], dim(plpData)[3]+1-floor(sqrt(kernelSize))), - filters = filters, - kernel_size = floor(dim(plpData)[2]/floor(sqrt(kernelSize))), - padding = "valid" - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - keras::layer_flatten() - - - #submodel3 <- keras::keras_model_sequential() - #submodel3 %>% - submodel3_input <- keras::layer_input(shape=c(dim(plpData)[2], dim(plpData)[3]), - name='submodel3_input') - submodel3_output <- submodel3_input %>% - # Begin with 1D convolutional layer - keras::layer_conv_1d( - input_shape = c(dim(plpData)[2], dim(plpData)[3]), - filters = filters, - kernel_size = kernelSize, - padding = "valid" - ) %>% - # Normalize the activations of the previous layer - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - keras::layer_max_pooling_1d(pool_size = floor(sqrt(kernelSize))) %>% - keras::layer_conv_1d( - filters = filters, - kernel_size = floor((dim(plpData)[2]+1-kernelSize)/floor(sqrt(kernelSize))), - padding = "valid", - use_bias = T - ) %>% - keras::layer_flatten() - - #model <- keras::keras_model_sequential() - - deep_output <- keras::layer_concatenate(list(submodel1_output, - submodel2_output, - submodel3_output)) %>% - keras::layer_dropout(rate=dropout) %>% - # add fully connected layer 2 - keras::layer_dense( - units = 100, - activation = "linear", use_bias=T - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - - # =========== FULLY CONNECTED LAYER 2 - # add drop out of 0.5 - keras::layer_dropout(rate=dropout) %>% - # add fully connected layer 2 - keras::layer_dense( - units = 100, - activation = "linear", use_bias=T - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - - # =========== FINAL LAYER - keras::layer_dropout(rate=dropout) %>% - keras::layer_dense(name = 'final', - units = 2, - activation = "linear", use_bias=T - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'sigmoid', name='deep_output') - - model <- keras::keras_model( - inputs = c(submodel1_input, submodel2_input, submodel3_input), - outputs = c(deep_output) - ) - - - # Prepare model for training - model %>% keras::compile( - loss = "binary_crossentropy", - metrics = c('accuracy'), - optimizer = keras::optimizer_rmsprop(lr = lr,decay = decay) - ) - earlyStopping=keras::callback_early_stopping(monitor = "val_loss", patience=10,mode="auto",min_delta = 1e-4) - reduceLr=keras::callback_reduce_lr_on_plateau(monitor="val_loss", factor =0.1, - patience = 5,mode = "auto", min_delta = 1e-5, cooldown = 0, min_lr = 0) - - class_weight=list("0"=1,"1"=outcomeWeight) - - data <- plpData[population$rowId[population$indexes!=index],,] - - #Extract validation set first - 10k people or 5% - valN <- min(10000,sum(population$indexes!=index)*0.05) - val_rows<-sample(1:sum(population$indexes!=index), valN, replace=FALSE) - train_rows <- c(1:sum(population$indexes!=index))[-val_rows] - - sampling_generator<-function(data, population, batchSize, train_rows, index){ - function(){ - gc() - rows<-sample(train_rows, batchSize, replace=FALSE) - - list(list(as.array(data[rows,,]), - as.array(data[rows,,]), - 
as.array(data[rows,,])), - population$y[population$indexes!=index,1:2][rows,]) - } - } - - - #print(table(population$y)) - - if(length(train_rows) < batchSize){ - # checking if this fixes issue with batchsize too big - batchSize <- length(train_rows) - ParallelLogger::logInfo('Reduce batchSize to training size') - } - - history <- model %>% keras::fit_generator(sampling_generator(data,population,batchSize,train_rows, index), - steps_per_epoch = length(train_rows)/batchSize, - epochs=epochs, - validation_data=list(list(as.array(data[val_rows,,]), - as.array(data[val_rows,,]), - as.array(data[val_rows,,])), - population$y[population$indexes!=index,1:2][val_rows,]), - callbacks=list(earlyStopping,reduceLr), - class_weight=class_weight) - - - - - # batch prediciton - maxVal <- sum(population$indexes==index) - batches <- lapply(1:ceiling(maxVal/batchSize), function(x) ((x-1)*batchSize+1):min((x*batchSize),maxVal)) - prediction <- population[population$indexes==index,] - prediction$value <- 0 - for(batch in batches){ - pred <- keras::predict_on_batch(model, list(as.array(plpData[population$rowId[population$indexes==index],,][batch,,]), - as.array(plpData[population$rowId[population$indexes==index],,][batch,,]), - as.array(plpData[population$rowId[population$indexes==index],,][batch,,]))) - prediction$value[batch] <- as.double(pred[,2]) - } - - attr(prediction, "metaData") <- list(predictionType = "binary") - aucVal <- computeAuc(prediction) - perform <- c(perform,aucVal) - - # add the fold predictions and compute AUC after loop - predictionMat$value[population$indexes==index] <- prediction$value - - } - - auc <- computeAuc(predictionMat) - foldPerm <- perform - - # Output ---------------------------------------------------------------- - param.val <- paste0('outcomeWeight: ', outcomeWeight, - '-- kernelSize: ',paste0(kernelSize,collapse ='-'), - '-- filters: ', filters, '--loss: ', loss, '-- lr: ', lr, '-- decay: ', decay, - '-- dropout: ', dropout, '-- batchSize: ',batchSize, '-- epochs: ', epochs) - writeLines('==========================================') - writeLines(paste0('CovNN with parameters:', param.val,' obtained an AUC of ',auc)) - writeLines('==========================================') - } else { - - #Initialize model - submodel1_input <- keras::layer_input(shape=c(dim(plpData)[2], dim(plpData)[3]), - name='submodel1_input') - submodel1_output <- submodel1_input %>% - # Begin with 1D convolutional layer - keras::layer_max_pooling_1d(pool_size = kernelSize) %>% - keras::layer_conv_1d( - #input_shape = c(dim(plpData)[2], dim(plpData)[3]+1-kernelSize), - filters = filters, - kernel_size = floor(dim(plpData)[2]/kernelSize), - padding = "valid" - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - keras::layer_flatten() - - submodel2_input <- keras::layer_input(shape=c(dim(plpData)[2], dim(plpData)[3]), - name='submodel2_input') - submodel2_output <- submodel2_input %>% - # Begin with 1D convolutional layer - keras::layer_max_pooling_1d(pool_size = floor(sqrt(kernelSize))) %>% - keras::layer_conv_1d( - #input_shape = c(dim(plpData)[2], dim(plpData)[3]+1-floor(sqrt(kernelSize))), - filters = filters, - kernel_size = floor(dim(plpData)[2]/floor(sqrt(kernelSize))), - padding = "valid" - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - keras::layer_flatten() - - - submodel3_input <- keras::layer_input(shape=c(dim(plpData)[2], dim(plpData)[3]), - name='submodel3_input') - submodel3_output <- 
submodel3_input %>% - # Begin with 1D convolutional layer - keras::layer_conv_1d( - input_shape = c(dim(plpData)[2], dim(plpData)[3]), - filters = filters, - kernel_size = kernelSize, - padding = "valid" - ) %>% - # Normalize the activations of the previous layer - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - keras::layer_max_pooling_1d(pool_size = floor(sqrt(kernelSize))) %>% - keras::layer_conv_1d( - filters = filters, - kernel_size = floor((dim(plpData)[2]+1-kernelSize)/floor(sqrt(kernelSize))), - padding = "valid", - use_bias = T - ) %>% - keras::layer_flatten() - - deep_output <- keras::layer_concatenate(list(submodel1_output, - submodel2_output, - submodel3_output)) %>% - keras::layer_dropout(rate=dropout) %>% - # add fully connected layer 2 - keras::layer_dense( - units = 100, - activation = "linear", use_bias=T - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - - # =========== FULLY CONNECTED LAYER 2 - # add drop out of 0.5 - keras::layer_dropout(rate=dropout) %>% - # add fully connected layer 2 - keras::layer_dense( - units = 100, - activation = "linear", use_bias=T - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - - # =========== FINAL LAYER - keras::layer_dropout(rate=dropout) %>% - keras::layer_dense(name = 'final', - units = 2, - activation = "linear", use_bias=T - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'sigmoid', name='deep_output') - - model <- keras::keras_model( - inputs = c(submodel1_input, submodel2_input, submodel3_input), - outputs = c(deep_output) - ) - - - # Prepare model for training - model %>% keras::compile( - loss = "binary_crossentropy",#loss, - metrics = c('accuracy'), - optimizer = keras::optimizer_rmsprop(lr = lr,decay = decay) - ) - earlyStopping=keras::callback_early_stopping(monitor = "val_loss", patience=10,mode="auto",min_delta = 1e-4) - reduceLr=keras::callback_reduce_lr_on_plateau(monitor="val_loss", factor =0.1, - patience = 5,mode = "auto", min_delta = 1e-5, cooldown = 0, min_lr = 0) - - - class_weight=list("0"=1,"1"=outcomeWeight) - - data <- plpData[population$rowId,,] - - #Extract validation set first - 10k people or 5% - valN <- min(10000,length(population$indexes)*0.05) - val_rows<-sample(1:length(population$indexes), valN, replace=FALSE) - train_rows <- c(1:length(population$indexes))[-val_rows] - - - sampling_generator2<-function(data, population, batchSize, train_rows){ - function(){ - gc() - rows<-sample(train_rows, batchSize, replace=FALSE) - list(list(as.array(data[rows,,]), - as.array(data[rows,,]), - as.array(data[rows,,])), population$y[rows,]) - } - } - - - if(length(train_rows) < batchSize){ - # checking if this fixes issue with batchsize too big - batchSize <- length(train_rows) - ParallelLogger::logInfo('Reduce batchSize to training size') - } - - history <- model %>% keras::fit_generator(sampling_generator2(data,population,batchSize,train_rows), - steps_per_epoch = length(train_rows)/batchSize, - epochs=epochs, - validation_data=list(list(as.array(data[val_rows,,]), - as.array(data[val_rows,,]), - as.array(data[val_rows,,])), - population$y[val_rows,]), - callbacks=list(earlyStopping,reduceLr), - class_weight=class_weight, - view_metrics=F) - - # batched prediciton - maxVal <- nrow(population) - batches <- lapply(1:ceiling(maxVal/batchSize), function(x) ((x-1)*batchSize+1):min((x*batchSize),maxVal)) - prediction <- population - 
prediction$value <- 0 - for(batch in batches){ - pred <- keras::predict_on_batch(model, list(as.array(plpData[batch,,]), - as.array(plpData[batch,,]), - as.array(plpData[batch,,]))) - prediction$value[batch] <- as.double(pred[,2]) - } - - attr(prediction, "metaData") <- list(predictionType = "binary") - auc <- computeAuc(prediction) - foldPerm <- auc - predictionMat <- prediction - } - - result <- list(model=model, - auc=auc, - prediction = predictionMat, - hyperSum = unlist(list(batchSize = batchSize, lr=lr, decay=decay, - outcomeWeight=outcomeWeight, - dropout=dropout,filters=filters, - kernelSize=paste0(kernelSize,collapse ='-'), - epochs=epochs, loss=loss, - fold_auc=foldPerm)) - ) - - return(result) - -} diff --git a/R/CovNN2.R b/R/CovNN2.R deleted file mode 100644 index de7bd45..0000000 --- a/R/CovNN2.R +++ /dev/null @@ -1,461 +0,0 @@ -# @file CovNN2.R -# -# Copyright 2020 Observational Health Data Sciences and Informatics -# -# This file is part of PatientLevelPrediction -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#' Create setting for CovNN2 model - convolution across input and time - https://arxiv.org/pdf/1608.00647.pdf -#' -#' @param batchSize The number of samples to used in each batch during model training -#' @param outcomeWeight The weight assined to the outcome (make greater than 1 to reduce unballanced label issue) -#' @param lr The learning rate -#' @param decay The decay of the learning rate -#' @param dropout [currently not used] the dropout rate for regularisation -#' @param epochs The number of times data is used to train the model (e.g., epoches=1 means data only used once to train) -#' @param filters The number of columns output by each convolution -#' @param kernelSize The number of time dimensions used for each convolution -#' @param loss The loss function implemented -#' @param seed The random seed -#' -#' @examples -#' \dontrun{ -#' model.CovNN <- setCovNN() -#' } -#' @export -setCovNN2 <- function(batchSize = 1000, - outcomeWeight=1, - lr=0.00001, - decay=0.000001, - dropout=0, - epochs = 10, - filters = 3, kernelSize = 10, - loss = "binary_crossentropy", - seed=NULL ){ - #[TODO: add input checks...] 
- - ensure_installed("keras") - - if(!is.null(seed)){ - warning('seed currently not implemented in CovNN') - } - - result <- list(model='fitCovNN2', param=split(expand.grid( - batchSize=batchSize, - outcomeWeight=outcomeWeight, - lr=lr, - decay=decay, - dropout=dropout, - epochs= epochs,filters=filters, - kernelSize=kernelSize,loss =loss, - seed=ifelse(is.null(seed),'NULL', seed)), - 1:(length(batchSize)*length(outcomeWeight)*length(epochs)* - length(filters)*length(lr)*length(decay)* - length(kernelSize)*length(loss)*max(1,length(seed)))), - name='CovNN2' - ) - - class(result) <- 'modelSettings' - return(result) -} - - -fitCovNN2 <- function(plpData,population, param, search='grid', quiet=F, - outcomeId, cohortId, ...){ - # check plpData is coo format: - if (!FeatureExtraction::isCovariateData(plpData$covariateData)) - stop("Needs correct covariateData") - if(is.null(plpData$timeRef)){ - stop('Data not temporal...') - } - - metaData <- attr(population, 'metaData') - if(!is.null(population$indexes)) - population <- population[population$indexes>0,] - attr(population, 'metaData') <- metaData - - start<-Sys.time() - - result<- toSparseM(plpData,population,map=NULL, temporal=T) - - data <- result$data#[population$rowId,,] - #data<-as.array(data) -- cant make dense on big data! - - #one-hot encoding - population$y <- keras::to_categorical(population$outcomeCount, 2) - #colnames(population$y) <- c('0','1') - - # do cross validation to find hyperParameter - datas <- list(population=population, plpData=data) - hyperParamSel <- lapply(param, function(x) do.call(trainCovNN2, c(x,datas,train=TRUE) )) - - hyperSummary <- cbind(do.call(rbind, lapply(hyperParamSel, function(x) x$hyperSum))) - hyperSummary <- as.data.frame(hyperSummary) - hyperSummary$auc <- unlist(lapply(hyperParamSel, function (x) x$auc)) - hyperParamSel<-unlist(lapply(hyperParamSel, function(x) x$auc)) - - #now train the final model and return coef - bestInd <- which.max(abs(unlist(hyperParamSel)-0.5))[1] - finalModel<-do.call(trainCovNN2, c(param[[bestInd]],datas, train=FALSE)) - - covariateRef <- as.data.frame(plpData$covariateData$covariateRef) - incs <- rep(1, nrow(covariateRef)) - covariateRef$included <- incs - covariateRef$covariateValue <- rep(0, nrow(covariateRef)) - - #modelTrained <- file.path(outLoc) - param.best <- param[[bestInd]] - - comp <- start-Sys.time() - - # train prediction - prediction <- finalModel$prediction - finalModel$prediction <- NULL - - # return model location - result <- list(model = finalModel$model, - trainCVAuc = -1, # ToDo decide on how to deal with this - hyperParamSearch = hyperSummary, - modelSettings = list(model='fitCovNN2',modelParameters=param.best), - metaData = plpData$metaData, - populationSettings = attr(population, 'metaData'), - outcomeId=outcomeId, - cohortId=cohortId, - varImp = covariateRef, - trainingTime =comp, - covariateMap=result$map, - predictionTrain = prediction - ) - class(result) <- 'plpModel' - attr(result, 'type') <- 'deep' - attr(result, 'predictionType') <- 'binary' - - return(result) -} - -trainCovNN2<-function(plpData, population, - outcomeWeight=1, lr=0.0001, decay=0.9, - dropout=0.5, filters=3, - kernelSize = dim(plpData)[3], - batchSize, epochs, loss= "binary_crossentropy", - seed=NULL, train=TRUE){ - - if(!is.null(population$indexes) && train==T){ - writeLines(paste('Training covolutional neural network (input and time) with ',length(unique(population$indexes)),' fold CV')) - - index_vect <- unique(population$indexes) - perform <- c() - - # create prediction 
matrix to store all predictions - predictionMat <- population - predictionMat$value <- 0 - attr(predictionMat, "metaData") <- list(predictionType = "binary") - - for(index in 1:length(index_vect )){ - writeLines(paste('Fold ',index, ' -- with ', sum(population$indexes!=index),'train rows')) - - model <- keras::keras_model_sequential() %>% - - ##model_input <- keras::layer_input(shape=c(dim(plpData)[2], dim(plpData)[3]), - ## name='initial_input') %>% - keras::layer_permute(dims = c(2,1), - input_shape = c(dim(plpData)[2], dim(plpData)[3])) %>% - - # Begin with 1D convolutional layer across input - hidden layer 1 - ##model_output <- keras::layer_input(shape=c(dim(plpData)[3], dim(plpData)[2]), - ## name='second_input') %>% - keras::layer_conv_1d( - filters = filters, - kernel_size = dim(plpData)[3]-2, - padding = "valid" - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - # second layer across input - hidden layer 2 time x filters - keras::layer_conv_1d( - filters = filters, - kernel_size = 3,#filters, - padding = "valid" - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - - # permute bact to time x var - keras::layer_permute(dims = c(2,1)) %>% - #max pool over time and conv - keras::layer_max_pooling_1d(pool_size = 2) %>% - keras::layer_conv_1d( - filters = filters, - kernel_size = 2, - padding = "valid" - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - keras::layer_flatten() %>% - # final 2 deep layers with dropout and batchnorm/ relu activation - keras::layer_dropout(rate=dropout) %>% - # add fully connected layer 2 - keras::layer_dense( - units = 100, - activation = "linear", use_bias=T - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - - # =========== FULLY CONNECTED LAYER 2 - # add drop out of 0.5 - keras::layer_dropout(rate=dropout) %>% - # add fully connected layer 2 - keras::layer_dense( - units = 100, - activation = "linear", use_bias=T - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - - # =========== FINAL LAYER - keras::layer_dropout(rate=dropout) %>% - keras::layer_dense(name = 'final', - units = 2, - activation = "linear", use_bias=T - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'sigmoid', name='main_output') - - ##model <- keras::keras_model( - ## inputs = c(model_input), - ## outputs = c(model_output) - ##) - - # Prepare model for training - model %>% keras::compile( - loss = "binary_crossentropy", - metrics = c('accuracy'), - optimizer = keras::optimizer_rmsprop(lr = lr,decay = decay) - ) - earlyStopping=keras::callback_early_stopping(monitor = "val_loss", patience=10,mode="auto",min_delta = 1e-4) - reduceLr=keras::callback_reduce_lr_on_plateau(monitor="val_loss", factor =0.1, - patience = 5,mode = "auto", min_delta = 1e-5, cooldown = 0, min_lr = 0) - - class_weight=list("0"=1,"1"=outcomeWeight) - - data <- plpData[population$rowId[population$indexes!=index],,] - - #Extract validation set first - 10k people or 5% - valN <- min(10000,sum(population$indexes!=index)*0.05) - val_rows<-sample(1:sum(population$indexes!=index), valN, replace=FALSE) - train_rows <- c(1:sum(population$indexes!=index))[-val_rows] - - sampling_generator<-function(data, population, batchSize, train_rows, index){ - function(){ - gc() - rows<-sample(train_rows, batchSize, replace=FALSE) - - 
list(as.array(data[rows,,]), - population$y[population$indexes!=index,1:2][rows,]) - } - } - - - #print(table(population$y)) - if(length(train_rows) < batchSize){ - # checking if this fixes issue with batchsize too big - batchSize <- length(train_rows) - ParallelLogger::logInfo('Reducing batchSize to training size') - } - - history <- model %>% keras::fit_generator(sampling_generator(data,population,batchSize,train_rows, index), - steps_per_epoch = length(train_rows)/batchSize, - epochs=epochs, - validation_data=list(as.array(data[val_rows,,]), - population$y[population$indexes!=index,1:2][val_rows,]), - callbacks=list(earlyStopping,reduceLr), - class_weight=class_weight) - - - - - # batch prediciton - maxVal <- sum(population$indexes==index) - batches <- lapply(1:ceiling(maxVal/batchSize), function(x) ((x-1)*batchSize+1):min((x*batchSize),maxVal)) - prediction <- population[population$indexes==index,] - prediction$value <- 0 - for(batch in batches){ - pred <- keras::predict_on_batch(model, as.array(plpData[population$rowId[population$indexes==index],,][batch,,])) - prediction$value[batch] <- as.double(pred[,2]) - } - - attr(prediction, "metaData") <- list(predictionType = "binary") - aucVal <- computeAuc(prediction) - perform <- c(perform,aucVal) - - # add the fold predictions and compute AUC after loop - predictionMat$value[population$indexes==index] <- prediction$value - - } - - auc <- computeAuc(predictionMat) - foldPerm <- perform - - # Output ---------------------------------------------------------------- - param.val <- paste0('outcomeWeight: ', outcomeWeight, - '-- kernelSize: ',paste0(kernelSize,collapse ='-'), - '-- filters: ', filters, '--loss: ', loss, '-- lr: ', lr, '-- decay: ', decay, - '-- dropout: ', dropout, '-- batchSize: ',batchSize, '-- epochs: ', epochs) - writeLines('==========================================') - writeLines(paste0('CovNN with parameters:', param.val,' obtained an AUC of ',auc)) - writeLines('==========================================') - } else { - - model <- keras::keras_model_sequential() %>% - - keras::layer_permute(dims = c(2,1), - input_shape = c(dim(plpData)[2], dim(plpData)[3])) %>% - - # Begin with 1D convolutional layer across input - hidden layer 1 - keras::layer_conv_1d( - filters = filters, - kernel_size = dim(plpData)[3]-2, - padding = "valid" - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - # second layer across input - hidden layer 2 time x filters - keras::layer_conv_1d( - filters = filters, - kernel_size = 3, - padding = "valid" - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - - # permute bact to time x var - keras::layer_permute(dims = c(2,1)) %>% - #max pool over time and conv - keras::layer_max_pooling_1d(pool_size = 2) %>% - keras::layer_conv_1d( - filters = filters, - kernel_size = 2, - padding = "valid" - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - keras::layer_flatten() %>% - # final 2 deep layers with dropout and batchnorm/ relu activation - keras::layer_dropout(rate=dropout) %>% - # add fully connected layer 2 - keras::layer_dense( - units = 100, - activation = "linear", use_bias=T - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - - # =========== FULLY CONNECTED LAYER 2 - # add drop out of 0.5 - keras::layer_dropout(rate=dropout) %>% - # add fully connected layer 2 - keras::layer_dense( - units = 100, - activation 
= "linear", use_bias=T - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'relu') %>% - - # =========== FINAL LAYER - keras::layer_dropout(rate=dropout) %>% - keras::layer_dense(name = 'final', - units = 2, - activation = "linear", use_bias=T - ) %>% - keras::layer_batch_normalization() %>% - keras::layer_activation(activation = 'sigmoid', name='main_output') - - - - # Prepare model for training - model %>% keras::compile( - loss = "binary_crossentropy",#loss, - metrics = c('accuracy'), - optimizer = keras::optimizer_rmsprop(lr = lr,decay = decay) - ) - earlyStopping=keras::callback_early_stopping(monitor = "val_loss", patience=10,mode="auto",min_delta = 1e-4) - reduceLr=keras::callback_reduce_lr_on_plateau(monitor="val_loss", factor =0.1, - patience = 5,mode = "auto", min_delta = 1e-5, cooldown = 0, min_lr = 0) - - - class_weight=list("0"=1,"1"=outcomeWeight) - - data <- plpData[population$rowId,,] - - #Extract validation set first - 10k people or 5% - valN <- min(10000,length(population$indexes)*0.05) - val_rows<-sample(1:length(population$indexes), valN, replace=FALSE) - train_rows <- c(1:length(population$indexes))[-val_rows] - - - sampling_generator2<-function(data, population, batchSize, train_rows){ - function(){ - gc() - rows<-sample(train_rows, batchSize, replace=FALSE) - list(as.array(data[rows,,]), population$y[rows,]) - } - } - - if(length(train_rows) < batchSize){ - # checking if this fixes issue with batchsize too big - batchSize <- length(train_rows) - ParallelLogger::logInfo('Reducing batchSize to training size') - } - - - history <- model %>% keras::fit_generator(sampling_generator2(data,population,batchSize,train_rows), - steps_per_epoch = length(train_rows)/batchSize, - epochs=epochs, - validation_data=list(as.array(data[val_rows,,]), - population$y[val_rows,]), - callbacks=list(earlyStopping,reduceLr), - class_weight=class_weight, - view_metrics=F) - - # batched prediciton - maxVal <- nrow(population) - batches <- lapply(1:ceiling(maxVal/batchSize), function(x) ((x-1)*batchSize+1):min((x*batchSize),maxVal)) - prediction <- population - prediction$value <- 0 - for(batch in batches){ - pred <- keras::predict_on_batch(model, as.array(plpData[batch,,])) - prediction$value[batch] <- as.double(pred[,2]) - } - - attr(prediction, "metaData") <- list(predictionType = "binary") - auc <- computeAuc(prediction) - foldPerm <- auc - predictionMat <- prediction - } - - result <- list(model=model, - auc=auc, - prediction = predictionMat, - hyperSum = unlist(list(batchSize = batchSize, lr=lr, decay=decay, - outcomeWeight=outcomeWeight, - dropout=dropout,filters=filters, - kernelSize=paste0(kernelSize,collapse ='-'), - epochs=epochs, loss=loss, - fold_auc=foldPerm)) - ) - - return(result) - -} diff --git a/R/Dataset.R b/R/Dataset.R new file mode 100644 index 0000000..73464d1 --- /dev/null +++ b/R/Dataset.R @@ -0,0 +1,120 @@ +#' A torch dataset +#' @import data.table +#' @export +Dataset <- torch::dataset( + name = "myDataset", + #' @param data a dataframe like object with the covariates + #' @param labels a dataframe with the labels + #' @param numericalIndex in what column numeric data is in (if any) + initialize = function(data, labels = NULL, numericalIndex = NULL) { + # determine numeric + if (is.null(numericalIndex)) { + numericalIndex <- data %>% + dplyr::group_by(columnId) %>% + dplyr::collect() %>% + dplyr::summarise(n = dplyr::n_distinct(.data$covariateValue)) %>% + dplyr::pull(n) > 1 + self$numericalIndex <- numericalIndex + } else { + 
self$numericalIndex <- NULL + } + + # add labels if training (make 0 vector for prediction) + if (!is.null(labels)) { + self$target <- torch::torch_tensor(labels) + } else { + self$target <- torch::torch_tensor(rep(0, data %>% dplyr::distinct(rowId) + %>% dplyr::collect() %>% nrow())) + } + # Weight to add in loss function to positive class + self$posWeight <- (self$target == 0)$sum() / self$target$sum() + + # add features + catColumns <- which(!numericalIndex) + dataCat <- dplyr::filter(data, columnId %in% catColumns) %>% + dplyr::arrange(columnId) %>% + dplyr::group_by(columnId) %>% + dplyr::collect() %>% + dplyr::mutate(newColumnId = dplyr::cur_group_id()) %>% + dplyr::ungroup() %>% + dplyr::select(c("rowId", "newColumnId")) %>% + dplyr::rename(columnId = newColumnId) + # the fastest way I found so far to convert data using data.table + # 1.5 min for 100k rows :( + dt <- data.table::data.table(rows = dataCat$rowId, cols = dataCat$columnId) + maxFeatures <- max(dt[, .N, by = rows][, N]) + start <- Sys.time() + tensorList <- lapply(1:max(data %>% dplyr::pull(rowId)), function(x) { + torch::torch_tensor(dt[rows == x, cols]) + }) + self$lengths <- lengths + self$cat <- torch::nn_utils_rnn_pad_sequence(tensorList, batch_first = T) + delta <- Sys.time() - start + ParallelLogger::logInfo("Data conversion for dataset took ", signif(delta, 3), " ", attr(delta, "units")) + if (sum(numericalIndex) == 0) { + self$num <- NULL + } else { + numericalData <- data %>% + dplyr::filter(columnId %in% !!which(numericalIndex)) %>% + dplyr::collect() + numericalData <- numericalData %>% + dplyr::group_by(columnId) %>% + dplyr::mutate(newId = dplyr::cur_group_id()) + indices <- torch::torch_tensor(cbind(numericalData$rowId, numericalData$newId), dtype = torch::torch_long())$t_() + values <- torch::torch_tensor(numericalData$covariateValue, dtype = torch::torch_float32()) + self$num <- torch::torch_sparse_coo_tensor( + indices = indices, + values = values, + size = c(self$target$shape, sum(numericalIndex)) + )$to_dense() + } + }, + getNumericalIndex = function() { + return( + self$numericalIndex + ) + }, + numCatFeatures = function() { + return( + sum(!self$numericalIndex) + ) + }, + numNumFeatures = function() { + if (!is.null(self$num)) { + return(self$num$shape[2]) + } else { + return(0) + } + }, + .getbatch = function(item) { + if (length(item) == 1) { + return(self$.getBatchSingle(item)) + } else { + return(self$.getBatchRegular(item)) + } + }, + .getBatchSingle = function(item) { + # add leading singleton dimension since models expects 2d tensors + batch <- list( + cat = self$cat[item]$unsqueeze(1), + num = self$num[item]$unsqueeze(1) + ) + return(list( + batch = batch, + target = self$target[item]$unsqueeze(1) + )) + }, + .getBatchRegular = function(item) { + batch <- list( + cat = self$cat[item], + num = self$num[item] + ) + return(list( + batch = batch, + target = self$target[item] + )) + }, + .length = function() { + self$target$size()[[1]] # shape[1] + } +) diff --git a/R/DeepNN.R b/R/DeepNN.R deleted file mode 100644 index 14c57b0..0000000 --- a/R/DeepNN.R +++ /dev/null @@ -1,557 +0,0 @@ -# @file DeepNN.R -# -# Copyright 2020 Observational Health Data Sciences and Informatics -# -# This file is part of PatientLevelPrediction -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#' Create setting for DeepNN model -#' -#' @param units The number of units of the deep network - as a list of vectors -#' @param layer_dropout The layer dropout rate (regularisation) -#' @param lr Learning rate -#' @param decay Learning rate decay over each update. -#' @param outcome_weight The weight of the outcome class in the loss function -#' @param batch_size The number of data points to use per training batch -#' @param epochs Number of times to iterate over dataset -#' @param seed Random seed used by deep learning model -#' -#' @examples -#' \dontrun{ -#' model <- setDeepNN() -#' } -#' @export -setDeepNN <- function(units=list(c(128, 64), 128), layer_dropout=c(0.2), - lr =c(1e-4), decay=c(1e-5), outcome_weight = c(1.0), batch_size = c(100), - epochs= c(100), seed=NULL ){ - - ensure_installed("keras") - - # if(class(indexFolder)!='character') - # stop('IndexFolder must be a character') - # if(length(indexFolder)>1) - # stop('IndexFolder must be one') - # - # if(class(units)!='numeric') - # stop('units must be a numeric value >0 ') - # if(units<1) - # stop('units must be a numeric value >0 ') - # - # #if(length(units)>1) - # # stop('units can only be a single value') - # - # if(class(recurrent_dropout)!='numeric') - # stop('dropout must be a numeric value >=0 and <1') - # if( (recurrent_dropout<0) | (recurrent_dropout>=1)) - # stop('dropout must be a numeric value >=0 and <1') - # if(class(layer_dropout)!='numeric') - # stop('layer_dropout must be a numeric value >=0 and <1') - # if( (layer_dropout<0) | (layer_dropout>=1)) - # stop('layer_dropout must be a numeric value >=0 and <1') - # if(class(lr)!='numeric') - # stop('lr must be a numeric value >0') - # if(lr<=0) - # stop('lr must be a numeric value >0') - # if(class(decay)!='numeric') - # stop('decay must be a numeric value >=0') - # if(decay<=0) - # stop('decay must be a numeric value >=0') - # if(class(outcome_weight)!='numeric') - # stop('outcome_weight must be a numeric value >=0') - # if(outcome_weight<=0) - # stop('outcome_weight must be a numeric value >=0') - # if(class(batch_size)!='numeric') - # stop('batch_size must be an integer') - # if(batch_size%%1!=0) - # stop('batch_size must be an integer') - # if(class(epochs)!='numeric') - # stop('epochs must be an integer') - # if(epochs%%1!=0) - # stop('epochs must be an integer') - # if(!class(seed)%in%c('numeric','NULL')) - # stop('Invalid seed') - #if(class(UsetidyCovariateData)!='logical') - # stop('UsetidyCovariateData must be an TRUE or FALSE') - - param <- expand.grid(units=units, - layer_dropout=layer_dropout, - lr =lr, decay=decay, outcome_weight=outcome_weight,epochs= epochs, - seed=ifelse(is.null(seed),'NULL', seed)) - param$units1=unlist(lapply(param$units, function(x) x[1])) - param$units2=unlist(lapply(param$units, function(x) x[2])) - param$units3=unlist(lapply(param$units, function(x) x[3])) - - result <- list(model='fitDeepNN', param=split(param, - 1:(length(units)*length(layer_dropout)*length(lr)*length(decay)*length(outcome_weight)*length(epochs)*max(1,length(seed)))), - name='DeepNN' - ) - - class(result) <- 'modelSettings' - return(result) -} 
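The new torch Dataset added in R/Dataset.R above converts a long-format covariate table (rowId, columnId, covariateValue) into a padded tensor of categorical feature indices plus a dense tensor of numeric features, inferring which columns are numeric from the number of distinct covariate values. The sketch below is illustrative only and is not part of this changeset: the toy covariates and labels are invented, and the assumption that labels is a plain numeric outcome vector (suggested by torch::torch_tensor(labels) in the code, despite the roxygen saying "a dataframe") is called out explicitly.

library(DeepPatientLevelPrediction)

# Toy long-format covariate table in the shape Dataset() expects:
# columns 1 and 2 only ever take the value 1, so the automatic detection
# treats them as categorical; column 3 has two distinct values and is
# treated as numeric.
covariates <- data.frame(
  rowId          = c(1, 1, 2, 3, 1, 3),
  columnId       = c(1, 2, 1, 1, 3, 3),
  covariateValue = c(1, 1, 1, 1, 0.5, 2.0)
)
labels <- c(0, 1, 0)  # assumed: one binary outcome per rowId

ds <- Dataset(data = covariates, labels = labels)

ds$.length()                # 3 rows/persons
ds$numCatFeatures()         # 2 categorical covariates
ds$numNumFeatures()         # 1 numeric covariate
batch <- ds$.getbatch(1:2)  # list(batch = list(cat, num), target)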
- - -fitDeepNN <- function(plpData,population, param, search='grid', quiet=F, - outcomeId, cohortId, ...){ - # check plpData is coo format: - if (!FeatureExtraction::isCovariateData(plpData$covariateData)){ - stop('DeepNN requires correct covariateData') - } - if(!is.null(plpData$timeRef)){ - warning('Data temporal but deepNN uses non-temporal data...') - } - - metaData <- attr(population, 'metaData') - if(!is.null(population$indexes)) - population <- population[population$indexes>0,] - attr(population, 'metaData') <- metaData - - start<-Sys.time() - - result<- toSparseM(plpData,population,map=NULL, temporal=F) - data <- result$data - - #one-hot encoding - population$y <- keras::to_categorical(population$outcomeCount, length(unique(population$outcomeCount))) - - # do cross validation to find hyperParameter - datas <- list(population=population, plpData=data) - hyperParamSel <- lapply(param, function(x) do.call(trainDeepNN, c(x,datas,train=TRUE) )) - - hyperSummary <- cbind(do.call(rbind, lapply(hyperParamSel, function(x) x$hyperSum))) - hyperSummary <- as.data.frame(hyperSummary) - hyperSummary$auc <- unlist(lapply(hyperParamSel, function (x) x$auc)) - hyperParamSel<-unlist(lapply(hyperParamSel, function(x) x$auc)) - - #now train the final model and return coef - bestInd <- which.max(abs(unlist(hyperParamSel)-0.5))[1] - finalModel<-do.call(trainDeepNN, c(param[[bestInd]],datas, train=FALSE)) - - covariateRef <- as.data.frame(plpData$covariateData$covariateRef) - incs <- rep(1, nrow(covariateRef)) - covariateRef$included <- incs - covariateRef$covariateValue <- rep(0, nrow(covariateRef)) - - #modelTrained <- file.path(outLoc) - param.best <- param[[bestInd]] - - comp <- start-Sys.time() - - # train prediction - prediction <- finalModel$prediction - finalModel$prediction <- NULL - - # return model location - result <- list(model = finalModel$model, - trainCVAuc = -1, # ToDo decide on how to deal with this - hyperParamSearch = hyperSummary, - modelSettings = list(model='fitDeepNN',modelParameters=param.best), - metaData = plpData$metaData, - populationSettings = attr(population, 'metaData'), - outcomeId=outcomeId, - cohortId=cohortId, - varImp = covariateRef, - trainingTime =comp, - covariateMap=result$map, - predictionTrain = prediction - ) - class(result) <- 'plpModel' - attr(result, 'type') <- 'deep' - attr(result, 'predictionType') <- 'binary' - - return(result) -} - -trainDeepNN<-function(plpData, population, - units1=128, units2= NULL, units3=NULL, - layer_dropout=0.2, - lr =1e-4, decay=1e-5, outcome_weight = 1.0, batch_size = 100, - epochs= 100, seed=NULL, train=TRUE, ...){ - - ParallelLogger::logInfo(paste('Training deep neural network with ',length(unique(population$indexes)),' fold CV')) - if(!is.null(population$indexes) && train==T){ - index_vect <- unique(population$indexes) - perform <- c() - - # create prediction matrix to store all predictions - predictionMat <- population - predictionMat$value <- 0 - attr(predictionMat, "metaData") <- list(predictionType = "binary") - - for(index in 1:length(index_vect )){ - ParallelLogger::logInfo(paste('Fold ',index, ' -- with ', sum(population$indexes!=index),'train rows')) - - model <- keras::keras_model_sequential() - - if(is.na(units2)){ - model %>% - keras::layer_dense(units=units1, #activation='identify', - input_shape=ncol(plpData)) %>% - keras::layer_dropout(layer_dropout) %>% - keras::layer_dense(units=2, activation='sigmoid', use_bias = T) - } else if(is.na(units3)){ - model %>% - keras::layer_dense(units=units1, 
#activation='identify', - input_shape=ncol(plpData)) %>% - keras::layer_dropout(layer_dropout) %>% - keras::layer_dense(units=units2 #,activation='identify' - ) %>% - keras::layer_dropout(layer_dropout) %>% - keras::layer_dense(units=2, activation='sigmoid', use_bias = T) - } else{ - model %>% - keras::layer_dense(units=units1, #activation='identify', - input_shape=ncol(plpData)) %>% - keras::layer_dropout(layer_dropout) %>% - keras::layer_dense(units=units2 #,activation='identify' - ) %>% - keras::layer_dropout(layer_dropout) %>% - keras::layer_dense(units=units3 #,activation='identify' - ) %>% - keras::layer_dropout(layer_dropout) %>% - keras::layer_dense(units=2, activation='sigmoid', use_bias = T) - } - - - # Prepare model for training - model %>% keras::compile( - loss = "binary_crossentropy", - metrics = c('accuracy'), - optimizer = keras::optimizer_rmsprop(lr = lr,decay = decay) - ) - earlyStopping=keras::callback_early_stopping(monitor = "val_loss", patience=10,mode="auto",min_delta = 1e-4) - reduceLr=keras::callback_reduce_lr_on_plateau(monitor="val_loss", factor =0.1, - patience = 5,mode = "auto", min_delta = 1e-5, cooldown = 0, min_lr = 0) - - class_weight=list("0"=1,"1"=outcome_weight) - - data <- plpData[population$rowId[population$indexes!=index],] - - #Extract validation set first - 10k people or 5% - valN <- min(10000,sum(population$indexes!=index)*0.05) - val_rows<-sample(1:sum(population$indexes!=index), valN, replace=FALSE) - train_rows <- c(1:sum(population$indexes!=index))[-val_rows] - - sampling_generator<-function(data, population, batch_size, train_rows, index){ - function(){ - gc() - rows<-sample(train_rows, batch_size, replace=FALSE) - - list(as.array(data[rows,]), - population$y[population$indexes!=index,1:2][rows,]) - } - } - - - #print(table(population$y)) - - history <- model %>% keras::fit_generator(sampling_generator(data,population,batch_size,train_rows, index), - steps_per_epoch = floor(sum(population$indexes!=index)/batch_size), - epochs=epochs, - validation_data=list(as.array(data[val_rows,]), - population$y[population$indexes!=index,1:2][val_rows,]), - callbacks=list(earlyStopping,reduceLr), - class_weight=class_weight) - - - # batch prediciton - maxVal <- sum(population$indexes==index) - batches <- lapply(1:ceiling(maxVal/batch_size), function(x) ((x-1)*batch_size+1):min((x*batch_size),maxVal)) - prediction <- population[population$indexes==index,] - prediction$value <- 0 - for(batch in batches){ - pred <- keras::predict_proba(model, as.array(plpData[population$rowId[population$indexes==index],][batch,,drop=FALSE])) - prediction$value[batch] <- pred[,2] - } - - attr(prediction, "metaData") <- list(predictionType = "binary") - aucVal <- computeAuc(prediction) - perform <- c(perform,aucVal) - - # add the fold predictions and compute AUC after loop - predictionMat$value[population$indexes==index] <- prediction$value - - } - - auc <- computeAuc(predictionMat) - foldPerm <- perform - - # Output ---------------------------------------------------------------- - param.val <- paste0('units1: ',units1,'units2: ',units2,'units3: ',units3, - 'layer_dropout: ',layer_dropout,'-- lr: ', lr, - '-- decay: ', decay, '-- batch_size: ',batch_size, '-- epochs: ', epochs) - ParallelLogger::logInfo('==========================================') - ParallelLogger::logInfo(paste0('DeepNN with parameters:', param.val,' obtained an AUC of ',auc)) - ParallelLogger::logInfo('==========================================') - - } else { - - model <- keras::keras_model_sequential() - 
if(is.na(units2)){ - model %>% - keras::layer_dense(units=units1, #activation='identify', - input_shape=ncol(plpData)) %>% - keras::layer_dropout(layer_dropout) %>% - keras::layer_dense(units=2, activation='sigmoid', use_bias = T) - } else if(is.na(units3)){ - model %>% - keras::layer_dense(units=units1, #activation='identify', - input_shape=ncol(plpData)) %>% - keras::layer_dropout(layer_dropout) %>% - keras::layer_dense(units=units2 #,activation='identify' - ) %>% - keras::layer_dropout(layer_dropout) %>% - keras::layer_dense(units=2, activation='sigmoid', use_bias = T) - } else{ - model %>% - keras::layer_dense(units=units1, #activation='identify', - input_shape=ncol(plpData)) %>% - keras::layer_dropout(layer_dropout) %>% - keras::layer_dense(units=units2 #,activation='identify' - ) %>% - keras::layer_dropout(layer_dropout) %>% - keras::layer_dense(units=units3 #,activation='identify' - ) %>% - keras::layer_dropout(layer_dropout) %>% - keras::layer_dense(units=2, activation='sigmoid', use_bias = T) - } - - # Prepare model for training - model %>% keras::compile( - loss = "binary_crossentropy", - metrics = c('accuracy'), - optimizer = keras::optimizer_rmsprop(lr = lr,decay = decay) - ) - earlyStopping=keras::callback_early_stopping(monitor = "val_loss", patience=10,mode="auto",min_delta = 1e-4) - reduceLr=keras::callback_reduce_lr_on_plateau(monitor="val_loss", factor =0.1, - patience = 5,mode = "auto", min_delta = 1e-5, cooldown = 0, min_lr = 0) - - class_weight=list("0"=1,"1"=outcome_weight) - - #Extract validation set first - 10k people or 5% - valN <- min(10000,nrow(population)*0.05) - val_rows<-sample(1:nrow(population), valN, replace=FALSE) - train_rows <- c(1:nrow(population))[-val_rows] - - sampling_generator2<-function(data, population, batch_size, train_rows){ - function(){ - gc() - rows<-sample(train_rows, batch_size, replace=FALSE) - - list(as.array(data[rows,,]), - population$y[,1:2][rows,]) - } - } - - history <- model %>% keras::fit_generator(sampling_generator2(plpData,population,batch_size,train_rows), - steps_per_epoch = nrow(population)/batch_size, - epochs=epochs, - validation_data=list(as.array(plpData[val_rows,,]), - population$y[val_rows,1:2]), - callbacks=list(earlyStopping,reduceLr), - class_weight=class_weight) - - - # batch prediciton - maxVal <- nrow(population) - batches <- lapply(1:ceiling(maxVal/batch_size), function(x) ((x-1)*batch_size+1):min((x*batch_size),maxVal)) - prediction <- population - prediction$value <- 0 - for(batch in batches){ - pred <- keras::predict_proba(model, as.array(plpData[batch,,drop=FALSE])) - prediction$value[batch] <- pred[,2] - } - - attr(prediction, "metaData") <- list(predictionType = "binary") - auc <- computeAuc(prediction) - foldPerm <- auc - predictionMat <- prediction - } - - result <- list(model=model, - auc=auc, - prediction = predictionMat, - hyperSum = unlist(list(units1=units1,units2=units2,units3=units3, - layer_dropout=layer_dropout,lr =lr, decay=decay, - batch_size = batch_size, epochs= epochs)) - ) - return(result) - -} - - - -#' [Under development] Transfer learning -#' -#' @param plpResult The plp result when training a kersa deep learning model on big data -#' @param plpData The new data to fine tune the model on -#' @param population The population for the new data -#' @param fixLayers boolean specificying whether to fix weights in model being transferred -#' @param includeTop If TRUE the final layer of the model being transferred is removed -#' @param addLayers vector specifying nodes in each layer to add 
e.g. c(100,10) will add another layer with 100 nodels and then a final layer with 10 -#' @param layerDropout Add dropout to each new layer (binary vector length of addLayers) -#' @param layerActivation Activation function for each new layer (string vector length of addLayers) -#' @param outcomeWeight The weight to assign the class 1 when training the model -#' @param batchSize Size of each batch for updating layers -#' @param epochs Number of epoches to run -#' @examples -#' \dontrun{ -#' modelSet <- setDeepNN() -#' plpResult <- runPlp(plpData, population, modelSettings = modelSet, ...) -#' -#' transferLearning(...) -#' } -#' @export -transferLearning <- function(plpResult, - plpData, - population, - fixLayers = T, - includeTop= F, - addLayers = c(100,10), - layerDropout = c(T,T), - layerActivation = c('relu','softmax'), - outcomeWeight = 1, - batchSize = 10000, - epochs=20){ - - # checks - if(!is.null(addLayers)){ - if(length(addLayers)!=length(layerDropout)){ - stop('Layer vector not same length as layer dropout vector') - } - if(length(addLayers)!=length(layerActivation)){ - stop('Layer vector not same length as layer activation vector') - } - } - if(batchSize > nrow(population)){ - warning('batchSize is too big for your data...') - batchSize = nrow(population)/10 - } - - if(!includeTop){ - # remove last layer if not dropout - if(length(grep('Dropout',as.character(plpResult$model$model$layers[[length(plpResult$model$model$layers)]])))==0){ - keras::pop_layer(plpResult$model$model) - } - } - - # create the base pre-trained - base_model <- plpResult$model$model - - # add our custom layers - predictions <- base_model$output - - - # fix the older layers - if(fixLayers){ - for (i in 1:length(base_model$layers)){ - try({base_model$layers[[i]]$trainable <- F}, silent = TRUE) - } - } - - ## add loop over settings here - move code to new function and call it - ##==== - # !!check this looping logic works - if(!is.null(addLayers)){ - for(i in 1:length(addLayers)){ - predictions <- keras::layer_dense(predictions,units = addLayers[i], activation = layerActivation[i]) - if(layerDropout[i]){ - predictions <- keras::layer_dropout(predictions, rate = 0.5) - } - } - } - - # add find layer for binary outcome - predictions <- keras::layer_dense(predictions,units = 2, activation = 'sigmoid') - - - # this is the model we will train - model <- keras::keras_model(inputs = base_model$input, outputs = predictions) - - # compile the model (should be done *after* setting layers to non-trainable) - model %>% keras::compile(optimizer = 'rmsprop', loss = 'binary_crossentropy', - metrics = c('accuracy')) - - - # make this input... 
- earlyStopping=keras::callback_early_stopping(monitor = "val_loss", patience=10,mode="auto",min_delta = 1e-4) - reduceLr=keras::callback_reduce_lr_on_plateau(monitor="val_loss", factor =0.1, - patience = 5,mode = "auto", min_delta = 1e-5, cooldown = 0, - min_lr = 0) - - class_weight=list("0"=1,"1"=outcomeWeight) - - - sampling_generator<-function(data, population, batchSize, train_rows){ - function(){ - gc() - rows<-sample(train_rows, batchSize, replace=FALSE) - - list(as.array(data[rows,]), - population$y[rows,1:2]) - } - } - - # convert plpdata to matrix: - metaData <- attr(population, 'metaData') - if(!is.null(population$indexes)) - population <- population[population$indexes>0,] - attr(population, 'metaData') <- metaData - - result<- toSparseM(plpData,population,map=plpResult$model$covariateMap, temporal=F) - data <- result$data - population$y <- keras::to_categorical(population$outcomeCount, length(unique(population$outcomeCount))) - - #Extract validation set first - 10k people or 5% - valN <- min(10000,nrow(population)*0.05) - val_rows<-sample(1:nrow(population), valN, replace=FALSE) - train_rows <- c(1:nrow(population))[-val_rows] - - history <- model %>% keras::fit_generator(sampling_generator(data,population,batchSize,train_rows), - steps_per_epoch = nrow(population)/batchSize, - epochs=epochs, - validation_data=list(as.array(data[val_rows,,]), - population$y[val_rows,1:2]), - callbacks=list(earlyStopping,reduceLr), - class_weight=class_weight) - - - # batch prediciton - maxVal <- nrow(population) - batches <- lapply(1:ceiling(maxVal/batchSize), function(x) ((x-1)*batchSize+1):min((x*batchSize),maxVal)) - prediction <- population - prediction$value <- 0 - for(batch in batches){ - pred <- model$predict(as.array(data[batch,,drop=FALSE])) # added drop=FALSE - prediction$value[batch] <- pred[,2] - } - - ##=== - - attr(prediction, "metaData") <- list(predictionType = "binary") - auc <- computeAuc(prediction) - foldPerm <- auc - predictionMat <- prediction - -result <- list(model=model, - auc=auc, - prediction = predictionMat, - hyperSum = list(fixLayers = fixLayers, - addLayers = addLayers, - layerDropout = layerDropout, - layerActivation = layerActivation)) - - return(result) - -} diff --git a/R/DeepPatientLevelPrediction.R b/R/DeepPatientLevelPrediction.R new file mode 100644 index 0000000..f6f5a19 --- /dev/null +++ b/R/DeepPatientLevelPrediction.R @@ -0,0 +1,28 @@ +# @file DeepPatientLevelPrediction.R +# +# Copyright 2022 Observational Health Data Sciences and Informatics +# +# This file is part of DeepPatientLevelPrediction +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +#' DeepPatientLevelPrediction +#' +#' @description A package containing deep learning extensions for developing prediction models using data in the OMOP CDM +#' +#' @docType package +#' @name DeepPatientLevelPrediction +#' @importFrom dplyr %>% +#' @importFrom data.table := +#' @importFrom rlang .data +NULL diff --git a/R/Estimator.R b/R/Estimator.R new file mode 100644 index 0000000..8abb048 --- /dev/null +++ b/R/Estimator.R @@ -0,0 +1,736 @@ +# @file Estimator.R +# +# Copyright 2022 Observational Health Data Sciences and Informatics +# +# This file is part of DeepPatientLevelPrediction +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#' setEstimator +#' +#' @description +#' Creates settings for the Estimator, which takes a model and trains it +#' +#' @name setEstimator +#' @param learningRate Which learning rate to use +#' @param weightDecay Which weight decay to use +#' @param optimizer Which optimizer to use +#' @param scheduler Which learning rate scheduler to use +#' @param criterion Loss function to use +#' @param posWeight Whether to add more weight to positive labels during training - this will result in miscalibrated models +#' @param earlyStopping Whether early stopping should be used, which stops training if your metric is not improving +#' @param earlyStoppingMetric Which metric to use for early stopping +#' @param patience Patience for the early stopper +#' @param hyperparameterMetric Which metric to use for hyperparameter selection: loss, auc, auprc or a custom function +NULL + +#' fitEstimator +#' +#' @description +#' Fits a deep learning estimator to data. +#' +#' @param trainData The data to use +#' @param modelSettings modelSettings object +#' @param analysisId Id of the analysis +#' @param ... Extra inputs +#' +#' @export +fitEstimator <- function(trainData, + modelSettings, + analysisId, + ...)
{ + start <- Sys.time() + + # check covariate data + if (!FeatureExtraction::isCovariateData(trainData$covariateData)) { + stop("Needs correct covariateData") + } + + param <- modelSettings$param + + # get the settings from the param + settings <- attr(param, "settings") + if (!is.null(trainData$folds)) { + trainData$labels <- merge(trainData$labels, trainData$fold, by = "rowId") + } + mappedCovariateData <- PatientLevelPrediction::MapIds( + covariateData = trainData$covariateData, + cohort = trainData$labels + ) + + covariateRef <- mappedCovariateData$covariateRef + + outLoc <- PatientLevelPrediction::createTempModelLoc() + cvResult <- do.call( + what = gridCvDeep, + args = list( + mappedData = mappedCovariateData, + labels = trainData$labels, + settings = settings, + modelLocation = outLoc, + paramSearch = param + ) + ) + + hyperSummary <- do.call(rbind, lapply(cvResult$paramGridSearch, function(x) x$hyperSummary)) + prediction <- cvResult$prediction + incs <- rep(1, covariateRef %>% dplyr::tally() %>% dplyr::pull()) + covariateRef <- covariateRef %>% + dplyr::collect() %>% + dplyr::mutate( + included = incs, + covariateValue = 0, + isNumeric = cvResult$numericalIndex + ) + + + comp <- start - Sys.time() + result <- list( + model = cvResult$estimator, # file.path(outLoc), + + preprocessing = list( + featureEngineering = attr(trainData$covariateData, "metaData")$featureEngineering, + tidyCovariates = attr(trainData$covariateData, "metaData")$tidyCovariateDataSettings, + requireDenseMatrix = settings$requiresDenseMatrix + ), + prediction = prediction, + modelDesign = PatientLevelPrediction::createModelDesign( + targetId = attr(trainData, "metaData")$targetId, + outcomeId = attr(trainData, "metaData")$outcomeId, + restrictPlpDataSettings = attr(trainData, "metaData")$restrictPlpDataSettings, + covariateSettings = attr(trainData, "metaData")$covariateSettings, + populationSettings = attr(trainData, "metaData")$populationSettings, + featureEngineeringSettings = attr(trainData$covariateData, "metaData")$featureEngineeringSettings, + preprocessSettings = attr(trainData$covariateData, "metaData")$preprocessSettings, + modelSettings = modelSettings, + splitSettings = attr(trainData, "metaData")$splitSettings, + sampleSettings = attr(trainData, "metaData")$sampleSettings + ), + trainDetails = list( + analysisId = analysisId, + analysisSource = "", + developementDatabase = attr(trainData, "metaData")$cdmDatabaseSchema, + attrition = attr(trainData, "metaData")$attrition, + trainingTime = paste(as.character(abs(comp)), attr(comp, "units")), + trainingDate = Sys.Date(), + modelName = settings$name, + finalModelParameters = cvResult$finalParam, + hyperParamSearch = hyperSummary + ), + covariateImportance = covariateRef + ) + + class(result) <- "plpModel" + attr(result, "predictionFunction") <- "predictDeepEstimator" + attr(result, "modelType") <- "binary" + attr(result, "saveType") <- attr(param, "settings")$saveType + + return(result) +} + +#' predictDeepEstimator +#' +#' @description +#' the prediction function for the estimator +#' +#' @param plpModel the plpModel +#' @param data plp data object or a torch dataset +#' @param cohort data.frame with the rowIds of the people +#' +#' @export +predictDeepEstimator <- function(plpModel, + data, + cohort) { + if (!"plpModel" %in% class(plpModel)) { + plpModel <- list(model = plpModel) + attr(plpModel, "modelType") <- "binary" + } + if ("plpData" %in% class(data)) { + mappedData <- PatientLevelPrediction::MapIds(data$covariateData, + cohort = cohort, + 
mapping = plpModel$covariateImportance %>% + dplyr::select( + .data$columnId, + .data$covariateId + ) + ) + data <- Dataset(mappedData$covariates, + numericalIndex = plpModel$covariateImportance$isNumeric + ) + } + + # get predictions + prediction <- cohort + if (is.character(plpModel$model)) { + model <- torch::torch_load(file.path(plpModel$model, "DeepEstimatorModel.pt"), device = "cpu") + estimator <- Estimator$new( + baseModel = attr(plpModel$modelDesign$modelSettings$param, "settings")$baseModel, + modelParameters = model$modelParameters, + fitParameters = model$fitParameters, + device = attr(plpModel$modelDesign$modelSettings$param, "settings")$device + ) + estimator$model$load_state_dict(model$modelStateDict) + prediction$value <- estimator$predictProba(data) + } else { + prediction$value <- plpModel$model$predictProba(data) + } + + + attr(prediction, "metaData")$modelType <- attr(plpModel, "modelType") + + return(prediction) +} + +#' gridCvDeep +#' +#' @description +#' Performs grid search for a deep learning estimator +#' +#' +#' @param mappedData Mapped data with covariates +#' @param labels Dataframe with the outcomes +#' @param settings Settings of the model +#' @param modelLocation Where to save the model +#' @param paramSearch model parameters to perform search over +#' +#' @export +gridCvDeep <- function(mappedData, + labels, + settings, + modelLocation, + paramSearch) { + modelName <- settings$modelName + modelParamNames <- settings$modelParamNames + fitParamNames <- c("weightDecay", "learningRate") + epochs <- settings$epochs + batchSize <- settings$batchSize + baseModel <- settings$baseModel + device <- settings$device + + ParallelLogger::logInfo(paste0("Running CV for ", modelName, " model")) + + ########################################################################### + + gridSearchPredictons <- list() + length(gridSearchPredictons) <- length(paramSearch) + dataset <- Dataset(mappedData$covariates, labels$outcomeCount) + for (gridId in 1:length(paramSearch)) { + ParallelLogger::logInfo(paste0("Running hyperparameter combination no ", gridId)) + ParallelLogger::logInfo(paste0("HyperParameters: ")) + ParallelLogger::logInfo(paste(names(paramSearch[[gridId]]), paramSearch[[gridId]], collapse = " | ")) + modelParams <- paramSearch[[gridId]][modelParamNames] + + fitParams <- paramSearch[[gridId]][fitParamNames] + fitParams$epochs <- epochs + fitParams$batchSize <- batchSize + + + # initiate prediction + prediction <- c() + + fold <- labels$index + ParallelLogger::logInfo(paste0("Max fold: ", max(fold))) + modelParams$catFeatures <- dataset$numCatFeatures() + modelParams$numFeatures <- dataset$numNumFeatures() + learnRates <- list() + for (i in 1:max(fold)) { + ParallelLogger::logInfo(paste0("Fold ", i)) + trainDataset <- torch::dataset_subset(dataset, indices = which(fold != i)) + testDataset <- torch::dataset_subset(dataset, indices = which(fold == i)) + # fitParams$posWeight <- trainDataset$dataset$posWeight + estimator <- Estimator$new( + baseModel = baseModel, + modelParameters = modelParams, + fitParameters = fitParams, + device = device + ) + + estimator$fit( + trainDataset, + testDataset + ) + + ParallelLogger::logInfo("Calculating predictions on left out fold set...") + + prediction <- rbind( + prediction, + predictDeepEstimator( + plpModel = estimator, + data = testDataset, + cohort = labels[fold == i, ] + ) + ) + learnRates[[i]] <- list( + LRs = estimator$learnRateSchedule, + bestEpoch = estimator$bestEpoch + ) + } + maxIndex <- 
which.max(unlist(sapply(learnRates, `[`, 2))) + paramSearch[[gridId]]$learnSchedule <- learnRates[[maxIndex]] + + gridSearchPredictons[[gridId]] <- list( + prediction = prediction, + param = paramSearch[[gridId]] + ) + } + # get best para (this could be modified to enable any metric instead of AUC, just need metric input in function) + paramGridSearch <- lapply(gridSearchPredictons, function(x) { + do.call(PatientLevelPrediction::computeGridPerformance, x) + }) # cvAUCmean, cvAUC, param + + optimalParamInd <- which.max(unlist(lapply(paramGridSearch, function(x) x$cvPerformance))) + finalParam <- paramGridSearch[[optimalParamInd]]$param + + cvPrediction <- gridSearchPredictons[[optimalParamInd]]$prediction + cvPrediction$evaluationType <- "CV" + + ParallelLogger::logInfo("Training final model using optimal parameters") + + # get the params + modelParams <- finalParam[modelParamNames] + fitParams <- finalParam[fitParamNames] + fitParams$epochs <- finalParam$learnSchedule$bestEpoch + fitParams$batchSize <- batchSize + fitParams$posWeight <- dataset$posWeight + # create the dir + if (!dir.exists(file.path(modelLocation))) { + dir.create(file.path(modelLocation), recursive = T) + } + modelParams$catFeatures <- dataset$numCatFeatures() + modelParams$numFeatures <- dataset$numNumFeatures() + + estimator <- Estimator$new( + baseModel = baseModel, + modelParameters = modelParams, + fitParameters = fitParams, + device = device + ) + numericalIndex <- dataset$getNumericalIndex() + + estimator$fitWholeTrainingSet(dataset, finalParam$learnSchedule$LRs) + + ParallelLogger::logInfo("Calculating predictions on all train data...") + prediction <- predictDeepEstimator( + plpModel = estimator, + data = dataset, + cohort = labels + ) + prediction$evaluationType <- "Train" + + prediction <- rbind( + prediction, + cvPrediction + ) + # modify prediction + prediction <- prediction %>% + dplyr::select(-.data$index) + + prediction$cohortStartDate <- as.Date(prediction$cohortStartDate, origin = "1970-01-01") + + + # save torch code here + estimator$save(modelLocation, "DeepEstimatorModel.pt") + + return( + list( + estimator = modelLocation, + prediction = prediction, + finalParam = finalParam, + paramGridSearch = paramGridSearch, + numericalIndex = numericalIndex + ) + ) +} + +#' Estimator +#' @description +#' A generic R6 class that wraps around a torch nn module and can be used to +#' fit and predict the model defined in that module. 
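+#'
+#' @examples
+#' \dontrun{
+#' # minimal usage sketch (illustrative only: the parameter values and the
+#' # trainDataset/testDataset torch datasets are assumptions, not package defaults)
+#' estimator <- Estimator$new(
+#'   baseModel = MLP,
+#'   modelParameters = list(
+#'     catFeatures = 10L, numFeatures = 0L, sizeEmbedding = 32L,
+#'     sizeHidden = 64L, numLayers = 2L, dropout = 0.1
+#'   ),
+#'   fitParameters = list(epochs = 5, learningRate = 3e-4, batchSize = 512),
+#'   device = "cpu"
+#' )
+#' estimator$fit(trainDataset, testDataset)
+#' predictedRisk <- estimator$predictProba(testDataset)
+#' }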
+#' @export +Estimator <- R6::R6Class( + classname = "Estimator", + lock_objects = FALSE, + public = list( + #' @description + #' Creates a new estimator + #' @param baseModel The torch nn module to use as model + #' @param modelParameters Parameters to initialize the baseModel + #' @param fitParameters Parameters required for the estimator fitting + #' @param optimizer A torch optimizer to use, default is Adam + #' @param criterion The torch loss function to use, defaults to + #' binary cross entropy with logits + #' @param scheduler learning rate scheduler to use + #' @param device Which device to use for fitting, default is cpu + #' @param patience Patience to use for early stopping + initialize = function(baseModel, + modelParameters, + fitParameters, + optimizer = torch::optim_adam, + criterion = torch::nn_bce_with_logits_loss, + scheduler = torch::lr_reduce_on_plateau, + device = "cpu", + patience = 4) { + self$device <- device + self$model <- do.call(baseModel, modelParameters) + self$modelParameters <- modelParameters + self$fitParameters <- fitParameters + self$epochs <- self$itemOrDefaults(fitParameters, "epochs", 10) + self$learningRate <- self$itemOrDefaults(fitParameters, "learningRate", 1e-3) + self$l2Norm <- self$itemOrDefaults(fitParameters, "weightDecay", 1e-5) + self$batchSize <- self$itemOrDefaults(fitParameters, "batchSize", 1024) + self$posWeight <- self$itemOrDefaults(fitParameters, "posWeight", 1) + self$prefix <- self$itemOrDefaults(fitParameters, "prefix", self$model$name) + + self$previousEpochs <- self$itemOrDefaults(fitParameters, "previousEpochs", 0) + self$model$to(device = self$device) + + self$optimizer <- optimizer( + params = self$model$parameters, + lr = self$learningRate, + weight_decay = self$l2Norm + ) + self$criterion <- criterion(torch::torch_tensor(self$posWeight, + device = self$device + )) + + self$scheduler <- scheduler(self$optimizer, + patience = 1, + verbose = FALSE, mode = "max" + ) + + # gradient accumulation is useful when training large numbers where + # you can only fit few samples on the GPU in each batch. 
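+      # A sketch of how the accumulation count would be used if it were wired into
+      # fitEpoch (the current loop steps the optimizer after every batch):
+      #   loss <- loss / self$gradAccumulationIter
+      #   if (ix %% self$gradAccumulationIter == 0) self$optimizer$step()
+      # i.e. gradients from several small batches are summed before one optimizer step.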
+ self$gradAccumulationIter <- 1 + + if (!is.null(patience)) { + self$earlyStopper <- EarlyStopping$new(patience = patience) + } else { + self$earlyStopper <- NULL + } + + self$bestScore <- NULL + self$bestEpoch <- NULL + }, + + #' @description fits the estimator + #' @param dataset a torch dataset to use for model fitting + #' @param testDataset a torch dataset to use for early stopping + fit = function(dataset, testDataset) { + valLosses <- c() + valAUCs <- c() + batchIndex <- torch::torch_randperm(length(dataset)) + 1L + batchIndex <- split(batchIndex, ceiling(seq_along(batchIndex) / self$batchSize)) + + testBatchIndex <- 1:length(testDataset) + testBatchIndex <- split(testBatchIndex, ceiling(seq_along(testBatchIndex) / self$batchSize)) + + modelStateDict <- list() + epoch <- list() + times <- list() + learnRates <- list() + for (epochI in 1:self$epochs) { + startTime <- Sys.time() + trainLoss <- self$fitEpoch(dataset, batchIndex) + endTime <- Sys.time() + + # predict on test data + scores <- self$score(testDataset, testBatchIndex) + delta <- endTime - startTime + currentEpoch <- epochI + self$previousEpochs + lr <- self$optimizer$param_groups[[1]]$lr + ParallelLogger::logInfo( + "Epochs: ", currentEpoch, + " | Val AUC: ", round(scores$auc, 3), + " | Val Loss: ", round(scores$loss, 3), + " | Train Loss: ", round(trainLoss, 3), + " | Time: ", round(delta, 3), " ", + units(delta), + " | LR: ", lr + ) + self$scheduler$step(scores$auc) + valLosses <- c(valLosses, scores$loss) + valAUCs <- c(valAUCs, scores$auc) + learnRates <- c(learnRates, lr) + times <- c(times, round(delta, 3)) + if (!is.null(self$earlyStopper)) { + self$earlyStopper$call(scores$auc) + if (self$earlyStopper$improved) { + # here it saves the results to lists rather than files + modelStateDict[[epochI]] <- lapply(self$model$state_dict(), function(x) x$detach()$cpu()) + epoch[[epochI]] <- currentEpoch + } + if (self$earlyStopper$earlyStop) { + ParallelLogger::logInfo("Early stopping, validation AUC stopped improving") + ParallelLogger::logInfo("Average time per epoch was: ", round(mean(as.numeric(times)), 3), " ", units(delta)) + self$finishFit(valAUCs, modelStateDict, valLosses, epoch, learnRates) + return(invisible(self)) + } + } else { + modelStateDict[[epochI]] <- lapply(self$model$state_dict(), function(x) x$detach()$cpu()) + epoch[[epochI]] <- currentEpoch + } + } + ParallelLogger::logInfo("Average time per epoch was: ", round(mean(as.numeric(times)), 3), " ", units(delta)) + self$finishFit(valAUCs, modelStateDict, valLosses, epoch, learnRates) + invisible(self) + }, + + #' @description + #' fits estimator for one epoch (one round through the data) + #' @param dataset torch dataset to use for fitting + #' @param batchIndex indices of batches + fitEpoch = function(dataset, batchIndex) { + trainLosses <- torch::torch_empty(length(batchIndex)) + ix <- 1 + self$model$train() + progressBar <- utils::txtProgressBar(style = 3) + coro::loop(for (b in batchIndex) { + self$optimizer$zero_grad() + batch <- self$batchToDevice(dataset[b]) + out <- self$model(batch[[1]]) + loss <- self$criterion(out, batch[[2]]) + loss$backward() + + self$optimizer$step() + trainLosses[ix] <- loss$detach() + utils::setTxtProgressBar(progressBar, ix / length(batchIndex)) + ix <- ix + 1 + }) + close(progressBar) + trainLosses$mean()$item() + }, + + #' @description + #' calculates loss and auc after training for one epoch + #' @param dataset The torch dataset to use to evaluate loss and auc + #' @param batchIndex Indices of batches in the dataset + #' 
@return list with average loss and auc in the dataset + score = function(dataset, batchIndex) { + torch::with_no_grad({ + loss <- torch::torch_empty(c(length(batchIndex))) + predictions <- list() + targets <- list() + self$model$eval() + ix <- 1 + coro::loop(for (b in batchIndex) { + batch <- self$batchToDevice(dataset[b]) + pred <- self$model(batch[[1]]) + predictions <- c(predictions, pred) + targets <- c(targets, batch[[2]]) + loss[ix] <- self$criterion(pred, batch[[2]]) + ix <- ix + 1 + }) + mean_loss <- loss$mean()$item() + predictionsClass <- data.frame( + value = as.matrix(torch::torch_sigmoid(torch::torch_cat(predictions)$cpu())), + outcomeCount = as.matrix(torch::torch_cat(targets)$cpu()) + ) + attr(predictionsClass, "metaData")$modelType <- "binary" + auc <- PatientLevelPrediction::computeAuc(predictionsClass) + }) + return(list(loss = mean_loss, auc = auc)) + }, + + #' @description + #' operations that run when fitting is finished + #' @param valAUCs validation AUC values + #' @param modelStateDict fitted model parameters + #' @param valLosses validation losses + #' @param epoch list of epochs fit + #' @param learnRates learning rate sequence used so far + finishFit = function(valAUCs, modelStateDict, valLosses, epoch, learnRates) { + bestEpochInd <- which.max(valAUCs) # change this if a different metric is used + + bestModelStateDict <- lapply(modelStateDict[[bestEpochInd]], function(x) x$to(device = self$device)) + self$model$load_state_dict(bestModelStateDict) + + bestEpoch <- epoch[[bestEpochInd]] + self$bestEpoch <- bestEpoch + self$bestScore <- list( + loss = valLosses[bestEpochInd], + auc = valAUCs[bestEpochInd] + ) + self$learnRateSchedule <- learnRates[1:bestEpochInd] + + ParallelLogger::logInfo("Loaded best model (based on AUC) from epoch ", bestEpoch) + ParallelLogger::logInfo("ValLoss: ", self$bestScore$loss) + ParallelLogger::logInfo("valAUC: ", self$bestScore$auc) + }, + + #' @description + #' Fits whole training set on a specific number of epochs + #' TODO What happens when learning rate changes per epochs? + #' Ideally I would copy the learning rate strategy from before + #' and adjust for different sizes ie more iterations/updates??? 
+ #' @param dataset torch dataset + #' @param learnRates learnRateSchedule from CV + fitWholeTrainingSet = function(dataset, learnRates = NULL) { + if (is.null(self$bestEpoch)) { + self$bestEpoch <- self$epochs + } + + batchIndex <- torch::torch_randperm(length(dataset)) + 1L + batchIndex <- split(batchIndex, ceiling(seq_along(batchIndex) / self$batchSize)) + for (epoch in 1:self$bestEpoch) { + self$optimizer$param_groups[[1]]$lr <- learnRates[[epoch]] + self$fitEpoch(dataset, batchIndex) + } + }, + + #' @description + #' save model and those parameters needed to reconstruct it + #' @param path where to save the model + #' @param name name of file + #' @return the path to saved model + save = function(path, name) { + savePath <- file.path(path, name) + torch::torch_save( + list( + modelStateDict = self$model$state_dict(), + modelParameters = self$modelParameters, + fitParameters = self$fitParameters, + epoch = self$epochs + ), + savePath + ) + return(savePath) + }, + + + #' @description + #' predicts and outputs the probabilities + #' @param dataset Torch dataset to create predictions for + #' @return predictions as probabilities + predictProba = function(dataset) { + batchIndex <- 1:length(dataset) + batchIndex <- split(batchIndex, ceiling(seq_along(batchIndex) / self$batchSize)) + torch::with_no_grad({ + predictions <- c() + self$model$eval() + coro::loop(for (b in batchIndex) { + batch <- self$batchToDevice(dataset[b]) + target <- batch$target + pred <- self$model(batch$batch) + predictions <- c(predictions, as.array(torch::torch_sigmoid(pred$cpu()))) + }) + }) + return(predictions) + }, + + #' @description + #' predicts and outputs the class + #' @param dataset A torch dataset to create predictions for + #' @param threshold Which threshold to use for predictions + #' @return The predicted class for the data in the dataset + predict = function(dataset, threshold = NULL) { + predictions <- self$predictProba(dataset) + + if (is.null(threshold)) { + # use outcome rate + threshold <- dataset$target$sum()$item() / length(dataset) + } + predicted_class <- as.integer(predictions > threshold) + return(predicted_class) + }, + + #' @description + #' sends a batch of data to device + #' assumes batch includes lists of tensors to arbitrary nested depths + #' @param batch the batch to send, usually a list of torch tensors + #' @return the batch on the required device + batchToDevice = function(batch) { + if (class(batch)[1] == "torch_tensor") { + batch <- batch$to(device = self$device) + } else { + ix <- 1 + for (b in batch) { + if (class(b)[1] == "torch_tensor") { + b <- b$to(device = self$device) + } else { + b <- self$batchToDevice(b) + } + if (!is.null(b)) { + batch[[ix]] <- b + } + ix <- ix + 1 + } + } + return(batch) + }, + + #' @description + #' select item from list, and if it's null sets a default + #' @param list A list with items + #' @param item Which list item to retrieve + #' @param default The value to return if list doesn't have item + #' @return the list item or default + itemOrDefaults = function(list, item, default = NULL) { + value <- list[[item]] + if (is.null(value)) default else value + } + ) +) + +#' Earlystopping class +#' @description +#' Stops training if a loss or metric has stopped improving +EarlyStopping <- R6::R6Class( + classname = "EarlyStopping", + lock_objects = FALSE, + public = list( + #' @description + #' Creates a new earlystopping object + #' @param patience Stop after this number of epochs if loss doesn't improve + #' @param delta How much does the loss need 
to improve to count as improvement + #' @param verbose If information should be printed out + #' @return a new earlystopping object + initialize = function(patience = 3, delta = 0, verbose = TRUE) { + self$patience <- patience + self$counter <- 0 + self$verbose <- verbose + self$bestScore <- NULL + self$earlyStop <- FALSE + self$improved <- FALSE + self$delta <- delta + self$previousScore <- 0 + }, + #' @description + #' call the earlystopping object and increment a counter if loss is not + #' improving + #' @param metric the current metric value + call = function(metric) { + score <- metric + if (is.null(self$bestScore)) { + self$bestScore <- score + self$improved <- TRUE + } else if (score < self$bestScore + self$delta) { + self$counter <- self$counter + 1 + self$improved <- FALSE + if (self$verbose) { + ParallelLogger::logInfo( + "EarlyStopping counter: ", self$counter, + " out of ", self$patience + ) + } + if (self$counter >= self$patience) { + self$earlyStop <- TRUE + } + } else { + self$bestScore <- score + self$counter <- 0 + self$improved <- TRUE + } + self$previousScore <- score + } + ) +) diff --git a/R/Formatting.R b/R/Formatting.R deleted file mode 100644 index 600c24b..0000000 --- a/R/Formatting.R +++ /dev/null @@ -1,457 +0,0 @@ -# @file formatting.R -# -# Copyright 2020 Observational Health Data Sciences and Informatics -# -# This file is part of PatientLevelPrediction -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#' Convert the plpData in COO format into a sparse R matrix -#' -#' @description -#' Converts the standard plpData to a sparse matrix -#' -#' @details -#' This function converts the covariate file from ffdf in COO format into a sparse matrix from -#' the package Matrix -#' @param plpData An object of type \code{plpData} with covariate in coo format - the patient level prediction -#' data extracted from the CDM. 
-#' @param population The population to include in the matrix -#' @param map A covariate map (telling us the column number for covariates) -#' @param temporal Whether you want to convert temporal data -#' @examples -#' #TODO -#' -#' @return -#' Returns a list, containing the data as a sparse matrix, the plpData covariateRef -#' and a data.frame named map that tells us what covariate corresponds to each column -#' This object is a list with the following components: \describe{ -#' \item{data}{A sparse matrix with the rows corresponding to each person in the plpData and the columns corresponding to the covariates.} -#' \item{covariateRef}{The plpData covariateRef.} -#' \item{map}{A data.frame containing the data column ids and the corresponding covariateId from covariateRef.} -#' } -#' -#' @export -toSparseM <- function(plpData,population, map=NULL, temporal=F){ - # check logger - if(length(ParallelLogger::getLoggers())==0){ - logger <- ParallelLogger::createLogger(name = "SIMPLE", - threshold = "INFO", - appenders = list(ParallelLogger::createConsoleAppender(layout = ParallelLogger::layoutTimestamp))) - ParallelLogger::registerLogger(logger) - } - - ParallelLogger::logInfo(paste0('starting toSparseM')) - - - ParallelLogger::logDebug(paste0('covariates nrow: ', nrow(plpData$covariateData$covariates))) - ParallelLogger::logDebug(paste0('covariateRef nrow: ', nrow(plpData$covariateData$covariateRef))) - - - #assign newIds to covariateRef - newcovariateData <- MapCovariates(plpData$covariateData, - population, - mapping=map) - - ParallelLogger::logDebug(paste0('Max covariateId in covariates: ',as.data.frame(newcovariateData$covariates %>% dplyr::summarise(max = max(.data$covariateId, na.rm=T))))) - ParallelLogger::logDebug(paste0('# covariates in covariateRef: ', nrow(newcovariateData$covariateRef))) - ParallelLogger::logDebug(paste0('Max rowId in covariates: ', as.data.frame(newcovariateData$covariates %>% dplyr::summarise(max = max(.data$rowId, na.rm=T))))) - - maxY <- as.data.frame(newcovariateData$mapping %>% dplyr::summarise(max=max(.data$newCovariateId, na.rm = TRUE)))$max - ParallelLogger::logDebug(paste0('Max newCovariateId in mapping: ',maxY)) - maxX <- max(population$rowId) - ParallelLogger::logDebug(paste0('Max rowId in population: ',maxX)) - - # chunk then add - if(!temporal){ - ParallelLogger::logInfo(paste0('toSparseM non temporal used')) - data <- Matrix::sparseMatrix(i=1, - j=1, - x=0, - dims=c(maxX,maxY)) - - dataEnv <- environment() - convertData1 <- function(batch,dataEnv) { - data <- get("data", envir = dataEnv) - data <- data + Matrix::sparseMatrix(i=as.data.frame(batch %>% dplyr::select(.data$rowId))$rowId, - j=as.data.frame(batch %>% dplyr::select(.data$covariateId))$covariateId, - x=as.data.frame(batch %>% dplyr::select(.data$covariateValue))$covariateValue, - dims=c(maxX,maxY)) - assign("data", data, envir = dataEnv) - return(NULL) - } - Andromeda::batchApply(newcovariateData$covariates, convertData1, batchSize = 100000, dataEnv = dataEnv) - - } else { - ParallelLogger::logInfo(paste0('toSparseM temporal used')) - - ParallelLogger::logTrace(paste0('Min time:', min(plpData$timeRef$timeId))) - ParallelLogger::logTrace(paste0('Max time:', max(plpData$timeRef$timeId))) - - # do we want to use for(i in sort(plpData$timeRef$timeId)){ ? 
- for(i in min(plpData$timeRef$timeId):max(plpData$timeRef$timeId)){ - - if(nrow(newcovariateData$covariates %>% dplyr::filter(.data$timeId==i))>0){ - ParallelLogger::logTrace(paste0('Found covariates for timeId ', i)) - - - # initiate the sparse matrix - dataPlp <- Matrix::sparseMatrix(i=1, - j=1, - x=0, - dims=c(maxX, maxY)) - dataEnv <- environment() - ParallelLogger::logTrace(paste0('Initiated Mapping covariates for timeId ', i)) - - - # add function to batch creating matrix from Andromeda data - convertData <- function(batch, dataEnv) { - dataPlp <- get("dataPlp", envir = dataEnv) - dataPlp <- dataPlp + Matrix::sparseMatrix(i=as.double(as.character(batch$rowId)), - j=as.double(as.character(batch$covariateId)), - x=batch$covariateValue, - dims=c(maxX,maxY)) - - assign("dataPlp", dataPlp, envir = dataEnv) - return(NULL) - } - - # add age for each time - tempCovs <- addAgeTemp(timeId = i,newcovariateData, plpData$timeRef) # EDITED adding newCov - if(!is.null(tempCovs)){ - Andromeda::batchApply(tempCovs, convertData, batchSize = 100000, dataEnv=dataEnv) - ParallelLogger::logTrace(paste0('Added any age covariates for timeId ', i)) - } - - # add non age temporal covs for each time - tempCovs <- addNonAgeTemp(timeId = i,newcovariateData) - if(!is.null(tempCovs)){ - Andromeda::batchApply(tempCovs, convertData, batchSize = 100000, dataEnv=dataEnv) - ParallelLogger::logTrace(paste0('Added non-age non-temporal covariates for timeId ', i)) - } - - # add non temporal covs - tempCovs <- newcovariateData$covariates %>% - dplyr::filter(!is.na(.data$timeId)) %>% - dplyr::filter(.data$timeId == i) - Andromeda::batchApply(tempCovs, convertData, batchSize = 100000, dataEnv=dataEnv) - - data_array <- slam::as.simple_sparse_array(dataPlp) - # remove dataPlp - #dataPlp <<- NULL - ParallelLogger::logTrace(paste0('Dim of data_array: ', paste0(dim(data_array), collapse='-'))) - - #extending one more dimesion to the array - data_array<-slam::extend_simple_sparse_array(data_array, MARGIN =c(1L)) - ParallelLogger::logTrace(paste0('Finished Mapping covariates for timeId ', i)) - } else { - data_array <- tryCatch(slam::simple_sparse_array(i=matrix(c(1,1,1), ncol = 3), - v=0, - dim=c(maxX,1, maxY)) - ) - - } - #binding arrays along the dimesion - if(i==min(plpData$timeRef$timeId)) { - result_array <- data_array - }else{ - result_array <- slam::abind_simple_sparse_array(result_array,data_array,MARGIN=2L) - } - } - data <- result_array - } - - ParallelLogger::logDebug(paste0('Sparse matrix with dimensionality: ', paste(dim(data), collapse=',') )) - - ParallelLogger::logInfo(paste0('finishing toSparseM')) - - result <- list(data=data, - covariateRef=as.data.frame(newcovariateData$covariateRef), - map=as.data.frame(newcovariateData$mapping)) - return(result) - -} - -# restricts to pop and saves/creates mapping -MapCovariates <- function(covariateData,population, mapping){ - - # to remove check notes - #covariateId <- oldCovariateId <- newCovariateId <- NULL - ParallelLogger::logInfo(paste0('starting MapCovariates')) - - newCovariateData <- Andromeda::andromeda(covariateRef = covariateData$covariateRef, - analysisRef = covariateData$analysisRef) - - # restrict to population for speed - ParallelLogger::logTrace('restricting to population for speed and mapping') - if(is.null(mapping)){ - mapping <- data.frame(oldCovariateId = as.data.frame(covariateData$covariateRef %>% dplyr::select(.data$covariateId)), - newCovariateId = 1:nrow(covariateData$covariateRef)) - } - 
if(sum(colnames(mapping)%in%c('oldCovariateId','newCovariateId'))!=2){ - colnames(mapping) <- c('oldCovariateId','newCovariateId') - } - covariateData$mapping <- mapping - covariateData$population <- data.frame(rowId = population[,'rowId']) - # assign new ids : - newCovariateData$covariates <- covariateData$covariates %>% - dplyr::inner_join(covariateData$population) %>% - dplyr::rename(oldCovariateId = .data$covariateId) %>% - dplyr::inner_join(covariateData$mapping) %>% - dplyr::select(- .data$oldCovariateId) %>% - dplyr::rename(covariateId = .data$newCovariateId) - covariateData$population <- NULL - covariateData$mapping <- NULL - - newCovariateData$mapping <- mapping - - ParallelLogger::logInfo(paste0('finished MapCovariates')) - - return(newCovariateData) -} - - - -#' Convert the plpData in COO format into a sparse python matrix using torch.sparse -#' -#' @description -#' Converts the standard plpData to a sparse matrix firectly into python -#' -#' @details -#' This function converts the covariate file from ffdf in COO format into a sparse matrix from -#' the package Matrix -#' @param plpData An object of type \code{plpData} with covariate in coo format - the patient level prediction -#' data extracted from the CDM. -#' @param population The population to include in the matrix -#' @param map A covariate map (telling us the column number for covariates) -#' @param temporal Whether to include timeId into tensor -#' @param pythonExePath Location of python exe you want to use -#' @examples -#' #TODO -#' -#' @return -#' Returns a list, containing the python object name of the sparse matrix, the plpData covariateRef -#' and a data.frame named map that tells us what covariate corresponds to each column -#' This object is a list with the following components: \describe{ -#' \item{data}{The python object name containing a sparse matrix with the rows corresponding to each person in the plpData and the columns corresponding to the covariates.} -#' \item{covariateRef}{The plpData covariateRef.} -#' \item{map}{A data.frame containing the data column ids and the corresponding covariateId from covariateRef.} -#' } -#' -#' @export -toSparseTorchPython <- function(plpData,population, map=NULL, temporal=F, pythonExePath=NULL){ - - map_python_initiate <- map_python <- function(){return(NULL)} - - # check logger - if(length(ParallelLogger::getLoggers())==0){ - logger <- ParallelLogger::createLogger(name = "SIMPLE", - threshold = "INFO", - appenders = list(ParallelLogger::createConsoleAppender(layout = 'layoutTimestamp'))) - ParallelLogger::registerLogger(logger) - } - - newcovariateData <- MapCovariates(plpData$covariateData, - population, - mapping=map) - - ParallelLogger::logDebug(paste0('Max ',as.data.frame(newcovariateData$covariates %>% dplyr::summarise(max = max(.data$covariateId, na.rm=T))))) - ParallelLogger::logDebug(paste0('# cols: ', nrow(newcovariateData$covariateRef))) - ParallelLogger::logDebug(paste0('Max rowId: ', as.data.frame(newcovariateData$covariates %>% dplyr::summarise(max = max(.data$rowId, na.rm=T))))) - - ParallelLogger::logTrace(paste0('Converting data into python sparse matrix...')) - - maxT <- NULL - if(temporal){ - maxT <- as.data.frame(newcovariateData$covariates$timeId %>% dplyr::summarise(max = max(.data$id, na.rm=T))) - ParallelLogger::logDebug(paste0('Max timeId: ', maxT)) - } - - maxCol <- as.data.frame(newcovariateData$mapping %>% dplyr::summarise(max=max(.data$newCovariateId,na.rm = TRUE)))$max - maxRow <- max(population$rowId) - - # source the python fucntion - e <- 
environment() - reticulate::source_python(system.file(package='PatientLevelPrediction','python','TorchMap.py'), envir = e) - - dataEnv <- e # adding to remove <<- - #dataPlp <<- map_python_initiate(maxCol = as.integer(maxCol), - dataPlp <- map_python_initiate(maxCol = as.integer(maxCol), - maxRow = as.integer(maxRow), - maxT= as.integer(maxT)) - - convertData <- function(batch, temporal=T, dataEnv) { - if(temporal){ - #dataPlp <<- map_python(matrix = dataPlp , - dataEnv$dataPlp <- map_python(matrix = dataEnv$dataPlp, - datas = as.matrix(as.data.frame(batch %>% dplyr::select(.data$rowId,.data$covariateId,.data$timeId,.data$covariateValue))), - maxCol = as.integer(maxCol), - maxRow = as.integer(maxRow), - maxT = as.integer(maxT)) - }else{ - # dataPlp <<- map_python(matrix = dataPlp , - dataEnv$dataPlp <- map_python(matrix = dataEnv$dataPlp, - datas = as.matrix(as.data.frame(batch %>% dplyr::select(.data$rowId,.data$covariateId,.data$covariateValue))), - maxCol = as.integer(maxCol), - maxRow = as.integer(maxRow), - maxT = NULL) - } - return(NULL) - } - - if(temporal==T){ - # add the age and non-temporal data - timeIds <- unique(plpData$timeRef$timeId) - for(timeId in timeIds){ - tempData <- addAgeTemp(timeId, newcovariateData) - if(!is.null(tempData)){ - Andromeda::batchApply(tempData, convertData,temporal =T, batchSize = 100000, dataEnv=dataEnv) - } - #tempData <- addNonAgeTemp(timeId,plpData.mapped) - what is plpData.mapped? - tempData <- addNonAgeTemp(timeId, newcovariateData) - if(!is.null(tempData)){ - Andromeda::batchApply(tempData, convertData,temporal =T, batchSize = 100000, dataEnv=dataEnv) - } - tempData <- NULL - } - - # add the rest - tempData <- newcovariateData$covariates %>% - dplyr::filter(.data$timeId!=0) %>% - dplyr::filter(!is.na(.data$timeId)) - Andromeda::batchApply(tempData, convertData,temporal =T, batchSize = 100000, dataEnv=dataEnv) - tempData <- NULL - } else { - Andromeda::batchApply(newcovariateData$covariates, convertData, - temporal =F, batchSize = 100000, dataEnv=dataEnv) - } - ##result <- dataEnv$dataPlp - ##dataPlp <<- NULL - ##dataEnv$dataPlp <- NULL - ParallelLogger::logTrace(paste0('Sparse python tensor converted')) - - result <- list(data=dataPlp, - covariateRef=as.data.frame(newcovariateData$covariateRef), - map=as.data.frame(newcovariateData$mapping)) - return(result) -} - - -# reformat the evaluation -reformatPerformance <- function(train, test, analysisId){ - - ParallelLogger::logInfo(paste0('starting reformatPerformance')) - - nr1 <- length(unlist(train$evaluationStatistics[-1])) - nr2 <- length(unlist(test$evaluationStatistics[-1])) - evaluationStatistics <- cbind(analysisId=rep(analysisId,nr1+nr2), - Eval=c(rep('train', nr1),rep('test', nr2)), - Metric = names(c(unlist(train$evaluationStatistics[-1]), - unlist(test$evaluationStatistics[-1]))), - Value = c(unlist(train$evaluationStatistics[-1]), - unlist(test$evaluationStatistics[-1])) - ) - - - if(!is.null(test$thresholdSummary) & !is.null(train$thresholdSummary)){ - nr1 <- nrow(train$thresholdSummary) - nr2 <- nrow(test$thresholdSummary) - thresholdSummary <- rbind(cbind(analysisId=rep(analysisId,nr1),Eval=rep('train', nr1), - train$thresholdSummary), - cbind(analysisId=rep(analysisId,nr2),Eval=rep('test', nr2), - test$thresholdSummary)) - } else{ - thresholdSummary <- NULL - } - - - if(!is.null(train$demographicSummary) & !is.null(test$demographicSummary)){ - nr1 <- nrow(train$demographicSummary) - nr2 <- nrow(test$demographicSummary) - demographicSummary <- 
rbind(cbind(analysisId=rep(analysisId,nr1),Eval=rep('train', nr1), - train$demographicSummary), - cbind(analysisId=rep(analysisId,nr2),Eval=rep('test', nr2), - test$demographicSummary)) - } else{ - demographicSummary <- NULL - } - - nr1 <- nrow(train$calibrationSummary) - nr2 <- nrow(test$calibrationSummary) - calibrationSummary <- rbind(cbind(analysisId=rep(analysisId,nr1),Eval=rep('train', nr1), - train$calibrationSummary), - cbind(analysisId=rep(analysisId,nr2),Eval=rep('test', nr2), - test$calibrationSummary)) - - if(!is.null(train$predictionDistribution) & !is.null(test$predictionDistribution)){ - nr1 <- nrow(train$predictionDistribution) - nr2 <- nrow(test$predictionDistribution) - predictionDistribution <- rbind(cbind(analysisId=rep(analysisId,nr1),Eval=rep('train', nr1), - train$predictionDistribution), - cbind(analysisId=rep(analysisId,nr2),Eval=rep('test', nr2), - test$predictionDistribution)) - } else { - predictionDistribution <- NULL - } - - result <- list(evaluationStatistics=evaluationStatistics, - thresholdSummary=thresholdSummary, - demographicSummary =demographicSummary, - calibrationSummary=calibrationSummary, - predictionDistribution=predictionDistribution) - - return(result) -} - - -# helpers for converting temporal PLP data to matrix/tensor -addAgeTemp <- function(timeId, newcovariateData, timeRef){ - - startDay <- as.data.frame(timeRef[timeRef$timeId==timeId,])$startDay - - ageId <- as.data.frame(newcovariateData$mapping %>% - dplyr::filter(.data$oldCovariateId == 1002) %>% - dplyr::select(.data$newCovariateId))$newCovariateId - - ageData <- newcovariateData$covariates%>% # changed from plpData$covariateData - dplyr::filter(.data$covariateId == ageId) %>% - dplyr::mutate(covariateValueNew = .data$covariateValue*365 + startDay, - timeId = timeId) %>% - dplyr::select(- .data$covariateValue) %>% - dplyr::rename(covariateValue = .data$covariateValueNew) %>% - dplyr::select(.data$rowId,.data$covariateId,.data$covariateValue, .data$timeId) - - if(nrow(ageData)==0){ - return(NULL) - } - return(ageData) -} - - -addNonAgeTemp <- function(timeId, newcovariateData){ - - ageId <- as.data.frame(newcovariateData$mapping %>% - dplyr::filter(.data$oldCovariateId == 1002) %>% - dplyr::select(.data$newCovariateId))$newCovariateId - - otherTempCovs <- newcovariateData$covariates%>% - dplyr::filter(is.na(.data$timeId)) %>% - dplyr::filter(.data$covariateId != ageId) %>% - dplyr::mutate(timeId = timeId) %>% - dplyr::select(.data$rowId,.data$covariateId,.data$covariateValue,.data$timeId) - - if(nrow(otherTempCovs)==0){ - return(NULL) - } - return(otherTempCovs) -} diff --git a/R/MLP.R b/R/MLP.R new file mode 100644 index 0000000..dc5f56a --- /dev/null +++ b/R/MLP.R @@ -0,0 +1,175 @@ +# @file MLP.R +# +# Copyright 2022 Observational Health Data Sciences and Informatics +# +# This file is part of DeepPatientLevelPrediction +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
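+
+# Illustrative usage sketch for the settings function defined below; the plpData,
+# outcomeId and saveDirectory objects are assumed to come from the usual
+# PatientLevelPrediction workflow and are not created here:
+#
+#   modelSettings <- setMultiLayerPerceptron(numLayers = 2, sizeHidden = 128,
+#                                            dropout = 0.1, sizeEmbedding = 128,
+#                                            weightDecay = 1e-6, learningRate = 3e-4,
+#                                            randomSample = 1, device = "cpu")
+#   results <- PatientLevelPrediction::runPlp(plpData = plpData,
+#                                             outcomeId = outcomeId,
+#                                             modelSettings = modelSettings,
+#                                             analysisId = "mlpExample",
+#                                             saveDirectory = saveDirectory)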
+ +#' setMultiLayerPerceptron +#' +#' @description +#' Creates settings for a Multilayer perceptron model +#' +#' @details +#' Model architecture +#' +#' +#' @param numLayers Number of layers in network, default: 1:16 +#' @param sizeHidden Amount of neurons in each default layer, default: 2^(6:10) (64 to 1024) +#' @param dropout How much dropout to apply after first linear, default: seq(0, 0.3, 0.05) +#' @param sizeEmbedding Size of embedding layer, default: 2^(6:9) (64 to 512) +#' @param weightDecay Weight decay to apply, default: c(1e-6, 1e-3) +#' @param learningRate Learning rate to use. default: c(1e-2, 1e-5) +#' @param seed Seed to use for sampling hyperparameter space +#' @param hyperParamSearch Which kind of hyperparameter search to use random sampling or exhaustive grid search. default: 'random' +#' @param randomSample How many random samples from hyperparameter space to use +#' @param device Which device to run analysis on, either 'cpu' or 'cuda', default: 'cpu' +#' @param batchSize Size of batch, default: 1024 +#' @param epochs Number of epochs to run, default: 10 +#' +#' @export +setMultiLayerPerceptron <- function(numLayers = c(1:8), + sizeHidden = c(2^(6:9)), + dropout = c(seq(0, 0.5, 0.05)), + sizeEmbedding = c(2^(6:9)), + weightDecay = c(1e-6, 1e-3), + learningRate = c(1e-2, 3e-4, 1e-5), + seed = NULL, + hyperParamSearch = "random", + randomSample = 100, + device = "cpu", + batchSize = 1024, + epochs = 30) { + if (is.null(seed)) { + seed <- as.integer(sample(1e5, 1)) + } + + paramGrid <- list( + numLayers = numLayers, + sizeHidden = sizeHidden, + dropout = dropout, + sizeEmbedding = sizeEmbedding, + weightDecay = weightDecay, + learningRate = learningRate, + seed = list(as.integer(seed[[1]])) + ) + + param <- PatientLevelPrediction::listCartesian(paramGrid) + + if (hyperParamSearch == "random") { + param <- param[sample(length(param), randomSample)] + } + + attr(param, "settings") <- list( + seed = seed[1], + device = device, + batchSize = batchSize, + epochs = epochs, + name = "MLP", + saveType = "file", + modelParamNames = c( + "numLayers", "sizeHidden", + "dropout", "sizeEmbedding" + ), + baseModel = "MLP" + ) + + results <- list( + fitFunction = "fitEstimator", + param = param + ) + + class(results) <- "modelSettings" + + return(results) +} + + +MLP <- torch::nn_module( + name = "MLP", + initialize = function(catFeatures, numFeatures = 0, sizeEmbedding, sizeHidden, numLayers, + activation = torch::nn_relu, + normalization = torch::nn_batch_norm1d, dropout = NULL, + d_out = 1) { + self$embedding <- torch::nn_embedding_bag( + num_embeddings = catFeatures + 1, + embedding_dim = sizeEmbedding, + padding_idx = 1 + ) + if (numFeatures != 0) { + self$numEmbedding <- numericalEmbedding(numFeatures, sizeEmbedding) + } + + self$first_layer <- torch::nn_linear(sizeEmbedding, sizeHidden) + + + self$layers <- torch::nn_module_list(lapply( + 1:numLayers, + function(x) { + MLPLayer( + sizeHidden, + normalization, activation, + dropout + ) + } + )) + self$lastNorm <- normalization(sizeHidden) + self$head <- torch::nn_linear(sizeHidden, d_out) + + self$lastAct <- activation() + }, + forward = function(x) { + x_cat <- x$cat + x_num <- x$num + x_cat <- self$embedding(x_cat + 1L) # padding_idx is 1 + if (!is.null(x_num)) { + x <- (x_cat + self$numEmbedding(x_num)$mean(dim = 2)) / 2 + } else { + x <- x_cat + } + x <- self$first_layer(x) + + for (i in 1:length(self$layers)) { + x <- self$layers[[i]](x) + } + x <- self$lastNorm(x) + x <- self$lastAct(x) + x <- self$head(x) + x <- x$squeeze(-1) 
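+    # x is now a vector of raw logits (one per person); the Estimator's default
+    # criterion (nn_bce_with_logits_loss) and the sigmoid in predictProba expect this scale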
+ return(x) + } +) + +MLPLayer <- torch::nn_module( + name = "MLPLayer", + initialize = function(sizeHidden = 64, + normalization = torch::nn_batch_norm1d, + activation = torch::nn_relu, + dropout = 0.0, bias = TRUE) { + self$norm <- normalization(sizeHidden) + self$activation <- activation() + self$linear <- torch::nn_linear(sizeHidden, sizeHidden, bias = bias) + + # only add a dropout module when a non-zero rate is supplied + if (!is.null(dropout) && dropout != 0.0) { + self$dropout <- torch::nn_dropout(p = dropout) + } + }, + forward = function(x) { + x <- self$linear(self$norm(x)) + if (!is.null(self$dropout)) { + x <- self$dropout(x) + } + return(self$activation(x)) + } +) diff --git a/R/Predict.R b/R/Predict.R deleted file mode 100644 index caad604..0000000 --- a/R/Predict.R +++ /dev/null @@ -1,643 +0,0 @@ -# @file predict.R -# -# Copyright 2020 Observational Health Data Sciences and Informatics -# -# This file is part of PatientLevelPrediction -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#' predictPlp -#' -#' @description -#' Predict the risk of the outcome using the input plpModel for the input plpData -#' @details -#' The function applied the trained model on the plpData to make predictions -#' @param plpModel An object of type \code{plpModel} - a patient level prediction model -#' @param population The population created using createStudyPopulation() who will have their risks predicted -#' @param plpData An object of type \code{plpData} - the patient level prediction -#' data extracted from the CDM. -#' @param index A data frame containing rowId: a vector of rowids and index: a vector of doubles the same length as the rowIds. If used, only the rowIds with a negative index value are used to calculate the prediction. -#' -#' @return -#' A dataframe containing the prediction for each person in the population with an attribute metaData containing prediction details.
-#' - -# parent predict that calls the others -#' @export -predictPlp <- function(plpModel, population, plpData, index=NULL){ - - if(is.null(plpModel)) - stop('No model input') - if(is.null(population)) - stop('No population input') - if(is.null(plpData)) - stop('No plpData input') - - # check logger - if(length(ParallelLogger::getLoggers())==0){ - logger <- ParallelLogger::createLogger(name = "SIMPLE", - threshold = "INFO", - appenders = list(ParallelLogger::createConsoleAppender(layout = ParallelLogger::layoutTimestamp))) - ParallelLogger::registerLogger(logger) - } - - # apply the feature transformations - if(!is.null(index)){ - ParallelLogger::logTrace(paste0('Calculating prediction for ',sum(index$index<0),' in test set')) - ind <- population$rowId%in%index$rowId[index$index<0] - } else{ - ParallelLogger::logTrace(paste0('Calculating prediction for ',nrow(population),' in dataset')) - ind <- rep(T, nrow(population)) - } - - # do the predction on the new data - if(class(plpModel)=='plpModel'){ - # extract the classifier type - prediction <- plpModel$predict(plpData=plpData,population=population[ind,]) - - if(nrow(prediction)!=nrow(population[ind,])) - ParallelLogger::logWarn(paste0('Dimension mismatch between prediction and population test cases. Population test: ',nrow(population[ind, ]), '-- Prediction:', nrow(prediction) )) - } else{ - ParallelLogger::logError('Non plpModel input') - stop() - } - - metaData <- list(predictionType = attr(plpModel, 'predictionType'), #"binary", - cohortId = attr(population,'metaData')$cohortId, - outcomeId = attr(population,'metaData')$outcomeId, - timepoint = attr(population,'metaData')$riskWindowEnd) - - attr(prediction, "metaData") <- metaData - return(prediction) -} - -# default patient level prediction prediction -predict.plp <- function(plpModel,population, plpData, ...){ - ## done in transform covariateData <- limitCovariatesToPopulation(plpData$covariateData, population$rowId) - ParallelLogger::logTrace('predict.plp - predictingProbabilities start') - prediction <- predictProbabilities(plpModel$model, population, - plpData$covariateData) - ParallelLogger::logTrace('predict.plp - predictingProbabilities end') - - # add baselineHazard function - if(!is.null(plpModel$model$baselineHazard)){ - time <- attr(population,'metaData')$riskWindowEnd - bhind <- which.min(abs(plpModel$model$baselineHazard$time-time)) - prediction$value <- 1-plpModel$model$baselineHazard$surv[bhind]^prediction$value - - attr(prediction, "baselineHazard") <- plpModel$model$baselineHazard$surv[bhind] - attr(prediction, "timePoint") <- time - attr(prediction, "offset") <- 0 - } - - return(prediction) -} - -# for gxboost -predict.xgboost <- function(plpModel,population, plpData, ...){ - result <- toSparseM(plpData, population, map=plpModel$covariateMap) - data <- result$data[population$rowId,, drop = F] - prediction <- data.frame(rowId=population$rowId, - value=stats::predict(plpModel$model, data) - ) - - prediction <- merge(population, prediction, by='rowId', all.x=T, fill=0) - prediction <- prediction[,colnames(prediction)%in%c('rowId','subjectId','cohortStartDate','outcomeCount','indexes', 'value','ageYear', 'gender')] # need to fix no index issue - attr(prediction, "metaData") <- list(predictionType = "binary") - return(prediction) - -} - -predict.pythonReticulate <- function(plpModel, population, plpData){ - - - python_predict <- python_predict_temporal <- function(){return(NULL)} - - e <- environment() - 
reticulate::source_python(system.file(package='PatientLevelPrediction','python','predictFunctions.py'), envir = e) - - - ParallelLogger::logInfo('Mapping covariates...') - if(!is.null(plpData$timeRef)){ - pdata <- toSparseTorchPython(plpData,population, map=plpModel$covariateMap, temporal=T) - pdata <- pdata$data - fun_predict <- python_predict_temporal - } else { - newData <- toSparseM(plpData, population, map=plpModel$covariateMap) - included <- plpModel$varImp$covariateId[plpModel$varImp$included>0] # does this include map? - included <- newData$map$newCovariateId[newData$map$oldCovariateId%in%included] - pdata <- reticulate::r_to_py(newData$data[,included, drop = F]) - fun_predict <- python_predict - } - - population$rowIdPython <- population$rowId-1 # -1 to account for python/r index difference - namesInd <- c('rowIdPython','rowId')%in%colnames(population) - namesInd <- c('rowIdPython','rowId')[namesInd] - pPopulation <- as.matrix(population[,namesInd]) - - # run the python predict code: - ParallelLogger::logInfo('Executing prediction...') - result <- fun_predict(population = pPopulation, - plpData = pdata, - model_loc = plpModel$model, - dense = ifelse(is.null(plpModel$dense),0,plpModel$dense), - autoencoder = F) - - #get the prediction from python and reformat: - ParallelLogger::logInfo('Returning results...') - prediction <- result - prediction <- as.data.frame(prediction) - attr(prediction, "metaData") <- list(predictionType="binary") - colnames(prediction) <- c(namesInd, 'value') - - # add 1 to rowId from python: - ##prediction$rowId <- prediction$rowId+1 - - # add subjectId and date: - prediction <- merge(prediction[,colnames(prediction)!='rowIdPython'], - population, - by='rowId') - prediction <- prediction[,colnames(prediction)%in%c('rowId','subjectId','cohortStartDate','outcomeCount','indexes', 'value','ageYear', 'gender')] - return(prediction) -} - -predict.pythonAuto <- function(plpModel, population, plpData){ - - python_predict <- function(){return(NULL)} - - ParallelLogger::logInfo('Mapping covariates...') - if(!is.null(plpData$timeRef)){ - pdata <- toSparseTorchPython(plpData,population, map=plpModel$covariateMap, temporal=T) - pdata <- pdata$data - } else { - newData <- toSparseM(plpData, population, map=plpModel$covariateMap) - included <- plpModel$varImp$covariateId[plpModel$varImp$included>0] # does this include map? 
- included <- newData$map$newCovariateId[newData$map$oldCovariateId%in%included] - pdata <- reticulate::r_to_py(newData$data[,included]) - } - - population$rowIdPython <- population$rowId-1 # -1 to account for python/r index difference - namesInd <- c('rowIdPython','rowId')%in%colnames(population) - namesInd <- c('rowIdPython','rowId')[namesInd] - pPopulation <- as.matrix(population[,namesInd]) - - # run the python predict code: - ParallelLogger::logInfo('Executing prediction...') - e <- environment() - reticulate::source_python(system.file(package='PatientLevelPrediction','python','predictFunctions.py'), envir = e) - - result <- python_predict(population = pPopulation, - plpData = pdata, - model_loc = plpModel$model, - dense = ifelse(is.null(plpModel$dense),0,plpModel$dense), - autoencoder = T) - #get the prediction from python and reformat: - ParallelLogger::logInfo('Returning results...') - prediction <- result - prediction <- as.data.frame(prediction) - attr(prediction, "metaData") <- list(predictionType="binary") - colnames(prediction) <- c(namesInd, 'value') - - prediction <- merge(prediction[,colnames(prediction)!='rowIdPython'], - population, - by='rowId') - prediction <- prediction[,colnames(prediction)%in%c('rowId','subjectId','cohortStartDate','outcomeCount','indexes', 'value','ageYear', 'gender')] - return(prediction) -} - - - -predict.knn <- function(plpData, population, plpModel, ...){ - ##covariateData <- limitCovariatesToPopulation(plpData$covariateData, population$rowId) - prediction <- BigKnn::predictKnn(covariates = plpData$covariateData$covariates, - cohorts= population[,!colnames(population)%in%'cohortStartDate'], - indexFolder = plpModel$model, - k = plpModel$modelSettings$modelParameters$k, - weighted = TRUE, - threads = plpModel$modelSettings$threads) - - # can add: threads = 1 in the future - - # return the cohorts as a data frame with the prediction added as - # a new column with the column name 'value' - prediction <- merge(population, prediction[,c('rowId','value')], by='rowId', - all.x=T, fill=0) - prediction$value[is.na(prediction$value)] <- 0 - - return(prediction) -} - - -predict.deep <- function(plpModel, population, plpData, ...){ - ensure_installed("plyr") - - temporal <- !is.null(plpData$timeRef) - ParallelLogger::logDebug(paste0('timeRef null: ',is.null(plpData$timeRef))) - if(temporal){ - ParallelLogger::logTrace('temporal') - result<-toSparseM(plpData,population,map=plpModel$covariateMap, temporal=T) - - data <-result$data[population$rowId,,] - if(!is.null(plpModel$useVae)){ - if(plpModel$useVae==TRUE){ - data<- plyr::aaply(as.array(data), 2, function(x) stats::predict(plpModel$vaeEncoder, x, batch_size = plpModel$vaeBatchSize)) - data<-aperm(data, perm = c(2,1,3))#rearrange of dimension - }} - - batch_size <- min(2000, length(population$rowId)) - maxVal <- length(population$rowId) - batches <- lapply(1:ceiling(maxVal/batch_size), function(x) ((x-1)*batch_size+1):min((x*batch_size),maxVal)) - prediction <- population - prediction$value <- 0 - for(batch in batches){ - pred <- keras::predict_on_batch(plpModel$model, as.array(data[batch,,])) - if(is.null(dim(pred))){ - prediction$value[batch] <- as.double(pred) - } else{ - prediction$value[batch] <- as.double(pred[,2]) - } - } - - prediction <- prediction[,colnames(prediction)%in%c('rowId','subjectId','cohortStartDate','outcomeCount','indexes', 'value','ageYear', 'gender')] # need to fix no index issue - return(prediction) - } else{ - result<-toSparseM(plpData,population,map=plpModel$covariateMap, 
temporal=F) - data <-result$data[population$rowId,] - - batch_size <- min(2000, length(population$rowId)) - maxVal <- length(population$rowId) - batches <- lapply(1:ceiling(maxVal/batch_size), function(x) ((x-1)*batch_size+1):min((x*batch_size),maxVal)) - prediction <- population - prediction$value <- 0 - for(batch in batches){ - pred <- keras::predict_on_batch(plpModel$model, as.array(data[batch,])) - prediction$value[batch] <- as.double(pred[,2]) - } - - prediction <- prediction[,colnames(prediction)%in%c('rowId','subjectId','cohortStartDate','outcomeCount','indexes', 'value','ageYear', 'gender')] # need to fix no index issue - return(prediction) - - } -} - -predict.BayesianDeep <- function(plpModel, population, plpData, ...){ - ensure_installed("plyr") - - temporal <- !is.null(plpData$timeRef) - ParallelLogger::logDebug(paste0('timeRef null: ',is.null(plpData$timeRef))) - if(temporal){ - ParallelLogger::logTrace('temporal') - result<-toSparseM(plpData,population,map=plpModel$covariateMap, temporal=T) - - data <-result$data[population$rowId,,] - if(!is.null(plpModel$useVae)){ - if(plpModel$useVae==TRUE){ - data<- plyr::aaply(as.array(data), 2, function(x) stats::predict(plpModel$vaeEncoder, x, batch_size = plpModel$vaeBatchSize)) - data<-aperm(data, perm = c(2,1,3))#rearrange of dimension - }} - - batch_size <- min(2000, length(population$rowId)) - maxVal <- length(population$rowId) - batches <- lapply(1:ceiling(maxVal/batch_size), function(x) ((x-1)*batch_size+1):min((x*batch_size),maxVal)) - prediction <- population - prediction$value <- 0 - prediction$epistemicUncertainty <- 0 - prediction$aleatoricUncertainty <- 0 - for(batch in batches){ - num_MC_samples = 100 - output_dim = 2 - pred <- keras::predict_on_batch(plpModel$model, as.array(data[batch,,])) - MC_samples <- array(0, dim = c(num_MC_samples, length(batch), 2 * output_dim)) - for (k in 1:num_MC_samples){ - MC_samples[k,, ] = stats::predict(plpModel$model, as.array(data[batch,,])) - #keras::predict_proba(model, as.array(plpData[population$rowId[population$indexes==index],,][batch,,])) - } - pred <- apply(MC_samples[,,output_dim], 2, mean) - epistemicUncertainty <- apply(MC_samples[,,output_dim], 2, stats::var) - logVar = MC_samples[, , output_dim * 2] - if(length(dim(logVar))<=1){ - aleatoricUncertainty = exp(mean(logVar)) - }else{ - aleatoricUncertainty = exp(colMeans(logVar)) - - } - prediction$value[batch] <- pred - prediction$epistemicUncertainty[batch] = epistemicUncertainty - prediction$aleatoricUncertainty[batch] = aleatoricUncertainty - #writeLines(paste0(dim(pred[,2]), collapse='-')) - #writeLines(paste0(pred[1,2], collapse='-')) - - } - prediction$value[prediction$value>1] <- 1 - prediction$value[prediction$value<0] <- 0 - prediction <- prediction[,colnames(prediction)%in%c('rowId','subjectId','cohortStartDate','outcomeCount','indexes', 'value','ageYear', 'gender', - 'epistemicUncertainty', 'aleatoricUncertainty')] # need to fix no index issue - return(prediction) - } else{ - result<-toSparseM(plpData,population,map=plpModel$covariateMap, temporal=F) - data <-result$data[population$rowId,] - - batch_size <- min(2000, length(population$rowId)) - maxVal <- length(population$rowId) - batches <- lapply(1:ceiling(maxVal/batch_size), function(x) ((x-1)*batch_size+1):min((x*batch_size),maxVal)) - prediction <- population - prediction$value <- 0 - for(batch in batches){ - num_MC_samples = 100 - output_dim =2 - MC_samples <- array(0, dim = c(num_MC_samples, length(batch), 2 * output_dim)) - for (k in 1:num_MC_samples){ - 
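# Each of the num_MC_samples stochastic forward passes is stored in MC_samples;
# after the loop, the per-person mean of column output_dim becomes the predicted
# risk, the variance across samples the epistemic uncertainty, and the
# exponential of the mean predicted log-variance (column output_dim * 2) the
# aleatoric uncertainty.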
MC_samples[k,, ] = stats::predict(plpModel$model, as.array(data[batch,,])) - #keras::predict_proba(model, as.array(plpData[population$rowId[population$indexes==index],,][batch,,])) - } - pred <- apply(MC_samples[,,output_dim], 2, mean) - epistemicUncertainty <- apply(MC_samples[,,output_dim], 2, stats::var) - logVar = MC_samples[, , output_dim * 2] - if(length(dim(logVar))<=1){ - aleatoricUncertainty = exp(mean(logVar)) - }else{ - aleatoricUncertainty = exp(colMeans(logVar)) - - } - prediction$value[batch] <- pred - prediction$epistemicUncertainty[batch] = epistemicUncertainty - prediction$aleatoricUncertainty[batch] = aleatoricUncertainty - #writeLines(paste0(dim(pred[,2]), collapse='-')) - #writeLines(paste0(pred[1,2], collapse='-')) - } - prediction$value[prediction$value>1] <- 1 - prediction$value[prediction$value<0] <- 0 - prediction <- prediction[,colnames(prediction)%in%c('rowId','subjectId','cohortStartDate','outcomeCount','indexes', 'value','ageYear', 'gender', - 'epistemicUncertainty', 'aleatoricUncertainty')] # need to fix no index issue - return(prediction) - - } -} - -predict.deepEnsemble <- function(plpModel, population, plpData, ...){ - ensure_installed("plyr") - - mu <- function(){return(NULL)} - sigma <- function(){return(NULL)} - - temporal <- !is.null(plpData$timeRef) - ParallelLogger::logDebug(paste0('timeRef null: ',is.null(plpData$timeRef))) - if(temporal){ - ParallelLogger::logTrace('temporal') - result<-toSparseM(plpData,population,map=plpModel$covariateMap, temporal=T) - - data <-result$data[population$rowId,,] - if(!is.null(plpModel$useVae)){ - if(plpModel$useVae==TRUE){ - data<- plyr::aaply(as.array(data), 2, function(x) stats::predict(plpModel$vaeEncoder, x, batch_size = plpModel$vaeBatchSize)) - data<-aperm(data, perm = c(2,1,3))#rearrange of dimension - }} - - batch_size <- min(2000, length(population$rowId)) - maxVal <- length(population$rowId) - batches <- lapply(1:ceiling(maxVal/batch_size), function(x) ((x-1)*batch_size+1):min((x*batch_size),maxVal)) - prediction <- population - prediction$value <- 0 - - prediction$sigmas <- 0 - for(batch in batches){ - for (i in seq(plpModel$modelSettings$modelParameters$numberOfEnsembleNetwork)){ - if(i==1){ - muMatrix <- data.frame() - sigmaMatrix <-data.frame() - } - c(mu,sigma) %<-% plpModel$model[[i]](inputs=list(as.array(data[batch,,]))) - muMatrix<-rbind(muMatrix,t(as.data.frame(mu[,2]))) - sigmaMatrix<-rbind(sigmaMatrix,t(as.data.frame(sigma[,2]))) - } - - muMean <- apply(muMatrix,2,mean) - muSq <- muMatrix^2 - sigmaSq <- sigmaMatrix^2 - sigmaMean <- apply(sigmaMatrix,2,mean) - sigmaResult=apply(muSq+sigmaSq,2, mean)- muMean^2 - - prediction$value[batch] <- c(muMean) - prediction$sigmas[batch] <- c(sigmaResult) - } - prediction <- prediction[,colnames(prediction)%in%c('rowId','subjectId','cohortStartDate','outcomeCount','indexes', 'value', 'sigmas')] # need to fix no index issue - #If the prediction value is negative, please add values to make all the values to positive. 
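# sigmaResult above is the variance of an equally weighted Gaussian mixture:
# Var = mean(mu^2 + sigma^2) - mean(mu)^2; the shift below only moves the
# ensemble means up so that the smallest predicted value becomes zero.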
- if(min(prediction$value)<0){prediction$value = prediction$value+ (min(prediction$value)* (-1)) } - return(prediction) - - } else{ - result<-toSparseM(plpData,population,map=plpModel$covariateMap, temporal=F) - data <-result$data[population$rowId,] - - batch_size <- min(2000, length(population$rowId)) - maxVal <- length(population$rowId) - batches <- lapply(1:ceiling(maxVal/batch_size), function(x) ((x-1)*batch_size+1):min((x*batch_size),maxVal)) - prediction <- population - prediction$value <- 0 - - prediction$sigmas <- 0 - for(batch in batches){ - for (i in seq(plpModel$modelSettings$modelParameters$numberOfEnsembleNetwork)){ - if(i==1){ - muMatrix <- data.frame() - sigmaMatrix <-data.frame() - } - c(mu,sigma) %<-% plpModel$model[[i]](inputs=list(as.array(data[batch,,]))) - muMatrix<-rbind(muMatrix,t(as.data.frame(mu[,2]))) - sigmaMatrix<-rbind(sigmaMatrix,t(as.data.frame(sigma[,2]))) - } - - muMean <- apply(muMatrix,2,mean) - muSq <- muMatrix^2 - sigmaSq <- sigmaMatrix^2 - sigmaMean <- apply(sigmaMatrix,2,mean) - sigmaResult=apply(muSq+sigmaSq,2, mean)- muMean^2 - - prediction$value[batch] <- c(muMean) - prediction$sigmas[batch] <- c(sigmaResult) - } - - prediction <- prediction[,colnames(prediction)%in%c('rowId','subjectId','cohortStartDate','outcomeCount','indexes', 'value', 'sigmas','ageYear', 'gender')] # need to fix no index issue - if(min(prediction$value)<0){prediction$value = prediction$value+ (min(prediction$value)* (-1)) } - return(prediction) - - } -} - -predict.deepMulti <- function(plpModel, population, plpData, ...){ - - repeats <- attr(plpModel, 'inputs') - - temporal <- !is.null(plpData$timeRef) - ParallelLogger::logDebug('timeRef null: ',paste0(is.null(plpData$timeRef))) - if(temporal){ - ParallelLogger::logTrace('temporal') - result<-toSparseM(plpData,population,map=plpModel$covariateMap, temporal=T) - ParallelLogger::logDebug('result$data dim: ',paste0(dim(result$data), collapse = '-')) - ParallelLogger::logDebug('population dim: ',paste0(dim(population), collapse = '-')) - data <-result$data[population$rowId,,] - - ParallelLogger::logInfo('running batch prediction') - batch_size <- min(2000, length(population$rowId)) - maxVal <- length(population$rowId) - batches <- lapply(1:ceiling(maxVal/batch_size), function(x) ((x-1)*batch_size+1):min((x*batch_size),maxVal)) - prediction <- population - prediction$value <- 0 - for(batch in batches){ - ParallelLogger::logDebug('batch length: ', length(batch)) - dat <- list() - length(dat) <- repeats - for( i in 1:repeats) {dat[[i]] <- as.array(data[batch,,])} - - pred <- keras::predict_on_batch(plpModel$model, dat) - if(is.null(dim(pred))){ - ParallelLogger::logDebug('Pred length: ', length(pred)) - prediction$value[batch] <- as.double(pred) - } else{ - ParallelLogger::logDebug('Pred dim: ', paste0(dim(pred), collapse = '-')) - prediction$value[batch] <- as.double(pred[,2]) - } - } - - prediction <- prediction[,colnames(prediction)%in%c('rowId','subjectId','cohortStartDate','outcomeCount','indexes', 'value','ageYear', 'gender')] # need to fix no index issue - return(prediction) - } else{ - result<-toSparseM(plpData,population,map=plpModel$covariateMap, temporal=F) - data <-result$data[population$rowId,] - - batch_size <- min(2000, length(population$rowId)) - maxVal <- length(population$rowId) - batches <- lapply(1:ceiling(maxVal/batch_size), function(x) ((x-1)*batch_size+1):min((x*batch_size),maxVal)) - prediction <- population - prediction$value <- 0 - for(batch in batches){ - dat <- list() - length(dat) <- repeats - for( i in 
1:repeats) {dat[[i]] <- as.array(data[batch,,])} - - pred <- keras::predict_on_batch(plpModel$model, dat) - prediction$value[batch] <- as.double(pred) - } - - prediction <- prediction[,colnames(prediction)%in%c('rowId','subjectId','cohortStartDate','outcomeCount','indexes', 'value','ageYear', 'gender')] # need to fix no index issue - return(prediction) - - } -} - - -#' Create predictive probabilities -#' -#' @details -#' Generates predictions for the population specified in plpData given the model. -#' -#' @return -#' The value column in the result data.frame is: logistic: probabilities of the outcome, poisson: -#' Poisson rate (per day) of the outome, survival: hazard rate (per day) of the outcome. -#' -#' @param predictiveModel An object of type \code{predictiveModel} as generated using -#' \code{\link{fitPlp}}. -#' @param population The population to calculate the prediction for -#' @param covariateData The covariateData containing the covariates for the population -#' @export -predictProbabilities <- function(predictiveModel, population, covariateData) { - start <- Sys.time() - - ParallelLogger::logTrace('predictProbabilities - predictAndromeda start') - prediction <- predictAndromeda(predictiveModel$coefficients, - population, - covariateData, - predictiveModel$modelType) - ParallelLogger::logTrace('predictProbabilities - predictAndromeda end') - prediction$time <- NULL - attr(prediction, "modelType") <- predictiveModel$modelType - attr(prediction, "cohortId") <- attr(population, "metadata")$cohortId - attr(prediction, "outcomeId") <- attr(population, "metadata")$outcomeId - - if (predictiveModel$modelType %in% c("survival", "cox")) { - attr(prediction, "timepoint") <- attr(population, "metadata")$riskWindowEnd - } - - delta <- Sys.time() - start - ParallelLogger::logInfo("Prediction took ", signif(delta, 3), " ", attr(delta, "units")) - return(prediction) -} - -#' Generated predictions from a regression model -#' -#' @param coefficients A names numeric vector where the names are the covariateIds, except for the -#' first value which is expected to be the intercept. -#' @param population A data frame containing the population to do the prediction for -#' @param covariateData An andromeda object containing the covariateData with predefined columns -#' (see below). -#' @param modelType Current supported types are "logistic", "poisson", "cox" or "survival". -#' -#' @details -#' These columns are expected in the outcome object: \tabular{lll}{ \verb{rowId} \tab(integer) \tab -#' Row ID is used to link multiple covariates (x) to a single outcome (y) \cr \verb{time} \tab(real) -#' \tab For models that use time (e.g. Poisson or Cox regression) this contains time \cr \tab -#' \tab(e.g. 
number of days) \cr } These columns are expected in the covariates object: \tabular{lll}{ -#' \verb{rowId} \tab(integer) \tab Row ID is used to link multiple covariates (x) to a single outcome -#' (y) \cr \verb{covariateId} \tab(integer) \tab A numeric identifier of a covariate \cr -#' \verb{covariateValue} \tab(real) \tab The value of the specified covariate \cr } -#' -#' @export -predictAndromeda <- function(coefficients, population, covariateData, modelType = "logistic") { - if (!(modelType %in% c("logistic", "poisson", "survival","cox"))) { - stop(paste("Unknown modelType:", modelType)) - } - if (!FeatureExtraction::isCovariateData(covariateData)){ - stop("Needs correct covariateData") - } - - intercept <- coefficients[names(coefficients)%in%'(Intercept)'] - if(length(intercept)==0) intercept <- 0 - coefficients <- coefficients[!names(coefficients)%in%'(Intercept)'] - coefficients <- data.frame(beta = as.numeric(coefficients), - covariateId = bit64::as.integer64(names(coefficients)) #!@ modified - ) - coefficients <- coefficients[coefficients$beta != 0, ] - if(sum(coefficients$beta != 0)>0){ - covariateData$coefficients <- coefficients - on.exit(covariateData$coefficients <- NULL, add = TRUE) - - prediction <- covariateData$covariates %>% - dplyr::inner_join(covariateData$coefficients, by= 'covariateId') %>% - dplyr::mutate(values = .data$covariateValue*.data$beta) %>% - dplyr::group_by(.data$rowId) %>% - dplyr::summarise(value = sum(.data$values, na.rm = TRUE)) %>% - dplyr::select(.data$rowId, .data$value) - - prediction <- as.data.frame(prediction) - prediction <- merge(population, prediction, by ="rowId", all.x = TRUE, fill = 0) - prediction$value[is.na(prediction$value)] <- 0 - prediction$value <- prediction$value + intercept - } else{ - warning('Model had no non-zero coefficients so predicted same for all population...') - prediction <- population - prediction$value <- rep(0, nrow(population)) + intercept - } - if (modelType == "logistic") { - link <- function(x) { - return(1/(1 + exp(0 - x))) - } - prediction$value <- link(prediction$value) - } else if (modelType == "poisson" || modelType == "survival" || modelType == "cox") { - prediction$value <- exp(prediction$value) - #if(max(prediction$value)>1){ - # prediction$value <- prediction$value/max(prediction$value) - #} - } - return(prediction) -} - diff --git a/R/RNNTorch.R b/R/RNNTorch.R deleted file mode 100644 index ad86b0e..0000000 --- a/R/RNNTorch.R +++ /dev/null @@ -1,173 +0,0 @@ -# @file RNNTorch.R -# -# Copyright 2020 Observational Health Data Sciences and Informatics -# -# This file is part of PatientLevelPrediction -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
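# A toy illustration (not package code) of the calculation predictAndromeda()
# above performs for modelType = "logistic": covariate values are multiplied by
# their coefficients, summed per rowId, the intercept is added, and the logistic
# link is applied. The covariateIds and coefficient values below are made up.
coefficients <- c("(Intercept)" = -2, "1002" = 0.5, "2001" = 1.2)
covariates <- data.frame(
  rowId = c(1, 1, 2),
  covariateId = c(1002, 2001, 1002),
  covariateValue = c(1, 1, 1)
)
linearPredictor <- tapply(
  covariates$covariateValue * coefficients[as.character(covariates$covariateId)],
  covariates$rowId,
  sum
) + coefficients[["(Intercept)"]]
risk <- 1 / (1 + exp(-linearPredictor)) # the value column returned for "logistic"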
- -#' Create setting for RNN model with python -#' @param hidden_size The hidden size -#' @param epochs The number of epochs -#' @param seed A seed for the model -#' @param class_weight The class weight used for imbalanced data: -#' 0: Inverse ratio between positives and negatives -#' -1: Focal loss -#' @param type It can be normal 'RNN', 'BiRNN' (bidirectional RNN) and 'GRU' -#' -#' @examples -#' \dontrun{ -#' model.rnnTorch <- setRNNTorch() -#' } -#' @export -setRNNTorch <- function(hidden_size=c(50, 100), epochs=c(20, 50), seed=0, class_weight = 0, type = 'RNN'){ - # check inputs - - # add warning about dense data.. - - - # set seed - if(is.null(seed[1])){ - seed <- as.integer(sample(100000000,1)) - } - - result <- list(model='fitRNNTorch', param=split(expand.grid(hidden_size=hidden_size, - epochs=epochs, seed=seed[1], - class_weight = class_weight, type = type), - 1:(length(hidden_size)*length(epochs)) ), - name='RNN Torch') - - class(result) <- 'modelSettings' - - return(result) -} - -fitRNNTorch <- function(population, plpData, param, search='grid', quiet=F, - outcomeId, cohortId, ...){ - - # check plpData is libsvm format or convert if needed - if (!FeatureExtraction::isCovariateData(plpData$covariateData)) - stop("Needs correct covariateData") - - if(colnames(population)[ncol(population)]!='indexes'){ - warning('indexes column not present as last column - setting all index to 1') - population$indexes <- rep(1, nrow(population)) - } - - start <- Sys.time() - - population$rowIdPython <- population$rowId-1 #to account for python/r index difference #subjectId - pPopulation <- as.matrix(population[,c('rowIdPython','outcomeCount','indexes')]) - - result <- toSparseTorchPython(plpData,population, map=NULL, temporal=T) - - outLoc <- createTempModelLoc() - # clear the existing model pickles - for(file in dir(outLoc)) - file.remove(file.path(outLoc,file)) - - # do cross validation to find hyperParameter - hyperParamSel <- lapply(param, function(x) do.call(trainRNNTorch, listAppend(x, - list(plpData = result$data, - population = pPopulation, - train=TRUE, - modelOutput=outLoc)) )) - - hyperSummary <- cbind(do.call(rbind, param), unlist(hyperParamSel)) - - #now train the final model and return coef - bestInd <- which.max(abs(unlist(hyperParamSel)-0.5))[1] - finalModel <- do.call(trainRNNTorch, listAppend(param[[bestInd]], - list(plpData = result$data, - population = pPopulation, - train=FALSE, - modelOutput=outLoc))) - - covariateRef <- as.data.frame(plpData$covariateData$covariateRef) - incs <- rep(1, nrow(covariateRef)) - covariateRef$included <- incs - covariateRef$covariateValue <- rep(0, nrow(covariateRef)) - - modelTrained <- file.path(outLoc) - param.best <- param[[bestInd]] - - comp <- start-Sys.time() - - # prediction on train set: - pred <- finalModel - pred[,1] <- pred[,1] + 1 # adding one to convert from python to r indexes - colnames(pred) <- c('rowId','outcomeCount','indexes', 'value') - pred <- as.data.frame(pred) - attr(pred, "metaData") <- list(predictionType="binary") - - pred$value <- 1-pred$value - prediction <- merge(population, pred[,c('rowId','value')], by='rowId') - - # return model location - result <- list(model = modelTrained, - trainCVAuc = -1, # ToDo decide on how to deal with this - hyperParamSearch = hyperSummary, - modelSettings = list(model='fitRNNTorch',modelParameters=param.best), - metaData = plpData$metaData, - populationSettings = attr(population, 'metaData'), - outcomeId=outcomeId, - cohortId=cohortId, - varImp = covariateRef, - trainingTime =comp, - 
dense=1, - covariateMap=result$map, # I think this is need for new data to map the same? - predictionTrain = prediction - ) - class(result) <- 'plpModel' - attr(result, 'type') <- 'pythonReticulate' - attr(result, 'predictionType') <- 'binary' - - return(result) -} - - -trainRNNTorch <- function(plpData, population, epochs=50, hidden_size = 100, seed=0, class_weight= 0, type = 'RNN', train=TRUE, modelOutput, quiet=F){ - - train_deeptorch <- function(){return(NULL)} - python_dir <- system.file(package='PatientLevelPrediction','python') - - e <- environment() - reticulate::source_python(system.file(package='PatientLevelPrediction','python','deepTorchFunctions.py'), envir = e) - - result <- train_deeptorch(population = population, - train = train, - plpData = plpData, - model_type = as.character(type), - epochs = as.integer(epochs), - hidden_size = as.integer(hidden_size), - class_weight = class_weight, - seed = seed, - quiet = quiet, - modelOutput = modelOutput) - - if(train){ - # then get the prediction - pred <- result - colnames(pred) <- c('rowId','outcomeCount','indexes', 'value') - pred <- as.data.frame(pred) - attr(pred, "metaData") <- list(predictionType="binary") - - pred$value <- 1-pred$value - auc <- computeAuc(pred) - writeLines(paste0('Model obtained CV AUC of ', auc)) - return(auc) - } - - return(result) - -} diff --git a/R/ResNet.R b/R/ResNet.R new file mode 100644 index 0000000..e8ce322 --- /dev/null +++ b/R/ResNet.R @@ -0,0 +1,194 @@ +# @file ResNet.R +# +# Copyright 2022 Observational Health Data Sciences and Informatics +# +# This file is part of DeepPatientLevelPrediction +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#' setResNet +#' +#' @description +#' Creates settings for a ResNet model +#' +#' @details +#' Model architecture from https://arxiv.org/abs/2106.11959 +#' +#' +#' @param numLayers Number of layers in network, default: 1:8 +#' @param sizeHidden Number of neurons in each hidden layer, default: 2^(6:9) (64 to 512) +#' @param hiddenFactor How much to grow the number of neurons in each ResLayer, default: 1:4 +#' @param residualDropout How much dropout to apply after last linear layer in ResLayer, default: seq(0, 0.5, 0.05) +#' @param hiddenDropout How much dropout to apply after first linear layer in ResLayer, default: seq(0, 0.5, 0.05) +#' @param sizeEmbedding Size of embedding layer, default: 2^(6:9) (64 to 512) +#' @param weightDecay Weight decay to apply, default: c(1e-6, 1e-3) +#' @param learningRate Learning rate to use, default: c(1e-2, 3e-4, 1e-5) +#' @param seed Seed to use for sampling hyperparameter space +#' @param hyperParamSearch Which kind of hyperparameter search to use, random sampling or exhaustive grid search,
default: 'random' +#' @param randomSample How many random samples from hyperparameter space to use +#' @param device Which device to run analysis on, either 'cpu' or 'cuda', default: 'cpu' +#' @param batchSize Size of batch, default: 1024 +#' @param epochs Number of epochs to run, default: 30 +#' +#' @export +setResNet <- function(numLayers = c(1:8), + sizeHidden = c(2^(6:9)), + hiddenFactor = c(1:4), + residualDropout = c(seq(0, 0.5, 0.05)), + hiddenDropout = c(seq(0, 0.5, 0.05)), + sizeEmbedding = c(2^(6:9)), + weightDecay = c(1e-6, 1e-3), + learningRate = c(1e-2, 3e-4, 1e-5), + seed = NULL, + hyperParamSearch = "random", + randomSample = 100, + device = "cpu", + batchSize = 1024, + epochs = 30) { + if (is.null(seed)) { + seed <- as.integer(sample(1e5, 1)) + } + + paramGrid <- list( + numLayers = numLayers, + sizeHidden = sizeHidden, + hiddenFactor = hiddenFactor, + residualDropout = residualDropout, + hiddenDropout = hiddenDropout, + sizeEmbedding = sizeEmbedding, + weightDecay = weightDecay, + learningRate = learningRate, + seed = list(as.integer(seed[[1]])) + ) + + param <- PatientLevelPrediction::listCartesian(paramGrid) + + if (hyperParamSearch == "random") { + param <- param[sample(length(param), randomSample)] + } + + attr(param, "settings") <- list( + seed = seed[1], + device = device, + batchSize = batchSize, + epochs = epochs, + name = "ResNet", + saveType = "file", + modelParamNames = c( + "numLayers", "sizeHidden", "hiddenFactor", + "residualDropout", "hiddenDropout", "sizeEmbedding" + ), + baseModel = "ResNet" + ) + + results <- list( + fitFunction = "fitEstimator", + param = param + ) + + class(results) <- "modelSettings" + + return(results) +} + +ResNet <- torch::nn_module( + name = "ResNet", + initialize = function(catFeatures, numFeatures = 0, sizeEmbedding, sizeHidden, numLayers, + hiddenFactor, activation = torch::nn_relu, + normalization = torch::nn_batch_norm1d, hiddenDropout = NULL, + residualDropout = NULL, d_out = 1) { + self$embedding <- torch::nn_embedding_bag( + num_embeddings = catFeatures + 1, + embedding_dim = sizeEmbedding, + padding_idx = 1 + ) + if (numFeatures != 0) { + self$numEmbedding <- numericalEmbedding(numFeatures, sizeEmbedding) + } + + self$first_layer <- torch::nn_linear(sizeEmbedding, sizeHidden) + + resHidden <- sizeHidden * hiddenFactor + + self$layers <- torch::nn_module_list(lapply( + 1:numLayers, + function(x) { + ResLayer( + sizeHidden, resHidden, + normalization, activation, + hiddenDropout, + residualDropout + ) + } + )) + self$lastNorm <- normalization(sizeHidden) + self$head <- torch::nn_linear(sizeHidden, d_out) + + self$lastAct <- activation() + }, + forward = function(x) { + x_cat <- x$cat + x_num <- x$num + x_cat <- self$embedding(x_cat + 1L) # padding_idx is 1 + if (!is.null(x_num)) { + x <- (x_cat + self$numEmbedding(x_num)$mean(dim = 2)) / 2 + # x <- torch::torch_cat(list(x_cat, x_num), dim = 2L) + } else { + x <- x_cat + } + x <- self$first_layer(x) + + for (i in 1:length(self$layers)) { + x <- self$layers[[i]](x) + } + x <- self$lastNorm(x) + x <- self$lastAct(x) + x <- self$head(x) + x <- x$squeeze(-1) + return(x) + } +) + +ResLayer <- torch::nn_module( + name = "ResLayer", + initialize = function(sizeHidden, resHidden, normalization, + activation, hiddenDropout = NULL, residualDropout = NULL) { + self$norm <- normalization(sizeHidden) + self$linear0 <- torch::nn_linear(sizeHidden, resHidden) + self$linear1 <- torch::nn_linear(resHidden, sizeHidden) + + if (!is.null(hiddenDropout)) { + self$hiddenDropout <-
torch::nn_dropout(p = hiddenDropout) + } + if (!is.null(residualDropout)) { + self$residualDropout <- torch::nn_dropout(p = residualDropout) + } + + self$activation <- activation() + }, + forward = function(x) { + z <- x + z <- self$norm(z) + z <- self$linear0(z) + z <- self$activation(z) + if (!is.null(self$hiddenDropout)) { + z <- self$hiddenDropout(z) + } + z <- self$linear1(z) + if (!is.null(self$residualDropout)) { + z <- self$residualDropout(z) + } + x <- z + x + return(x) + } +) diff --git a/R/SaveLoadPlp.R b/R/SaveLoadPlp.R deleted file mode 100644 index 4b7e6d6..0000000 --- a/R/SaveLoadPlp.R +++ /dev/null @@ -1,891 +0,0 @@ -# @file PlpSaveLoad.R -# -# Copyright 2020 Observational Health Data Sciences and Informatics -# -# This file is part of CohortMethod -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#' Get the patient level prediction data from the server -#' @description -#' This function executes a large set of SQL statements against the database in OMOP CDM format to -#' extract the data needed to perform the analysis. -#' -#' @details -#' Based on the arguments, the at risk cohort data is retrieved, as well as outcomes -#' occurring in these subjects. The at risk cohort is identified through -#' user-defined cohorts in a cohort table either inside the CDM instance or in a separate schema. -#' Similarly, outcomes are identified -#' through user-defined cohorts in a cohort table either inside the CDM instance or in a separate -#' schema. Covariates are automatically extracted from the appropriate tables within the CDM. -#' If you wish to exclude concepts from covariates you will need to -#' manually add the concept_ids and descendants to the \code{excludedCovariateConceptIds} of the -#' \code{covariateSettings} argument. -#' -#' @param connectionDetails An R object of type\cr\code{connectionDetails} created using the -#' function \code{createConnectionDetails} in the -#' \code{DatabaseConnector} package. -#' @param cdmDatabaseSchema The name of the database schema that contains the OMOP CDM -#' instance. Requires read permissions to this database. On SQL -#' Server, this should specifiy both the database and the schema, -#' so for example 'cdm_instance.dbo'. -#' @param oracleTempSchema For Oracle only: the name of the database schema where you want -#' all temporary tables to be managed. Requires create/insert -#' permissions to this database. -#' @param cohortId A unique identifier to define the at risk cohort. CohortId is -#' used to select the cohort_concept_id in the cohort-like table. -#' @param outcomeIds A list of cohort_definition_ids used to define outcomes (-999 mean no outcome gets downloaded). -#' @param studyStartDate A calendar date specifying the minimum date that a cohort index -#' date can appear. Date format is 'yyyymmdd'. -#' @param studyEndDate A calendar date specifying the maximum date that a cohort index -#' date can appear. Date format is 'yyyymmdd'. 
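# A minimal usage sketch for setResNet() defined above. The returned settings
# object is intended to be passed as modelSettings to
# PatientLevelPrediction::runPlp(); the plpData object and analysisId below are
# hypothetical placeholders (assumption: the standard PatientLevelPrediction
# workflow, which is not shown in this diff).
resnetSettings <- setResNet(
  numLayers = 2,
  sizeHidden = 128,
  hiddenFactor = 2,
  residualDropout = 0.1,
  hiddenDropout = 0.1,
  sizeEmbedding = 128,
  hyperParamSearch = "random",
  randomSample = 1,
  device = "cpu",
  batchSize = 512,
  epochs = 5
)
# plpResult <- PatientLevelPrediction::runPlp(
#   plpData = plpData,               # created elsewhere, e.g. with getPlpData()
#   modelSettings = resnetSettings,
#   analysisId = "ResNet_example"    # hypothetical id
# )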
Important: the study -#' end data is also used to truncate risk windows, meaning no outcomes -#' beyond the study end date will be considered. -#' @param cohortDatabaseSchema The name of the database schema that is the location where the -#' cohort data used to define the at risk cohort is available. -#' Requires read permissions to this database. -#' @param cohortTable The tablename that contains the at risk cohort. cohortTable has -#' format of COHORT table: cohort_concept_id, SUBJECT_ID, -#' COHORT_START_DATE, COHORT_END_DATE. -#' @param outcomeDatabaseSchema The name of the database schema that is the location where -#' the data used to define the outcome cohorts is available. -#' Requires read permissions to this database. -#' @param outcomeTable The tablename that contains the outcome cohorts. Expectation is -#' outcomeTable has format of COHORT table: -#' COHORT_DEFINITION_ID, SUBJECT_ID, COHORT_START_DATE, -#' COHORT_END_DATE. -#' @param cdmVersion Define the OMOP CDM version used: currently support "4", "5" and "6". -#' @param firstExposureOnly Should only the first exposure per subject be included? Note that -#' this is typically done in the \code{createStudyPopulation} function, -#' but can already be done here for efficiency reasons. -#' @param washoutPeriod The mininum required continuous observation time prior to index -#' date for a person to be included in the at risk cohort. Note that -#' this is typically done in the \code{createStudyPopulation} function, -#' but can already be done here for efficiency reasons. -#' @param sampleSize If not NULL, only this number of people will be sampled from the target population (Default NULL) -#' -#' @param covariateSettings An object of type \code{covariateSettings} as created using the -#' \code{createCovariateSettings} function in the -#' \code{FeatureExtraction} package. -#' @param excludeDrugsFromCovariates A redundant option -#' -#' @return -#' Returns an object of type \code{plpData}, containing information on the cohorts, their -#' outcomes, and baseline covariates. Information about multiple outcomes can be captured at once for -#' efficiency reasons. This object is a list with the following components: \describe{ -#' \item{outcomes}{A data frame listing the outcomes per person, including the time to event, and -#' the outcome id. Outcomes are not yet filtered based on risk window, since this is done at -#' a later stage.} \item{cohorts}{A data frame listing the persons in each cohort, listing their -#' exposure status as well as the time to the end of the observation period and time to the end of the -#' cohort (usually the end of the exposure era).} \item{covariates}{An ffdf object listing the -#' baseline covariates per person in the two cohorts. This is done using a sparse representation: -#' covariates with a value of 0 are omitted to save space.} \item{covariateRef}{An ffdf object describing the covariates that have been extracted.} -#' \item{metaData}{A list of objects with information on how the cohortMethodData object was -#' constructed.} } The generic \code{()} and \code{summary()} functions have been implemented for this object. 
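# A hedged sketch of a getPlpData() call as described above; the connection
# details, schema names, and cohort ids are placeholders, not values from this
# package.
connectionDetails <- DatabaseConnector::createConnectionDetails(
  dbms = "postgresql",
  server = "localhost/ohdsi",           # placeholder
  user = "user",
  password = "secret"
)
covariateSettings <- FeatureExtraction::createCovariateSettings(
  useDemographicsGender = TRUE,
  useDemographicsAgeGroup = TRUE,
  useConditionGroupEraLongTerm = TRUE
)
plpData <- getPlpData(
  connectionDetails = connectionDetails,
  cdmDatabaseSchema = "cdm",            # placeholder schema
  cohortDatabaseSchema = "results",     # placeholder schema
  cohortTable = "cohort",
  cohortId = 1,                         # at risk cohort definition id
  outcomeDatabaseSchema = "results",
  outcomeTable = "cohort",
  outcomeIds = 2,                       # outcome cohort definition id
  covariateSettings = covariateSettings
)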
-#' -#' @export -getPlpData <- function(connectionDetails, - cdmDatabaseSchema, - oracleTempSchema = cdmDatabaseSchema, - cohortId, - outcomeIds, - studyStartDate = "", - studyEndDate = "", - cohortDatabaseSchema = cdmDatabaseSchema, - cohortTable = "cohort", - outcomeDatabaseSchema = cdmDatabaseSchema, - outcomeTable = "cohort", - cdmVersion = "5", - firstExposureOnly = FALSE, - washoutPeriod = 0, - sampleSize = NULL, - covariateSettings, - excludeDrugsFromCovariates = FALSE) { - if (studyStartDate != "" && regexpr("^[12][0-9]{3}[01][0-9][0-3][0-9]$", studyStartDate) == -1) - stop("Study start date must have format YYYYMMDD") - if (studyEndDate != "" && regexpr("^[12][0-9]{3}[01][0-9][0-3][0-9]$", studyEndDate) == -1) - stop("Study end date must have format YYYYMMDD") - if(!is.null(sampleSize)){ - if(!class(sampleSize) %in% c('numeric', 'integer')) - stop("sampleSize must be numeric") - } - - if(is.null(cohortId)) - stop('User must input cohortId') - if(length(cohortId)>1) - stop('Currently only supports one cohortId at a time') - if(is.null(outcomeIds)) - stop('User must input outcomeIds') - #ToDo: add other checks the inputs are valid - - connection <- DatabaseConnector::connect(connectionDetails) - on.exit(DatabaseConnector::disconnect(connection)) - dbms <- connectionDetails$dbms - - writeLines("\nConstructing the at risk cohort") - if(!is.null(sampleSize)) writeLines(paste("\n Sampling ",sampleSize, " people")) - renderedSql <- SqlRender::loadRenderTranslateSql("CreateCohorts.sql", - packageName = "PatientLevelPrediction", - dbms = dbms, - oracleTempSchema = oracleTempSchema, - cdm_database_schema = cdmDatabaseSchema, - cohort_database_schema = cohortDatabaseSchema, - cohort_table = cohortTable, - cdm_version = cdmVersion, - cohort_id = cohortId, - study_start_date = studyStartDate, - study_end_date = studyEndDate, - first_only = firstExposureOnly, - washout_period = washoutPeriod, - use_sample = !is.null(sampleSize), - sample_number=sampleSize - ) - DatabaseConnector::executeSql(connection, renderedSql) - - writeLines("Fetching cohorts from server") - start <- Sys.time() - cohortSql <- SqlRender::loadRenderTranslateSql("GetCohorts.sql", - packageName = "PatientLevelPrediction", - dbms = dbms, - oracleTempSchema = oracleTempSchema, - cdm_version = cdmVersion) - cohorts <- DatabaseConnector::querySql(connection, cohortSql) - colnames(cohorts) <- SqlRender::snakeCaseToCamelCase(colnames(cohorts)) - metaData.cohort <- list(cohortId = cohortId, - studyStartDate = studyStartDate, - studyEndDate = studyEndDate) - - if(nrow(cohorts)==0) - stop('Target population is empty') - - delta <- Sys.time() - start - writeLines(paste("Loading cohorts took", signif(delta, 3), attr(delta, "units"))) - - #covariateSettings$useCovariateCohortIdIs1 <- TRUE - covariateData <- FeatureExtraction::getDbCovariateData(connection = connection, - oracleTempSchema = oracleTempSchema, - cdmDatabaseSchema = cdmDatabaseSchema, - cdmVersion = cdmVersion, - cohortTable = "#cohort_person", - cohortTableIsTemp = TRUE, - rowIdField = "row_id", - covariateSettings = covariateSettings) - # add indexes for tidyCov + covariate summary - Andromeda::createIndex(covariateData$covariates, c('rowId'), - indexName = 'covariates_rowId') - Andromeda::createIndex(covariateData$covariates, c('covariateId'), - indexName = 'covariates_covariateId') - Andromeda::createIndex(covariateData$covariates, c('covariateId', 'covariateValue'), - indexName = 'covariates_covariateId_value') - - if(max(outcomeIds)!=-999){ - writeLines("Fetching 
outcomes from server") - start <- Sys.time() - outcomeSql <- SqlRender::loadRenderTranslateSql("GetOutcomes.sql", - packageName = "PatientLevelPrediction", - dbms = dbms, - oracleTempSchema = oracleTempSchema, - cdm_database_schema = cdmDatabaseSchema, - outcome_database_schema = outcomeDatabaseSchema, - outcome_table = outcomeTable, - outcome_ids = outcomeIds, - cdm_version = cdmVersion) - outcomes <- DatabaseConnector::querySql(connection, outcomeSql) - colnames(outcomes) <- SqlRender::snakeCaseToCamelCase(colnames(outcomes)) - metaData.outcome <- data.frame(outcomeIds =outcomeIds) - attr(outcomes, "metaData") <- metaData.outcome - if(nrow(outcomes)==0) - stop('No Outcomes') - - metaData.cohort$attrition <- getCounts2(cohorts,outcomes, "Original cohorts") - attr(cohorts, "metaData") <- metaData.cohort - - delta <- Sys.time() - start - writeLines(paste("Loading outcomes took", signif(delta, 3), attr(delta, "units"))) - } else { - outcomes <- NULL - } - - - - - # Remove temp tables: - renderedSql <- SqlRender::loadRenderTranslateSql("RemoveCohortTempTables.sql", - packageName = "PatientLevelPrediction", - dbms = dbms, - oracleTempSchema = oracleTempSchema) - DatabaseConnector::executeSql(connection, renderedSql, progressBar = FALSE, reportOverallTime = FALSE) - #DatabaseConnector::disconnect(connection) - - metaData <- covariateData$metaData - metaData$call <- match.call() - metaData$call$connectionDetails = connectionDetails - metaData$call$connection = NULL - metaData$call$cdmDatabaseSchema = cdmDatabaseSchema - metaData$call$oracleTempSchema = oracleTempSchema - metaData$call$cohortId = cohortId - metaData$call$outcomeIds = outcomeIds - metaData$call$studyStartDate = studyStartDate - metaData$call$studyEndDate = studyEndDate - metaData$call$cohortDatabaseSchema = cohortDatabaseSchema - metaData$call$cohortTable = cohortTable - metaData$call$outcomeDatabaseSchema = outcomeDatabaseSchema - metaData$call$outcomeTable = outcomeTable - metaData$call$cdmVersion = cdmVersion - metaData$call$firstExposureOnly = firstExposureOnly - metaData$call$washoutPeriod = washoutPeriod - metaData$call$covariateSettings= covariateSettings - metaData$call$sampleSize = sampleSize - - # create the temporal settings (if temporal use) - timeReference <- NULL - if(!is.null(covariateSettings$temporal)){ - if(covariateSettings$temporal){ - # make sure time days populated - if(length(covariateSettings$temporalStartDays)>0){ - timeReference = data.frame(timeId=1:length(covariateSettings$temporalStartDays), - startDay = covariateSettings$temporalStartDays, - endDay = covariateSettings$temporalEndDays) - } - }} - - - result <- list(cohorts = cohorts, - outcomes = outcomes, - covariateData = covariateData, - timeRef = timeReference, - metaData = metaData) - - class(result) <- "plpData" - return(result) -} - - -#' Save the cohort data to folder -#' -#' @description -#' \code{savePlpData} saves an object of type plpData to folder. -#' -#' @param plpData An object of type \code{plpData} as generated using -#' \code{getPlpData}. -#' @param file The name of the folder where the data will be written. The folder should -#' not yet exist. -#' @param envir The environment for to evaluate variables when saving -#' @param overwrite Whether to force overwrite an existing file -#' @details -#' The data will be written to a set of files in the folder specified by the user. 
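# A small sketch of the save/load round trip described above; plpData is assumed
# to be an object created by getPlpData() and the folder is a placeholder.
folder <- file.path(tempdir(), "plpDataExample")   # placeholder location
savePlpData(plpData, file = folder)
plpDataReloaded <- loadPlpData(folder)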
-#' -#' @examples -#' # todo -#' -#' @export -savePlpData <- function(plpData, file, envir=NULL, overwrite=F) { - if (missing(plpData)) - stop("Must specify plpData") - if (missing(file)) - stop("Must specify file") - if (!class(plpData) %in% c("plpData","plpData.libsvm" )) - stop("Data not of class plpData") - if(dir.exists(file.path(file, "covariates"))){ - stop('Folder to save covariates already exists...') - } - - if(!dir.exists(file)){ - dir.create(file) - } - - # save the actual values in the metaData - # TODO - only do this if exists in parent or environ - if(is.null(plpData$metaData$call$sampleSize)){ # fixed a bug when sampleSize is NULL - plpData$metaData$call$sampleSize <- 'NULL' - } - for(i in 2:length(plpData$metaData$call)){ - if(!is.null(plpData$metaData$call[[i]])) - plpData$metaData$call[[i]] <- eval(plpData$metaData$call[[i]], envir = envir) - } - - #FeatureExtraction::saveCovariateData(covariateData = plpData$covariateData, file = file.path(file, "covariates")) - Andromeda::saveAndromeda(plpData$covariateData, file = file.path(file, "covariates"), maintainConnection = T) - saveRDS(plpData$timeRef, file = file.path(file, "timeRef.rds")) - saveRDS(plpData$cohorts, file = file.path(file, "cohorts.rds")) - saveRDS(plpData$outcomes, file = file.path(file, "outcomes.rds")) - saveRDS(plpData$metaData, file = file.path(file, "metaData.rds")) -} - -#' Load the cohort data from a folder -#' -#' @description -#' \code{loadPlpData} loads an object of type plpData from a folder in the file -#' system. -#' -#' @param file The name of the folder containing the data. -#' @param readOnly If true, the data is opened read only. -#' -#' @details -#' The data will be written to a set of files in the folder specified by the user. -#' -#' @return -#' An object of class plpData. -#' -#' @examples -#' # todo -#' -#' @export -loadPlpData <- function(file, readOnly = TRUE) { - if (!file.exists(file)) - stop(paste("Cannot find folder", file)) - if (!file.info(file)$isdir) - stop(paste("Not a folder", file)) - - result <- list(covariateData = FeatureExtraction::loadCovariateData(file = file.path(file, "covariates")), - timeRef = readRDS(file.path(file, "timeRef.rds")), - cohorts = readRDS(file.path(file, "cohorts.rds")), - outcomes = readRDS(file.path(file, "outcomes.rds")), - metaData = readRDS(file.path(file, "metaData.rds"))) - - class(result) <- "plpData" - - return(result) -} - - -#' @export -print.plpData <- function(x, ...) 
{ - writeLines("plpData object") - writeLines("") - writeLines(paste("At risk concept ID:", attr(x$cohorts, "metaData")$cohortId)) - writeLines(paste("Outcome concept ID(s):", paste(attr(x$outcomes, "metaData")$outcomeIds, collapse = ","))) -} - -#' @method summary plpData -#' @export -summary.plpData <- function(object,...){ - people <- length(unique(object$cohorts$subjectId)) - outcomeCounts <- data.frame(outcomeId = attr(object$outcomes, "metaData")$outcomeIds, - eventCount = 0, - personCount = 0) - for (i in 1:nrow(outcomeCounts)) { - outcomeCounts$eventCount[i] <- sum(object$outcomes$outcomeId == attr(object$outcomes, "metaData")$outcomeIds[i]) - outcomeCounts$personCount[i] <- length(unique(object$outcomes$rowId[object$outcomes$outcomeId == attr(object$outcomes, "metaData")$outcomeIds[i]])) - } - - covDetails <- FeatureExtraction::summary(object$covariateData) - result <- list(metaData = append(append(object$metaData, attr(object$cohorts, "metaData")), attr(object$outcomes, "metaData")), - people = people, - outcomeCounts = outcomeCounts, - covariateCount = covDetails$covariateCount, - covariateValueCount = covDetails$covariateValueCount) - class(result) <- "summary.plpData" - return(result) -} - -#' @export -print.summary.plpData <- function(x, ...) { - writeLines("plpData object summary") - writeLines("") - writeLines(paste("At risk cohort concept ID:", x$metaData$cohortId)) - writeLines(paste("Outcome concept ID(s):", x$metaData$outcomeIds, collapse = ",")) - writeLines("") - writeLines(paste("People:", paste(x$people))) - writeLines("") - writeLines("Outcome counts:") - outcomeCounts <- x$outcomeCounts - rownames(outcomeCounts) <- outcomeCounts$outcomeId - outcomeCounts$outcomeId <- NULL - colnames(outcomeCounts) <- c("Event count", "Person count") - stats::printCoefmat(outcomeCounts) - writeLines("") - writeLines("Covariates:") - writeLines(paste("Number of covariates:", x$covariateCount)) - writeLines(paste("Number of non-zero covariate values:", x$covariateValueCount)) -} - - -#' Saves the plp model -#' -#' @details -#' Saves the plp model to a user specificed folder -#' -#' @param plpModel A trained classifier returned by running \code{runPlp()$model} -#' @param dirPath A location to save the model to -#' -#' @export -savePlpModel <- function(plpModel, dirPath){ - if (missing(plpModel)) - stop("Must specify plpModel") - if (missing(dirPath)) - stop("Must specify directory path") - if (class(plpModel) != "plpModel") - stop("Not a plpModel") - - if(!dir.exists(dirPath)) dir.create(dirPath) - - - # If model is saved on hard drive move it... 
- #============================================================ - moveFile <- moveHdModel(plpModel, dirPath ) - if(!moveFile){ - ParallelLogger::logError('Moving model files error') - } - #============================================================ - - - # if deep (keras) then save hdfs - if(attr(plpModel, 'type')%in%c('deep', 'deepMulti','deepEnsemble')){ - - if(attr(plpModel, 'type')=='deepEnsemble'){ - tryCatch( - {#saveRDS(plpModel, file = file.path(dirPath, "deepEnsemble_model.rds")) - for (i in seq(plpModel$modelSettings$modelParameters$numberOfEnsembleNetwork)){ - model<-keras::serialize_model(plpModel$model[[i]], include_optimizer = TRUE) - keras::save_model_hdf5(model, filepath = file.path(dirPath, "keras_model",i)) - }},error=function(e) NULL - ) - } - if(attr(plpModel, 'type')=='deep'){ - keras::save_model_hdf5(plpModel$model, filepath = file.path(dirPath, "keras_model")) - } - if(attr(plpModel, 'type')=='deepMulti'){ - saveRDS(attr(plpModel, 'inputs'), file = file.path(dirPath, "inputs_attr.rds")) - } - } else if(attr(plpModel, 'type') == "xgboost"){ - # fixing xgboost save/load issue - xgboost::xgb.save(model = plpModel$model, fname = file.path(dirPath, "model")) - } else { - saveRDS(plpModel$model, file = file.path(dirPath, "model.rds")) - } - #saveRDS(plpModel$predict, file = file.path(dirPath, "transform.rds")) - saveRDS(NULL, file = file.path(dirPath, "transform.rds")) - saveRDS(plpModel$index, file = file.path(dirPath, "index.rds")) - saveRDS(plpModel$trainCVAuc, file = file.path(dirPath, "trainCVAuc.rds")) - saveRDS(plpModel$hyperParamSearch, file = file.path(dirPath, "hyperParamSearch.rds")) - saveRDS(plpModel$modelSettings, file = file.path(dirPath, "modelSettings.rds")) - saveRDS(plpModel$metaData, file = file.path(dirPath, "metaData.rds")) - saveRDS(plpModel$populationSettings, file = file.path(dirPath, "populationSettings.rds")) - saveRDS(plpModel$trainingTime, file = file.path(dirPath, "trainingTime.rds")) - saveRDS(plpModel$varImp, file = file.path(dirPath, "varImp.rds")) - saveRDS(plpModel$dense, file = file.path(dirPath, "dense.rds")) - saveRDS(plpModel$cohortId, file = file.path(dirPath, "cohortId.rds")) - saveRDS(plpModel$outcomeId, file = file.path(dirPath, "outcomeId.rds")) - saveRDS(plpModel$analysisId, file = file.path(dirPath, "analysisId.rds")) - #if(!is.null(plpModel$covariateMap)) - saveRDS(plpModel$covariateMap, file = file.path(dirPath, "covariateMap.rds")) - - attributes <- list(type=attr(plpModel, 'type'), predictionType=attr(plpModel, 'predictionType') ) - saveRDS(attributes, file = file.path(dirPath, "attributes.rds")) - - -} - -moveHdModel <- function(plpModel, dirPath ){ - #================================================================== - # if python then move pickle - #================================================================== - if(attr(plpModel, 'type') %in% c('pythonOld','pythonReticulate', 'pythonAuto') ){ - if(!dir.exists(file.path(dirPath,'python_model'))) - dir.create(file.path(dirPath,'python_model')) - for(file in dir(plpModel$model)){ #DOES THIS CORRECTLY TRANSFER AUTOENCODER BITS? 
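# For python-based models plpModel$model is a directory of files written by
# python (e.g. pickles) rather than an R object, so saving the model means
# copying every file in that directory into dirPath/python_model.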
- file.copy(file.path(plpModel$model,file), - file.path(dirPath,'python_model'), overwrite=TRUE, recursive = FALSE, - copy.mode = TRUE, copy.date = FALSE) - } - } - - #================================================================== - # if sagemaker then move pickle - #================================================================== - if(attr(plpModel, 'type') =='sagemaker'){ - if(!dir.exists(file.path(dirPath,'sagemaker_model'))) - dir.create(file.path(dirPath,'sagemaker_model')) - for(file in dir(plpModel$model$loc)){ - file.copy(file.path(plpModel$model$loc,file), - file.path(dirPath,'sagemaker_model'), overwrite=TRUE, recursive = FALSE, - copy.mode = TRUE, copy.date = FALSE) - } - } - - #================================================================== - # if knn then move model - #================================================================== - if(attr(plpModel, 'type') =='knn'){ - if(!dir.exists(file.path(dirPath,'knn_model'))) - dir.create(file.path(dirPath,'knn_model')) - for(file in dir(plpModel$model)){ - file.copy(file.path(plpModel$model,file), - file.path(dirPath,'knn_model'), overwrite=TRUE, recursive = FALSE, - copy.mode = TRUE, copy.date = FALSE) - } - } - - return(TRUE) -} - -#' loads the plp model -#' -#' @details -#' Loads a plp model that was saved using \code{savePlpModel()} -#' -#' @param dirPath The location of the model -#' -#' @export -loadPlpModel <- function(dirPath) { - if (!file.exists(dirPath)) - stop(paste("Cannot find folder", dirPath)) - if (!file.info(dirPath)$isdir) - stop(paste("Not a folder", dirPath)) - - hyperParamSearch <- tryCatch(readRDS(file.path(dirPath, "hyperParamSearch.rds")), - error=function(e) NULL) - # add in these as they got dropped - outcomeId <- tryCatch(readRDS(file.path(dirPath, "outcomeId.rds")), - error=function(e) NULL) - cohortId <- tryCatch(readRDS(file.path(dirPath, "cohortId.rds")), - error=function(e) NULL) - dense <- tryCatch(readRDS(file.path(dirPath, "dense.rds")), - error=function(e) NULL) - covariateMap <- tryCatch(readRDS(file.path(dirPath, "covariateMap.rds")), - error=function(e) NULL) - analysisId <- tryCatch(readRDS(file.path(dirPath, "analysisId.rds")), - error=function(e) NULL) - - if(file.exists(file.path(dirPath, "keras_model"))){ - ensure_installed("keras") - model <- keras::load_model_hdf5(file.path(dirPath, "keras_model")) - } else if(readRDS(file.path(dirPath, "attributes.rds"))$type == "xgboost"){ - ensure_installed("xgboost") - # fixing xgboost save/load issue - model <- xgboost::xgb.load(file.path(dirPath, "model")) - } else { - model <- readRDS(file.path(dirPath, "model.rds")) - } - - result <- list(model = model, - modelSettings = readRDS(file.path(dirPath, "modelSettings.rds")), - hyperParamSearch = hyperParamSearch, - trainCVAuc = readRDS(file.path(dirPath, "trainCVAuc.rds")), - metaData = readRDS(file.path(dirPath, "metaData.rds")), - populationSettings= readRDS(file.path(dirPath, "populationSettings.rds")), - outcomeId = outcomeId, - cohortId = cohortId, - varImp = readRDS(file.path(dirPath, "varImp.rds")), - trainingTime = readRDS(file.path(dirPath, "trainingTime.rds")), - covariateMap =covariateMap, - predict = readRDS(file.path(dirPath, "transform.rds")), - index = readRDS(file.path(dirPath, "index.rds")), - dense = dense, - analysisId = analysisId) - - #attributes <- readRDS(file.path(dirPath, "attributes.rds")) - attributes <- readRDS(file.path(dirPath, "attributes.rds")) - attr(result, 'type') <- attributes$type - attr(result, 'predictionType') <- attributes$predictionType - 
class(result) <- "plpModel" - - # update the model location to the load dirPath - result <- updateModelLocation(result, dirPath) - - # make this backwrds compatible for ffdf: - result$predict <- createTransform(result) - - return(result) -} - -updateModelLocation <- function(plpModel, dirPath){ - type <- attr(plpModel, 'type') - # if python update the location - if( type %in% c('pythonOld','pythonReticulate', 'pythonAuto')){ - plpModel$model <- file.path(dirPath,'python_model') - plpModel$predict <- createTransform(plpModel) - } - if( type =='sagemaker'){ - plpModel$model$loc <- file.path(dirPath,'sagemaker_model') - plpModel$predict <- createTransform(plpModel) - } - # if knn update the locaiton - TODO !!!!!!!!!!!!!! - if( type =='knn'){ - plpModel$model <- file.path(dirPath,'knn_model') - plpModel$predict <- createTransform(plpModel) - } - if( type =='deep' ){ - plpModel$predict <- createTransform(plpModel) - } - if( type =='deepEnsemble' ){ - plpModel$predict <- createTransform(plpModel) - } - if( type =='deepMulti'){ - attr(plpModel, 'inputs') <- tryCatch(readRDS(file.path(dirPath, "inputs_attr.rds")), - error=function(e) NULL) - plpModel$predict <- createTransform(plpModel) - - } - - return(plpModel) -} - - -#' Saves the prediction dataframe to RDS -#' -#' @details -#' Saves the prediction data frame returned by predict.R to an RDS file and returns the fileLocation where the prediction is saved -#' -#' @param prediction The prediciton data.frame -#' @param dirPath The directory to save the prediction RDS -#' @param fileName The name of the RDS file that will be saved in dirPath -#' -#' @export -savePrediction <- function(prediction, dirPath, fileName='prediction.rds'){ - #TODO check inupts - saveRDS(prediction, file=file.path(dirPath,fileName)) - - return(file.path(dirPath,fileName)) -} - -#' Loads the prediciton dataframe to csv -#' -#' @details -#' Loads the prediciton RDS file -#' -#' @param fileLocation The location with the saved prediction -#' -#' @export -loadPrediction <- function(fileLocation){ - #TODO check inupts - prediction <- readRDS(file=fileLocation) - return(prediction) -} - -#' Saves the result from runPlp into the location directory -#' -#' @details -#' Saves the result from runPlp into the location directory -#' -#' @param result The result of running runPlp() -#' @param dirPath The directory to save the csv -#' -#' @export -savePlpResult <- function(result, dirPath){ - if (missing(result)) - stop("Must specify runPlp output") - if (missing(dirPath)) - stop("Must specify directory location") - #if (class(plpModel) != "plpModel") - # stop("Not a plpModel") - - if(!dir.exists(dirPath)) dir.create(dirPath, recursive = T) - - savePlpModel(result$model, dirPath=file.path(dirPath,'model') ) - saveRDS(result$analysisRef, file = file.path(dirPath, "analysisRef.rds")) - saveRDS(result$inputSetting, file = file.path(dirPath, "inputSetting.rds")) - saveRDS(result$executionSummary, file = file.path(dirPath, "executionSummary.rds")) - saveRDS(result$prediction, file = file.path(dirPath, "prediction.rds")) - saveRDS(result$performanceEvaluation, file = file.path(dirPath, "performanceEvaluation.rds")) - #saveRDS(result$performanceEvaluationTrain, file = file.path(dirPath, "performanceEvaluationTrain.rds")) - saveRDS(result$covariateSummary, file = file.path(dirPath, "covariateSummary.rds")) - - -} - -#' Loads the evalaution dataframe -#' -#' @details -#' Loads the evaluation -#' -#' @param dirPath The directory where the evaluation was saved -#' -#' @export -loadPlpResult <- 
function(dirPath){ - if (!file.exists(dirPath)) - stop(paste("Cannot find folder", dirPath)) - if (!file.info(dirPath)$isdir) - stop(paste("Not a folder", dirPath)) - - - result <- list(model = loadPlpModel(file.path(dirPath, "model")), - analysisRef = readRDS(file.path(dirPath, "analysisRef.rds")), - inputSetting = readRDS(file.path(dirPath, "inputSetting.rds")), - executionSummary = readRDS(file.path(dirPath, "executionSummary.rds")), - prediction = readRDS(file.path(dirPath, "prediction.rds")), - performanceEvaluation = readRDS(file.path(dirPath, "performanceEvaluation.rds")), - #performanceEvaluationTrain= readRDS(file.path(dirPath, "performanceEvaluationTrain.rds")), - covariateSummary = readRDS(file.path(dirPath, "covariateSummary.rds")) - ) - class(result) <- "runPlp" - - return(result) - -} - - -#result$inputSetting$dataExtrractionSettings$covariateSettings -formatCovariateSettings <- function(covariateSettings){ - - if(class(covariateSettings) == "covariateSettings"){ - return(list(cvs = data.frame(X = names(unlist(covariateSettings)), x= unlist(covariateSettings)), - fun = attr(covariateSettings,'fun'))) - - } else{ - return(list(cvs = do.call(rbind, lapply(1:length(covariateSettings), function(i){ - inds <- which(lapply(covariateSettings[[i]], class) == "function") - if(length(inds)>0){ - for(j in inds){ - covariateSettings[[i]][[j]] <- paste0(deparse(covariateSettings[[i]][[j]]), collapse = " ") - } - } - tempResult <- data.frame(names = names(unlist(covariateSettings[[i]])), - values = unlist(covariateSettings[[i]])) - tempResult$settingsId <- i - return(tempResult) - })), - fun = unlist(lapply(covariateSettings, function(x) attr(x,'fun'))) - ) - ) - } - -} - -reformatCovariateSettings <- function(covariateSettingsLocation){ - # adding this to stop warnings when files does not exist - if(!file.exists(covariateSettingsLocation)){ - return(NULL) - } - cs <- utils::read.csv(covariateSettingsLocation, stringsAsFactors=FALSE) - fun <- utils::read.csv(gsub('.csv','_fun.csv',covariateSettingsLocation), stringsAsFactors=FALSE) - - if(sum(colnames(cs)%in%c('X','x'))==2){ - covariateSettings <- cs$x - covariateSettings <- as.list(covariateSettings) - names(covariateSettings) <- cs$X - attr(covariateSettings,'fun') <- fun$x - class(covariateSettings) <- 'covariateSettings' - } else { - - covariateSettings <- list() - length(covariateSettings) <- max(cs$settingsId) - - for(i in 1:max(cs$settingsId)){ - covariateSettings[[i]] <- cs$values[cs$settingsId==i] - covariateSettings[[i]] <- as.list(covariateSettings[[i]]) - names(covariateSettings[[i]]) <- cs$names[cs$settingsId==i] - attr(covariateSettings[[i]],'fun') <- fun$x[i] - } - - } - -return(covariateSettings) -} - - -#' Save parts of the plp result as a csv for transparent sharing -#' -#' @details -#' Saves the main results as a csv (these files can be read by the shiny app) -#' -#' @param result An object of class runPlp with development or validation results -#' @param dirPath The directory the save the results as csv files -#' -#' @export -savePlpToCsv <- function(result, dirPath){ - - #inputSetting - if(!dir.exists(file.path(dirPath, 'inputSetting'))){dir.create(file.path(dirPath, 'inputSetting'), recursive = T)} - utils::write.csv(result$inputSetting$modelSettings$model, file = file.path(dirPath, 'inputSetting','modelSettings_model.csv'), row.names = F) - - if(!is.null(result$inputSetting$modelSettings$param)){ - utils::write.csv(as.data.frame(t(unlist(result$inputSetting$modelSettings$param))), file = file.path(dirPath, 
'inputSetting','modelSettings_param.csv'), row.names = F) - }else{ - utils::write.csv(NULL, file = file.path(dirPath, 'inputSetting','modelSettings_param.csv'), row.names = F) - } - utils::write.csv(result$inputSetting$modelSettings$name, file = file.path(dirPath, 'inputSetting','modelSettings_name.csv'), row.names = F) - if(!is.null(result$inputSetting$dataExtrractionSettings$covariateSettings)){ - utils::write.csv(formatCovariateSettings(result$inputSetting$dataExtrractionSettings$covariateSettings)$cvs, file = file.path(dirPath, 'inputSetting','dataExtrractionSettings_covariateSettings.csv'), row.names = F) - utils::write.csv(formatCovariateSettings(result$inputSetting$dataExtrractionSettings$covariateSettings)$fun, file = file.path(dirPath, 'inputSetting','dataExtrractionSettings_covariateSettings_fun.csv'), row.names = F) - } - utils::write.csv(result$inputSetting$populationSettings$attrition, file = file.path(dirPath, 'inputSetting','populationSettings_attrition.csv'), row.names = F) - result$inputSetting$populationSettings$attrition <- NULL - utils::write.csv(result$inputSetting$populationSettings, file = file.path(dirPath, 'inputSetting','populationSettings.csv'), row.names = F) - - #executionSummary - if(!dir.exists(file.path(dirPath, 'executionSummary'))){dir.create(file.path(dirPath, 'executionSummary'), recursive = T)} - utils::write.csv(result$executionSummary$PackageVersion, file = file.path(dirPath, 'executionSummary','PackageVersion.csv'), row.names = F) - utils::write.csv(unlist(result$executionSummary$PlatformDetails), file = file.path(dirPath, 'executionSummary','PlatformDetails.csv')) - utils::write.csv(result$executionSummary$TotalExecutionElapsedTime, file = file.path(dirPath, 'executionSummary','TotalExecutionElapsedTime.csv'), row.names = F) - utils::write.csv(result$executionSummary$ExecutionDateTime, file = file.path(dirPath, 'executionSummary','ExecutionDateTime.csv'), row.names = F) - - #performanceEvaluation - if(!dir.exists(file.path(dirPath, 'performanceEvaluation'))){dir.create(file.path(dirPath, 'performanceEvaluation'), recursive = T)} - utils::write.csv(result$performanceEvaluation$evaluationStatistics, file = file.path(dirPath, 'performanceEvaluation','evaluationStatistics.csv'), row.names = F) - utils::write.csv(result$performanceEvaluation$thresholdSummary, file = file.path(dirPath, 'performanceEvaluation','thresholdSummary.csv'), row.names = F) - utils::write.csv(result$performanceEvaluation$demographicSummary, file = file.path(dirPath, 'performanceEvaluation','demographicSummary.csv'), row.names = F) - utils::write.csv(result$performanceEvaluation$calibrationSummary, file = file.path(dirPath, 'performanceEvaluation','calibrationSummary.csv'), row.names = F) - utils::write.csv(result$performanceEvaluation$predictionDistribution, file = file.path(dirPath, 'performanceEvaluation','predictionDistribution.csv'), row.names = F) - - #covariateSummary - utils::write.csv(result$covariateSummary, file = file.path(dirPath,'covariateSummary.csv'), row.names = F) -} - -#' Loads parts of the plp result saved as csv files for transparent sharing -#' -#' @details -#' Load the main results from csv files into a runPlp object -#' -#' @param dirPath The directory with the results as csv files -#' -#' @export -loadPlpFromCsv <- function(dirPath){ - - result <- list() - objects <- gsub('.csv','',dir(dirPath)) - if(sum(!c('covariateSummary','executionSummary','inputSetting','performanceEvaluation')%in%objects)>0){ - stop('Incorrect csv results file') - } - - 
length(result) <- length(objects) - names(result) <- objects - - #covariateSummary - result$covariateSummary <- utils::read.csv(file = file.path(dirPath,'covariateSummary.csv')) - - #executionSummary - result$executionSummary <- list() - result$executionSummary$PackageVersion <- tryCatch({as.list(utils::read.csv(file = file.path(dirPath, 'executionSummary','PackageVersion.csv')))}, error = function(e){return(NULL)}) - result$executionSummary$PlatformDetails <- tryCatch({as.list(utils::read.csv(file = file.path(dirPath, 'executionSummary','PlatformDetails.csv'))$x)}, error = function(e){return(NULL)}) - names(result$executionSummary$PlatformDetails) <- tryCatch({utils::read.csv(file = file.path(dirPath, 'executionSummary','PlatformDetails.csv'))$X}, error = function(e){return(NULL)}) - result$executionSummary$TotalExecutionElapsedTime <- tryCatch({utils::read.csv(file = file.path(dirPath, 'executionSummary','TotalExecutionElapsedTime.csv'))$x}, error = function(e){return(NULL)}) - result$executionSummary$ExecutionDateTime <- tryCatch({utils::read.csv(file = file.path(dirPath, 'executionSummary','ExecutionDateTime.csv'))$x}, error = function(e){return(NULL)}) - - #inputSetting - result$inputSetting <- list() - result$inputSetting$modelSettings$model <- tryCatch({utils::read.csv(file = file.path(dirPath, 'inputSetting','modelSettings_model.csv'))$x}, error = function(e){return(NULL)}) - result$inputSetting$modelSettings$param <- tryCatch({as.list(utils::read.csv(file = file.path(dirPath, 'inputSetting','modelSettings_param.csv')))}, error = function(e){return(NULL)}) - result$inputSetting$modelSettings$name <- tryCatch({utils::read.csv(file = file.path(dirPath, 'inputSetting','modelSettings_name.csv'))$x}, error = function(e){return(NULL)}) - - result$inputSetting$dataExtrractionSettings$covariateSettings <- tryCatch({reformatCovariateSettings(file.path(dirPath, 'inputSetting','dataExtrractionSettings_covariateSettings.csv'))}, error = function(e){return(NULL)}) - - result$inputSetting$populationSettings <- tryCatch({as.list(utils::read.csv(file = file.path(dirPath, 'inputSetting','populationSettings.csv')))}, error = function(e){return(NULL)}) - result$inputSetting$populationSettings$attrition <- tryCatch({utils::read.csv(file = file.path(dirPath, 'inputSetting','populationSettings_attrition.csv'))}, error = function(e){return(NULL)}) - - #performanceEvaluation - result$performanceEvaluation <- list() - result$performanceEvaluation$evaluationStatistics <- tryCatch({utils::read.csv(file = file.path(dirPath, 'performanceEvaluation','evaluationStatistics.csv'))}, error = function(e){return(NULL)}) - result$performanceEvaluation$thresholdSummary <- tryCatch({utils::read.csv(file = file.path(dirPath, 'performanceEvaluation','thresholdSummary.csv'))}, error = function(e){return(NULL)}) - result$performanceEvaluation$demographicSummary <- tryCatch({utils::read.csv(file = file.path(dirPath, 'performanceEvaluation','demographicSummary.csv'))}, error = function(e){return(NULL)}) - result$performanceEvaluation$calibrationSummary <- tryCatch({utils::read.csv(file = file.path(dirPath, 'performanceEvaluation','calibrationSummary.csv'))}, error = function(e){return(NULL)}) - result$performanceEvaluation$predictionDistribution <- tryCatch({utils::read.csv(file = file.path(dirPath, 'performanceEvaluation','predictionDistribution.csv'))}, error = function(e){return(NULL)}) - - result$model$modelSettings <- result$inputSetting$modelSettings - result$model$populationSettings <- 
result$inputSetting$populationSettings - result$model$metaData$call$covariateSettings <- result$inputSetting$dataExtrractionSettings$covariateSettings - - class(result) <- "runPlp" - return(result) -} diff --git a/R/Transformer.R b/R/Transformer.R new file mode 100644 index 0000000..0ab7227 --- /dev/null +++ b/R/Transformer.R @@ -0,0 +1,303 @@ +#' create settings for training a non-temporal transformer +#' +#' @description A transformer model +#' @details from https://arxiv.org/abs/2106.11959 +#' +#' @param numBlocks number of transformer blocks +#' @param dimToken dimension of each token (embedding size) +#' @param dimOut dimension of output, usually 1 for binary problems +#' @param numHeads number of attention heads +#' @param attDropout dropout to use on attentions +#' @param ffnDropout dropout to use in feedforward block +#' @param resDropout dropout to use in residual connections +#' @param dimHidden dimension of the feedforward block +#' @param weightDecay weight decay to use +#' @param learningRate learning rate to use +#' @param batchSize batch size to use +#' @param epochs How many epochs to run the model for +#' @param device Which device to use, cpu or cuda +#' @param hyperParamSearch what kind of hyperparameter search to do, default 'random' +#' @param randomSamples How many samples to use in hyperparameter search if random +#' @param seed Random seed to use +#' +#' @export +setTransformer <- function(numBlocks = 3, dimToken = 96, dimOut = 1, + numHeads = 8, attDropout = 0.25, ffnDropout = 0.25, + resDropout = 0, dimHidden = 512, weightDecay = 1e-6, + learningRate = 3e-4, batchSize = 1024, + epochs = 10, device = "cpu", hyperParamSearch = "random", + randomSamples = 100, seed = NULL) { + if (is.null(seed)) { + seed <- as.integer(sample(1e5, 1)) + } + + if (dimToken %% numHeads != 0) { + stop(paste( + "dimToken needs to be divisible by numHeads. 
dimToken =", dimToken, + "is not divisible by numHeads =", numHeads + )) + } + + paramGrid <- list( + numBlocks = numBlocks, + dimToken = dimToken, + dimOut = dimOut, + numHeads = numHeads, + dimHidden = dimHidden, + attDropout = attDropout, + ffnDropout = ffnDropout, + resDropout = resDropout, + weightDecay = weightDecay, + learningRate = learningRate, + seed = list(as.integer(seed[[1]])) + ) + + param <- PatientLevelPrediction::listCartesian(paramGrid) + + if (hyperParamSearch == "random") { + param <- param[sample(length(param), randomSamples)] + } + + attr(param, "settings") <- list( + seed = seed[1], + device = device, + batchSize = batchSize, + epochs = epochs, + name = "Transformer", + saveType = "file", + modelParamNames = c( + "numBlocks", "dimToken", "dimOut", "numHeads", + "attDropout", "ffnDropout", "resDropout", "dimHidden" + ), + baseModel = "Transformer" + ) + + results <- list( + fitFunction = "fitEstimator", + param = param + ) + + class(results) <- "modelSettings" + return(results) +} + + +Transformer <- torch::nn_module( + name = "Transformer", + initialize = function(catFeatures, numFeatures, numBlocks, dimToken, dimOut = 1, + numHeads, attDropout, ffnDropout, resDropout, + headActivation = torch::nn_relu, + activation = NULL, + ffnNorm = torch::nn_layer_norm, + headNorm = torch::nn_layer_norm, + attNorm = torch::nn_layer_norm, + dimHidden) { + activation <- nn_reglu + self$Categoricalembedding <- Embedding(catFeatures + 1, dimToken) # + 1 for padding idx + self$numericalEmbedding <- numericalEmbedding(numFeatures, dimToken) + self$classToken <- ClassToken(dimToken) + + self$layers <- torch::nn_module_list(lapply( + 1:numBlocks, + function(x) { + layer <- torch::nn_module_list() + layer$add_module("attention", torch::nn_multihead_attention(dimToken, numHeads, + dropout = attDropout, + bias = TRUE + )) + layer$add_module("ffn", FeedForwardBlock(dimToken, dimHidden, + biasFirst = TRUE, + biasSecond = TRUE, + dropout = ffnDropout, + activation = activation + )) + layer$add_module("attentionResDropout", torch::nn_dropout(resDropout)) + layer$add_module("ffnResDropout", torch::nn_dropout(resDropout)) + layer$add_module("ffnNorm", ffnNorm(dimToken)) + + if (x != 1) { + layer$add_module("attentionNorm", attNorm(dimToken)) + } + return(layer) + } + )) + self$head <- Head(dimToken, + bias = TRUE, activation = headActivation, + headNorm, dimOut + ) + }, + forward = function(x) { + mask <- torch::torch_where(x$cat == 0, TRUE, FALSE) + input <- x + cat <- self$Categoricalembedding(x$cat) + if (!is.null(input$num)) { + num <- self$numericalEmbedding(input$num) + x <- torch::torch_cat(list(cat, num), dim = 2L) + mask <- torch::torch_cat(list(mask, torch::torch_zeros(c( + x$shape[1], + num$shape[2] + ), + device = mask$device, + dtype = mask$dtype + )), + dim = 2L + ) + } else { + x <- cat + } + x <- self$classToken(x) + mask <- torch::torch_cat(list(mask, torch::torch_zeros(c(x$shape[1], 1), + device = mask$device, + dtype = mask$dtype + )), + dim = 2L + ) + for (i in 1:length(self$layers)) { + layer <- self$layers[[i]] + xResidual <- self$startResidual(layer, "attention", x) + + if (i == length(self$layers)) { + dims <- xResidual$shape + # in final layer take only attention on CLS token + xResidual <- layer$attention( + xResidual[, -1]$view(c(dims[1], 1, dims[3]))$transpose(1, 2), + xResidual$transpose(1, 2), + xResidual$transpose(1, 2), mask + ) + attnWeights <- xResidual[[2]] + xResidual <- xResidual[[1]] + x <- x[, -1]$view(c(dims[1], 1, dims[3])) + } else { + # attention input 
is seq_length x batch_size x embedding_dim + xResidual <- layer$attention( + xResidual$transpose(1, 2), + xResidual$transpose(1, 2), + xResidual$transpose(1, 2), + mask, + )[[1]] + } + x <- self$endResidual(layer, "attention", x, xResidual$transpose(1, 2)) + + xResidual <- self$startResidual(layer, "ffn", x) + xResidual <- layer$ffn(xResidual) + x <- self$endResidual(layer, "ffn", x, xResidual) + } + x <- self$head(x)[, 1] # remove singleton dimension + return(x) + }, + startResidual = function(layer, stage, x) { + xResidual <- x + normKey <- paste0(stage, "Norm") + if (normKey %in% names(as.list(layer))) { + xResidual <- layer[[normKey]](xResidual) + } + return(xResidual) + }, + endResidual = function(layer, stage, x, xResidual) { + dropoutKey <- paste0(stage, "ResDropout") + xResidual <- layer[[dropoutKey]](xResidual) + x <- x + xResidual + return(x) + } +) + + +FeedForwardBlock <- torch::nn_module( + name = "FeedForwardBlock", + initialize = function(dimToken, dimHidden, biasFirst, biasSecond, + dropout, activation) { + self$linearFirst <- torch::nn_linear(dimToken, dimHidden * 2, biasFirst) + self$activation <- activation() + self$dropout <- torch::nn_dropout(dropout) + self$linearSecond <- torch::nn_linear(dimHidden, dimToken, biasSecond) + }, + forward = function(x) { + x <- self$linearFirst(x) + x <- self$activation(x) + x <- self$dropout(x) + x <- self$linearSecond(x) + return(x) + } +) + +Head <- torch::nn_module( + name = "Head", + initialize = function(dimIn, bias, activation, normalization, dimOut) { + self$normalization <- normalization(dimIn) + self$activation <- activation() + self$linear <- torch::nn_linear(dimIn, dimOut, bias) + }, + forward = function(x) { + x <- x[, -1] # ? + x <- self$normalization(x) + x <- self$activation(x) + x <- self$linear(x) + return(x) + } +) + +Embedding <- torch::nn_module( + name = "Embedding", + initialize = function(numEmbeddings, embeddingDim) { + self$embedding <- torch::nn_embedding(numEmbeddings, embeddingDim, padding_idx = 1) + }, + forward = function(x_cat) { + x <- self$embedding(x_cat + 1L) # padding idx is 1L + return(x) + } +) + +numericalEmbedding <- torch::nn_module( + name = "numericalEmbedding", + initialize = function(numEmbeddings, embeddingDim, bias = TRUE) { + self$weight <- torch::nn_parameter(torch::torch_empty(numEmbeddings, embeddingDim)) + if (bias) { + self$bias <- torch::nn_parameter(torch::torch_empty(numEmbeddings, embeddingDim)) + } else { + self$bias <- NULL + } + + for (parameter in list(self$weight, self$bias)) { + if (!is.null(parameter)) { + torch::nn_init_kaiming_uniform_(parameter, a = sqrt(5)) + } + } + }, + forward = function(x) { + x <- self$weight$unsqueeze(1) * x$unsqueeze(-1) + if (!is.null(self$bias)) { + x <- x + self$bias$unsqueeze(1) + } + return(x) + } +) + +# adds a class token embedding to embeddings +ClassToken <- torch::nn_module( + name = "ClassToken", + initialize = function(dimToken) { + self$weight <- torch::nn_parameter(torch::torch_empty(dimToken, 1)) + torch::nn_init_kaiming_uniform_(self$weight, a = sqrt(5)) + }, + expand = function(dims) { + newDims <- vector("integer", length(dims) - 1) + 1 + return(self$weight$view(c(newDims, -1))$expand(c(dims, -1))) + }, + forward = function(x) { + return(torch::torch_cat(c(x, self$expand(c(dim(x)[[1]], 1))), dim = 2)) + } +) + +nn_reglu <- torch::nn_module( + name = "ReGlu", + forward = function(x) { + return(reglu(x)) + } +) + + +reglu <- function(x) { + chunks <- x$chunk(2, dim = -1) + + return(chunks[[1]] * torch::nnf_relu(chunks[[2]])) +} 
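To make the ReGLU activation defined above concrete, here is a small illustrative check (assuming the torch R package is installed; this snippet is not part of the package code):

# reglu() splits the last dimension in half and gates the first half
# with the ReLU of the second half, so the output is half as wide.
x <- torch::torch_randn(2, 4)
out <- reglu(x)   # reglu() as defined in Transformer.R above
out$shape         # expected: 2 x 2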
diff --git a/README.md b/README.md index 21f8d67..ad0e3c6 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ DeepPatientLevelPrediction ====================== -[![Build Status](https://github.com/OHDSI/DeepPatientLevelPrediction/workflows/R-CMD-check/badge.svg)](https://github.com/OHDSI/DeepPatientLevelPrediction/actions?query=workflow%3AR-CMD-check) -[![codecov.io](https://codecov.io/github/OHDSI/DeepPatientLevelPrediction/coverage.svg?branch=master)](https://codecov.io/github/OHDSI/DeepPatientLevelPrediction?branch=master) +[![Build Status](https://github.com/OHDSI/DeepPatientLevelPrediction/workflows/R-CMD-check/badge.svg)](https://github.com/OHDSI/DeepPatientLevelPrediction/actions?query=workflow%3AR-CMD-check?branch=develop) +[![codecov.io](https://codecov.io/github/OHDSI/DeepPatientLevelPrediction/coverage.svg?branch=master)](https://codecov.io/github/OHDSI/DeepPatientLevelPrediction?branch=main) Introduction @@ -15,25 +15,27 @@ Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. [Design and implementati Features ======== -- add - +- Adds deep learning models to use in the OHDSI PatientLevelPrediction framework. +- Allows to add custom deep learning models. +- Includes an MLP, ResNet and a Transformer +- Allows to use all the features of [PatientLevelPrediction](https://github.com/OHDSI/PatientLevelPrediction/) to validate and explore your model performance. Technology ========== -DeepPatientLevelPrediction is an R package, with some functions implemented in C++ and python. +DeepPatientLevelPrediction is an R package. It uses [torch in R](https://torch.mlverse.org/) to build deep learning models without using python. System Requirements =================== -Requires R (version 3.3.0 or higher). Installation on Windows requires [RTools](http://cran.r-project.org/bin/windows/Rtools/). Libraries used in DeepPatientLevelPrediction require Java and Python. +Requires R (version 3.5.0 or higher). Installation on Windows requires [RTools](http://cran.r-project.org/bin/windows/Rtools/). For training deep learning models in most cases an nvidia GPU is required using either Windows or Linux. -The python installation is required for some of the machine learning algorithms. We advise to -install Python 3.7 using Anaconda (https://www.continuum.io/downloads). Getting Started =============== -- add +- To install the package please read the [Package installation guide]() +- Please read the main vignette for the package: +[Building Deep Learning Models](https://ohdsi.github.io/DeepPatientLevelPrediction/articles/BuildingDeepModels.html) User Documentation ================== diff --git a/_pkgdown.yml b/_pkgdown.yml new file mode 100644 index 0000000..a552c95 --- /dev/null +++ b/_pkgdown.yml @@ -0,0 +1,33 @@ +template: + params: + bootswatch: cosmo + +home: + links: + - text: Ask a question + href: http://forums.ohdsi.org + +navbar: + structure: + left: + - home + - intro + - reference + - articles + right: [hades, github] + components: + home: + icon: fa-home fa-lg + href: index.html + reference: + text: Reference + href: reference/index.html + intro: + text: Get started + href: articles/Installing.html + github: + icon: fa-github fa-lg + href: https://github.com/OHDSI/DeepPatientLevelPrediction + hades: + text: hadesLogo + href: https://ohdsi.github.io/Hades diff --git a/docs/404.html b/docs/404.html new file mode 100644 index 0000000..491064f --- /dev/null +++ b/docs/404.html @@ -0,0 +1,127 @@ + + + + + + + +Page not found (404) • DeepPatientLevelPrediction + + + + + + + + + + + +
+
+ + + + +
+
+ + +Content not found. Please use links in the navbar. + +
+ + + +
+ + + +
+ +
+

+

Site built with pkgdown 2.0.5.

+
+ +
+
+ + + + + + + + diff --git a/docs/articles/BuildingDeepModels.html b/docs/articles/BuildingDeepModels.html new file mode 100644 index 0000000..70c805a --- /dev/null +++ b/docs/articles/BuildingDeepModels.html @@ -0,0 +1,378 @@ + + + + + + + +Building Deep Learning Models • DeepPatientLevelPrediction + + + + + + + + + + + + +
+
+ + + + +
+
+ + + + + +
+

Introduction +

+
+

DeepPatientLevelPrediction +

+

Patient level prediction aims to use historic data to learn a function between an input (a patient’s features such as age/gender/comorbidities at index) and an output (whether the patient experienced an outcome during some time-at-risk). Deep learning is an example of the current state-of-the-art classifiers that can be implemented to learn the function between inputs and outputs.

+

Deep Learning models are widely used to automatically learn high-level feature representations from the data, and have achieved remarkable results in image processing, speech recognition and computational biology. Recently, interesting results have been shown using large observational healthcare data (e.g., electronic healthcare data or claims data), but more extensive research is needed to assess the power of Deep Learning in this domain.

+

This vignette describes how you can use the Observational Health Data Sciences and Informatics (OHDSI) PatientLevelPrediction package and DeepPatientLevelPrediction package to build Deep Learning models. This vignette assumes you have read and are comfortable with building patient level prediction models as described in the BuildingPredictiveModels vignette. Furthermore, this vignette assumes you are familiar with Deep Learning methods.

+
+
+

Background +

+

Deep Learning models are built by stacking an often large number of neural network layers that perform feature engineering steps, e.g. embedding, and are collapsed into a final softmax layer (basically a logistic regression layer). These algorithms need a lot of data to converge to a good representation, but the sizes of large observational healthcare databases are growing fast, which makes Deep Learning an interesting approach to test within OHDSI’s Patient-Level Prediction Framework. The current implementation allows us to perform research at scale on the value and limitations of Deep Learning using observational healthcare data.
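To make the "basically a logistic regression layer" remark concrete, for a binary outcome the final layer can be written as (illustrative notation, not taken from the package code):

p(outcome | patient) = 1 / (1 + exp(-(w · h + b)))

where h is the representation produced by the preceding layers; in other words, a logistic regression fitted on learned features rather than on the raw covariates.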

+

In the package we have used torch and tabnet but we invite the community to add other backends.

+

Many network architectures have recently been proposed, and we have implemented a number of them; this list will grow in the near future. It is important to understand that some of these architectures require a 2D data matrix, i.e. |patient|x|feature|, and others use a 3D data matrix |patient|x|feature|x|time|. The FeatureExtraction Package has been extended to enable the extraction of both data formats, as will be described with examples below.

+

Note that training Deep Learning models is computationally intensive; our implementation therefore supports both GPU and CPU. It will automatically check whether a GPU is available on your computer. A GPU is highly recommended for Deep Learning!

+
+
+

Requirements +

+

Full details about the package requirements and instructions on installing the package can be found here.

+
+
+

Integration with PatientLevelPrediction +

+

The DeepPatientLevelPrediction package provides additional model settings that can be used within the PatientLevelPrediction package's runPlp() function. To use both packages you first need to pick the deep learning architecture you wish to fit (see below) and then specify this as the modelSettings inside runPlp().

+
+# load the data
+plpData <- PatientLevelPrediction::loadPlpData('locationOfData')
+
+# pick the set<Model> from  DeepPatientLevelPrediction
+deepLearningModel <- DeepPatientLevelPrediction::setResNet()
+
+# use PatientLevelPrediction to fit model
+deepLearningResult <- PatientLevelPrediction::runPlp(
+    plpData = plpData, 
+    outcomeId = 1230, 
+    modelSettings = deepLearningModel,
+    analysisId = 'resNetTorch', 
+    ...
+  )
+
+
+
+

Non-Temporal Architectures +

+

We implemented the following non-temporal (2D data matrix) architectures:

+
+

Simple MLP +

+
+

Overall concept +

+

A multilayer perceptron (MLP) model is a directed graph consisting of an input layer, one or more hidden layers and an output layer. The model takes in the input feature values and feeds these forward through the graph to determine the output class. A process known as ‘backpropagation’ is used to train the model. Backpropagation requires labelled data and involves iteratively calculating the error between the MLP model’s predictions and ground truth to learn how to adjust the model.

+
+
+

Example +

+
+
Set Function +
+

To use the package to fit an MLP model you can use the setDeepNNTorch() function to specify the hyper-parameter settings for the MLP.

+
+
+
Inputs +
+

The units input defines the network topology via the number of nodes per layer in the network’s hidden layers. A list of different topologies can be investigated; for example, list(c(10,63), 128) means two different topologies will be fit: the first has two hidden layers with 10 nodes in the first hidden layer and 63 in the second hidden layer, and the second has just one hidden layer with 128 nodes.

+

The layer_dropout input specifies the probability that a layer randomly sets input units to 0 at each step during training time. A value of 0.2 means that 20% of the time the layer input units will be set to 0. This is used to reduce overfitting.

+

The lr input is the learning rate, which is a hyperparameter that controls how much to change the model in response to the estimated error each time the model weights are updated. The smaller the lr, the longer it will take to fit the model and the weights may get stuck in a poor solution; if the lr is too large, the weights may converge too quickly to a sub-optimal solution.

+

The decay input corresponds to the weight decay in the objective function. During model fitting the aim is to minimize the objective function. The objective function is made up of the prediction error (the difference between the prediction and the truth) plus the square of the weights multiplied by the weight decay. The larger the weight decay, the more you penalize having large weights. If you set the weight decay too large, the model will never fit well enough; if you set it too low, you need to be careful of overfitting (so try to stop model fitting earlier).
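Schematically (illustrative notation, not taken from the package code):

objective(weights) = predictionError + decay * sum(weights^2)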

+

The outcome_weight specifies whether to add more weight to misclassifying one class (e.g., with outcome during TAR) vs the other (e.g., without outcome during TAR). This can be useful if there is imbalance between the classes (e.g., the outcome rarely occurs during TAR).

+

The batch_size corresponds to the number of data points (patients) used per iteration to estimate the network error during model fitting.

+

The epochs input corresponds to how many times to run through the entire training data while fitting the model.

+

The seed lets the user reproduce the same network given the same training data and hyper-parameter settings if they use the same seed.

+
+
+
Example Code +
+

For example, the following code will try two different network topologies and pick the topology that obtains the greatest AUROC via cross validation in the training data and then fit the model with that topology using all the training data. The standard output of runPlp() will be returned - this contains the MLP model along with the performance details and settings.

+
+#singleLayerNN(inputN = 10, layer1 = 100, outputN = 2, layer_dropout = 0.1)
+deepset <- setDeepNNTorch(
+  units = list(c(10,63), 128), 
+  layer_dropout = c(0.2),
+  lr = c(1e-4), 
+  decay = c(1e-5), 
+  outcome_weight = c(1.0), 
+  batch_size = c(100), 
+  epochs = c(5),  
+  seed = 12  
+  )
+
+mlpResult <- PatientLevelPrediction::runPlp(
+    plpData = plpData, 
+    outcomeId = 3, 
+    modelSettings = deepset,
+    analysisId = 'DeepNNTorch', 
+    analysisName = 'Testing Deep Learning', 
+    populationSettings = populationSet, 
+    splitSettings = PatientLevelPrediction::createDefaultSplitSetting(), 
+    sampleSettings = PatientLevelPrediction::createSampleSettings(),  # none 
+    featureEngineeringSettings = PatientLevelPrediction::createFeatureEngineeringSettings(), # none 
+    preprocessSettings = PatientLevelPrediction::createPreprocessSettings(), 
+    executeSettings = PatientLevelPrediction::createExecuteSettings(
+      runSplitData = T, 
+      runSampleData = F, 
+      runfeatureEngineering = F, 
+      runPreprocessData = T, 
+      runModelDevelopment = T, 
+      runCovariateSummary = F
+    ), 
+    saveDirectory = file.path(testLoc, 'DeepNNTorch')
+  )
+
+
+
+
+

ResNet +

+
+

Overall concept +

+

Deep learning models are often trained via a process known as gradient descent during backpropagation. During this process the network weights are updated based on the gradient of the error function for the current weights. However, as the number of layers in the network increases, there is a greater chance of experiencing an issue known as the vanishing or exploding gradient during this process. The vanishing or exploding gradient is when the gradient goes to 0 or infinity, which negatively impacts the model fitting.

+

The residual network (ResNet) was introduced to address the vanishing or exploding gradient issue. It works by adding connections between non-adjacent layers, termed ‘skip connections’. Using some form of regularization with these ‘skip connections’ enables the network to ignore any problematic layer caused by gradient issues.
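Schematically, a residual block computes output = input + F(input) (illustrative notation): even if the learned transformation F contributes little, the identity path preserves the signal and keeps gradients flowing through deep networks.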

+
+
+

Example +

+
+
Set Function +
+

To use the package to fit a ResNet model you can use the setResNet() function to specify the hyper-parameter settings for the network.

+
+
+
Inputs +
+

[add info about each input here]

+
+
+
Example Code +
+

For example, the following code will …

+
+resset <- setResNet(
+  numLayers = c(2), 
+  sizeHidden = c(32),
+  hiddenFactor = c(2),
+  residualDropout = c(0.1), 
+  hiddenDropout = c(0.1),
+  normalization = c('BatchNorm'), 
+  activation = c('RelU'),
+  sizeEmbedding = c(32), 
+  weightDecay = c(1e-6),
+  learningRate = c(3e-4), 
+  seed = 42, 
+  hyperParamSearch = 'random',
+  randomSample = 1, 
+  #device='cuda:0', 
+  batchSize = 128, 
+  epochs = 3
+)
+
+resResult <- PatientLevelPrediction::runPlp(
+    plpData = plpData, 
+    outcomeId = 3, 
+    modelSettings = resset,
+    analysisId = 'ResNet', 
+    analysisName = 'Testing ResNet', 
+    populationSettings = populationSet, 
+    splitSettings = PatientLevelPrediction::createDefaultSplitSetting(), 
+    sampleSettings = PatientLevelPrediction::createSampleSettings(),  # none 
+    featureEngineeringSettings = PatientLevelPrediction::createFeatureEngineeringSettings(), # none 
+    preprocessSettings = PatientLevelPrediction::createPreprocessSettings(), 
+    executeSettings = PatientLevelPrediction::createExecuteSettings(
+      runSplitData = T, 
+      runSampleData = F, 
+      runfeatureEngineering = F, 
+      runPreprocessData = T, 
+      runModelDevelopment = T, 
+      runCovariateSummary = F
+    ), 
+    saveDirectory = file.path(testLoc, 'ResNet')
+  )
+
+
+
+
+

TabNet +

+
+

Overall concept +

+
+
+

Examples +

+
+
+
+

Transformer +

+
+

Overall concept +

+
+
+

Examples +

+
+
+
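Although this section is still to be written, the sketch below shows how a transformer could be specified and passed to runPlp(); the values simply mirror the setTransformer() defaults (randomSamples = 1 is used because only one hyper-parameter combination is listed) and are illustrative rather than tuned recommendations.

+transformer <- DeepPatientLevelPrediction::setTransformer(
+  numBlocks = 3, 
+  dimToken = 96, 
+  numHeads = 8, 
+  dimHidden = 512, 
+  batchSize = 1024, 
+  epochs = 10, 
+  device = 'cpu', 
+  randomSamples = 1
+)
+
+transformerResult <- PatientLevelPrediction::runPlp(
+    plpData = plpData, 
+    outcomeId = 3, 
+    modelSettings = transformer,
+    analysisId = 'Transformer', 
+    ...
+  )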
+
+

Acknowledgments +

+

Considerable work has been dedicated to providing the DeepPatientLevelPrediction package.

+
+citation("DeepPatientLevelPrediction")
+
## 
+## To cite package 'DeepPatientLevelPrediction' in publications use:
+## 
+##   Reps J, Fridgeirsson E, Chan You S, Kim C, John H (2021).
+##   _DeepPatientLevelPrediction: Deep learning function for patient level
+##   prediction using data in the OMOP Common Data Model_.
+##   https://ohdsi.github.io/PatientLevelPrediction,
+##   https://github.com/OHDSI/DeepPatientLevelPrediction.
+## 
+## A BibTeX entry for LaTeX users is
+## 
+##   @Manual{,
+##     title = {DeepPatientLevelPrediction: Deep learning function for patient level prediction using data in the OMOP Common Data Model},
+##     author = {Jenna Reps and Egill Fridgeirsson and Seng {Chan You} and Chungsoo Kim and Henrik John},
+##     year = {2021},
+##     note = {https://ohdsi.github.io/PatientLevelPrediction, https://github.com/OHDSI/DeepPatientLevelPrediction},
+##   }
+

Please reference this paper if you use the PLP Package in your work:

+

Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data. J Am Med Inform Assoc. 2018;25(8):969-975.

+
+
+ + + +
+ + + +
+ +
+

+

Site built with pkgdown 2.0.5.

+
+ +
+
+ + + + + + + + diff --git a/docs/articles/index.html b/docs/articles/index.html new file mode 100644 index 0000000..06f1bc3 --- /dev/null +++ b/docs/articles/index.html @@ -0,0 +1,100 @@ + +Articles • DeepPatientLevelPrediction + + +
+
+ + + +
+
+ + + +
+
+ + +
+ +
+

Site built with pkgdown 2.0.5.

+
+ +
+ + + + + + + + diff --git a/docs/authors.html b/docs/authors.html new file mode 100644 index 0000000..b203b7f --- /dev/null +++ b/docs/authors.html @@ -0,0 +1,135 @@ + +Authors and Citation • DeepPatientLevelPrediction + + +
+
+ + + +
+
+
+ + + +
  • +

    Jenna Reps. Author. +

    +
  • +
  • +

    Egill Fridgeirsson. Author, maintainer. +

    +
  • +
  • +

    Seng Chan You. Author. +

    +
  • +
  • +

    Chungsoo Kim. Author. +

    +
  • +
  • +

    Henrik John. Author. +

    +
  • +
+
+
+

Citation

+ Source: DESCRIPTION +
+
+ + +

Reps J, Fridgeirsson E, Chan You S, Kim C, John H (2022). +DeepPatientLevelPrediction: Deep learning function for patient level prediction using data in the OMOP Common Data Model. +https://ohdsi.github.io/PatientLevelPrediction, https://github.com/OHDSI/DeepPatientLevelPrediction. +

+
@Manual{,
+  title = {DeepPatientLevelPrediction: Deep learning function for patient level prediction using data in the OMOP Common Data Model},
+  author = {Jenna Reps and Egill Fridgeirsson and Seng {Chan You} and Chungsoo Kim and Henrik John},
+  year = {2022},
+  note = {https://ohdsi.github.io/PatientLevelPrediction, https://github.com/OHDSI/DeepPatientLevelPrediction},
+}
+ +
+ +
+ + + +
+ +
+

Site built with pkgdown 2.0.5.

+
+ +
+ + + + + + + + diff --git a/docs/bootstrap-toc.css b/docs/bootstrap-toc.css new file mode 100644 index 0000000..5a85941 --- /dev/null +++ b/docs/bootstrap-toc.css @@ -0,0 +1,60 @@ +/*! + * Bootstrap Table of Contents v0.4.1 (http://afeld.github.io/bootstrap-toc/) + * Copyright 2015 Aidan Feldman + * Licensed under MIT (https://github.com/afeld/bootstrap-toc/blob/gh-pages/LICENSE.md) */ + +/* modified from https://github.com/twbs/bootstrap/blob/94b4076dd2efba9af71f0b18d4ee4b163aa9e0dd/docs/assets/css/src/docs.css#L548-L601 */ + +/* All levels of nav */ +nav[data-toggle='toc'] .nav > li > a { + display: block; + padding: 4px 20px; + font-size: 13px; + font-weight: 500; + color: #767676; +} +nav[data-toggle='toc'] .nav > li > a:hover, +nav[data-toggle='toc'] .nav > li > a:focus { + padding-left: 19px; + color: #563d7c; + text-decoration: none; + background-color: transparent; + border-left: 1px solid #563d7c; +} +nav[data-toggle='toc'] .nav > .active > a, +nav[data-toggle='toc'] .nav > .active:hover > a, +nav[data-toggle='toc'] .nav > .active:focus > a { + padding-left: 18px; + font-weight: bold; + color: #563d7c; + background-color: transparent; + border-left: 2px solid #563d7c; +} + +/* Nav: second level (shown on .active) */ +nav[data-toggle='toc'] .nav .nav { + display: none; /* Hide by default, but at >768px, show it */ + padding-bottom: 10px; +} +nav[data-toggle='toc'] .nav .nav > li > a { + padding-top: 1px; + padding-bottom: 1px; + padding-left: 30px; + font-size: 12px; + font-weight: normal; +} +nav[data-toggle='toc'] .nav .nav > li > a:hover, +nav[data-toggle='toc'] .nav .nav > li > a:focus { + padding-left: 29px; +} +nav[data-toggle='toc'] .nav .nav > .active > a, +nav[data-toggle='toc'] .nav .nav > .active:hover > a, +nav[data-toggle='toc'] .nav .nav > .active:focus > a { + padding-left: 28px; + font-weight: 500; +} + +/* from https://github.com/twbs/bootstrap/blob/e38f066d8c203c3e032da0ff23cd2d6098ee2dd6/docs/assets/css/src/docs.css#L631-L634 */ +nav[data-toggle='toc'] .nav > .active > ul { + display: block; +} diff --git a/docs/bootstrap-toc.js b/docs/bootstrap-toc.js new file mode 100644 index 0000000..1cdd573 --- /dev/null +++ b/docs/bootstrap-toc.js @@ -0,0 +1,159 @@ +/*! 
+ * Bootstrap Table of Contents v0.4.1 (http://afeld.github.io/bootstrap-toc/) + * Copyright 2015 Aidan Feldman + * Licensed under MIT (https://github.com/afeld/bootstrap-toc/blob/gh-pages/LICENSE.md) */ +(function() { + 'use strict'; + + window.Toc = { + helpers: { + // return all matching elements in the set, or their descendants + findOrFilter: function($el, selector) { + // http://danielnouri.org/notes/2011/03/14/a-jquery-find-that-also-finds-the-root-element/ + // http://stackoverflow.com/a/12731439/358804 + var $descendants = $el.find(selector); + return $el.filter(selector).add($descendants).filter(':not([data-toc-skip])'); + }, + + generateUniqueIdBase: function(el) { + var text = $(el).text(); + var anchor = text.trim().toLowerCase().replace(/[^A-Za-z0-9]+/g, '-'); + return anchor || el.tagName.toLowerCase(); + }, + + generateUniqueId: function(el) { + var anchorBase = this.generateUniqueIdBase(el); + for (var i = 0; ; i++) { + var anchor = anchorBase; + if (i > 0) { + // add suffix + anchor += '-' + i; + } + // check if ID already exists + if (!document.getElementById(anchor)) { + return anchor; + } + } + }, + + generateAnchor: function(el) { + if (el.id) { + return el.id; + } else { + var anchor = this.generateUniqueId(el); + el.id = anchor; + return anchor; + } + }, + + createNavList: function() { + return $(''); + }, + + createChildNavList: function($parent) { + var $childList = this.createNavList(); + $parent.append($childList); + return $childList; + }, + + generateNavEl: function(anchor, text) { + var $a = $(''); + $a.attr('href', '#' + anchor); + $a.text(text); + var $li = $('
  • '); + $li.append($a); + return $li; + }, + + generateNavItem: function(headingEl) { + var anchor = this.generateAnchor(headingEl); + var $heading = $(headingEl); + var text = $heading.data('toc-text') || $heading.text(); + return this.generateNavEl(anchor, text); + }, + + // Find the first heading level (`

    `, then `

    `, etc.) that has more than one element. Defaults to 1 (for `

    `). + getTopLevel: function($scope) { + for (var i = 1; i <= 6; i++) { + var $headings = this.findOrFilter($scope, 'h' + i); + if ($headings.length > 1) { + return i; + } + } + + return 1; + }, + + // returns the elements for the top level, and the next below it + getHeadings: function($scope, topLevel) { + var topSelector = 'h' + topLevel; + + var secondaryLevel = topLevel + 1; + var secondarySelector = 'h' + secondaryLevel; + + return this.findOrFilter($scope, topSelector + ',' + secondarySelector); + }, + + getNavLevel: function(el) { + return parseInt(el.tagName.charAt(1), 10); + }, + + populateNav: function($topContext, topLevel, $headings) { + var $context = $topContext; + var $prevNav; + + var helpers = this; + $headings.each(function(i, el) { + var $newNav = helpers.generateNavItem(el); + var navLevel = helpers.getNavLevel(el); + + // determine the proper $context + if (navLevel === topLevel) { + // use top level + $context = $topContext; + } else if ($prevNav && $context === $topContext) { + // create a new level of the tree and switch to it + $context = helpers.createChildNavList($prevNav); + } // else use the current $context + + $context.append($newNav); + + $prevNav = $newNav; + }); + }, + + parseOps: function(arg) { + var opts; + if (arg.jquery) { + opts = { + $nav: arg + }; + } else { + opts = arg; + } + opts.$scope = opts.$scope || $(document.body); + return opts; + } + }, + + // accepts a jQuery object, or an options object + init: function(opts) { + opts = this.helpers.parseOps(opts); + + // ensure that the data attribute is in place for styling + opts.$nav.attr('data-toggle', 'toc'); + + var $topContext = this.helpers.createChildNavList(opts.$nav); + var topLevel = this.helpers.getTopLevel(opts.$scope); + var $headings = this.helpers.getHeadings(opts.$scope, topLevel); + this.helpers.populateNav($topContext, topLevel, $headings); + } + }; + + $(function() { + $('nav[data-toggle="toc"]').each(function(i, el) { + var $nav = $(el); + Toc.init($nav); + }); + }); +})(); diff --git a/docs/docsearch.css b/docs/docsearch.css new file mode 100644 index 0000000..e5f1fe1 --- /dev/null +++ b/docs/docsearch.css @@ -0,0 +1,148 @@ +/* Docsearch -------------------------------------------------------------- */ +/* + Source: https://github.com/algolia/docsearch/ + License: MIT +*/ + +.algolia-autocomplete { + display: block; + -webkit-box-flex: 1; + -ms-flex: 1; + flex: 1 +} + +.algolia-autocomplete .ds-dropdown-menu { + width: 100%; + min-width: none; + max-width: none; + padding: .75rem 0; + background-color: #fff; + background-clip: padding-box; + border: 1px solid rgba(0, 0, 0, .1); + box-shadow: 0 .5rem 1rem rgba(0, 0, 0, .175); +} + +@media (min-width:768px) { + .algolia-autocomplete .ds-dropdown-menu { + width: 175% + } +} + +.algolia-autocomplete .ds-dropdown-menu::before { + display: none +} + +.algolia-autocomplete .ds-dropdown-menu [class^=ds-dataset-] { + padding: 0; + background-color: rgb(255,255,255); + border: 0; + max-height: 80vh; +} + +.algolia-autocomplete .ds-dropdown-menu .ds-suggestions { + margin-top: 0 +} + +.algolia-autocomplete .algolia-docsearch-suggestion { + padding: 0; + overflow: visible +} + +.algolia-autocomplete .algolia-docsearch-suggestion--category-header { + padding: .125rem 1rem; + margin-top: 0; + font-size: 1.3em; + font-weight: 500; + color: #00008B; + border-bottom: 0 +} + +.algolia-autocomplete .algolia-docsearch-suggestion--wrapper { + float: none; + padding-top: 0 +} + +.algolia-autocomplete 
.algolia-docsearch-suggestion--subcategory-column { + float: none; + width: auto; + padding: 0; + text-align: left +} + +.algolia-autocomplete .algolia-docsearch-suggestion--content { + float: none; + width: auto; + padding: 0 +} + +.algolia-autocomplete .algolia-docsearch-suggestion--content::before { + display: none +} + +.algolia-autocomplete .ds-suggestion:not(:first-child) .algolia-docsearch-suggestion--category-header { + padding-top: .75rem; + margin-top: .75rem; + border-top: 1px solid rgba(0, 0, 0, .1) +} + +.algolia-autocomplete .ds-suggestion .algolia-docsearch-suggestion--subcategory-column { + display: block; + padding: .1rem 1rem; + margin-bottom: 0.1; + font-size: 1.0em; + font-weight: 400 + /* display: none */ +} + +.algolia-autocomplete .algolia-docsearch-suggestion--title { + display: block; + padding: .25rem 1rem; + margin-bottom: 0; + font-size: 0.9em; + font-weight: 400 +} + +.algolia-autocomplete .algolia-docsearch-suggestion--text { + padding: 0 1rem .5rem; + margin-top: -.25rem; + font-size: 0.8em; + font-weight: 400; + line-height: 1.25 +} + +.algolia-autocomplete .algolia-docsearch-footer { + width: 110px; + height: 20px; + z-index: 3; + margin-top: 10.66667px; + float: right; + font-size: 0; + line-height: 0; +} + +.algolia-autocomplete .algolia-docsearch-footer--logo { + background-image: url("data:image/svg+xml;utf8,"); + background-repeat: no-repeat; + background-position: 50%; + background-size: 100%; + overflow: hidden; + text-indent: -9000px; + width: 100%; + height: 100%; + display: block; + transform: translate(-8px); +} + +.algolia-autocomplete .algolia-docsearch-suggestion--highlight { + color: #FF8C00; + background: rgba(232, 189, 54, 0.1) +} + + +.algolia-autocomplete .algolia-docsearch-suggestion--text .algolia-docsearch-suggestion--highlight { + box-shadow: inset 0 -2px 0 0 rgba(105, 105, 105, .5) +} + +.algolia-autocomplete .ds-suggestion.ds-cursor .algolia-docsearch-suggestion--content { + background-color: rgba(192, 192, 192, .15) +} diff --git a/docs/docsearch.js b/docs/docsearch.js new file mode 100644 index 0000000..b35504c --- /dev/null +++ b/docs/docsearch.js @@ -0,0 +1,85 @@ +$(function() { + + // register a handler to move the focus to the search bar + // upon pressing shift + "/" (i.e. "?") + $(document).on('keydown', function(e) { + if (e.shiftKey && e.keyCode == 191) { + e.preventDefault(); + $("#search-input").focus(); + } + }); + + $(document).ready(function() { + // do keyword highlighting + /* modified from https://jsfiddle.net/julmot/bL6bb5oo/ */ + var mark = function() { + + var referrer = document.URL ; + var paramKey = "q" ; + + if (referrer.indexOf("?") !== -1) { + var qs = referrer.substr(referrer.indexOf('?') + 1); + var qs_noanchor = qs.split('#')[0]; + var qsa = qs_noanchor.split('&'); + var keyword = ""; + + for (var i = 0; i < qsa.length; i++) { + var currentParam = qsa[i].split('='); + + if (currentParam.length !== 2) { + continue; + } + + if (currentParam[0] == paramKey) { + keyword = decodeURIComponent(currentParam[1].replace(/\+/g, "%20")); + } + } + + if (keyword !== "") { + $(".contents").unmark({ + done: function() { + $(".contents").mark(keyword); + } + }); + } + } + }; + + mark(); + }); +}); + +/* Search term highlighting ------------------------------*/ + +function matchedWords(hit) { + var words = []; + + var hierarchy = hit._highlightResult.hierarchy; + // loop to fetch from lvl0, lvl1, etc. 
+ for (var idx in hierarchy) { + words = words.concat(hierarchy[idx].matchedWords); + } + + var content = hit._highlightResult.content; + if (content) { + words = words.concat(content.matchedWords); + } + + // return unique words + var words_uniq = [...new Set(words)]; + return words_uniq; +} + +function updateHitURL(hit) { + + var words = matchedWords(hit); + var url = ""; + + if (hit.anchor) { + url = hit.url_without_anchor + '?q=' + escape(words.join(" ")) + '#' + hit.anchor; + } else { + url = hit.url + '?q=' + escape(words.join(" ")); + } + + return url; +} diff --git a/docs/index.html b/docs/index.html new file mode 100644 index 0000000..020d123 --- /dev/null +++ b/docs/index.html @@ -0,0 +1,234 @@ + + + + + + + +Deep learning function for patient level prediction using data in the OMOP Common Data Model • DeepPatientLevelPrediction + + + + + + + + + + + + +
    +
    + + + + +
    +
    + +
    + + +
    +
    +

    Introduction +

    +

    DeepPatientLevelPrediction is an R package for building and validating deep learning patient-level predictive models using data in the OMOP Common Data Model format and OHDSI PatientLevelPrediction framework.

    +

    Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data. J Am Med Inform Assoc. 2018;25(8):969-975.

    +
    +
    +

    Features +

    +
      +
    • Adds deep learning models to use in the OHDSI PatientLevelPrediction framework.
    • +
    • Allows you to add custom deep learning models.
    • +
    • Includes an MLP, ResNet and a Transformer
    • +
    • Allows you to use all the features of PatientLevelPrediction to validate and explore your model performance.
    • +
    +
    +
    +

    Technology +

    +

    DeepPatientLevelPrediction is an R package. It uses torch in R to build deep learning models without using python.

    +
    +
    +

    System Requirements +

    +

    Requires R (version 3.5.0 or higher). Installation on Windows requires RTools. In most cases, training deep learning models requires an NVIDIA GPU on either Windows or Linux.

    +
    +
    +

    Getting Started +

    + +
    +
    +

    User Documentation +

    +

    Documentation can be found on the package website.

    +

    PDF versions of the documentation are also available, as mentioned above.

    +
    +
    +

    Support +

    + +
    +
    +

    Contributing +

    +

    Read here how you can contribute to this package.

    +
    +
    +

    License +

    +

    DeepPatientLevelPrediction is licensed under Apache License 2.0

    +
    +
    +

    Development +

    +

    DeepPatientLevelPrediction is being developed in RStudio.

    +
    + +
    + + +
    + + +
    + +
    +

    +

    Site built with pkgdown 2.0.5.

    +
    + +
    +
    + + + + + + + + diff --git a/docs/link.svg b/docs/link.svg new file mode 100644 index 0000000..88ad827 --- /dev/null +++ b/docs/link.svg @@ -0,0 +1,12 @@ + + + + + + diff --git a/docs/pkgdown.css b/docs/pkgdown.css new file mode 100644 index 0000000..80ea5b8 --- /dev/null +++ b/docs/pkgdown.css @@ -0,0 +1,384 @@ +/* Sticky footer */ + +/** + * Basic idea: https://philipwalton.github.io/solved-by-flexbox/demos/sticky-footer/ + * Details: https://github.com/philipwalton/solved-by-flexbox/blob/master/assets/css/components/site.css + * + * .Site -> body > .container + * .Site-content -> body > .container .row + * .footer -> footer + * + * Key idea seems to be to ensure that .container and __all its parents__ + * have height set to 100% + * + */ + +html, body { + height: 100%; +} + +body { + position: relative; +} + +body > .container { + display: flex; + height: 100%; + flex-direction: column; +} + +body > .container .row { + flex: 1 0 auto; +} + +footer { + margin-top: 45px; + padding: 35px 0 36px; + border-top: 1px solid #e5e5e5; + color: #666; + display: flex; + flex-shrink: 0; +} +footer p { + margin-bottom: 0; +} +footer div { + flex: 1; +} +footer .pkgdown { + text-align: right; +} +footer p { + margin-bottom: 0; +} + +img.icon { + float: right; +} + +/* Ensure in-page images don't run outside their container */ +.contents img { + max-width: 100%; + height: auto; +} + +/* Fix bug in bootstrap (only seen in firefox) */ +summary { + display: list-item; +} + +/* Typographic tweaking ---------------------------------*/ + +.contents .page-header { + margin-top: calc(-60px + 1em); +} + +dd { + margin-left: 3em; +} + +/* Section anchors ---------------------------------*/ + +a.anchor { + display: none; + margin-left: 5px; + width: 20px; + height: 20px; + + background-image: url(./link.svg); + background-repeat: no-repeat; + background-size: 20px 20px; + background-position: center center; +} + +h1:hover .anchor, +h2:hover .anchor, +h3:hover .anchor, +h4:hover .anchor, +h5:hover .anchor, +h6:hover .anchor { + display: inline-block; +} + +/* Fixes for fixed navbar --------------------------*/ + +.contents h1, .contents h2, .contents h3, .contents h4 { + padding-top: 60px; + margin-top: -40px; +} + +/* Navbar submenu --------------------------*/ + +.dropdown-submenu { + position: relative; +} + +.dropdown-submenu>.dropdown-menu { + top: 0; + left: 100%; + margin-top: -6px; + margin-left: -1px; + border-radius: 0 6px 6px 6px; +} + +.dropdown-submenu:hover>.dropdown-menu { + display: block; +} + +.dropdown-submenu>a:after { + display: block; + content: " "; + float: right; + width: 0; + height: 0; + border-color: transparent; + border-style: solid; + border-width: 5px 0 5px 5px; + border-left-color: #cccccc; + margin-top: 5px; + margin-right: -10px; +} + +.dropdown-submenu:hover>a:after { + border-left-color: #ffffff; +} + +.dropdown-submenu.pull-left { + float: none; +} + +.dropdown-submenu.pull-left>.dropdown-menu { + left: -100%; + margin-left: 10px; + border-radius: 6px 0 6px 6px; +} + +/* Sidebar --------------------------*/ + +#pkgdown-sidebar { + margin-top: 30px; + position: -webkit-sticky; + position: sticky; + top: 70px; +} + +#pkgdown-sidebar h2 { + font-size: 1.5em; + margin-top: 1em; +} + +#pkgdown-sidebar h2:first-child { + margin-top: 0; +} + +#pkgdown-sidebar .list-unstyled li { + margin-bottom: 0.5em; +} + +/* bootstrap-toc tweaks ------------------------------------------------------*/ + +/* All levels of nav */ + +nav[data-toggle='toc'] .nav > li > a { + padding: 4px 20px 4px 
6px; + font-size: 1.5rem; + font-weight: 400; + color: inherit; +} + +nav[data-toggle='toc'] .nav > li > a:hover, +nav[data-toggle='toc'] .nav > li > a:focus { + padding-left: 5px; + color: inherit; + border-left: 1px solid #878787; +} + +nav[data-toggle='toc'] .nav > .active > a, +nav[data-toggle='toc'] .nav > .active:hover > a, +nav[data-toggle='toc'] .nav > .active:focus > a { + padding-left: 5px; + font-size: 1.5rem; + font-weight: 400; + color: inherit; + border-left: 2px solid #878787; +} + +/* Nav: second level (shown on .active) */ + +nav[data-toggle='toc'] .nav .nav { + display: none; /* Hide by default, but at >768px, show it */ + padding-bottom: 10px; +} + +nav[data-toggle='toc'] .nav .nav > li > a { + padding-left: 16px; + font-size: 1.35rem; +} + +nav[data-toggle='toc'] .nav .nav > li > a:hover, +nav[data-toggle='toc'] .nav .nav > li > a:focus { + padding-left: 15px; +} + +nav[data-toggle='toc'] .nav .nav > .active > a, +nav[data-toggle='toc'] .nav .nav > .active:hover > a, +nav[data-toggle='toc'] .nav .nav > .active:focus > a { + padding-left: 15px; + font-weight: 500; + font-size: 1.35rem; +} + +/* orcid ------------------------------------------------------------------- */ + +.orcid { + font-size: 16px; + color: #A6CE39; + /* margins are required by official ORCID trademark and display guidelines */ + margin-left:4px; + margin-right:4px; + vertical-align: middle; +} + +/* Reference index & topics ----------------------------------------------- */ + +.ref-index th {font-weight: normal;} + +.ref-index td {vertical-align: top; min-width: 100px} +.ref-index .icon {width: 40px;} +.ref-index .alias {width: 40%;} +.ref-index-icons .alias {width: calc(40% - 40px);} +.ref-index .title {width: 60%;} + +.ref-arguments th {text-align: right; padding-right: 10px;} +.ref-arguments th, .ref-arguments td {vertical-align: top; min-width: 100px} +.ref-arguments .name {width: 20%;} +.ref-arguments .desc {width: 80%;} + +/* Nice scrolling for wide elements --------------------------------------- */ + +table { + display: block; + overflow: auto; +} + +/* Syntax highlighting ---------------------------------------------------- */ + +pre, code, pre code { + background-color: #f8f8f8; + color: #333; +} +pre, pre code { + white-space: pre-wrap; + word-break: break-all; + overflow-wrap: break-word; +} + +pre { + border: 1px solid #eee; +} + +pre .img, pre .r-plt { + margin: 5px 0; +} + +pre .img img, pre .r-plt img { + background-color: #fff; +} + +code a, pre a { + color: #375f84; +} + +a.sourceLine:hover { + text-decoration: none; +} + +.fl {color: #1514b5;} +.fu {color: #000000;} /* function */ +.ch,.st {color: #036a07;} /* string */ +.kw {color: #264D66;} /* keyword */ +.co {color: #888888;} /* comment */ + +.error {font-weight: bolder;} +.warning {font-weight: bolder;} + +/* Clipboard --------------------------*/ + +.hasCopyButton { + position: relative; +} + +.btn-copy-ex { + position: absolute; + right: 0; + top: 0; + visibility: hidden; +} + +.hasCopyButton:hover button.btn-copy-ex { + visibility: visible; +} + +/* headroom.js ------------------------ */ + +.headroom { + will-change: transform; + transition: transform 200ms linear; +} +.headroom--pinned { + transform: translateY(0%); +} +.headroom--unpinned { + transform: translateY(-100%); +} + +/* mark.js ----------------------------*/ + +mark { + background-color: rgba(255, 255, 51, 0.5); + border-bottom: 2px solid rgba(255, 153, 51, 0.3); + padding: 1px; +} + +/* vertical spacing after htmlwidgets */ +.html-widget { + margin-bottom: 
10px; +} + +/* fontawesome ------------------------ */ + +.fab { + font-family: "Font Awesome 5 Brands" !important; +} + +/* don't display links in code chunks when printing */ +/* source: https://stackoverflow.com/a/10781533 */ +@media print { + code a:link:after, code a:visited:after { + content: ""; + } +} + +/* Section anchors --------------------------------- + Added in pandoc 2.11: https://github.com/jgm/pandoc-templates/commit/9904bf71 +*/ + +div.csl-bib-body { } +div.csl-entry { + clear: both; +} +.hanging-indent div.csl-entry { + margin-left:2em; + text-indent:-2em; +} +div.csl-left-margin { + min-width:2em; + float:left; +} +div.csl-right-inline { + margin-left:2em; + padding-left:1em; +} +div.csl-indent { + margin-left: 2em; +} diff --git a/docs/pkgdown.js b/docs/pkgdown.js new file mode 100644 index 0000000..6f0eee4 --- /dev/null +++ b/docs/pkgdown.js @@ -0,0 +1,108 @@ +/* http://gregfranko.com/blog/jquery-best-practices/ */ +(function($) { + $(function() { + + $('.navbar-fixed-top').headroom(); + + $('body').css('padding-top', $('.navbar').height() + 10); + $(window).resize(function(){ + $('body').css('padding-top', $('.navbar').height() + 10); + }); + + $('[data-toggle="tooltip"]').tooltip(); + + var cur_path = paths(location.pathname); + var links = $("#navbar ul li a"); + var max_length = -1; + var pos = -1; + for (var i = 0; i < links.length; i++) { + if (links[i].getAttribute("href") === "#") + continue; + // Ignore external links + if (links[i].host !== location.host) + continue; + + var nav_path = paths(links[i].pathname); + + var length = prefix_length(nav_path, cur_path); + if (length > max_length) { + max_length = length; + pos = i; + } + } + + // Add class to parent
  • , and enclosing
  • if in dropdown + if (pos >= 0) { + var menu_anchor = $(links[pos]); + menu_anchor.parent().addClass("active"); + menu_anchor.closest("li.dropdown").addClass("active"); + } + }); + + function paths(pathname) { + var pieces = pathname.split("/"); + pieces.shift(); // always starts with / + + var end = pieces[pieces.length - 1]; + if (end === "index.html" || end === "") + pieces.pop(); + return(pieces); + } + + // Returns -1 if not found + function prefix_length(needle, haystack) { + if (needle.length > haystack.length) + return(-1); + + // Special case for length-0 haystack, since for loop won't run + if (haystack.length === 0) { + return(needle.length === 0 ? 0 : -1); + } + + for (var i = 0; i < haystack.length; i++) { + if (needle[i] != haystack[i]) + return(i); + } + + return(haystack.length); + } + + /* Clipboard --------------------------*/ + + function changeTooltipMessage(element, msg) { + var tooltipOriginalTitle=element.getAttribute('data-original-title'); + element.setAttribute('data-original-title', msg); + $(element).tooltip('show'); + element.setAttribute('data-original-title', tooltipOriginalTitle); + } + + if(ClipboardJS.isSupported()) { + $(document).ready(function() { + var copyButton = ""; + + $("div.sourceCode").addClass("hasCopyButton"); + + // Insert copy buttons: + $(copyButton).prependTo(".hasCopyButton"); + + // Initialize tooltips: + $('.btn-copy-ex').tooltip({container: 'body'}); + + // Initialize clipboard: + var clipboardBtnCopies = new ClipboardJS('[data-clipboard-copy]', { + text: function(trigger) { + return trigger.parentNode.textContent.replace(/\n#>[^\n]*/g, ""); + } + }); + + clipboardBtnCopies.on('success', function(e) { + changeTooltipMessage(e.trigger, 'Copied!'); + e.clearSelection(); + }); + + clipboardBtnCopies.on('error', function() { + changeTooltipMessage(e.trigger,'Press Ctrl+C or Command+C to copy'); + }); + }); + } +})(window.jQuery || window.$) diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml new file mode 100644 index 0000000..e9b1b73 --- /dev/null +++ b/docs/pkgdown.yml @@ -0,0 +1,8 @@ +pandoc: 2.14.0.3 +pkgdown: 2.0.5 +pkgdown_sha: ~ +articles: + BuildingDeepModels: BuildingDeepModels.html + Installing: Installing.html +last_built: 2022-08-09T16:38Z + diff --git a/docs/reference/Dataset.html b/docs/reference/Dataset.html new file mode 100644 index 0000000..cd4ccac --- /dev/null +++ b/docs/reference/Dataset.html @@ -0,0 +1,123 @@ + +A torch dataset — Dataset • DeepPatientLevelPrediction + + +
    +
    + + + +
    +
    + + +
    +

    A torch dataset

    +
    + +
    +
    Dataset(data, labels = NULL, numericalIndex = NULL, all = FALSE)
    +
    + +
    +

    Arguments

    +
    data
    +

    a dataframe like object with the covariates

    + + +
    labels
    +

    a dataframe with the labels

    + + +
    numericalIndex
    +

    which column the numeric data is in (if any)

    + + +
    all
    +

    if TRUE, returns all features instead of splitting them into numeric and categorical

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.5.

    +
    + +
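For orientation, a minimal sketch of how this constructor could be called; mappedCovariates and trainLabels are hypothetical stand-ins for data prepared by the package's fitting pipeline:

# Hypothetical usage sketch of the Dataset constructor
ds <- Dataset(
  data = mappedCovariates,   # dataframe-like object with the covariates
  labels = trainLabels,      # dataframe with the labels
  numericalIndex = NULL,     # let the function work out which columns are numeric
  all = FALSE                # split features into numeric and categorical parts
)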
    + + + + + + + + diff --git a/docs/reference/DeepPatientLevelPrediction.html b/docs/reference/DeepPatientLevelPrediction.html new file mode 100644 index 0000000..5e82fea --- /dev/null +++ b/docs/reference/DeepPatientLevelPrediction.html @@ -0,0 +1,102 @@ + +DeepPatientLevelPrediction — DeepPatientLevelPrediction • DeepPatientLevelPrediction + + +
    +
    + + + +
    +
    + + +
    +

    A package containing deep learning extensions for developing prediction models using data in the OMOP CDM

    +
    + + + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.5.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/EarlyStopping.html b/docs/reference/EarlyStopping.html new file mode 100644 index 0000000..049d2c9 --- /dev/null +++ b/docs/reference/EarlyStopping.html @@ -0,0 +1,175 @@ + +Earlystopping class — EarlyStopping • DeepPatientLevelPrediction + + +
    +
    + + + +
    +
    + + +
    +

    Stops training if a loss or metric has stopped improving

    +
    + + +
    +

    Methods

    + +


    +

    Method new()

    +

    Creates a new earlystopping object

    +

    Usage

    +

    EarlyStopping$new(patience = 3, delta = 0, verbose = TRUE)

    +
    + +
    +

    Arguments

    +

    patience
    +

    Stop after this number of epochs if loss doesn't improve

    + + +
    delta
    +

    How much the loss needs to improve to count as an improvement

    + + +
    verbose
    +
    + + +

    +
    +
    +

    Returns

    +

    a new earlystopping object

    +
    + +


    +

    Method call()

    +

    call the earlystopping object and increment a counter if loss is not +improving

    +

    Usage

    +

    EarlyStopping$call(metric)

    +
    + +
    +

    Arguments

    +

    metric
    +

    the current metric value

    + + +

    +
    + +


    +

    Method clone()

    +

    The objects of this class are cloneable with this method.

    +

    Usage

    +

    EarlyStopping$clone(deep = FALSE)

    +
    + +
    +

    Arguments

    +

    deep
    +

    Whether to make a deep clone.

    + + +

    +
    + +
    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.5.

    +
    + +
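A brief sketch of the intended use inside a validation loop; the loss values are made up and the name of the stopping flag is an assumption, not verified against the class source:

# Hypothetical sketch of EarlyStopping during training
earlyStopper <- EarlyStopping$new(patience = 3, delta = 0, verbose = TRUE)
for (valLoss in c(0.71, 0.69, 0.70, 0.70, 0.71)) {  # made-up validation losses
  earlyStopper$call(valLoss)
  if (isTRUE(earlyStopper$earlyStop)) {  # assumed name of the stop flag
    message("stopping early")
    break
  }
}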
    + + + + + + + + diff --git a/docs/reference/Estimator.html b/docs/reference/Estimator.html new file mode 100644 index 0000000..dc17110 --- /dev/null +++ b/docs/reference/Estimator.html @@ -0,0 +1,431 @@ + +Estimator — Estimator • DeepPatientLevelPrediction + + +
    +
    + + + +
    +
    + + +
    +

    A generic R6 class that wraps around a torch nn module and can be used to +fit and predict the model defined in that module.

    +
    + + +
    +

    Methods

    + +


    +

    Method new()

    +

    Creates a new estimator

    +

    Usage

    +

    Estimator$new(
    +  baseModel,
    +  modelParameters,
    +  fitParameters,
    +  optimizer = torch::optim_adam,
    +  criterion = torch::nn_bce_with_logits_loss,
    +  scheduler = torch::lr_reduce_on_plateau,
    +  device = "cpu",
    +  patience = 4
    +)

    +
    + +
    +

    Arguments

    +

    baseModel
    +

    The torch nn module to use as model

    + + +
    modelParameters
    +

    Parameters to initialize the baseModel

    + + +
    fitParameters
    +

    Parameters required for the estimator fitting

    + + +
    optimizer
    +

    A torch optimizer to use, default is Adam

    + + +
    criterion
    +

    The torch loss function to use, defaults to +binary cross entropy with logits

    + + +
    scheduler
    +

    learning rate scheduler to use

    + + +
    device
    +

    Which device to use for fitting, default is cpu

    + + +
    patience
    +

    Patience to use for early stopping

    + + +

    +
    + +


    +

    Method fit()

    +

    fits the estimator

    +

    Usage

    +

    Estimator$fit(dataset, testDataset)

    +
    + +
    +

    Arguments

    +

    dataset
    +

    a torch dataset to use for model fitting

    + + +
    testDataset
    +

    a torch dataset to use for early stopping

    + + +

    +
    + +


    +

    Method fitEpoch()

    +

    fits estimator for one epoch (one round through the data)

    +

    Usage

    +

    Estimator$fitEpoch(dataset, batchIndex)

    +
    + +
    +

    Arguments

    +

    dataset
    +

    torch dataset to use for fitting

    + + +
    batchIndex
    +

    indices of batches

    + + +

    +
    + +


    +

    Method score()

    +

    calculates loss and auc after training for one epoch

    +

    Usage

    +

    Estimator$score(dataset, batchIndex)

    +
    + +
    +

    Arguments

    +

    dataset
    +

    The torch dataset to use to evaluate loss and auc

    + + +
    batchIndex
    +

    Indices of batches in the dataset

    + + +

    +
    +
    +

    Returns

    +

    list with average loss and auc in the dataset

    +
    + +


    +

    Method finishFit()

    +

    operations that run when fitting is finished

    +

    Usage

    +

    Estimator$finishFit(valAUCs, modelStateDict, valLosses, epoch, learnRates)

    +
    + +
    +

    Arguments

    +

    valAUCs
    +

    validation AUC values

    + + +
    modelStateDict
    +

    fitted model parameters

    + + +
    valLosses
    +

    validation losses

    + + +
    epoch
    +

    list of epochs fit

    + + +
    learnRates
    +

    learning rate sequence used so far

    + + +

    +
    + +


    +

    Method fitWholeTrainingSet()

    +

    Fits the whole training set for a specific number of epochs. +TODO: what happens when the learning rate changes per epoch? +Ideally the learning rate strategy from cross-validation would be reused +and adjusted for the larger size, i.e. more iterations/updates.

    +

    Usage

    +

    Estimator$fitWholeTrainingSet(dataset, learnRates = NULL)

    +
    + +
    +

    Arguments

    +

    dataset
    +

    torch dataset

    + + +
    learnRates
    +

    learnRateSchedule from CV

    + + +

    +
    + +


    +

    Method save()

    +

    save model and those parameters needed to reconstruct it

    +

    Usage

    +

    Estimator$save(path, name)

    +
    + +
    +

    Arguments

    +

    path
    +

    where to save the model

    + + +
    name
    +

    name of file

    + + +

    +
    +
    +

    Returns

    +

    the path to saved model

    +
    + +


    +

    Method predictProba()

    +

    predicts and outputs the probabilities

    +

    Usage

    +

    Estimator$predictProba(dataset)

    +
    + +
    +

    Arguments

    +

    dataset
    +

    Torch dataset to create predictions for

    + + +

    +
    +
    +

    Returns

    +

    predictions as probabilities

    +
    + +


    +

    Method predict()

    +

    predicts and outputs the class

    +

    Usage

    +

    Estimator$predict(dataset, threshold = NULL)

    +
    + +
    +

    Arguments

    +

    dataset
    +

    A torch dataset to create predictions for

    + + +
    threshold
    +

    Which threshold to use for predictions

    + + +

    +
    +
    +

    Returns

    +

    The predicted class for the data in the dataset

    +
    + +


    +

    Method batchToDevice()

    +

    sends a batch of data to the device; +assumes the batch may include lists of tensors nested to arbitrary depth

    +

    Usage

    +

    Estimator$batchToDevice(batch)

    +
    + +
    +

    Arguments

    +

    batch
    +

    the batch to send, usually a list of torch tensors

    + + +

    +
    +
    +

    Returns

    +

    the batch on the required device

    +
    + +


    +

    Method itemOrDefaults()

    +

    selects an item from a list and, if it is NULL, returns a default

    +

    Usage

    +

    Estimator$itemOrDefaults(list, item, default = NULL)

    +
    + +
    +

    Arguments

    +

    list
    +

    A list with items

    + + +
    item
    +

    Which list item to retrieve

    + + +
    default
    +

    The value to return if list doesn't have item

    + + +

    +
    +
    +

    Returns

    +

    the list item or default

    +
    + +


    +

    Method clone()

    +

    The objects of this class are cloneable with this method.

    +

    Usage

    +

    Estimator$clone(deep = FALSE)

    +
    + +
    +

    Arguments

    +

    deep
    +

    Whether to make a deep clone.

    + + +

    +
    + +
    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.5.

    +
    + +
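For orientation, a hedged sketch of constructing and using an Estimator; myModule, the parameter lists, and the datasets are hypothetical placeholders:

# Hypothetical sketch; myModule stands in for a torch nn_module defined elsewhere
estimator <- Estimator$new(
  baseModel = myModule,
  modelParameters = list(sizeHidden = 64),            # passed to myModule when it is created
  fitParameters = list(epochs = 5, batchSize = 512),  # element names are assumptions
  optimizer = torch::optim_adam,
  device = "cpu"
)
estimator$fit(trainDataset, testDataset)   # both assumed to be torch datasets
probs <- estimator$predictProba(testDataset)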
    + + + + + + + + diff --git a/docs/reference/doubleLayerNN.html b/docs/reference/doubleLayerNN.html new file mode 100644 index 0000000..b3bc13f --- /dev/null +++ b/docs/reference/doubleLayerNN.html @@ -0,0 +1,127 @@ + +Double layer neural network — doubleLayerNN • DeepPatientLevelPrediction + + +
    +
    + + + +
    +
    + + +
    +

    Double layer neural network

    +
    + +
    +
    doubleLayerNN(inputN, layer1, layer2, outputN, layer_dropout)
    +
    + +
    +

    Arguments

    +
    inputN
    +

    Input neurons

    + + +
    layer1
    +

    Layer 1 neurons

    + + +
    layer2
    +

    Layer 2 neurons

    + + +
    outputN
    +

    output neurons

    + + +
    layer_dropout
    +

    layer_dropout to use

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.5.

    +
    + +
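A short hypothetical call for illustration, assuming 100 input features and a single output neuron:

# Hypothetical sketch of building a two-hidden-layer module
net <- doubleLayerNN(
  inputN = 100,        # input neurons
  layer1 = 64,         # layer 1 neurons
  layer2 = 32,         # layer 2 neurons
  outputN = 1,         # output neurons
  layer_dropout = 0.2  # dropout to use
)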
    + + + + + + + + diff --git a/docs/reference/fitDeepNNTorch.html b/docs/reference/fitDeepNNTorch.html new file mode 100644 index 0000000..fa8c2e3 --- /dev/null +++ b/docs/reference/fitDeepNNTorch.html @@ -0,0 +1,123 @@ + +Fits a deep neural network — fitDeepNNTorch • DeepPatientLevelPrediction + + +
    +
    + + + +
    +
    + + +
    +

    Fits a deep neural network

    +
    + +
    +
    fitDeepNNTorch(trainData, param, search = "grid", analysisId)
    +
    + +
    +

    Arguments

    +
    trainData
    +

    Training data object

    + + +
    param
    +

    Hyperparameters to search over

    + + +
    search
    +

    Which kind of search strategy to use

    + + +
    analysisId
    +

    Analysis Id

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.5.

    +
    + +
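This function is normally reached through PatientLevelPrediction::runPlp() via the model settings rather than called directly; a direct call might look roughly like the following, where trainData is hypothetical and the $param element is an assumption about how the settings object is structured:

# Hypothetical direct call; usually runPlp() does this for you
settings <- setDeepNNTorch(epochs = c(5))
fitted <- fitDeepNNTorch(
  trainData = trainData,    # a PLP training data object (hypothetical)
  param = settings$param,   # assumed location of the hyperparameter grid
  search = "grid",
  analysisId = "Analysis_1"
)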
    + + + + + + + + diff --git a/docs/reference/fitEstimator.html b/docs/reference/fitEstimator.html new file mode 100644 index 0000000..da8803b --- /dev/null +++ b/docs/reference/fitEstimator.html @@ -0,0 +1,123 @@ + +fitEstimator — fitEstimator • DeepPatientLevelPrediction + + +
    +
    + + + +
    +
    + + +
    +

    fits a deep learning estimator to data.

    +
    + +
    +
    fitEstimator(trainData, param, analysisId, ...)
    +
    + +
    +

    Arguments

    +
    trainData
    +

    the data to use

    + + +
    param
    +

    model parameters

    + + +
    analysisId
    +

    Id of the analysis

    + + +
    ...
    +

    Extra inputs

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.5.

    +
    + +
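Like fitDeepNNTorch(), this is normally invoked from runPlp(); a direct call would be along these lines, with all inputs hypothetical and the $param element again assumed:

# Hypothetical direct call to the estimator fitting routine
resNetSettings <- setResNet(randomSample = 1)
fitted <- fitEstimator(
  trainData = trainData,          # PLP training data (hypothetical)
  param = resNetSettings$param,   # assumed structure of the settings object
  analysisId = "Analysis_1"
)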
    + + + + + + + + diff --git a/docs/reference/gridCvDeep.html b/docs/reference/gridCvDeep.html new file mode 100644 index 0000000..9467482 --- /dev/null +++ b/docs/reference/gridCvDeep.html @@ -0,0 +1,127 @@ + +gridCvDeep — gridCvDeep • DeepPatientLevelPrediction + + +
    +
    + + + +
    +
    + + +
    +

    Performs grid search for a deep learning estimator

    +
    + +
    +
    gridCvDeep(mappedData, labels, settings, modelLocation, paramSearch)
    +
    + +
    +

    Arguments

    +
    mappedData
    +

    Mapped data with covariates

    + + +
    labels
    +

    Dataframe with the outcomes

    + + +
    settings
    +

    Settings of the model

    + + +
    modelLocation
    +

    Where to save the model

    + + +
    paramSearch
    +

    model parameters to perform search over

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.5.

    +
    + +
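A hedged sketch of what a call could look like; every input is a hypothetical placeholder for an object that fitEstimator() prepares internally:

# Hypothetical sketch; normally called from fitEstimator()
cvResult <- gridCvDeep(
  mappedData = mappedData,      # covariates mapped to the torch dataset format
  labels = trainLabels,         # dataframe with the outcomes
  settings = modelSettings,     # settings of the model
  modelLocation = file.path(tempdir(), "model"),
  paramSearch = paramGrid       # list of hyperparameter combinations to try
)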
    + + + + + + + + diff --git a/docs/reference/index.html b/docs/reference/index.html new file mode 100644 index 0000000..98aff88 --- /dev/null +++ b/docs/reference/index.html @@ -0,0 +1,159 @@ + +Function reference • DeepPatientLevelPrediction + + +
    +
    + + + +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +

    All functions

    +

    +
    +

    Dataset()

    +

    A torch dataset

    +

    DeepPatientLevelPrediction

    +

    DeepPatientLevelPrediction

    +

    EarlyStopping

    +

    Earlystopping class

    +

    Estimator

    +

    Estimator

    +

    doubleLayerNN()

    +

    Double layer neural network

    +

    fitDeepNNTorch()

    +

    Fits a deep neural network

    +

    fitEstimator()

    +

    fitEstimator

    +

    gridCvDeep()

    +

    gridCvDeep

    +

    predictDeepEstimator()

    +

    predictDeepEstimator

    +

    predictDeepNN()

    +

    Create predictions for a deep neural network

    +

    setDeepNNTorch()

    +

    settings for a Deep neural network

    +

    setResNet()

    +

    setResNet

    +

    setTransformer()

    +

    create settings for training a non-temporal transformer

    +

    singleLayerNN()

    +

    A single layer neural network

    +

    tripleLayerNN()

    +

    Triple layer neural network

    + + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.5.

    +
    + +
    + + + + + + + + diff --git a/docs/reference/predictDeepEstimator.html b/docs/reference/predictDeepEstimator.html new file mode 100644 index 0000000..3e9eb58 --- /dev/null +++ b/docs/reference/predictDeepEstimator.html @@ -0,0 +1,119 @@ + +predictDeepEstimator — predictDeepEstimator • DeepPatientLevelPrediction + + +
    +
    + + + +
    +
    + + +
    +

    the prediction function for the estimator

    +
    + +
    +
    predictDeepEstimator(plpModel, data, cohort)
    +
    + +
    +

    Arguments

    +
    plpModel
    +

    the plpModel

    + + +
    data
    +

    plp data object or a torch dataset

    + + +
    cohort
    +

    data.frame with the rowIds of the people

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.5.

    +
    + +
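A minimal sketch; plpModel would be a fitted model returned by the package and population a cohort data.frame with the people to predict for, both hypothetical here:

# Hypothetical prediction call
prediction <- predictDeepEstimator(
  plpModel = plpModel,   # fitted deep learning plpModel
  data = plpData,        # plp data object or a torch dataset
  cohort = population    # data.frame with the rowIds of the people
)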
    + + + + + + + + diff --git a/docs/reference/predictDeepNN.html b/docs/reference/predictDeepNN.html new file mode 100644 index 0000000..75341ec --- /dev/null +++ b/docs/reference/predictDeepNN.html @@ -0,0 +1,119 @@ + +Create predictions for a deep neural network — predictDeepNN • DeepPatientLevelPrediction + + +
    +
    + + + +
    +
    + + +
    +

    Create predictions for a deep neural network

    +
    + +
    +
    predictDeepNN(plpModel, data, cohort)
    +
    + +
    +

    Arguments

    +
    plpModel
    +

    The plpModel to predict for

    + + +
    data
    +

    The data to make predictions for

    + + +
    cohort
    +

    The cohort to use

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.5.

    +
    + +
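An analogous sketch for the deep neural network predictor, again with hypothetical inputs:

# Hypothetical prediction call for a model fitted with fitDeepNNTorch()
prediction <- predictDeepNN(
  plpModel = plpModel,   # the plpModel to predict for
  data = plpData,        # the data to make predictions for
  cohort = population    # the cohort to use
)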
    + + + + + + + + diff --git a/docs/reference/setDeepNNTorch.html b/docs/reference/setDeepNNTorch.html new file mode 100644 index 0000000..41b51de --- /dev/null +++ b/docs/reference/setDeepNNTorch.html @@ -0,0 +1,153 @@ + +settings for a Deep neural network — setDeepNNTorch • DeepPatientLevelPrediction + + +
    +
    + + + +
    +
    + + +
    +

    settings for a Deep neural network

    +
    + +
    +
    setDeepNNTorch(
    +  units = list(c(128, 64), 128),
    +  layer_dropout = c(0.2),
    +  lr = c(1e-04),
    +  decay = c(1e-05),
    +  outcome_weight = c(1),
    +  batch_size = c(10000),
    +  epochs = c(100),
    +  device = "cpu",
    +  seed = NULL
    +)
    +
    + +
    +

    Arguments

    +
    units
    +

    A list of vectors for neurons per layer

    + + +
    layer_dropout
    +

    Dropout to use per layer

    + + +
    lr
    +

    Learning rate to use

    + + +
    decay
    +

    Weight decay to use

    + + +
    outcome_weight
    +

    Weight for minority outcome in cost function

    + + +
    batch_size
    +

    Batch size to use

    + + +
    epochs
    +

    How many epochs to use

    + + +
    device
    +

    Which device to use

    + + +
    seed
    +

    A seed to make experiments more reproducible

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.5.

    +
    + +
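A small example of creating the settings object and handing it to PatientLevelPrediction; the runPlp() call is left commented out because plpData and the other study inputs are placeholders:

# Settings for a single small architecture: two layers of 64 and 32 neurons
deepSettings <- setDeepNNTorch(
  units = list(c(64, 32)),
  layer_dropout = c(0.2),
  lr = c(1e-4),
  batch_size = c(512),
  epochs = c(5),
  device = "cpu",
  seed = 42
)
# result <- PatientLevelPrediction::runPlp(plpData = plpData,
#                                          modelSettings = deepSettings, ...)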
    + + + + + + + + diff --git a/docs/reference/setResNet.html b/docs/reference/setResNet.html new file mode 100644 index 0000000..e183455 --- /dev/null +++ b/docs/reference/setResNet.html @@ -0,0 +1,192 @@ + +setResNet — setResNet • DeepPatientLevelPrediction + + +
    +
    + + + +
    +
    + + +
    +

    Creates settings for a ResNet model

    +
    + +
    +
    setResNet(
    +  numLayers = c(1:8),
    +  sizeHidden = c(2^(6:9)),
    +  hiddenFactor = c(1:4),
    +  residualDropout = c(seq(0, 0.5, 0.05)),
    +  hiddenDropout = c(seq(0, 0.5, 0.05)),
    +  normalization = c("BatchNorm"),
    +  activation = c("RelU"),
    +  sizeEmbedding = c(2^(6:9)),
    +  weightDecay = c(1e-06, 0.001),
    +  learningRate = c(0.01, 3e-04, 1e-05),
    +  seed = NULL,
    +  hyperParamSearch = "random",
    +  randomSample = 100,
    +  device = "cpu",
    +  batchSize = 1024,
    +  epochs = 30
    +)
    +
    + +
    +

    Arguments

    +
    numLayers
    +

    Number of layers in network, default: 1:16

    + + +
    sizeHidden
    +

    Number of neurons in each layer, default: 2^(6:10) (64 to 1024)

    + + +
    hiddenFactor
    +

    How much to grow the number of neurons in each ResLayer, default: 1:4

    + + +
    residualDropout
    +

    How much dropout to apply after last linear layer in ResLayer, default: seq(0, 0.3, 0.05)

    + + +
    hiddenDropout
    +

    How much dropout to apply after first linear layer in ResLayer, default: seq(0, 0.3, 0.05)

    + + +
    normalization
    +

    Which type of normalization to use. Default: 'Batchnorm'

    + + +
    activation
    +

    What kind of activation to use. Default: 'RelU'

    + + +
    sizeEmbedding
    +

    Size of embedding layer, default: 2^(6:9) (64 to 512)

    + + +
    weightDecay
    +

    Weight decay to apply, default: c(1e-6, 1e-3)

    + + +
    learningRate
    +

    Learning rate to use. default: c(1e-2, 1e-5)

    + + +
    seed
    +

    Seed to use for sampling hyperparameter space

    + + +
    hyperParamSearch
    +

    Which kind of hyperparameter search to use: random sampling or exhaustive grid search. Default: 'random'

    + + +
    randomSample
    +

    How many random samples from hyperparameter space to use

    + + +
    device
    +

    Which device to run analysis on, either 'cpu' or 'cuda', default: 'cpu'

    + + +
    batchSize
    +

    Size of batch, default: 1024

    + + +
    epochs
    +

    Number of epochs to run, default: 10

    + +
    +
    +

    Details

    +

    Model architecture from https://arxiv.org/abs/2106.11959

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.5.

    +
    + +
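The extras/example.R script later in this diff shows a full runPlp() call with these settings; condensed, a single fixed ResNet configuration can be requested like this:

resNetSettings <- setResNet(
  numLayers = 2,
  sizeHidden = 64,
  hiddenFactor = 1,
  residualDropout = 0,
  hiddenDropout = 0.2,
  sizeEmbedding = 64,
  weightDecay = 1e-6,
  learningRate = 3e-4,
  hyperParamSearch = "random",
  randomSample = 1,   # a single combination, so effectively no search
  device = "cpu",
  batchSize = 1024,
  epochs = 10,
  seed = 42
)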
    + + + + + + + + diff --git a/docs/reference/setTransformer.html b/docs/reference/setTransformer.html new file mode 100644 index 0000000..fb2503d --- /dev/null +++ b/docs/reference/setTransformer.html @@ -0,0 +1,192 @@ + +create settings for training a non-temporal transformer — setTransformer • DeepPatientLevelPrediction + + +
    +
    + + + +
    +
    + + +
    +

    A transformer model

    +
    + +
    +
    setTransformer(
    +  numBlocks = 3,
    +  dimToken = 96,
    +  dimOut = 1,
    +  numHeads = 8,
    +  attDropout = 0.25,
    +  ffnDropout = 0.25,
    +  resDropout = 0,
    +  dimHidden = 512,
    +  weightDecay = 1e-06,
    +  learningRate = 3e-04,
    +  batchSize = 1024,
    +  epochs = 10,
    +  device = "cpu",
    +  hyperParamSearch = "random",
    +  randomSamples = 100,
    +  seed = NULL
    +)
    +
    + +
    +

    Arguments

    +
    numBlocks
    +

    number of transformer blocks

    + + +
    dimToken
    +

    dimension of each token (embedding size)

    + + +
    dimOut
    +

    dimension of output, usually 1 for binary problems

    + + +
    numHeads
    +

    number of attention heads

    + + +
    attDropout
    +

    dropout to use on attentions

    + + +
    ffnDropout
    +

    dropout to use in feedforward block

    + + +
    resDropout
    +

    dropout to use in residual connections

    + + +
    dimHidden
    +

    dimension of the feedforward block

    + + +
    weightDecay
    +

    weight decay to use

    + + +
    learningRate
    +

    learning rate to use

    + + +
    batchSize
    +

    batchSize to use

    + + +
    epochs
    +

    How many epochs to run the model for

    + + +
    device
    +

    Which device to use, cpu or cuda

    + + +
    hyperParamSearch
    +

    what kind of hyperparameter search to do, default 'random'

    + + +
    randomSamples
    +

    How many samples to use in hyperparameter search if random

    + + +
    seed
    +

    Random seed to use

    + +
    +
    +

    Details

    +

    from https://arxiv.org/abs/2106.11959

    +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.5.

    +
    + +
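A condensed sketch of a single transformer configuration, loosely mirroring the commented-out call in extras/example.R later in this diff; the sizes are illustrative only:

transformerSettings <- setTransformer(
  numBlocks = 1,
  dimToken = 32,       # token/embedding dimension (kept divisible by numHeads)
  numHeads = 4,
  attDropout = 0.2,
  ffnDropout = 0.2,
  dimHidden = 64,
  batchSize = 512,
  epochs = 10,
  device = "cpu",
  randomSamples = 1,
  seed = 42
)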
    + + + + + + + + diff --git a/docs/reference/singleLayerNN.html b/docs/reference/singleLayerNN.html new file mode 100644 index 0000000..40fb76e --- /dev/null +++ b/docs/reference/singleLayerNN.html @@ -0,0 +1,123 @@ + +A single layer neural network — singleLayerNN • DeepPatientLevelPrediction + + +
    +
    + + + +
    +
    + + +
    +

    A single layer neural network

    +
    + +
    +
    singleLayerNN(inputN, layer1, outputN = 2, layer_dropout)
    +
    + +
    +

    Arguments

    +
    inputN
    +

    Input neurons

    + + +
    layer1
    +

    Layer 1 neurons

    + + +
    outputN
    +

    Output neurons

    + + +
    layer_dropout
    +

    Layer dropout to use

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.5.

    +
    + +
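A hypothetical call, assuming 100 input features and the default two output neurons:

net <- singleLayerNN(
  inputN = 100,        # input neurons
  layer1 = 32,         # layer 1 neurons
  outputN = 2,         # output neurons (the default)
  layer_dropout = 0.1  # layer dropout to use
)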
    + + + + + + + + diff --git a/docs/reference/tripleLayerNN.html b/docs/reference/tripleLayerNN.html new file mode 100644 index 0000000..b5e0969 --- /dev/null +++ b/docs/reference/tripleLayerNN.html @@ -0,0 +1,131 @@ + +Triple layer neural network — tripleLayerNN • DeepPatientLevelPrediction + + +
    +
    + + + +
    +
    + + +
    +

    Triple layer neural network

    +
    + +
    +
    tripleLayerNN(inputN, layer1, layer2, layer3, outputN, layer_dropout)
    +
    + +
    +

    Arguments

    +
    inputN
    +

    Input neurons

    + + +
    layer1
    +

    amount of layer 1 neurons

    + + +
    layer2
    +

    amount of layer 2 neurons

    + + +
    layer3
    +

    amount of layer 3 neurons

    + + +
    outputN
    +

    Number of output neurons

    + + +
    layer_dropout
    +

    The dropout to use in the layers

    + +
    + +
    + +
    + + +
    + +
    +

    Site built with pkgdown 2.0.5.

    +
    + +
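And the three-hidden-layer variant, again with hypothetical sizes:

net <- tripleLayerNN(
  inputN = 100,
  layer1 = 128,
  layer2 = 64,
  layer3 = 32,
  outputN = 1,
  layer_dropout = 0.3
)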
    + + + + + + + + diff --git a/docs/sitemap.xml b/docs/sitemap.xml new file mode 100644 index 0000000..1e9115f --- /dev/null +++ b/docs/sitemap.xml @@ -0,0 +1,69 @@ + + + + /404.html + + + /articles/BuildingDeepModels.html + + + /articles/Installing.html + + + /articles/index.html + + + /authors.html + + + /index.html + + + /reference/Dataset.html + + + /reference/DeepPatientLevelPrediction.html + + + /reference/EarlyStopping.html + + + /reference/Estimator.html + + + /reference/doubleLayerNN.html + + + /reference/fitDeepNNTorch.html + + + /reference/fitEstimator.html + + + /reference/gridCvDeep.html + + + /reference/index.html + + + /reference/predictDeepEstimator.html + + + /reference/predictDeepNN.html + + + /reference/setDeepNNTorch.html + + + /reference/setResNet.html + + + /reference/setTransformer.html + + + /reference/singleLayerNN.html + + + /reference/tripleLayerNN.html + + diff --git a/extras/DeepPatientLevelPrediction.pdf b/extras/DeepPatientLevelPrediction.pdf new file mode 100644 index 0000000..d7bde43 Binary files /dev/null and b/extras/DeepPatientLevelPrediction.pdf differ diff --git a/extras/PackageMaintenance.R b/extras/PackageMaintenance.R new file mode 100644 index 0000000..0b86eda --- /dev/null +++ b/extras/PackageMaintenance.R @@ -0,0 +1,44 @@ +# @file PackageMaintenance +# +# Copyright 2022 Observational Health Data Sciences and Informatics +# +# This file is part of DeepPatientLevelPrediction +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Format and check code +styler::style_pkg() +OhdsiRTools::checkUsagePackage("DeepPatientLevelPrediction") +OhdsiRTools::updateCopyrightYearFolder() +devtools::spell_check() + + +# Create manual and vignettes +unlink("extras/DeepPatientLevelPrediction.pdf") +system("R CMD Rd2pdf ./ --output=extras/DeepPatientLevelPrediction.pdf") + +rmarkdown::render("vignettes/BuildingDeepModels.Rmd", + output_file = "../inst/doc/BuildingDeepModels.pdf", + rmarkdown::pdf_document(latex_engine = "pdflatex", + toc = TRUE, + toc_depth = 3, + number_sections = TRUE)) + + +rmarkdown::render("vignettes/Installing.Rmd", + output_file = "../inst/doc/Installing.pdf", + rmarkdown::pdf_document(latex_engine = "pdflatex", + toc = TRUE, + toc_depth = 3, + number_sections = TRUE)) + diff --git a/extras/example.R b/extras/example.R new file mode 100644 index 0000000..fa989b1 --- /dev/null +++ b/extras/example.R @@ -0,0 +1,54 @@ +# testing code (requires sequential branch of FeatureExtraction): +# rm(list = ls()) +library(PatientLevelPrediction) +library(DeepPatientLevelPrediction) + +data(plpDataSimulationProfile) +sampleSize <- 1e4 +plpData <- simulatePlpData( + plpDataSimulationProfile, + n = sampleSize +) + +populationSet <- PatientLevelPrediction::createStudyPopulationSettings( + requireTimeAtRisk = F, + riskWindowStart = 1, + riskWindowEnd = 365) + + +modelSettings <- setResNet(numLayers = 2, sizeHidden = 64, hiddenFactor = 1, + residualDropout = 0, hiddenDropout = 0.2, normalization = 'BatchNorm', + activation = 'RelU', sizeEmbedding = 512, weightDecay = 1e-6, + learningRate = 3e-4, seed = 42, hyperParamSearch = 'random', + randomSample = 1, device = 'cuda:0',batchSize = 32,epochs = 10) + +# modelSettings <- setTransformer(numBlocks=1, dimToken = 33, dimOut = 1, numHeads = 3, +# attDropout = 0.2, ffnDropout = 0.2, resDropout = 0, +# dimHidden = 8, batchSize = 32, hyperParamSearch = 'random', +# weightDecay = 1e-6, learningRate = 3e-4, epochs = 10, +# device = 'cuda:0', randomSamples = 1, seed = 42) + +res2 <- PatientLevelPrediction::runPlp( +plpData = plpData, +outcomeId = 3, +modelSettings = modelSettings, +analysisId = 'Test', +analysisName = 'Testing DeepPlp', +populationSettings = populationSet, +splitSettings = createDefaultSplitSetting(), +sampleSettings = createSampleSettings(), # none +featureEngineeringSettings = createFeatureEngineeringSettings(), # none +preprocessSettings = createPreprocessSettings(), +logSettings = createLogSettings(verbosity='TRACE'), +executeSettings = createExecuteSettings( + runSplitData = T, + runSampleData = F, + runfeatureEngineering = F, + runPreprocessData = T, + runModelDevelopment = T, + runCovariateSummary = T +), +saveDirectory = '~/test/new_plp/' +) + + diff --git a/extras/example_plp5.R b/extras/example_plp5.R new file mode 100644 index 0000000..0e65ecb --- /dev/null +++ b/extras/example_plp5.R @@ -0,0 +1,142 @@ +# testing code (requires sequential branch of FeatureExtraction): +rm(list = ls()) +library(FeatureExtraction) +library(PatientLevelPrediction) +library(DeepPatientLevelPrediction) +connectionDetails <- Eunomia::getEunomiaConnectionDetails() +Eunomia::createCohorts(connectionDetails) + +temp <- F + +covSet <- createCovariateSettings(useDemographicsGender = T, + useDemographicsAge = T, + useDemographicsRace = T, + useDemographicsEthnicity = T, + useDemographicsAgeGroup = T, + useConditionGroupEraLongTerm = T, + useDrugEraStartLongTerm = T, + endDays = -1 +) + +if(temp){ +covSetT <- createTemporalSequenceCovariateSettings(useDemographicsGender = T, + 
useDemographicsAge = T, + useDemographicsRace = T, + useDemographicsEthnicity = T, + useDemographicsAgeGroup = T, + useConditionEraGroupStart = T, + useDrugEraStart = T, + timePart = 'month', + timeInterval = 1, + sequenceEndDay = -1, + sequenceStartDay = -365*5) +} + + +databaseDetails <- PatientLevelPrediction::createDatabaseDetails( + connectionDetails = connectionDetails, + cdmDatabaseSchema = "main", + cohortDatabaseSchema = "main", + cohortTable = "cohort", + cohortId = 4, + outcomeIds = 3, + outcomeDatabaseSchema = "main", + outcomeTable = "cohort", + cdmDatabaseName = 'eunomia' +) + +restrictPlpDataSettings <- PatientLevelPrediction::createRestrictPlpDataSettings( + firstExposureOnly = T, + washoutPeriod = 365 +) + +plpData <- PatientLevelPrediction::getPlpData( + databaseDetails = databaseDetails, + restrictPlpDataSettings = restrictPlpDataSettings, + covariateSettings = covSet +) + +if(temp){ + plpDataT <- PatientLevelPrediction::getPlpData( + databaseDetails = databaseDetails, + restrictPlpDataSettings = restrictPlpDataSettings, + covariateSettings = covSetT + ) +} + + +populationSet <- PatientLevelPrediction::createStudyPopulationSettings( + requireTimeAtRisk = F, + riskWindowStart = 1, + riskWindowEnd = 365 + ) + +# code to train models +deepset <- setDeepNNTorch(units=list(c(128, 64), 128), layer_dropout=c(0.2), + lr =c(1e-4), decay=c(1e-5), outcome_weight = c(1.0), batch_size = c(100), + epochs= c(5), seed=NULL ) + + +#debug(fitDeepNNTorch) +# res <- runPlp(population = population, +# plpData = plpData, +# nfold = 3, +# modelSettings = deepset, +# savePlpData = F, +# savePlpResult = F, +# savePlpPlots = F, +# saveEvaluation = F) +# + + +resSet <- setResNet_plp5( + numLayers = list(5), + sizeHidden = list(256), + hiddenFactor = list(2), + residualDropout = list(0.1), + hiddenDropout = list(0.1), + normalization = list('BatchNorm'), + activation = list('RelU'), + sizeEmbedding = list(64), + weightDecay = list(1e-6), + learningRate = list(3e-4), + seed = 42, + hyperParamSearch = 'random', + randomSample = 1, + #device='cuda:0', + batchSize = 128, + epochs = 10 + ) + + +res2 <- runPlp( + plpData = plpData, + outcomeId = 3, + modelSettings = resSet, + analysisId = 'ResNet', + analysisName = 'Testing Deep Learning', + populationSettings = populationSet, + splitSettings = PatientLevelPrediction::createDefaultSplitSetting(), + sampleSettings = PatientLevelPrediction::createSampleSettings(), # none + featureEngineeringSettings = PatientLevelPrediction::createFeatureEngineeringSettings(), # none + preprocessSettings = PatientLevelPrediction::createPreprocessSettings(), + executeSettings = PatientLevelPrediction::createExecuteSettings( + runSplitData = T, + runSampleData = F, + runfeatureEngineering = F, + runPreprocessData = T, + runModelDevelopment = T, + runCovariateSummary = F + ), + saveDirectory = 'D:/testing/Deep' + ) + + +##predict.customLibrary(libraryName, predictionFunction, inputList){ +## libraryName <- 'PatientLevelPrediction' +## predictionFunction <- "createStudyPopulation" +## predictFun <- get(predictionFunction, envir = rlang::search_envs()[grep(paste0('package:', libraryName), search())][[1]]) +## +## prediction <- do.call(predictFun, inputList) +## return(prediction) +##} diff --git a/inst/python/TorchMap.py b/inst/python/TorchMap.py deleted file mode 100644 index ef8feac..0000000 --- a/inst/python/TorchMap.py +++ /dev/null @@ -1,19 +0,0 @@ -import torch - -def map_python_initiate(maxCol,maxRow, maxT=None): - if maxT != None: - matrix = 
torch.sparse.FloatTensor(torch.LongTensor([[0,0],[0,1],[0,0]]), torch.FloatTensor([0.,0.]), torch.Size([maxRow,maxCol,maxT])) - else: - matrix = torch.sparse.FloatTensor(torch.LongTensor([[0,0],[0,1]]), torch.FloatTensor([0.,0.]), torch.Size([maxRow,maxCol])) - return matrix - -def map_python(matrix, datas, maxCol,maxRow, maxT=None): - if maxT != None: - indexes= datas[:,0:3]-1 - matrixt = torch.sparse.FloatTensor(torch.LongTensor(indexes.T), torch.FloatTensor(datas[:,3]), torch.Size([maxRow,maxCol, maxT])) - matrix = matrix.add(matrixt) - else: - indexes= datas[:,0:2]-1 - matrixt = torch.sparse.FloatTensor(torch.LongTensor(indexes.T), torch.FloatTensor(datas[:,2]), torch.Size([maxRow,maxCol])) - matrix = matrix.add(matrixt) - return matrix diff --git a/inst/python/TorchUtils.py b/inst/python/TorchUtils.py deleted file mode 100755 index ccb21b9..0000000 --- a/inst/python/TorchUtils.py +++ /dev/null @@ -1,695 +0,0 @@ -""" - deepUtils.py - - Copyright 2016 Observational Health Data Sciences and Informatics - - This file is part of PatientLevelPrediction - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -""" - -import sys -import os -#import _pickle as cPickle -import pdb -import random -import numpy as np -import torch -import torch.nn as nn -import torch.nn.functional as F -from torch.autograd import Variable -from torch.utils.data import DataLoader, TensorDataset -from sklearn.externals import joblib -from sklearn.metrics import roc_auc_score -from collections import OrderedDict -output_dir = 'data' - - -class FocalLoss(nn.Module): - """ - Method to handle data imbalance based on paper (arXiv:1708.02002) entitled - Focal loss for dense object detection. - Loss(x, class) = - (1-softmax(x)[class])^gamma \log(softmax(x)[class]) - - """ - - def __init__(self, gamma=5, eps=1e-7, size_average=False): - super(FocalLoss, self).__init__() - self.gamma = gamma - self.eps = eps - self.size_average = size_average - - def forward(self, input, target): - y = self.one_hot(target, input.size(-1)) - logit = F.softmax(input) - logit = logit.clamp(self.eps, 1. - self.eps) - - loss = -1 * y * torch.log(logit) # cross entropy - loss = loss * (1 - logit) ** self.gamma # focal loss - - if self.size_average: - loss = loss.mean() - else: - loss = loss.sum() - return loss - - def one_hot(self, index, classes): - """ - - :param index: is the labels - :param classes: number if classes - :return: - """ - size = index.size() + (classes,) - view = index.size() + (1,) - - mask = torch.Tensor(*size).fill_(0) - index = index.view(*view) - ones = 1. 
- - if isinstance(index, Variable): - ones = Variable(torch.Tensor(index.size()).fill_(1)) - mask = Variable(mask, volatile=index.volatile) - if torch.cuda.is_available(): - ones = ones.cuda() - mask = mask.cuda() - - return mask.scatter_(1, index, ones) - -def loss_function(recon_x, x, mu, logvar): - """Loss function for varational autoencoder VAE""" - BCE = F.binary_cross_entropy(recon_x, x, size_average=False) - - # 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2) - KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp()) - - return BCE + KLD - - -def mixup_data(x, y, alpha=1.0): - - '''Compute the mixup data. Return mixed inputs, pairs of targets, and lambda - Data Augmentation method based on paper (arXiv:1710.09412) entitled - mixup: Beyond empirical risk minimization. - ''' - if alpha > 0.: - lam = np.random.beta(alpha, alpha) - else: - lam = 1. - batch_size = x.size()[0] - if torch.cuda.is_available(): - index = torch.randperm(batch_size).cuda() - else: - index = torch.randperm(batch_size) - - mixed_x = lam * x + (1 - lam) * x[index,:] - y_a, y_b = y, y[index] - return mixed_x, y_a, y_b, lam - -def mixup_criterion(y_a, y_b, lam): - return lambda criterion, pred: lam * criterion(pred, y_a) + (1 - lam) * criterion(pred, y_b) - -def early_stop(metrics_hist, patience = 3): - if not np.all(np.isnan(metrics_hist)): - return np.nanargmin(metrics_hist) > len(metrics_hist) - patience - else: - #keep training if criterion results have all been nan so far - return False - - -class Estimator(object): - """ - It is used for training different deep models in the same interface. - """ - - def __init__(self, model): - self.model = model - - def compile(self, optimizer, loss): - self.optimizer = optimizer - self.loss_f = loss - - def _fit(self, train_loader, l1regularization=False, autoencoder=False, mixup=False, vae = False): - """ - train one epoch - - :param train_loader: The data loaded using DataLoader - :param l1regularization: default False - :return: the return fitted loss and accuracy - """ - loss_list = [] - acc_list = [] - for idx, (X, y) in enumerate(train_loader): - X_v = Variable(X) - y_v = Variable(y) - if torch.cuda.is_available(): - X_v = X_v.cuda() - y_v = y_v.cuda() - - if mixup: - X_v, y_v_a, y_v_b, lam = mixup_data(X_v, y_v) - X_v, y_v_a, y_v_b = Variable(X_v), Variable(y_v_a), Variable(y_v_b) - - # print 'GPU id', torch.cuda.current_device() - self.optimizer.zero_grad() - # the below comemnted lines are used for multiple GPU training - # if torch.cuda.device_count() > 1: - # net = torch.nn.DataParallel(self.model, device_ids = range(torch.cuda.device_count())) - # if cuda: - # net = net.cuda() - # y_pred = net(X_v) - if autoencoder: - if vae: - y_pred, mu, logvar = self.model(X_v) - loss = loss_function(y_pred, X_v, mu, logvar) - else: - y_pred = self.model(X_v) - loss = self.loss_f(y_pred, X_v) - else: - y_pred = self.model(X_v) - - loss = self.loss_f(y_pred, y_v) - - if mixup: - loss_func = mixup_criterion(y_v_a, y_v_b, lam) - loss = loss_func(self.loss_f, y_pred) - - if l1regularization: - l1_crit = nn.L1Loss(size_average=False) - reg_loss = 0 - for param in self.model.parameters(): - target = Variable(torch.from_numpy(np.zeros(param.size()).astype(np.float32))) - if torch.cuda.is_available(): - target = target.cuda() - reg_loss += l1_crit(param, target) - - factor = 0.0005 - loss += factor * reg_loss - - loss.backward() - self.optimizer.step() - loss_list.append(loss.item()) - if autoencoder: - acc_list.append(0) - else: - classes = torch.topk(y_pred, 
1)[1].data.cpu().numpy().flatten() - acc = self._accuracy(classes, y_v.data.cpu().numpy().flatten()) - acc_list.append(acc) - del loss - del y_pred - - return sum(loss_list) / len(loss_list), sum(acc_list) / len(acc_list) - - def fit(self, X, y, batch_size=32, nb_epoch=10, validation_data=(), l1regularization=False, autoencoder =False, vae = False): - train_set = TensorDataset(torch.from_numpy(X.astype(np.float32)), - torch.from_numpy(y.astype(np.float32)).long().view(-1)) - train_loader = DataLoader(dataset=train_set, batch_size=batch_size, shuffle=True) - self.model.train() - for t in range(nb_epoch): - loss, acc = self._fit(train_loader, l1regularization=l1regularization, autoencoder = autoencoder, vae = vae) - #print loss - val_log = '' - if validation_data and not autoencoder: - val_loss, auc = self.evaluate(validation_data[0], validation_data[1], batch_size) - - val_log = "- val_loss: %06.4f - auc: %6.4f" % (val_loss, auc) - print(val_log) - # print("Epoch %s/%s loss: %06.4f - acc: %06.4f %s" % (t, nb_epoch, loss, acc, val_log)) - - def evaluate(self, X, y, batch_size=32): - y_pred = self.predict(X) - y_v = Variable(torch.from_numpy(y).long(), requires_grad=False) - if torch.cuda.is_available(): - y_v = y_v.cuda() - loss = self.loss_f(y_pred, y_v) - predict = y_pred.data.cpu().numpy()[:, 1].flatten() - auc = roc_auc_score(y, predict) - return loss.item(), auc - - def _accuracy(self, y_pred, y): - return float(sum(y_pred == y)) / y.shape[0] - - def predict(self, X): - X = Variable(torch.from_numpy(X.astype(np.float32))) - if torch.cuda.is_available(): - X = X.cuda() - y_pred = self.model(X) - return y_pred - - def predict_proba(self, X): - self.model.eval() - return self.model.predict_proba(X) - -class EarlyStopping(object): # pylint: disable=R0902 - """ - Gives a criterion to stop training when a given metric is not - improving anymore - Args: - mode (str): One of `min`, `max`. In `min` mode, training will - be stopped when the quantity monitored has stopped - decreasing; in `max` mode it will be stopped when the - quantity monitored has stopped increasing. Default: 'min'. - patience (int): Number of epochs with no improvement after - which training is stopped. For example, if - `patience = 2`, then we will ignore the first 2 epochs - with no improvement, and will only stop learning after the - 3rd epoch if the loss still hasn't improved then. - Default: 10. - threshold (float): Threshold for measuring the new optimum, - to only focus on significant changes. Default: 1e-4. - threshold_mode (str): One of `rel`, `abs`. In `rel` mode, - dynamic_threshold = best * ( 1 + threshold ) in 'max' - mode or best * ( 1 - threshold ) in `min` mode. - In `abs` mode, dynamic_threshold = best + threshold in - `max` mode or best - threshold in `min` mode. Default: 'rel'. 
- """ - - def __init__(self, mode='min', patience=3, threshold=1e-4, threshold_mode='rel'): - self.patience = patience - self.mode = mode - self.threshold = threshold - self.threshold_mode = threshold_mode - self.best = None - self.num_bad_epochs = None - self.mode_worse = None # the worse value for the chosen mode - self.is_better = None - self.last_epoch = -1 - self._init_is_better(mode=mode, threshold=threshold, - threshold_mode=threshold_mode) - self._reset() - - def _reset(self): - """Resets num_bad_epochs counter and cooldown counter.""" - self.best = self.mode_worse - self.num_bad_epochs = 0 - - def step(self, metrics, epoch=None): - """ Updates early stopping state """ - current = metrics - if epoch is None: - epoch = self.last_epoch = self.last_epoch + 1 - self.last_epoch = epoch - - if self.is_better(current, self.best): - self.best = current - self.num_bad_epochs = 0 - else: - self.num_bad_epochs += 1 - - @property - def stop(self): - """ Should we stop learning? """ - return self.num_bad_epochs > self.patience - - def _cmp(self, mode, threshold_mode, threshold, a, best): # pylint: disable=R0913, R0201 - if mode == 'min' and threshold_mode == 'rel': - rel_epsilon = 1. - threshold - return a < best * rel_epsilon - - elif mode == 'min' and threshold_mode == 'abs': - return a < best - threshold - - elif mode == 'max' and threshold_mode == 'rel': - rel_epsilon = threshold + 1. - return a > best * rel_epsilon - - return a > best + threshold - - def _init_is_better(self, mode, threshold, threshold_mode): - if mode not in {'min', 'max'}: - raise ValueError('mode ' + mode + ' is unknown!') - if threshold_mode not in {'rel', 'abs'}: - raise ValueError('threshold mode ' + threshold_mode + ' is unknown!') - - if mode == 'min': - self.mode_worse = float('inf') - else: # mode == 'max': - self.mode_worse = (-float('inf')) - - self.is_better = partial(self._cmp, mode, threshold_mode, threshold) - - def state_dict(self): - """ Returns early stopping state """ - return {key: value for key, value in self.__dict__.items() if key != 'is_better'} - - def load_state_dict(self, state_dict): - """ Loads early stopping state """ - self.__dict__.update(state_dict) - self._init_is_better(mode=self.mode, threshold=self.threshold, - threshold_mode=self.threshold_mode) - -def adjust_learning_rate(learning_rate, optimizer, epoch): - """Sets the learning rate to the initial LR decayed by 10 every 30 epochs""" - - lr = learning_rate * (0.1 ** (epoch // 10)) - - for param_group in optimizer.param_groups: - param_group['lr'] = lr - return lr - -def batch(tensor, batch_size = 50): - """ It is used to create batch samples, each batch has batch_size samples""" - tensor_list = [] - length = tensor.shape[0] - i = 0 - while True: - if (i+1) * batch_size >= length: - tensor_list.append(tensor[i * batch_size: length]) - return tensor_list - tensor_list.append(tensor[i * batch_size: (i+1) * batch_size]) - i += 1 - - -class selu(nn.Module): - def __init__(self): - super(selu, self).__init__() - self.alpha = 1.6732632423543772848170429916717 - self.scale = 1.0507009873554804934193349852946 - - def forward(self, x): - temp1 = self.scale * F.relu(x) - temp2 = self.scale * self.alpha * (F.elu(-1 * F.relu(-1 * x))) - return temp1 + temp2 - - -class alpha_drop(nn.Module): - def __init__(self, p=0.05, alpha=-1.7580993408473766, fixedPointMean=0, fixedPointVar=1): - super(alpha_drop, self).__init__() - keep_prob = 1 - p - self.a = np.sqrt( - fixedPointVar / (keep_prob * ((1 - keep_prob) * pow(alpha - fixedPointMean, 2) + 
fixedPointVar))) - self.b = fixedPointMean - self.a * (keep_prob * fixedPointMean + (1 - keep_prob) * alpha) - self.alpha = alpha - self.keep_prob = 1 - p - self.drop_prob = p - - def forward(self, x): - if self.keep_prob == 1 or not self.training: - # print("testing mode, direct return") - return x - else: - random_tensor = self.keep_prob + torch.rand(x.size()) - - binary_tensor = Variable(torch.floor(random_tensor)) - - if torch.cuda.is_available(): - binary_tensor = binary_tensor.cuda() - - x = x.mul(binary_tensor) - ret = x + self.alpha * (1 - binary_tensor) - ret.mul_(self.a).add_(self.b) - return ret - -def convert_to_3d_matrix(covariate_ids, patient_dict, y_dict = None, timeid_len = 31, cov_mean_dict = None): - """ - create matrix for temporal models. - - :param covariate_ids: the covariate ids in the whole data - :param patient_dict: the dictionary contains the data for each patient - :param y_dict: if the output labels is known, it contains the labels for patients - :param timeid_len: the total number time window gaps when extracting temporal data - :return: return the raw data in 3-D format, patients x covariates x number of windows, and the patients ids - """ - D = len(covariate_ids) - N = len(patient_dict) - T = timeid_len - concept_list =list(covariate_ids) - concept_list.sort() - x_raw = np.zeros((N, D, T), dtype=float) - patient_ind = 0 - p_ids = [] - patient_keys = patient_dict.keys() - #print covariate_ids - for kk in patient_keys: - #print('-------------------') - vals = patient_dict[kk] - #sorted(vals) - p_ids.append(int(kk)) - for timeid, meas in vals.iteritems(): - int_time = int(timeid) - 1 - for val in meas: - if not len(val): - continue - cov_id, cov_val = val - if cov_id not in covariate_ids: - continue - lab_ind = concept_list.index(cov_id) - if cov_mean_dict is None: - x_raw[patient_ind][lab_ind][int_time] = float(cov_val) - else: - mean_std = cov_mean_dict[cov_id] - if mean_std[1]: - x_raw[patient_ind][lab_ind][int_time] = (float(cov_val) - mean_std[0])/mean_std[1] - else: - x_raw[patient_ind][lab_ind][int_time] = float(cov_val) - - - patient_ind = patient_ind + 1 - - #impute the data using the value of previous timestamp - #fw = open('patient_var.txt', 'w') - for i in xrange(N): - for j in xrange(D): - temp = x_raw[i][j] - nonzero_inds = np.nonzero(temp)[0] - count_nonzeros = len(nonzero_inds) - #fw.write(str(i) + '\t' + str(count_nonzeros) + '\n') - if count_nonzeros == 1: - ind = nonzero_inds[0] - for k in xrange(ind + 1, T): - x_raw[i][j][k] = x_raw[i][j][ind] - elif count_nonzeros > 1: - for ind in xrange(1, count_nonzeros): - for k in xrange(nonzero_inds[ind -1] + 1, nonzero_inds[ind]): - x_raw[i][j][k] = x_raw[i][j][nonzero_inds[ind - 1]] - # For last nonzeros. - for k in xrange(nonzero_inds[-1] + 1, T): - x_raw[i][j][k] = x_raw[i][j][nonzero_inds[-1]] - - #fw.close() - - return x_raw, patient_keys - -def forward_impute_missing_value(x_raw): - N = x_raw.shape[0] - D = x_raw.shape[1] - T = x_raw.shape[2] - for i in xrange(N): - for j in xrange(D): - temp = x_raw[i][j] - nonzero_inds = np.nonzero(temp)[0] - count_nonzeros = len(nonzero_inds) - #fw.write(str(i) + '\t' + str(count_nonzeros) + '\n') - if count_nonzeros == 1: - ind = nonzero_inds[0] - for k in xrange(ind + 1, T): - x_raw[i][j][k] = x_raw[i][j][ind] - elif count_nonzeros > 1: - for ind in xrange(1, count_nonzeros): - for k in xrange(nonzero_inds[ind -1] + 1, nonzero_inds[ind]): - x_raw[i][j][k] = x_raw[i][j][nonzero_inds[ind - 1]] - # For last nonzeros. 
- for k in xrange(nonzero_inds[-1] + 1, T): - x_raw[i][j][k] = x_raw[i][j][nonzero_inds[-1]] - - -def convert_to_temporal_format(covariates, timeid_len= 31, normalize = True, predict = False): - """ - It reads the data from covariates extracted by FeatureExtraction package and convert it to temporal data matrix - - :param covariates: covariates extracted by FeatureExtraction package - :param timeid_len: the total number of window gaps when extracting temporal data - :return: return the raw data in 3-D format, patients x covariates x number of windows, and the patients ids - """ - patient_dict = OrderedDict() - print('Loading temporal data') - cov_vals_dict = {} - for row in covariates: - p_id, cov_id, time_id, cov_val = row[0], row[1], row[2], row[3] - cov_id = np.int64(cov_id) - #time_id = int(time_id) - cov_vals_dict.setdefault(cov_id, []).append(float(cov_val)) - if p_id not in patient_dict: - patient_dict[p_id] = {time_id: [(cov_id, cov_val)]} - else: - if time_id not in patient_dict[p_id]: - patient_dict[p_id][time_id] = [(cov_id, cov_val)] - else: - patient_dict[p_id][time_id].append((cov_id, cov_val)) - #covariate_ids.add(cov_id) - #T = 365/time_window - covariate_ids = set() - cov_mean_dict = {} - if not predict: - fw = open('covariate_mean_std.csv', 'w') - for key, val in cov_vals_dict.iteritems(): - mean_val = np.mean(val) - std_val = np.std(val) - - # Remove those covariates with few occurrence (<5) - if len(val) >= 5: - covariate_ids.add(key) - cov_mean_dict[key] = (mean_val, std_val) - fw.write(str(key) + ',' + str(mean_val) + ',' + str(std_val) + '\n') - fw.close() - else: - fp = open('covariate_mean_std.csv', 'r') - for line in fp: - values = line.rstrip().split(',') - key = np.int64(values[0]) - covariate_ids.add(key) - cov_mean_dict[key] = (float(values[1]), float(values[2])) - fp.close() - - - if normalize: - x, patient_keys = convert_to_3d_matrix(covariate_ids, patient_dict, timeid_len = timeid_len, cov_mean_dict = cov_mean_dict) - else: - x, patient_keys = convert_to_3d_matrix(covariate_ids, patient_dict, timeid_len=timeid_len) - - return x, patient_keys - - -def read_covariates(covariate_file): - patient_dict = {} - head = True - with open(covariate_file, 'r') as fp: - for line in fp: - if head: - head = False - continue - values = line.rstrip().split(',') - patient_id = values[1] - cov_id = values[2] - #time_id = int(values[-1]) - # covariates in one patient has time order - patient_dict.setdefault(patient_id, []).append((cov_id)) - new_patient = [] - for key in patient_dict.keys(): - #patient_dict[key].sort() - sort_vals = [] - for val in patient_dict[key]: - if val[1] not in sort_vals: - sort_vals.append(val) - new_patient.append(sort_vals) - - return new_patient - - -def word_embeddings(covariate_file, embedding_size=50): - import gensim.models.word2vec as w2v - modelname = "processed_%s.w2v" % ('heartfailure') - sentences = read_covariates(covariate_file) - model = w2v.Word2Vec(size=embedding_size, min_count=3, workers=4, iter=10, sg=1) - print("building word2vec vocab on %s..." 
% (covariate_file)) - - model.build_vocab(sentences) - print("training...") - model.train(sentences, total_examples=model.corpus_count, epochs=model.iter) - out_file = modelname - print("writing embeddings to %s" % (out_file)) - model.save(out_file) - return out_file - -def read_data(filename): - covariate_ids = set() - patient_dict = {} - head = True - with open(filename) as fp: - for lines in fp: - if head: - head = False - continue - lines = lines.strip('\n').strip('\r').split(',') - try: - p_id, cov_id, time_id, cov_val = lines[1], lines[2], lines[3], lines[4] - except: - pdb.set_trace() - print(p_id, cov_id, time_id) - if p_id not in patient_dict: - patient_dict[p_id] = {} - else: - if time_id not in patient_dict[p_id]: - patient_dict[p_id][time_id] = [] - else: - patient_dict[p_id][time_id].append((cov_id, cov_val)) - covariate_ids.add(cov_id) - patient_dict = {k: v for k, v in patient_dict.iteritems() if v} #remove empty patients - #(15, 2000, 60) 20000 patients, 15 lab tests, - return covariate_ids, patient_dict - -def split_training_validation(classes, validation_size=0.2, shuffle=False): - """split sampels based on balnace classes""" - num_samples = len(classes) - classes = np.array(classes) - classes_unique = np.unique(classes) - num_classes = len(classes_unique) - indices = np.arange(num_samples) - # indices_folds=np.zeros([num_samples],dtype=int) - training_indice = [] - training_label = [] - validation_indice = [] - validation_label = [] - for cl in classes_unique: - indices_cl = indices[classes == cl] - num_samples_cl = len(indices_cl) - - # split this class into k parts - if shuffle: - random.shuffle(indices_cl) # in-place shuffle - - # module and residual - num_samples_each_split = int(num_samples_cl * validation_size) - res = num_samples_cl - num_samples_each_split - - training_indice = training_indice + [val for val in indices_cl[num_samples_each_split:]] - training_label = training_label + [cl] * res - - validation_indice = validation_indice + [val for val in indices_cl[:num_samples_each_split]] - validation_label = validation_label + [cl] * num_samples_each_split - - training_index = np.arange(len(training_label)) - random.shuffle(training_index) - training_indice = np.array(training_indice)[training_index] - training_label = np.array(training_label)[training_index] - - validation_index = np.arange(len(validation_label)) - random.shuffle(validation_index) - validation_indice = np.array(validation_indice)[validation_index] - validation_label = np.array(validation_label)[validation_index] - - return training_indice, training_label, validation_indice, validation_label - - -if __name__ == "__main__": - x_raw = np.array([[1, 1, 0], [0,1,0]]) - x = [] - x.append(x_raw) - x.append(x_raw) - x = np.array(x) - #pdb.set_trace() - forward_impute_missing_value(x) - #filename = sys.argv[1] - #word_embeddings(filename) - ''' - population = joblib.load('/data/share/plp/SYNPUF/population.pkl') - # y = population[:, 1] - covriate_ids, patient_dict = read_data(filename) - # y_ids = np.array([int(val) for val in patient_dict.keys()]) - # Y = [] - y_dict = dict(zip(population[:, 0], population[:, 1])) - #x, patient_keys = convert_2_cnn_format(covariates, timeid_len = 31) - # for val in y_ids: - # Y.append(y_dict[y_ids]) - x_train, x_valid, x_test, Y_train, Y_valid, Y_test = convert_to_temporal_format(covriate_ids, patient_dict, y_dict) - ''' diff --git a/inst/python/deepTorch.py b/inst/python/deepTorch.py deleted file mode 100644 index 7425413..0000000 --- a/inst/python/deepTorch.py +++ 
/dev/null @@ -1,1186 +0,0 @@ -""" - deepTorch.py: It implements different deep learning classifiers - - Copyright 2016 Observational Health Data Sciences and Informatics - - This file is part of PatientLevelPrediction - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -""" - -import sys -import os -import pdb -import torch -import torch.nn as nn -import torch.nn.functional as F -from torch.autograd import Variable -from torch.utils.data import DataLoader, TensorDataset -from torch.nn.utils.rnn import pack_padded_sequence -from torch.nn.utils.rnn import pad_packed_sequence -from collections import OrderedDict -import timeit -import joblib -from sklearn.model_selection import train_test_split -from sklearn.metrics import roc_auc_score -import numpy as np - -import warnings -warnings.filterwarnings("ignore") - -if "python_dir" in globals(): - #print python_dir - sys.path.insert(0, python_dir) - -import TorchUtils as tu - - -class LogisticRegression(nn.Module): - """ - Train a logistic regression model using pytorch - """ - def __init__(self, input_size, num_classes = 2): - super(LogisticRegression, self).__init__() - self.linear = nn.Linear(input_size, num_classes) - - def forward(self, x): - out = self.linear(x) - out = torch.sigmoid(out) - return out - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -class MLP(nn.Module): - """ - Train a multiple-layer perceptron with one hideen layer - """ - def __init__(self, input_dim, hidden_size, num_classes = 2): - super(MLP, self).__init__() - self.fc1 = nn.Linear(input_dim, hidden_size) - self.fc2 = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - x = F.relu(self.fc1(x)) - x = F.dropout(x, p =0.5, training=self.training) - x = self.fc2(x) - x = torch.sigmoid(x) - return x - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -class SNN(nn.Module): - """ - Train a multiple-layer self normalizing neural network, ref arXiv:1706.02515 - """ - - def __init__(self, input_dim, hidden_size, num_classes=2): - super(SNN, self).__init__() - self.fc1 = nn.Linear(input_dim, hidden_size) - self.fc2 = tu.selu() - self.ad1 = tu.alpha_drop() - self.fc4 = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - x = F.relu(self.fc1(x)) - x = F.dropout(x, p=0.5, training=self.training) - x = self.fc2(x) - x = self.ad1(x) - x = self.fc4(x) - x = torch.sigmoid(x) - return x - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - -class 
AutoEncoder(nn.Module): - """ - A stacked autoencoder with 2 hiddden layers and need be adapted for EHR data. - """ - def __init__(self, input_size, encoding_size): - super(AutoEncoder, self).__init__() - - self.encoder = nn.Sequential( - nn.Linear(input_size, input_size/2), - nn.ReLU(True), - nn.Linear(input_size/2, input_size/4), - nn.ReLU(True), - nn.Linear(input_size/4, encoding_size), - nn.ReLU(True) - ) - self.decoder = nn.Sequential( - nn.Linear(encoding_size, input_size/4), - nn.ReLU(True), - nn.Linear(input_size/4, input_size/2), - nn.ReLU(True), - nn.Linear(input_size/2, input_size) - ) - - def forward(self, x): - if torch.cuda.is_available(): - x = x.cuda() - encoded = self.encoder(x) - decoded = self.decoder(encoded) - return decoded - - def get_encode_features(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - encoded = self.encoder(x) - encoded = encoded.data.cpu().numpy() - return encoded - -class VAE(nn.Module): - """ - A stacked variational autoencoder with 2 hiddden layers and need be adapted for EHR data. - """ - def __init__(self, input_size, encoding_size): - super(VAE, self).__init__() - - self.fc1 = nn.Linear(input_size, input_size/2) - self.fc21 = nn.Linear(input_size/2, encoding_size) - self.fc22 = nn.Linear(input_size/2, encoding_size) - self.fc3 = nn.Linear(encoding_size, input_size/2) - self.fc4 = nn.Linear(input_size/2, input_size) - - self.relu = nn.ReLU() - self.sigmoid = nn.Sigmoid() - - def encode(self, x): - h1 = self.relu(self.fc1(x)) - return self.fc21(h1), self.fc22(h1) - - def reparameterize(self, mu, logvar): - if self.training: - std = logvar.mul(0.5).exp_() - eps = Variable(std.data.new(std.size()).normal_()) - return eps.mul(std).add_(mu) - else: - return mu - - def decode(self, z): - h3 = self.relu(self.fc3(z)) - return self.sigmoid(self.fc4(h3)) - - def forward(self, x): - if torch.cuda.is_available(): - x = x.cuda() - mu, logvar = self.encode(x) - z = self.reparameterize(mu, logvar) - return self.decode(z), mu, logvar - - def get_encode_features(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - mu, logvar = self.encode(x) - encoded = self.reparameterize(mu, logvar) - encoded = encoded.data.cpu().numpy() - return encoded - -class Decoder(nn.Module): - """ VAE decoder input_size = original inputsize/16*256""" - def __init__(self, latent_size, input_size, img_channels = 1, kernel_size=(1, 4), stride=(1, 2), padding=(0, 1)): - super(Decoder, self).__init__() - self.latent_size = latent_size - self.img_channels = img_channels - - self.fc1 = nn.Linear(latent_size, input_size) - self.deconv1 = nn.ConvTranspose2d(input_size, 128, kernel_size, stride=stride, padding = padding) - self.deconv2 = nn.ConvTranspose2d(128, 64, kernel_size, stride=stride, padding = padding) - self.deconv3 = nn.ConvTranspose2d(64, 32, kernel_size, stride=stride, padding = padding) - self.deconv4 = nn.ConvTranspose2d(32, img_channels, kernel_size, stride=stride, padding = padding) - - def forward(self, x): # pylint: disable=arguments-differ - x = F.relu(self.fc1(x)) - x = x.unsqueeze(-1).unsqueeze(-1) - x = F.relu(self.deconv1(x)) - x = F.relu(self.deconv2(x)) - x = F.relu(self.deconv3(x)) - reconstruction = torch.sigmoid(self.deconv4(x)) - return reconstruction - -class Encoder(nn.Module): # pylint: disable=too-many-instance-attributes 
- """ VAE encoder """ - def __init__(self, latent_size, input_size, img_channels = 1, kernel_size=(1, 4), stride=(1, 2), padding=(0, 1)): - super(Encoder, self).__init__() - self.latent_size = latent_size - #self.img_size = img_size - self.img_channels = img_channels - - self.conv1 = nn.Conv2d(img_channels, 32, kernel_size, stride=stride, padding = padding) - self.conv2 = nn.Conv2d(32, 64, kernel_size, stride=stride, padding = padding) - self.conv3 = nn.Conv2d(64, 128, kernel_size, stride=stride, padding = padding) - self.conv4 = nn.Conv2d(128, 256, kernel_size, stride=stride, padding = padding) - out_size = input_size / 16 - self.fc_mu = nn.Linear(out_size, latent_size) - self.fc_logsigma = nn.Linear(out_size, latent_size) - - - def forward(self, x): # pylint: disable=arguments-differ - x = F.relu(self.conv1(x)) - x = F.relu(self.conv2(x)) - x = F.relu(self.conv3(x)) - x = F.relu(self.conv4(x)) - x = x.view(x.size(0), -1) - - mu = self.fc_mu(x) - logsigma = self.fc_logsigma(x) - - return mu, logsigma - -class VAE_CNN(nn.Module): - """ Variational Autoencoder """ - def __init__(self, latent_size, input_size): - super(VAE, self).__init__() - self.encoder = Encoder(latent_size, input_size) - input_size = input_size/16 - self.decoder = Decoder(latent_size, input_size) - - def forward(self, x): # pylint: disable=arguments-differ - mu, logsigma = self.encoder(x) - sigma = logsigma.exp() - eps = torch.randn_like(sigma) - z = eps.mul(sigma).add_(mu) - - recon_x = self.decoder(z) - return recon_x, mu, logsigma - - def get_encode_features(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - mu, logvar = self.encoder(x) - encoded = mu .data.cpu().numpy() - return encoded - -class CNN(nn.Module): - def __init__(self, nb_filter, num_classes = 2, kernel_size = (1, 5), pool_size = (1, 3), labcounts = 32, window_size = 12, hidden_size = 200, stride = (1, 1), padding = 0): - super(CNN, self).__init__() - self.layer1 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size, stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU(), - nn.MaxPool2d(pool_size, stride = stride)) - out1_size = (window_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 - maxpool_size = (out1_size + 2*padding - (pool_size[1] - 1) - 1)/stride[1] + 1 - self.layer2 = nn.Sequential( - nn.Conv2d(nb_filter, nb_filter, kernel_size, stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU(), - nn.MaxPool2d(pool_size, stride = stride)) - out2_size = (maxpool_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 - maxpool_size = (out2_size + 2*padding - (pool_size[1] - 1) - 1)/stride[1] + 1 - self.drop1 = nn.Dropout(p=0.5) - self.fc1 = nn.Linear(int(maxpool_size*labcounts*nb_filter), hidden_size) - self.bn = nn.BatchNorm1d(hidden_size) - self.drop2 = nn.Dropout(p=0.5) - self.relu1 = nn.ReLU() - self.fc2 = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - x = x.view(x.size(0), 1, x.size(1), x.size(2)) - out = self.layer1(x) - out = self.layer2(out) - out = out.view(out.size(0), -1) - out = self.drop1(out) - out = self.fc1(out) - out = self.drop2(out) - out = self.bn(out) - out = self.relu1(out) - out = self.fc2(out) - out = torch.sigmoid(out) - return out - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = 
self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -#allow multiple kernel with differnt kernel size -class CNN_MLF(nn.Module): - """ - It is a deep CNNs with three different kernel size, the outputs from the three CNNs are concatenated to fed into two fully connected layers. - """ - def __init__(self, nb_filter, num_classes = 2, kernel_size = (1, 5), pool_size = (1, 3), labcounts = 32, window_size = 12, hidden_size = 200, stride = (1, 1), padding = 0): - super(CNN_MLF, self).__init__() - self.layer1 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size = (1, 3), stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU(), - nn.MaxPool2d(pool_size, stride = stride)) - out1_size = (window_size + 2*padding - (3 - 1) - 1)/stride[1] + 1 - maxpool1_size = (out1_size + 2*padding - (pool_size[1] - 1) - 1)/stride[1] + 1 - self.layer2 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size = (1, 4), stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU(), - nn.MaxPool2d(pool_size, stride = stride)) - out2_size = (window_size + 2*padding - (4 - 1) - 1)/stride[1] + 1 #4 is the convolve filter size - maxpool2_size = (out2_size + 2*padding - (pool_size[1] - 1) - 1)/stride[1] + 1 - self.layer3 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size = (1, 5), stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU(), - nn.MaxPool2d(pool_size, stride = stride)) - out3_size = (window_size + 2*padding - (5 - 1) - 1)/stride[1] + 1 - maxpool3_size = (out3_size + 2*padding - (pool_size[1] - 1) - 1)/stride[1] + 1 - conv_outsize = maxpool1_size + maxpool2_size +maxpool3_size - self.drop1 = nn.Dropout(p=0.5) - self.fc1 = nn.Linear(conv_outsize*labcounts*nb_filter, hidden_size) - self.drop2 = nn.Dropout(p=0.5) - self.relu1 = nn.ReLU() - self.fc2 = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - x = x.view(x.size(0), 1, x.size(1), x.size(2)) - out1 = self.layer1(x) - out2 = self.layer2(x) - out3 = self.layer3(x) - out = torch.cat((out1.view(out1.size(0), -1), out2.view(out2.size(0), -1), out3.view(out2.size(0), -1)), 1) - out = self.drop1(out) - out = self.fc1(out) - out = self.drop2(out) - out = self.relu1(out) - out = self.fc2(out) - out = torch.sigmoid(out) - return out - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -class CNN_LSTM(nn.Module): - """ - It is a deep network with two layer CNN, followed by LSTM layer, which further fed into two fully connected layers. 
- """ - def __init__(self, nb_filter, num_classes = 2, kernel_size = (1, 5), pool_size = (1, 3), labcounts = 32, window_size = 12, hidden_size = 100, stride = (1, 1), padding = 0, num_layers = 2): - super(CNN_LSTM, self).__init__() - self.layer1 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size, stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU(), - nn.MaxPool2d(pool_size, stride = stride)) - self.num_layers = num_layers - self.hidden_size = hidden_size - out1_size = (window_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 - maxpool_size = (out1_size + 2*padding - (pool_size[1] - 1) - 1)/stride[1] + 1 - self.downsample = nn.Conv2d(nb_filter, 1, kernel_size, stride = stride, padding = padding) - input_size = (maxpool_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 - self.layer2 = nn.LSTM(input_size, hidden_size, num_layers, batch_first = True) - self.drop1 = nn.Dropout(p=0.5) - self.fc1 = nn.Linear(hidden_size, hidden_size) - self.drop2 = nn.Dropout(p=0.5) - self.relu1 = nn.ReLU() - self.fc2 = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - x = x.view(x.size(0), 1, x.size(1), x.size(2)) - out = self.layer1(x) - out = self.downsample(out) - out = torch.squeeze(out, 1) - if torch.cuda.is_available(): - x = x.cuda() - h0 = Variable(torch.zeros(self.num_layers, out.size(0), self.hidden_size)).cuda() - c0 = Variable(torch.zeros(self.num_layers, out.size(0), self.hidden_size)).cuda() - else: - h0 = Variable(torch.zeros(self.num_layers, out.size(0), self.hidden_size)) - c0 = Variable(torch.zeros(self.num_layers, out.size(0), self.hidden_size)) - out, hn = self.layer2(out, (h0, c0)) - out = hn[0][-1] - out = self.drop1(out) - out = self.fc1(out) - out = self.drop2(out) - out = self.relu1(out) - out = self.fc2(out) - out = torch.sigmoid(out) - return out - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -class CNN_MIX(nn.Module): - """ - It is a deep network with 2 layers CNN, which works on input and time dimension, respectively, more details refer to deepDianosis in github. 
- """ - def __init__(self, nb_filter, num_classes = 2, kernel_size = (1, 5), pool_size = (1, 3), labcounts = 32, window_size = 12, hidden_size = 100, stride = (1, 1), padding = 0): - super(CNN_MIX, self).__init__() - self.layer1 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size = (labcounts, 1), stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU()) - self.layer2 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size = (nb_filter, 1), stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU(), - nn.MaxPool2d(pool_size)) - out1_size = int(np.ceil(float(window_size)/pool_size[1])) - self.layer3 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size = kernel_size, stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU()) - - out2_size = (out1_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 - self.drop1 = nn.Dropout(p=0.5) - self.fc1 = nn.Linear(out2_size*nb_filter*nb_filter, hidden_size) - self.drop2 = nn.Dropout(p=0.5) - self.relu1 = nn.ReLU() - self.fc2 = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - x = x.view(x.size(0), 1, x.size(1), x.size(2)) - out = self.layer1(x) - out = out.view(out.size(0), out.size(2), out.size(1), out.size(3)) - out = self.layer2(out) - out = out.view(out.size(0), out.size(2), out.size(1), out.size(3)) - out = self.layer3(out) - out = out.view(out.size(0), -1) - out = self.drop1(out) - out = self.fc1(out) - out = self.drop2(out) - out = self.relu1(out) - out = self.fc2(out) - out = torch.sigmoid(out) - return out - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -class CNN_MULTI(nn.Module): - """ - It is a deep network with multiple resolution, more details refer to multiresconvnet of deepDianosis in github. 
- """ - def __init__(self, nb_filter, num_classes = 2, kernel_size = (1, 5), pool_size = (1, 2), labcounts = 32, window_size = 12, hidden_size = 100, stride = (1, 1), padding = 0): - super(CNN_MULTI, self).__init__() - # resolution 1 - self.pool1_1 = nn.MaxPool2d(pool_size, stride = pool_size) - maxpool_size = (window_size + 2*padding - (pool_size[1] - 1) - 1)/pool_size[1] + 1 - self.pool1_2 = nn.MaxPool2d(pool_size, stride = pool_size) - maxpool1_2_size = (maxpool_size + 2*padding - (pool_size[1] - 1) - 1)/pool_size[1] + 1 - - self.layer1 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size = kernel_size, stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU()) - cnn1_size = (maxpool1_2_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 - #resolution 2 - self.pool2_1 = nn.MaxPool2d(pool_size, stride = pool_size) - maxpool2_1_size = (window_size + 2*padding - (pool_size[1] - 1) - 1)/pool_size[1] + 1 - - self.layer2 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size = kernel_size, stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU()) - cnn2_size = (maxpool2_1_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 - self.layer3 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size = kernel_size, stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU(), - nn.MaxPool2d(pool_size)) - cnn3_size = (window_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 - maxpool3_size = (cnn3_size + 2*padding - (pool_size[1] - 1) - 1)/pool_size[1] + 1 - self.layer4 = nn.Sequential( - nn.Conv2d(nb_filter, nb_filter, kernel_size = kernel_size, stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU()) - cnn4_size = (maxpool3_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 - merge_size = cnn1_size + cnn2_size + cnn4_size - self.drop1 = nn.Dropout(p=0.5) - self.fc1 = nn.Linear(labcounts*nb_filter*merge_size, hidden_size) - self.drop2 = nn.Dropout(p=0.5) - self.relu1 = nn.ReLU() - self.fc2 = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - x = x.view(x.size(0), 1, x.size(1), x.size(2)) - out = self.pool1_1(x) - out = self.pool1_2(out) - out1 = self.layer1(out) - out = self.pool2_1(x) - out2 = self.layer2(out) - out = self.layer3(x) - out3 = self.layer4(out) - out = torch.cat((out1.view(out1.size(0), -1), out2.view(out2.size(0), -1), out3.view(out3.size(0), -1)), 1) - out = self.drop1(out) - out = self.fc1(out) - out = self.drop2(out) - out = self.relu1(out) - out = self.fc2(out) - out = torch.sigmoid(out) - return out - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -# 1x3 Convolution -def convR(in_channels, out_channels, kernel_size, stride=1, padding = (0, 1)): - return nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, - padding=padding, stride=stride, bias=False) - - -# Residual Block -class ResidualBlock(nn.Module): - def __init__(self, in_channel, nb_filter = 16, kernel_size = (1, 3), stride=1, downsample=None): - super(ResidualBlock, self).__init__() - self.conv1 = convR(in_channel, nb_filter, kernel_size = kernel_size, stride = stride) - self.bn1 = nn.BatchNorm2d(nb_filter) - self.relu = nn.ReLU(inplace=True) - self.conv2 = convR(nb_filter, nb_filter, kernel_size = kernel_size, stride = stride) - self.bn2 = nn.BatchNorm2d(nb_filter) - 
self.downsample = downsample - - def forward(self, x): - residual = x - out = self.conv1(x) - out = self.bn1(out) - out = self.relu(out) - out = self.conv2(out) - out = self.bn2(out) - if self.downsample: - residual = self.downsample(x) - out += residual - out = self.relu(out) - return out - - -# ResNet Module -class ResNet(nn.Module): - def __init__(self, block, layers, nb_filter = 16, labcounts = 12, window_size = 36, kernel_size = (1, 3), pool_size = (1, 3), num_classes=2, hidden_size = 100): - super(ResNet, self).__init__() - self.in_channels = 1 - self.conv = convR(self.in_channels, nb_filter, kernel_size = kernel_size) - self.bn = nn.BatchNorm2d(nb_filter) - self.relu = nn.ReLU(inplace=True) - self.layer1 = self.make_layer(block, nb_filter, layers[0], kernel_size = kernel_size) - self.layer2 = self.make_layer(block, nb_filter*2, layers[1], 1, kernel_size = kernel_size, in_channels = nb_filter) - self.layer3 = self.make_layer(block, nb_filter*4, layers[2], 1, kernel_size = kernel_size, in_channels = 2*nb_filter) - self.avg_pool = nn.AvgPool2d(pool_size) - avgpool2_1_size = (window_size - (pool_size[1] - 1) - 1)/pool_size[1] + 1 - last_layer_size = nb_filter*4*labcounts*avgpool2_1_size - self.fc = nn.Linear(last_layer_size, hidden_size) - self.drop2 = nn.Dropout(p=0.5) - self.relu1 = nn.ReLU() - self.fc2 = nn.Linear(hidden_size, num_classes) - - def make_layer(self, block, out_channels, blocks, stride=1, kernel_size = (1, 3), in_channels = 16): - downsample = None - if (stride != 1) or (self.in_channels != out_channels): - downsample = nn.Sequential( - convR(in_channels, out_channels, kernel_size = kernel_size, stride=stride), - nn.BatchNorm2d(out_channels)) - layers = [] - layers.append(block(in_channels, out_channels, kernel_size = kernel_size, stride = stride, downsample = downsample)) - self.in_channels = out_channels - for i in range(1, blocks): - layers.append(block(out_channels, out_channels, kernel_size = kernel_size)) - return nn.Sequential(*layers) - - def forward(self, x): - x = x.view(x.size(0), 1, x.size(1), x.size(2)) - out = self.conv(x) - out = self.bn(out) - out = self.relu(out) - out = self.layer1(out) - out = self.layer2(out) - out = self.layer3(out) - out = self.avg_pool(out) - out = out.view(out.size(0), -1) - out = self.fc(out) - out = self.drop2(out) - out = self.relu1(out) - out = self.fc2(out) - out = torch.sigmoid(out) - return out - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -class GRU(nn.Module): - """ - It is a deep network with one GRU layer, which are further fed into one fully connected layers. 
- """ - def __init__(self, input_size, hidden_size, num_layers, num_classes = 2, dropout = 0.5): - super(GRU, self).__init__() - - self.hidden_size = hidden_size - self.num_layers = num_layers - self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first = True, dropout = dropout) - self.linear = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - if torch.cuda.is_available(): - x = x.cuda() - h0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)).cuda() # 2 for bidirection - else: - h0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)) # 2 for bidirection - self.gru.flatten_parameters() - out, hn = self.gru(x, h0) - - rearranged = hn[-1] - out = self.linear(rearranged) - out = torch.sigmoid(out) - return out - - def initHidden(self, N): - return Variable(torch.randn(1, N, self.hidden_size)) - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -class RNN(nn.Module): - """ - It is a deep network with one LSTM layer, which are further fed into one fully connected layer. - """ - def __init__(self, input_size, hidden_size, num_layers, num_classes = 2, dropout = 0.5): - super(RNN, self).__init__() - self.hidden_size = hidden_size - self.num_layers = num_layers - self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first = True, dropout = dropout) - self.fc = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - if torch.cuda.is_available(): - x = x.cuda() - h0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)).cuda() - c0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)).cuda() - else: - h0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)) - c0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)) - self.lstm.flatten_parameters() - out, hn = self.lstm(x, (h0, c0)) - rearranged = hn[0][-1] - # Decode hidden state of last time step - out = self.fc(rearranged) - out = torch.sigmoid(out) - return out - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -class BiRNN(nn.Module): - """ - It is a deep network with one bidirectional LSTM layer, which are further fed into one fully connected layer. 
- """ - def __init__(self, input_size, hidden_size, num_layers, num_classes = 2, dropout = 0.5): - super(BiRNN, self).__init__() - self.hidden_size = hidden_size - self.num_layers = num_layers - self.lstm = nn.LSTM(input_size, hidden_size, num_layers, - batch_first = True, dropout = dropout, bidirectional=True) - self.fc = nn.Linear(hidden_size*2, num_classes) # 2 for bidirection - - def forward(self, x): - if torch.cuda.is_available(): - x = x.cuda() - h0 = Variable(torch.zeros(self.num_layers*2, x.size(0), self.hidden_size)).cuda() # 2 for bidirection - c0 = Variable(torch.zeros(self.num_layers*2, x.size(0), self.hidden_size)).cuda() - else: - h0 = Variable(torch.zeros(self.num_layers*2, x.size(0), self.hidden_size)) # 2 for bidirection - c0 = Variable(torch.zeros(self.num_layers*2, x.size(0), self.hidden_size)) - self.lstm.flatten_parameters() - out, hn = self.lstm(x, (h0, c0)) - hn = hn[0] - - rearranged = hn[-2:].view(x.size(0), -1) - # Decode hidden state of last time step - out = self.fc(rearranged) - out = torch.sigmoid(out) - return out - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -# select model -if __name__ == "__main__": - if model_type in ['LogisticRegression', 'MLP', 'SNN']: - y = population[:, 1] - X = plpData[population[:, 0], :] - trainInds = population[:, population.shape[1] - 1] > 0 - if class_weight == -1: - loss = tu.FocalLoss(gamma = 5) - else: - if class_weight == 0: - weights = float(np.count_nonzero(y))/y.shape[0] - class_weight = [1 - weights, weights] - else: - class_weight = [class_weight, 1] - class_weight = 1/torch.Tensor(class_weight) - if torch.cuda.is_available(): - class_weight = class_weight.cuda() - loss=nn.CrossEntropyLoss(weight = class_weight) - - print("Dataset has %s rows and %s columns" % (X.shape[0], X.shape[1])) - print("population loaded- %s rows and %s columns" % (np.shape(population)[0], np.shape(population)[1])) - ########################################################################### - l1regularization = False - - if train: - pred_size = int(np.sum(population[:, population.shape[1] - 1] > 0)) - print("Calculating prediction for train set of size %s" % (pred_size)) - test_pred = np.zeros(pred_size) # zeros length sum(population[:,population.size[1]] ==i) - for i in range(1, int(np.max(population[:, population.shape[1] - 1]) + 1), 1): - testInd = population[population[:, population.shape[1] - 1] > 0, population.shape[1] - 1] == i - trainInd = (population[population[:, population.shape[1] - 1] > 0, population.shape[1] - 1] != i) - train_x = X[trainInds, :][trainInd, :] - train_y = y[trainInds][trainInd] - - test_x = X[trainInds, :][testInd, :] - print("Fold %s split %s in train set and %s in test set" % (i, train_x.shape[0], test_x.shape[0])) - print("Train set contains %s outcomes " % (np.sum(train_y))) - train_x = train_x.toarray() - test_x = test_x.toarray() - - if autoencoder: - print('first train stakced autoencoder') - encoding_size = 256 - if vae: - auto_model = VAE(input_size=train_x.shape[1], encoding_size=encoding_size) - else: - auto_model = AutoEncoder(input_size=train_x.shape[1], encoding_size=encoding_size) - if torch.cuda.is_available(): - auto_model = auto_model.cuda() - clf = tu.Estimator(auto_model) - clf.compile(optimizer=torch.optim.Adam(auto_model.parameters(), lr=1e-3, weight_decay = w_decay), - 
loss=nn.MSELoss()) - clf.fit(train_x, train_y, batch_size=32, nb_epoch=epochs, autoencoder = autoencoder, vae = vae) - #split to batch for large dataset - train_batch = tu.batch(train_x, batch_size=32) - train_x = np.array([]).reshape(0, encoding_size) - for train in train_batch: - encode_train = auto_model.get_encode_features(train) - train_x = np.concatenate((train_x, encode_train), axis=0) - #train_x = auto_model.get_encode_features(train_x.toarray()) - #test_x = auto_model.get_encode_features(test_x.toarray()) - test_batch = tu.batch(test_x, batch_size=32) - test_x = np.array([]).reshape(0, encoding_size) - for test in test_batch: - encode_Test = auto_model.get_encode_features(test) - test_x = np.concatenate((test_x, encode_Test), axis=0) - del auto_model - del clf - # train on fold - print("Training fold %s" % (i)) - start_time = timeit.default_timer() - if model_type == 'LogisticRegression': - model = LogisticRegression(train_x.shape[1]) - l1regularization = True - elif model_type == 'SNN': - model = SNN(train_x.shape[1], size) - else: - model = MLP(train_x.shape[1], size) - - if torch.cuda.is_available(): - model = model.cuda() - clf = tu.Estimator(model) - clf.compile(optimizer=torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay = w_decay), - loss=loss) - #if not autoencoder: - # train_x = train_x.toarray() - # test_x = test_x.toarray() - - clf.fit(train_x, train_y, batch_size=32, nb_epoch=epochs, l1regularization = l1regularization) - - ind = (population[:, population.shape[1] - 1] > 0) - ind = population[ind, population.shape[1] - 1] == i - - test_input_var = torch.from_numpy(test_x.astype(np.float32)) - - test_batch = tu.batch(test_x, batch_size = 32) - temp = [] - for test in test_batch: - pred_test1 = model.predict_proba(test)[:, 1] - temp = np.concatenate((temp, pred_test1), axis = 0) - #temp = model.predict_proba(test_input_var)[:, 1] - #temp = preds.data.cpu().numpy().flatten() - #print temp - test_pred[ind] = temp - print("Prediction complete: %s rows " % (np.shape(test_pred[ind])[0])) - print("Mean: %s prediction value" % (np.mean(test_pred[ind]))) - - # merge pred with indexes[testInd,:] - test_pred.shape = (population[population[:, population.shape[1] - 1] > 0, :].shape[0], 1) - prediction = np.append(population[population[:, population.shape[1] - 1] > 0, :], test_pred, axis=1) - - # train final: - else: - print("Training final neural network model on all train data...") - print("X- %s rows and Y %s length" % (X[trainInds, :].shape[0], y[trainInds].shape[0])) - - start_time = timeit.default_timer() - - train_x = X[trainInds, :] - train_x = train_x.toarray() - train_y = y[trainInds] - if not os.path.exists(modelOutput): - os.makedirs(modelOutput) - if autoencoder: - encoding_size = 256 - if vae: - auto_model = VAE(input_size=train_x.shape[1], encoding_size=encoding_size) - else: - auto_model = AutoEncoder(input_size=train_x.shape[1], encoding_size=encoding_size) - if torch.cuda.is_available(): - auto_model = auto_model.cuda() - clf = tu.Estimator(auto_model) - clf.compile(optimizer=torch.optim.Adam(auto_model.parameters(), lr=1e-3, weight_decay=w_decay), - loss=nn.MSELoss()) - - clf.fit(train_x, train_y, batch_size=32, nb_epoch=epochs, autoencoder=autoencoder, vae = vae) - #train_x = auto_model.get_encode_features(train_x.toarray()) - train_batch = tu.batch(train_x, batch_size=32) - train_x = np.array([]).reshape(0, encoding_size) - for train in train_batch: - encode_train = auto_model.get_encode_features(train) - train_x = np.concatenate((train_x, encode_train), 
axis=0) - joblib.dump(auto_model, os.path.join(modelOutput, 'autoencoder_model.pkl')) - del auto_model - del clf - - print('the final parameter epochs %.2f weight_decay %.2f' %(epochs,w_decay)) - if model_type == 'LogisticRegression': - model = LogisticRegression(train_x.shape[1]) - l1regularization = True - elif model_type == 'SNN': - model = SNN(train_x.shape[1], size) - else: - model = MLP(train_x.shape[1], size) - - #if not autoencoder: - # train_x = train_x.toarray() - - if torch.cuda.is_available(): - model = model.cuda() - clf = tu.Estimator(model) - clf.compile(optimizer=torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay = w_decay), - loss=loss) - clf.fit(train_x, train_y, batch_size=32, nb_epoch=epochs, l1regularization = l1regularization) - - end_time = timeit.default_timer() - print("Training final took: %.2f s" % (end_time - start_time)) - - # save the model: - print("Model saved to: %s" % (modelOutput)) - - joblib.dump(model, os.path.join(modelOutput,'model.pkl')) - - - elif model_type in ['CNN', 'RNN', 'CNN_LSTM', 'CNN_MLF', 'CNN_MIX', 'GRU', 'BiRNN', 'CNN_MULTI', 'ResNet']: - #print 'running model', model_type - y = population[:, 1] - #plpData = plpData[population[:, 0], :] - #config = tf.ConfigProto() - #config.gpu_options.allow_growth = True - #with tf.Session(config=config) as sess: - # X = tf.sparse_reorder(plpData) - # X = tf.sparse_tensor_to_dense(X) - # X = sess.run(X) - #tu.forward_impute_missing_value(X) - X = plpData.to_dense().numpy() - X = X[np.int64(population[:, 0]), :] - ''' - p_ids_in_cov = set(covariates[:, 0]) - full_covariates = np.array([]).reshape(0,4) - default_covid = covariates[0, 1] - timeid_len = len(set(covariates[:, -2])) - for p_id in population[:, 0]: - if p_id not in p_ids_in_cov: - tmp_x = np.array([p_id, default_covid, 1, 0]).reshape(1,4) #default cov id, timeid=1 - full_covariates = np.concatenate((full_covariates, tmp_x), axis=0) - else: - tmp_x = covariates[covariates[:, 0] == p_id, :] - full_covariates = np.concatenate((full_covariates, tmp_x), axis=0) - - trainInds = population[:, population.shape[1] - 1] > 0 - X, patient_keys = tu.convert_to_temporal_format(full_covariates, timeid_len= timeid_len) - full_covariates = [] - ''' - if class_weight == -1: - loss = tu.FocalLoss(gamma = 3) - else: - if class_weight == 0: - weights = float(np.count_nonzero(y))/y.shape[0] - class_weight = [1 - weights, weights] - else: - class_weight = [class_weight, 1] - class_weight = 1/torch.Tensor(class_weight) - if torch.cuda.is_available(): - class_weight = class_weight.cuda() - loss=nn.CrossEntropyLoss(weight = class_weight) - trainInds = population[:, population.shape[1] - 1] > 0 - if train: - pred_size = int(np.sum(population[:, population.shape[1] - 1] > 0)) - print("Calculating prediction for train set of size %s" % (pred_size)) - test_pred = np.zeros(pred_size) # zeros length sum(population[:,population.size[1]] ==i) - for i in range(1, int(np.max(population[:, population.shape[1] - 1]) + 1), 1): - testInd = population[population[:, population.shape[1] - 1] > 0, population.shape[1] - 1] == i - trainInd = (population[population[:, population.shape[1] - 1] > 0, population.shape[1] - 1] != i) - train_x = X[trainInds, :][trainInd, :] - train_y = y[trainInds][trainInd] - - test_x = X[trainInds, :][testInd, :] - print("Fold %s split %s in train set and %s in test set" % (i, train_x.shape[0], test_x.shape[0])) - print("Train set contains %s outcomes " % (np.sum(train_y))) - - # train on fold - learning_rate = 0.001 - print("Training fold %s" % (i)) 
- start_time = timeit.default_timer() - if model_type == 'CNN': - model = CNN(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) - elif model_type == 'CNN_LSTM': - model = CNN_LSTM(nb_filter = nbfilters, labcounts=train_x.shape[1], window_size=train_x.shape[2]) - elif model_type == 'CNN_MLF': # multiple kernels with different size - model = CNN_MLF(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) - elif model_type == 'CNN_MIX': # mixed model from deepDiagnosis - model = CNN_MIX(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) - elif model_type == 'CNN_MULTI': # multiple resolution model from deepDiagnosis - model = CNN_MULTI(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) - elif model_type == 'ResNet': - print('train ResNet') - model = ResNet(ResidualBlock, [3, 3, 3], nb_filter=nbfilters, labcounts=train_x.shape[1], window_size=train_x.shape[2]) - elif model_type == 'RNN': - model = RNN(train_x.shape[2], hidden_size, 2, 2) - elif model_type == 'BiRNN': - model = BiRNN(train_x.shape[2], hidden_size, 2, 2) - elif model_type == 'GRU': - model = GRU(train_x.shape[2], hidden_size, 2, 2) - else: - print('temproal data not supported by this model') - - if torch.cuda.is_available(): - model = model.cuda() - clf = tu.Estimator(model) - clf.compile(optimizer=torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay = 0.0001), - loss=loss) - - clf.fit(train_x, train_y, batch_size=32, nb_epoch=epochs) - - ind = (population[:, population.shape[1] - 1] > 0) - ind = population[ind, population.shape[1] - 1] == i - - test_batch = tu.batch(test_x, batch_size = 32) - temp = [] - for test in test_batch: - pred_test1 = model.predict_proba(test)[:, 1] - temp = np.concatenate((temp, pred_test1), axis = 0) - - test_pred[ind] = temp - del model - print("Prediction complete: %s rows " % (np.shape(test_pred[ind])[0])) - print("Mean: %s prediction value" % (np.mean(test_pred[ind]))) - - # merge pred with indexes[testInd,:] - test_pred.shape = (population[population[:, population.shape[1] - 1] > 0, :].shape[0], 1) - prediction = np.append(population[population[:, population.shape[1] - 1] > 0, :], test_pred, axis=1) - - # train final: - else: - print("Training final neural network model on all train data...") - print("X- %s rows and Y %s length" % (X[trainInds, :].shape[0], y[trainInds].shape[0])) - - start_time = timeit.default_timer() - - train_x = X[trainInds, :] - train_y = y[trainInds] - learning_rate = 0.001 - - if model_type == 'CNN': - model = CNN(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) - elif model_type == 'CNN_LSTM': - model = CNN_LSTM(nb_filter=nbfilters, labcounts=train_x.shape[1], window_size=train_x.shape[2]) - elif model_type == 'CNN_MLF': # multiple kernels with different size - model = CNN_MLF(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) - elif model_type == 'CNN_MIX': #mixed model from deepDiagnosis - model = CNN_MIX(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) - elif model_type == 'CNN_MULTI': # multi resolution model from deepDiagnosis - model = CNN_MULTI(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) - elif model_type == 'ResNet': - model = ResNet(ResidualBlock, [3, 3, 3], nb_filter=nbfilters, labcounts=train_x.shape[1], window_size=train_x.shape[2]) - elif model_type == 'RNN': - model = 
RNN(train_x.shape[2], hidden_size, 2, 2) - elif model_type == 'BiRNN': - model = BiRNN(train_x.shape[2], hidden_size, 2, 2) - elif model_type == 'GRU': - model = GRU(train_x.shape[2], hidden_size, 2, 2) - else: - print('temproal data not supported by this model') - - if torch.cuda.is_available(): - model = model.cuda() - clf = tu.Estimator(model) - clf.compile(optimizer=torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay = 0.0001), - loss=loss) - clf.fit(train_x, train_y, batch_size=32, nb_epoch=epochs) - - end_time = timeit.default_timer() - print("Training final took: %.2f s" % (end_time - start_time)) - - # save the model: - if not os.path.exists(modelOutput): - os.makedirs(modelOutput) - print("Model saved to: %s" % (modelOutput)) - - joblib.dump(model, os.path.join(modelOutput,'model.pkl')) - - # prediction on train: - test_batch = tu.batch(train_x, batch_size = 32) - test_pred = [] - for test in test_batch: - pred_test1 = model.predict_proba(test)[:, 1] - test_pred = np.concatenate((test_pred, pred_test1), axis = 0) - test_pred.shape = (population.shape[0], 1) - prediction = np.append(population, test_pred, axis=1) - -''' -if __name__ == "__main__": - DATA_SIZE = 1000 - INPUT_SIZE = 36 - HIDDEN_SIZE = 100 - class_size = 2 - #X = np.random.randn(DATA_SIZE * class_size, 18, INPUT_SIZE) - X = np.random.randn(DATA_SIZE * class_size, INPUT_SIZE) - y = np.array([i for i in range(class_size) for _ in range(DATA_SIZE)]) - - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2) - model = LogisticRegression(X_train.shape[1]) - l1regularization = True - #model = CNN_LSTM(nb_filter = 16, labcounts = X.shape[1], window_size = X.shape[2]) - #model = ResNet(ResidualBlock, [3, 3, 3], nb_filter = 16, labcounts = X.shape[1], window_size = X.shape[2]) - #model = RNN(INPUT_SIZE, HIDDEN_SIZE, 2, class_size) - #pdb.set_trace() - if cuda: - model = model.cuda() - clf = Estimator(model) - clf.compile(optimizer=torch.optim.Adam(model.parameters(), lr=1e-4), - loss=nn.CrossEntropyLoss()) - clf.fit(X_train, y_train, batch_size=64, nb_epoch=10, - validation_data=(X_test, y_test), l1regularization = l1regularization) - score, auc = clf.evaluate(X_test, y_test) - - print('Test score: %s' %(auc)) -''' diff --git a/inst/python/deepTorchFunctions.py b/inst/python/deepTorchFunctions.py deleted file mode 100644 index ff2f6f0..0000000 --- a/inst/python/deepTorchFunctions.py +++ /dev/null @@ -1,1732 +0,0 @@ -""" - deepTorch.py: It implements different deep learning classifiers - - Copyright 2016 Observational Health Data Sciences and Informatics - - This file is part of PatientLevelPrediction - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
-""" - -import sys -import os -import pdb -import torch -import torch.nn as nn -import torch.nn.functional as F -from torch.autograd import Variable -from torch.utils.data import DataLoader, TensorDataset -from torch.nn.utils.rnn import pack_padded_sequence -from torch.nn.utils.rnn import pad_packed_sequence -from collections import OrderedDict -import timeit -import joblib -from sklearn.model_selection import train_test_split -from sklearn.metrics import roc_auc_score -import numpy as np - -import warnings -warnings.filterwarnings("ignore") - -class FocalLoss(nn.Module): - """ - Method to handle data imbalance based on paper (arXiv:1708.02002) entitled - Focal loss for dense object detection. - Loss(x, class) = - (1-softmax(x)[class])^gamma \log(softmax(x)[class]) - - """ - - def __init__(self, gamma=5, eps=1e-7, size_average=False): - super(FocalLoss, self).__init__() - self.gamma = gamma - self.eps = eps - self.size_average = size_average - - def forward(self, input, target): - y = self.one_hot(target, input.size(-1)) - logit = F.softmax(input) - logit = logit.clamp(self.eps, 1. - self.eps) - - loss = -1 * y * torch.log(logit) # cross entropy - loss = loss * (1 - logit) ** self.gamma # focal loss - - if self.size_average: - loss = loss.mean() - else: - loss = loss.sum() - return loss - - def one_hot(self, index, classes): - """ - - :param index: is the labels - :param classes: number if classes - :return: - """ - size = index.size() + (classes,) - view = index.size() + (1,) - - mask = torch.Tensor(*size).fill_(0) - index = index.view(*view) - ones = 1. - - if isinstance(index, Variable): - ones = Variable(torch.Tensor(index.size()).fill_(1)) - mask = Variable(mask, volatile=index.volatile) - if torch.cuda.is_available(): - ones = ones.cuda() - mask = mask.cuda() - - return mask.scatter_(1, index, ones) - -def loss_function(recon_x, x, mu, logvar): - """Loss function for varational autoencoder VAE""" - BCE = F.binary_cross_entropy(recon_x, x, size_average=False) - - # 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2) - KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp()) - - return BCE + KLD - - -def mixup_data(x, y, alpha=1.0): - - '''Compute the mixup data. Return mixed inputs, pairs of targets, and lambda - Data Augmentation method based on paper (arXiv:1710.09412) entitled - mixup: Beyond empirical risk minimization. - ''' - if alpha > 0.: - lam = np.random.beta(alpha, alpha) - else: - lam = 1. - batch_size = x.size()[0] - if torch.cuda.is_available(): - index = torch.randperm(batch_size).cuda() - else: - index = torch.randperm(batch_size) - - mixed_x = lam * x + (1 - lam) * x[index,:] - y_a, y_b = y, y[index] - return mixed_x, y_a, y_b, lam - -def mixup_criterion(y_a, y_b, lam): - return lambda criterion, pred: lam * criterion(pred, y_a) + (1 - lam) * criterion(pred, y_b) - -def early_stop(metrics_hist, patience = 3): - if not np.all(np.isnan(metrics_hist)): - return np.nanargmin(metrics_hist) > len(metrics_hist) - patience - else: - #keep training if criterion results have all been nan so far - return False - - -class Estimator(object): - """ - It is used for training different deep models in the same interface. 
- """ - - def __init__(self, model): - self.model = model - - def compile(self, optimizer, loss): - self.optimizer = optimizer - self.loss_f = loss - - def _fit(self, train_loader, l1regularization=False, autoencoder=False, mixup=False, vae = False): - """ - train one epoch - - :param train_loader: The data loaded using DataLoader - :param l1regularization: default False - :return: the return fitted loss and accuracy - """ - loss_list = [] - acc_list = [] - for idx, (X, y) in enumerate(train_loader): - X_v = Variable(X) - y_v = Variable(y) - if torch.cuda.is_available(): - X_v = X_v.cuda() - y_v = y_v.cuda() - - if mixup: - X_v, y_v_a, y_v_b, lam = mixup_data(X_v, y_v) - X_v, y_v_a, y_v_b = Variable(X_v), Variable(y_v_a), Variable(y_v_b) - - # print 'GPU id', torch.cuda.current_device() - self.optimizer.zero_grad() - # the below comemnted lines are used for multiple GPU training - # if torch.cuda.device_count() > 1: - # net = torch.nn.DataParallel(self.model, device_ids = range(torch.cuda.device_count())) - # if cuda: - # net = net.cuda() - # y_pred = net(X_v) - if autoencoder: - if vae: - y_pred, mu, logvar = self.model(X_v) - loss = loss_function(y_pred, X_v, mu, logvar) - else: - y_pred = self.model(X_v) - loss = self.loss_f(y_pred, X_v) - else: - y_pred = self.model(X_v) - - loss = self.loss_f(y_pred, y_v) - - if mixup: - loss_func = mixup_criterion(y_v_a, y_v_b, lam) - loss = loss_func(self.loss_f, y_pred) - - if l1regularization: - l1_crit = nn.L1Loss(size_average=False) - reg_loss = 0 - for param in self.model.parameters(): - target = Variable(torch.from_numpy(np.zeros(param.size()).astype(np.float32))) - if torch.cuda.is_available(): - target = target.cuda() - reg_loss += l1_crit(param, target) - - factor = 0.0005 - loss += factor * reg_loss - - loss.backward() - self.optimizer.step() - loss_list.append(loss.item()) - if autoencoder: - acc_list.append(0) - else: - classes = torch.topk(y_pred, 1)[1].data.cpu().numpy().flatten() - acc = self._accuracy(classes, y_v.data.cpu().numpy().flatten()) - acc_list.append(acc) - del loss - del y_pred - - return sum(loss_list) / len(loss_list), sum(acc_list) / len(acc_list) - - def fit(self, X, y, batch_size=32, nb_epoch=10, validation_data=(), l1regularization=False, autoencoder =False, vae = False): - train_set = TensorDataset(torch.from_numpy(X.astype(np.float32)), - torch.from_numpy(y.astype(np.float32)).long().view(-1)) - train_loader = DataLoader(dataset=train_set, batch_size=batch_size, shuffle=True) - self.model.train() - for t in range(nb_epoch): - loss, acc = self._fit(train_loader, l1regularization=l1regularization, autoencoder = autoencoder, vae = vae) - #print loss - val_log = '' - if validation_data and not autoencoder: - val_loss, auc = self.evaluate(validation_data[0], validation_data[1], batch_size) - - val_log = "- val_loss: %06.4f - auc: %6.4f" % (val_loss, auc) - print(val_log) - # print("Epoch %s/%s loss: %06.4f - acc: %06.4f %s" % (t, nb_epoch, loss, acc, val_log)) - - def evaluate(self, X, y, batch_size=32): - y_pred = self.predict(X) - y_v = Variable(torch.from_numpy(y).long(), requires_grad=False) - if torch.cuda.is_available(): - y_v = y_v.cuda() - loss = self.loss_f(y_pred, y_v) - predict = y_pred.data.cpu().numpy()[:, 1].flatten() - auc = roc_auc_score(y, predict) - return loss.item(), auc - - def _accuracy(self, y_pred, y): - return float(sum(y_pred == y)) / y.shape[0] - - def predict(self, X): - X = Variable(torch.from_numpy(X.astype(np.float32))) - if torch.cuda.is_available(): - X = X.cuda() - y_pred = 
self.model(X) - return y_pred - - def predict_proba(self, X): - self.model.eval() - return self.model.predict_proba(X) - -class EarlyStopping(object): # pylint: disable=R0902 - """ - Gives a criterion to stop training when a given metric is not - improving anymore - Args: - mode (str): One of `min`, `max`. In `min` mode, training will - be stopped when the quantity monitored has stopped - decreasing; in `max` mode it will be stopped when the - quantity monitored has stopped increasing. Default: 'min'. - patience (int): Number of epochs with no improvement after - which training is stopped. For example, if - `patience = 2`, then we will ignore the first 2 epochs - with no improvement, and will only stop learning after the - 3rd epoch if the loss still hasn't improved then. - Default: 10. - threshold (float): Threshold for measuring the new optimum, - to only focus on significant changes. Default: 1e-4. - threshold_mode (str): One of `rel`, `abs`. In `rel` mode, - dynamic_threshold = best * ( 1 + threshold ) in 'max' - mode or best * ( 1 - threshold ) in `min` mode. - In `abs` mode, dynamic_threshold = best + threshold in - `max` mode or best - threshold in `min` mode. Default: 'rel'. - """ - - def __init__(self, mode='min', patience=3, threshold=1e-4, threshold_mode='rel'): - self.patience = patience - self.mode = mode - self.threshold = threshold - self.threshold_mode = threshold_mode - self.best = None - self.num_bad_epochs = None - self.mode_worse = None # the worse value for the chosen mode - self.is_better = None - self.last_epoch = -1 - self._init_is_better(mode=mode, threshold=threshold, - threshold_mode=threshold_mode) - self._reset() - - def _reset(self): - """Resets num_bad_epochs counter and cooldown counter.""" - self.best = self.mode_worse - self.num_bad_epochs = 0 - - def step(self, metrics, epoch=None): - """ Updates early stopping state """ - current = metrics - if epoch is None: - epoch = self.last_epoch = self.last_epoch + 1 - self.last_epoch = epoch - - if self.is_better(current, self.best): - self.best = current - self.num_bad_epochs = 0 - else: - self.num_bad_epochs += 1 - - @property - def stop(self): - """ Should we stop learning? """ - return self.num_bad_epochs > self.patience - - def _cmp(self, mode, threshold_mode, threshold, a, best): # pylint: disable=R0913, R0201 - if mode == 'min' and threshold_mode == 'rel': - rel_epsilon = 1. - threshold - return a < best * rel_epsilon - - elif mode == 'min' and threshold_mode == 'abs': - return a < best - threshold - - elif mode == 'max' and threshold_mode == 'rel': - rel_epsilon = threshold + 1. 
- return a > best * rel_epsilon - - return a > best + threshold - - def _init_is_better(self, mode, threshold, threshold_mode): - if mode not in {'min', 'max'}: - raise ValueError('mode ' + mode + ' is unknown!') - if threshold_mode not in {'rel', 'abs'}: - raise ValueError('threshold mode ' + threshold_mode + ' is unknown!') - - if mode == 'min': - self.mode_worse = float('inf') - else: # mode == 'max': - self.mode_worse = (-float('inf')) - - self.is_better = partial(self._cmp, mode, threshold_mode, threshold) - - def state_dict(self): - """ Returns early stopping state """ - return {key: value for key, value in self.__dict__.items() if key != 'is_better'} - - def load_state_dict(self, state_dict): - """ Loads early stopping state """ - self.__dict__.update(state_dict) - self._init_is_better(mode=self.mode, threshold=self.threshold, - threshold_mode=self.threshold_mode) - -def adjust_learning_rate(learning_rate, optimizer, epoch): - """Sets the learning rate to the initial LR decayed by 10 every 30 epochs""" - - lr = learning_rate * (0.1 ** (epoch // 10)) - - for param_group in optimizer.param_groups: - param_group['lr'] = lr - return lr - -def batch(tensor, batch_size = 50): - """ It is used to create batch samples, each batch has batch_size samples""" - tensor_list = [] - length = tensor.shape[0] - i = 0 - while True: - if (i+1) * batch_size >= length: - tensor_list.append(tensor[i * batch_size: length]) - return tensor_list - tensor_list.append(tensor[i * batch_size: (i+1) * batch_size]) - i += 1 - - -class selu(nn.Module): - def __init__(self): - super(selu, self).__init__() - self.alpha = 1.6732632423543772848170429916717 - self.scale = 1.0507009873554804934193349852946 - - def forward(self, x): - temp1 = self.scale * F.relu(x) - temp2 = self.scale * self.alpha * (F.elu(-1 * F.relu(-1 * x))) - return temp1 + temp2 - - -class alpha_drop(nn.Module): - def __init__(self, p=0.05, alpha=-1.7580993408473766, fixedPointMean=0, fixedPointVar=1): - super(alpha_drop, self).__init__() - keep_prob = 1 - p - self.a = np.sqrt( - fixedPointVar / (keep_prob * ((1 - keep_prob) * pow(alpha - fixedPointMean, 2) + fixedPointVar))) - self.b = fixedPointMean - self.a * (keep_prob * fixedPointMean + (1 - keep_prob) * alpha) - self.alpha = alpha - self.keep_prob = 1 - p - self.drop_prob = p - - def forward(self, x): - if self.keep_prob == 1 or not self.training: - # print("testing mode, direct return") - return x - else: - random_tensor = self.keep_prob + torch.rand(x.size()) - - binary_tensor = Variable(torch.floor(random_tensor)) - - if torch.cuda.is_available(): - binary_tensor = binary_tensor.cuda() - - x = x.mul(binary_tensor) - ret = x + self.alpha * (1 - binary_tensor) - ret.mul_(self.a).add_(self.b) - return ret - -def convert_to_3d_matrix(covariate_ids, patient_dict, y_dict = None, timeid_len = 31, cov_mean_dict = None): - """ - create matrix for temporal models. 
- - :param covariate_ids: the covariate ids in the whole data - :param patient_dict: the dictionary contains the data for each patient - :param y_dict: if the output labels is known, it contains the labels for patients - :param timeid_len: the total number time window gaps when extracting temporal data - :return: return the raw data in 3-D format, patients x covariates x number of windows, and the patients ids - """ - D = len(covariate_ids) - N = len(patient_dict) - T = timeid_len - concept_list =list(covariate_ids) - concept_list.sort() - x_raw = np.zeros((N, D, T), dtype=float) - patient_ind = 0 - p_ids = [] - patient_keys = patient_dict.keys() - #print covariate_ids - for kk in patient_keys: - #print('-------------------') - vals = patient_dict[kk] - #sorted(vals) - p_ids.append(int(kk)) - for timeid, meas in vals.iteritems(): - int_time = int(timeid) - 1 - for val in meas: - if not len(val): - continue - cov_id, cov_val = val - if cov_id not in covariate_ids: - continue - lab_ind = concept_list.index(cov_id) - if cov_mean_dict is None: - x_raw[patient_ind][lab_ind][int_time] = float(cov_val) - else: - mean_std = cov_mean_dict[cov_id] - if mean_std[1]: - x_raw[patient_ind][lab_ind][int_time] = (float(cov_val) - mean_std[0])/mean_std[1] - else: - x_raw[patient_ind][lab_ind][int_time] = float(cov_val) - - - patient_ind = patient_ind + 1 - - #impute the data using the value of previous timestamp - #fw = open('patient_var.txt', 'w') - for i in xrange(N): - for j in xrange(D): - temp = x_raw[i][j] - nonzero_inds = np.nonzero(temp)[0] - count_nonzeros = len(nonzero_inds) - #fw.write(str(i) + '\t' + str(count_nonzeros) + '\n') - if count_nonzeros == 1: - ind = nonzero_inds[0] - for k in xrange(ind + 1, T): - x_raw[i][j][k] = x_raw[i][j][ind] - elif count_nonzeros > 1: - for ind in xrange(1, count_nonzeros): - for k in xrange(nonzero_inds[ind -1] + 1, nonzero_inds[ind]): - x_raw[i][j][k] = x_raw[i][j][nonzero_inds[ind - 1]] - # For last nonzeros. - for k in xrange(nonzero_inds[-1] + 1, T): - x_raw[i][j][k] = x_raw[i][j][nonzero_inds[-1]] - - #fw.close() - - return x_raw, patient_keys - -def forward_impute_missing_value(x_raw): - N = x_raw.shape[0] - D = x_raw.shape[1] - T = x_raw.shape[2] - for i in xrange(N): - for j in xrange(D): - temp = x_raw[i][j] - nonzero_inds = np.nonzero(temp)[0] - count_nonzeros = len(nonzero_inds) - #fw.write(str(i) + '\t' + str(count_nonzeros) + '\n') - if count_nonzeros == 1: - ind = nonzero_inds[0] - for k in xrange(ind + 1, T): - x_raw[i][j][k] = x_raw[i][j][ind] - elif count_nonzeros > 1: - for ind in xrange(1, count_nonzeros): - for k in xrange(nonzero_inds[ind -1] + 1, nonzero_inds[ind]): - x_raw[i][j][k] = x_raw[i][j][nonzero_inds[ind - 1]] - # For last nonzeros. 
- for k in xrange(nonzero_inds[-1] + 1, T): - x_raw[i][j][k] = x_raw[i][j][nonzero_inds[-1]] - - -def convert_to_temporal_format(covariates, timeid_len= 31, normalize = True, predict = False): - """ - It reads the data from covariates extracted by FeatureExtraction package and convert it to temporal data matrix - - :param covariates: covariates extracted by FeatureExtraction package - :param timeid_len: the total number of window gaps when extracting temporal data - :return: return the raw data in 3-D format, patients x covariates x number of windows, and the patients ids - """ - patient_dict = OrderedDict() - print('Loading temporal data') - cov_vals_dict = {} - for row in covariates: - p_id, cov_id, time_id, cov_val = row[0], row[1], row[2], row[3] - cov_id = np.int64(cov_id) - #time_id = int(time_id) - cov_vals_dict.setdefault(cov_id, []).append(float(cov_val)) - if p_id not in patient_dict: - patient_dict[p_id] = {time_id: [(cov_id, cov_val)]} - else: - if time_id not in patient_dict[p_id]: - patient_dict[p_id][time_id] = [(cov_id, cov_val)] - else: - patient_dict[p_id][time_id].append((cov_id, cov_val)) - #covariate_ids.add(cov_id) - #T = 365/time_window - covariate_ids = set() - cov_mean_dict = {} - if not predict: - fw = open('covariate_mean_std.csv', 'w') - for key, val in cov_vals_dict.iteritems(): - mean_val = np.mean(val) - std_val = np.std(val) - - # Remove those covariates with few occurrence (<5) - if len(val) >= 5: - covariate_ids.add(key) - cov_mean_dict[key] = (mean_val, std_val) - fw.write(str(key) + ',' + str(mean_val) + ',' + str(std_val) + '\n') - fw.close() - else: - fp = open('covariate_mean_std.csv', 'r') - for line in fp: - values = line.rstrip().split(',') - key = np.int64(values[0]) - covariate_ids.add(key) - cov_mean_dict[key] = (float(values[1]), float(values[2])) - fp.close() - - - if normalize: - x, patient_keys = convert_to_3d_matrix(covariate_ids, patient_dict, timeid_len = timeid_len, cov_mean_dict = cov_mean_dict) - else: - x, patient_keys = convert_to_3d_matrix(covariate_ids, patient_dict, timeid_len=timeid_len) - - return x, patient_keys - - -def read_covariates(covariate_file): - patient_dict = {} - head = True - with open(covariate_file, 'r') as fp: - for line in fp: - if head: - head = False - continue - values = line.rstrip().split(',') - patient_id = values[1] - cov_id = values[2] - #time_id = int(values[-1]) - # covariates in one patient has time order - patient_dict.setdefault(patient_id, []).append((cov_id)) - new_patient = [] - for key in patient_dict.keys(): - #patient_dict[key].sort() - sort_vals = [] - for val in patient_dict[key]: - if val[1] not in sort_vals: - sort_vals.append(val) - new_patient.append(sort_vals) - - return new_patient - - -def word_embeddings(covariate_file, embedding_size=50): - import gensim.models.word2vec as w2v - modelname = "processed_%s.w2v" % ('heartfailure') - sentences = read_covariates(covariate_file) - model = w2v.Word2Vec(size=embedding_size, min_count=3, workers=4, iter=10, sg=1) - print("building word2vec vocab on %s..." 
% (covariate_file)) - - model.build_vocab(sentences) - print("training...") - model.train(sentences, total_examples=model.corpus_count, epochs=model.iter) - out_file = modelname - print("writing embeddings to %s" % (out_file)) - model.save(out_file) - return out_file - -def read_data(filename): - covariate_ids = set() - patient_dict = {} - head = True - with open(filename) as fp: - for lines in fp: - if head: - head = False - continue - lines = lines.strip('\n').strip('\r').split(',') - try: - p_id, cov_id, time_id, cov_val = lines[1], lines[2], lines[3], lines[4] - except: - pdb.set_trace() - print(p_id, cov_id, time_id) - if p_id not in patient_dict: - patient_dict[p_id] = {} - else: - if time_id not in patient_dict[p_id]: - patient_dict[p_id][time_id] = [] - else: - patient_dict[p_id][time_id].append((cov_id, cov_val)) - covariate_ids.add(cov_id) - patient_dict = {k: v for k, v in patient_dict.iteritems() if v} #remove empty patients - #(15, 2000, 60) 20000 patients, 15 lab tests, - return covariate_ids, patient_dict - -def split_training_validation(classes, validation_size=0.2, shuffle=False): - """split sampels based on balnace classes""" - num_samples = len(classes) - classes = np.array(classes) - classes_unique = np.unique(classes) - num_classes = len(classes_unique) - indices = np.arange(num_samples) - # indices_folds=np.zeros([num_samples],dtype=int) - training_indice = [] - training_label = [] - validation_indice = [] - validation_label = [] - for cl in classes_unique: - indices_cl = indices[classes == cl] - num_samples_cl = len(indices_cl) - - # split this class into k parts - if shuffle: - random.shuffle(indices_cl) # in-place shuffle - - # module and residual - num_samples_each_split = int(num_samples_cl * validation_size) - res = num_samples_cl - num_samples_each_split - - training_indice = training_indice + [val for val in indices_cl[num_samples_each_split:]] - training_label = training_label + [cl] * res - - validation_indice = validation_indice + [val for val in indices_cl[:num_samples_each_split]] - validation_label = validation_label + [cl] * num_samples_each_split - - training_index = np.arange(len(training_label)) - random.shuffle(training_index) - training_indice = np.array(training_indice)[training_index] - training_label = np.array(training_label)[training_index] - - validation_index = np.arange(len(validation_label)) - random.shuffle(validation_index) - validation_indice = np.array(validation_indice)[validation_index] - validation_label = np.array(validation_label)[validation_index] - - return training_indice, training_label, validation_indice, validation_label - - -class LogisticRegression(nn.Module): - """ - Train a logistic regression model using pytorch - """ - def __init__(self, input_size, num_classes = 2): - super(LogisticRegression, self).__init__() - self.linear = nn.Linear(input_size, num_classes) - - def forward(self, x): - out = self.linear(x) - out = torch.sigmoid(out) - return out - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -class MLP(nn.Module): - """ - Train a multiple-layer perceptron with one hideen layer - """ - def __init__(self, input_dim, hidden_size, num_classes = 2): - super(MLP, self).__init__() - self.fc1 = nn.Linear(input_dim, hidden_size) - self.fc2 = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - x = 
F.relu(self.fc1(x)) - x = F.dropout(x, p =0.5, training=self.training) - x = self.fc2(x) - x = torch.sigmoid(x) - return x - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -class SNN(nn.Module): - """ - Train a multiple-layer self normalizing neural network, ref arXiv:1706.02515 - """ - - def __init__(self, input_dim, hidden_size, num_classes=2): - super(SNN, self).__init__() - self.fc1 = nn.Linear(input_dim, hidden_size) - self.fc2 = selu() - self.ad1 = alpha_drop() - self.fc4 = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - x = F.relu(self.fc1(x)) - x = F.dropout(x, p=0.5, training=self.training) - x = self.fc2(x) - x = self.ad1(x) - x = self.fc4(x) - x = torch.sigmoid(x) - return x - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - -class AutoEncoder(nn.Module): - """ - A stacked autoencoder with 2 hiddden layers and need be adapted for EHR data. - """ - def __init__(self, input_size, encoding_size): - super(AutoEncoder, self).__init__() - - self.encoder = nn.Sequential( - nn.Linear(input_size, input_size/2), - nn.ReLU(True), - nn.Linear(input_size/2, input_size/4), - nn.ReLU(True), - nn.Linear(input_size/4, encoding_size), - nn.ReLU(True) - ) - self.decoder = nn.Sequential( - nn.Linear(encoding_size, input_size/4), - nn.ReLU(True), - nn.Linear(input_size/4, input_size/2), - nn.ReLU(True), - nn.Linear(input_size/2, input_size) - ) - - def forward(self, x): - if torch.cuda.is_available(): - x = x.cuda() - encoded = self.encoder(x) - decoded = self.decoder(encoded) - return decoded - - def get_encode_features(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - encoded = self.encoder(x) - encoded = encoded.data.cpu().numpy() - return encoded - -class VAE(nn.Module): - """ - A stacked variational autoencoder with 2 hiddden layers and need be adapted for EHR data. 
- """ - def __init__(self, input_size, encoding_size): - super(VAE, self).__init__() - - self.fc1 = nn.Linear(input_size, input_size/2) - self.fc21 = nn.Linear(input_size/2, encoding_size) - self.fc22 = nn.Linear(input_size/2, encoding_size) - self.fc3 = nn.Linear(encoding_size, input_size/2) - self.fc4 = nn.Linear(input_size/2, input_size) - - self.relu = nn.ReLU() - self.sigmoid = nn.Sigmoid() - - def encode(self, x): - h1 = self.relu(self.fc1(x)) - return self.fc21(h1), self.fc22(h1) - - def reparameterize(self, mu, logvar): - if self.training: - std = logvar.mul(0.5).exp_() - eps = Variable(std.data.new(std.size()).normal_()) - return eps.mul(std).add_(mu) - else: - return mu - - def decode(self, z): - h3 = self.relu(self.fc3(z)) - return self.sigmoid(self.fc4(h3)) - - def forward(self, x): - if torch.cuda.is_available(): - x = x.cuda() - mu, logvar = self.encode(x) - z = self.reparameterize(mu, logvar) - return self.decode(z), mu, logvar - - def get_encode_features(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - mu, logvar = self.encode(x) - encoded = self.reparameterize(mu, logvar) - encoded = encoded.data.cpu().numpy() - return encoded - -class Decoder(nn.Module): - """ VAE decoder input_size = original inputsize/16*256""" - def __init__(self, latent_size, input_size, img_channels = 1, kernel_size=(1, 4), stride=(1, 2), padding=(0, 1)): - super(Decoder, self).__init__() - self.latent_size = latent_size - self.img_channels = img_channels - - self.fc1 = nn.Linear(latent_size, input_size) - self.deconv1 = nn.ConvTranspose2d(input_size, 128, kernel_size, stride=stride, padding = padding) - self.deconv2 = nn.ConvTranspose2d(128, 64, kernel_size, stride=stride, padding = padding) - self.deconv3 = nn.ConvTranspose2d(64, 32, kernel_size, stride=stride, padding = padding) - self.deconv4 = nn.ConvTranspose2d(32, img_channels, kernel_size, stride=stride, padding = padding) - - def forward(self, x): # pylint: disable=arguments-differ - x = F.relu(self.fc1(x)) - x = x.unsqueeze(-1).unsqueeze(-1) - x = F.relu(self.deconv1(x)) - x = F.relu(self.deconv2(x)) - x = F.relu(self.deconv3(x)) - reconstruction = torch.sigmoid(self.deconv4(x)) - return reconstruction - -class Encoder(nn.Module): # pylint: disable=too-many-instance-attributes - """ VAE encoder """ - def __init__(self, latent_size, input_size, img_channels = 1, kernel_size=(1, 4), stride=(1, 2), padding=(0, 1)): - super(Encoder, self).__init__() - self.latent_size = latent_size - #self.img_size = img_size - self.img_channels = img_channels - - self.conv1 = nn.Conv2d(img_channels, 32, kernel_size, stride=stride, padding = padding) - self.conv2 = nn.Conv2d(32, 64, kernel_size, stride=stride, padding = padding) - self.conv3 = nn.Conv2d(64, 128, kernel_size, stride=stride, padding = padding) - self.conv4 = nn.Conv2d(128, 256, kernel_size, stride=stride, padding = padding) - out_size = input_size / 16 - self.fc_mu = nn.Linear(out_size, latent_size) - self.fc_logsigma = nn.Linear(out_size, latent_size) - - - def forward(self, x): # pylint: disable=arguments-differ - x = F.relu(self.conv1(x)) - x = F.relu(self.conv2(x)) - x = F.relu(self.conv3(x)) - x = F.relu(self.conv4(x)) - x = x.view(x.size(0), -1) - - mu = self.fc_mu(x) - logsigma = self.fc_logsigma(x) - - return mu, logsigma - -class VAE_CNN(nn.Module): - """ Variational Autoencoder """ - def __init__(self, latent_size, input_size): - super(VAE, self).__init__() - 
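# Illustrative sketch, not part of the deleted file: the reparameterization trick used by the
# VAE class above (and again in VAE_CNN's forward below) draws z = mu + sigma * eps with
# eps ~ N(0, 1), so gradients flow through mu and the log-variance while the sampling noise
# stays outside the graph. A minimal stand-alone version in current PyTorch syntax:
import torch

def reparameterize(mu, logvar):
    std = torch.exp(0.5 * logvar)   # logvar = log(sigma^2), so this recovers sigma
    eps = torch.randn_like(std)     # unit Gaussian noise with the same shape as sigma
    return mu + eps * std

# z = reparameterize(torch.zeros(4, 8), torch.zeros(4, 8))  # 4 draws from an 8-d latent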
self.encoder = Encoder(latent_size, input_size) - input_size = input_size/16 - self.decoder = Decoder(latent_size, input_size) - - def forward(self, x): # pylint: disable=arguments-differ - mu, logsigma = self.encoder(x) - sigma = logsigma.exp() - eps = torch.randn_like(sigma) - z = eps.mul(sigma).add_(mu) - - recon_x = self.decoder(z) - return recon_x, mu, logsigma - - def get_encode_features(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - mu, logvar = self.encoder(x) - encoded = mu .data.cpu().numpy() - return encoded - -class CNN(nn.Module): - def __init__(self, nb_filter, num_classes = 2, kernel_size = (1, 5), pool_size = (1, 3), labcounts = 32, window_size = 12, hidden_size = 200, stride = (1, 1), padding = 0): - super(CNN, self).__init__() - self.layer1 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size, stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU(), - nn.MaxPool2d(pool_size, stride = stride)) - out1_size = (window_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 - maxpool_size = (out1_size + 2*padding - (pool_size[1] - 1) - 1)/stride[1] + 1 - self.layer2 = nn.Sequential( - nn.Conv2d(nb_filter, nb_filter, kernel_size, stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU(), - nn.MaxPool2d(pool_size, stride = stride)) - out2_size = (maxpool_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 - maxpool_size = (out2_size + 2*padding - (pool_size[1] - 1) - 1)/stride[1] + 1 - self.drop1 = nn.Dropout(p=0.5) - self.fc1 = nn.Linear(int(maxpool_size*labcounts*nb_filter), hidden_size) - self.bn = nn.BatchNorm1d(hidden_size) - self.drop2 = nn.Dropout(p=0.5) - self.relu1 = nn.ReLU() - self.fc2 = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - x = x.view(x.size(0), 1, x.size(1), x.size(2)) - out = self.layer1(x) - out = self.layer2(out) - out = out.view(out.size(0), -1) - out = self.drop1(out) - out = self.fc1(out) - out = self.drop2(out) - out = self.bn(out) - out = self.relu1(out) - out = self.fc2(out) - out = torch.sigmoid(out) - return out - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -#allow multiple kernel with differnt kernel size -class CNN_MLF(nn.Module): - """ - It is a deep CNNs with three different kernel size, the outputs from the three CNNs are concatenated to fed into two fully connected layers. 
- """ - def __init__(self, nb_filter, num_classes = 2, kernel_size = (1, 5), pool_size = (1, 3), labcounts = 32, window_size = 12, hidden_size = 200, stride = (1, 1), padding = 0): - super(CNN_MLF, self).__init__() - self.layer1 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size = (1, 3), stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU(), - nn.MaxPool2d(pool_size, stride = stride)) - out1_size = (window_size + 2*padding - (3 - 1) - 1)/stride[1] + 1 - maxpool1_size = (out1_size + 2*padding - (pool_size[1] - 1) - 1)/stride[1] + 1 - self.layer2 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size = (1, 4), stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU(), - nn.MaxPool2d(pool_size, stride = stride)) - out2_size = (window_size + 2*padding - (4 - 1) - 1)/stride[1] + 1 #4 is the convolve filter size - maxpool2_size = (out2_size + 2*padding - (pool_size[1] - 1) - 1)/stride[1] + 1 - self.layer3 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size = (1, 5), stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU(), - nn.MaxPool2d(pool_size, stride = stride)) - out3_size = (window_size + 2*padding - (5 - 1) - 1)/stride[1] + 1 - maxpool3_size = (out3_size + 2*padding - (pool_size[1] - 1) - 1)/stride[1] + 1 - conv_outsize = maxpool1_size + maxpool2_size +maxpool3_size - self.drop1 = nn.Dropout(p=0.5) - self.fc1 = nn.Linear(conv_outsize*labcounts*nb_filter, hidden_size) - self.drop2 = nn.Dropout(p=0.5) - self.relu1 = nn.ReLU() - self.fc2 = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - x = x.view(x.size(0), 1, x.size(1), x.size(2)) - out1 = self.layer1(x) - out2 = self.layer2(x) - out3 = self.layer3(x) - out = torch.cat((out1.view(out1.size(0), -1), out2.view(out2.size(0), -1), out3.view(out2.size(0), -1)), 1) - out = self.drop1(out) - out = self.fc1(out) - out = self.drop2(out) - out = self.relu1(out) - out = self.fc2(out) - out = torch.sigmoid(out) - return out - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -class CNN_LSTM(nn.Module): - """ - It is a deep network with two layer CNN, followed by LSTM layer, which further fed into two fully connected layers. 
- """ - def __init__(self, nb_filter, num_classes = 2, kernel_size = (1, 5), pool_size = (1, 3), labcounts = 32, window_size = 12, hidden_size = 100, stride = (1, 1), padding = 0, num_layers = 2): - super(CNN_LSTM, self).__init__() - self.layer1 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size, stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU(), - nn.MaxPool2d(pool_size, stride = stride)) - self.num_layers = num_layers - self.hidden_size = hidden_size - out1_size = (window_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 - maxpool_size = (out1_size + 2*padding - (pool_size[1] - 1) - 1)/stride[1] + 1 - self.downsample = nn.Conv2d(nb_filter, 1, kernel_size, stride = stride, padding = padding) - input_size = (maxpool_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 - self.layer2 = nn.LSTM(input_size, hidden_size, num_layers, batch_first = True) - self.drop1 = nn.Dropout(p=0.5) - self.fc1 = nn.Linear(hidden_size, hidden_size) - self.drop2 = nn.Dropout(p=0.5) - self.relu1 = nn.ReLU() - self.fc2 = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - x = x.view(x.size(0), 1, x.size(1), x.size(2)) - out = self.layer1(x) - out = self.downsample(out) - out = torch.squeeze(out, 1) - if torch.cuda.is_available(): - x = x.cuda() - h0 = Variable(torch.zeros(self.num_layers, out.size(0), self.hidden_size)).cuda() - c0 = Variable(torch.zeros(self.num_layers, out.size(0), self.hidden_size)).cuda() - else: - h0 = Variable(torch.zeros(self.num_layers, out.size(0), self.hidden_size)) - c0 = Variable(torch.zeros(self.num_layers, out.size(0), self.hidden_size)) - out, hn = self.layer2(out, (h0, c0)) - out = hn[0][-1] - out = self.drop1(out) - out = self.fc1(out) - out = self.drop2(out) - out = self.relu1(out) - out = self.fc2(out) - out = torch.sigmoid(out) - return out - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -class CNN_MIX(nn.Module): - """ - It is a deep network with 2 layers CNN, which works on input and time dimension, respectively, more details refer to deepDianosis in github. 
- """ - def __init__(self, nb_filter, num_classes = 2, kernel_size = (1, 5), pool_size = (1, 3), labcounts = 32, window_size = 12, hidden_size = 100, stride = (1, 1), padding = 0): - super(CNN_MIX, self).__init__() - self.layer1 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size = (labcounts, 1), stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU()) - self.layer2 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size = (nb_filter, 1), stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU(), - nn.MaxPool2d(pool_size)) - out1_size = int(np.ceil(float(window_size)/pool_size[1])) - self.layer3 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size = kernel_size, stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU()) - - out2_size = (out1_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 - self.drop1 = nn.Dropout(p=0.5) - self.fc1 = nn.Linear(out2_size*nb_filter*nb_filter, hidden_size) - self.drop2 = nn.Dropout(p=0.5) - self.relu1 = nn.ReLU() - self.fc2 = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - x = x.view(x.size(0), 1, x.size(1), x.size(2)) - out = self.layer1(x) - out = out.view(out.size(0), out.size(2), out.size(1), out.size(3)) - out = self.layer2(out) - out = out.view(out.size(0), out.size(2), out.size(1), out.size(3)) - out = self.layer3(out) - out = out.view(out.size(0), -1) - out = self.drop1(out) - out = self.fc1(out) - out = self.drop2(out) - out = self.relu1(out) - out = self.fc2(out) - out = torch.sigmoid(out) - return out - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -class CNN_MULTI(nn.Module): - """ - It is a deep network with multiple resolution, more details refer to multiresconvnet of deepDianosis in github. 
- """ - def __init__(self, nb_filter, num_classes = 2, kernel_size = (1, 5), pool_size = (1, 2), labcounts = 32, window_size = 12, hidden_size = 100, stride = (1, 1), padding = 0): - super(CNN_MULTI, self).__init__() - # resolution 1 - self.pool1_1 = nn.MaxPool2d(pool_size, stride = pool_size) - maxpool_size = (window_size + 2*padding - (pool_size[1] - 1) - 1)/pool_size[1] + 1 - self.pool1_2 = nn.MaxPool2d(pool_size, stride = pool_size) - maxpool1_2_size = (maxpool_size + 2*padding - (pool_size[1] - 1) - 1)/pool_size[1] + 1 - - self.layer1 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size = kernel_size, stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU()) - cnn1_size = (maxpool1_2_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 - #resolution 2 - self.pool2_1 = nn.MaxPool2d(pool_size, stride = pool_size) - maxpool2_1_size = (window_size + 2*padding - (pool_size[1] - 1) - 1)/pool_size[1] + 1 - - self.layer2 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size = kernel_size, stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU()) - cnn2_size = (maxpool2_1_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 - self.layer3 = nn.Sequential( - nn.Conv2d(1, nb_filter, kernel_size = kernel_size, stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU(), - nn.MaxPool2d(pool_size)) - cnn3_size = (window_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 - maxpool3_size = (cnn3_size + 2*padding - (pool_size[1] - 1) - 1)/pool_size[1] + 1 - self.layer4 = nn.Sequential( - nn.Conv2d(nb_filter, nb_filter, kernel_size = kernel_size, stride = stride, padding = padding), - nn.BatchNorm2d(nb_filter), - nn.ReLU()) - cnn4_size = (maxpool3_size + 2*padding - (kernel_size[1] - 1) - 1)/stride[1] + 1 - merge_size = cnn1_size + cnn2_size + cnn4_size - self.drop1 = nn.Dropout(p=0.5) - self.fc1 = nn.Linear(labcounts*nb_filter*merge_size, hidden_size) - self.drop2 = nn.Dropout(p=0.5) - self.relu1 = nn.ReLU() - self.fc2 = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - x = x.view(x.size(0), 1, x.size(1), x.size(2)) - out = self.pool1_1(x) - out = self.pool1_2(out) - out1 = self.layer1(out) - out = self.pool2_1(x) - out2 = self.layer2(out) - out = self.layer3(x) - out3 = self.layer4(out) - out = torch.cat((out1.view(out1.size(0), -1), out2.view(out2.size(0), -1), out3.view(out3.size(0), -1)), 1) - out = self.drop1(out) - out = self.fc1(out) - out = self.drop2(out) - out = self.relu1(out) - out = self.fc2(out) - out = torch.sigmoid(out) - return out - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -# 1x3 Convolution -def convR(in_channels, out_channels, kernel_size, stride=1, padding = (0, 1)): - return nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, - padding=padding, stride=stride, bias=False) - - -# Residual Block -class ResidualBlock(nn.Module): - def __init__(self, in_channel, nb_filter = 16, kernel_size = (1, 3), stride=1, downsample=None): - super(ResidualBlock, self).__init__() - self.conv1 = convR(in_channel, nb_filter, kernel_size = kernel_size, stride = stride) - self.bn1 = nn.BatchNorm2d(nb_filter) - self.relu = nn.ReLU(inplace=True) - self.conv2 = convR(nb_filter, nb_filter, kernel_size = kernel_size, stride = stride) - self.bn2 = nn.BatchNorm2d(nb_filter) - 
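# Illustrative sketch, not part of the deleted file: the CNN/ResNet classes here size their
# fully connected layers with the standard convolution/pooling output formula
# out = floor((in + 2*padding - kernel) / stride) + 1, applied along the time dimension.
def conv_out_size(in_size, kernel, stride=1, padding=0):
    # Output length of a 1-D convolution or max-pool with the given hyper-parameters.
    return (in_size + 2 * padding - kernel) // stride + 1

# Example with the defaults used above: a window of 12 time steps, a (1, 5) convolution and
# a (1, 3) max-pool, both with stride 1 and no padding:
# conv_out_size(conv_out_size(12, 5), 3)  # -> 6 positions feeding the first linear layer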
self.downsample = downsample - - def forward(self, x): - residual = x - out = self.conv1(x) - out = self.bn1(out) - out = self.relu(out) - out = self.conv2(out) - out = self.bn2(out) - if self.downsample: - residual = self.downsample(x) - out += residual - out = self.relu(out) - return out - - -# ResNet Module -class ResNet(nn.Module): - def __init__(self, block, layers, nb_filter = 16, labcounts = 12, window_size = 36, kernel_size = (1, 3), pool_size = (1, 3), num_classes=2, hidden_size = 100): - super(ResNet, self).__init__() - self.in_channels = 1 - self.conv = convR(self.in_channels, nb_filter, kernel_size = kernel_size) - self.bn = nn.BatchNorm2d(nb_filter) - self.relu = nn.ReLU(inplace=True) - self.layer1 = self.make_layer(block, nb_filter, layers[0], kernel_size = kernel_size) - self.layer2 = self.make_layer(block, nb_filter*2, layers[1], 1, kernel_size = kernel_size, in_channels = nb_filter) - self.layer3 = self.make_layer(block, nb_filter*4, layers[2], 1, kernel_size = kernel_size, in_channels = 2*nb_filter) - self.avg_pool = nn.AvgPool2d(pool_size) - avgpool2_1_size = (window_size - (pool_size[1] - 1) - 1)/pool_size[1] + 1 - last_layer_size = nb_filter*4*labcounts*avgpool2_1_size - self.fc = nn.Linear(last_layer_size, hidden_size) - self.drop2 = nn.Dropout(p=0.5) - self.relu1 = nn.ReLU() - self.fc2 = nn.Linear(hidden_size, num_classes) - - def make_layer(self, block, out_channels, blocks, stride=1, kernel_size = (1, 3), in_channels = 16): - downsample = None - if (stride != 1) or (self.in_channels != out_channels): - downsample = nn.Sequential( - convR(in_channels, out_channels, kernel_size = kernel_size, stride=stride), - nn.BatchNorm2d(out_channels)) - layers = [] - layers.append(block(in_channels, out_channels, kernel_size = kernel_size, stride = stride, downsample = downsample)) - self.in_channels = out_channels - for i in range(1, blocks): - layers.append(block(out_channels, out_channels, kernel_size = kernel_size)) - return nn.Sequential(*layers) - - def forward(self, x): - x = x.view(x.size(0), 1, x.size(1), x.size(2)) - out = self.conv(x) - out = self.bn(out) - out = self.relu(out) - out = self.layer1(out) - out = self.layer2(out) - out = self.layer3(out) - out = self.avg_pool(out) - out = out.view(out.size(0), -1) - out = self.fc(out) - out = self.drop2(out) - out = self.relu1(out) - out = self.fc2(out) - out = torch.sigmoid(out) - return out - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -class GRU(nn.Module): - """ - It is a deep network with one GRU layer, which are further fed into one fully connected layers. 
- """ - def __init__(self, input_size, hidden_size, num_layers, num_classes = 2, dropout = 0.5): - super(GRU, self).__init__() - - self.hidden_size = hidden_size - self.num_layers = num_layers - self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first = True, dropout = dropout) - self.linear = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - if torch.cuda.is_available(): - x = x.cuda() - h0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)).cuda() # 2 for bidirection - else: - h0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)) # 2 for bidirection - self.gru.flatten_parameters() - out, hn = self.gru(x, h0) - - rearranged = hn[-1] - out = self.linear(rearranged) - out = torch.sigmoid(out) - return out - - def initHidden(self, N): - return Variable(torch.randn(1, N, self.hidden_size)) - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -class RNN(nn.Module): - """ - It is a deep network with one LSTM layer, which are further fed into one fully connected layer. - """ - def __init__(self, input_size, hidden_size, num_layers, num_classes = 2, dropout = 0.5): - super(RNN, self).__init__() - self.hidden_size = hidden_size - self.num_layers = num_layers - self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first = True, dropout = dropout) - self.fc = nn.Linear(hidden_size, num_classes) - - def forward(self, x): - if torch.cuda.is_available(): - x = x.cuda() - h0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)).cuda() - c0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)).cuda() - else: - h0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)) - c0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)) - self.lstm.flatten_parameters() - out, hn = self.lstm(x, (h0, c0)) - rearranged = hn[0][-1] - # Decode hidden state of last time step - out = self.fc(rearranged) - out = torch.sigmoid(out) - return out - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -class BiRNN(nn.Module): - """ - It is a deep network with one bidirectional LSTM layer, which are further fed into one fully connected layer. 
- """ - def __init__(self, input_size, hidden_size, num_layers, num_classes = 2, dropout = 0.5): - super(BiRNN, self).__init__() - self.hidden_size = hidden_size - self.num_layers = num_layers - self.lstm = nn.LSTM(input_size, hidden_size, num_layers, - batch_first = True, dropout = dropout, bidirectional=True) - self.fc = nn.Linear(hidden_size*2, num_classes) # 2 for bidirection - - def forward(self, x): - if torch.cuda.is_available(): - x = x.cuda() - h0 = Variable(torch.zeros(self.num_layers*2, x.size(0), self.hidden_size)).cuda() # 2 for bidirection - c0 = Variable(torch.zeros(self.num_layers*2, x.size(0), self.hidden_size)).cuda() - else: - h0 = Variable(torch.zeros(self.num_layers*2, x.size(0), self.hidden_size)) # 2 for bidirection - c0 = Variable(torch.zeros(self.num_layers*2, x.size(0), self.hidden_size)) - self.lstm.flatten_parameters() - out, hn = self.lstm(x, (h0, c0)) - hn = hn[0] - - rearranged = hn[-2:].view(x.size(0), -1) - # Decode hidden state of last time step - out = self.fc(rearranged) - out = torch.sigmoid(out) - return out - - def predict_proba(self, x): - if type(x) is np.ndarray: - x = torch.from_numpy(x.astype(np.float32)) - with torch.no_grad(): - x = Variable(x) - if torch.cuda.is_available(): - x = x.cuda() - y = self.forward(x) - temp = y.data.cpu().numpy() - return temp - - -# select model -def train_deeptorch(population, plpData, train = True, model_type = 'LogisticRegression', class_weight =0, autoencoder = True, w_decay =0.9, epochs = 1, vae = False, size = 100, loss = 'LogSoftmax', nbfilters = 4, learning_rate = 0.0001, hidden_size = 100, modelOutput = 'C:/deeptorch', seed = 1, quiet = False): - if model_type in ['LogisticRegression', 'MLP', 'SNN']: - y = population[:, 1] - X = plpData[population[:, 0], :] - trainInds = population[:, population.shape[1] - 1] > 0 - if class_weight == -1: - loss = FocalLoss(gamma = 5) - else: - if class_weight == 0: - weights = float(np.count_nonzero(y))/y.shape[0] - class_weight = [1 - weights, weights] - else: - class_weight = [class_weight, 1] - class_weight = 1/torch.Tensor(class_weight) - if torch.cuda.is_available(): - class_weight = class_weight.cuda() - loss=nn.CrossEntropyLoss(weight = class_weight) - - print("Dataset has %s rows and %s columns" % (X.shape[0], X.shape[1])) - print("population loaded- %s rows and %s columns" % (np.shape(population)[0], np.shape(population)[1])) - ########################################################################### - l1regularization = False - if train: - pred_size = int(np.sum(population[:, population.shape[1] - 1] > 0)) - print("Calculating prediction for train set of size %s" % (pred_size)) - test_pred = np.zeros(pred_size) # zeros length sum(population[:,population.size[1]] ==i) - for i in range(1, int(np.max(population[:, population.shape[1] - 1]) + 1), 1): - testInd = population[population[:, population.shape[1] - 1] > 0, population.shape[1] - 1] == i - trainInd = (population[population[:, population.shape[1] - 1] > 0, population.shape[1] - 1] != i) - train_x = X[trainInds, :][trainInd, :] - train_y = y[trainInds][trainInd] - test_x = X[trainInds, :][testInd, :] - print("Fold %s split %s in train set and %s in test set" % (i, train_x.shape[0], test_x.shape[0])) - print("Train set contains %s outcomes " % (np.sum(train_y))) - train_x = train_x.toarray() - test_x = test_x.toarray() - if autoencoder: - print('first train stakced autoencoder') - encoding_size = 256 - if vae: - auto_model = VAE(input_size=train_x.shape[1], encoding_size=encoding_size) - else: - auto_model = 
AutoEncoder(input_size=train_x.shape[1], encoding_size=encoding_size) - if torch.cuda.is_available(): - auto_model = auto_model.cuda() - clf = Estimator(auto_model) - clf.compile(optimizer=torch.optim.Adam(auto_model.parameters(), lr=1e-3, weight_decay = w_decay), - loss=nn.MSELoss()) - clf.fit(train_x, train_y, batch_size=32, nb_epoch=epochs, autoencoder = autoencoder, vae = vae) - #split to batch for large dataset - train_batch = batch(train_x, batch_size=32) - train_x = np.array([]).reshape(0, encoding_size) - for train in train_batch: - encode_train = auto_model.get_encode_features(train) - train_x = np.concatenate((train_x, encode_train), axis=0) - test_batch = batch(test_x, batch_size=32) - test_x = np.array([]).reshape(0, encoding_size) - for test in test_batch: - encode_Test = auto_model.get_encode_features(test) - test_x = np.concatenate((test_x, encode_Test), axis=0) - del auto_model - del clf - # train on fold - print("Training fold %s" % (i)) - start_time = timeit.default_timer() - if model_type == 'LogisticRegression': - model = LogisticRegression(train_x.shape[1]) - l1regularization = True - elif model_type == 'SNN': - model = SNN(train_x.shape[1], size) - else: - model = MLP(train_x.shape[1], size) - - if torch.cuda.is_available(): - model = model.cuda() - clf = Estimator(model) - clf.compile(optimizer=torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay = w_decay), - loss=loss) - clf.fit(train_x, train_y, batch_size=32, nb_epoch=epochs, l1regularization = l1regularization) - - ind = (population[:, population.shape[1] - 1] > 0) - ind = population[ind, population.shape[1] - 1] == i - test_input_var = torch.from_numpy(test_x.astype(np.float32)) - test_batch = batch(test_x, batch_size = 32) - temp = [] - for test in test_batch: - pred_test1 = model.predict_proba(test)[:, 1] - temp = np.concatenate((temp, pred_test1), axis = 0) - test_pred[ind] = temp - print("Prediction complete: %s rows " % (np.shape(test_pred[ind])[0])) - print("Mean: %s prediction value" % (np.mean(test_pred[ind]))) - - # RETURN CV PREDICTION WHEN TRAIN == T - test_pred.shape = (population[population[:, population.shape[1] - 1] > 0, :].shape[0], 1) - prediction = np.append(population[population[:, population.shape[1] - 1] > 0, :], test_pred, axis=1) - return prediction; - - # train final: - else: - print("Training final neural network model on all train data...") - print("X- %s rows and Y %s length" % (X[trainInds, :].shape[0], y[trainInds].shape[0])) - start_time = timeit.default_timer() - train_x = X[trainInds, :] - train_x = train_x.toarray() - train_y = y[trainInds] - if not os.path.exists(modelOutput): - os.makedirs(modelOutput) - if autoencoder: - encoding_size = 256 - if vae: - auto_model = VAE(input_size=train_x.shape[1], encoding_size=encoding_size) - else: - auto_model = AutoEncoder(input_size=train_x.shape[1], encoding_size=encoding_size) - if torch.cuda.is_available(): - auto_model = auto_model.cuda() - clf = Estimator(auto_model) - clf.compile(optimizer=torch.optim.Adam(auto_model.parameters(), lr=1e-3, weight_decay=w_decay), - loss=nn.MSELoss()) - clf.fit(train_x, train_y, batch_size=32, nb_epoch=epochs, autoencoder=autoencoder, vae = vae) - train_batch = batch(train_x, batch_size=32) - train_x = np.array([]).reshape(0, encoding_size) - for train in train_batch: - encode_train = auto_model.get_encode_features(train) - train_x = np.concatenate((train_x, encode_train), axis=0) - joblib.dump(auto_model, os.path.join(modelOutput, 'autoencoder_model.pkl')) - del auto_model - del clf - 
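# Illustrative sketch, not part of the deleted file: when class_weight == 0 the code above
# derives inverse-prevalence weights for the cross-entropy loss, so the rare outcome class
# contributes more per sample. The same computation on a toy label vector:
import numpy as np
import torch
import torch.nn as nn

y = np.array([0, 0, 0, 0, 1])                        # 20% outcome prevalence
prev = float(np.count_nonzero(y)) / y.shape[0]
weights = 1.0 / torch.tensor([1.0 - prev, prev])     # -> tensor([1.25, 5.00])
loss_fn = nn.CrossEntropyLoss(weight=weights)        # positive class weighted 4x heavier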
print('the final parameter epochs %.2f weight_decay %.2f' %(epochs,w_decay)) - if model_type == 'LogisticRegression': - model = LogisticRegression(train_x.shape[1]) - l1regularization = True - elif model_type == 'SNN': - model = SNN(train_x.shape[1], size) - else: - model = MLP(train_x.shape[1], size) - if torch.cuda.is_available(): - model = model.cuda() - clf = Estimator(model) - clf.compile(optimizer=torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay = w_decay), - loss=loss) - clf.fit(train_x, train_y, batch_size=32, nb_epoch=epochs, l1regularization = l1regularization) - end_time = timeit.default_timer() - print("Training final took: %.2f s" % (end_time - start_time)) - print("Model saved to: %s" % (modelOutput)) - joblib.dump(model, os.path.join(modelOutput,'model.pkl')) - # DO PREDICTION ON TRAIN: - train_batch = batch(train_x, batch_size = 32) - train_pred = [] - for train in train_batch: - preds = model.predict_proba(train)[:, 1] - train_pred = np.concatenate((train_pred, preds), axis = 0) - train_pred.shape = (population[population[:, population.shape[1] - 1] > 0, :].shape[0], 1) - prediction = np.append(population[population[:, population.shape[1] - 1] > 0, :], train_pred, axis=1) - # RETURN TRAIN PREDICTION WHEN TRAIN == F - return prediction; - - elif model_type in ['CNN', 'RNN', 'CNN_LSTM', 'CNN_MLF', 'CNN_MIX', 'GRU', 'BiRNN', 'CNN_MULTI', 'ResNet']: - y = population[:, 1] - X = plpData.to_dense().numpy() - X = X[np.int64(population[:, 0]), :] - trainInds = population[:, population.shape[1] - 1] > 0 - - if class_weight == -1: - loss = FocalLoss(gamma = 3) - else: - if class_weight == 0: - weights = float(np.count_nonzero(y))/y.shape[0] - class_weight = [1 - weights, weights] - else: - class_weight = [class_weight, 1] - class_weight = 1/torch.Tensor(class_weight) - if torch.cuda.is_available(): - class_weight = class_weight.cuda() - loss=nn.CrossEntropyLoss(weight = class_weight) - - if train: - test_pred = np.zeros(population[population[:, population.shape[1] - 1] > 0, :].shape[0]) # zeros length sum(population[:,population.size[1]] ==i) - for i in range(1, int(np.max(population[:, population.shape[1] - 1]) + 1), 1): - testInd = population[population[:, population.shape[1] - 1] > 0, population.shape[1] - 1] == i - trainInd = (population[population[:, population.shape[1] - 1] > 0, population.shape[1] - 1] != i) - train_x = X[trainInds, :][trainInd, :] - train_y = y[trainInds][trainInd] - test_x = X[trainInds, :][testInd, :] - print("Fold %s split %s in train set and %s in test set" % (i, train_x.shape[0], test_x.shape[0])) - print("Train set contains %s outcomes " % (np.sum(train_y))) - # train on fold - learning_rate = 0.001 - print("Training fold %s" % (i)) - start_time = timeit.default_timer() - if model_type == 'CNN': - model = CNN(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) - elif model_type == 'CNN_LSTM': - model = CNN_LSTM(nb_filter = nbfilters, labcounts=train_x.shape[1], window_size=train_x.shape[2]) - elif model_type == 'CNN_MLF': # multiple kernels with different size - model = CNN_MLF(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) - elif model_type == 'CNN_MIX': # mixed model from deepDiagnosis - model = CNN_MIX(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) - elif model_type == 'CNN_MULTI': # multiple resolution model from deepDiagnosis - model = CNN_MULTI(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) - 
elif model_type == 'ResNet': - print('train ResNet') - model = ResNet(ResidualBlock, [3, 3, 3], nb_filter=nbfilters, labcounts=train_x.shape[1], window_size=train_x.shape[2]) - elif model_type == 'RNN': - model = RNN(train_x.shape[2], hidden_size, 2, 2) - elif model_type == 'BiRNN': - model = BiRNN(train_x.shape[2], hidden_size, 2, 2) - elif model_type == 'GRU': - model = GRU(train_x.shape[2], hidden_size, 2, 2) - else: - print('temproal data not supported by this model') - - if torch.cuda.is_available(): - model = model.cuda() - clf = Estimator(model) - clf.compile(optimizer=torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay = 0.0001), - loss=loss) - clf.fit(train_x, train_y, batch_size=32, nb_epoch=epochs) - ind = (population[:, population.shape[1] - 1] > 0) - ind = population[ind, population.shape[1] - 1] == i - test_batch = batch(test_x, batch_size = 32) - temp = [] - for test in test_batch: - pred_test1 = model.predict_proba(test)[:, 1] - temp = np.concatenate((temp, pred_test1), axis = 0) - test_pred[ind] = temp - del model - print("Prediction complete: %s rows " % (np.shape(test_pred[ind])[0])) - print("Mean: %s prediction value" % (np.mean(test_pred[ind]))) - # RETURN CV PREDICTION - test_pred.shape = (population[population[:, population.shape[1] - 1] > 0, :].shape[0], 1) - prediction = np.append(population[population[:, population.shape[1] - 1] > 0, :], test_pred, axis=1) - return prediction; - - # train final: - else: - print("Training final neural network model on all train data...") - print("X- %s rows and Y %s length" % (X[trainInds, :].shape[0], y[trainInds].shape[0])) - start_time = timeit.default_timer() - train_x = X[trainInds, :] - train_y = y[trainInds] - learning_rate = 0.001 - if model_type == 'CNN': - model = CNN(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) - elif model_type == 'CNN_LSTM': - model = CNN_LSTM(nb_filter=nbfilters, labcounts=train_x.shape[1], window_size=train_x.shape[2]) - elif model_type == 'CNN_MLF': # multiple kernels with different size - model = CNN_MLF(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) - elif model_type == 'CNN_MIX': #mixed model from deepDiagnosis - model = CNN_MIX(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) - elif model_type == 'CNN_MULTI': # multi resolution model from deepDiagnosis - model = CNN_MULTI(nb_filter = nbfilters, labcounts = train_x.shape[1], window_size = train_x.shape[2]) - elif model_type == 'ResNet': - model = ResNet(ResidualBlock, [3, 3, 3], nb_filter=nbfilters, labcounts=train_x.shape[1], window_size=train_x.shape[2]) - elif model_type == 'RNN': - model = RNN(train_x.shape[2], hidden_size, 2, 2) - elif model_type == 'BiRNN': - model = BiRNN(train_x.shape[2], hidden_size, 2, 2) - elif model_type == 'GRU': - model = GRU(train_x.shape[2], hidden_size, 2, 2) - else: - print('temproal data not supported by this model') - - if torch.cuda.is_available(): - model = model.cuda() - clf = Estimator(model) - clf.compile(optimizer=torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay = 0.0001), - loss=loss) - clf.fit(train_x, train_y, batch_size=32, nb_epoch=epochs) - end_time = timeit.default_timer() - print("Training final took: %.2f s" % (end_time - start_time)) - # save the model: - if not os.path.exists(modelOutput): - os.makedirs(modelOutput) - print("Model saved to: %s" % (modelOutput)) - - joblib.dump(model, os.path.join(modelOutput,'model.pkl')) - # prediction on train: 
- test_batch = batch(train_x, batch_size = 32) - test_pred = [] - for test in test_batch: - pred_test1 = model.predict_proba(test)[:, 1] - test_pred = np.concatenate((test_pred, pred_test1), axis = 0) - test_pred.shape = (population[population[:, population.shape[1] - 1] > 0, :].shape[0], 1) - prediction = np.append(population[population[:, population.shape[1] - 1] > 0, :], test_pred, axis=1) - return prediction; diff --git a/inst/python/predictFunctions.py b/inst/python/predictFunctions.py deleted file mode 100644 index fcbb2e5..0000000 --- a/inst/python/predictFunctions.py +++ /dev/null @@ -1,153 +0,0 @@ -# apply random forest model on new data -#=============================================================== -# INPUT: -# 1) location of new data -# 2) location of model -# -# OUTPUT: -# it returns a file with indexes merged with prediction for test index - named new_pred -#================================================================ -import numpy as np -from collections import OrderedDict -import os -import sys -import timeit -import math -from scipy.sparse import coo_matrix,csr_matrix,vstack,hstack -from joblib import Memory -import joblib - -def batch(tensor, batch_size = 50): - """ It is used to create batch samples, each batch has batch_size samples""" - tensor_list = [] - length = tensor.shape[0] - i = 0 - while True: - if (i+1) * batch_size >= length: - tensor_list.append(tensor[i * batch_size: length]) - return tensor_list - tensor_list.append(tensor[i * batch_size: (i+1) * batch_size]) - i += 1 -#================================================================ -########################################################################### -def python_predict_temporal(population, plpData, model_loc, dense, autoencoder): - print("Applying Python Model") - print("Loading Data...") - # load data + train,test indexes + validation index - #y=population[:,1] - - ########################################################################### - # uf dense convert - if dense==1: - print("converting to dense data...") - X = plpData.to_dense().numpy() - if dense==0: - print("keeping data sparse...") - X = plpData.numpy() - ########################################################################### - - # order the data - X = X[np.int64(population[:, 0]), :] - # load index file - print("population loaded- %s rows and %s columns" %(np.shape(population)[0], np.shape(population)[1])) - print("Dataset has %s rows and %s columns" %(X.shape[0], X.shape[1])) - print("Data ready for model has %s features" %(np.shape(X)[1])) - - # load model - print("Loading model...") - if autoencoder: - autoencoder_model = joblib.load(os.path.join(model_loc, 'autoencoder_model.pkl')) - X = autoencoder_model.get_encode_features(X) - modelTrained = joblib.load(os.path.join(model_loc,"model.pkl")) - print("Calculating predictions on population...") - test_batch = batch(X, batch_size = 32) - test_pred = [] - for test in test_batch: - pred_test1 = modelTrained.predict_proba(test)[:, 1] - test_pred = np.concatenate((test_pred , pred_test1), axis = 0) - print("Prediction complete: %s rows" %(np.shape(test_pred)[0])) - print("Mean: %s prediction value" %(np.mean(test_pred))) - # merge pred with population - test_pred.shape = (population.shape[0], 1) - prediction = np.append(population,test_pred, axis=1) - return prediction - -def python_predict(population, plpData, model_loc, dense, autoencoder): - print("Applying Python Model") - print("Loading Data...") - # load data + train,test indexes + validation index - #y=population[:,1] 
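# Illustrative sketch, not part of the deleted file: the predict helpers in this file all
# return the population matrix with the model's risk estimate appended as a final column.
# The merge itself is just a reshape plus a column-wise append:
import numpy as np

population = np.array([[101, 0, 1], [102, 1, 1]])    # toy rowId, outcome, index columns
test_pred = np.array([0.12, 0.87])
test_pred.shape = (population.shape[0], 1)           # column vector, one row per person
prediction = np.append(population, test_pred, axis=1)
# prediction[:, -1] now holds the predicted probabilities for the population rows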
- X = plpData[population[:,0].astype(int),:] - # load index file - print("population loaded- %s rows and %s columns" %(np.shape(population)[0], np.shape(population)[1])) - print("Dataset has %s rows and %s columns" %(X.shape[0], X.shape[1])) - print("Data ready for model has %s features" %(np.shape(X)[1])) - ########################################################################### - # uf dense convert - if dense==1: - print("converting to dense data...") - X=X.toarray() - ########################################################################### - # load model - print("Loading model...") - if autoencoder: - autoencoder_model = joblib.load(os.path.join(model_loc, 'autoencoder_model.pkl')) - X = autoencoder_model.get_encode_features(X) - modelTrained = joblib.load(os.path.join(model_loc,"model.pkl")) - print("Calculating predictions on population...") - test_pred = modelTrained.predict_proba(X)[:, 1] - print("Prediction complete: %s rows" %(np.shape(test_pred)[0])) - print("Mean: %s prediction value" %(np.mean(test_pred))) - # merge pred with population - test_pred.shape = (population.shape[0], 1) - prediction = np.append(population,test_pred, axis=1) - return prediction - - -def python_predict_survival(population, plpData, model_loc): - print("Applying Python Model") - print("Loading Data...") - # load data + train,test indexes + validation index - X = plpData[population[:,0].astype(int),:] - # load index file - print("population loaded- %s rows and %s columns" %(np.shape(population)[0], np.shape(population)[1])) - print("Dataset has %s rows and %s columns" %(X.shape[0], X.shape[1])) - print("Data ready for model has %s features" %(np.shape(X)[1])) - ########################################################################### - # load model - print("Loading model...") - modelTrained = joblib.load(os.path.join(model_loc,"model.pkl")) - print("Calculating predictions on population...") - test_pred = modelTrained.predict(X.toarray()) - test_pred = test_pred.flatten() - rowCount = population.shape[0] - test_pred = test_pred[0:(rowCount)] - print("Prediction complete: %s rows" %(np.shape(test_pred)[0])) - print("Mean: %s prediction value" %(np.mean(test_pred))) - # merge pred with population - test_pred.shape = (population.shape[0], 1) - prediction = np.append(population,test_pred, axis=1) - return prediction - -def python_predict_garden(population, plpData, model_loc,quantile=None): - print("Applying Python Model") - print("Loading Data...") - # load data + train,test indexes + validation index - #y=population[:,1] - X = plpData[population[:,0].astype(int),:] - # load index file - print("population loaded- %s rows and %s columns" %(np.shape(population)[0], np.shape(population)[1])) - print("Dataset has %s rows and %s columns" %(X.shape[0], X.shape[1])) - print("Data ready for model has %s features" %(np.shape(X)[1])) - ########################################################################### - # load model - print("Loading model...") - modelTrained = joblib.load(os.path.join(model_loc,"model.pkl")) - print("Calculating predictions on population...") - test_pred = modelTrained.predict(X,quantile=quantile)[:, 0] - print("Prediction complete: %s rows" %(np.shape(test_pred)[0])) - print("Mean: %s prediction value" %(np.mean(test_pred))) - # merge pred with population - test_pred.shape = (population.shape[0], 1) - prediction = np.append(population,test_pred, axis=1) - return prediction diff --git a/inst/python/python_predict.py b/inst/python/python_predict.py deleted file mode 100644 index 
3a2f132..0000000 --- a/inst/python/python_predict.py +++ /dev/null @@ -1,111 +0,0 @@ -# apply random forest model on new data -#=============================================================== -# INPUT: -# 1) location of new data -# 2) location of model -# -# OUTPUT: -# it returns a file with indexes merged with prediction for test index - named new_pred -#================================================================ -import numpy as np -from collections import OrderedDict -import os -import sys -import timeit -import math -#from sklearn.ensemble import RandomForestClassifier -#from sklearn.naive_bayes import GaussianNB -from scipy.sparse import coo_matrix,csr_matrix,vstack,hstack -#from sklearn.feature_selection import SelectFromModel -#from sklearn.cross_validation import PredefinedSplit -from sklearn.externals.joblib import Memory -#from sklearn.datasets import load_svmlight_file -from sklearn.externals import joblib -if "python_dir" in globals(): - sys.path.insert(0, python_dir) - import TorchUtils as tu -#================================================================ - - -print("Applying Python Model") - -########################################################################### - -def get_temproal_data(covariates, population): - p_ids_in_cov = set(covariates[:, 0]) - timeid_len = len(set(covariates[:, -2])) - full_covariates = np.array([]).reshape(0,4) - default_covid = covariates[0, 1] - for p_id in population[:, 0]: - if p_id not in p_ids_in_cov: - tmp_x = np.array([p_id, default_covid, 1, 0]).reshape(1,4) #default cov id, timeid=1 - full_covariates = np.concatenate((full_covariates, tmp_x), axis=0) - else: - tmp_x = covariates[covariates[:, 0] == p_id, :] - #print tmp_x.shape, X.shape - full_covariates = np.concatenate((full_covariates, tmp_x), axis=0) - - X, patient_keys = tu.convert_to_temporal_format(full_covariates, timeid_len = timeid_len, predict = True) - return X - - -print("Loading Data...") -# load data + train,test indexes + validation index - -y=population[:,1] -#print covariates.shape - -if modeltype == 'temporal': - X = plpData.to_dense().numpy() - X = X[np.int64(population[:, 0]), :] - #X = get_temproal_data(covariates, population) - dense = 0 -else: - #print included - X = plpData[population[:,0],:] - X = X[:,included.flatten()] - -# load index file -print("population loaded- %s rows and %s columns" %(np.shape(population)[0], np.shape(population)[1])) -print("Dataset has %s rows and %s columns" %(X.shape[0], X.shape[1])) -print("Data ready for model has %s features" %(np.shape(X)[1])) - -########################################################################### -# uf dense convert -if dense==1: - print("converting to dense data...") - X=X.toarray() -########################################################################### - -# load model -print("Loading model...") - -modelTrained = joblib.load(os.path.join(model_loc,"model.pkl")) - -print(X.shape) -print("Calculating predictions on population...") - -if autoencoder: - autoencoder_model = joblib.load(os.path.join(model_loc, 'autoencoder_model.pkl')) - X = autoencoder_model.get_encode_features(X) - -if modeltype == 'temporal': - test_batch = tu.batch(X, batch_size = 32) - test_pred = [] - for test in test_batch: - pred_test1 = modelTrained.predict_proba(test)[:, 1] - test_pred = np.concatenate((test_pred , pred_test1), axis = 0) -else: - test_pred = modelTrained.predict_proba(X)[:, 1] - - -if test_pred.ndim != 1: - test_pred = test_pred[:,1] - - -print("Prediction complete: %s rows" %(np.shape(test_pred)[0])) 
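# Illustrative sketch, not part of the deleted file: for temporal models the script above
# scores the population in mini-batches (via tu.batch) and concatenates the per-batch
# probabilities, which keeps memory bounded for large cohorts. The chunking pattern in
# isolation, using a hypothetical chunk_rows helper that mirrors batch():
import numpy as np

def chunk_rows(matrix, batch_size=50):
    # Split the first axis into consecutive chunks of at most batch_size rows.
    return [matrix[i:i + batch_size] for i in range(0, matrix.shape[0], batch_size)]

X = np.random.rand(70, 5)
parts = chunk_rows(X, batch_size=32)                 # three chunks: 32, 32 and 6 rows
assert sum(p.shape[0] for p in parts) == X.shape[0]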
-print("Mean: %s prediction value" %(np.mean(test_pred))) - -# merge pred with population -test_pred.shape = (population.shape[0], 1) -prediction = np.append(population,test_pred, axis=1) diff --git a/inst/sql/sql_server/DomainConceptTemporal.sql b/inst/sql/sql_server/DomainConceptTemporal.sql deleted file mode 100644 index a3ebc2a..0000000 --- a/inst/sql/sql_server/DomainConceptTemporal.sql +++ /dev/null @@ -1,90 +0,0 @@ --- Feature construction -SELECT -CAST(@domain_concept_id AS BIGINT) * 1000 + @analysis_id AS covariate_id, - time_id, - duration, -- does aggregation make sense for this? -{@aggregated} ? { - cohort_definition_id, - COUNT(*) AS sum_value -} : { - row_id, - 1 AS covariate_value -} -INTO @covariate_table -FROM ( - SELECT DISTINCT @domain_concept_id, - FLOOR(DATEDIFF(@time_part, @cdm_database_schema.@domain_table.@domain_start_date, cohort.cohort_start_date)*1.0/@time_interval ) as time_id, - {@aggregated} ? { - max( - } - - CASE WHEN @cdm_database_schema.@domain_table.@domain_end_date <= cohort.cohort_start_date - THEN - DATEDIFF(@time_part, @cdm_database_schema.@domain_table.@domain_start_date, @cdm_database_schema.@domain_table.@domain_end_date)*1.0/@time_interval - ELSE - DATEDIFF(@time_part, @cdm_database_schema.@domain_table.@domain_start_date, cohort.cohort_start_date)*1.0/@time_interval - END - {@aggregated} ? { - ) - } - as duration, - {@aggregated} ? { - cohort_definition_id, - cohort.subject_id, - cohort.cohort_start_date - } : { - cohort.@row_id_field AS row_id - } - FROM @cohort_table cohort - INNER JOIN @cdm_database_schema.@domain_table - ON cohort.subject_id = @domain_table.person_id - - WHERE @domain_start_date <= DATEADD(DAY, @end_day, cohort.cohort_start_date) - AND @domain_concept_id != 0 - - {@sub_type == 'inpatient'} ? { AND condition_type_concept_id IN (38000183, 38000184, 38000199, 38000200)} - {@excluded_concept_table != ''} ? { AND @domain_concept_id NOT IN (SELECT id FROM @excluded_concept_table)} - {@included_concept_table != ''} ? { AND @domain_concept_id IN (SELECT id FROM @included_concept_table)} - {@included_cov_table != ''} ? { AND CAST(@domain_concept_id AS BIGINT) * 1000 + @analysis_id IN (SELECT id FROM @included_cov_table)} - {@cohort_definition_id != -1} ? { AND cohort.cohort_definition_id IN (@cohort_definition_id)} -) by_row_id -{@aggregated} ? { - GROUP BY - cohort_definition_id, - @domain_concept_id, - time_id -} -; - --- Reference construction -INSERT INTO #cov_ref ( -covariate_id, -covariate_name, -analysis_id, -concept_id -) -SELECT covariate_id, - -CAST(CONCAT('@domain_table: ', CASE WHEN concept_name IS NULL THEN 'Unknown concept' ELSE concept_name END {@sub_type == 'inpatient'} ? 
{, ' (inpatient)'}) AS VARCHAR(512)) AS covariate_name, - -@analysis_id AS analysis_id, -CAST((covariate_id - @analysis_id) / 1000 AS INT) AS concept_id -FROM ( - SELECT DISTINCT covariate_id - FROM @covariate_table -) t1 -LEFT JOIN @cdm_database_schema.concept -ON concept_id = CAST((covariate_id - @analysis_id) / 1000 AS INT); - -INSERT INTO #analysis_ref ( -analysis_id, -analysis_name, -domain_id, -is_binary, -missing_means_zero -) -SELECT @analysis_id AS analysis_id, -CAST('@analysis_name' AS VARCHAR(512)) AS analysis_name, -CAST('@domain_id' AS VARCHAR(20)) AS domain_id, -CAST('Y' AS VARCHAR(1)) AS is_binary, -CAST(NULL AS VARCHAR(1)) AS missing_means_zero; \ No newline at end of file diff --git a/man/Dataset.Rd b/man/Dataset.Rd new file mode 100644 index 0000000..eb12468 --- /dev/null +++ b/man/Dataset.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Dataset.R +\name{Dataset} +\alias{Dataset} +\title{A torch dataset} +\usage{ +Dataset(data, labels = NULL, numericalIndex = NULL) +} +\arguments{ +\item{data}{a dataframe like object with the covariates} + +\item{labels}{a dataframe with the labels} + +\item{numericalIndex}{in what column numeric data is in (if any)} +} +\description{ +A torch dataset +} diff --git a/man/DeepPatientLevelPrediction.Rd b/man/DeepPatientLevelPrediction.Rd new file mode 100644 index 0000000..c97f163 --- /dev/null +++ b/man/DeepPatientLevelPrediction.Rd @@ -0,0 +1,9 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/DeepPatientLevelPrediction.R +\docType{package} +\name{DeepPatientLevelPrediction} +\alias{DeepPatientLevelPrediction} +\title{DeepPatientLevelPrediction} +\description{ +A package containing deep learning extensions for developing prediction models using data in the OMOP CDM +} diff --git a/man/EarlyStopping.Rd b/man/EarlyStopping.Rd new file mode 100644 index 0000000..1087705 --- /dev/null +++ b/man/EarlyStopping.Rd @@ -0,0 +1,76 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Estimator.R +\name{EarlyStopping} +\alias{EarlyStopping} +\title{Earlystopping class} +\description{ +Stops training if a loss or metric has stopped improving +} +\section{Methods}{ +\subsection{Public methods}{ +\itemize{ +\item \href{#method-EarlyStopping-new}{\code{EarlyStopping$new()}} +\item \href{#method-EarlyStopping-call}{\code{EarlyStopping$call()}} +\item \href{#method-EarlyStopping-clone}{\code{EarlyStopping$clone()}} +} +} +\if{html}{\out{
    }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-EarlyStopping-new}{}}} +\subsection{Method \code{new()}}{ +Creates a new earlystopping object +\subsection{Usage}{ +\if{html}{\out{
    }}\preformatted{EarlyStopping$new(patience = 3, delta = 0, verbose = TRUE)}\if{html}{\out{
    }} +} + +\subsection{Arguments}{ +\if{html}{\out{
    }} +\describe{ +\item{\code{patience}}{Stop after this number of epochs if loss doesn't improve} + +\item{\code{delta}}{How much does the loss need to improve to count as improvement} + +\item{\code{verbose}}{If information should be printed out} +} +\if{html}{\out{
    }} +} +\subsection{Returns}{ +a new earlystopping object +} +} +\if{html}{\out{
    }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-EarlyStopping-call}{}}} +\subsection{Method \code{call()}}{ +call the earlystopping object and increment a counter if loss is not +improving +\subsection{Usage}{ +\if{html}{\out{
    }}\preformatted{EarlyStopping$call(metric)}\if{html}{\out{
    }} +} + +\subsection{Arguments}{ +\if{html}{\out{
    }} +\describe{ +\item{\code{metric}}{the current metric value} +} +\if{html}{\out{
    }} +} +} +\if{html}{\out{
    }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-EarlyStopping-clone}{}}} +\subsection{Method \code{clone()}}{ +The objects of this class are cloneable with this method. +\subsection{Usage}{ +\if{html}{\out{
    }}\preformatted{EarlyStopping$clone(deep = FALSE)}\if{html}{\out{
    }} +} + +\subsection{Arguments}{ +\if{html}{\out{
    }} +\describe{ +\item{\code{deep}}{Whether to make a deep clone.} +} +\if{html}{\out{
    }} +} +} +} diff --git a/man/Estimator.Rd b/man/Estimator.Rd new file mode 100644 index 0000000..a6fc4bc --- /dev/null +++ b/man/Estimator.Rd @@ -0,0 +1,301 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Estimator.R +\name{Estimator} +\alias{Estimator} +\title{Estimator} +\description{ +A generic R6 class that wraps around a torch nn module and can be used to +fit and predict the model defined in that module. +} +\section{Methods}{ +\subsection{Public methods}{ +\itemize{ +\item \href{#method-Estimator-new}{\code{Estimator$new()}} +\item \href{#method-Estimator-fit}{\code{Estimator$fit()}} +\item \href{#method-Estimator-fitEpoch}{\code{Estimator$fitEpoch()}} +\item \href{#method-Estimator-score}{\code{Estimator$score()}} +\item \href{#method-Estimator-finishFit}{\code{Estimator$finishFit()}} +\item \href{#method-Estimator-fitWholeTrainingSet}{\code{Estimator$fitWholeTrainingSet()}} +\item \href{#method-Estimator-save}{\code{Estimator$save()}} +\item \href{#method-Estimator-predictProba}{\code{Estimator$predictProba()}} +\item \href{#method-Estimator-predict}{\code{Estimator$predict()}} +\item \href{#method-Estimator-batchToDevice}{\code{Estimator$batchToDevice()}} +\item \href{#method-Estimator-itemOrDefaults}{\code{Estimator$itemOrDefaults()}} +\item \href{#method-Estimator-clone}{\code{Estimator$clone()}} +} +} +\if{html}{\out{
    }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Estimator-new}{}}} +\subsection{Method \code{new()}}{ +Creates a new estimator +\subsection{Usage}{ +\if{html}{\out{
    }}\preformatted{Estimator$new( + baseModel, + modelParameters, + fitParameters, + optimizer = torch::optim_adam, + criterion = torch::nn_bce_with_logits_loss, + scheduler = torch::lr_reduce_on_plateau, + device = "cpu", + patience = 4 +)}\if{html}{\out{
    }} +} + +\subsection{Arguments}{ +\if{html}{\out{
    }} +\describe{ +\item{\code{baseModel}}{The torch nn module to use as model} + +\item{\code{modelParameters}}{Parameters to initialize the baseModel} + +\item{\code{fitParameters}}{Parameters required for the estimator fitting} + +\item{\code{optimizer}}{A torch optimizer to use, default is Adam} + +\item{\code{criterion}}{The torch loss function to use, defaults to +binary cross entropy with logits} + +\item{\code{scheduler}}{learning rate scheduler to use} + +\item{\code{device}}{Which device to use for fitting, default is cpu} + +\item{\code{patience}}{Patience to use for early stopping} +} +\if{html}{\out{
    }} +} +} +\if{html}{\out{
    }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Estimator-fit}{}}} +\subsection{Method \code{fit()}}{ +fits the estimator +\subsection{Usage}{ +\if{html}{\out{
    }}\preformatted{Estimator$fit(dataset, testDataset)}\if{html}{\out{
    }} +} + +\subsection{Arguments}{ +\if{html}{\out{
    }} +\describe{ +\item{\code{dataset}}{a torch dataset to use for model fitting} + +\item{\code{testDataset}}{a torch dataset to use for early stopping} +} +\if{html}{\out{
    }} +} +} +\if{html}{\out{
    }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Estimator-fitEpoch}{}}} +\subsection{Method \code{fitEpoch()}}{ +fits estimator for one epoch (one round through the data) +\subsection{Usage}{ +\if{html}{\out{
    }}\preformatted{Estimator$fitEpoch(dataset, batchIndex)}\if{html}{\out{
    }} +} + +\subsection{Arguments}{ +\if{html}{\out{
    }} +\describe{ +\item{\code{dataset}}{torch dataset to use for fitting} + +\item{\code{batchIndex}}{indices of batches} +} +\if{html}{\out{
    }} +} +} +\if{html}{\out{
    }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Estimator-score}{}}} +\subsection{Method \code{score()}}{ +calculates loss and auc after training for one epoch +\subsection{Usage}{ +\if{html}{\out{
    }}\preformatted{Estimator$score(dataset, batchIndex)}\if{html}{\out{
    }} +} + +\subsection{Arguments}{ +\if{html}{\out{
    }} +\describe{ +\item{\code{dataset}}{The torch dataset to use to evaluate loss and auc} + +\item{\code{batchIndex}}{Indices of batches in the dataset} +} +\if{html}{\out{
    }} +} +\subsection{Returns}{ +list with average loss and auc in the dataset +} +} +\if{html}{\out{
    }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Estimator-finishFit}{}}} +\subsection{Method \code{finishFit()}}{ +operations that run when fitting is finished +\subsection{Usage}{ +\if{html}{\out{
    }}\preformatted{Estimator$finishFit(valAUCs, modelStateDict, valLosses, epoch, learnRates)}\if{html}{\out{
    }} +} + +\subsection{Arguments}{ +\if{html}{\out{
    }} +\describe{ +\item{\code{valAUCs}}{validation AUC values} + +\item{\code{modelStateDict}}{fitted model parameters} + +\item{\code{valLosses}}{validation losses} + +\item{\code{epoch}}{list of epochs fit} + +\item{\code{learnRates}}{learning rate sequence used so far} +} +\if{html}{\out{
    }} +} +} +\if{html}{\out{
    }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Estimator-fitWholeTrainingSet}{}}} +\subsection{Method \code{fitWholeTrainingSet()}}{ +Fits whole training set on a specific number of epochs +TODO What happens when learning rate changes per epochs? +Ideally I would copy the learning rate strategy from before +and adjust for different sizes ie more iterations/updates??? +\subsection{Usage}{ +\if{html}{\out{
    }}\preformatted{Estimator$fitWholeTrainingSet(dataset, learnRates = NULL)}\if{html}{\out{
    }} +} + +\subsection{Arguments}{ +\if{html}{\out{
    }} +\describe{ +\item{\code{dataset}}{torch dataset} + +\item{\code{learnRates}}{learnRateSchedule from CV} +} +\if{html}{\out{
    }} +} +} +\if{html}{\out{
    }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Estimator-save}{}}} +\subsection{Method \code{save()}}{ +save model and those parameters needed to reconstruct it +\subsection{Usage}{ +\if{html}{\out{
    }}\preformatted{Estimator$save(path, name)}\if{html}{\out{
    }} +} + +\subsection{Arguments}{ +\if{html}{\out{
    }} +\describe{ +\item{\code{path}}{where to save the model} + +\item{\code{name}}{name of file} +} +\if{html}{\out{
    }} +} +\subsection{Returns}{ +the path to saved model +} +} +\if{html}{\out{
    }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Estimator-predictProba}{}}} +\subsection{Method \code{predictProba()}}{ +predicts and outputs the probabilities +\subsection{Usage}{ +\if{html}{\out{
    }}\preformatted{Estimator$predictProba(dataset)}\if{html}{\out{
    }} +} + +\subsection{Arguments}{ +\if{html}{\out{
    }} +\describe{ +\item{\code{dataset}}{Torch dataset to create predictions for} +} +\if{html}{\out{
    }} +} +\subsection{Returns}{ +predictions as probabilities +} +} +\if{html}{\out{
    }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Estimator-predict}{}}} +\subsection{Method \code{predict()}}{ +predicts and outputs the class +\subsection{Usage}{ +\if{html}{\out{
    }}\preformatted{Estimator$predict(dataset, threshold = NULL)}\if{html}{\out{
    }} +} + +\subsection{Arguments}{ +\if{html}{\out{
    }} +\describe{ +\item{\code{dataset}}{A torch dataset to create predictions for} + +\item{\code{threshold}}{Which threshold to use for predictions} +} +\if{html}{\out{
    }} +} +\subsection{Returns}{ +The predicted class for the data in the dataset +} +} +\if{html}{\out{
    }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Estimator-batchToDevice}{}}} +\subsection{Method \code{batchToDevice()}}{ +sends a batch of data to device +assumes batch includes lists of tensors to arbitrary nested depths +\subsection{Usage}{ +\if{html}{\out{
    }}\preformatted{Estimator$batchToDevice(batch)}\if{html}{\out{
    }} +} + +\subsection{Arguments}{ +\if{html}{\out{
    }} +\describe{ +\item{\code{batch}}{the batch to send, usually a list of torch tensors} +} +\if{html}{\out{
    }} +} +\subsection{Returns}{ +the batch on the required device +} +} +\if{html}{\out{
    }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Estimator-itemOrDefaults}{}}} +\subsection{Method \code{itemOrDefaults()}}{ +select item from list, and if it's null sets a default +\subsection{Usage}{ +\if{html}{\out{
    }}\preformatted{Estimator$itemOrDefaults(list, item, default = NULL)}\if{html}{\out{
    }} +} + +\subsection{Arguments}{ +\if{html}{\out{
    }} +\describe{ +\item{\code{list}}{A list with items} + +\item{\code{item}}{Which list item to retrieve} + +\item{\code{default}}{The value to return if list doesn't have item} +} +\if{html}{\out{
    }} +} +\subsection{Returns}{ +the list item or default +} +} +\if{html}{\out{
    }} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-Estimator-clone}{}}} +\subsection{Method \code{clone()}}{ +The objects of this class are cloneable with this method. +\subsection{Usage}{ +\if{html}{\out{
    }}\preformatted{Estimator$clone(deep = FALSE)}\if{html}{\out{
    }} +} + +\subsection{Arguments}{ +\if{html}{\out{
    }} +\describe{ +\item{\code{deep}}{Whether to make a deep clone.} +} +\if{html}{\out{
    }} +} +} +} diff --git a/man/fitEstimator.Rd b/man/fitEstimator.Rd new file mode 100644 index 0000000..87b6e39 --- /dev/null +++ b/man/fitEstimator.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Estimator.R +\name{fitEstimator} +\alias{fitEstimator} +\title{fitEstimator} +\usage{ +fitEstimator(trainData, modelSettings, analysisId, ...) +} +\arguments{ +\item{trainData}{the data to use} + +\item{modelSettings}{modelSettings object} + +\item{analysisId}{Id of the analysis} + +\item{...}{Extra inputs} +} +\description{ +fits a deep learning estimator to data. +} diff --git a/man/gridCvDeep.Rd b/man/gridCvDeep.Rd new file mode 100644 index 0000000..4d52d4d --- /dev/null +++ b/man/gridCvDeep.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Estimator.R +\name{gridCvDeep} +\alias{gridCvDeep} +\title{gridCvDeep} +\usage{ +gridCvDeep(mappedData, labels, settings, modelLocation, paramSearch) +} +\arguments{ +\item{mappedData}{Mapped data with covariates} + +\item{labels}{Dataframe with the outcomes} + +\item{settings}{Settings of the model} + +\item{modelLocation}{Where to save the model} + +\item{paramSearch}{model parameters to perform search over} +} +\description{ +Performs grid search for a deep learning estimator +} diff --git a/man/predictDeepEstimator.Rd b/man/predictDeepEstimator.Rd new file mode 100644 index 0000000..31d9e8d --- /dev/null +++ b/man/predictDeepEstimator.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Estimator.R +\name{predictDeepEstimator} +\alias{predictDeepEstimator} +\title{predictDeepEstimator} +\usage{ +predictDeepEstimator(plpModel, data, cohort) +} +\arguments{ +\item{plpModel}{the plpModel} + +\item{data}{plp data object or a torch dataset} + +\item{cohort}{data.frame with the rowIds of the people} +} +\description{ +the prediction function for the estimator +} diff --git a/man/setEstimator.Rd b/man/setEstimator.Rd new file mode 100644 index 0000000..d3001e3 --- /dev/null +++ b/man/setEstimator.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Estimator.R +\name{setEstimator} +\alias{setEstimator} +\title{setEstimator} +\arguments{ +\item{learningRate}{what learning rate to use} + +\item{weightDecay}{what weight_decay to use} + +\item{optimizer}{which optimizer to use} + +\item{scheduler}{which learning rate scheduler to use} + +\item{criterion}{loss function to use} + +\item{posWeight}{If more weight should be added to positive labels during training - will result in miscalibrated models} + +\item{earlyStopping}{If earlyStopping should be used which stops the training of your metric is not improving} + +\item{earlyStoppingMetric}{Which parameter to use for early stopping} + +\item{patience}{patience for earlyStopper} + +\item{hyperparameterMetric}{which metric to use for hyperparameter, loss, auc, auprc or a custom function} +} +\description{ +creates settings for the Estimator, which takes a model and trains it +} diff --git a/man/setMultiLayerPerceptron.Rd b/man/setMultiLayerPerceptron.Rd new file mode 100644 index 0000000..a79ab49 --- /dev/null +++ b/man/setMultiLayerPerceptron.Rd @@ -0,0 +1,52 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/MLP.R +\name{setMultiLayerPerceptron} +\alias{setMultiLayerPerceptron} +\title{setMultiLayerPerceptron} +\usage{ +setMultiLayerPerceptron( + numLayers = c(1:8), + sizeHidden = 
c(2^(6:9)), + dropout = c(seq(0, 0.5, 0.05)), + sizeEmbedding = c(2^(6:9)), + weightDecay = c(1e-06, 0.001), + learningRate = c(0.01, 3e-04, 1e-05), + seed = NULL, + hyperParamSearch = "random", + randomSample = 100, + device = "cpu", + batchSize = 1024, + epochs = 30 +) +} +\arguments{ +\item{numLayers}{Number of layers in network, default: 1:16} + +\item{sizeHidden}{Amount of neurons in each default layer, default: 2^(6:10) (64 to 1024)} + +\item{dropout}{How much dropout to apply after first linear, default: seq(0, 0.3, 0.05)} + +\item{sizeEmbedding}{Size of embedding layer, default: 2^(6:9) (64 to 512)} + +\item{weightDecay}{Weight decay to apply, default: c(1e-6, 1e-3)} + +\item{learningRate}{Learning rate to use. default: c(1e-2, 1e-5)} + +\item{seed}{Seed to use for sampling hyperparameter space} + +\item{hyperParamSearch}{Which kind of hyperparameter search to use random sampling or exhaustive grid search. default: 'random'} + +\item{randomSample}{How many random samples from hyperparameter space to use} + +\item{device}{Which device to run analysis on, either 'cpu' or 'cuda', default: 'cpu'} + +\item{batchSize}{Size of batch, default: 1024} + +\item{epochs}{Number of epochs to run, default: 10} +} +\description{ +Creates settings for a Multilayer perceptron model +} +\details{ +Model architecture +} diff --git a/man/setResNet.Rd b/man/setResNet.Rd new file mode 100644 index 0000000..d0ced0d --- /dev/null +++ b/man/setResNet.Rd @@ -0,0 +1,58 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ResNet.R +\name{setResNet} +\alias{setResNet} +\title{setResNet} +\usage{ +setResNet( + numLayers = c(1:8), + sizeHidden = c(2^(6:9)), + hiddenFactor = c(1:4), + residualDropout = c(seq(0, 0.5, 0.05)), + hiddenDropout = c(seq(0, 0.5, 0.05)), + sizeEmbedding = c(2^(6:9)), + weightDecay = c(1e-06, 0.001), + learningRate = c(0.01, 3e-04, 1e-05), + seed = NULL, + hyperParamSearch = "random", + randomSample = 100, + device = "cpu", + batchSize = 1024, + epochs = 30 +) +} +\arguments{ +\item{numLayers}{Number of layers in network, default: 1:16} + +\item{sizeHidden}{Amount of neurons in each default layer, default: 2^(6:10) (64 to 1024)} + +\item{hiddenFactor}{How much to grow the amount of neurons in each ResLayer, default: 1:4} + +\item{residualDropout}{How much dropout to apply after last linear layer in ResLayer, default: seq(0, 0.3, 0.05)} + +\item{hiddenDropout}{How much dropout to apply after first linear layer in ResLayer, default: seq(0, 0.3, 0.05)} + +\item{sizeEmbedding}{Size of embedding layer, default: 2^(6:9) (64 to 512)} + +\item{weightDecay}{Weight decay to apply, default: c(1e-6, 1e-3)} + +\item{learningRate}{Learning rate to use. default: c(1e-2, 1e-5)} + +\item{seed}{Seed to use for sampling hyperparameter space} + +\item{hyperParamSearch}{Which kind of hyperparameter search to use random sampling or exhaustive grid search. 
default: 'random'} + +\item{randomSample}{How many random samples from hyperparameter space to use} + +\item{device}{Which device to run analysis on, either 'cpu' or 'cuda', default: 'cpu'} + +\item{batchSize}{Size of batch, default: 1024} + +\item{epochs}{Number of epochs to run, default: 10} +} +\description{ +Creates settings for a ResNet model +} +\details{ +Model architecture from by https://arxiv.org/abs/2106.11959 +} diff --git a/man/setTransformer.Rd b/man/setTransformer.Rd new file mode 100644 index 0000000..54927b0 --- /dev/null +++ b/man/setTransformer.Rd @@ -0,0 +1,64 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Transformer.R +\name{setTransformer} +\alias{setTransformer} +\title{create settings for training a non-temporal transformer} +\usage{ +setTransformer( + numBlocks = 3, + dimToken = 96, + dimOut = 1, + numHeads = 8, + attDropout = 0.25, + ffnDropout = 0.25, + resDropout = 0, + dimHidden = 512, + weightDecay = 1e-06, + learningRate = 3e-04, + batchSize = 1024, + epochs = 10, + device = "cpu", + hyperParamSearch = "random", + randomSamples = 100, + seed = NULL +) +} +\arguments{ +\item{numBlocks}{number of transformer blocks} + +\item{dimToken}{dimension of each token (embedding size)} + +\item{dimOut}{dimension of output, usually 1 for binary problems} + +\item{numHeads}{number of attention heads} + +\item{attDropout}{dropout to use on attentions} + +\item{ffnDropout}{dropout to use in feedforward block} + +\item{resDropout}{dropout to use in residual connections} + +\item{dimHidden}{dimension of the feedworward block} + +\item{weightDecay}{weightdecay to use} + +\item{learningRate}{learning rate to use} + +\item{batchSize}{batchSize to use} + +\item{epochs}{How many epochs to run the model for} + +\item{device}{Which device to use, cpu or cuda} + +\item{hyperParamSearch}{what kind of hyperparameter search to do, default 'random'} + +\item{randomSamples}{How many samples to use in hyperparameter search if random} + +\item{seed}{Random seed to use} +} +\description{ +A transformer model +} +\details{ +from https://arxiv.org/abs/2106.11959 +} diff --git a/tests/testthat.R b/tests/testthat.R index 59feab0..fc53f82 100644 --- a/tests/testthat.R +++ b/tests/testthat.R @@ -1,6 +1,4 @@ -#Sys.setenv("R_TESTS" = "") -#options(fftempdir = file.path(getwd(),'fftemp')) library(testthat) library(DeepPatientLevelPrediction) + test_check("DeepPatientLevelPrediction") -unlink('T:/Temp', recursive = T) diff --git a/tests/testthat/setup.R b/tests/testthat/setup.R new file mode 100644 index 0000000..93c9232 --- /dev/null +++ b/tests/testthat/setup.R @@ -0,0 +1,94 @@ +library(PatientLevelPrediction) + +testLoc <- tempdir() + +# get connection and data from Eunomia +connectionDetails <- Eunomia::getEunomiaConnectionDetails() +Eunomia::createCohorts(connectionDetails) + +covSet <- FeatureExtraction::createCovariateSettings( + useDemographicsGender = T, + useDemographicsAge = T, + useDemographicsRace = T, + useDemographicsEthnicity = T, + useDemographicsAgeGroup = T, + useConditionGroupEraLongTerm = T, + useDrugEraStartLongTerm = T, + endDays = -1 +) + + +covSetT <- FeatureExtraction::createTemporalSequenceCovariateSettings( + useDemographicsGender = T, + useDemographicsAge = T, + useDemographicsRace = T, + useDemographicsEthnicity = T, + useDemographicsAgeGroup = T, + useConditionEraGroupStart = T, + useDrugEraStart = T, + timePart = "month", + timeInterval = 1, + sequenceEndDay = -1, + sequenceStartDay = -365 * 5 +) + + +databaseDetails <- 
PatientLevelPrediction::createDatabaseDetails( + connectionDetails = connectionDetails, + cdmDatabaseSchema = "main", + cohortDatabaseSchema = "main", + cohortTable = "cohort", + targetId = 4, + outcomeIds = 3, + outcomeDatabaseSchema = "main", + outcomeTable = "cohort", + cdmDatabaseName = "eunomia" +) + +restrictPlpDataSettings <- PatientLevelPrediction::createRestrictPlpDataSettings( + firstExposureOnly = T, + washoutPeriod = 365 +) + +plpData <- PatientLevelPrediction::getPlpData( + databaseDetails = databaseDetails, + restrictPlpDataSettings = restrictPlpDataSettings, + covariateSettings = covSet +) + + +plpDataT <- PatientLevelPrediction::getPlpData( + databaseDetails = databaseDetails, + restrictPlpDataSettings = restrictPlpDataSettings, + covariateSettings = covSetT +) + + +populationSet <- PatientLevelPrediction::createStudyPopulationSettings( + requireTimeAtRisk = F, + riskWindowStart = 1, + riskWindowEnd = 365 +) + +population <- PatientLevelPrediction::createStudyPopulation( + plpData = plpData, + outcomeId = 3, + populationSettings = populationSet +) + +trainData <- PatientLevelPrediction::splitData( + plpData, + population = population, + splitSettings = PatientLevelPrediction::createDefaultSplitSetting() +) + +mappedData <- PatientLevelPrediction::MapIds( + covariateData = trainData$Train$covariateData, + cohort = trainData$Train$labels +) + +dataset <- Dataset( + data = mappedData$covariates, + labels = trainData$Train$labels$outcomeCount, + numericalIndex = NULL +) diff --git a/tests/testthat/test-Dataset.R b/tests/testthat/test-Dataset.R new file mode 100644 index 0000000..989f886 --- /dev/null +++ b/tests/testthat/test-Dataset.R @@ -0,0 +1,60 @@ +test_that("dataset correct class", { + testthat::expect_true("myDataset" %in% class(dataset)) +}) + +test_that("length of index correct", { + testthat::expect_equal( + length(dataset$getNumericalIndex()), + dplyr::n_distinct(mappedData$covariates %>% dplyr::pull(covariateId)) + ) +}) + +test_that("number of num and cat features sum correctly", { + testthat::expect_equal( + dataset$numNumFeatures() + dataset$numCatFeatures(), + dplyr::n_distinct(mappedData$covariates %>% dplyr::pull(covariateId)) + ) +}) + + +test_that("length of dataset correct", { + expect_equal(length(dataset), dataset$cat$shape[1]) + expect_equal(length(dataset), dataset$num$shape[1]) + expect_equal( + dataset$.length(), + dplyr::n_distinct(mappedData$covariates %>% dplyr::pull(rowId)) + ) +}) + +test_that(".getbatch works", { + batch_size <- 16 + # get one sample + out <- dataset[10] + + # output should be a list of two items, the batch and targets, + # the batch is what goes to the model + expect_equal(length(out), 2) + + # targets should be binary + expect_true(out$target$item() %in% c(0, 1)) + + # shape of batch is correct + expect_equal(length(out$batch), 2) + expect_equal(out$batch$cat$shape[1], 1) + expect_equal(out$batch$num$shape[1], 1) + + # shape of target + expect_equal(out$target$shape[1], 1) + + # get a whole batch + out <- dataset[10:(10 + batch_size - 1)] + + expect_equal(length(out), 2) + expect_true(all(torch::as_array(out$target) %in% c(0, 1))) + + expect_equal(length(out$batch), 2) + expect_equal(out$batch$cat$shape[1], 16) + expect_equal(out$batch$num$shape[1], 16) + + expect_equal(out$target$shape[1], 16) +}) diff --git a/tests/testthat/test-Estimator.R b/tests/testthat/test-Estimator.R new file mode 100644 index 0000000..2f110d3 --- /dev/null +++ b/tests/testthat/test-Estimator.R @@ -0,0 +1,155 @@ +catFeatures <- 
dataset$numCatFeatures() +numFeatures <- dataset$numNumFeatures() + +fitParams <- list() +baseModel <- ResNet + +modelParameters <- list( + catFeatures = catFeatures, + numFeatures = numFeatures, + sizeEmbedding = 16, + sizeHidden = 16, + numLayers = 2, + hiddenFactor = 2 +) + +estimator <- Estimator$new( + baseModel = baseModel, + modelParameters = modelParameters, + fitParameters = fitParams, + device = "cpu" +) + +test_that("Estimator initialization works", { + + # count parameters in both instances + testthat::expect_equal( + sum(sapply(estimator$model$parameters, function(x) prod(x$shape))), + sum(sapply(do.call(baseModel, modelParameters)$parameters, function(x) prod(x$shape))) + ) + + testthat::expect_equal( + estimator$modelParameters, + modelParameters + ) + + # check the function that results the value from a list + val <- estimator$itemOrDefaults(list(param = 1, test = 3), "param", default = NULL) + expect_equal(val, 1) + val <- estimator$itemOrDefaults(list(param = 1, test = 3), "paramater", default = NULL) + expect_true(is.null(val)) +}) +sink(nullfile()) +estimator$fit(dataset, dataset) +sink() + +test_that("estimator fitting works", { + + + # check the fitting + # estimator$fitEpoch(dataset, batchIndex) + # estimator$finishFit(valAUCs, modelStateDict, valLosses, epoch) + # estimator$score(dataset, batchIndex) + expect_true(!is.null(estimator$bestEpoch)) + expect_true(!is.null(estimator$bestScore$loss)) + expect_true(!is.null(estimator$bestScore$auc)) + + old_weights <- estimator$model$head$weight$mean()$item() + + sink(nullfile()) + estimator$fitWholeTrainingSet(dataset, estimator$learnRateSchedule) + sink() + + expect_equal(estimator$optimizer$param_groups[[1]]$lr, tail(estimator$learnRateSchedule, 1)[[1]]) + + new_weights <- estimator$model$head$weight$mean()$item() + + # model should be updated when refitting + expect_true(old_weights != new_weights) + + estimator$save(testLoc, "estimator.pt") + + expect_true(file.exists(file.path(testLoc, "estimator.pt"))) + + preds <- estimator$predictProba(dataset) + + expect_lt(max(preds), 1) + expect_gt(min(preds), 0) + + classes <- estimator$predict(dataset, threshold = 0.5) + expect_equal(all(unique(classes) %in% c(0, 1)), TRUE) + + # not sure how to test: batchToDevice(batch) +}) + +test_that("early stopping works", { + earlyStop <- EarlyStopping$new(patience = 3, delta = 0, verbose = FALSE) + testthat::expect_true(is.null(earlyStop$bestScore)) + testthat::expect_false(earlyStop$earlyStop) + earlyStop$call(0.5) + testthat::expect_equal(earlyStop$bestScore, 0.5) + earlyStop$call(0.4) + earlyStop$call(0.4) + earlyStop$call(0.4) + testthat::expect_true(earlyStop$earlyStop) +}) + +modelSettings <- setResNet( + numLayers = 1, sizeHidden = 16, hiddenFactor = 1, + residualDropout = 0, hiddenDropout = 0, + sizeEmbedding = 16, hyperParamSearch = "random", + randomSample = 1, epochs = 1 +) + +sink(nullfile()) +results <- fitEstimator(trainData$Train, modelSettings = modelSettings, analysisId = 1) +sink() + +test_that("Estimator fit function works", { + expect_true(!is.null(results$trainDetails$trainingTime)) + + expect_equal(class(results), "plpModel") + expect_equal(attr(results, "modelType"), "binary") + expect_equal(attr(results, "saveType"), "file") +}) + +test_that("predictDeepEstimator works", { + + # input is an estimator and a dataset + sink(nullfile()) + predictions <- predictDeepEstimator(estimator, dataset, cohort = trainData$Train$labels) + sink() + + expect_lt(max(predictions$value), 1) + expect_gt(min(predictions$value), 0) + 
expect_equal(nrow(predictions), nrow(trainData$Train$labels)) + + # input is a plpModel and data + sink(nullfile()) + predictions <- predictDeepEstimator( + plpModel = results, data = trainData$Test, + trainData$Test$labels + ) + sink() + expect_lt(max(predictions$value), 1) + expect_gt(min(predictions$value), 0) + expect_equal(nrow(predictions), nrow(trainData$Test$labels)) +}) + +test_that("batchToDevice works", { + # since we can't test moving to gpu there is a fake device called meta + # which we can use to test of the device is updated + estimator$device <- "meta" + b <- 1:10 + batch <- estimator$batchToDevice(dataset[b]) + + devices <- lapply( + lapply(unlist(batch, recursive = TRUE), function(x) x$device), + function(x) x == torch::torch_device(type = "meta") + ) + # test that all are meta + expect_true(all(devices == TRUE)) +}) + + +# cases to add, estimator with early stopping that stops, and estimator without earlystopping diff --git a/tests/testthat/test-MLP.R b/tests/testthat/test-MLP.R new file mode 100644 index 0000000..3bc0d8f --- /dev/null +++ b/tests/testthat/test-MLP.R @@ -0,0 +1,110 @@ + +modelSettings <- setMultiLayerPerceptron( + numLayers = c(2), + sizeHidden = c(32), + dropout = c(0.1), + sizeEmbedding = c(32), + weightDecay = c(1e-6), + learningRate = c(3e-4), + seed = 42, + hyperParamSearch = "random", + randomSample = 1, + batchSize = 128, + epochs = 3 +) + +test_that("setMultiLayerPerceptron works", { + testthat::expect_s3_class(object = modelSettings, class = "modelSettings") + + testthat::expect_equal(modelSettings$fitFunction, "fitEstimator") + + testthat::expect_true(length(modelSettings$param) > 0) +}) + +sink(nullfile()) +results <- tryCatch( + { + PatientLevelPrediction::runPlp( + plpData = plpData, + outcomeId = 3, + modelSettings = modelSettings, + analysisId = "MLP", + analysisName = "Testing Deep Learning", + populationSettings = populationSet, + splitSettings = PatientLevelPrediction::createDefaultSplitSetting(), + sampleSettings = PatientLevelPrediction::createSampleSettings(), # none + featureEngineeringSettings = PatientLevelPrediction::createFeatureEngineeringSettings(), # none + preprocessSettings = PatientLevelPrediction::createPreprocessSettings(), + executeSettings = PatientLevelPrediction::createExecuteSettings( + runSplitData = T, + runSampleData = F, + runfeatureEngineering = F, + runPreprocessData = T, + runModelDevelopment = T, + runCovariateSummary = F + ), + saveDirectory = file.path(testLoc, "MLP") + ) + }, + error = function(e) { + print(e) + return(NULL) + } +) +sink() + +test_that("MLP with runPlp working checks", { + testthat::expect_false(is.null(results)) + + # check structure + testthat::expect_true("prediction" %in% names(results)) + testthat::expect_true("model" %in% names(results)) + testthat::expect_true("covariateSummary" %in% names(results)) + testthat::expect_true("performanceEvaluation" %in% names(results)) + + # check prediction same size as pop + testthat::expect_equal(nrow(results$prediction %>% + dplyr::filter(evaluationType %in% c("Train", "Test"))), nrow(population)) + + # check prediction between 0 and 1 + testthat::expect_gte(min(results$prediction$value), 0) + testthat::expect_lte(max(results$prediction$value), 1) +}) + + +test_that("MLP nn-module works ", { + model <- MLP( + catFeatures = 5, numFeatures = 1, sizeEmbedding = 5, + sizeHidden = 16, numLayers = 1, + activation = torch::nn_relu, + normalization = torch::nn_batch_norm1d, dropout = 0.3, + d_out = 1 + ) + + pars <- sum(sapply(model$parameters, 
function(x) prod(x$shape))) + + # expected number of parameters + expect_equal(pars, 489) + + input <- list() + input$cat <- torch::torch_randint(0, 5, c(10, 5), dtype = torch::torch_long()) + input$num <- torch::torch_randn(10, 1, dtype = torch::torch_float32()) + + + output <- model(input) + + # output is correct shape + expect_equal(output$shape, 10) + + input$num <- NULL + model <- MLP( + catFeatures = 5, numFeatures = 0, sizeEmbedding = 5, + sizeHidden = 16, numLayers = 1, + activation = torch::nn_relu, + normalization = torch::nn_batch_norm1d, dropout = 0.3, + d_out = 1 + ) + output <- model(input) + # model works without numeric variables + expect_equal(output$shape, 10) +}) diff --git a/tests/testthat/test-ResNet.R b/tests/testthat/test-ResNet.R new file mode 100644 index 0000000..cb9a397 --- /dev/null +++ b/tests/testthat/test-ResNet.R @@ -0,0 +1,113 @@ + +resSet <- setResNet( + numLayers = c(2), + sizeHidden = c(32), + hiddenFactor = c(2), + residualDropout = c(0.1), + hiddenDropout = c(0.1), + sizeEmbedding = c(32), + weightDecay = c(1e-6), + learningRate = c(3e-4), + seed = 42, + hyperParamSearch = "random", + randomSample = 1, + # device='cuda:0', + batchSize = 128, + epochs = 1 +) + +test_that("setResNet works", { + testthat::expect_s3_class(object = resSet, class = "modelSettings") + + testthat::expect_equal(resSet$fitFunction, "fitEstimator") + + testthat::expect_true(length(resSet$param) > 0) +}) + +sink(nullfile()) +res2 <- tryCatch( + { + PatientLevelPrediction::runPlp( + plpData = plpData, + outcomeId = 3, + modelSettings = resSet, + analysisId = "ResNet", + analysisName = "Testing Deep Learning", + populationSettings = populationSet, + splitSettings = PatientLevelPrediction::createDefaultSplitSetting(), + sampleSettings = PatientLevelPrediction::createSampleSettings(), # none + featureEngineeringSettings = PatientLevelPrediction::createFeatureEngineeringSettings(), # none + preprocessSettings = PatientLevelPrediction::createPreprocessSettings(), + executeSettings = PatientLevelPrediction::createExecuteSettings( + runSplitData = T, + runSampleData = F, + runfeatureEngineering = F, + runPreprocessData = T, + runModelDevelopment = T, + runCovariateSummary = F + ), + saveDirectory = file.path(testLoc, "Deep") + ) + }, + error = function(e) { + print(e) + return(NULL) + } +) +sink() + +test_that("ResNet with runPlp working checks", { + testthat::expect_false(is.null(res2)) + + # check structure + testthat::expect_true("prediction" %in% names(res2)) + testthat::expect_true("model" %in% names(res2)) + testthat::expect_true("covariateSummary" %in% names(res2)) + testthat::expect_true("performanceEvaluation" %in% names(res2)) + + # check prediction same size as pop + testthat::expect_equal(nrow(res2$prediction %>% + dplyr::filter(evaluationType %in% c("Train", "Test"))), nrow(population)) + + # check prediction between 0 and 1 + testthat::expect_gte(min(res2$prediction$value), 0) + testthat::expect_lte(max(res2$prediction$value), 1) +}) + + +test_that("ResNet nn-module works ", { + model <- ResNet( + catFeatures = 5, numFeatures = 1, sizeEmbedding = 5, + sizeHidden = 16, numLayers = 1, hiddenFactor = 2, + activation = torch::nn_relu, + normalization = torch::nn_batch_norm1d, hiddenDropout = 0.3, + residualDropout = 0.3, d_out = 1 + ) + + pars <- sum(sapply(model$parameters, function(x) prod(x$shape))) + + # expected number of parameters + expect_equal(pars, 1289) + + input <- list() + input$cat <- torch::torch_randint(0, 5, c(10, 5), dtype = torch::torch_long()) + input$num <- 
torch::torch_randn(10, 1, dtype = torch::torch_float32()) + + + output <- model(input) + + # output is correct shape + expect_equal(output$shape, 10) + + input$num <- NULL + model <- ResNet( + catFeatures = 5, numFeatures = 0, sizeEmbedding = 5, + sizeHidden = 16, numLayers = 1, hiddenFactor = 2, + activation = torch::nn_relu, + normalization = torch::nn_batch_norm1d, hiddenDropout = 0.3, + residualDropout = 0.3, d_out = 1 + ) + output <- model(input) + # model works without numeric variables + expect_equal(output$shape, 10) +}) diff --git a/tests/testthat/test-Transformer.R b/tests/testthat/test-Transformer.R new file mode 100644 index 0000000..913e687 --- /dev/null +++ b/tests/testthat/test-Transformer.R @@ -0,0 +1,61 @@ +settings <- setTransformer( + numBlocks = 1, dimToken = 8, dimOut = 1, + numHeads = 2, attDropout = 0.0, ffnDropout = 0.2, + resDropout = 0.0, dimHidden = 32, batchSize = 64, + epochs = 1, randomSamples = 1 +) + +test_that("Transformer settings work", { + testthat::expect_s3_class(object = settings, class = "modelSettings") + testthat::expect_equal(settings$fitFunction, "fitEstimator") + testthat::expect_true(length(settings$param) > 0) + testthat::expect_error(setTransformer( + numBlocks = 1, dimToken = 50, + numHeads = 7 + )) +}) + +test_that("fitEstimator with Transformer works", { + results <- fitEstimator(trainData$Train, settings, analysisId = 1) + + expect_equal(class(results), "plpModel") + expect_equal(attr(results, "modelType"), "binary") + expect_equal(attr(results, "saveType"), "file") + + # check prediction between 0 and 1 + expect_gt(min(results$prediction$value), 0) + expect_lt(max(results$prediction$value), 1) +}) + +test_that("transformer nn-module works", { + model <- Transformer( + catFeatures = 5, numFeatures = 1, numBlocks = 2, + dimToken = 16, numHeads = 2, attDropout = 0, ffnDropout = 0, + resDropout = 0, dimHidden = 32 + ) + + pars <- sum(sapply(model$parameters, function(x) prod(x$shape))) + + # expected number of parameters + expect_equal(pars, 5697) + + input <- list() + input$cat <- torch::torch_randint(0, 5, c(10, 5), dtype = torch::torch_long()) + input$num <- torch::torch_randn(10, 1, dtype = torch::torch_float32()) + + + output <- model(input) + + # output is correct shape, size of batch + expect_equal(output$shape, 10) + + input$num <- NULL + + model <- Transformer( + catFeatures = 5, numFeatures = 0, numBlocks = 2, + dimToken = 16, numHeads = 2, attDropout = 0, ffnDropout = 0, + resDropout = 0, dimHidden = 32 + ) + output <- model(input) + expect_equal(output$shape, 10) +}) diff --git a/tests/testthat/test-keras.R b/tests/testthat/test-keras.R deleted file mode 100644 index 944f5bb..0000000 --- a/tests/testthat/test-keras.R +++ /dev/null @@ -1,137 +0,0 @@ -# Copyright 2020 Observational Health Data Sciences and Informatics -# -# This file is part of PatientLevelPrediction -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -library("testthat") -context("Keras") - -deepSet <- setDeepNN(epochs = 1) - -plpResultDeep <- runPlp(population = population, - plpData = plpData, - modelSettings = deepSet, - savePlpData = F, - savePlpResult = F, - saveEvaluation = F, - savePlpPlots = F, - analysisId = 'deepTest', - saveDirectory = saveLoc) - - - -#TODO: add input checks and test these... -#options(fftempdir = getwd()) - - -test_that("deepNN working checks", { - - # check same structure - testthat::expect_equal(names(plpResultDeep), - names(plpResult)) - - # check prediction same size as pop - testthat::expect_equal(nrow(plpResultDeep$prediction), nrow(population)) - - # check prediction between 0 and 1 - testthat::expect_gte(min(plpResultDeep$prediction$value), 0) - testthat::expect_lte(max(plpResultDeep$prediction$value), 1) - -}) - - -# add temporal data: -CNN1Set <- setCovNN(epochs = 1, kernelSize = 3, batchSize =30) -plpResultCNN1 <- runPlp(population = population2, - plpData = plpData3, - modelSettings = CNN1Set, - savePlpData = F, - savePlpResult = F, - saveEvaluation = F, - savePlpPlots = F, - analysisId = 'cnn1Test', - saveDirectory = saveLoc) - -test_that("covNN working checks", { - - # check same structure - testthat::expect_equal(names(plpResultCNN1), - names(plpResult)) - - # check prediction same size as pop - testthat::expect_equal(nrow(plpResultCNN1$prediction), nrow(population2)) - - # check prediction between 0 and 1 - testthat::expect_gte(min(plpResultCNN1$prediction$value), 0) - testthat::expect_lte(max(plpResultCNN1$prediction$value), 1) - -}) - -CNN2Set <- setCovNN2(epochs = 1, kernelSize = 4, filters=4) -plpResultCNN2 <- runPlp(population = population2, - plpData = plpData3, - modelSettings = CNN2Set, - savePlpData = F, - savePlpResult = F, - saveEvaluation = F, - savePlpPlots = F, - analysisId = 'cnn1Test', - saveDirectory = saveLoc) - -test_that("covNN2 working checks", { - - # check same structure - testthat::expect_equal(names(plpResultCNN2), - names(plpResult)) - - # check prediction same size as pop - testthat::expect_equal(nrow(plpResultCNN2$prediction), nrow(population2)) - - # check prediction between 0 and 1 - testthat::expect_gte(min(plpResultCNN2$prediction$value), 0) - testthat::expect_lte(max(plpResultCNN2$prediction$value), 1) - -}) - - -if(!travis){ -CIReNNSet <- setCIReNN(epochs = 1, useVae = F, units=c(10) ) -plpResultCIReNN <- runPlp(population = population2, plpData = plpData3, - minCovariateFraction = 0.001, normalizeData = F, - modelSettings = CIReNNSet, testSplit = 'person', - testFraction = 0.25, splitSeed = 1, - nfold = 3, savePlpData = F, savePlpResult = F, - savePlpPlots = F, - saveEvaluation = F, - analysisId = 'cireNNTest', - saveDirectory = saveLoc) -} - -test_that("CIReNN working checks", { - if(travis){ - skip("Too slow for travis") - } - # check same structure - testthat::expect_equal(names(plpResultCIReNN), - names(plpResult)) - - # check prediction same size as pop - testthat::expect_equal(nrow(plpResultCIReNN$prediction), nrow(population2)) - - # check prediction between 0 and 1 - testthat::expect_gte(min(plpResultCIReNN$prediction$value), 0) - testthat::expect_lte(max(plpResultCIReNN$prediction$value), 1) - -}) - diff --git a/tests/testthat/test-torch.R b/tests/testthat/test-torch.R deleted file mode 100644 index ab732d2..0000000 --- a/tests/testthat/test-torch.R +++ /dev/null @@ -1,123 +0,0 @@ -# Copyright 2020 Observational Health Data Sciences and Informatics -# -# This file is part of PatientLevelPrediction -# -# Licensed under the Apache License, Version 
2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -library("testthat") -context("Torch") - -lrtSet <- setLRTorch(w_decay=0.0005, - epochs=1, - class_weight = 0, - autoencoder = FALSE, - vae =FALSE) - - -plpResultLrt <- runPlp(population = population, - plpData = plpData, - modelSettings = lrtSet, - savePlpData = F, - savePlpResult = F, - saveEvaluation = F, - savePlpPlots = F, - analysisId = 'lrtTest', - saveDirectory = saveLoc) - - - -#TODO: add input checks and test these... -#options(fftempdir = getwd()) - - -test_that("torch LR working checks", { - - # check same structure - testthat::expect_equal(names(plpResultLrt), - names(plpResult)) - - # check prediction same size as pop - testthat::expect_equal(nrow(plpResultLrt$prediction), nrow(population)) - - # check prediction between 0 and 1 - testthat::expect_gte(min(plpResultLrt$prediction$value), 0) - testthat::expect_lte(max(plpResultLrt$prediction$value), 1) - -}) - -mlptSet <- setMLPTorch(size = 10, - w_decay = 0.001, - epochs = 1, - autoencode = F) - - -plpResultMlpt <- runPlp(population = population, - plpData = plpData, - modelSettings = mlptSet, - savePlpData = F, - savePlpResult = F, - saveEvaluation = F, - savePlpPlots = F, - analysisId = 'mlptTest', - saveDirectory = saveLoc) - -test_that("MLP LR working checks", { - - # check same structure - testthat::expect_equal(names(plpResultMlpt), - names(plpResult)) - - # check prediction same size as pop - testthat::expect_equal(nrow(plpResultMlpt$prediction), nrow(population)) - - # check prediction between 0 and 1 - testthat::expect_gte(min(plpResultMlpt$prediction$value), 0) - testthat::expect_lte(max(plpResultMlpt$prediction$value), 1) - -}) - - -# add temporal data: -if(!travis){ -RNNTSet <- setRNNTorch(hidden_size = 1, - epochs =1) -plpResultRNNT <- runPlp(population = population2, - plpData = plpData3, - modelSettings = RNNTSet, - savePlpData = F, - savePlpResult = F, - saveEvaluation = F, - savePlpPlots = F, - analysisId = 'rnntTest', - saveDirectory = saveLoc) -} - -test_that("RNN Torch working checks", { - if(travis){ - skip("Too slow for travis") - } - # check same structure - testthat::expect_equal(names(plpResultRNNT), - names(plpResult)) - - # check prediction same size as pop - testthat::expect_equal(nrow(plpResultRNNT$prediction), nrow(population2)) - - # check prediction between 0 and 1 - testthat::expect_gte(min(plpResultRNNT$prediction$value), 0) - testthat::expect_lte(max(plpResultRNNT$prediction$value), 1) - -}) - -# add CNN when it is fixed - diff --git a/vignettes/.gitignore b/vignettes/.gitignore new file mode 100644 index 0000000..397b4a7 --- /dev/null +++ b/vignettes/.gitignore @@ -0,0 +1 @@ +*.log diff --git a/vignettes/BuildingDeepModels.Rmd b/vignettes/BuildingDeepModels.Rmd new file mode 100644 index 0000000..8ac9925 --- /dev/null +++ b/vignettes/BuildingDeepModels.Rmd @@ -0,0 +1,498 @@ +--- +title: "Building Deep Learning Models" +author: "Jenna Reps, Egill Fridgeirsson, Chungsoo Kim, Henrik John, Seng Chan You, Xiaoyong Pan" +date: '`r Sys.Date()`' +header-includes: + - \usepackage{fancyhdr} + - 
\pagestyle{fancy}
+ - \fancyhead{}
+ - \fancyfoot[LE,RO]{\thepage}
+ - \renewcommand{\headrulewidth}{0.4pt}
+ - \renewcommand{\footrulewidth}{0.4pt}
+ - \fancyfoot[CO,CE]{PatientLevelPrediction Package Version `r utils::packageVersion("PatientLevelPrediction")`}
+ - \fancyfoot[CO,CE]{DeepPatientLevelPrediction Package Version `r utils::packageVersion("DeepPatientLevelPrediction")`}
+output:
+  pdf_document:
+    includes:
+      in_header: preamble.tex
+    number_sections: yes
+    toc: yes
+  word_document:
+    toc: yes
+  html_document:
+    number_sections: yes
+    toc: yes
+editor_options:
+  markdown:
+    wrap: 72
+---
+
+```{=html}
+
+```
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+```
+
+# Introduction
+
+## DeepPatientLevelPrediction
+
+Patient level prediction aims to use historic data to learn a function
+between an input (a patient's features such as age/gender/comorbidities
+at index) and an output (whether the patient experienced an outcome
+during some time-at-risk). Deep learning is an example of the current
+state-of-the-art classifiers that can be implemented to learn the
+function between inputs and outputs.
+
+Deep Learning models are widely used to automatically learn high-level
+feature representations from the data, and have achieved remarkable
+results in image processing, speech recognition and computational
+biology. Recently, interesting results have been shown using large
+observational healthcare data (e.g., electronic healthcare data or
+claims data), but more extensive research is needed to assess the power
+of Deep Learning in this domain.
+
+This vignette describes how you can use the Observational Health Data
+Sciences and Informatics (OHDSI)
+[`PatientLevelPrediction`](http://github.com/OHDSI/PatientLevelPrediction)
+package and
+[`DeepPatientLevelPrediction`](http://github.com/OHDSI/DeepPatientLevelPrediction)
+package to build Deep Learning models. This vignette assumes you have
+read and are comfortable with building patient level prediction models
+as described in the [`BuildingPredictiveModels`
+vignette](https://github.com/OHDSI/PatientLevelPrediction/blob/main/inst/doc/BuildingPredictiveModels.pdf).
+Furthermore, this vignette assumes you are familiar with Deep Learning
+methods.
+
+## Background
+
+Deep Learning models are built by stacking an often large number of
+neural network layers that perform feature engineering steps, e.g.
+embedding, and are collapsed in a final linear layer (equivalent to
+logistic regression). These algorithms need a lot of data to converge to
+a good representation, and the large observational healthcare databases
+are growing fast, which makes Deep Learning an interesting approach to
+test within OHDSI's
+[Patient-Level Prediction
+Framework](https://academic.oup.com/jamia/article/25/8/969/4989437). The
+current implementation allows us to perform research at scale on the
+value and limitations of Deep Learning using observational healthcare
+data.
+
+In the package we have used
+[torch](https://cran.r-project.org/web/packages/torch/index.html) but we
+invite the community to add other backends.
+
+Many network architectures have recently been proposed and we have
+implemented a number of them; this list will grow in the near
+future. It is important to understand that some of these architectures
+require a 2D data matrix, i.e. \|patient\|x\|feature\|, and others use a
+3D data matrix \|patient\|x\|feature\|x\|time\|. The
+[FeatureExtraction Package](https://github.com/OHDSI/FeatureExtraction)
+has been extended to enable the extraction of both data formats as will
+be described with examples below.
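+
+For illustration, the two formats can be extracted with covariate settings
+along the following lines (the argument values below are only an example,
+mirroring the settings used in this package's test setup):
+
+```{r, eval=FALSE}
+# 2D (non-temporal) data: one value per patient and feature
+covSettings <- FeatureExtraction::createCovariateSettings(
+  useDemographicsGender = TRUE,
+  useDemographicsAgeGroup = TRUE,
+  useConditionGroupEraLongTerm = TRUE,
+  useDrugEraStartLongTerm = TRUE,
+  endDays = -1
+)
+
+# 3D (temporal) data: one value per patient, feature and time interval
+covSettingsTemporal <- FeatureExtraction::createTemporalSequenceCovariateSettings(
+  useDemographicsGender = TRUE,
+  useDemographicsAgeGroup = TRUE,
+  useConditionEraGroupStart = TRUE,
+  useDrugEraStart = TRUE,
+  timePart = 'month',
+  timeInterval = 1,
+  sequenceStartDay = -365 * 5,
+  sequenceEndDay = -1
+)
+```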
+
+Note that training Deep Learning models is computationally intensive;
+our implementation therefore supports both GPU and CPU. The package will
+automatically check whether a GPU is available on your computer. A GPU
+is highly recommended for Deep Learning!
+
+## Requirements
+
+Full details about the package requirements and instructions on
+installing the package can be found
+[here](https://ohdsi.github.io/DeepPatientLevelPrediction/articles/Installing.html).
+
+## Integration with PatientLevelPrediction
+
+The `DeepPatientLevelPrediction` package provides additional model
+settings that can be used within the `PatientLevelPrediction` package
+`runPlp()` function. To use both packages you first need to pick the
+deep learning architecture you wish to fit (see below) and then you
+specify this as the `modelSettings` inside `runPlp()`.
+
+```{r, eval=FALSE}
+
+# load the data
+plpData <- PatientLevelPrediction::loadPlpData('locationOfData')
+
+# pick the set from DeepPatientLevelPrediction
+deepLearningModel <- DeepPatientLevelPrediction::setResNet()
+
+# use PatientLevelPrediction to fit model
+deepLearningResult <- PatientLevelPrediction::runPlp(
+  plpData = plpData,
+  outcomeId = 1230,
+  modelSettings = deepLearningModel,
+  analysisId = 'resNetTorch',
+  ...
+)
+
+```
+
+# Non-Temporal Architectures
+
+We implemented the following non-temporal (2D data matrix)
+architectures:
+
+## Simple MLP
+
+### Overall concept
+
+A multilayer perceptron (MLP) model is a directed graph consisting of an
+input layer, one or more hidden layers and an output layer. The model
+takes in the input feature values and feeds these forward through the
+graph to determine the output class. A process known as
+'backpropagation' is used to train the model. Backpropagation requires
+labelled data and involves automatically calculating the derivative of
+the model parameters with respect to the error between the model's
+predictions and the ground truth. The model then learns how to adjust
+its parameters to reduce the error.
+
+### Example
+
+#### Set Function
+
+To use the package to fit an MLP model you can use the `setMultiLayerPerceptron()`
+function to specify the hyperparameter settings for the MLP.
+
+#### Inputs
+
+The `numLayers` and `sizeHidden` inputs define the network topology via the number
+of layers and the number of neurons in each of the network's hidden layers.
+
+The `dropout` input specifies the probability that a layer
+randomly sets some of its inputs to 0 at each step during training time. A
+value of `0.2` means that 20% of the layer's inputs will be set to
+0. This is used to reduce overfitting.
+
+The `sizeEmbedding` input specifies the size of the embedding used. The first
+layer is an embedding layer which converts each sparse feature to a dense vector
+which it learns. An embedding is a lower dimensional projection of the features
+where distance between points is a measure of similarity.
+
+The `weightDecay` input corresponds to the weight decay in the objective
+function. During model fitting the aim is to minimize the objective
+function. The objective function is made up of the prediction error (the
+difference between the prediction and the truth) plus the square of the
+weights multiplied by the weight decay. The larger the weight decay, the
+more you penalize having large weights. If you set the weight decay too
+large, the model will never fit well enough; if you set it too low, you
+need to be careful of overfitting (so try to stop model fitting
+earlier).
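+
+Informally, the objective that is minimized during fitting can be written as
+
+$$\text{objective}(w) = \text{error}\big(\text{predictions}(w), \text{truth}\big) + \lambda \sum_j w_j^2,$$
+
+where $w$ are the network weights and $\lambda$ corresponds to the
+`weightDecay` input.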
+
+The `learningRate` input is the learning rate, a hyperparameter that
+controls how much to change the model in response to the estimated error
+each time the model weights are updated. The smaller the `learningRate`, the longer
+it will take to fit the model and the model weights may get stuck, but
+if the `learningRate` is too large, the weights may converge too fast to a
+sub-optimal solution.
+
+The `seed` lets the user use the same random initialization of the network's
+weights as a previous run.
+
+The `hyperParamSearch` chooses the strategy to find the best hyperparameters.
+Currently a random search and grid search are supported. Grid search searches
+every possible combination of hyperparameters while random search samples
+randomly from the combinations. Since neural networks can be very flexible and
+have many hyperparameter combinations, it's almost never feasible to do a full
+grid search unless the network is really small.
+
+The `randomSample` chooses how many random samples to use.
+
+The `device` specifies what device to use, either `cpu` or `cuda`; if you
+have multiple GPUs, use `cuda:x` where x is the GPU number as seen in `nvidia-smi`.
+
+The `batchSize` corresponds to the number of data points (patients)
+used per iteration to estimate the network error during model fitting.
+
+The `epochs` corresponds to how many times to run through the entire
+training data while fitting the model.
+
+#### Example Code
+
+For example, the following code will try 10 different network
+configurations sampled from the possible combinations given and pick the one
+that obtains the greatest AUROC via cross validation in the training data and
+then fit the model with that configuration using all the training data. The
+standard output of `runPlp()` will be returned - this contains the MLP model
+along with the performance details and settings. Note that there are
+2 x 2 x 2 x 2 = 16 possible combinations, but setting `randomSample = 10`
+means only 10 of them are tried.
+
+```{r, eval=FALSE}
+
+modelSettings <- setMultiLayerPerceptron(
+  numLayers = c(3, 5),
+  sizeHidden = c(64, 128),
+  dropout = c(0.2),
+  sizeEmbedding = c(32, 64),
+  learningRate = c(1e-3, 1e-4),
+  weightDecay = c(1e-5),
+  randomSample = 10,
+  batchSize = c(100),
+  epochs = c(5),
+  seed = 12
+)
+
+mlpResult <- PatientLevelPrediction::runPlp(
+  plpData = plpData,
+  outcomeId = 3,
+  modelSettings = modelSettings,
+  analysisId = 'MLP',
+  analysisName = 'Testing Deep Learning',
+  populationSettings = populationSet,
+  splitSettings = PatientLevelPrediction::createDefaultSplitSetting(),
+  sampleSettings = PatientLevelPrediction::createSampleSettings(), # none
+  featureEngineeringSettings = PatientLevelPrediction::createFeatureEngineeringSettings(), # none
+  preprocessSettings = PatientLevelPrediction::createPreprocessSettings(),
+  executeSettings = PatientLevelPrediction::createExecuteSettings(
+    runSplitData = T,
+    runSampleData = F,
+    runfeatureEngineering = F,
+    runPreprocessData = T,
+    runModelDevelopment = T,
+    runCovariateSummary = F
+  ),
+  saveDirectory = file.path(getwd(), 'MLP') # change to save elsewhere
+)
+
+```
+
+## ResNet
+
+### Overall concept
+
+Deep learning models are often trained via a process known as gradient
+descent during backpropagation. During this process the network weights
+are updated based on the gradient of the error function for the current
+weights. However, as the number of layers in the network increases, there
+is a greater chance of experiencing an issue known as the vanishing or
+exploding gradient during this process. The vanishing or exploding
+gradient is when the gradient goes to 0 or infinity, which negatively
+impacts the model fitting.
+
+The residual network (ResNet) was introduced to address the vanishing or
+exploding gradient issue. It works by adding connections between
+non-adjacent layers, termed 'skip connections'. Using some form of
+regularization with these 'skip connections' enables the network to
+ignore any problematic layer that results from gradient issues.
+
+The ResNet calculates embeddings for every feature and then averages
+them to compute an embedding per patient.
+
+This implementation of a ResNet for tabular data is based on [this
+paper](https://arxiv.org/abs/2106.11959).
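+
+To make the skip connection idea concrete, a single residual layer can be
+sketched with `torch` roughly as follows. This is a conceptual sketch only;
+the actual layer used by `setResNet()` may differ in details such as where
+normalization and dropout are applied:
+
+```{r, eval=FALSE}
+library(torch)
+
+# conceptual sketch of one residual layer, not the exact module in this package
+resLayer <- nn_module(
+  "ResLayer",
+  initialize = function(sizeHidden, hiddenFactor, hiddenDropout, residualDropout) {
+    self$norm <- nn_batch_norm1d(sizeHidden)
+    # the hidden part first grows and then shrinks the representation
+    self$linear1 <- nn_linear(sizeHidden, sizeHidden * hiddenFactor)
+    self$linear2 <- nn_linear(sizeHidden * hiddenFactor, sizeHidden)
+    self$hiddenDropout <- nn_dropout(hiddenDropout)
+    self$residualDropout <- nn_dropout(residualDropout)
+  },
+  forward = function(x) {
+    z <- self$norm(x)
+    z <- self$hiddenDropout(torch_relu(self$linear1(z)))
+    z <- self$residualDropout(self$linear2(z))
+    x + z # skip connection: the input is added back to the transformed output
+  }
+)
+
+# a layer like this keeps input and output dimensions equal, so several can be stacked
+layer <- resLayer(sizeHidden = 32, hiddenFactor = 2,
+                  hiddenDropout = 0.1, residualDropout = 0.1)
+out <- layer(torch_randn(10, 32))
+```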
However, as the number of layers in the network increases, there
+is a greater chance of experiencing an issue known as the vanishing or
+exploding gradient during this process. The vanishing or exploding
+gradient is when the gradient goes to 0 or infinity, which negatively
+impacts the model fitting.
+
+The residual network (ResNet) was introduced to address the vanishing or
+exploding gradient issue. It works by adding connections between
+non-adjacent layers, termed 'skip connections'. Using some form of
+regularization with these 'skip connections' enables the network to
+ignore any problematic layers that arise due to gradient issues.
+
+The ResNet calculates embeddings for every feature and then averages
+them to compute an embedding per patient.
+
+This implementation of a ResNet for tabular data is based on [this
+paper](https://arxiv.org/abs/2106.11959).
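+
+To make the idea of a skip connection concrete, below is a minimal,
+simplified sketch of a residual block written with torch for R. The module
+name `res_block` and its single linear layer are illustrative assumptions and
+do not reflect the package's internal implementation; the key point is that
+the block's input is added back to its output, giving gradients a direct path
+around the transformation.
+
+```{r, eval=FALSE}
+library(torch)
+
+res_block <- nn_module(
+  "ResBlockSketch",
+  initialize = function(sizeHidden, dropout = 0.1) {
+    self$norm <- nn_batch_norm1d(sizeHidden)       # normalize the layer inputs
+    self$linear <- nn_linear(sizeHidden, sizeHidden)
+    self$dropout <- nn_dropout(p = dropout)
+  },
+  forward = function(x) {
+    # 'x +' is the skip connection: the untouched input bypasses the layer
+    x + self$dropout(self$linear(nnf_relu(self$norm(x))))
+  }
+)
+
+block <- res_block(sizeHidden = 8)
+block(torch_randn(4, 8))   # 4 patients, 8 hidden features in and out
+```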
+
+### Example
+
+#### Set Function
+
+To use the package to fit a ResNet model you can use the `setResNet()`
+function to specify the hyperparameter settings for the network.
+
+#### Inputs
+
+##### Model inputs:
+
+`numLayers`: How many layers to use in the model.
+
+`sizeHidden`: How many neurons in each hidden layer
+
+`hiddenFactor`: By what factor to increase the number of neurons within each layer
+
+`residualDropout` and `hiddenDropout`: How much dropout to apply in the
+hidden layer or residual connection
+
+`sizeEmbedding`: The size of the initial embedding layer
+
+##### Training process inputs:
+
+`weightDecay`: How much weight decay to apply, which penalizes bigger
+weights
+
+`learningRate`: Which learning rate to use
+
+`seed`: Use a seed for reproducibility
+
+`device`: Which device to use, such as a cpu or a gpu
+
+`batchSize`: Size of the batch of data used per iteration during training
+
+`epochs`: How many passes through the training data
+
+##### Hyperparameter tuning inputs:
+
+`hyperParamSearch`: Which type of hyperparameter search to use, either
+random sampling or exhaustive (grid) search
+
+`randomSample`: If doing a random search for hyperparameters, how many
+random samples to use
+
+#### Example Code
+
+For example, the following code will fit a two-layer ResNet where each
+layer has 32 neurons, which increases by a factor of two before
+decreasing again (hiddenFactor). 10% of inputs to each layer and
+residual connection within the layer are randomly zeroed. The embedding
+layer has 32 neurons. A learning rate of 3e-4 with a weight decay of 1e-6
+is used for the optimizer. No hyperparameter search is done since each
+input only includes one option.
+
+```{r, eval=FALSE}
+
+resset <- setResNet(
+  numLayers = c(2),
+  sizeHidden = c(32),
+  hiddenFactor = c(2),
+  residualDropout = c(0.1),
+  hiddenDropout = c(0.1),
+  sizeEmbedding = c(32),
+  weightDecay = c(1e-6),
+  learningRate = c(3e-4),
+  seed = 42,
+  hyperParamSearch = 'random',
+  randomSample = 1,
+  #device='cuda:0', # uncomment to use GPU
+  batchSize = 128,
+  epochs = 3
+)
+
+resResult <- PatientLevelPrediction::runPlp(
+    plpData = plpData,
+    outcomeId = 3,
+    modelSettings = resset,
+    analysisId = 'ResNet',
+    analysisName = 'Testing ResNet',
+    populationSettings = populationSet,
+    splitSettings = PatientLevelPrediction::createDefaultSplitSetting(),
+    sampleSettings = PatientLevelPrediction::createSampleSettings(),
+    featureEngineeringSettings = PatientLevelPrediction::createFeatureEngineeringSettings(),
+    preprocessSettings = PatientLevelPrediction::createPreprocessSettings(),
+    executeSettings = PatientLevelPrediction::createExecuteSettings(
+      runSplitData = T,
+      runSampleData = F,
+      runfeatureEngineering = F,
+      runPreprocessData = T,
+      runModelDevelopment = T,
+      runCovariateSummary = F
+    ),
+    saveDirectory = file.path(getwd(), 'ResNet') # change to save elsewhere
+  )
+
+```
+
+## Transformer
+
+### Overall concept
+
+Recently there has been a surge of models in natural language processing
+and computer vision that utilize attention. This is a technique where
+the model learns where to look and what to focus on in the input data.
+This was first described in the 'Attention Is All You Need'
+[paper](https://arxiv.org/abs/1706.03762). Here we have used an
+implementation that has shown good performance on non-temporal tabular
+data from this [paper](https://arxiv.org/abs/2106.11959).
+
+This architecture is computationally expensive and scales badly with
+longer sequence lengths. In this case the sequence is the number of
+features each patient has. Users need to be aware of how many features
+they are feeding to the model, since this heavily affects the computation
+time. This is something you control in `FeatureExtraction` when
+you create your covariate settings.
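+
+To see where this cost comes from, below is a minimal sketch of scaled
+dot-product attention over one patient's features using torch for R. The
+variable names and sizes (`nFeatures`, `dimToken`) are illustrative
+assumptions, not the package's internals; the point is that the score matrix
+is `nFeatures` x `nFeatures`, so doubling the number of features per patient
+roughly quadruples this part of the computation.
+
+```{r, eval=FALSE}
+library(torch)
+
+nFeatures <- 50   # non-zero covariates for one patient
+dimToken <- 32    # embedding dimension per feature
+
+q <- torch_randn(nFeatures, dimToken)  # queries
+k <- torch_randn(nFeatures, dimToken)  # keys
+v <- torch_randn(nFeatures, dimToken)  # values
+
+scores <- torch_matmul(q, k$t()) / sqrt(dimToken)  # nFeatures x nFeatures
+weights <- nnf_softmax(scores, dim = 2)            # each row sums to 1
+out <- torch_matmul(weights, v)                    # attended representations
+```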
+
+### Examples
+
+#### Set Function
+
+To use the package to fit a Transformer model you can use the
+`setTransformer()` function to specify the hyperparameter settings for
+the network.
+
+#### Inputs
+
+The training and hyperparameter tuning inputs are the same as for the
+ResNet.
+
+##### Model inputs:
+
+`numBlocks`: How many Transformer blocks to use, each block includes a
+self-attention layer and a feedforward block with two linear layers.
+
+`dimToken`: Dimension of the embedding for each feature
+
+`dimOut`: Dimension of the output; for binary problems this is 1.
+
+`numHeads`: Number of attention heads for the self-attention
+
+`attDropout`, `ffnDropout` and `resDropout`: How much dropout to apply
+on the attention, in the feedforward block, or in the residual connections
+
+`dimHidden`: How many neurons in the linear layers inside the feedforward
+block
+
+#### Example Code
+
+```{r, eval=FALSE}
+
+modelSettings <- setTransformer(numBlocks = 3,
+                                dimToken = 32,
+                                dimOut = 1,
+                                numHeads = 4,
+                                attDropout = 0.25,
+                                ffnDropout = 0.25,
+                                resDropout = 0,
+                                dimHidden = 128,
+                                weightDecay = 1e-06,
+                                learningRate = 3e-04,
+                                batchSize = 128,
+                                epochs = 10,
+                                device = 'cpu', # or 'cuda' for GPU
+                                randomSample = 1)
+
+
+TransformerResult <- PatientLevelPrediction::runPlp(
+    plpData = plpData,
+    outcomeId = 3,
+    modelSettings = modelSettings,
+    analysisId = 'Transformer',
+    analysisName = 'Testing transformer',
+    populationSettings = populationSet,
+    splitSettings = PatientLevelPrediction::createDefaultSplitSetting(),
+    sampleSettings = PatientLevelPrediction::createSampleSettings(), # none
+    featureEngineeringSettings = PatientLevelPrediction::createFeatureEngineeringSettings(), # none
+    preprocessSettings = PatientLevelPrediction::createPreprocessSettings(),
+    executeSettings = PatientLevelPrediction::createExecuteSettings(
+      runSplitData = T,
+      runSampleData = F,
+      runfeatureEngineering = F,
+      runPreprocessData = T,
+      runModelDevelopment = T,
+      runCovariateSummary = F
+    ),
+    saveDirectory = file.path(getwd(), 'Transformer') # change to save elsewhere
+  )
+```
+
+# Acknowledgments
+
+Considerable work has been dedicated to providing the
+`DeepPatientLevelPrediction` package.
+
+```{r tidy=TRUE,eval=TRUE}
+citation("DeepPatientLevelPrediction")
+```
+
+**Please reference this paper if you use the PLP Package in your work:**
+
+[Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and
+implementation of a standardized framework to generate and evaluate
+patient-level prediction models using observational healthcare data. J
+Am Med Inform Assoc.
+2018;25(8):969-975.](http://dx.doi.org/10.1093/jamia/ocy032)
diff --git a/vignettes/Installing.Rmd b/vignettes/Installing.Rmd
new file mode 100644
index 0000000..7946f76
--- /dev/null
+++ b/vignettes/Installing.Rmd
@@ -0,0 +1,149 @@
+---
+title: "DeepPatientLevelPrediction Installation Guide"
+author: "Egill Fridgeirsson"
+date: '`r Sys.Date()`'
+header-includes:
+  - \usepackage{fancyhdr}
+  - \pagestyle{fancy}
+  - \fancyhead{}
+  - \fancyfoot[CO,CE]{PatientLevelPrediction Package Version `r utils::packageVersion("PatientLevelPrediction")`}
+  - \fancyfoot[CO,CE]{DeepPatientLevelPrediction Package Version `r utils::packageVersion("DeepPatientLevelPrediction")`}
+  - \fancyfoot[LE,RO]{\thepage}
+  - \renewcommand{\headrulewidth}{0.4pt}
+  - \renewcommand{\footrulewidth}{0.4pt}
+output:
+  pdf_document:
+    includes:
+      in_header: preamble.tex
+    number_sections: yes
+    toc: yes
+  word_document:
+    toc: yes
+  html_document:
+    number_sections: yes
+    toc: yes
+---
+
+```{=html}
+
+```
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+```
+
+# Introduction
+
+This vignette describes how to install the Observational Health Data Sciences and Informatics (OHDSI) DeepPatientLevelPrediction package under Windows, Mac and Linux.
+
+# Software Prerequisites
+
+## Windows Users
+
+Under Windows the OHDSI Deep Patient Level Prediction (DeepPLP) package requires installing:
+
+- R ( ) - (R \>= 3.5.0, but latest is recommended)
+- RStudio ( )
+- Java ( )
+- RTools ( )
+
+## Mac/Linux Users
+
+Under Mac and Linux the OHDSI DeepPLP package requires installing:
+
+- R ( ) - (R \>= 3.3.0, but latest is recommended)
+- RStudio ( )
+- Java ( )
+- Xcode command line tools (run in terminal: `xcode-select --install`) [MAC USERS ONLY]
+
+# Installing the Package
+
+The preferred way to install the package is by using `remotes`, which will automatically install the latest release and all the latest dependencies.
+
+If you do not want the official release, you can install the bleeding edge version of the package (latest develop branch).
+
+Note that the latest develop branch could contain bugs; please report them to us if you experience problems.
+
+## Installing DeepPatientLevelPrediction using remotes
+
+To install using `remotes` run:
+
+```{r, echo = TRUE, message = FALSE, warning = FALSE,tidy=FALSE,eval=FALSE}
+install.packages("remotes")
+remotes::install_github("OHDSI/FeatureExtraction")
+remotes::install_github("OHDSI/PatientLevelPrediction")
+remotes::install_github("OHDSI/DeepPatientLevelPrediction")
+```
+
+DeepPLP relies on [torch for R](https://torch.mlverse.org/). When torch is installed, the user
+will be prompted to download the libtorch and lantern binaries. These binaries are necessary
+for the package to run.
+
+If you are using DeepPLP in an offline environment, the function `torch::install_torch_from_file()` can be used. This first requires downloading the correct binaries and moving them to the offline environment. See the [torch installation guide](https://torch.mlverse.org/docs/articles/installation.html) for more detailed instructions.
+
+When installing, make sure to close any other RStudio sessions that are using `DeepPatientLevelPrediction` or any of its dependencies. Keeping RStudio sessions open can cause locks that prevent the package from installing.
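+
+After installation, you can quickly verify that torch and its binaries are set
+up correctly before fitting any DeepPLP models. This is an optional check (not
+part of the package itself) using functions from the torch package:
+
+```{r, echo = TRUE, message = FALSE, warning = FALSE,tidy=FALSE,eval=FALSE}
+# TRUE once the libtorch and lantern binaries have been downloaded
+torch::torch_is_installed()
+
+# TRUE if a CUDA-capable GPU is visible to torch (optional, CPU also works)
+torch::cuda_is_available()
+```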
+
+# Testing Installation
+
+```{r, echo = TRUE, message = FALSE, warning = FALSE,tidy=FALSE,eval=FALSE}
+library(PatientLevelPrediction)
+library(DeepPatientLevelPrediction)
+
+data(plpDataSimulationProfile)
+sampleSize <- 1e4
+plpData <- simulatePlpData(
+  plpDataSimulationProfile,
+  n = sampleSize
+)
+
+populationSettings <- PatientLevelPrediction::createStudyPopulationSettings(
+  requireTimeAtRisk = F,
+  riskWindowStart = 1,
+  riskWindowEnd = 365)
+# a very simple ResNet
+modelSettings <- setResNet(numLayers = 2,
+                           sizeHidden = 64,
+                           hiddenFactor = 1,
+                           residualDropout = 0,
+                           hiddenDropout = 0.2,
+                           sizeEmbedding = 64,
+                           weightDecay = 1e-6,
+                           learningRate = 3e-4,
+                           seed = 42,
+                           hyperParamSearch = 'random',
+                           randomSample = 1, device = 'cpu', batchSize = 128,
+                           epochs = 3)
+
+plpResults <- PatientLevelPrediction::runPlp(plpData = plpData,
+                                             outcomeId = 3,
+                                             modelSettings = modelSettings,
+                                             analysisId = 'Test',
+                                             analysisName = 'Testing DeepPlp',
+                                             populationSettings = populationSettings,
+                                             splitSettings = createDefaultSplitSetting(),
+                                             sampleSettings = createSampleSettings(),
+                                             featureEngineeringSettings = createFeatureEngineeringSettings(),
+                                             preprocessSettings = createPreprocessSettings(),
+                                             logSettings = createLogSettings(),
+                                             executeSettings = createExecuteSettings(runSplitData = T,
+                                                                                     runSampleData = F,
+                                                                                     runfeatureEngineering = F,
+                                                                                     runPreprocessData = T,
+                                                                                     runModelDevelopment = T,
+                                                                                     runCovariateSummary = T
+                                             ))
+```
+
+# Acknowledgments
+
+Considerable work has been dedicated to providing the `DeepPatientLevelPrediction` package.
+
+```{r tidy=TRUE,eval=TRUE}
+citation("DeepPatientLevelPrediction")
+```
+
+**Please reference this paper if you use the PLP Package in your work:**
+
+[Reps JM, Schuemie MJ, Suchard MA, Ryan PB, Rijnbeek PR. Design and implementation of a standardized framework to generate and evaluate patient-level prediction models using observational healthcare data. J Am Med Inform Assoc. 2018;25(8):969-975.](http://dx.doi.org/10.1093/jamia/ocy032)
\ No newline at end of file
diff --git a/vignettes/preamble.tex b/vignettes/preamble.tex
new file mode 100644
index 0000000..2040267
--- /dev/null
+++ b/vignettes/preamble.tex
@@ -0,0 +1,8 @@
+\usepackage{float}
+\let\origfigure\figure
+\let\endorigfigure\endfigure
+\renewenvironment{figure}[1][2] {
+    \expandafter\origfigure\expandafter[H]
+} {
+    \endorigfigure
+}
\ No newline at end of file