Skip to content

Commit

Permalink
v5.6
Browse files Browse the repository at this point in the history
  • Loading branch information
nanxstats committed Nov 26, 2016
1 parent 106dd5f commit 7888238
Show file tree
Hide file tree
Showing 32 changed files with 367 additions and 197 deletions.
7 changes: 5 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
Package: enpls
Type: Package
Title: Ensemble Partial Least Squares Regression
Version: 5.0
Version: 5.6
Author: Nan Xiao <[email protected]>, Dong-Sheng Cao,
Miao-Zhu Li <[email protected]>, Qing-Song Xu <[email protected]>
Maintainer: Nan Xiao <[email protected]>
Description: An algorithmic framework for measuring feature importance, outlier detection, model applicability domain evaluation, and ensemble predictive modeling with (sparse) partial least squares regressions.
Description: An algorithmic framework for measuring feature importance,
outlier detection, model applicability domain evaluation,
and ensemble predictive modeling with (sparse)
partial least squares regressions.
License: GPL (>= 2)
URL: http://enpls.org
BugReports: https://github.com/road2stat/enpls/issues
Expand Down
10 changes: 10 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,13 @@
# CHANGES IN enpls VERSION 5.6 (2016-11-25)

## NEW FEATURES

* New argument `cvfolds` now available in all applicable functions for finer control of cross-validation folds in automatic parameter selection of each PLS/SPLS model.

## IMPROVEMENTS

* Critical implementation improvements for handling the case where the argument `maxcomp = NULL` (maximum number of components not specified explicitly) in `enpls.` functions. The maximum number of components to use is now correctly determined, considering both cross-validation and special cases such as n < p. Thanks to Dr. You-Wu Lin for the feedback.

# CHANGES IN enpls VERSION 5.0 (2016-10-20)

## NEW FEATURES
Expand Down
22 changes: 11 additions & 11 deletions R/cv.enpls.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@
#'
#' @param x Predictor matrix.
#' @param y Response vector.
#' @param nfolds Number of folds, default is \code{5}.
#' @param nfolds Number of cross-validation folds, default is \code{5}.
#' Note that this controls the CV folds for the ensemble PLS model,
#' not for the individual PLS models. To control the CV folds for
#' single PLS models, please use the argument \code{cvfolds}.
#' @param verbose Shall we print out the progress of cross-validation?
#' @param ... Arguments to be passed to \code{\link{enpls.fit}}.
#'
Expand Down Expand Up @@ -62,18 +65,15 @@ cv.enpls = function(x, y, nfolds = 5L, verbose = TRUE, ...) {
colnames(ypred) = c('y.real', 'y.pred')

residual = ypred[, 1L] - ypred[, 2L]
# RMSE
RMSE = sqrt(mean((residual)^2, na.rm = TRUE))
# MAE (Mean Absolute Error)
MAE = mean(abs(residual), na.rm = TRUE)
# R-square
Rsquare = 1L - (sum((residual)^2, na.rm = TRUE)/sum((y - mean(y))^2))
RMSE = sqrt(mean((residual)^2, na.rm = TRUE))
MAE = mean(abs(residual), na.rm = TRUE)
Rsquare = 1L - (sum((residual)^2, na.rm = TRUE)/sum((y - mean(y))^2))

object = list('ypred' = ypred,
object = list('ypred' = ypred,
'residual' = residual,
'RMSE' = RMSE,
'MAE' = MAE,
'Rsquare' = Rsquare)
'RMSE' = RMSE,
'MAE' = MAE,
'Rsquare' = Rsquare)
class(object) = 'cv.enpls'
return(object)

Expand Down
22 changes: 11 additions & 11 deletions R/cv.enspls.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@
#'
#' @param x Predictor matrix.
#' @param y Response vector.
#' @param nfolds Number of folds, default is \code{5}.
#' @param nfolds Number of cross-validation folds, default is \code{5}.
#' Note that this controls the CV folds for the ensemble sparse PLS model,
#' not for the individual sparse PLS models. To control the CV folds for
#' single sparse PLS models, please use the argument \code{cvfolds}.
#' @param verbose Shall we print out the progress of cross-validation?
#' @param ... Arguments to be passed to \code{\link{enspls.fit}}.
#'
Expand Down Expand Up @@ -64,18 +67,15 @@ cv.enspls = function(x, y, nfolds = 5L, verbose = TRUE, ...) {
colnames(ypred) = c('y.real', 'y.pred')

residual = ypred[, 1L] - ypred[, 2L]
# RMSE
RMSE = sqrt(mean((residual)^2, na.rm = TRUE))
# MAE (Mean Absolute Error)
MAE = mean(abs(residual), na.rm = TRUE)
# R-square
Rsquare = 1L - (sum((residual)^2, na.rm = TRUE)/sum((y - mean(y))^2))
RMSE = sqrt(mean((residual)^2, na.rm = TRUE))
MAE = mean(abs(residual), na.rm = TRUE)
Rsquare = 1L - (sum((residual)^2, na.rm = TRUE)/sum((y - mean(y))^2))

object = list('ypred' = ypred,
object = list('ypred' = ypred,
'residual' = residual,
'RMSE' = RMSE,
'MAE' = MAE,
'Rsquare' = Rsquare)
'RMSE' = RMSE,
'MAE' = MAE,
'Rsquare' = Rsquare)
class(object) = 'cv.enspls'
return(object)

Expand Down
8 changes: 5 additions & 3 deletions R/enpls-package.R
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
#' Ensemble Partial Least Squares Regression
#'
#' The enpls package provides functions for running the
#' ensemble partial least squares regression.
#' enpls offers an algorithmic framework for measuring feature importance,
#' outlier detection, model applicability evaluation, and
#' ensemble predictive modeling with (sparse)
#' partial least squares regressions.
#'
#' The package vignette can be opened with \code{vignette("enpls")}.
#' See the vignette via \code{vignette("enpls")}.
#'
#' \tabular{ll}{ Package: \tab enpls\cr
#' Type: \tab Package\cr
Expand Down
61 changes: 39 additions & 22 deletions R/enpls.ad.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,10 @@
#' @param ytest List, with the i-th component being the i-th test set's
#' response vector (see example code below).
#' @param maxcomp Maximum number of components included within each model.
#' If not specified, will use the variable (column) numbers in \code{x}.
#' If not specified, will use the maximum number possible (considering
#' cross-validation and special cases where n is smaller than p).
#' @param cvfolds Number of cross-validation folds used in each model
#' for automatic parameter selection, default is \code{5}.
#' @param space Space in which to apply the resampling method.
#' Can be the sample space (\code{"sample"}) or
#' the variable space (\code{"variable"}).
Expand Down Expand Up @@ -79,11 +82,12 @@

enpls.ad = function(x, y,
xtest, ytest,
maxcomp = NULL,
space = c('sample', 'variable'),
method = c('mc', 'boot'),
maxcomp = NULL,
cvfolds = 5L,
space = c('sample', 'variable'),
method = c('mc', 'boot'),
reptimes = 500L,
ratio = 0.8,
ratio = 0.8,
parallel = 1L) {

if (missing(x) | missing(y) | missing(xtest) | missing(ytest))
Expand All @@ -105,8 +109,6 @@ enpls.ad = function(x, y,

if (space == 'sample') {

if (is.null(maxcomp)) maxcomp = ncol(x)

idx.row = vector('list', reptimes)

if (method == 'boot') {
Expand All @@ -129,7 +131,7 @@ enpls.ad = function(x, y,
for (i in 1L:reptimes) {
plsdf.tr = as.data.frame(cbind(x[idx.row[[i]], ],
'y' = y[idx.row[[i]]]))
fit = suppressWarnings(enpls.ad.core.fit(plsdf.tr, maxcomp))
fit = suppressWarnings(enpls.ad.core.fit(plsdf.tr, maxcomp, cvfolds))
errorlist.tr[[i]] = suppressWarnings(enpls.ad.core.pred(
fit, as.data.frame(cbind(x, 'y' = y))))
for (j in 1L:n.testset) {
Expand All @@ -145,7 +147,7 @@ enpls.ad = function(x, y,
fit.list = foreach(i = 1L:reptimes) %dopar% {
plsdf.tr = as.data.frame(cbind(x[idx.row[[i]], ],
'y' = y[idx.row[[i]]]))
enpls.ad.core.fit(plsdf.tr, maxcomp)
enpls.ad.core.fit(plsdf.tr, maxcomp, cvfolds)
}

for (i in 1L:reptimes) {
Expand Down Expand Up @@ -175,15 +177,14 @@ enpls.ad = function(x, y,
idx.col = vector('list', reptimes)

n.var = round(x.col * ratio)
if (is.null(maxcomp)) maxcomp = n.var
for (i in 1L:reptimes) idx.col[[i]] =
sample(1L:x.col, n.var, replace = FALSE)

if (parallel < 1.5) {

for (i in 1L:reptimes) {
plsdf.tr = as.data.frame(cbind(x[, idx.col[[i]]], 'y' = y))
fit = suppressWarnings(enpls.ad.core.fit(plsdf.tr, maxcomp))
fit = suppressWarnings(enpls.ad.core.fit(plsdf.tr, maxcomp, cvfolds))
errorlist.tr[[i]] = suppressWarnings(enpls.ad.core.pred(fit, plsdf.tr))
for (j in 1L:n.testset) {
errorlist.te[[j]][[i]] =
Expand All @@ -198,7 +199,7 @@ enpls.ad = function(x, y,
registerDoParallel(parallel)
fit.list = foreach(i = 1L:reptimes) %dopar% {
plsdf.tr = as.data.frame(cbind(x[, idx.col[[i]]], 'y' = y))
enpls.ad.core.fit(plsdf.tr, maxcomp)
enpls.ad.core.fit(plsdf.tr, maxcomp, cvfolds)
}

for (i in 1L:reptimes) {
Expand Down Expand Up @@ -265,21 +266,37 @@ enpls.ad = function(x, y,
#'
#' @keywords internal

enpls.ad.core.fit = function(trainingset, maxcomp) {
enpls.ad.core.fit = function(trainingset, maxcomp, cvfolds) {

if (is.null(maxcomp)) {

plsr.cvfit = plsr(y ~ ., data = trainingset,
ncomp = maxcomp,
scale = TRUE,
method = 'simpls',
validation = 'CV', segments = 5L)
plsr.cvfit = plsr(y ~ .,
data = trainingset,
scale = TRUE,
method = 'simpls',
validation = 'CV',
segments = cvfolds)

} else {

plsr.cvfit = plsr(y ~ .,
data = trainingset,
ncomp = maxcomp,
scale = TRUE,
method = 'simpls',
validation = 'CV',
segments = cvfolds)

}

# select best component number using adjusted CV
cv.bestcomp = which.min(RMSEP(plsr.cvfit)[['val']][2L, 1L, -1L])

plsr.fit = plsr(y ~ ., data = trainingset,
ncomp = cv.bestcomp,
scale = TRUE,
method = 'simpls',
plsr.fit = plsr(y ~ .,
data = trainingset,
ncomp = cv.bestcomp,
scale = TRUE,
method = 'simpls',
validation = 'none')

return(list('plsr.fit' = plsr.fit, 'cv.bestcomp' = cv.bestcomp))
Expand Down
53 changes: 36 additions & 17 deletions R/enpls.fit.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,10 @@
#' @param x Predictor matrix.
#' @param y Response vector.
#' @param maxcomp Maximum number of components included within each model.
#' If not specified, will use the variable (column) numbers in \code{x}.
#' If not specified, will use the maximum number possible (considering
#' cross-validation and special cases where n is smaller than p).
#' @param cvfolds Number of cross-validation folds used in each model
#' for automatic parameter selection, default is \code{5}.
#' @param reptimes Number of models to build with Monte-Carlo resampling
#' or bootstrapping.
#' @param method Resampling method. \code{"mc"} (Monte-Carlo resampling)
Expand Down Expand Up @@ -39,15 +42,15 @@
#' predict(fit, newx = x)

enpls.fit = function(x, y,
maxcomp = NULL,
maxcomp = NULL,
cvfolds = 5L,
reptimes = 500L,
method = c('mc', 'boot'), ratio = 0.8,
method = c('mc', 'boot'),
ratio = 0.8,
parallel = 1L) {

if (missing(x) | missing(y)) stop('Please specify both x and y')

if (is.null(maxcomp)) maxcomp = ncol(x)

method = match.arg(method)

x.row = nrow(x)
Expand All @@ -68,7 +71,7 @@ enpls.fit = function(x, y,
xtmp = x[samp.idx[[i]], ]
ytmp = y[samp.idx[[i]]]
plsdf = as.data.frame(cbind(xtmp, 'y' = ytmp))
modellist[[i]] = suppressWarnings(enpls.fit.core(plsdf, maxcomp))
modellist[[i]] = suppressWarnings(enpls.fit.core(plsdf, maxcomp, cvfolds))
}

} else {
Expand All @@ -78,7 +81,7 @@ enpls.fit = function(x, y,
xtmp = x[samp.idx[[i]], ]
ytmp = y[samp.idx[[i]]]
plsdf = as.data.frame(cbind(xtmp, 'y' = ytmp))
enpls.fit.core(plsdf, maxcomp)
enpls.fit.core(plsdf, maxcomp, cvfolds)
}

}
Expand All @@ -100,24 +103,40 @@ enpls.fit = function(x, y,
#'
#' @keywords internal

enpls.fit.core = function(plsdf, maxcomp) {
enpls.fit.core = function(plsdf, maxcomp, cvfolds) {

if (is.null(maxcomp)) {

plsr.cvfit = plsr(y ~ ., data = plsdf,
ncomp = maxcomp,
scale = TRUE,
method = 'simpls',
validation = 'CV', segments = 5L)
plsr.cvfit = plsr(y ~ .,
data = plsdf,
scale = TRUE,
method = 'simpls',
validation = 'CV',
segments = cvfolds)

} else {

plsr.cvfit = plsr(y ~ .,
data = plsdf,
ncomp = maxcomp,
scale = TRUE,
method = 'simpls',
validation = 'CV',
segments = cvfolds)

}

# select best component number using adjusted CV
cv.bestcomp = which.min(RMSEP(plsr.cvfit)[['val']][2L, 1L, -1L])

# remove plsr.cvfit object
rm(plsr.cvfit)

plsr.fit = plsr(y ~ ., data = plsdf,
ncomp = cv.bestcomp,
scale = TRUE,
method = 'simpls',
plsr.fit = plsr(y ~ .,
data = plsdf,
ncomp = cv.bestcomp,
scale = TRUE,
method = 'simpls',
validation = 'none')

# minify plsr.fit object to reduce memory footprint
Expand Down
Loading

0 comments on commit 7888238

Please sign in to comment.