Skip to content

Commit

Permalink
v5.6
Browse files Browse the repository at this point in the history
  • Loading branch information
nanxstats committed Nov 26, 2016
1 parent 106dd5f commit 7888238
Show file tree
Hide file tree
Showing 32 changed files with 367 additions and 197 deletions.
7 changes: 5 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
Package: enpls
Type: Package
Title: Ensemble Partial Least Squares Regression
Version: 5.0
Version: 5.6
Author: Nan Xiao <[email protected]>, Dong-Sheng Cao,
Miao-Zhu Li <[email protected]>, Qing-Song Xu <[email protected]>
Maintainer: Nan Xiao <[email protected]>
Description: An algorithmic framework for measuring feature importance, outlier detection, model applicability domain evaluation, and ensemble predictive modeling with (sparse) partial least squares regressions.
Description: An algorithmic framework for measuring feature importance,
outlier detection, model applicability domain evaluation,
and ensemble predictive modeling with (sparse)
partial least squares regressions.
License: GPL (>= 2)
URL: http://enpls.org
BugReports: https://github.com/road2stat/enpls/issues
Expand Down
10 changes: 10 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,13 @@
# CHANGES IN enpls VERSION 5.6 (2016-11-25)

## NEW FEATURES

* New argument `cvfolds` now available in all applicable functions for finer control of cross-validation folds in automatic parameter selection of each PLS/SPLS model.

## IMPROVEMENTS

* Critical implementation improvements for handling the case where the argument `maxcomp = NULL` (maximum number of components not specified explicitly) in `enpls.` functions. The maximum number of components to use is now correctly determined, considering both cross-validation and special cases such as n < p. Thanks to Dr. You-Wu Lin for the feedback.

# CHANGES IN enpls VERSION 5.0 (2016-10-20)

## NEW FEATURES
Expand Down
22 changes: 11 additions & 11 deletions R/cv.enpls.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@
#'
#' @param x Predictor matrix.
#' @param y Response vector.
#' @param nfolds Number of folds, default is \code{5}.
#' @param nfolds Number of cross-validation folds, default is \code{5}.
#' Note that this controls the CV folds for the ensemble PLS model,
#' not for the individual PLS models. To control the CV folds for
#' single PLS models, please use the argument \code{cvfolds}.
#' @param verbose Shall we print out the progress of cross-validation?
#' @param ... Arguments to be passed to \code{\link{enpls.fit}}.
#'
Expand Down Expand Up @@ -62,18 +65,15 @@ cv.enpls = function(x, y, nfolds = 5L, verbose = TRUE, ...) {
colnames(ypred) = c('y.real', 'y.pred')

residual = ypred[, 1L] - ypred[, 2L]
# RMSE
RMSE = sqrt(mean((residual)^2, na.rm = TRUE))
# MAE (Mean Absolute Error)
MAE = mean(abs(residual), na.rm = TRUE)
# R-square
Rsquare = 1L - (sum((residual)^2, na.rm = TRUE)/sum((y - mean(y))^2))
RMSE = sqrt(mean((residual)^2, na.rm = TRUE))
MAE = mean(abs(residual), na.rm = TRUE)
Rsquare = 1L - (sum((residual)^2, na.rm = TRUE)/sum((y - mean(y))^2))

object = list('ypred' = ypred,
object = list('ypred' = ypred,
'residual' = residual,
'RMSE' = RMSE,
'MAE' = MAE,
'Rsquare' = Rsquare)
'RMSE' = RMSE,
'MAE' = MAE,
'Rsquare' = Rsquare)
class(object) = 'cv.enpls'
return(object)

Expand Down
22 changes: 11 additions & 11 deletions R/cv.enspls.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@
#'
#' @param x Predictor matrix.
#' @param y Response vector.
#' @param nfolds Number of folds, default is \code{5}.
#' @param nfolds Number of cross-validation folds, default is \code{5}.
#' Note that this controls the CV folds for the ensemble sparse PLS model,
#' not for the individual sparse PLS models. To control the CV folds for
#' single sparse PLS models, please use the argument \code{cvfolds}.
#' @param verbose Shall we print out the progress of cross-validation?
#' @param ... Arguments to be passed to \code{\link{enspls.fit}}.
#'
Expand Down Expand Up @@ -64,18 +67,15 @@ cv.enspls = function(x, y, nfolds = 5L, verbose = TRUE, ...) {
colnames(ypred) = c('y.real', 'y.pred')

residual = ypred[, 1L] - ypred[, 2L]
# RMSE
RMSE = sqrt(mean((residual)^2, na.rm = TRUE))
# MAE (Mean Absolute Error)
MAE = mean(abs(residual), na.rm = TRUE)
# R-square
Rsquare = 1L - (sum((residual)^2, na.rm = TRUE)/sum((y - mean(y))^2))
RMSE = sqrt(mean((residual)^2, na.rm = TRUE))
MAE = mean(abs(residual), na.rm = TRUE)
Rsquare = 1L - (sum((residual)^2, na.rm = TRUE)/sum((y - mean(y))^2))

object = list('ypred' = ypred,
object = list('ypred' = ypred,
'residual' = residual,
'RMSE' = RMSE,
'MAE' = MAE,
'Rsquare' = Rsquare)
'RMSE' = RMSE,
'MAE' = MAE,
'Rsquare' = Rsquare)
class(object) = 'cv.enspls'
return(object)

Expand Down
8 changes: 5 additions & 3 deletions R/enpls-package.R
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
#' Ensemble Partial Least Squares Regression
#'
#' The enpls package provides functions for running the
#' ensemble partial least squares regression.
#' enpls offers an algorithmic framework for measuring feature importance,
#' outlier detection, model applicability evaluation, and
#' ensemble predictive modeling with (sparse)
#' partial least squares regressions.
#'
#' The package vignette can be opened with \code{vignette("enpls")}.
#' See the vignette via \code{vignette("enpls")}.
#'
#' \tabular{ll}{ Package: \tab enpls\cr
#' Type: \tab Package\cr
Expand Down
61 changes: 39 additions & 22 deletions R/enpls.ad.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,10 @@
#' @param ytest List, with the i-th component being the i-th test set's
#' response vector (see example code below).
#' @param maxcomp Maximum number of components included within each model.
#' If not specified, will use the variable (column) numbers in \code{x}.
#' If not specified, will use the maximum number possible (considering
#' cross-validation and special cases where n is smaller than p).
#' @param cvfolds Number of cross-validation folds used in each model
#' for automatic parameter selection, default is \code{5}.
#' @param space Space in which to apply the resampling method.
#' Can be the sample space (\code{"sample"}) or
#' the variable space (\code{"variable"}).
Expand Down Expand Up @@ -79,11 +82,12 @@

enpls.ad = function(x, y,
xtest, ytest,
maxcomp = NULL,
space = c('sample', 'variable'),
method = c('mc', 'boot'),
maxcomp = NULL,
cvfolds = 5L,
space = c('sample', 'variable'),
method = c('mc', 'boot'),
reptimes = 500L,
ratio = 0.8,
ratio = 0.8,
parallel = 1L) {

if (missing(x) | missing(y) | missing(xtest) | missing(ytest))
Expand All @@ -105,8 +109,6 @@ enpls.ad = function(x, y,

if (space == 'sample') {

if (is.null(maxcomp)) maxcomp = ncol(x)

idx.row = vector('list', reptimes)

if (method == 'boot') {
Expand All @@ -129,7 +131,7 @@ enpls.ad = function(x, y,
for (i in 1L:reptimes) {
plsdf.tr = as.data.frame(cbind(x[idx.row[[i]], ],
'y' = y[idx.row[[i]]]))
fit = suppressWarnings(enpls.ad.core.fit(plsdf.tr, maxcomp))
fit = suppressWarnings(enpls.ad.core.fit(plsdf.tr, maxcomp, cvfolds))
errorlist.tr[[i]] = suppressWarnings(enpls.ad.core.pred(
fit, as.data.frame(cbind(x, 'y' = y))))
for (j in 1L:n.testset) {
Expand All @@ -145,7 +147,7 @@ enpls.ad = function(x, y,
fit.list = foreach(i = 1L:reptimes) %dopar% {
plsdf.tr = as.data.frame(cbind(x[idx.row[[i]], ],
'y' = y[idx.row[[i]]]))
enpls.ad.core.fit(plsdf.tr, maxcomp)
enpls.ad.core.fit(plsdf.tr, maxcomp, cvfolds)
}

for (i in 1L:reptimes) {
Expand Down Expand Up @@ -175,15 +177,14 @@ enpls.ad = function(x, y,
idx.col = vector('list', reptimes)

n.var = round(x.col * ratio)
if (is.null(maxcomp)) maxcomp = n.var
for (i in 1L:reptimes) idx.col[[i]] =
sample(1L:x.col, n.var, replace = FALSE)

if (parallel < 1.5) {

for (i in 1L:reptimes) {
plsdf.tr = as.data.frame(cbind(x[, idx.col[[i]]], 'y' = y))
fit = suppressWarnings(enpls.ad.core.fit(plsdf.tr, maxcomp))
fit = suppressWarnings(enpls.ad.core.fit(plsdf.tr, maxcomp, cvfolds))
errorlist.tr[[i]] = suppressWarnings(enpls.ad.core.pred(fit, plsdf.tr))
for (j in 1L:n.testset) {
errorlist.te[[j]][[i]] =
Expand All @@ -198,7 +199,7 @@ enpls.ad = function(x, y,
registerDoParallel(parallel)
fit.list = foreach(i = 1L:reptimes) %dopar% {
plsdf.tr = as.data.frame(cbind(x[, idx.col[[i]]], 'y' = y))
enpls.ad.core.fit(plsdf.tr, maxcomp)
enpls.ad.core.fit(plsdf.tr, maxcomp, cvfolds)
}

for (i in 1L:reptimes) {
Expand Down Expand Up @@ -265,21 +266,37 @@ enpls.ad = function(x, y,
#'
#' @keywords internal

enpls.ad.core.fit = function(trainingset, maxcomp) {
enpls.ad.core.fit = function(trainingset, maxcomp, cvfolds) {

if (is.null(maxcomp)) {

plsr.cvfit = plsr(y ~ ., data = trainingset,
ncomp = maxcomp,
scale = TRUE,
method = 'simpls',
validation = 'CV', segments = 5L)
plsr.cvfit = plsr(y ~ .,
data = trainingset,
scale = TRUE,
method = 'simpls',
validation = 'CV',
segments = cvfolds)

} else {

plsr.cvfit = plsr(y ~ .,
data = trainingset,
ncomp = maxcomp,
scale = TRUE,
method = 'simpls',
validation = 'CV',
segments = cvfolds)

}

# select best component number using adjusted CV
cv.bestcomp = which.min(RMSEP(plsr.cvfit)[['val']][2L, 1L, -1L])

plsr.fit = plsr(y ~ ., data = trainingset,
ncomp = cv.bestcomp,
scale = TRUE,
method = 'simpls',
plsr.fit = plsr(y ~ .,
data = trainingset,
ncomp = cv.bestcomp,
scale = TRUE,
method = 'simpls',
validation = 'none')

return(list('plsr.fit' = plsr.fit, 'cv.bestcomp' = cv.bestcomp))
Expand Down
53 changes: 36 additions & 17 deletions R/enpls.fit.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,10 @@
#' @param x Predictor matrix.
#' @param y Response vector.
#' @param maxcomp Maximum number of components included within each model.
#' If not specified, will use the variable (column) numbers in \code{x}.
#' If not specified, will use the maximum number possible (considering
#' cross-validation and special cases where n is smaller than p).
#' @param cvfolds Number of cross-validation folds used in each model
#' for automatic parameter selection, default is \code{5}.
#' @param reptimes Number of models to build with Monte-Carlo resampling
#' or bootstrapping.
#' @param method Resampling method. \code{"mc"} (Monte-Carlo resampling)
Expand Down Expand Up @@ -39,15 +42,15 @@
#' predict(fit, newx = x)

enpls.fit = function(x, y,
maxcomp = NULL,
maxcomp = NULL,
cvfolds = 5L,
reptimes = 500L,
method = c('mc', 'boot'), ratio = 0.8,
method = c('mc', 'boot'),
ratio = 0.8,
parallel = 1L) {

if (missing(x) | missing(y)) stop('Please specify both x and y')

if (is.null(maxcomp)) maxcomp = ncol(x)

method = match.arg(method)

x.row = nrow(x)
Expand All @@ -68,7 +71,7 @@ enpls.fit = function(x, y,
xtmp = x[samp.idx[[i]], ]
ytmp = y[samp.idx[[i]]]
plsdf = as.data.frame(cbind(xtmp, 'y' = ytmp))
modellist[[i]] = suppressWarnings(enpls.fit.core(plsdf, maxcomp))
modellist[[i]] = suppressWarnings(enpls.fit.core(plsdf, maxcomp, cvfolds))
}

} else {
Expand All @@ -78,7 +81,7 @@ enpls.fit = function(x, y,
xtmp = x[samp.idx[[i]], ]
ytmp = y[samp.idx[[i]]]
plsdf = as.data.frame(cbind(xtmp, 'y' = ytmp))
enpls.fit.core(plsdf, maxcomp)
enpls.fit.core(plsdf, maxcomp, cvfolds)
}

}
Expand All @@ -100,24 +103,40 @@ enpls.fit = function(x, y,
#'
#' @keywords internal

enpls.fit.core = function(plsdf, maxcomp) {
enpls.fit.core = function(plsdf, maxcomp, cvfolds) {

if (is.null(maxcomp)) {

plsr.cvfit = plsr(y ~ ., data = plsdf,
ncomp = maxcomp,
scale = TRUE,
method = 'simpls',
validation = 'CV', segments = 5L)
plsr.cvfit = plsr(y ~ .,
data = plsdf,
scale = TRUE,
method = 'simpls',
validation = 'CV',
segments = cvfolds)

} else {

plsr.cvfit = plsr(y ~ .,
data = plsdf,
ncomp = maxcomp,
scale = TRUE,
method = 'simpls',
validation = 'CV',
segments = cvfolds)

}

# select best component number using adjusted CV
cv.bestcomp = which.min(RMSEP(plsr.cvfit)[['val']][2L, 1L, -1L])

# remove plsr.cvfit object
rm(plsr.cvfit)

plsr.fit = plsr(y ~ ., data = plsdf,
ncomp = cv.bestcomp,
scale = TRUE,
method = 'simpls',
plsr.fit = plsr(y ~ .,
data = plsdf,
ncomp = cv.bestcomp,
scale = TRUE,
method = 'simpls',
validation = 'none')

# minify plsr.fit object to reduce memory footprint
Expand Down
Loading

0 comments on commit 7888238

Please sign in to comment.