version 1.1-6

cran · Jul 24, 2014 · 42e9dfe · 42e9dfe
1 parent 47efe83
commit 42e9dfe
Show file tree

Hide file tree

Showing 5 changed files with 101 additions and 75 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: AppliedPredictiveModeling
 Type: Package
 Title: Functions and Data Sets for 'Applied Predictive Modeling'
-Version: 1.1-5
-Date: 2014-02-03
+Version: 1.1-6
+Date: 2014-07-24
 Author: Max Kuhn, Kjell Johnson
 Maintainer: Max Kuhn <[email protected]>
 Description: A few functions and several data sets for the Springer book 'Applied Predictive Modeling'
@@ -11,7 +11,7 @@ Depends: R (>= 2.10)
 Imports: CORElearn, MASS, plyr, reshape2
 Suggests: caret (>= 6.0-22), lattice, ellipse
 License: GPL
-Packaged: 2014-02-03 21:08:33 UTC; kuhna03
+Packaged: 2014-07-25 13:37:54 UTC; kuhna03
 NeedsCompilation: no
 Repository: CRAN
-Date/Publication: 2014-02-03 23:51:39
+Date/Publication: 2014-07-25 18:42:22
diff --git a/MD5 b/MD5
@@ -1,4 +1,4 @@
-4f1a0f11a6a243d5e47446702f618cc8 *DESCRIPTION
+b956c4b3e5aad17e35ac120a5d5f59d3 *DESCRIPTION
 03e5df17ec09e62322761dbf70cd3a8a *NAMESPACE
 e4e564d2188913c297d854a86868bd37 *R/bookTheme.R
 538821ec8c21e26d4b936611aa157bc4 *R/easyBoundaryFunc.R
@@ -21,7 +21,7 @@ e1590269851cf810fdffa832b6cf6d65 *data/schedulingData.RData
 669172e9b524f9194a23fbc84a2816f8 *data/segmentationOriginal.RData
 06780bd86a4db76cb2a8eb12ef107df7 *data/solubility.RData
 5e5422a8c05125f3ab1822f6c525296a *data/twoClassData.RData
-0c14b02b846f7c50801049f7e30d9a86 *inst/NEWS.Rd
+47ca37b7084f35df9345c9c5b6a3d461 *inst/NEWS.Rd
 55afb317aa767a6e82c6c52ee985563f *inst/chapters/02_A_Short_Tour.R
 5a487c219abde639b85d7275c6a4bf31 *inst/chapters/02_A_Short_Tour.Rout
 ec4768cf8bf24124e998a1ce680dceb6 *inst/chapters/03_Data_Pre_Processing.R
@@ -52,8 +52,8 @@ cdd39d98758aa17566201c45150265b8 *inst/chapters/18_Importance.R
 4288e31b3484d3719f98cf377e756e7f *inst/chapters/18_Importance.Rout
 001bc824c0505d4c462039b112364d9d *inst/chapters/19_Feature_Select.R
 487d84200b36ed46159a608ec81fedc0 *inst/chapters/19_Feature_Select.Rout
-ee8d141c6ff92f1878bb1954d21cab67 *inst/chapters/CreateGrantData.R
-6b7d3facf17c4ad5704ca9c54c17acc1 *inst/chapters/CreateGrantData.Rout
+3afe6f6859238c711c0ce0ba33678051 *inst/chapters/CreateGrantData.R
+8a85f9749667d240d73b57de77df5b31 *inst/chapters/CreateGrantData.Rout
 6a51123bb7533bc6ac7cc60e20c30f7c *man/AlzheimerDisease.Rd
 79b66304686ea5f41624e941a839f783 *man/AppliedPredictiveModeling-package.Rd
 b5c2029d7b9d21d128b3084b108404a8 *man/ChemicalManufacturingProcess.Rd

diff --git a/inst/NEWS.Rd b/inst/NEWS.Rd
@@ -3,6 +3,14 @@
 \newcommand{\cpkg}{\href{http://CRAN.R-project.org/package=#1}{\pkg{#1}}}
 
 
+\section{Changes in version 1.1-6}{
+
+\itemize{
+\item The file \code{CreateGrantData.R} was updated to include code to create the objects \code{factorPredictors} and \code{factorForm}. 
+
+}
+}
+
 \section{Changes in version 1.1-5}{
 
 The package dependencies were updated. Some were moved to 'Imports'

diff --git a/inst/chapters/CreateGrantData.R b/inst/chapters/CreateGrantData.R
@@ -49,8 +49,7 @@ library(lubridate)
 ## machine) but will consume more memory.
 cores <- 3
 
-if(cores > 1)
-  {
+if(cores > 1) {
     library(doMC)
     registerDoMC(cores)
   }
@@ -94,8 +93,7 @@ dpmt <- sort(dpmt[!is.na(dpmt)])
 ## Split up the data by role number (1-15) and add any missing columns
 ## (roles 1-5 have more columns than the others)
 tmp <- vector(mode = "list", length = 15)
-for(i in 1:15)
-  {
+for(i in 1:15) {
     tmpData <- raw[, c("Grant.Application.ID", grep(paste("\\.", i, "$", sep = ""), names(raw), value = TRUE))]
     names(tmpData) <- gsub(paste("\\.", i, "$", sep = ""), "", names(tmpData))
     if(i == 1) nms <- names(tmpData)
@@ -164,8 +162,7 @@ vertical$No..of.Years.in.Uni.at.Time.of.Grant <- factor(vertical$No..of.Years.in
 ######################################################################
 ## A function to shorten the role titles
 
-shortNames <- function(x, pre = "")
-  {
+shortNames <- function(x, pre = ""){
     x <- gsub("EXT_CHIEF_INVESTIGATOR",  "ECI", x)
     x <- gsub("STUD_CHIEF_INVESTIGATOR", "SCI", x)
     x <- gsub("CHIEF_INVESTIGATOR",      "CI", x)
@@ -180,8 +177,7 @@ shortNames <- function(x, pre = "")
   }
 
 ## A function to find and remove zero-variance ("ZV") predictors
-noZV <- function(x)
-  {
+noZV <- function(x) {
     keepers <- unlist(lapply(x, function(x) length(unique(x)) > 1))
     x[,keepers,drop = FALSE]
   }
@@ -206,8 +202,7 @@ names(investCount) <- shortNames(names(investCount), "Num")
 ## For each role, calculate the frequency of people in each age group
 
 investDOB <- ddply(vertical, .(Grant.Application.ID),
-                   function(x)
-                   {
+                   function(x) {
                      tabDF <- as.data.frame(table(x$Role, x$Year.of.Birth))
                      out <- data.frame(t(tabDF$Freq))
                      names(out) <- paste(tabDF$Var1, tabDF$Var2, sep = ".")
@@ -221,8 +216,7 @@ investDOB <- noZV(investDOB)
 ## For each role, calculate the frequency of people from each country
 
 investCountry <- ddply(vertical, .(Grant.Application.ID),
-                       function(x)
-                       {
+                       function(x) {
                          tabDF <- as.data.frame(table(x$Role, x$Country.of.Birth))
                          out <- data.frame(t(tabDF$Freq))
                          names(out) <- paste(tabDF$Var1, tabDF$Var2, sep = ".")
@@ -236,8 +230,7 @@ investCountry <- noZV(investCountry)
 ## For each role, calculate the frequency of people for each language
 
 investLang <- ddply(vertical, .(Grant.Application.ID),
-                    function(x)
-                    {
+                    function(x) {
                       tabDF <- as.data.frame(table(x$Role, x$Home.Language))
                       out <- data.frame(t(tabDF$Freq))
                       names(out) <- paste(tabDF$Var1, tabDF$Var2, sep = ".")
@@ -251,8 +244,7 @@ investLang <- noZV(investLang)
 ## For each role, determine who has a Ph.D.
 
 investPhD <- ddply(vertical, .(Grant.Application.ID),
-                   function(x)
-                   {
+                   function(x) {
                      tabDF <- as.data.frame(table(x$Role, x$With.PHD))
                      out <- data.frame(t(tabDF$Freq))
                      names(out) <- paste(tabDF$Var1, tabDF$Var2, sep = ".")
@@ -269,8 +261,7 @@ investPhD <- noZV(investPhD)
 ## grants
 
 investGrants <- ddply(vertical, .(Grant.Application.ID, Role),
-                      function(x)
-                      {
+                      function(x) {
                         data.frame(Success = sum(x$Number.of.Successful.Grant, na.rm = TRUE),
                                    Unsuccess = sum(x$Number.of.Unsuccessful.Grant, na.rm = TRUE))
 
@@ -286,8 +277,7 @@ investGrants <- noZV(investGrants)
 ## Create variables for each role/department combination
 
 investDept <- ddply(vertical, .(Grant.Application.ID),
-                    function(x)
-                    {
+                    function(x) {
                       tabDF <- as.data.frame(table(x$Role, x$Dept.No.))
                       out <- data.frame(t(tabDF$Freq))
                       names(out) <- paste(tabDF$Var1, tabDF$Var2, sep = ".")
@@ -302,8 +292,7 @@ investDept <- noZV(investDept)
 
 
 investFaculty <- ddply(vertical, .(Grant.Application.ID),
-                       function(x)
-                       {
+                       function(x) {
                          tabDF <- as.data.frame(table(x$Role, x$Faculty.No.))
                          out <- data.frame(t(tabDF$Freq))
                          names(out) <- paste(tabDF$Var1, tabDF$Var2, sep = ".")
@@ -328,8 +317,7 @@ investDuration[is.na(investDuration)] <- 0
 ## removed for models that cannot deal with such a linear dependency
 
 totalPub <- ddply(vertical, .(Grant.Application.ID),
-                   function(x)
-                   {
+                   function(x) {
                      data.frame(AstarTotal = sum(x$A., na.rm = TRUE),
                                 ATotal = sum(x$A, na.rm = TRUE),
                                 BTotal = sum(x$B, na.rm = TRUE),
@@ -344,8 +332,7 @@ totalPub <- ddply(vertical, .(Grant.Application.ID),
 ## type per role.
 
 investPub <- ddply(vertical, .(Grant.Application.ID, Role),
-                   function(x)
-                   {
+                   function(x) {
                      data.frame(Astar = sum(x$A., na.rm = TRUE),
                                 A = sum(x$A, na.rm = TRUE),
                                 B = sum(x$B, na.rm = TRUE),
@@ -458,6 +445,22 @@ testing <- testing[, names(training)]
 
 fullSet <- names(training)[names(training) != "Class"]
 
+###################################################################
+### In the classification tree chapter, a different set of
+### predictors is used, in which some of the predictors are
+### encoded as factors
+
+factorPredictors <- names(training)[names(training) != "Class"]
+factorPredictors <- factorPredictors[!grepl("Sponsor[0-9]", factorPredictors)]
+factorPredictors <- factorPredictors[!grepl("SponsorUnk", factorPredictors)]
+factorPredictors <- factorPredictors[!grepl("ContractValueBand[A-Z]", factorPredictors)]
+factorPredictors <- factorPredictors[!grepl("GrantCat", factorPredictors)]
+factorPredictors <- factorPredictors[!(factorPredictors %in% levels(training$Month))]
+factorPredictors <- factorPredictors[!(factorPredictors %in% levels(training$Weekday))]
+
+factorForm <- paste("Class ~ ", paste(factorPredictors, collapse = "+"))
+factorForm <- as.formula(factorForm)
+
 ### Some are extremely correlated, so remove
 predCorr <- cor(training[,fullSet])
 highCorr <- findCorrelation(predCorr, .99)