Skip to content

Commit

Permalink
version 1.1-6
Browse files Browse the repository at this point in the history
  • Loading branch information
topepo authored and gaborcsardi committed Jul 24, 2014
1 parent 47efe83 commit 42e9dfe
Show file tree
Hide file tree
Showing 5 changed files with 101 additions and 75 deletions.
8 changes: 4 additions & 4 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
Package: AppliedPredictiveModeling
Type: Package
Title: Functions and Data Sets for 'Applied Predictive Modeling'
Version: 1.1-5
Date: 2014-02-03
Version: 1.1-6
Date: 2014-07-24
Author: Max Kuhn, Kjell Johnson
Maintainer: Max Kuhn <[email protected]>
Description: A few functions and several data sets for the Springer book 'Applied Predictive Modeling'
Expand All @@ -11,7 +11,7 @@ Depends: R (>= 2.10)
Imports: CORElearn, MASS, plyr, reshape2
Suggests: caret (>= 6.0-22), lattice, ellipse
License: GPL
Packaged: 2014-02-03 21:08:33 UTC; kuhna03
Packaged: 2014-07-25 13:37:54 UTC; kuhna03
NeedsCompilation: no
Repository: CRAN
Date/Publication: 2014-02-03 23:51:39
Date/Publication: 2014-07-25 18:42:22
8 changes: 4 additions & 4 deletions MD5
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
4f1a0f11a6a243d5e47446702f618cc8 *DESCRIPTION
b956c4b3e5aad17e35ac120a5d5f59d3 *DESCRIPTION
03e5df17ec09e62322761dbf70cd3a8a *NAMESPACE
e4e564d2188913c297d854a86868bd37 *R/bookTheme.R
538821ec8c21e26d4b936611aa157bc4 *R/easyBoundaryFunc.R
Expand All @@ -21,7 +21,7 @@ e1590269851cf810fdffa832b6cf6d65 *data/schedulingData.RData
669172e9b524f9194a23fbc84a2816f8 *data/segmentationOriginal.RData
06780bd86a4db76cb2a8eb12ef107df7 *data/solubility.RData
5e5422a8c05125f3ab1822f6c525296a *data/twoClassData.RData
0c14b02b846f7c50801049f7e30d9a86 *inst/NEWS.Rd
47ca37b7084f35df9345c9c5b6a3d461 *inst/NEWS.Rd
55afb317aa767a6e82c6c52ee985563f *inst/chapters/02_A_Short_Tour.R
5a487c219abde639b85d7275c6a4bf31 *inst/chapters/02_A_Short_Tour.Rout
ec4768cf8bf24124e998a1ce680dceb6 *inst/chapters/03_Data_Pre_Processing.R
Expand Down Expand Up @@ -52,8 +52,8 @@ cdd39d98758aa17566201c45150265b8 *inst/chapters/18_Importance.R
4288e31b3484d3719f98cf377e756e7f *inst/chapters/18_Importance.Rout
001bc824c0505d4c462039b112364d9d *inst/chapters/19_Feature_Select.R
487d84200b36ed46159a608ec81fedc0 *inst/chapters/19_Feature_Select.Rout
ee8d141c6ff92f1878bb1954d21cab67 *inst/chapters/CreateGrantData.R
6b7d3facf17c4ad5704ca9c54c17acc1 *inst/chapters/CreateGrantData.Rout
3afe6f6859238c711c0ce0ba33678051 *inst/chapters/CreateGrantData.R
8a85f9749667d240d73b57de77df5b31 *inst/chapters/CreateGrantData.Rout
6a51123bb7533bc6ac7cc60e20c30f7c *man/AlzheimerDisease.Rd
79b66304686ea5f41624e941a839f783 *man/AppliedPredictiveModeling-package.Rd
b5c2029d7b9d21d128b3084b108404a8 *man/ChemicalManufacturingProcess.Rd
Expand Down
8 changes: 8 additions & 0 deletions inst/NEWS.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,14 @@
\newcommand{\cpkg}{\href{http://CRAN.R-project.org/package=#1}{\pkg{#1}}}


\section{Changes in version 1.1-6}{

\itemize{
\item The file \code{CreateGrantData.R} was updated to include code to create the objects \code{factorPredictors} and \code{factorForm}.

}
}

\section{Changes in version 1.1-5}{

The package dependencies were updated. Some were moved to 'Imports'
Expand Down
55 changes: 29 additions & 26 deletions inst/chapters/CreateGrantData.R
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,7 @@ library(lubridate)
## machine) but will consume more memory.
cores <- 3

if(cores > 1)
{
if(cores > 1) {
library(doMC)
registerDoMC(cores)
}
Expand Down Expand Up @@ -94,8 +93,7 @@ dpmt <- sort(dpmt[!is.na(dpmt)])
## Split up the data by role number (1-15) and add any missing columns
## (roles 1-5 have more columns than the others)
tmp <- vector(mode = "list", length = 15)
for(i in 1:15)
{
for(i in 1:15) {
tmpData <- raw[, c("Grant.Application.ID", grep(paste("\\.", i, "$", sep = ""), names(raw), value = TRUE))]
names(tmpData) <- gsub(paste("\\.", i, "$", sep = ""), "", names(tmpData))
if(i == 1) nms <- names(tmpData)
Expand Down Expand Up @@ -164,8 +162,7 @@ vertical$No..of.Years.in.Uni.at.Time.of.Grant <- factor(vertical$No..of.Years.in
######################################################################
## A function to shorten the role titles

shortNames <- function(x, pre = "")
{
shortNames <- function(x, pre = ""){
x <- gsub("EXT_CHIEF_INVESTIGATOR", "ECI", x)
x <- gsub("STUD_CHIEF_INVESTIGATOR", "SCI", x)
x <- gsub("CHIEF_INVESTIGATOR", "CI", x)
Expand All @@ -180,8 +177,7 @@ shortNames <- function(x, pre = "")
}

## A function to find and remove zero-variance ("ZV") predictors
noZV <- function(x)
{
noZV <- function(x) {
keepers <- unlist(lapply(x, function(x) length(unique(x)) > 1))
x[,keepers,drop = FALSE]
}
Expand All @@ -206,8 +202,7 @@ names(investCount) <- shortNames(names(investCount), "Num")
## For each role, calculate the frequency of people in each age group

investDOB <- ddply(vertical, .(Grant.Application.ID),
function(x)
{
function(x) {
tabDF <- as.data.frame(table(x$Role, x$Year.of.Birth))
out <- data.frame(t(tabDF$Freq))
names(out) <- paste(tabDF$Var1, tabDF$Var2, sep = ".")
Expand All @@ -221,8 +216,7 @@ investDOB <- noZV(investDOB)
## For each role, calculate the frequency of people from each country

investCountry <- ddply(vertical, .(Grant.Application.ID),
function(x)
{
function(x) {
tabDF <- as.data.frame(table(x$Role, x$Country.of.Birth))
out <- data.frame(t(tabDF$Freq))
names(out) <- paste(tabDF$Var1, tabDF$Var2, sep = ".")
Expand All @@ -236,8 +230,7 @@ investCountry <- noZV(investCountry)
## For each role, calculate the frequency of people for each language

investLang <- ddply(vertical, .(Grant.Application.ID),
function(x)
{
function(x) {
tabDF <- as.data.frame(table(x$Role, x$Home.Language))
out <- data.frame(t(tabDF$Freq))
names(out) <- paste(tabDF$Var1, tabDF$Var2, sep = ".")
Expand All @@ -251,8 +244,7 @@ investLang <- noZV(investLang)
## For each role, determine who has a Ph.D.

investPhD <- ddply(vertical, .(Grant.Application.ID),
function(x)
{
function(x) {
tabDF <- as.data.frame(table(x$Role, x$With.PHD))
out <- data.frame(t(tabDF$Freq))
names(out) <- paste(tabDF$Var1, tabDF$Var2, sep = ".")
Expand All @@ -269,8 +261,7 @@ investPhD <- noZV(investPhD)
## grants

investGrants <- ddply(vertical, .(Grant.Application.ID, Role),
function(x)
{
function(x) {
data.frame(Success = sum(x$Number.of.Successful.Grant, na.rm = TRUE),
Unsuccess = sum(x$Number.of.Unsuccessful.Grant, na.rm = TRUE))

Expand All @@ -286,8 +277,7 @@ investGrants <- noZV(investGrants)
## Create variables for each role/department combination

investDept <- ddply(vertical, .(Grant.Application.ID),
function(x)
{
function(x) {
tabDF <- as.data.frame(table(x$Role, x$Dept.No.))
out <- data.frame(t(tabDF$Freq))
names(out) <- paste(tabDF$Var1, tabDF$Var2, sep = ".")
Expand All @@ -302,8 +292,7 @@ investDept <- noZV(investDept)


investFaculty <- ddply(vertical, .(Grant.Application.ID),
function(x)
{
function(x) {
tabDF <- as.data.frame(table(x$Role, x$Faculty.No.))
out <- data.frame(t(tabDF$Freq))
names(out) <- paste(tabDF$Var1, tabDF$Var2, sep = ".")
Expand All @@ -328,8 +317,7 @@ investDuration[is.na(investDuration)] <- 0
## removed for models that cannot deal with such a linear dependency

totalPub <- ddply(vertical, .(Grant.Application.ID),
function(x)
{
function(x) {
data.frame(AstarTotal = sum(x$A., na.rm = TRUE),
ATotal = sum(x$A, na.rm = TRUE),
BTotal = sum(x$B, na.rm = TRUE),
Expand All @@ -344,8 +332,7 @@ totalPub <- ddply(vertical, .(Grant.Application.ID),
## type per role.

investPub <- ddply(vertical, .(Grant.Application.ID, Role),
function(x)
{
function(x) {
data.frame(Astar = sum(x$A., na.rm = TRUE),
A = sum(x$A, na.rm = TRUE),
B = sum(x$B, na.rm = TRUE),
Expand Down Expand Up @@ -458,6 +445,22 @@ testing <- testing[, names(training)]

fullSet <- names(training)[names(training) != "Class"]

###################################################################
### In the classification tree chapter, there is a different set
### of predictors that use factor encodings of some of the
### predictors

## Start from every column of `training` except the outcome, then strip
## the columns whose names match each dummy-variable pattern in turn.
factorPredictors <- Reduce(
  function(nms, pattern) nms[!grepl(pattern, nms)],
  c("Sponsor[0-9]", "SponsorUnk", "ContractValueBand[A-Z]", "GrantCat"),
  names(training)[names(training) != "Class"]
)

## Also drop any column named after a level of the Month or Weekday factors.
factorPredictors <- factorPredictors[
  !(factorPredictors %in% c(levels(training$Month), levels(training$Weekday)))
]

## Build the model formula: Class against all remaining predictors.
factorForm <- as.formula(
  paste("Class ~ ", paste(factorPredictors, collapse = "+"))
)

### Some are extremely correlated, so remove
predCorr <- cor(training[,fullSet])
highCorr <- findCorrelation(predCorr, .99)
Expand Down
Loading

0 comments on commit 42e9dfe

Please sign in to comment.