10 Structuring data transformation and model assessments
We can train the models using the caret package, which provides a single interface to many different model types.
See the XGBoost file.
library(recipes)
library(caret)
library(ISLR)
library(doParallel)
# Initiating the parallel backend
# Despite its name, CoreCount holds the PSOCK cluster object, not a number.
# One core is left free for the rest of the system; detectCores() may return
# NA or 1, so the worker count is clamped to at least one.
CoreCount <- makePSOCKcluster(max(1L, detectCores() - 1L, na.rm = TRUE))
registerDoParallel(CoreCount) # register the cluster as the foreach backend
# Loading data
set.seed(1337) # fixed seed so the train/test split is reproducible
df <- ISLR::OJ

# Draw 800 row numbers for training; every remaining row goes to the test set.
index <- sample(x = nrow(df), size = 800)
df.train <- df[index, ]
df.test <- df[-index, ]
# Blueprinting - transforming data with the recipes package
# Everything inside the braces below is commented out, so the block currently
# does nothing; the braces only group the section so it can be run as one unit
# once the lines are uncommented.
# Run here
{# blueprint <- {recipe(formula = Purchase ~ . #Defining Y and X's
#                      ,data = df)}            #Selecting the data
# # Notice: preprocessing steps could be added inside the braces above
#
# # blueprint  # could be run to see the operations
#
# # Prepping
# # NOTE(review): prep() is given the full df here, so any estimated steps
# # would also see the test rows - consider training = df.train. TODO confirm.
# prep.df <- prep(x = blueprint,training = df)
# # prep.df  # could be run to see the operations
#
# # Baking
# baked.df <- bake(prep.df, new_data = df)
# # The next two are used if the blueprint is run on the train data
# baked.df.train <- bake(prep.df, new_data = df.train)
# baked.df.test <- bake(prep.df, new_data = df.test)
}
# Selecting the resampling parameters shared by every model trained below
train.param <- trainControl(
  # The resampling method: k-fold cross-validation
  method = "cv",
  # Number of folds; relevant for when the models are trained
  number = 5,
  # Class probabilities are required by twoClassSummary to compute ROC
  classProbs = TRUE,
  # Returns sens, spec, ROC
  summaryFunction = twoClassSummary,
  # Let train() use the registered parallel backend, if there is one
  allowParallel = TRUE
)
# Training different models

# (1) Conditional Inference Tree
# For the list of available methods see
# http://topepo.github.io/caret/train-models-by-tag.html
model.tree <- train(
  Purchase ~ .,
  data = df.train,
  method = "ctree2",
  trControl = train.param,
  metric = "ROC" # pick the tuning parameters by cross-validated ROC
)
# (2) Random Forest - small forest
model.rf.1 <- train(
  Purchase ~ .,
  data = df.train,
  method = "rf",
  ntree = 10, # number of trees to grow
  trControl = train.param,
  metric = "ROC"
)
# (3) Random Forest 2 - larger forest
model.rf.2 <- train(
  Purchase ~ .,
  data = df.train,
  method = "rf",
  ntree = 150, # number of trees to grow
  trControl = train.param,
  metric = "ROC"
)
# (4) XGBoosting
model.xgboost <- train(
  Purchase ~ .,
  data = df.train,
  method = "xgbTree",
  trControl = train.param,
  metric = "ROC",
  # Defining a table of tuning params. It has a single row, so exactly one
  # parameter combination is fitted and no grid search takes place.
  tuneGrid = data.frame(
    .nrounds = 300,
    .max_depth = 3,
    .eta = 0.03,
    .gamma = 0,
    .subsample = 0.5,
    .colsample_bytree = 0.1,
    .min_child_weight = 1
  )
)
# Terminating the parallel backend
# Shut down the worker processes held by the cluster object, then switch
# foreach back to its sequential backend.
parallel::stopCluster(cl = CoreCount)
foreach::registerDoSEQ()
# Summarizing the results
# Collect the cross-validation results of all four models in one object.
# NOTE(review): the models were trained without a shared seed or a fixed
# `index` in trainControl, so their folds may differ - confirm whether paired
# fold-by-fold comparisons are intended.
results <- resamples(
  list(
    tree = model.tree,
    rf.1 = model.rf.1,
    rf.2 = model.rf.2,
    xgboost = model.xgboost
  )
)
results
summary(results)
bwplot(results)
dotplot(results)