How can I perform cross-validation with the rpart package on the Titanic dataset?

#=======================================================================
# Code sample illustrating the use of the mighty caret package for
# performing cross-validation of rpart trees, making predictions, and
# saving out a .CSV file suitable for submission to Kaggle.
#
#=======================================================================
#
# Install caret and rpart.plot only when they are missing, so that re-running
# the script does not download and reinstall the packages every time.
for (pkg in c("caret", "rpart.plot")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}

# Attach the packages used below: caret drives the cross validation and
# rpart.plot draws the fitted tree.
library(caret)
library(rpart.plot)

# Read the Kaggle Titanic files.
# NOTE - The working directory must be the folder that holds train.csv
#        and test.csv. Text columns are kept as character vectors rather
#        than being converted to factors on load.
train <- read.csv(file = "train.csv", stringsAsFactors = FALSE)
test <- read.csv(file = "test.csv", stringsAsFactors = FALSE)


# Combine the data to make data cleaning easier. The label is set aside
# first, then removed from the training rows so they can be stacked on top
# of the test rows. The column is dropped by name (not by position) and the
# training row count is remembered, so the split further down does not
# depend on hard-coded column indices or row numbers.
survived <- train$Survived
n.train <- nrow(train)
data.combined <- rbind(
  train[, setdiff(names(train), "Survived"), drop = FALSE],
  test
)


# Transform some variables to factors. Doing this on the combined data gives
# the train and test rows the same factor levels.
data.combined$Pclass <- as.factor(data.combined$Pclass)
data.combined$Sex <- as.factor(data.combined$Sex)


# Split data back out: the first n.train rows are the training set (which
# gets its label back as a factor), the remaining rows are the test set.
train.rows <- seq_len(n.train)
test.rows <- setdiff(seq_len(nrow(data.combined)), train.rows)

train <- data.combined[train.rows, ]
train$Survived <- as.factor(survived)

test <- data.combined[test.rows, ]


# Columns handed to the model: the Survived label plus Sex as the only
# predictor
features <- c("Survived", "Sex")


# Fix the random seed so repeated runs give the same results
set.seed(12345)


# Resampling scheme for caret: 10-fold cross validation, repeated 3 times
caret.control <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 3
)


# Fit an rpart decision tree through caret. Every candidate value of rpart's
# cp parameter (tuneLength = 15 requests 15 of them) is scored with the
# resampling scheme above, and caret returns the best model trained on all
# the data.
# Note that train() here is caret's function; R still finds it even though
# a data frame is also called train, because a call only looks up functions.
rpart.cv <- train(
  Survived ~ .,
  data = train[, features],
  method = "rpart",
  trControl = caret.control,
  tuneLength = 15
)


# Display the results of the cross validation run - 78.679% mean accuracy!
# print() is called explicitly because a bare object name is only
# auto-printed at the top level of an interactive session or Rscript run;
# when this file is executed with source() it would show nothing.
print(rpart.cv)


# What is the standard deviation of the accuracy across the resamples?
# paste() joins the pieces with single spaces (its default separator)
# before cat() writes the line out.
cat(paste("\nCross validation standard deviation:",
          sd(rpart.cv$resample$Accuracy), "\n"))


# Pull out the trained model using the best parameters on
# all the data! Mighty!
rpart.best <- rpart.cv$finalModel


# Look at the model - this model is trained on 100% of the data!
# print() is explicit so the tree is also shown when the file is run with
# source(), where bare object names are not auto-printed.
print(rpart.best)


# Make the model look pretty. prp() comes from rpart.plot, which is already
# attached at the top of the script, so it is not loaded again here.
# extra = 1 adds the observation counts to each node and under = TRUE places
# that extra text below the node box.
prp(rpart.best, type = 0, extra = 1, under = TRUE)


# Score the Kaggle test set with the cross-validated caret model;
# type = "raw" asks for the predicted class of each row
preds <- predict(rpart.cv, newdata = test, type = "raw")

# Assemble the two columns Kaggle expects: the passenger id and the
# predicted Survived value
submission <- data.frame(
  PassengerId = test$PassengerId,
  Survived = preds
)


# Save the submission as a .CSV without row names, ready for upload to Kaggle
write.csv(x = submission, file = "MySubmission.csv", row.names = FALSE)
2 Likes