filename <- "titanic.csv"
# Data preparation for a decision tree model
# (rpart) using the titanic dataset.
#
# Steps performed here:
#   1. read the CSV into `titanic`
#   2. fill missing Embarked values with "S"
#   3. flag passengers whose name contains "Master." (IsMaster) and fill their
#      missing ages with the median age of that group
#   4. fill the remaining missing ages with the median age of everyone else
#   5. convert the categorical columns to factors

# NOTE(review): a hard-coded absolute path only works on a machine that has
# this directory. It is kept because code later in the script may rely on the
# working directory; consider a project-relative path instead.
setwd("C:/repos/bootcamp/Datasets")

# Read in the titanic data. Strings stay character so they can be cleaned
# before being converted to factors at the end.
titanic <- read.csv(
  file = filename,
  stringsAsFactors = FALSE
)

# ---- Clean missing values: Embarked ----
# Treat both NA and the empty string as missing. Including is.na() keeps the
# row mask free of NA, which a data frame subscripted assignment would
# otherwise reject with an error.
titanic[
  is.na(titanic$Embarked) | titanic$Embarked == "",
  "Embarked"
] <- "S"

# ---- Clean missing values: Age ----
# Row indices of passengers whose name contains "Master." (case-insensitive).
masters <- grep(
  pattern = "Master\\.",
  x = titanic$Name,
  ignore.case = TRUE
)

# Median age of the masters, ignoring missing ages.
median.masters <- median(titanic[masters, "Age"], na.rm = TRUE)

# Engineer a logical IsMaster column from those row indices.
titanic$IsMaster <- FALSE
titanic[masters, "IsMaster"] <- TRUE

# IsMaster is already logical, so it can be used directly as a row mask.
is.master <- titanic$IsMaster
age.missing <- is.na(titanic$Age)

# Masters with a missing age get the masters' median.
titanic[is.master & age.missing, "Age"] <- median.masters

# Everyone still missing an age gets the median age of the non-masters.
median.age <- median(titanic[!is.master, "Age"], na.rm = TRUE)
# Recompute the mask: the masters' ages were filled in above.
age.missing <- is.na(titanic$Age)
titanic[age.missing, "Age"] <- median.age

# ---- Convert categorical columns to factors ----
factor.cols <- c("Survived", "Pclass", "Sex", "Embarked", "IsMaster")
titanic[factor.cols] <- lapply(titanic[factor.cols], as.factor)