# Accessing and reading the titanic dataset.
# 1. Set your working directory to the folder where your titanic data is located.
# 2. Read the titanic data and set stringsAsFactors to FALSE.
# (1) Point the R session at the folder that holds the data file.
setwd(dir = "C:/Data")
# (2) Load the CSV; stringsAsFactors = FALSE keeps text columns as character.
titanic <- read.csv(file = "titanic.csv", stringsAsFactors = FALSE)
# Exploring the titanic data
# 3. Check the last three rows of the titanic dataset.
# (3) Show the final three rows of the data frame.
tail(x = titanic, n = 3)
# Exploring the Survived variable
# 4. Use the table command to see the total number of survived and dead.
# 5. Draw a pie chart of the survived and dead counts.
# 6. Next, calculate the proportion of dead and survived in the titanic dataset.
# Tabulate Survived once and reuse the result for all three views.
survival_counts <- table(titanic[["Survived"]])
survival_counts               # (4) count per Survived value
pie(survival_counts)          # (5) the same counts as a pie chart
prop.table(survival_counts)   # (6) each count as a share of the total
# Factorization
# 7. Convert the Survived variable into a categorical variable.
# 8. Next, convert the Sex variable into a categorical variable.
# (7), (8) Convert Survived and Sex to factors in a single pass.
factor_columns <- c("Survived", "Sex")
titanic[factor_columns] <- lapply(titanic[factor_columns], as.factor)
# Segmented table of Survived and Pclass
# 9. Next, check survival in the titanic dataset by Pclass.
# Cross-tabulate Survived (rows) against Pclass (columns).
survival_by_class <- table(titanic$Survived, titanic$Pclass)
survival_by_class

# Each cell as a share of all passengers.
prop.table(survival_by_class)

# Each cell as a share of its own class (every column sums to 1). Computing
# this from the table keeps the within-class rates in step with the data
# instead of relying on proportions copied by hand from an earlier run.
prop.table(survival_by_class, margin = 2)
# ggplot2 provides ggplot(), aes(), the geoms and the facets used in the
# plots; it has to be attached before the first ggplot() call.
library(ggplot2)

# Age boxplots segmented by Survived
ggplot(titanic, aes(x = Survived, y = Age)) +
  geom_boxplot()

# Density plot of age, segmented by Survived
age_density <- ggplot(titanic, aes(x = Age)) +
  geom_density(aes(color = Survived))
age_density

# Segmenting various features against one another
age_density + facet_wrap(~Pclass)
age_density + facet_wrap(~Sex)
age_density + facet_grid(Pclass ~ Sex)
tail(titanic, 2)
ggplot(titanic, aes(x = SibSp)) +
  geom_density(aes(color = Survived))
ggplot(titanic, aes(x = Parch)) +
  geom_density(aes(color = Survived))
ggplot(titanic, aes(x = Fare)) +
  geom_density(aes(color = Survived))
ggplot(titanic, aes(x = Fare)) +
  geom_density(aes(color = Survived)) +
  facet_wrap(~Pclass)
# Fill in missing values of Embarked
table(titanic$Embarked)
titanic$Embarked <- as.character(titanic$Embarked)

# which() keeps only the rows where the comparison is TRUE, so an NA in
# Embarked cannot end up in the row index used for the assignment below.
embarked_blank <- which(titanic$Embarked == "")
titanic[embarked_blank, "Embarked"]

# NOTE(review): "S" is presumably the most frequent port in the table
# printed above -- confirm before relying on this fill value.
titanic[embarked_blank, "Embarked"] <- "S"

titanic$Embarked <- as.factor(titanic$Embarked)
table(titanic$Embarked)
prop.table(table(titanic$Embarked))
# Seeing if fare is a combination of Pclass and embarkation port
# All three plots share one panel layout: Embarked in rows, Pclass in columns.
fare_panels <- facet_grid(Embarked ~ Pclass)
fare_base <- ggplot(titanic, aes(x = Fare))

# Fare density, then Fare boxplots, both coloured by Survived.
fare_base + geom_density(aes(color = Survived)) + fare_panels
fare_base + geom_boxplot(aes(color = Survived)) + fare_panels

# Fare boxplots with Survived on the x axis.
ggplot(titanic, aes(x = Survived, y = Fare)) +
  geom_boxplot() +
  fare_panels
# Filtering outlier fares out, visualizing fare segmented
# which() keeps only rows where Fare < 100 is TRUE; a bare logical index
# would turn every row with a missing Fare into an all-NA row.
titanic.lowfare <- titanic[which(titanic$Fare < 100), ]
ggplot(titanic.lowfare, aes(x = Survived, y = Fare)) +
  geom_boxplot() +
  facet_grid(Embarked ~ Pclass)
# Exploring name
titanic$Name

# Return the values of `column` for the passengers whose Name matches the
# regular expression `pattern`, ignoring case.
lookup_by_name <- function(pattern, column) {
  titanic[grep(pattern, titanic$Name, ignore.case = TRUE), column]
}

# Looking up reverends
lookup_by_name("Rev\\.", "Name")

# Looking up if reverends survived
lookup_by_name("Rev\\.", "Survived")

# If masters survived
table(lookup_by_name("master\\.", "Survived"))

# How old were masters?
table(lookup_by_name("master\\.", "Age"))

# What were the genders of masters?
table(lookup_by_name("master\\.", "Sex"))

# Did doctors survive?
table(lookup_by_name("dr\\.", "Survived"))

# Did the captain live?
table(lookup_by_name("capt\\.", "Survived"))

# Did rose live? (matches "rose" anywhere in the name)
table(lookup_by_name("rose", "Survived"))
# Explore the density plot of age, to explore the ramifications of the cleaning function
# Age density with one curve per Survived value.
ggplot(data = titanic, mapping = aes(x = Age)) +
  geom_density(mapping = aes(color = Survived))
# Bringing in an external dataset to predict missing values of gender from first names
# Attach the gender package and try its lookup on two sample first names.
library(gender)
gender(names = "mercia")
gender(names = "pat")
# Build a new column that marks where age is missing
# TRUE for rows whose Age is NA, FALSE otherwise; the mask is kept in
# age.missing and also stored as a column.
age.missing <- is.na(titanic$Age)
titanic$AgeIsMissing <- age.missing
# Clean missing values of age with the median
# Median of the observed ages (NA values excluded from the calculation).
age.median <- median(titanic$Age, na.rm = TRUE)
# Write that median into every row flagged in age.missing.
titanic$Age[age.missing] <- age.median