Introduction to R with the Titanic dataset

Accessing and reading the titanic dataset.

1. Set your working directory to the folder where your titanic data is located.
2. Read the titanic data and set stringsAsFactors to FALSE.

# (1) Make the folder that holds the data file the working directory.
setwd("C:/Data")

# (2) Load the CSV; text columns stay character vectors instead of factors.
titanic <- read.csv(file = "titanic.csv", stringsAsFactors = FALSE)

Exploring the titanic data

3. Check the last three rows of the titanic dataset.

# (3) Print the final three rows of the data frame.
tail(titanic, n = 3)

Exploring the Survived variable

4. Use the table command to see the total number of survived and dead.
5. Draw the same counts as a pie chart with the pie command.
6. Next, calculate the proportion of dead and survived in the titanic dataset.

# (4) Count how many rows carry each value of Survived.
table(titanic[["Survived"]])

# (5) Draw those counts as a pie chart.
pie(table(titanic[["Survived"]]))

# (6) Rescale the counts to proportions that sum to 1.
prop.table(table(titanic[["Survived"]]))

Factorization

7. We are going to convert the Survived variable into a categorical variable.
8. Next, convert the Sex variable into a categorical variable.

# (7), (8) Convert Survived and Sex to factors in a single pass so that later
# tables and plots treat them as categorical variables.
titanic[c("Survived", "Sex")] <- lapply(
  titanic[c("Survived", "Sex")],
  as.factor
)

Segmented table of survived and pclass


9. Next, check survival in the titanic dataset by Pclass.
# Cross-tabulate Survived (rows) against Pclass (columns).
survival_by_class <- table(titanic$Survived, titanic$Pclass)
survival_by_class

# Share of all passengers that falls into each Survived x Pclass cell.
prop.table(survival_by_class)

# Within-class proportions: each cell divided by its column (Pclass) total, so
# every column sums to 1. This replaces the earlier hand-copied cell
# proportions that were divided by their column totals manually; computing the
# ratios from the table keeps them correct if the data changes.
prop.table(survival_by_class, margin = 2)

age boxplots segmented by survived

# ggplot2 supplies ggplot(), aes() and the geom_*/facet_* layers used from this
# point on; it has to be attached before the first plot call.
library(ggplot2)

# One box of Age per level of Survived.
ggplot(titanic, aes(x = Survived, y = Age)) +
  geom_boxplot()

density plot of age, segmented by survived

# Density curve of Age, one coloured line per level of Survived.
ggplot(data = titanic, mapping = aes(x = Age)) +
  geom_density(mapping = aes(color = Survived))

segmenting various features against one another

# Age density by Survived, one panel per passenger class.
ggplot(data = titanic, mapping = aes(x = Age)) +
  geom_density(mapping = aes(color = Survived)) +
  facet_wrap(~Pclass)

# Age density by Survived, one panel per sex.
ggplot(data = titanic, mapping = aes(x = Age)) +
  geom_density(mapping = aes(color = Survived)) +
  facet_wrap(~Sex)

# Age density by Survived on a grid: Pclass in rows, Sex in columns.
ggplot(data = titanic, mapping = aes(x = Age)) +
  geom_density(mapping = aes(color = Survived)) +
  facet_grid(Pclass~Sex)

# Peek at the last two rows before moving on to the other columns.
tail(titanic, n = 2)

# Density of SibSp by Survived.
ggplot(data = titanic, mapping = aes(x = SibSp)) +
  geom_density(mapping = aes(color = Survived))

# Density of Parch by Survived.
ggplot(data = titanic, mapping = aes(x = Parch)) +
  geom_density(mapping = aes(color = Survived))

# Density of Fare by Survived.
ggplot(data = titanic, mapping = aes(x = Fare)) +
  geom_density(mapping = aes(color = Survived))

# Density of Fare by Survived, one panel per passenger class.
ggplot(data = titanic, mapping = aes(x = Fare)) +
  geom_density(mapping = aes(color = Survived)) +
  facet_wrap(~Pclass)

fill in missing values of embarked

# Frequency of each embarkation code before cleaning; blank entries are
# counted under "".
table(titanic$Embarked)

# Work on plain strings so a replacement value can be assigned freely.
titanic$Embarked <- as.character(titanic$Embarked)

# Rows whose port is blank or NA. The mask is built once and is NA-safe: an NA
# in a logical row index would make the data-frame assignment below fail.
embarked_missing <- is.na(titanic$Embarked) | titanic$Embarked == ""

# Show the entries that are about to be replaced.
titanic[embarked_missing, "Embarked"]

# Fill the gaps with "S".
# NOTE(review): presumably chosen as the most frequent port in the table
# printed above; confirm against that output.
titanic[embarked_missing, "Embarked"] <- "S"

# Back to a factor, then re-check counts and proportions after the fill.
titanic$Embarked <- as.factor(titanic$Embarked)
table(titanic$Embarked)
prop.table(table(titanic$Embarked))

seeing if fare is a combination of pclass, and embarkation port

# Fare density by Survived on a grid: Embarked in rows, Pclass in columns.
ggplot(data = titanic, mapping = aes(x = Fare)) +
  geom_density(mapping = aes(color = Survived)) +
  facet_grid(Embarked~Pclass)

# Same grid, with box plots of Fare coloured by Survived.
ggplot(data = titanic, mapping = aes(x = Fare)) +
  geom_boxplot(mapping = aes(color = Survived)) +
  facet_grid(Embarked~Pclass)

# Same grid, with Survived on the x axis and Fare on the y axis.
ggplot(data = titanic, mapping = aes(x = Survived, y = Fare)) +
  geom_boxplot() +
  facet_grid(Embarked~Pclass)

filtering outlier fares out, visualizing fare segmented

# Keep only the rows with a fare below 100 so the extreme fares do not
# compress the rest of the plot.
titanic.lowfare <- titanic[titanic[["Fare"]] < 100, ]

# Fare by Survived for the filtered rows: Embarked in rows, Pclass in columns.
ggplot(data = titanic.lowfare, mapping = aes(x = Survived, y = Fare)) +
  geom_boxplot() +
  facet_grid(Embarked~Pclass)

exploring name

# Print every passenger name.
titanic[["Name"]]

looking up reverends

# Names containing the title "Rev." (case-insensitive match).
titanic$Name[grepl("Rev\\.", titanic$Name, ignore.case = TRUE)]

looking up if reverends survived

# Survived values for the passengers whose name contains "Rev.".
titanic$Survived[grepl("Rev\\.", titanic$Name, ignore.case = TRUE)]

if masters survived

# Survival counts for passengers whose name contains the title "Master."
# (case-insensitive). The stray trailing backslash was removed because it made
# this line a parse error.
table(titanic[grep("master\\.", titanic$Name, ignore.case = TRUE), "Survived"])

how old were masters?

# Age distribution of passengers whose name contains "Master.".
table(titanic$Age[grepl("master\\.", titanic$Name, ignore.case = TRUE)])

what were the genders of masters?

# Sex counts of passengers whose name contains "Master.".
table(titanic$Sex[grepl("master\\.", titanic$Name, ignore.case = TRUE)])

did doctors survive?

# Survival counts for names matching "dr." (case-insensitive).
table(titanic$Survived[grepl("dr\\.", titanic$Name, ignore.case = TRUE)])

did the captain live?

# Survival counts for names matching "capt." (case-insensitive).
table(titanic$Survived[grepl("capt\\.", titanic$Name, ignore.case = TRUE)])

did rose live?

# Survival counts for names containing "rose" anywhere (case-insensitive).
table(titanic$Survived[grepl("rose", titanic$Name, ignore.case = TRUE)])

explore the density plot of age, to explore cleaning function ramifications

# Age density by Survived, drawn before Age is modified further down.
ggplot(data = titanic, mapping = aes(x = Age)) +
  geom_density(mapping = aes(color = Survived))

bringing in an external dataset to predict missing values of gender from first names

# Attach the gender package and look up two sample first names.
library(gender)

gender(names = "mercia")
gender(names = "pat")

build a new column that marks the rows where age is missing

# Logical mask of rows with no recorded Age; it is reused by the imputation
# step further down.
age.missing <- is.na(titanic$Age)

# Keep the mask as a column so the original missingness survives the fill.
titanic$AgeIsMissing <- age.missing

clean missing values of age with the median

# Median of the recorded ages; NA values are ignored.
age.median <- median(titanic$Age, na.rm = TRUE)

# Overwrite the missing ages (flagged in age.missing) with that median.
titanic$Age <- replace(titanic$Age, age.missing, age.median)