Kaggle Titanic Dataset Machine Learning Model

In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold, cross_val_score, cross_val_predict, RandomizedSearchCV
from sklearn.metrics import confusion_matrix

Import Data

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
both = [train,test]
train.columns
Out[3]:
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

Variables Description

PassengerId : Unique ID number for each passenger
Survived : Passenger survived (1) or died (0)
Pclass : Passenger class
Name : Name
Sex : Gender of passenger
Age : Age of passenger
SibSp : Number of siblings/spouses aboard
Parch : Number of parents/children aboard
Ticket : Ticket number
Fare : Amount of money spent on the ticket
Cabin : Cabin type
Embarked : Port where the passenger embarked (C = Cherbourg, Q = Queenstown, S = Southampton)
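A quick peek at the first few rows connects these field descriptions to actual values; a minimal sketch using the train DataFrame loaded above.

In [ ]:
# First few rows of the training data, to tie the field descriptions to the raw values
train.head()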



Summary statistics for the numerical columns

In [4]:
train.describe()
Out[4]:
PassengerId Survived Pclass Age SibSp Parch Fare
count 891.000000 891.000000 891.000000 714.000000 891.000000 891.000000 891.000000
mean 446.000000 0.383838 2.308642 29.699118 0.523008 0.381594 32.204208
std 257.353842 0.486592 0.836071 14.526497 1.102743 0.806057 49.693429
min 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
25% 223.500000 0.000000 2.000000 20.125000 0.000000 0.000000 7.910400
50% 446.000000 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200
75% 668.500000 1.000000 3.000000 38.000000 1.000000 0.000000 31.000000
max 891.000000 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200

Summary statistics for the object (categorical) columns

In [5]:
train.describe(include=['O'])
Out[5]:
Name Sex Ticket Cabin Embarked
count 891 891 891 204 889
unique 891 2 681 147 3
top Braund, Mr. Owen Harris male 347082 B96 B98 S
freq 1 577 7 4 644

Check where there are missing values

In [6]:
train.isnull().sum()
Out[6]:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
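Expressed as a share of the 891 training rows, these counts are easier to judge; a one-line sketch on the same train frame.

In [ ]:
# Missing values as a percentage of rows, largest first
(train.isnull().mean() * 100).sort_values(ascending=False)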

Cabin is missing a tremendous amount of data. I wonder if grouping it by its first letter as a 'Deck' feature may provide insight.

In [54]:
for df in both:
    # Mark missing cabins with 'M', then take the first letter of the cabin as the deck
    df['Cabin'].replace(np.nan, 'M', inplace=True)
    df['Deck'] = df['Cabin'].str[0]
    print(df['Deck'].unique())
['M' 'C' 'E' 'G' 'D' 'A' 'B' 'F' 'T']
['M' 'B' 'E' 'A' 'C' 'D' 'F' 'G']
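To see whether the Deck grouping carries any signal, a quick groupby on the training set gives the survival rate and passenger count per deck (M marks a missing cabin); a minimal sketch.

In [ ]:
# Mean survival rate and passenger count per deck (M = cabin missing)
train.groupby('Deck')['Survived'].agg(['mean', 'count']).sort_values('mean', ascending=False)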

Data Exploration

In [8]:
def bars(data, y, vars, title):
    # One bar chart of mean survival per category, for each variable in vars
    fig, axes = plt.subplots(1, len(vars), figsize=(15, 5))
    for var, ax in zip(vars, axes.reshape(-1)):
        sns.barplot(x=var, y=y, data=data, ax=ax)
        ax.set(title=var + ' Distribution')
    fig.suptitle(title)
    fig.tight_layout()
    plt.show()

Plot bar charts of the survival rate by different categories

In [9]:
vars = ['Sex','Pclass','Embarked','Deck']
bars(data=train,y='Survived',vars=vars,title='Survivability by Different Groups')

Plot the changes in survivability from group to group

In [55]:
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
sns.pointplot(x='Sex', y='Survived', hue='Pclass', data=train, ax=axes[0], ci=None)
axes[0].set_title('Survivability by Sex and Class')
sns.pointplot(x='Sex', y='Survived', hue='Embarked', data=train, ax=axes[1], ci=None)
axes[1].set_title('Survivability by Sex and Embarked Location')
sns.pointplot(x='Pclass', y='Survived', hue='Embarked', data=train, ax=axes[2], ci=None)
axes[2].set_title('Survivability by Class and Embarked Location')
Out[55]:
Text(0.5, 1.0, 'Survivability by Class and Embarked Location')
In [11]:
def hists(data, comp, vars, title):
    # One density histogram (with KDE) per variable, split by the comparison column;
    # histplot's hue argument handles the group split, so no inner loop over groups is needed
    fig, axes = plt.subplots(2, int(round(len(vars) / 2)), figsize=(20, 10))
    for var, ax in zip(vars, axes.reshape(-1)):
        sns.histplot(x=var, hue=comp, data=data, ax=ax, stat='density', kde=True)
        ax.set(title=var + ' Distribution')
    fig.suptitle(title)
    fig.tight_layout()
    plt.show()

Plot the distributions of the numerical variables for passengers who survived versus those who did not. It is notable that younger people were more likely to survive, and people who paid more for their ticket were more likely to survive as well.

In [12]:
hist_vars = ['Age', 'SibSp', 'Parch', 'Fare']
hists(data=train, comp = 'Survived',vars=hist_vars,title='')
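To put numbers on the observation above, compare the average Age and Fare of survivors and non-survivors in the training set; a minimal sketch.

In [ ]:
# Mean Age and Fare for non-survivors (0) vs. survivors (1)
train.groupby('Survived')[['Age', 'Fare']].mean()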

The correlation matrix shows that Age, SibSp, and Parch are not very correlated with Survived

In [36]:
plt.figure(figsize=(15,6))
sns.heatmap(train.drop('PassengerId',axis=1).corr(), square=True, annot=True)
#For the variables that are less than 0.1 correlated with 'Survived', we can create age bands to make Age a more useful variable
print(pd.cut(train['Age'], 5).unique())
[(16.336, 32.252], (32.252, 48.168], NaN, (48.168, 64.084], (0.34, 16.336], (64.084, 80.0]]
Categories (5, interval[float64, right]): [(0.34, 16.336] < (16.336, 32.252] < (32.252, 48.168] < (48.168, 64.084] < (64.084, 80.0]]
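The survival rate within each of the five equal-width age bands shows whether the banding is worth keeping; a minimal sketch reusing the same pd.cut call.

In [ ]:
# Survival rate within each of the five equal-width age bands
train.groupby(pd.cut(train['Age'], 5))['Survived'].mean()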

Feature creation

Creating features that will be more highly correlated with survival

In [67]:
trn = pd.read_csv('train.csv')
tst = pd.read_csv('test.csv')
both = [trn,tst]
cont_vars = ['Fare']

for df in both:
    #Fill in blank Embarked with the most common 'Embarked' code
    df['Embarked'] = df['Embarked'].fillna(trn['Embarked'].mode()[0])
    #Create the Title field and sort it into categories: Master, Miss, Mrs, Mr, and Rare
    df['Title'] = df.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)
    df['Title'] = df['Title'].replace(['Lady', 'Countess','Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    df['Title'] = df['Title'].replace('Mlle', 'Miss')
    df['Title'] = df['Title'].replace('Ms', 'Miss')
    df['Title'] = df['Title'].replace('Mme', 'Mrs')
    #Fill in missing ages with a random age from the mean and std of the other ages
    age_avg = df['Age'].mean()
    age_std = df['Age'].std()
    age_null_count = df['Age'].isnull().sum()
    age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_count)
    df.loc[df['Age'].isnull(),'Age']=age_null_random_list
    df['Age'] = df['Age'].astype(int)

    #Creating age brackets
    df['Age_Group']=""
    df.loc[df['Age']<=16,'Age_Group'] = 'Child'
    df.loc[df['Age']>16,'Age_Group'] = 'YA'
    df.loc[df['Age']>32,'Age_Group'] = 'Adult'
    df.loc[df['Age']>48,'Age_Group'] = 'Mid_Age'
    df.loc[df['Age']>64,'Age_Group'] = 'Old'

    #Creating Deck variable: first letter of Cabin, grouped into Top (A-E), Bot (F, G, T), and Mis (missing)
    df['Cabin'].replace(np.nan, 'M', inplace=True)
    df['Deck'] = df['Cabin'].str[0]
    df['Deck'] = df['Deck'].replace({'A': 'Top', 'B': 'Top', 'C': 'Top', 'D': 'Top', 'E': 'Top',
                                     'F': 'Bot', 'G': 'Bot', 'T': 'Bot', 'M': 'Mis'})
    

    #Fill in missing Fare data with median
    df['Fare'] = df['Fare'].fillna(trn['Fare'].median())
    #Add sibling and parents together for total family number
    df['Family']=df['Parch']+df['SibSp']+1
    #Add Alone column if the person was alone
    df['Alone']=0
    df.loc[df['Family']==1,'Alone']=1

#Scale the one continuous variable that we still have (fit on the training set, apply the same scaling to the test set)
scaler = MinMaxScaler(feature_range=(0,1))
trn[cont_vars] = scaler.fit_transform(trn[cont_vars])
tst[cont_vars] = scaler.transform(tst[cont_vars])


#Create dummy variables for our categorical variables
trn = pd.get_dummies(trn, columns = ["Sex","Embarked","Title","Age_Group",'Deck'],prefix=["Sex","Em_type",'Title','Age','Deck'])   
tst = pd.get_dummies(tst, columns = ["Sex","Embarked",'Title',"Age_Group",'Deck'],prefix=["Sex","Em_type",'Title','Age','Deck'])                           
#Drop unused variables
trn.drop(['Parch', 'SibSp', 'Family','Age'], axis=1,inplace=True)
tst.drop(['Parch', 'SibSp', 'Family','Age'], axis=1,inplace=True)


plt.figure(figsize=(22,22))
sns.heatmap(trn.drop('PassengerId',axis=1).corr(), square=True, annot=True)
Out[67]:
<AxesSubplot:>
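Reading values off a heatmap this large is hard; ranking the engineered columns by their absolute correlation with Survived is quicker. A minimal sketch on trn, dropping the remaining object columns first.

In [ ]:
# Rank the engineered features by absolute correlation with Survived
corr = trn.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1).corr()['Survived']
corr.abs().sort_values(ascending=False)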

Model Testing

In [68]:
def test_models(models, X_train, y_train, X_test):
    # Fit each model and report its accuracy on the training set
    # (the Kaggle test set has no labels, so only training accuracy can be shown here)
    for clf in models:
        clf.fit(X_train, y_train)
        acc = round(clf.score(X_train, y_train) * 100, 2)
        print(str(clf), ' score: ', acc)

Test several different classification models and report each one's accuracy on the training set

In [69]:
models = [LogisticRegression(), SVC(), LinearSVC(), KNeighborsClassifier(n_neighbors=3),
          DecisionTreeClassifier(), RandomForestClassifier(n_estimators=100), GaussianNB(),
          Perceptron(max_iter=5, tol=None), SGDClassifier(max_iter=5, tol=None),
          GradientBoostingClassifier(), AdaBoostClassifier()]
X_train = trn.drop(['PassengerId', 'Survived', 'Name', 'Ticket','Cabin'], axis=1)
X_test =  tst.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis =1)
y_train = trn['Survived']

test_models(models = models, X_train=X_train, y_train=y_train, X_test=X_test)
LogisticRegression()  score:  81.93
SVC()  score:  82.94
LinearSVC()  score:  81.37
KNeighborsClassifier(n_neighbors=3)  score:  87.43
DecisionTreeClassifier()  score:  94.61
RandomForestClassifier()  score:  94.5
GaussianNB()  score:  77.22
Perceptron(max_iter=5, tol=None)  score:  79.69
SGDClassifier(max_iter=5, tol=None)  score:  78.56
GradientBoostingClassifier()  score:  88.78
AdaBoostClassifier()  score:  84.62
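Training accuracy flatters high-variance models like the decision tree and random forest, which can nearly memorize the training set. cross_val_score was imported above but not used yet; a sketch of a fairer comparison for two of the leading models.

In [ ]:
# Cross-validated accuracy is a less optimistic estimate than training accuracy
for clf in [RandomForestClassifier(n_estimators=100), GradientBoostingClassifier()]:
    scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
    print(clf.__class__.__name__, 'CV accuracy: %.2f%% (+/- %.2f)' % (scores.mean() * 100, scores.std() * 100))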

Hyperparameter tuning

With RandomForestClassifier() being one of the best-scoring models and having several parameters worth tuning, let's start with a randomized search (RandomizedSearchCV).

In [70]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
In [71]:
clf = RandomForestClassifier()
clf_cv = RandomizedSearchCV(estimator = clf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
clf_cv.fit(X_train, y_train)
y_pred = clf_cv.predict(X_test)
acc = round(clf_cv.score(X_train, y_train) * 100, 2)
print(str(clf),' score: ',acc)
print(clf_cv.best_params_)
Fitting 3 folds for each of 100 candidates, totalling 300 fits
RandomForestClassifier()  score:  90.46
{'n_estimators': 1200, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 10, 'bootstrap': False}
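The 90.46% above is again an accuracy on the training data; the search's best_score_ attribute holds the mean 3-fold cross-validation accuracy of the winning parameter set, which is the more honest number to track.

In [ ]:
# Mean cross-validation accuracy of the best parameter combination
print(round(clf_cv.best_score_ * 100, 2))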

With the parameter ranges narrowed down, let's now use GridSearchCV to fine-tune those parameters

In [74]:
from sklearn.model_selection import GridSearchCV

# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [5, 10, 15],
    'max_features': ['sqrt'],
    'min_samples_leaf': [2, 3, 4],
    'min_samples_split': [4, 5, 6],
    'n_estimators': [1000, 1200, 1400]
}
# Create a base model and wrap it in the grid search
clf = RandomForestClassifier()
clf_cv = GridSearchCV(estimator = clf, param_grid = param_grid, cv = 3, verbose=2, n_jobs = -1)
clf_cv.fit(X_train, y_train)
y_pred = clf_cv.predict(X_test)
acc = round(clf_cv.score(X_train, y_train) * 100, 2)
print(str(clf),' score: ',acc)
print(clf_cv.best_params_)
Fitting 3 folds for each of 81 candidates, totalling 243 fits
RandomForestClassifier()  score:  88.78
{'bootstrap': True, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 1000}
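confusion_matrix and cross_val_predict were imported at the top but never used; an out-of-fold confusion matrix for the tuned forest shows which class the remaining errors fall on, without touching the unlabeled test set. A minimal sketch using the grid search's best_estimator_.

In [ ]:
# Out-of-fold predictions on the training data, then the confusion matrix
oof_pred = cross_val_predict(clf_cv.best_estimator_, X_train, y_train, cv=3)
confusion_matrix(y_train, oof_pred)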

Write the predictions to a submission file.

In [75]:
test_results = pd.DataFrame({
        "PassengerId": test["PassengerId"],
        "Survived": y_pred
    })
test_results.to_csv('test_results.csv', index=False)

This process resulted in an accuracy of 77.272% on the test set, which is in the top 20% of the competition, and the top 15% if you remove all of the submissions with perfect scores (most likely cheaters, per Kaggle).