Kaggle Titanic Dataset Machine Learning Model

In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold, cross_val_score, cross_val_predict, RandomizedSearchCV
from sklearn.metrics import confusion_matrix

Import Data

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
both = [train,test]
train.columns
Out[3]:
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

Variables Description

PassengerId : Unique ID number for each passenger
Survived : Passenger survived (1) or died (0)
Pclass : Passenger class
Name : Name
Sex : Gender of passenger
Age : Age of passenger
SibSp : Number of siblings/spouses aboard
Parch : Number of parents/children aboard
Ticket : Ticket number
Fare : Amount of money spent on the ticket
Cabin : Cabin type
Embarked : Port where the passenger embarked (C = Cherbourg, Q = Queenstown, S = Southampton)
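A quick peek at the first few rows connects these field descriptions to actual values; a minimal sketch using the train DataFrame loaded above.

In [ ]:
# First few rows of the training data, to tie the field descriptions to the raw values
train.head()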



Summary statistics for the numerical columns

In [4]:
train.describe()
Out[4]:
PassengerId Survived Pclass Age SibSp Parch Fare
count 891.000000 891.000000 891.000000 714.000000 891.000000 891.000000 891.000000
mean 446.000000 0.383838 2.308642 29.699118 0.523008 0.381594 32.204208
std 257.353842 0.486592 0.836071 14.526497 1.102743 0.806057 49.693429
min 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
25% 223.500000 0.000000 2.000000 20.125000 0.000000 0.000000 7.910400
50% 446.000000 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200
75% 668.500000 1.000000 3.000000 38.000000 1.000000 0.000000 31.000000
max 891.000000 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200

Summary statistics for the object (categorical) columns

In [5]:
train.describe(include=['O'])
Out[5]:
Name Sex Ticket Cabin Embarked
count 891 891 891 204 889
unique 891 2 681 147 3
top Braund, Mr. Owen Harris male 347082 B96 B98 S
freq 1 577 7 4 644

Check where there are missing values

In [6]:
train.isnull().sum()
Out[6]:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
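Expressed as a share of the 891 training rows, these counts are easier to judge; a one-line sketch on the same train frame.

In [ ]:
# Missing values as a percentage of rows, largest first
(train.isnull().mean() * 100).sort_values(ascending=False)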

Cabin is missing a tremendous amount of data. I wonder if grouping it by its first letter as a 'Deck' feature may provide insight.

In [54]:
for df in both:
    # Mark missing cabins with 'M', then take the first letter of the cabin as the deck
    df['Cabin'].replace(np.nan, 'M', inplace=True)
    df['Deck'] = df['Cabin'].str[0]
    print(df['Deck'].unique())
['M' 'C' 'E' 'G' 'D' 'A' 'B' 'F' 'T']
['M' 'B' 'E' 'A' 'C' 'D' 'F' 'G']
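To see whether the Deck grouping carries any signal, a quick groupby on the training set gives the survival rate and passenger count per deck (M marks a missing cabin); a minimal sketch.

In [ ]:
# Mean survival rate and passenger count per deck (M = cabin missing)
train.groupby('Deck')['Survived'].agg(['mean', 'count']).sort_values('mean', ascending=False)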

Data Exploration

In [8]:
def bars(data, y, vars, title):
    # One bar chart of mean survival per category, for each variable in vars
    fig, axes = plt.subplots(1, len(vars), figsize=(15, 5))
    for var, ax in zip(vars, axes.reshape(-1)):
        sns.barplot(x=var, y=y, data=data, ax=ax)
        ax.set(title=var + ' Distribution')
    fig.suptitle(title)
    fig.tight_layout()
    plt.show()

Plot bar charts of the survival rate by different categories

In [9]:
vars = ['Sex','Pclass','Embarked','Deck']
bars(data=train,y='Survived',vars=vars,title='Survivability by Different Groups')

Plot the changes in survivability from group to group

In [55]:
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
sns.pointplot(x='Sex', y='Survived', hue='Pclass', data=train, ax=axes[0], ci=None)
axes[0].set_title('Survivability by Sex and Class')
sns.pointplot(x='Sex', y='Survived', hue='Embarked', data=train, ax=axes[1], ci=None)
axes[1].set_title('Survivability by Sex and Embarked Location')
sns.pointplot(x='Pclass', y='Survived', hue='Embarked', data=train, ax=axes[2], ci=None)
axes[2].set_title('Survivability by Class and Embarked Location')
Out[55]:
Text(0.5, 1.0, 'Survivability by Class and Embarked Location')
In [11]:
def hists(data, comp, vars, title):
    # One density histogram (with KDE) per variable, split by the comparison column;
    # histplot's hue argument handles the group split, so no inner loop over groups is needed
    fig, axes = plt.subplots(2, int(round(len(vars) / 2)), figsize=(20, 10))
    for var, ax in zip(vars, axes.reshape(-1)):
        sns.histplot(x=var, hue=comp, data=data, ax=ax, stat='density', kde=True)
        ax.set(title=var + ' Distribution')
    fig.suptitle(title)
    fig.tight_layout()
    plt.show()

Plot the distributions of the numerical variables for passengers who survived versus those who did not. It is notable that younger people were more likely to survive, and people who paid more for their ticket were more likely to survive as well.

In [12]:
hist_vars = ['Age', 'SibSp', 'Parch', 'Fare']
hists(data=train, comp = 'Survived',vars=hist_vars,title='')
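To put numbers on the observation above, compare the average Age and Fare of survivors and non-survivors in the training set; a minimal sketch.

In [ ]:
# Mean Age and Fare for non-survivors (0) vs. survivors (1)
train.groupby('Survived')[['Age', 'Fare']].mean()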

The correlation matrix shows that Age, SibSp, and Parch are not very correlated with Survived

In [36]:
plt.figure(figsize=(15,6))
sns.heatmap(train.drop('PassengerId',axis=1).corr(), square=True, annot=True)
#For the variables that are less than 0.1 correlated with 'Survived', we can create age bands to make Age a more useful variable
print(pd.cut(train['Age'], 5).unique())
[(16.336, 32.252], (32.252, 48.168], NaN, (48.168, 64.084], (0.34, 16.336], (64.084, 80.0]]
Categories (5, interval[float64, right]): [(0.34, 16.336] < (16.336, 32.252] < (32.252, 48.168] < (48.168, 64.084] < (64.084, 80.0]]
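The survival rate within each of the five equal-width age bands shows whether the banding is worth keeping; a minimal sketch reusing the same pd.cut call.

In [ ]:
# Survival rate within each of the five equal-width age bands
train.groupby(pd.cut(train['Age'], 5))['Survived'].mean()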

Feature creation

Creating features that will be more highly correlated with survival

In [67]:
trn = pd.read_csv('train.csv')
tst = pd.read_csv('test.csv')
both = [trn,tst]
cont_vars = ['Fare']

for df in both:
    #Fill in blank Embarked with the most common 'Embarked' code
    df['Embarked'] = df['Embarked'].fillna(trn['Embarked'].mode()[0])
    #Create the Title field and sort it into categories: Master, Miss, Mrs, Mr, and Rare
    df['Title'] = df.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)
    df['Title'] = df['Title'].replace(['Lady', 'Countess','Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    df['Title'] = df['Title'].replace('Mlle', 'Miss')
    df['Title'] = df['Title'].replace('Ms', 'Miss')
    df['Title'] = df['Title'].replace('Mme', 'Mrs')
    #Fill in missing ages with a random age from the mean and std of the other ages
    age_avg = df['Age'].mean()
    age_std = df['Age'].std()
    age_null_count = df['Age'].isnull().sum()
    age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_count)
    df.loc[df['Age'].isnull(),'Age']=age_null_random_list
    df['Age'] = df['Age'].astype(int)

    #Creating age brackets
    df['Age_Group']=""
    df.loc[df['Age']<=16,'Age_Group'] = 'Child'
    df.loc[df['Age']>16,'Age_Group'] = 'YA'
    df.loc[df['Age']>32,'Age_Group'] = 'Adult'
    df.loc[df['Age']>48,'Age_Group'] = 'Mid_Age'
    df.loc[df['Age']>64,'Age_Group'] = 'Old'

    #Creating Deck variable: first letter of Cabin, grouped into Top (A-E), Bot (F, G, T), and Mis (missing)
    df['Cabin'].replace(np.nan, 'M', inplace=True)
    df['Deck'] = df['Cabin'].str[0]
    df['Deck'] = df['Deck'].replace({'A': 'Top', 'B': 'Top', 'C': 'Top', 'D': 'Top', 'E': 'Top',
                                     'F': 'Bot', 'G': 'Bot', 'T': 'Bot', 'M': 'Mis'})
    

    #Fill in missing Fare data with median
    df['Fare'] = df['Fare'].fillna(trn['Fare'].median())
    #Add sibling and parents together for total family number
    df['Family']=df['Parch']+df['SibSp']+1
    #Add Alone column if the person was alone
    df['Alone']=0
    df.loc[df['Family']==1,'Alone']=1

#Scale the one continuous variable that we still have (fit on the training set, apply the same scaling to the test set)
scaler = MinMaxScaler(feature_range=(0,1))
trn[cont_vars] = scaler.fit_transform(trn[cont_vars])
tst[cont_vars] = scaler.transform(tst[cont_vars])


#Create dummy variables for our categorical variables
trn = pd.get_dummies(trn, columns = ["Sex","Embarked","Title","Age_Group",'Deck'],prefix=["Sex","Em_type",'Title','Age','Deck'])   
tst = pd.get_dummies(tst, columns = ["Sex","Embarked",'Title',"Age_Group",'Deck'],prefix=["Sex","Em_type",'Title','Age','Deck'])                           
#Drop unused variables
trn.drop(['Parch', 'SibSp', 'Family','Age'], axis=1,inplace=True)
tst.drop(['Parch', 'SibSp', 'Family','Age'], axis=1,inplace=True)


plt.figure(figsize=(22,22))
sns.heatmap(trn.drop('PassengerId',axis=1).corr(), square=True, annot=True)
Out[67]:
<AxesSubplot:>
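Reading values off a heatmap this large is hard; ranking the engineered columns by their absolute correlation with Survived is quicker. A minimal sketch on trn, dropping the remaining object columns first.

In [ ]:
# Rank the engineered features by absolute correlation with Survived
corr = trn.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1).corr()['Survived']
corr.abs().sort_values(ascending=False)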

Model Testing

In [68]:
def test_models(models, X_train, y_train, X_test):
    # Fit each model and report its accuracy on the training set
    # (the Kaggle test set has no labels, so only training accuracy can be shown here)
    for clf in models:
        clf.fit(X_train, y_train)
        acc = round(clf.score(X_train, y_train) * 100, 2)
        print(str(clf), ' score: ', acc)

Test several different classification models and report each one's accuracy on the training set

In [69]:
models = [LogisticRegression(), SVC(), LinearSVC(), KNeighborsClassifier(n_neighbors=3),
          DecisionTreeClassifier(), RandomForestClassifier(n_estimators=100), GaussianNB(),
          Perceptron(max_iter=5, tol=None), SGDClassifier(max_iter=5, tol=None),
          GradientBoostingClassifier(), AdaBoostClassifier()]
X_train = trn.drop(['PassengerId', 'Survived', 'Name', 'Ticket','Cabin'], axis=1)
X_test =  tst.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis =1)
y_train = trn['Survived']

test_models(models = models, X_train=X_train, y_train=y_train, X_test=X_test)
LogisticRegression()  score:  81.93
SVC()  score:  82.94
LinearSVC()  score:  81.37
KNeighborsClassifier(n_neighbors=3)  score:  87.43
DecisionTreeClassifier()  score:  94.61
RandomForestClassifier()  score:  94.5
GaussianNB()  score:  77.22
Perceptron(max_iter=5, tol=None)  score:  79.69
SGDClassifier(max_iter=5, tol=None)  score:  78.56
GradientBoostingClassifier()  score:  88.78
AdaBoostClassifier()  score:  84.62
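Training accuracy flatters high-variance models like the decision tree and random forest, which can nearly memorize the training set. cross_val_score was imported above but not used yet; a sketch of a fairer comparison for two of the leading models.

In [ ]:
# Cross-validated accuracy is a less optimistic estimate than training accuracy
for clf in [RandomForestClassifier(n_estimators=100), GradientBoostingClassifier()]:
    scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy')
    print(clf.__class__.__name__, 'CV accuracy: %.2f%% (+/- %.2f)' % (scores.mean() * 100, scores.std() * 100))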

Hyperparameter tuning

With RandomForestClassifier() being one of the best-scoring models and having several parameters worth tuning, let's start with a randomized search (RandomizedSearchCV).

In [70]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
In [71]:
clf = RandomForestClassifier()
clf_cv = RandomizedSearchCV(estimator = clf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
clf_cv.fit(X_train, y_train)
y_pred = clf_cv.predict(X_test)
acc = round(clf_cv.score(X_train, y_train) * 100, 2)
print(str(clf),' score: ',acc)
print(clf_cv.best_params_)
Fitting 3 folds for each of 100 candidates, totalling 300 fits
RandomForestClassifier()  score:  90.46
{'n_estimators': 1200, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 10, 'bootstrap': False}
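The 90.46% above is again an accuracy on the training data; the search's best_score_ attribute holds the mean 3-fold cross-validation accuracy of the winning parameter set, which is the more honest number to track.

In [ ]:
# Mean cross-validation accuracy of the best parameter combination
print(round(clf_cv.best_score_ * 100, 2))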

With the parameter ranges narrowed down, let's now use GridSearchCV to fine-tune those parameters

In [74]:
from sklearn.model_selection import GridSearchCV

# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [5, 10, 15],
    'max_features': ['sqrt'],
    'min_samples_leaf': [2, 3, 4],
    'min_samples_split': [4, 5, 6],
    'n_estimators': [1000, 1200, 1400]
}
# Create a base model and wrap it in the grid search
clf = RandomForestClassifier()
clf_cv = GridSearchCV(estimator = clf, param_grid = param_grid, cv = 3, verbose=2, n_jobs = -1)
clf_cv.fit(X_train, y_train)
y_pred = clf_cv.predict(X_test)
acc = round(clf_cv.score(X_train, y_train) * 100, 2)
print(str(clf),' score: ',acc)
print(clf_cv.best_params_)
Fitting 3 folds for each of 81 candidates, totalling 243 fits
RandomForestClassifier()  score:  88.78
{'bootstrap': True, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 1000}
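confusion_matrix and cross_val_predict were imported at the top but never used; an out-of-fold confusion matrix for the tuned forest shows which class the remaining errors fall on, without touching the unlabeled test set. A minimal sketch using the grid search's best_estimator_.

In [ ]:
# Out-of-fold predictions on the training data, then the confusion matrix
oof_pred = cross_val_predict(clf_cv.best_estimator_, X_train, y_train, cv=3)
confusion_matrix(y_train, oof_pred)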

Write the predictions to a submission file.

In [75]:
test_results = pd.DataFrame({
        "PassengerId": test["PassengerId"],
        "Survived": y_pred
    })
test_results.to_csv('test_results.csv', index=False)

This process resulted in an accuracy of 77.272% on the test set, which is in the top 20% of the competition, and the top 15% if you remove all of the submissions with perfect scores (most likely cheaters, per Kaggle).