import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold, cross_val_score, cross_val_predict, RandomizedSearchCV
from sklearn.metrics import confusion_matrix
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
both = [train,test]
train.columns
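A quick sanity check on the sizes of the two frames (a minimal sketch):
# Row and column counts for the train and test sets loaded above
print(train.shape, test.shape)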
Variables Description
PassengerId : Unique id number for each passenger
Survived : Passenger survived (1) or died (0)
Pclass : Passenger class
Name : Name
Sex : Gender of passenger
Age : Age of passenger
SibSp : Number of siblings/spouses aboard
Parch : Number of parents/children aboard
Ticket : Ticket number
Fare : Amount of money spent on the ticket
Cabin : Cabin type
Embarked : Port where passenger embarked (C = Cherbourg, Q = Queenstown, S = Southampton)
Summary statistics for the numerical columns
train.describe()
Summary statistics for the object (string) columns
train.describe(include=['O'])
Check where there are missing values
train.isnull().sum()
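Raw counts are harder to compare across columns, so as a quick sketch, the same check as percentages (missing_pct is just a throwaway name for this snippet):
# Share of missing values per column, as a percentage of all rows
missing_pct = train.isnull().mean().mul(100).round(1)
print(missing_pct[missing_pct > 0].sort_values(ascending=False))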
Cabin is missing a tremendous amount of data. I wonder if grouping it as 'Deck' may provide insight
for df in both:
    # Mark missing cabins with 'M', then keep only the deck letter
    df['Cabin'] = df['Cabin'].fillna('M')
    df['Deck'] = df['Cabin'].str[0]
    print(df['Deck'].unique())
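As a quick check on whether the grouping carries any signal, the survival rate per deck letter can be tabulated directly (a minimal sketch using the Deck column just created):
# Survival rate and passenger count per deck letter in the training set
print(train.groupby('Deck')['Survived'].agg(['mean', 'count']))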
def bars(data, y, vars, title):
    # One bar chart per categorical variable, laid out side by side
    fig, axes = plt.subplots(1, len(vars), figsize=(15, 5))
    for var, ax in zip(vars, axes.reshape(-1)):
        sns.barplot(x=var, y=y, data=data, ax=ax)
        ax.set(title=var + ' Distribution')
    fig.suptitle(title)
    fig.tight_layout()
    plt.show()
Plot bar charts of the survival rate by different categories
vars = ['Sex','Pclass','Embarked','Deck']
bars(data=train,y='Survived',vars=vars,title='Survivability by Different Groups')
Plot the changes in survivability from group to group
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
# pointplot already draws point estimates; kind= is a catplot-only argument, so it is dropped here
sns.pointplot(x='Sex', y='Survived', hue='Pclass', data=train, ax=axes[0], ci=None)
axes[0].set_title('Survivability by Sex and Class')
sns.pointplot(x='Sex', y='Survived', hue='Embarked', data=train, ax=axes[1], ci=None)
axes[1].set_title('Survivability by Sex and Embarked Location')
sns.pointplot(x='Pclass', y='Survived', hue='Embarked', data=train, ax=axes[2], ci=None)
axes[2].set_title('Survivability by Class and Embarked Location')
def hists(data, comp, vars, title):
    fig, axes = plt.subplots(2, int(round(len(vars) / 2)), figsize=(20, 10))
    for var, ax in zip(vars, axes.reshape(-1)):
        # hue=comp splits each distribution by group, so a single call per variable suffices
        sns.histplot(x=var, hue=comp, data=data, ax=ax, stat='density', kde=True)
        ax.set(title=var + ' Distribution')
    fig.suptitle(title)
    fig.tight_layout()
    plt.show()
Plot the distributions of the numerical variables for the passengers who survived versus those who did not. It is notable that younger people were more likely to survive, and people who paid more for their ticket were more likely to survive as well.
hist_vars = ['Age', 'SibSp', 'Parch', 'Fare']
hists(data=train, comp = 'Survived',vars=hist_vars,title='')
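A quick numeric check of that reading (a sketch on the raw training frame; medians ignore the missing ages):
# Median age and fare for passengers who died (0) vs. survived (1)
print(train.groupby('Survived')[['Age', 'Fare']].median())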
The correlation matrix shows that Age, SibSp, and Parch are only weakly correlated with Survived
plt.figure(figsize=(15,6))
sns.heatmap(train.drop('PassengerId', axis=1).corr(numeric_only=True), square=True, annot=True)
# Looking at the variables that are less than 0.1 correlated with 'Survived', we can create age bands to make Age a more useful variable
print(pd.cut(train['Age'], 5).unique())
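To see how survival varies across those five equal-width bands (a minimal sketch; these are the cut points the Age_Group brackets below are based on):
# Mean survival rate within each age band
print(train.groupby(pd.cut(train['Age'], 5))['Survived'].mean())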
Creating features that will be more highly correlated with survival
trn = pd.read_csv('train.csv')
tst = pd.read_csv('test.csv')
both = [trn,tst]
cont_vars = ['Fare']
for df in both:
    # Fill in blank Embarked with the most common 'Embarked' code
    df['Embarked'] = df['Embarked'].fillna(trn['Embarked'].mode()[0])
    # Create a Title field and sort it into categories: Master, Miss, Mrs, Mr, and Rare
    df['Title'] = df.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)
    df['Title'] = df['Title'].replace(['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    df['Title'] = df['Title'].replace('Mlle', 'Miss')
    df['Title'] = df['Title'].replace('Ms', 'Miss')
    df['Title'] = df['Title'].replace('Mme', 'Mrs')
    # Fill in missing ages with random draws from mean +/- one std of the known ages
    age_avg = df['Age'].mean()
    age_std = df['Age'].std()
    age_null_count = df['Age'].isnull().sum()
    age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_count)
    df.loc[df['Age'].isnull(), 'Age'] = age_null_random_list
    df['Age'] = df['Age'].astype(int)
    # Creating age brackets (cut points taken from the pd.cut bands above)
    df['Age_Group'] = ""
    df.loc[df['Age'] <= 16, 'Age_Group'] = 'Child'
    df.loc[df['Age'] > 16, 'Age_Group'] = 'YA'
    df.loc[df['Age'] > 32, 'Age_Group'] = 'Adult'
    df.loc[df['Age'] > 48, 'Age_Group'] = 'Mid_Age'
    df.loc[df['Age'] > 64, 'Age_Group'] = 'Old'
    # Creating the Deck variable: A-E are upper decks, F/G/T lower, M marks a missing cabin
    df['Cabin'] = df['Cabin'].fillna('M')
    df['Deck'] = df['Cabin'].str[0]
    df['Deck'] = df['Deck'].replace(['A', 'B', 'C', 'D', 'E'], 'Top')
    df['Deck'] = df['Deck'].replace(['F', 'G', 'T'], 'Bot')
    df['Deck'] = df['Deck'].replace('M', 'Mis')
    # Fill in missing Fare data with the training-set median
    df['Fare'] = df['Fare'].fillna(trn['Fare'].median())
    # Add siblings and parents together (plus the passenger) for total family size
    df['Family'] = df['Parch'] + df['SibSp'] + 1
    # Flag passengers who were travelling alone
    df['Alone'] = 0
    df.loc[df['Family'] == 1, 'Alone'] = 1
# Scale the one continuous variable that we still have
scaler = MinMaxScaler(feature_range=(0, 1))
# Fit on the training set only, then apply the same scaling to the test set
trn[cont_vars] = scaler.fit_transform(trn[cont_vars])
tst[cont_vars] = scaler.transform(tst[cont_vars])
# Create dummy variables for our categorical variables
trn = pd.get_dummies(trn, columns = ["Sex","Embarked","Title","Age_Group",'Deck'],prefix=["Sex","Em_type",'Title','Age','Deck'])
tst = pd.get_dummies(tst, columns = ["Sex","Embarked",'Title',"Age_Group",'Deck'],prefix=["Sex","Em_type",'Title','Age','Deck'])
#Drop unused variables
trn.drop(['Parch', 'SibSp', 'Family','Age'], axis=1,inplace=True)
tst.drop(['Parch', 'SibSp', 'Family','Age'], axis=1,inplace=True)
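Before modeling, it is worth confirming the engineered frames have no remaining gaps (a quick sanity sketch):
# Both frames should report zero missing values after the steps above
print(trn.isnull().sum().sum(), tst.isnull().sum().sum())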
plt.figure(figsize=(22,22))
sns.heatmap(trn.drop('PassengerId', axis=1).corr(numeric_only=True), square=True, annot=True)
def test_models(models, X_train, y_train, X_test):
    for clf in models:
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        # Score on the training set; the test set has no labels to score against
        acc = round(clf.score(X_train, y_train) * 100, 2)
        print(str(clf), ' score: ', acc)
Test multiple classification models and report each one's accuracy on the training set
models = [LogisticRegression(),
          SVC(),
          LinearSVC(),
          KNeighborsClassifier(n_neighbors=3),
          DecisionTreeClassifier(),
          RandomForestClassifier(n_estimators=100),
          GaussianNB(),
          Perceptron(max_iter=5, tol=None),
          SGDClassifier(max_iter=5, tol=None),
          GradientBoostingClassifier(),
          AdaBoostClassifier()]
X_train = trn.drop(['PassengerId', 'Survived', 'Name', 'Ticket','Cabin'], axis=1)
X_test = tst.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis =1)
y_train = trn['Survived']
test_models(models = models, X_train=X_train, y_train=y_train, X_test=X_test)
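Training-set accuracy flatters flexible models such as decision trees, so cross-validated accuracy is a fairer comparison; the KFold and cross_val_score imports at the top were brought in for exactly this kind of check. A minimal sketch for two of the models:
# 5-fold cross-validated accuracy is usually lower, and more honest, than training accuracy
kf = KFold(n_splits=5, shuffle=True, random_state=42)
for clf in [RandomForestClassifier(n_estimators=100), LogisticRegression(max_iter=1000)]:
    scores = cross_val_score(clf, X_train, y_train, cv=kf)
    print(type(clf).__name__, round(scores.mean() * 100, 2))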
With RandomForestClassifier() being one of the best-scoring models and having plenty of parameters to tune, let's start with a randomized search using RandomizedSearchCV.
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['sqrt', 'log2']  # 'auto' was removed from RandomForestClassifier in newer scikit-learn
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
clf = RandomForestClassifier()
clf_cv = RandomizedSearchCV(estimator = clf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
clf_cv.fit(X_train, y_train)
y_pred = clf_cv.predict(X_test)
acc = round(clf_cv.score(X_train, y_train) * 100, 2)
print('Tuned RandomForestClassifier training score: ', acc)
print(clf_cv.best_params_)
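RandomizedSearchCV also stores the mean cross-validated accuracy of the best parameter set, which is a more honest number than the training score above (a quick check):
# Mean cross-validated accuracy of the best parameter combination found
print(round(clf_cv.best_score_ * 100, 2))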
With the parameter ranges narrowed down, let's now run a grid search with GridSearchCV to fine-tune them
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search
param_grid = {
    'bootstrap': [True],
    'max_depth': [5, 10, 15],
    'max_features': ['sqrt'],
    'min_samples_leaf': [2, 3, 4],
    'min_samples_split': [4, 5, 6],
    'n_estimators': [1000, 1200, 1400]
}
# Create the base model and run the grid search over the parameter grid above
clf = RandomForestClassifier()
clf_cv = GridSearchCV(estimator=clf, param_grid=param_grid, cv=3, verbose=2, n_jobs=-1)
clf_cv.fit(X_train, y_train)
y_pred = clf_cv.predict(X_test)
acc = round(clf_cv.score(X_train, y_train) * 100, 2)
print('Grid-searched RandomForestClassifier training score: ', acc)
print(clf_cv.best_params_)
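The cross_val_predict and confusion_matrix imports at the top were never used above; a short sketch of how the tuned model's errors break down on out-of-fold predictions (oof_pred is just a throwaway name):
# Out-of-fold predictions from the tuned model, then the confusion matrix
oof_pred = cross_val_predict(clf_cv.best_estimator_, X_train, y_train, cv=3)
print(confusion_matrix(y_train, oof_pred))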
Write the predictions to a submission file.
test_results = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Survived": y_pred
})
test_results.to_csv('test_results.csv', index=False)
This process resulted in an accuracy of 77.272% on the test set, which is in the top 20% of the competition, and the top 15% if you remove all of the entries with perfect scores (most likely cheaters, per Kaggle).