Resources
Let's Build a Web App to Design Titanic Passengers - Walk Through of Chapter 3 from My Book Monetizing Machine Learning
Let's build an interactive web application to design fictional Titanic passengers and see how they would have fared. The approach I describe in this book is all about extending Python data science models into fully interactive web applications. No native mobile programming, no permissions, and no complicated Jupyter or Tableau knowledge is needed for the world to enjoy your work.
Surviving the Titanic Shipwreck - Experiment with Passenger Profiles
Note: you can install the requirements file for this notebook if you want to install all the libraries at once. Open a new terminal window, navigate to the folder with the "requirements_jupyter.txt" and run the following command:
pip3 install -r requirements_jupyter.txt
#!/usr/bin/env python
import matplotlib.pyplot as plt; plt.rcdefaults()
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from numpy import genfromtxt
from sklearn.feature_selection import RFE
# Fetch the Titanic passenger list from the Vanderbilt University Department
# of Biostatistics servers. If the download fails, try fetching the file
# manually. For an SSL error on a Mac, try running
# "/Applications/Python\ 3.6/Install\ Certificates.command".
titanic_url = 'http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.csv'
titanic_df = pd.read_csv(titanic_url)

# preview the first rows
titanic_df.head()

# number of passengers per port of embarkation
titanic_df['embarked'].value_counts()
Data Exploration
# feature data types
titanic_df.info()

# summary of quantitative data
titanic_df.describe()

# Share of missing values per column. The ratio is scaled by 100 so the
# values really are percentages, as the column label says (the unscaled
# ratio is a fraction between 0 and 1).
titanic_missing_count = titanic_df.isnull().sum().sort_values(ascending=False)
pd.DataFrame({'Percent Missing': titanic_missing_count / len(titanic_df) * 100})

titanic_df['cabin'].head()

# ten most frequent raw cabin values
titanic_feature_count = titanic_df.groupby('cabin')['cabin'].count().reset_index(name="Group_Count")
titanic_feature_count.sort_values('Group_Count', ascending=False).head(10)

# ten most frequent passenger names
titanic_feature_count = titanic_df.groupby('name')['name'].count().reset_index(name="Group_Count")
titanic_feature_count.sort_values('Group_Count', ascending=False).head(10)
Data Preparation
# Reduce each cabin number to its first character; passengers without a
# recorded cabin get the placeholder 'U', which is then spelled out as
# 'Unknown'. fillna() replaces the np.NaN alias, which NumPy 2.0 removed.
titanic_df['cabin'] = titanic_df['cabin'].fillna('U').str[0].replace('U', 'Unknown')
titanic_df['cabin'].head()

titanic_feature_count = titanic_df.groupby('cabin')['cabin'].count().reset_index(name="Group_Count")
titanic_feature_count.sort_values('Group_Count', ascending=False).head(10)

# Create the title field. Names look like "Surname, Title. Given names", so
# the title is the first token after the comma. Taking the second
# whitespace-separated token of the whole name picks up part of the surname
# when the surname has several words (e.g. "del Carlo, Mr. ...").
titanic_df['title'] = [(name.split(',', 1)[-1].split() or ['Unknown'])[0]
                       for name in titanic_df['name'].values]
titanic_df['title'].value_counts()

# keep the common titles and bucket everything else as 'Unknown'
known_titles = ['Mr.', 'Miss.', 'Mrs.', 'Master.', 'Dr.', 'Rev.']
titanic_df['title'] = [title if title in known_titles else 'Unknown'
                       for title in titanic_df['title'].values]
titanic_df['title'].head()

# create isfemale field and use numerical values
titanic_df['isfemale'] = np.where(titanic_df['sex'] == 'female', 1, 0)

# drop features not needed for model
titanic_df = titanic_df.drop(columns=['sex', 'name', 'boat', 'body', 'ticket', 'home.dest'],
                             errors='ignore')

# make pclass an actual categorical column (anything other than 1 or 2 is 'Third')
titanic_df['pclass'] = np.where(titanic_df['pclass'] == 1, 'First',
                                np.where(titanic_df['pclass'] == 2, 'Second', 'Third'))

# missing port of embarkation becomes its own category
titanic_df['embarked'] = titanic_df['embarked'].fillna('Unknown')
titanic_df.head()
Making Dummy Fields
# one-hot encode the cabin letter to illustrate what dummy fields look like
cabin_dummies = pd.get_dummies(titanic_df['cabin'], columns=['cabin'], drop_first=False)
cabin_dummies.head(10)
Modeling with Logistic Regression
# baseline: overall survival rate (in percent) to compare predictions against
average_survival_rate = titanic_df['survived'].mean() * 100
print('Average probability of surviving Titanic trip: %.2f percent' % average_survival_rate)
def prepare_data_for_model(raw_dataframe, target_columns, drop_first=True, make_na_col=True):
    """One-hot encode the categorical columns of a data frame.

    Args:
        raw_dataframe: pandas DataFrame holding the raw features.
        target_columns: names of the columns to replace with dummy columns.
        drop_first: when True, drop the first level of each encoded column.
        make_na_col: when True, add a ``<column>_nan`` indicator per encoded
            column.

    Returns:
        A new DataFrame in which every column in ``target_columns`` has been
        replaced by 0/1 dummy columns; other columns are passed through.
    """
    # The dtype is pinned to uint8: pandas >= 2.0 defaults to bool dummies,
    # which are written to CSV as True/False and cannot be read back as
    # numbers by numpy.genfromtxt.
    return pd.get_dummies(raw_dataframe,
                          columns=target_columns,
                          drop_first=drop_first,
                          dummy_na=make_na_col,
                          dtype=np.uint8)
# dummy-encode the categorical fields, then discard rows that still contain NaN
categorical_columns = ['pclass', 'cabin', 'embarked', 'title']
titanic_ready_df = prepare_data_for_model(titanic_df, target_columns=categorical_columns).dropna()

# column names of the model-ready frame
list(titanic_ready_df)
# hold out half of the rows for testing, fit a logistic regression on the rest
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# every column except the outcome is a feature
features = [column for column in titanic_ready_df.columns if column != 'survived']
X_train, X_test, y_train, y_test = train_test_split(
    titanic_ready_df[features],
    titanic_ready_df[['survived']],
    test_size=0.5,
    random_state=42)
X_train.head(3)
print(y_train.head(3))

lr_model = LogisticRegression()
# y_train is a one-column frame; flatten it to the 1-D target fit() expects
lr_model.fit(X_train, y_train.values.ravel())

# predict on the held-out half and report mean accuracy
y_pred = lr_model.predict(X_test)
test_accuracy = lr_model.score(X_test, y_test) * 100
print('Accuracy of logistic regression classifier on test set: {:.2f}%'.format(test_accuracy))
Interpretation of Model’s Coefficients
# pair every feature with its fitted coefficient and rank from most positive
# to most negative
coefs = pd.DataFrame({'Feature': features, 'Coef': lr_model.coef_[0]})
ranked_coefs = coefs.sort_values('Coef', ascending=False)

print('Positive Features')
ranked_coefs.head(7)

print('Negative Features')
ranked_coefs.tail(7)
Recursive Feature Elimination (RFE) method
from sklearn.feature_selection import RFE

# Eliminate features one at a time until a single feature is left, so every
# feature receives a distinct rank. n_features_to_select is passed by keyword:
# current scikit-learn releases no longer accept it positionally.
rfe = RFE(lr_model, n_features_to_select=1)
rfe = rfe.fit(X_train, np.ravel(y_train))

# get feature rank in order from elimination process (rank 1 = kept longest)
feature_rank = pd.DataFrame({'Feature': list(X_train), 'Rank': rfe.ranking_})
feature_rank = feature_rank.sort_values('Rank')
feature_rank.head()
Predict using Fictional Passenger
# Create your own passenger - ranges and options available:
#   pclass    'First', 'Second', 'Third'
#   is_female 0 or 1
#   age       0-100
#   sibsp     0-8
#   parch     0-9
#   fare      0-500
#   cabin     'A', 'B', 'C', 'D', 'E', 'F', 'G', 'T', 'Unknown'
#   embarked  'C' (Cherbourg), 'Q' (Queenstown), 'S' (Southampton)
#   title     'Mr.', 'Miss.', 'Mrs.', 'Master.', 'Dr.', 'Rev.', 'Unknown'
x_predict_pclass = 'Third'
x_predict_is_female = 1
x_predict_age = 10
x_predict_sibsp = 3
x_predict_parch = 0
x_predict_fare = 200
x_predict_cabin = 'A'
x_predict_embarked = 'Q'
x_predict_title = 'Mrs.'

# Create your own passenger - a second profile; these values replace the ones above
x_predict_pclass = 'Third'
x_predict_is_female = 0
x_predict_age = 50
x_predict_sibsp = 3
x_predict_parch = 0
x_predict_fare = 200
x_predict_cabin = 'A'
x_predict_embarked = 'Q'
x_predict_title = 'Mr.'
# Encode the fictional passenger together with a copy of the original data
# set, so the dummy categories created are the same as those seen in the
# original data.
passenger_columns = ['pclass', 'age', 'sibsp', 'parch', 'fare', 'cabin',
                     'embarked', 'title', 'isfemale', 'survived']
titanic_df_tmp = titanic_df.copy()
titanic_df_tmp = titanic_df_tmp[passenger_columns]

# single-row frame for the new passenger; 'survived' is only a placeholder
fictional_values = [x_predict_pclass,
                    x_predict_age,
                    x_predict_sibsp,
                    x_predict_parch,
                    x_predict_fare,
                    x_predict_cabin,
                    x_predict_embarked,
                    x_predict_title,
                    x_predict_is_female,
                    0]
titanic_fictional_df = pd.DataFrame([fictional_values], columns=passenger_columns)

# the fictional passenger goes first, so it ends up as row 0 after encoding
titanic_df_tmp = pd.concat([titanic_fictional_df, titanic_df_tmp], ignore_index=True)
titanic_df_tmp = prepare_data_for_model(titanic_df_tmp,
                                        target_columns=['pclass', 'cabin', 'embarked', 'title'])

# score only the first row (the fictional passenger)
Y_pred = lr_model.predict_proba(titanic_df_tmp[features].head(1))
probability_of_surviving_fictional_character = Y_pred[0][1] * 100
print('Probability of surviving Titanic voyage: %.2f percent' % probability_of_surviving_fictional_character)
# bar chart: overall survival rate next to the fictional traveler's probability
objects = ('Average Survival Rate', 'Fictional Traveler')
colors = ['gray', 'blue']
performance = [average_survival_rate, probability_of_surviving_fictional_character]
y_pos = np.arange(len(objects))

fig = plt.figure()
ax = fig.add_subplot(111)
ax.bar(y_pos, performance, align='center', color=colors, alpha=0.5)
ax.set_xticks(y_pos)
ax.set_xticklabels(objects)
# red reference line at the average survival rate
ax.axhline(average_survival_rate, color="r")
ax.set_ylim([0, 100])
ax.set_ylabel('Survival Probability')
ax.set_title('How Did Your Fictional Traveler Do? \n ' + str(round(probability_of_surviving_fictional_character,2)) + '% Chance of Surviving!')
plt.show()
Preparing data set for the cloud and making it pandas free!
# Rebuild the model-ready data from scratch and save it as a plain numeric CSV
# so the web application can load it without pandas.
titanic_df = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.csv')

# Get title. Names look like "Surname, Title. Given names", so the title is
# the first token after the comma; taking the second token of the whole name
# picks up part of the surname when the surname has several words.
titanic_df['title'] = [(name.split(',', 1)[-1].split() or ['Unknown'])[0]
                       for name in titanic_df['name'].values]
known_titles = ['Mr.', 'Miss.', 'Mrs.', 'Master.', 'Dr.', 'Rev.']
titanic_df['title'] = [title if title in known_titles else 'Unknown'
                       for title in titanic_df['title'].values]

# Keep only the first character of the cabin number; a missing cabin becomes
# 'Unknown'. fillna() replaces the np.NaN alias, which NumPy 2.0 removed.
titanic_df['cabin'] = titanic_df['cabin'].fillna('U').str[0].replace('U', 'Unknown')

titanic_df['isfemale'] = np.where(titanic_df['sex'] == 'female', 1, 0)

# drop features not needed for model
titanic_df = titanic_df.drop(columns=['sex', 'name', 'boat', 'body', 'ticket', 'home.dest'],
                             errors='ignore')

# make pclass an actual categorical column (anything other than 1 or 2 is 'Third')
titanic_df['pclass'] = np.where(titanic_df['pclass'] == 1, 'First',
                                np.where(titanic_df['pclass'] == 2, 'Second', 'Third'))

# get average survival rate (in percent)
average_survival_rate = np.mean(titanic_df['survived']) * 100

titanic_df['embarked'] = titanic_df['embarked'].fillna('Unknown')

# prepare training data
titanic_ready_df = prepare_data_for_model(titanic_df, target_columns=['pclass', 'cabin', 'embarked', 'title'])
titanic_ready_df = titanic_ready_df.dropna()

# Save dataframe without header or index. Everything is cast to float first so
# that boolean dummy columns (the pandas >= 2.0 default) are written as 1.0/0.0
# rather than True/False, which numpy.genfromtxt would read back as NaN.
titanic_ready_df.astype(float).to_csv('titanic3.csv', header=False, index=False)
# Check that the model works when fed a plain NumPy array instead of a
# data frame.
from numpy import genfromtxt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# load the data as an array instead of a data frame
titanic_array = genfromtxt('titanic3.csv', delimiter=',')

# Column 0 holds the outcome and the remaining columns are the features.
# Slicing the array replaces the per-row list comprehensions.
survived = titanic_array[:, 0]
passenger_features = titanic_array[:, 1:]

# split data into train and test portions
X_train, X_test, y_train, y_test = train_test_split(passenger_features,
                                                    survived,
                                                    test_size=0.5,
                                                    random_state=42)

# fit model only once (the duplicated split/fit pass has been removed)
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

# Check that we can get the average survival rate without pandas. It is kept
# in percent, like every other use of average_survival_rate in this notebook.
average_survival_rate = np.mean(survived) * 100
average_survival_rate
Let's build our web application!
Building a Flask application is super easy. Make sure you have Flask installed on your machine along with scikit-learn. You will need to recreate the following file structure (see the video for more details).
# expected layout of the web application folder (bare string, not used at runtime)
'''
web-application/
├── main.py
├── titanic3.csv
├── templates/
└── index.html
├── static/
└── images/
├── small_titanic.png
└── small_cabin_location.png
'''
# preview the two images the page uses
from IPython.display import Image
Image(filename='small_cabin_location.png')
Image(filename='small_titanic.png')
main.py
#!/usr/bin/env python
from flask import Flask, render_template, flash, request, jsonify, Markup
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import io, base64, os
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
# Default traveler: the values passed to the template when the page is
# requested with GET.
DEFAULT_CLASS = 'Second'
DEFAULT_CABIN = 'C'
DEFAULT_EMBARKED = 'Southampton'
DEFAULT_FARE = 33
DEFAULT_GENDER = 'Female'
DEFAULT_TITLE = 'Mrs.'
DEFAULT_AGE = 30
DEFAULT_SIBSP = 0
DEFAULT_PARCH = 0

# Module-level state filled in by startup(): the overall survival rate (in
# percent) and the logistic regression model that scores submitted passengers.
average_survival_rate = 0
lr_model = LogisticRegression()

app = Flask(__name__)
def startup():
    """Load the prepared Titanic data and fit the shared model.

    Reads 'titanic3.csv' (no header; column 0 is the outcome, the remaining
    columns are the features), stores the overall survival rate in percent in
    ``average_survival_rate`` and fits ``lr_model`` on half of the rows.
    """
    global average_survival_rate, lr_model
    titanic_array = np.genfromtxt('titanic3.csv', delimiter=',')
    survived = titanic_array[:, 0]
    passenger_features = titanic_array[:, 1:]
    average_survival_rate = np.mean(survived) * 100
    X_train, X_test, y_train, y_test = train_test_split(passenger_features, survived,
                                                        test_size=0.5, random_state=42)
    # fit model only once
    lr_model.fit(X_train, y_train)


# Flask 2.3 removed ``before_first_request``. Keep the lazy hook where it still
# exists; otherwise train eagerly at import time so the model is ready before
# the first request is served.
if hasattr(app, 'before_first_request'):
    app.before_first_request(startup)
else:
    startup()
# Names of the form fields; each is also passed back to the template under
# the same name.
_FORM_FIELDS = ('selected_embarked', 'selected_fare', 'selected_age',
                'selected_gender', 'selected_title', 'selected_class',
                'selected_cabin', 'selected_sibsp', 'selected_parch')

# Levels that have a dummy column, listed in the feature order the model is
# fed. The level missing from each tuple ('First', cabin 'A', 'C' for
# Cherbourg, 'Dr.') has no column of its own and is encoded as all zeros.
_PCLASS_LEVELS = ('Second', 'Third')
_CABIN_LEVELS = ('B', 'C', 'D', 'E', 'F', 'G', 'T', 'Unknown')
_EMBARKED_LEVELS = ('Q', 'S', 'Unknown')
_TITLE_LEVELS = ('Master.', 'Miss.', 'Mr.', 'Mrs.', 'Rev.', 'Unknown')


def _one_hot(selected, levels):
    """Return one 0/1 flag per level, followed by a 0 for the ``*_nan`` column."""
    return [int(selected == level) for level in levels] + [0]


def _or_unknown(selected):
    """Map a blank selection (or the 'Unknown.' spelling) to 'Unknown'."""
    return 'Unknown' if selected in ('', 'Unknown.') else selected


def _encode_passenger(embarked, fare, age, gender, title, pclass, cabin, sibsp, parch):
    """Turn raw form values into the single-row feature matrix for the model.

    Raises:
        ValueError: if age, fare, sibsp or parch is not numeric.
    """
    # ports are encoded by their initial ('Queenstown' -> 'Q')
    port = _or_unknown(embarked)
    if port != 'Unknown':
        port = port[0]
    # fare is parsed as float so decimal fares are accepted
    numeric = [int(age), int(sibsp), int(parch), float(fare),
               1 if gender == 'Female' else 0]
    # order: age, sibsp, parch, fare, isfemale, pclass_*, cabin_*, embarked_*, title_*
    return [numeric
            + _one_hot(pclass, _PCLASS_LEVELS)
            + _one_hot(_or_unknown(cabin), _CABIN_LEVELS)
            + _one_hot(port, _EMBARKED_LEVELS)
            + _one_hot(_or_unknown(title), _TITLE_LEVELS)]


def _render_survival_plot(probability):
    """Return a base64-encoded PNG comparing ``probability`` with the average rate."""
    objects = ('Average Survival Rate', 'Fictional Traveler')
    y_pos = np.arange(len(objects))
    fig = plt.figure()
    try:
        ax = fig.add_subplot(111)
        ax.bar(y_pos, [average_survival_rate, probability],
               align='center', color=['gray', 'blue'], alpha=0.5)
        ax.set_xticks(y_pos)
        ax.set_xticklabels(objects)
        ax.axhline(average_survival_rate, color="r")
        ax.set_ylim([0, 100])
        ax.set_ylabel('Survival Probability')
        ax.set_title('How Did Your Fictional Traveler Do? \n '
                     + str(round(probability, 2)) + '% Chance of Surviving!')
        img = io.BytesIO()
        fig.savefig(img, format='png')
    finally:
        # pyplot keeps every figure alive until it is closed; without this the
        # server would accumulate one figure per request
        plt.close(fig)
    return base64.b64encode(img.getvalue()).decode()


@app.route("/", methods=['POST', 'GET'])
def submit_new_profile():
    """Show the passenger form and, on POST, the predicted survival chart.

    GET renders the form with the default traveler. POST encodes the submitted
    passenger, scores it with ``lr_model`` and re-renders the form with the
    submitted values plus an inline chart. If a numeric field cannot be
    parsed, the form is re-rendered with a message in ``model_results``
    instead of a chart.
    """
    if request.method != 'POST':
        # set default passenger settings
        return render_template('index.html',
                               model_results='',
                               model_plot='',
                               selected_embarked=DEFAULT_EMBARKED,
                               selected_fare=DEFAULT_FARE,
                               selected_age=DEFAULT_AGE,
                               selected_gender=DEFAULT_GENDER,
                               selected_title=DEFAULT_TITLE,
                               selected_class=DEFAULT_CLASS,
                               selected_cabin=DEFAULT_CABIN,
                               selected_sibsp=DEFAULT_SIBSP,
                               selected_parch=DEFAULT_PARCH)

    selected = {field: request.form[field] for field in _FORM_FIELDS}
    model_results = ''
    model_plot = ''
    try:
        # build new array in same format as modeled data so we can feed it
        # right into the predictor
        user_designed_passenger = _encode_passenger(
            embarked=selected['selected_embarked'],
            fare=selected['selected_fare'],
            age=selected['selected_age'],
            gender=selected['selected_gender'],
            title=selected['selected_title'],
            pclass=selected['selected_class'],
            cabin=selected['selected_cabin'],
            sibsp=selected['selected_sibsp'],
            parch=selected['selected_parch'])
    except ValueError:
        model_results = ('Please enter numeric values for age, fare, '
                         'siblings/spouses and parents/children.')
    else:
        Y_pred = lr_model.predict_proba(user_designed_passenger)
        probability_of_surviving_fictional_character = Y_pred[0][1] * 100
        plot_url = _render_survival_plot(probability_of_surviving_fictional_character)
        # NOTE(review): Markup is imported from flask at the top of this file;
        # newer Flask releases expect it to come from markupsafe - confirm
        # against the Flask version in use.
        model_plot = Markup('<img src="data:image/png;base64,{}">'.format(plot_url))

    return render_template('index.html',
                           model_results=model_results,
                           model_plot=model_plot,
                           **selected)
# start Flask's built-in server only when this file is executed directly
if __name__ == '__main__':
    app.run(debug=False)
index.html
<!DOCTYPE html>
<!--
  Passenger design form for the Titanic survival model. Rendered by Flask with
  the selected_* values, model_results and model_plot; selected_* preselect the
  form controls and model_plot carries the inline chart after a submission.
-->
<html>
<head>
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <meta charset="UTF-8">
    <title>Titanic Journey Simulation</title>
    <script src="//ajax.googleapis.com/ajax/libs/jquery/1.9.1/jquery.min.js"></script>
    <link rel="stylesheet" href="//netdna.bootstrapcdn.com/bootstrap/3.0.3/css/bootstrap-theme.min.css">
    <link rel="stylesheet" href="//netdna.bootstrapcdn.com/bootstrap/3.0.3/css/bootstrap.min.css">
    <script src="//netdna.bootstrapcdn.com/bootstrap/3.0.3/js/bootstrap.min.js"></script>
</head>
<body>
<div class="container">
    <form id='submit_params' method="POST" action="">
        <div class="form-group">
            <table class="table">
                <tr>
                    <td width="300px"><h1 style="text-align:center">Chapter 3<BR>Design Your Titanic Traveler</h1></td>
                    <td><p style="text-align:center">
                        <img src="static/images/small_titanic.png" alt="titanic" style='padding:1px; border:1px solid #021a40; width: 80%; height: 80%'>
                        </p>
                    </td>
                    <td width="200px"><h3 style="text-align:center">Cabin and Port of Embarcation Details</h3></td>
                </tr>
                <tr>
                    <td>
                        <table class="table" border=0 >
                            <tr>
                                <td><label for="selected_embarked">Port of Embarcation</label></td>
                                <td>
                                    <SELECT class="selectpicker" id="selected_embarked" name="selected_embarked">
                                        <option value="" selected></option>
                                        {% for port in ['Cherbourg', 'Queenstown', 'Southampton'] %}
                                        <option value="{{ port }}"{% if port == selected_embarked %} selected{% endif %}>{{ port }}</option>
                                        {% endfor %}
                                    </SELECT>
                                </td>
                            </tr>
                            <tr>
                                <td><label for="fare">Fare</label></td>
                                <td><input type="number" class="form-control" id="fare"
                                           placeholder="Enter fare" name="selected_fare" value="{{ selected_fare }}">
                                </td>
                            </tr>
                            <tr>
                                <td><label for="selected_age">Age</label></td>
                                <td>
                                    <SELECT class="selectpicker" id="selected_age" name="selected_age">
                                        <option value="" selected></option>
                                        {% for age in range(0, 101) %}
                                        <option value="{{ age }}"{% if age|string == selected_age|string %} selected{% endif %}>{{ age }}</option>
                                        {% endfor %}
                                    </SELECT>
                                </td>
                            </tr>
                            <tr>
                                <td><label for="selected_gender">Gender</label></td>
                                <td>
                                    <SELECT class="selectpicker" id="selected_gender" name="selected_gender">
                                        <option value="" selected></option>
                                        {% for gender in ['Male', 'Female'] %}
                                        <option value="{{ gender }}"{% if gender == selected_gender %} selected{% endif %}>{{ gender }}</option>
                                        {% endfor %}
                                    </SELECT>
                                </td>
                            </tr>
                            <tr>
                                <td><label for="selected_title">Title</label></td>
                                <td>
                                    <SELECT class="selectpicker" id="selected_title" name="selected_title">
                                        <option value="" selected></option>
                                        {% for title in ['Dr.', 'Master.', 'Miss.', 'Mr.', 'Mrs.', 'Rev.', 'Unknown'] %}
                                        <option value="{{ title }}"{% if title == selected_title %} selected{% endif %}>{{ title }}</option>
                                        {% endfor %}
                                    </SELECT>
                                </td>
                            </tr>
                            <tr>
                                <td><label for="selected_class">Class</label></td>
                                <td>
                                    <SELECT class="selectpicker" id="selected_class" name="selected_class">
                                        <option value="" selected></option>
                                        {% for pclass in ['First', 'Second', 'Third'] %}
                                        <option value="{{ pclass }}"{% if pclass == selected_class %} selected{% endif %}>{{ pclass }}</option>
                                        {% endfor %}
                                    </SELECT>
                                </td>
                            </tr>
                            <tr>
                                <td><label for="selected_cabin">Cabin</label></td>
                                <td>
                                    <SELECT class="selectpicker" id="selected_cabin" name="selected_cabin">
                                        <option value="" selected></option>
                                        {% for cabin in ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T', 'Unknown'] %}
                                        <option value="{{ cabin }}"{% if cabin == selected_cabin %} selected{% endif %}>{{ cabin }}</option>
                                        {% endfor %}
                                    </SELECT>
                                </td>
                            </tr>
                            <tr>
                                <td><label for="selected_sibsp">Number of Siblings/Spouses</label></td>
                                <td>
                                    <SELECT class="selectpicker" id="selected_sibsp" name="selected_sibsp">
                                        <option value="" selected></option>
                                        {% for count in range(0, 9) %}
                                        <option value="{{ count }}"{% if count|string == selected_sibsp|string %} selected{% endif %}>{{ count }}</option>
                                        {% endfor %}
                                    </SELECT>
                                </td>
                            </tr>
                            <tr>
                                <td><label for="selected_parch">Number of Parents/Children</label></td>
                                <td>
                                    <SELECT class="selectpicker" id="selected_parch" name="selected_parch">
                                        <option value="" selected></option>
                                        {% for count in range(0, 10) %}
                                        <option value="{{ count }}"{% if count|string == selected_parch|string %} selected{% endif %}>{{ count }}</option>
                                        {% endfor %}
                                    </SELECT>
                                </td>
                            </tr>
                        </table>
                    </td>
                    <td><BR>
                        <!-- prediction output: optional message and the inline chart -->
                        <p style="text-align:center">{{ model_results }}</p>
                        <p style="text-align:center">{{ model_plot }}</p>
                    </td>
                    <td>
                        <p style="text-align:center">Cabin Key</p>
                        <p style="text-align:center">
                        <img src="static/images/small_cabin_location.png" alt="cabin locations" style='padding:1px; border:1px solid #021a40; width: 100%; height: 100%'>
                        </p></td>
                </tr>
                <tr>
                    <td><button class="btn btn-default" type="submit" style="width: 100px; height: 30px;">All Aboard!</button></td><td></td><td></td>
                </tr>
            </table>
        </div>
    </form>
</div>
</body>
</html>
Manuel Amunategui - Follow me on Twitter: @amunategui