#!/usr/bin/env python
# coding: utf-8

# # Predicting Survival of Passengers on the RMS Titanic
# 
# In this notebook various data from the passengers aboard the RMS is loaded. This data is then analyzed for significant correlation with surviving the disaster. Ultimately, models are produced to predict the survivorship of passengers based on details of their passage.
# 
# Let's import the libraries we'll need.

# In[2]:


import pandas as pd
import numpy as np
import os
from scipy import stats
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn import neighbors
from sklearn import preprocessing

import matplotlib.pyplot as plt
import matplotlib
get_ipython().run_line_magic('matplotlib', 'inline')

matplotlib.rcParams.update({'font.size': 16})


# The data is imported and a several rows are printed.

# In[3]:


df = pd.read_csv('train.csv')
print(df.shape)
no_passengers = df.shape[0]
print(df.dtypes)
df[0:10]


# Three columns have NaN. Let's address the port of Embarkation as well as age.

# In[4]:


len(df[df['Cabin'].isnull()])/len(df)


# # MISSING DATA

# ## Port of Embarkation

# We inspect the Embarked column for missing values.

# In[5]:


df[df['Embarked'].isnull()]


# Two values are found. Let's build a model based on everyone else on board and see if we can guess where Miss Icard and Mrs. Stone got onboard. We assume that the passenger's fare, class, and where they embarked are related. Let's define the variables.

# In[6]:


df2 = df.loc[:,['Pclass','Fare','Embarked']]
df2.dropna(inplace=True) #Dropping the target of our model

# Prepare X
X = np.array(df2.loc[:,['Pclass','Fare']]) # Casting to an np.arr
scaler = preprocessing.Normalizer().fit(X) 
X = scaler.transform(X)  # Scaling values

# Prepare y
ports = ['C','Q','S']
emb_dum = df2['Embarked'].replace(ports,[0,1,2]) # Transforming categorical data into numeric
y = np.array(emb_dum) # Casting to an np.arr

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.15)


# A classifier clf is defined and the data fit to it.

# In[7]:


fig = plt.figure(figsize=(25,20))
clf = tree.DecisionTreeClassifier()
clf.fit(X_train, y_train) 


# In[8]:


f1_score(y_test,clf.predict(X_test),average='weighted')


# In[9]:


plt.figure(figsize=(8,4))
c_mat = confusion_matrix(y_test,clf.predict(X_test))
disp = ConfusionMatrixDisplay(confusion_matrix = c_mat, display_labels=ports)
disp.plot(values_format='d')
plt.savefig('./embarked_dt.png',bbox_inches='tight',dpi=300)


# In[10]:


dot_data = tree.export_graphviz(clf, out_file='embarked_dt.dot', 
                     feature_names=['Pclass','Fare'],
                    filled=True, rounded=True,  
                     special_characters=True)
graph = pgv.AGraph('./embarked_dt.dot')  
#graph.draw()


# Predicting the port of our passengers:

# In[11]:


x = scaler.transform(np.array([1,80]).reshape(1,-1)) # Both passengers paid 80 pounds for a first class ticket
print('[0,1,2]')
print(ports)
print(clf.predict(x))


# The model calculates the port of embarkation for both passengers to be Southampton. Southampton is imputed.

# In[12]:


df['Embarked'].fillna('S', inplace=True)


# Let's check to see if we missed anyone

# In[13]:


df[df['Embarked'].isnull()]


# We did not miss anyone. Let's now look towards age.

# ## Age

# In[14]:


df[df['Age'].isnull()]


# Oh dear. There is a significant number of passengers whose age is unknown. Again we will build a model with the objective of calculating the age of passengers we don't know. This time a k-Neighbors regressor is used and fit to the passengers we have data for. First let's note the indicies of the passengers we will be targeting.

# In[15]:


nullage = df[df['Age'].isnull()].index.tolist()
nullage[0:5]


# In[17]:


matplotlib.rcParams.update({'font.size': 20})

df2 = df.loc[:,['Pclass','Sex','SibSp','Parch','Fare','Embarked','Age']] # These are our variables of interest

# Quick and dirty: Categorical variables are replaced with an integer
df2['Sex'].replace(['female','male'],[0,1],inplace=True) 
df2['Embarked'].replace(['C','Q','S'],[0,1,2],inplace=True)

# A copy of out model inputs is made for graphing later
df3 = df2.copy()

# Remove NaNs and cast to np.arr
df2.dropna(inplace=True)
data = np.array(df2)

# Defining model inputs
X = data[:,:6]
y = data[:,6]

list_knn = []

its=1000
neighbor_range = range(1,20)
index = 0
for i in neighbor_range:
    RMSE_LIST= []
    for j in range(0,its):
    
        X_train, X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)
        # Defining a classifier. 
        n_neighbors = i
        knn = neighbors.KNeighborsRegressor(n_neighbors,weights='distance',algorithm='brute',n_jobs=-1)
        knn.fit(X_train,y_train)

        E = np.subtract(knn.predict(X_test),y_test)
        SE = np.power(E,2)
        MSE = np.sum(SE/len(y_test))
        RMSE = np.sqrt(MSE)
        RMSE_LIST.append(RMSE)
        os.sys.stdout.write(f'{index}\r')
        index = index + 1
    
    list_knn.append(np.array(RMSE_LIST).mean())
print('\nDONE')
plt.figure(figsize=(10,7))
plt.plot(neighbor_range,list_knn)
plt.title('n Selection for Age Model')
plt.ylabel('RMSE (Years)')
plt.xlabel('n Neighbors')
plt.xticks(neighbor_range)
#plt.savefig('./age_rmse.png',bbox_inches='tight',dpi=300)
plt.show()


# In[19]:


df2 = df.loc[:,['Pclass','Sex','SibSp','Parch','Fare','Embarked','Age']] # These are our variables of interest

# Quick and dirty: Categorical variables are replaced with an integer
df2['Sex'].replace(['female','male'],[0,1],inplace=True) 
df2['Embarked'].replace(['C','Q','S'],[0,1,2],inplace=True)

# A copy of out model inputs is made for graphing later
df3 = df2.copy()

# Remove NaNs and cast to np.arr
df2.dropna(inplace=True)
data = np.array(df2)

# Defining model inputs
X = data[:,:6]
y = data[:,6]

# The inputs have many different scales.
# They are normalized.
scaler = preprocessing.Normalizer().fit(X)
T = np.array(df3[df3['Age'].isnull()])
T = scaler.transform(T[:,:6])
X = scaler.transform(X)
model_RMSE = []
for i in range (0,1000):
    X_train, X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

    # Defining a classifier. 
    n_neighbors = 4
    knn = neighbors.KNeighborsRegressor(n_neighbors,weights='distance',algorithm='brute',n_jobs=-1)
    knn.fit(X_train,y_train)

    E = np.subtract(knn.predict(X_test),y_test)
    SE = np.power(E,2)
    MSE = np.sum(SE)/len(y_test)
    RMSE = np.sqrt(MSE)
    model_RMSE.append(RMSE)
print(f'Model RMSE: {RMSE:.1f}')

# Plot the known ages with the modeled ages for comparison of shape.
plt.figure(figsize=(8,6))
plt.hist(df['Age'],color='C0',bins=np.arange(0,80,10))
plt.hist(knn.predict(X_test),color='C1',bins=np.arange(0,80,10))
plt.legend(['Known Ages','Estimated Ages'])
plt.title('Shape Comparison')
plt.ylabel('Frequency')
plt.xlabel('Age')
#plt.savefig('./agemodeldist.png', dpi=300)
# Replace unknown ages with modeled ones.
df.iloc[nullage,5] = knn.predict(T)

plt.show()


# In[20]:


bins = np.linspace(-3,3,13)
plt.figure(figsize=(8,6))
#plt.hist(stats.zscore(model_RMSE),bins=bins)
plt.hist(model_RMSE)
plt.title('Model Accuracy')
plt.ylabel('Frequency\n(n=1000)')
plt.xlabel('RMSE (years)')
stats.describe(model_RMSE)
plt.savefig('agermsefreq.png',bbox_inches='tight',dpi=300)
print(np.mean(model_RMSE))
print(np.std(model_RMSE))


# The shapes of these distributions are visually consistent. The RMSE of the fit is 13 years or about half a human generation. There is no immediate reason to reject these results.

# In[21]:


df[df['Age'].isnull()]


# Every passenger now has an age and a port of Embarkation. Moving on, we begin to build our model of who survived.
# 
# An empty list is defined. As I move through each variable, those that have significant correlation with the Survived variable will be appended to the list.

# In[22]:


model = [] 


# # BY SEX
# 
# An exploration is made into the correlation and significance of sex on surviving the shipwreck.

# In[23]:


sexes = ['male','female']
bysex = np.zeros((2,2))
for i in range(0,len(sexes)):
    bysex[i] = df[df['Sex'] == sexes[i]]['Survived'].value_counts().sort_index()
bysex


# The first row of this array are all male. The second row is all female. The first column represents the number of passengers who did not survive. The second column represents the number of passengers who did survive. Looks like males are less likely to survive than females. Let's check rates

# In[24]:


males_rate = bysex[0][1]/np.sum(bysex[0])
females_rate = bysex[1][1]/np.sum(bysex[1])
print('Survival rate of:')
print('\tmales:\t\t',f'{males_rate:.3f}')
print('\tfemales:\t',f'{females_rate:.3f}')


# Indeed, there is quite a contrast between the two rates. Let's visualize survivorship.

# In[25]:


labels = ['No Survive', 'Survive']
male = bysex[0]
female = bysex[1]

x = np.arange(len(labels))  # the label locations
width = 0.35  # the width of the bars

fig, ax = plt.subplots(figsize=(10,8))
rects1 = ax.bar(x - width/2, male, width, label='Men')
rects2 = ax.bar(x + width/2, female, width, label='Women')

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('Passengers')
ax.set_title('Survival by Sex')
ax.set_xticks(x)
ax.set_ylim(0,500)
ax.set_xticklabels(labels)
ax.legend()

def autolabel(rects):
    """Attach a text label above each bar in *rects*, displaying its height."""
    for rect in rects:
        height = rect.get_height()
        ax.annotate('{}'.format(height),
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(0, 3),  # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom')

autolabel(rects1)
autolabel(rects2)

plt.show()


# There are more males on board than there are females. Let's do an apples to apples comparison and visualize rate.

# In[26]:


labels = ['No Survive', 'Survive']
male = np.round(bysex[0]/np.sum(bysex[0]),2)
female = np.round(bysex[1]/np.sum(bysex[1]),2)

x = np.arange(len(labels))  # the label locations
width = 0.35  # the width of the bars

fig, ax = plt.subplots(figsize=(10,8))
rects1 = ax.bar(x - width/2, male, width, label='Male')
rects2 = ax.bar(x + width/2, female, width, label='Female')

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('Rate')
ax.set_title('Survival by Sex')
ax.set_xticks(x)
ax.set_ylim(0,1)
ax.set_xticklabels(labels)
ax.legend()

autolabel(rects1)
autolabel(rects2)

plt.show()


# Now we calculate Pearson's r for being a male and surviving the disaster for inclusion in the final model.

# In[27]:


df['male'] = df['Sex'] == 'male' # Create a column to indicate whether or not the passenger is male
df['male'].replace([True,False],[1,0],inplace=True) # Converting the results to a numeric value

# Define the input
X = np.array(df['male'])
y = np.array(df['Survived'])

# Calculate correlation
pear = stats.pearsonr(X,y)
print('is_male vs Survived')
print(f'\tCorrelation: {pear[0]:.2f}')
print(f'\tp-val: {pear[1]:.2e}')


# #### Discussion
# There is a statistically significant indirect correlation between being a male and survival. Our model should include whether or not the passenger is male.

# In[28]:


model.append('male')
model


# # BY PORT of EMBARKATION
# 

# An exploration is made into the correlation and significance of port of embarkation on surviving the shipwreck. We define a list of all ports

# In[29]:


ports = df['Embarked'].unique()
ports


# In[30]:


# Now we calculate and print the survival rate of passengers vs where the boarded.
byport = np.zeros((len(ports),2))
byport
for i in range(0,len(ports)):
    byport[i] = df[df['Embarked'] == ports[i]]['Survived'].value_counts().sort_index()
print('Survival rate of:')
for i in range(0,len(ports)):
    print(f'\t{ports[i]}: {byport[i][1]/np.sum(byport[i]):.2f}')


# In[31]:


# Visualizing these rates
labels = ['No Survive', 'Survive']
S = byport[0]
C = byport[1]
Q = byport[2]

x = np.arange(len(labels))  # the label locations
width = 0.2  # the width of the bars

fig, ax = plt.subplots(figsize=(10,8))
rects1 = ax.bar(x - width, S, width, label='Southampton')
rects2 = ax.bar(x , C, width, label='Cherbourg')
rects3 = ax.bar(x + width, Q, width, label='Queenstown')


# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('Passengers')
ax.set_title('Survival by Port')
ax.set_xticks(x)
ax.set_ylim(0,500)
ax.set_xticklabels(labels)
ax.legend()

autolabel(rects1)
autolabel(rects2)
autolabel(rects3)

plt.show()


# In[32]:


# Now we do the same for the rate of surviving at each port.
labels = ['No Survive', 'Survive']
S = np.round(byport[0]/np.sum(byport[0]),2)
C = np.round(byport[1]/np.sum(byport[1]),2)
Q = np.round(byport[2]/np.sum(byport[2]),2)

x = np.arange(len(labels))  # the label locations
width = 0.2  # the width of the bars

fig, ax = plt.subplots(figsize=(10,8))
rects1 = ax.bar(x - width, S, width, label='Southampton')
rects2 = ax.bar(x , C, width, label='Cherbourg')
rects3 = ax.bar(x + width, Q, width, label='Queenstown')


# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('rate')
ax.set_title('Survival by Port')
ax.set_xticks(x)
ax.set_ylim(0,1)
ax.set_xticklabels(labels)
ax.legend()

autolabel(rects1)
autolabel(rects2)
autolabel(rects3)

plt.show()


# These are great pictures, but do they mean anything significant? Now I calculate the Pearson R correlation for the relationship between port and surviving.

# In[33]:


portdum = pd.get_dummies(df['Embarked'])
ports = np.array(portdum.columns)
X = np.array(portdum)
for i in range(0,3):
    if stats.pearsonr(X[:,i],y)[0] < 0.05:
        print(f'Embarked {ports[i]}:')
        print(f'\tcorr {stats.pearsonr(X[:,i],y)[0]:.3e}')
        print(f'\tp-value {stats.pearsonr(X[:,i],y)[1]:.3e}')
        model.append(f'port_{ports[i]}')
        print()


# #### Discussion
# These values suggest that embarking at Cherbourg has a statistically significant direct correlation with surviving. Southampton has a similar indirect correlation. There is no statisically significant correlation between embarking at Queenstown and survival. Our model should include whether the passenger embarked at Cherbourg or Southampton.

# In[34]:


model


# # BY DECK

# Each cabin number is prefixed with a letter which indicates on which deck the passenger's cabin was. The cabin of all passengers is not known so for those without a known cabin(and deck) number we will create a group.

# In[35]:


df.loc[:,['Cabin','Survived']][0:5]


# In[36]:


# Replace NaN with '??' to indicate that the cabin number and deck for this passenger is unknown.
df['Cabin'].replace(np.nan,'??',inplace=True)

# A function to return which deck the passenger was on.
def get_deck(cabin):
    return str(cabin)[0]

df['deck'] = df['Cabin'].apply(get_deck)
decks = df['deck'].unique()
decks.sort()
print(decks)
df.loc[:,['deck','Survived']][0:5]


# Now we have a list of all decks. Let's look into how this variable effects survival.

# In[37]:


# Initialize an array
bydeck = np.zeros((len(decks),2))

# Loop through all decks and calculate number of survivors on each.
for i in range(0,len(decks)):
    if len(df[df['deck'] == decks[i]]['Survived']) > 1: # Did anyone survive on the deck?
        bydeck[i] = df[df['deck'] == decks[i]]['Survived'].value_counts().sort_index()
    elif df[df['deck'] == decks[8]]['Survived'].value_counts().index[0] == 0: # if no one survived, return 0
        bydeck[i] = [df[df['deck'] == decks[8]]['Survived'].value_counts(),0]

print('Survival rate of:')
for i in range(0,len(decks)):
    print(f'{decks[i]} {np.sum(bydeck[i]):.2f}')


# The printout is informative, but not as revealing as a graph. Let's draw some axes.

# In[38]:


labels = ['No Survive', 'Survive']

x = np.arange(0,2)  # the label locations
width = 0.5  # the width of the bars

fig, ax = plt.subplots(figsize=(16,9))

for i in range(1,len(decks)):
    b = ax.bar(x + (i-5)*width/5, bydeck[i], width/5, label=decks[i])
    autolabel(b)

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('Passengers')
ax.set_title('Survival by Deck')
ax.set_xticks(x)
#ax.set_ylim(0,505)
ax.set_xticklabels(labels)
ax.legend()

plt.show()


# In[39]:


# A snippet to generate distinct colors for our bars.
import colorsys
N = 2*len(np.unique(decks))
print('colors: {}'.format(N))
rgb_list = []
HSV_tuples = []
for x in range(1,N+1):
    if (x%2 == 0):
        HSV_tuples.append(((x-1)*1.0/N, 1.0, 0.325))
    else:
        HSV_tuples.append((x*1.0/N, 1.0, 0.75))
RGB_tuples = map(lambda x: colorsys.hsv_to_rgb(*x), HSV_tuples)
for rgb in RGB_tuples:
        color = [int(255*rgb[0]),int(255*rgb[1]),int(255*rgb[2])]
        hxstr = '#'
        for c in color:
            hxstr = hxstr + '{0:0{1}X}'.format(c,2)
        rgb_list.append(hxstr)    


# Let's also visualize survival rate per deck.

# In[40]:


x = np.arange(-2.0,2.0)  # the label locations
width = 5.0  # the width of the bars

fig, ax = plt.subplots(figsize=(16,9))
rects = []
for i in range(0,len(decks)):
    b = plt.barh(i, np.round(bydeck[i,1]/np.sum(bydeck[i]),2), width/5, label=decks[i],color=rgb_list[2*i])
    nb = plt.barh(i, -np.round(bydeck[i,0]/np.sum(bydeck[i]),2), width/5, label=decks[i],color=rgb_list[2*i+1])
    plt.annotate(f'N = {str(int(np.sum(bydeck[i])))}',xy=[-0.20,i-0.1],c='white')
    rects.append(b)
ax.set_xlabel('\nDeath                                                        Survival\nRate')
ax.set_ylabel('Deck')
ax.set_title('Survival by Deck')
ax.set_yticks(range(0,9))
ax.set_xlim(-1.1,1.1)
ax.set_yticklabels(decks)
ax.grid(axis='x')
plt.show()


# It looks like if the deck is known, seems to indicate that the passenger survived. In fact, I would say that if the deck is unknown there is a significant chance the passenger did not survive. To validate my claim, the Pearson r is calculated.

# In[41]:


# Translate the categorical deck values into numeric types
deckdum = pd.get_dummies(df['deck'])
decks = np.array(deckdum.columns)

#Define input (y is still survivors)
X = np.array(deckdum)

# Loop through decks and check for correlation
print('Significance  p < 0.05')
print('==========================')
for i in range(0,len(decks)):
    pr = stats.pearsonr(X[:,i],y)
    significant = ( pr[1] < 0.05)
    if significant: # if a significant correlation is found, do stuff
        print(f'Deck {decks[i]} ')
        print(f'\tcorr {pr[0]:.3f}')
        print(f'\tp-value {pr[1]:.3e}')
        print(f'\tSignificant: {significant}')
        model.append(f'deck_{decks[i]}')
        print('+------------------------+')


# #### Discussion
# There is a statistically significant correlation between staying on Deck B, C, D, E and survival. Conversely, if the deck of the passenger is unknown there is a stronger correlation with not surviving. There is no significant correlation for the other decks A, F, G, & T. The significant decks should be included in our model.

# In[42]:


model


# # BY AGE

# Since we filled in the missing age values, everyone on board now has an age value. Let's visualize the age distribution and investigate different age groups for correlation with surviving the disaster.
# 
# Let's make a histogram.

# In[43]:


# Defining some parameters for our histogram
nbins = 20 # The number of bins in our histogram. More on this later.
ylim = df['Age'].max() # Maximum value for the histogram's x-axis
binwidth= ylim/nbins # The width of each bin

bins = np.arange(0,ylim,binwidth) # An array of evenly spaced bins "binwidth" apart from 0 to 80 years of age.


thedigital = np.digitize(df['Age'],bins,right=True) # assign each age to a bin
thecount = np.bincount(thedigital) # Count the members in each bin

# Create some axes
plt.figure(figsize=(8,6))
plt.bar(bins+binwidth/2,thecount[1:],width=binwidth*0.95)
plt.xticks(np.arange(0,88,8))
plt.title('Passenger Age Distibution')
plt.ylabel('Frequency')
plt.xlabel('Age (years)')
plt.savefig('./agedist.png',bbox_inches='tight',dpi=300)
plt.show()


# Let's calculate some descriptive statistics for discussion.

# In[44]:


desc = stats.describe(df['Age'])

stats_ = ['nobs\t','minmax\t','mean\t','variance','skewness','kurtosis']
for i in range(0,len(stats_)):
    print(f'{stats_[i]}:\t{desc[i]}')

mode = stats.mode(df['Age'])
print(f'\nmode\t:\t{mode[0][0]} ({mode[1][0]})')

median = np.median(np.array(df['Age']))
print(f'median\t:\t{median:.1f}')

stdev= np.std(df['Age'])
print(f'Std.Dev\t:\t{stdev:.1f}')

IQR = stats.iqr(df['Age'])
_25pct = desc[2] - IQR/2
_50pct = desc[2]
_75pct = desc[2] + IQR/2
print(f'\nIQR\t:\t{IQR:.1f}')
print(f'25%\t:\t{_25pct:.1f}')
print(f'50%\t:\t{_50pct:.1f}')
print(f'75%\t:\t{_75pct:.1f}')


# This age distribution is bi-modal and skewed right with an IQR of 16.3. The center of the distribution lies between the mode (19) and the mean (30). 
# 
# Let's now visualize survival by age-group.

# In[45]:


dig_sur = np.digitize(df[df['Survived'] == 1]['Age'],bins,right=True)
freq_sur = np.bincount(dig_sur,minlength=len(bins)+1)

dig_die = np.digitize(df[df['Survived'] == 0]['Age'],bins,right=True)
freq_die = np.bincount(dig_die,minlength=len(bins)+1)

xmax = freq_die.max()+5
xlim = [-xmax,xmax]

plt.figure(figsize=(16,9))
plt.barh(bins+binwidth/2,freq_sur[1:],height=binwidth*0.9)
plt.barh(bins+binwidth/2,-freq_die[1:],height=binwidth*0.9)
plt.xlim(xlim)
plt.yticks(np.arange(0,88,8))
plt.title('Age groups on the RMS Titanic')
plt.ylabel('Age')
plt.xlabel('Lost                                     Saved\nPassengers')
plt.savefig('agegroups.png',bbox_inches='tight',dpi=300)
plt.show()


# There are three groups that seem to have a correlation with surviving or not surviving. First, those between the ages of 0 and 7 seem to survive more often than not. This could be the result that young people take up less room on lifeboats and they're pretty cute. 
# 
# Second, is the group of people age 20-27. Their fate seems to be tragic. This age group appears to correlate with not surviving. It could be that these people are full-grown adults but may not have a family of their own yet and therefore a liability. I suspect that males in this age group have sour odds.
# 
# Finally, the age group 36-44 also seems to correlate with not surviving. Perhaps this group of people has already raised their children who are now grown and independent. Obsolescence.
# 
# Let's check the pearson scores against my interpretation of this graphic.

# In[46]:


# Initialize an array to hold which age group each passenger belongs to.
age_arr = np.zeros((len(df),len(bins)+1))

# Fill the array with the age group indicator for each passenger
for i in range(0,len(df)):
    for j in range(0,len(bins)):
        if (df['Age'][i] > bins[j]) & (df['Age'][i] <= bins[j]+binwidth):
            age_arr[i,j] = 1
            break
            
# Let's have a look 
print(bins.astype(int))
print(age_arr[0:10].astype(int))


# In[47]:


bins = np.arange(0,ylim,binwidth)
X = age_arr
total = 0
for i in range(0,age_arr.shape[1]):
    significant = False
    if stats.pearsonr(X[:,i],y)[1] < 0.05:
        significant = True
        print(f'i = {i}')
        print(f'Age {bins[i]:.1f} : {bins[i]+binwidth:.1f} ')
        print(f'\tcorr {stats.pearsonr(X[:,i],y)[0]:.3f}')
        print(f'\tp-value {stats.pearsonr(X[:,i],y)[1]:.3e}')
        print(f'\tSignificant: {significant}')
        print(f'\tPassengers: {np.sum(X[:,i])}')
        model.append(f'{bins[i]:.0f}_{bins[i]+binwidth:.0f}')
        total = total + np.sum(X[:,i])
        print('+------------------------+')       

print(total)


# #### Discussion
# 
# There are two age groups that have a statistically significant correlation with survival on the RMS Titanic. Those age 0 - 9 are correlated with surviving. Those age 18-27 are correlated with not surviving. There are no other age groups that have a correlation at this confidence level.

# In[48]:


model


# # BY TICKET PREFIX

# Some tickets have a prefix. Maybe they mean something that could have an effect on whether or not the passenger survived. Let's peel off the prefixes and take a look.

# In[49]:


def get_prefix(PD_SERIES):
    splt = PD_SERIES.split(' ')
    tmp = splt
    if len(splt) > 1:      
        tmp = splt[0]
        tmp = tmp.replace('STON/O2','SOTON/O2')
        tmp = tmp.replace('STON/O','SOTON/O2')
        tmp = tmp.replace('O2.','O2')

        tmp = tmp.upper()
    else:
        tmp = ''
    return tmp


# In[50]:


df['Ticket'].apply(get_prefix).value_counts()


# #### Discussion
# 
# This is a waste of time. There aren't enough passengers with any specific prefix to make any meaningful inference about the group.

# # BY TITLE

# Each passenger has a title preceding his or her name. Perhaps the passenger's title is correlated with survival. We did find correlation for sex and age. This has an air of linear dependence, but let's look anyway.

# In[51]:


# Peel off and count titles
df['title'] = df['Name'].apply(lambda x: x.split(',')[1].split('.')[0].replace(' ',''))
df['title'].value_counts()


# In[52]:


# Group up some of the less common titles
miss = ['Ms','Mlle']
mrs = ['Mme', 'Lady']
prestigious = ['Dr','Rev','Major','Col','Sir','theCountess','Capt','Jonkheer','Don']

df['title'] = df['title'].replace(miss,'Miss')
df['title'] = df['title'].replace(mrs,'Mrs')
df['title'] = df['title'].replace(prestigious,'Prestigious')
df['title'].value_counts()


# In[53]:


# Creating an array to hold the titles and whether or not they surived.
titles = df['title'].value_counts().index.tolist()
bytitle = np.zeros((len(titles),2))
# Populate the array
for i in range(0,len(titles)):
    bytitle[i] = df[df['title'] == titles[i]]['Survived'].value_counts().sort_index()

### Visualizing survival vs title

#Define some parameters for the graph
x = np.arange(-2.0,2.0)  # the label locations
width = 5.0  # the width of the bars

# Create some axes
fig, ax = plt.subplots(figsize=(16,9))
rects = []
# PLOT!
for i in range(0,len(titles)):
    b = plt.barh(i, np.round(bytitle[i,1]/np.sum(bytitle[i]),2), width/5, label=titles[i],color=rgb_list[2*i])
    nb = plt.barh(i, -np.round(bytitle[i,0]/np.sum(bytitle[i]),2), width/5, label=titles[i],color=rgb_list[2*i+1])
    rects.append(b)
    plt.annotate(f'N = {str(int(np.sum(bytitle[i])))}',xy=[-0.18,i-0.1],c='white')
ax.set_xlabel('\nNo Survival                                                        Survival\nRate')
ax.set_ylabel('Title')
ax.set_title('Survival by Title')
ax.set_yticks(range(0,len(titles)))
ax.set_xlim(-1.1,1.1)
ax.set_yticklabels(titles)
ax.grid(axis='x')
plt.show()


# It appears that there is no correlation with having a prestigious title and surviving or not. Also it appears that having any title except Mr. is correlated with surviving. Mr is strongly correlated with drowning/hypothermia. Let's calculate the pearson R for confirmation.

# In[54]:


# Prepare an array to hold our data
tit_arr = np.zeros((len(df),len(titles)))

# Populate array with title indicators
for i in range(0,len(df)):
    for j in range(0,len(titles)):
        if df['title'][i] == titles[j]:
            tit_arr[i,j] = 1
            break
tit_arr[0:5]


# In[55]:


# Here I loop through the title array.
# Each title is checked for correlation with the Survived variable.

X = tit_arr
total = 0
for i in range(0,tit_arr.shape[1]):
    significant = False
    if stats.pearsonr(X[:,i],y)[1] < 0.05:
        significant = True
        print(f'i = {i}')
        print(f'Title {titles[i]} ')
        print(f'\tcorr {stats.pearsonr(X[:,i],y)[0]:.2f}')
        print(f'\tp-value {stats.pearsonr(X[:,i],y)[1]:.1e}')
        print(f'\tSignificant: {significant}')
        print(f'\tPassengers: {np.sum(X[:,i])}')
        model.append(f'{titles[i]}')
        total = total + np.sum(X[:,i])
        print('+------------------------+')       

print(total)


# In[56]:


model


# # BY SIBSP

# In[57]:


df['SibSp'].value_counts().sort_index().tolist()


# In[58]:


sbsps = np.array(df['SibSp'].value_counts().sort_index().index)
bysbsp = np.zeros([len(sbsps),2])
for i in range(0,len(sbsps)):
    bysbsp[i] = df[df['SibSp'] == sbsps[i]]['Survived'].value_counts().sort_index()


# In[59]:


print('Survival rate of SibSp')
for i in range(0,len(sbsps)):
    print(f'\t = {sbsps[i]}: {bysbsp[i][1]/np.sum(bysbsp[i]):.2f}')


# In[60]:


ssp_arr = np.zeros((len(df),len(sbsps)))
for i in range(0,len(df)):
    for j in range(0,len(sbsps)):
        if df['SibSp'][i] == sbsps[j]:
            ssp_arr[i,j] = 1
            break
ssp_arr[0:5]


# In[61]:


X = ssp_arr
total = 0
for i in range(0,ssp_arr.shape[1]):
    significant = False
    if stats.pearsonr(X[:,i],y)[1] < 0.05:
        significant = True
        print(f'i = {i}')
        print(f'SibSp: {sbsps[i]} ')
        print(f'\tcorr {stats.pearsonr(X[:,i],y)[0]:.2f}')
        print(f'\tp-value {stats.pearsonr(X[:,i],y)[1]:.1e}')
        print(f'\tSignificant: {significant}')
        print(f'\tPassengers: {np.sum(X[:,i])}')
        model.append(f'sbsp_{sbsps[i]}')
        total = total + np.sum(X[:,i])
        print('+------------------------+')       

print(total)


# In[62]:


model


# # BY PARCH

# In[63]:


parchs = np.array(df['Parch'].value_counts().sort_index().index)
byparch = np.zeros([len(parchs),2])
for i in range(0,len(parchs)):
    byparch[i] = df[df['Parch'] == parchs[i]]['Survived'].value_counts().sort_index()


# In[64]:


print('Survival rate of: Parch')
for i in range(0,len(parchs)):
    print(f'\t = {parchs[i]}: {byparch[i][1]/np.sum(byparch[i]):.2f}')


# In[65]:


pch_arr = np.zeros((len(df),len(parchs)))
for i in range(0,len(df)):
    for j in range(0,len(parchs)):
        if df['Parch'][i] == parchs[j]:
            pch_arr[i,j] = 1
            break
pch_arr[0:5]


# In[66]:


X = pch_arr
total = 0
for i in range(0,pch_arr.shape[1]):
    significant = False
    if stats.pearsonr(X[:,i],y)[1] < 0.05:
        significant = True
        print(f'i = {i}')
        print(f'Parch: {parchs[i]} ')
        print(f'\tcorr {stats.pearsonr(X[:,i],y)[0]:.2f}')
        print(f'\tp-value {stats.pearsonr(X[:,i],y)[1]:.1e}')
        print(f'\tSignificant: {significant}')
        print(f'\tPassengers: {np.sum(X[:,i])}')
        model.append(f'pa_{parchs[i]}')
        total = total + np.sum(X[:,i])
        print('+------------------------+')       

print(total)


# # BY PCLASS

# In[67]:


pcls = np.array(df['Pclass'].value_counts().sort_index().index)
pcls


# In[68]:


pcls_arr = np.zeros((len(df),len(pcls)))
for i in range(0,len(df)):
    for j in range(0,len(pcls)):
        if df['Pclass'][i] == pcls[j]:
            pcls_arr[i,j] = 1
            break
pcls_arr[0:5]


# In[69]:


X = pcls_arr
total = 0
for i in range(0,pcls_arr.shape[1]):
    significant = False
    if stats.pearsonr(X[:,i],y)[1] < 0.05:
        significant = True
        print(f'i = {i}')
        print(f'Pclass: {pcls[i]} ')
        print(f'\tcorr {stats.pearsonr(X[:,i],y)[0]:.2f}')
        print(f'\tp-value {stats.pearsonr(X[:,i],y)[1]:.1e}')
        print(f'\tSignificant: {significant}')
        print(f'\tPassengers: {np.sum(X[:,i])}')
        model.append(f'class_{pcls[i]}')
        total = total + np.sum(X[:,i])
        print('+------------------------+')       

print(total)


# # Modeling
# 
# Below I use the information gleaned from exploration to build a model of who survives the disaster based upon variables considered to be most signicant. A function is defined to create an array of 0s and 1s from a dataframe which indicate whether or not the passenger has this particular feature. The array is then used to fit a logistic regression model. 

# In[70]:


model


# In[71]:


# View the format of the model array
pd.DataFrame(columns=model)


# In[72]:


def initialize_model(dataframe):
    model_arr = np.zeros((len(dataframe),len(model)),dtype=int)
    for i in range(0,len(dataframe)):
        
        ### SEX
        model_arr[i,0] = (dataframe['Sex'][i] == 'male')        
        
        ### PORT
        #if dataframe['Embarked'][i] == 'Q':
        #    model_arr[i,1] = 1
        if dataframe['Embarked'][i] == 'S':
            model_arr[i,2] = 1    
        
        ### DECK
        dataframe['Cabin'].replace(np.nan,'??',inplace=True)
        dataframe['deck'] = dataframe['Cabin'].apply(get_deck)
        if dataframe['deck'][i] == '?':
            model_arr[i,3] = 1
        if dataframe['deck'][i] == 'B':
            model_arr[i,4] = 1
        if dataframe['deck'][i] == 'C':
            model_arr[i,5] = 1
        if dataframe['deck'][i] == 'D':
            model_arr[i,6] = 1
        if dataframe['deck'][i] == 'E':
            model_arr[i,7] = 1
        

        ### TITLE
        miss = ['Ms','Mlle']
        mrs = ['Mme', 'Lady']
        dataframe['title'] = dataframe['Name'].apply(lambda x: x.split(',')[1].split('.')[0].replace(' ',''))
        dataframe['title'] = dataframe['title'].replace(miss,'Miss')
        dataframe['title'] = dataframe['title'].replace(mrs,'Mrs')
        if dataframe['title'][i] == 'Mr':
            model_arr[i,8] = 1
        if dataframe['title'][i] == 'Miss':
            model_arr[i,9] = 1
        if dataframe['title'][i] == 'Mrs':
            model_arr[i,10] = 1
        #if dataframe['title'][i] == 'Master':
        #    model_arr[i,11] = 1
        
        ### SIBSP
        if dataframe['SibSp'][i] == 0:
            model_arr[i,12] = 1        
        if dataframe['SibSp'][i] == 1:
            model_arr[i,13] = 1   
        #if dataframe['SibSp'][i] == 8:
        #   model_arr[i,14] = 1
        
        ### PARCH
        if dataframe['Parch'][i] == 0:
            model_arr[i,15] = 1                       
        if dataframe['Parch'][i] == 1:
            model_arr[i,16] = 1         
        #if dataframe['Parch'][i] == 2:
        #    model_arr[i,17] = 1
        
        ### PCLASS
        if dataframe['Pclass'][i] == 1:
            model_arr[i,18] = 1
        #if dataframe['Pclass'][i] == 2:
        #    model_arr[i,19] = 1             
        if dataframe['Pclass'][i] == 3:
            model_arr[i,20] = 1 
            
        ### AGE
        if dataframe['Age'].isnull().any():
            nullage = dataframe[dataframe['Age'].isnull()].index.tolist() # Bookmark indices of NaNs
            df2 = dataframe.loc[:,['Pclass','Sex','SibSp','Parch','Fare','Embarked','Age']]

            df2['Sex'].replace(['female','male'],[0,1],inplace=True)
            df2['Embarked'].replace(['C','Q','S'],[0,1,2],inplace=True)

            X = np.array(df2[df2['Age'].isnull()])
            X = scaler.transform(X[:,:6])
            dataframe.iloc[nullage,5] = knn.predict(X) # Use the same model we built earlier to estimate age
      
        if (dataframe['Age'][i] > 0) & (df['Age'][i] <= 4):
            model_arr[i,21] = 1
            
        #if (dataframe['Age'][i] > 32) & (df['Age'][i] <= 36):
        #    model_arr[i,22] = 1
        
    return model_arr            


# In[73]:


# Load the dataframe and initialize the model array
df_train = pd.read_csv('train.csv')
train_arr = initialize_model(df_train)
print(train_arr[0:5])


# In[78]:


# Define the model input
X = train_arr
y = np.array(df_train['Survived'])

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)


# define the model
clf_log = LogisticRegression(multi_class='multinomial',class_weight=None,solver='newton-cg',max_iter=100)
clf_log.fit(X_train, y_train)

# Score the model
print('Training Set')
print(f'\tf1-score: {f1_score(y_test,clf_log.predict(X_test)):.2f}')
print(f'\tacc. score: {accuracy_score(y_test,clf_log.predict(X_test)):.2f}\n')

# Load the competition set
df_test = pd.read_csv('test.csv')
df_true = pd.read_csv('gender_submission.csv')

# Initialize the competition data
test_arr = initialize_model(df_test)

# Predict survival of competition data and score
print('Competition Set')
print(f'\tAccuracy: {accuracy_score(df_true.iloc[:,1],clf_log.predict(test_arr)):.3f}')
print(f'\tf1-score: {f1_score(df_true.iloc[:,1],clf_log.predict(test_arr)):.3f}')


# Make a pretty picture
c_mat = confusion_matrix(df_true.iloc[:,1],clf_log.predict(test_arr))
disp = ConfusionMatrixDisplay(confusion_matrix = c_mat, display_labels=['No Survive','Survive'])
disp.plot(values_format='d')
plt.savefig('confusion.png',bbox_inches='tight',dpi=300)


# In[74]:


# Prepare a dataframe to submit to Kaggle.com
df_submit = pd.DataFrame({'PassengerId':df_test['PassengerId'],'Survived':clf_log.predict(test_arr).tolist()})
df_submit.set_index('PassengerId',inplace=True)
df_submit.to_csv('kagglesubmission.csv')
df_submit[0:5]


# ## Rankings
# 
# Let's see how I did in the rankings.

# In[80]:


# Load the leaderboard and print my rank.
df_leaderboard = pd.read_csv('./leaderboard/titanic-publicleaderboard.csv')
df_leaderboard[df_leaderboard['TeamName'] == 'Joey #5']


# Now visualize with a histogram.

# In[82]:


# Define some parameters
nbins = 65 # Number of bins
ylim = 0.855 # Maximum value for the histogram's y-axis
ymin = 0.7 # Minimum value for the histogram's y-axis
binwidth= (ylim-ymin)/nbins # The width of each bin
bins = np.arange(0.7,ylim,binwidth) # An array of evenly spaced bins "binwidth" apart

# Prepare data
thedigital = np.digitize(df_leaderboard['Score'],bins,right=True) # assign each age to a bin
thecount = np.bincount(thedigital) # Count the members in each bin

# Create some axes
plt.figure(figsize=(8,6))

# Draw the bars
plt.bar(bins[0:35]+binwidth/2,thecount[1:36],width=binwidth,color = 'C0')
plt.bar(bins[35:36]+binwidth/2,thecount[36:37],width=binwidth,color = 'C1')
plt.bar(bins[36:]+binwidth/2,thecount[37:],width=binwidth,color = 'C0')

# Label
plt.xticklabels=bins[33:66]
plt.xlim=(0.7,0.85)
plt.title('Competition Rankings')
plt.ylabel('Frequency')
plt.xlabel('Kaggle Score')

# Save and view
plt.savefig('./kaggle.png',bbox_inches='tight',dpi=300)
plt.show()


# Pretty good for first attempt at this competition.