from sklearn.cross_validation import train_test_split, cross_val_score, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
def plot_confusion_matrix(cm):
    """Render a 2x2 confusion matrix as a labelled heatmap (not-survived vs survived)."""
    labels = ['not survived', 'survived']
    figure, axes = plt.subplots()
    image = axes.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    axes.set_title('Confusion Matrix')
    figure.colorbar(image)
    ticks = np.arange(len(labels))
    axes.set_xticks(ticks)
    axes.set_xticklabels(labels, rotation=45)
    axes.set_yticks(ticks)
    axes.set_yticklabels(labels)
    axes.set_ylabel('True label')
    axes.set_xlabel('Predicted label')
    figure.tight_layout()
# Load the Kaggle Titanic data sets from the working directory.
df_train = pd.read_csv('./train.csv')
df_test = pd.read_csv('./test.csv')
# PassengerId carries no predictive signal; drop it from the training frame.
# (It is kept in df_test because the submission file needs it.)
df_train.drop('PassengerId', axis=1, inplace=True)
df_train.head(2)
Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 3 | Braund, Mr. Owen Harris | male | 22 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
#df_train.groupby([df_train.Pclass, df_train.Sex]).Age.mean()
def _extract_title(name):
if name.find('Mr.') > 0:
return 'Mr'
elif name.find('Mrs.') > 0:
return 'Mrs'
elif name.find('Master.') > 0:
return 'Master.'
elif name.find('Miss.') > 0:
return 'Miss'
else:
return None
def extract_title(df):
    """Add a 'Title' column plus one-hot 'title_*' indicator columns.

    Derives the honorific via _extract_title and joins the resulting
    dummy columns onto a copy of df, which is returned.
    """
    df['Title'] = df['Name'].apply(_extract_title)
    dummies = pd.get_dummies(df['Title'])
    dummies.rename(columns=lambda c: 'title' + "_" + str(c), inplace=True)
    return df.join(dummies)
def fill_fare(df):
    """Create a log-scaled 'FareFill' column with zero/missing fares imputed.

    Missing fares are first treated as 0, then any zero fare is replaced
    by a representative fare for the passenger's class (86 / 21 / 13 for
    classes 1 / 2 / 3) before taking the natural log. Mutates df in place
    and returns it.

    Fix: the original wrote through chained indexing
    (`df.FareFill[mask] = v`), which pandas flags with
    SettingWithCopyWarning and may silently fail to assign; `.loc` writes
    are the supported form.
    """
    df['Fare'] = df['Fare'].fillna(0)
    df['FareFill'] = df['Fare']
    # Representative fare per class, used when no fare was recorded.
    for pclass, fare in {1: 86, 2: 21, 3: 13}.items():
        df.loc[(df['Fare'] == 0) & (df['Pclass'] == pclass), 'FareFill'] = fare
    # Log-scale to tame the heavy right tail of fares.
    df['FareFill'] = np.log(df['FareFill'])
    return df
def fill_age(df):
    """Impute missing ages into a new 'AgeFill' column.

    Missing ages get a typical age for the passenger's (sex, class)
    combination; rows titled 'Master' or 'Miss' are then overridden with
    younger estimates (applied last, so the title-based value wins).
    Requires the 'Title' column produced by extract_title(). Mutates df
    in place and returns it.

    Fix: the original wrote through chained indexing
    (`df.AgeFill[mask] = v`), which pandas flags with
    SettingWithCopyWarning and may silently fail to assign; `.loc` writes
    are the supported form.
    """
    df['AgeFill'] = df['Age']
    missing = df['Age'].isnull()
    # Typical ages by (sex, class) — constants carried over from the
    # original notebook's groupby analysis.
    age_by_sex_class = {('male', 1): 41, ('male', 2): 30, ('male', 3): 26,
                        ('female', 1): 34, ('female', 2): 28, ('female', 3): 21}
    for (sex, pclass), age in age_by_sex_class.items():
        df.loc[missing & (df['Sex'] == sex) & (df['Pclass'] == pclass), 'AgeFill'] = age
    # Title-based overrides: Masters are boys, Misses skew young.
    df.loc[missing & (df['Title'] == 'Master'), 'AgeFill'] = 7
    df.loc[missing & (df['Title'] == 'Miss'), 'AgeFill'] = 20
    return df
def extract_pclass(df):
    """Join one-hot 'pclass_*' indicator columns for passenger class onto df."""
    dummies = pd.get_dummies(df['Pclass'])
    dummies.rename(columns=lambda c: 'pclass' + "_" + str(c), inplace=True)
    return df.join(dummies)
def convert_sex(df):
    """Add a numeric 'Gender' column: 0 for 'male', 1 otherwise. Returns df."""
    df['Gender'] = (df['Sex'] != 'male').astype(int)
    return df
def extract_feature(df):
    """Build the model feature matrix from a raw Titanic dataframe.

    Runs the full feature pipeline (titles, age/fare imputation, class
    dummies, numeric gender) and drops every raw/intermediate column,
    leaving only model inputs.
    """
    for step in (extract_title, fill_age, extract_pclass, convert_sex, fill_fare):
        df = step(df)
    unwanted = {'PassengerId', 'Title', 'Name', 'SibSp', 'Ticket', 'Fare',
                'Pclass', 'Survived', 'Parch', 'Sex', 'Age', 'Cabin',
                'Embarked', 'CCabin'}
    # Only drop columns actually present (e.g. df_test has no 'Survived').
    return df.drop(list(unwanted.intersection(df.columns)), axis=1)
def get_classifier():
    """Return the classifier used throughout this notebook.

    An L2-regularised logistic regression; tree and random-forest
    alternatives were tried but performed no better here.
    """
    return LogisticRegression(C=100, penalty='l2', tol=0.01)
def calc_classifier(df):
    """Fit the classifier on an 80/20 split of df and report accuracies.

    Prints sample counts and train/validation accuracy, computes the
    validation confusion matrix, and returns the fitted classifier.
    """
    features = extract_feature(df)
    target = df['Survived']
    features, features_val, target, target_val = train_test_split(
        features, target, train_size=0.8, random_state=0)
    print('Num of Training Samples: {}'.format(len(features)))
    print('Num of Validation Samples: {}'.format(len(features_val)))
    clf = get_classifier()
    clf.fit(features, target)
    pred_train = clf.predict(features)
    pred_val = clf.predict(features_val)
    print('Accuracy on Training Set: {:.3f}'.format(accuracy_score(target, pred_train)))
    print('Accuracy on Validation Set: {:.3f}'.format(accuracy_score(target_val, pred_val)))
    # Confusion matrix on the validation fold (can be passed to
    # plot_confusion_matrix for a visual).
    cm = confusion_matrix(target_val, pred_val)
    return clf
def cross_val(X, y, K, random_state=0, clf=None):
    """Run shuffled K-fold cross-validation and print per-fold scores.

    Uses get_classifier() when clf is None. Returns the array of fold
    scores; also prints their mean with a +/- 2-sigma interval.
    """
    estimator = clf if clf is not None else get_classifier()
    folds = KFold(len(y), K, shuffle=True, random_state=random_state)
    scores = cross_val_score(estimator, X, y, cv=folds)
    print('Scores:', scores)
    print('Mean Score: {0:.3f} (+/-{1:.3f})'.format(scores.mean(), scores.std()*2))
    return scores
# Build the feature matrix and labels from the full training set.
X_train = extract_feature(df_train)
y_train = df_train.Survived
# Earlier regularisation trials, kept for reference:
#cross_val(X_train, y_train, 5, clf=LogisticRegression(C=100, penalty='l2', tol=0.01))
#cross_val(X_train, y_train, 5, clf=LogisticRegression(C=10, penalty='l2', tol=0.01))
#cross_val(X_train, y_train, 5, clf=LogisticRegression(C=1, penalty='l2', tol=0.01))
#cross_val(X_train, y_train, 5, clf=LogisticRegression(C=100, penalty='l1', tol=0.01))
#cross_val(X_train, y_train, 5, clf=LogisticRegression(C=10, penalty='l1', tol=0.01))
# 10-fold cross-validation with the chosen regularisation strength.
cross_val(X_train, y_train, 10, clf=LogisticRegression(C=0.1, penalty='l2', tol=0.01))
('Scores:', array([ 0.75555556, 0.7752809 , 0.78651685, 0.7752809 , 0.79775281, 0.79775281, 0.83146067, 0.79775281, 0.82022472, 0.85393258])) Mean Score: 0.799 (+/-0.055)
array([ 0.75555556, 0.7752809 , 0.78651685, 0.7752809 , 0.79775281, 0.79775281, 0.83146067, 0.79775281, 0.82022472, 0.85393258])
# Train the final model on an 80/20 split and keep the fitted classifier.
clf = calc_classifier(df_train)
Num of Training Samples: 712 Num of Validation Samples: 179 Accuracy on Training Set: 0.831 Accuracy on Validation Set: 0.799
X_train.head(10)
title_Master. | title_Miss | title_Mr | title_Mrs | AgeFill | pclass_1 | pclass_2 | pclass_3 | Gender | FareFill | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 1 | 0 | 22 | 0 | 0 | 1 | 0 | 1.981001 |
1 | 0 | 0 | 0 | 1 | 38 | 1 | 0 | 0 | 1 | 4.266662 |
2 | 0 | 1 | 0 | 0 | 26 | 0 | 0 | 1 | 1 | 2.070022 |
3 | 0 | 0 | 0 | 1 | 35 | 1 | 0 | 0 | 1 | 3.972177 |
4 | 0 | 0 | 1 | 0 | 35 | 0 | 0 | 1 | 0 | 2.085672 |
5 | 0 | 0 | 1 | 0 | 26 | 0 | 0 | 1 | 0 | 2.135148 |
6 | 0 | 0 | 1 | 0 | 54 | 1 | 0 | 0 | 0 | 3.948596 |
7 | 1 | 0 | 0 | 0 | 2 | 0 | 0 | 1 | 0 | 3.048088 |
8 | 0 | 0 | 0 | 1 | 27 | 0 | 0 | 1 | 1 | 2.409941 |
9 | 0 | 0 | 0 | 1 | 14 | 0 | 1 | 0 | 1 | 3.403555 |
# Featurise the test set with the same pipeline and predict survival.
Y = extract_feature(df_test)
df_test['Survived'] = clf.predict(Y)
# The Kaggle submission needs only PassengerId plus the prediction.
submit_data = df_test[['PassengerId', 'Survived']]
Y.head()
title_Master. | title_Miss | title_Mr | title_Mrs | AgeFill | pclass_1 | pclass_2 | pclass_3 | Gender | FareFill | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 1 | 0 | 34.5 | 0 | 0 | 1 | 0 | 2.057860 |
1 | 0 | 0 | 0 | 1 | 47.0 | 0 | 0 | 1 | 1 | 1.945910 |
2 | 0 | 0 | 1 | 0 | 62.0 | 0 | 1 | 0 | 0 | 2.270836 |
3 | 0 | 0 | 1 | 0 | 27.0 | 0 | 0 | 1 | 0 | 2.159003 |
4 | 0 | 0 | 0 | 1 | 22.0 | 0 | 0 | 1 | 1 | 2.508582 |
# Write the submission file without the dataframe index column.
submit_data.to_csv('./submit_simple_add_title.csv', index=False)