import pandas as pd

# Kaggle Titanic training set, served via Data School's shortened URL.
df = pd.read_csv('http://bit.ly/kaggletrain')

# Stick to all-numeric columns so the models need no preprocessing.
cols = ['Pclass', 'Parch', 'SibSp', 'Fare']
X = df.loc[:, cols]   # feature matrix
y = df['Survived']    # binary target: 1 = survived
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import cross_val_score

# Baseline 1: logistic regression. liblinear suits small datasets;
# random_state pins any solver-internal randomness.
lr = LogisticRegression(solver='liblinear', random_state=1)
# The original transcript discarded these scores (and left the pasted
# REPL outputs as dead bare-expression statements); print them so the
# comparison is visible when run as a script.
print(cross_val_score(lr, X, y).mean())  # observed: ~0.6836

# Baseline 2: random forest. max_features=None examines every feature at
# each split; random_state makes the forest reproducible.
rf = RandomForestClassifier(max_features=None, random_state=1)
print(cross_val_score(rf, X, y).mean())  # observed: ~0.6948

# Soft-voting ensemble: averages the two models' predicted class
# probabilities, which beats either model alone on this data.
vc = VotingClassifier([('clf1', lr), ('clf2', rf)], voting='soft')
print(cross_val_score(vc, X, y).mean())  # observed: ~0.7251
# © 2020 Data School. All rights reserved.