# Ignore all warnings. The filter is installed before the remaining imports,
# so it is already active while they are being imported.
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
# When running as a kernel, the CSV files live under ../input instead:
#df_train = pd.read_csv('../input/train.csv')
#df_test = pd.read_csv('../input/test.csv')
# When running from this repository, read both CSV files from ./titanic_csv.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ("./titanic_csv/train.csv", "./titanic_csv/test.csv")
)
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
# Turn on inline plotting when running under IPython/Jupyter.
# The bare "% matplotlib inline" magic is not valid Python syntax, so the
# equivalent call is made through get_ipython(), which only exists inside
# an IPython session.
try:
    get_ipython().run_line_magic("matplotlib", "inline")
except NameError:
    # Plain Python interpreter: there is no IPython shell, nothing to enable.
    pass
# Font settings for rendering Japanese text
# (Japanese text cannot be displayed when running as a kernel).
plt.rcParams["font.size"] = 18
plt.rcParams['font.family'] = 'IPAPGothic'
# Default figure size.
plt.rcParams['figure.figsize'] = (8.0, 6.0)
# Impute Embarked: passengers 62 and 830 in the training data are set to 'C'.
embarked_rows = df_train['PassengerId'].isin([62, 830])
df_train.loc[embarked_rows, 'Embarked'] = 'C'
# Impute Fare: passenger 1044 in the test data is set to a fixed value.
fare_rows = df_test['PassengerId'] == 1044
df_test.loc[fare_rows, 'Fare'] = 13.675550
# Helper used to fill in missing Age values.
def impute_age(cols):
    """Return the age to use for one passenger, imputing it when missing.

    Args:
        cols: A row-like sequence (pandas Series, list, tuple, ...) whose
            first item is the passenger's Age and whose second item is the
            passenger's Pclass. Any further items are ignored.

    Returns:
        The given Age when it is not null. Otherwise a fixed default chosen
        by Pclass: 39 for class 1, 30 for class 2 and 25 for anything else.
    """
    # Unpack by position rather than indexing with cols[0] / cols[1]:
    # integer keys on a string-labelled Series rely on a positional fallback
    # that pandas has deprecated, while iteration always yields the values
    # in order for Series, lists and tuples alike.
    age, pclass, *_ = cols
    if not pd.isnull(age):
        return age
    if pclass == 1:
        return 39
    if pclass == 2:
        return 30
    return 25
data = [df_train, df_test]
# Lookup tables that turn the categorical text values into integer codes.
sex_codes = {"male": 0, "female": 1}
embarked_codes = {'S': 0, 'C': 1, 'Q': 2}
for df in data:
    # Impute Age row by row from the (Age, Pclass) pair.
    age_and_class = df[['Age', 'Pclass']]
    df['Age'] = age_and_class.apply(impute_age, axis=1)
    # Encode Sex as 0 (male) / 1 (female).
    df['Sex'] = df['Sex'].map(sex_codes)
    # Encode Embarked as an integer code.
    embarked_as_code = df['Embarked'].map(embarked_codes)
    df['Embarked'] = embarked_as_code.astype(int)
# Replace the Embarked column with one indicator column per value
# in both frames.
df_train, df_test = (
    pd.get_dummies(frame, columns=['Embarked'])
    for frame in (df_train, df_test)
)
# get_dummies returned new frames, so rebuild the list used by later loops.
data = [df_train, df_test]
# Bin edges for the ordinal encoding below. Intervals are right-inclusive,
# so a value equal to an inner edge falls into the lower bin.
fare_edges = [-np.inf, 7.91, 14.454, 31, np.inf]
age_edges = [-np.inf, 16, 32, 48, np.inf]
for df in data:
    # Convert Fare to a category code:
    #   <= 7.91 -> 0, (7.91, 14.454] -> 1, (14.454, 31] -> 2, > 31 -> 3.
    # pd.cut bins from the untouched original values in one step, instead of
    # overwriting the column while it is still being compared against the
    # thresholds. A missing value still makes the integer cast fail.
    df['Fare'] = pd.cut(df['Fare'], bins=fare_edges, labels=False).astype(int)
    # Convert Age to a category code:
    #   <= 16 -> 0, (16, 32] -> 1, (32, 48] -> 2, > 48 -> 3.
    df['Age'] = pd.cut(df['Age'], bins=age_edges, labels=False).astype(int)
# Drop the Name, Cabin and Ticket columns from both frames in place.
dropped_columns = ['Name', 'Cabin', 'Ticket']
df_train.drop(columns=dropped_columns, inplace=True)
df_test.drop(columns=dropped_columns, inplace=True)
# Training features: everything except the identifier and the target.
X_train = df_train.drop(columns=["PassengerId", "Survived"])
# Training target: the Survived column of df_train.
Y_train = df_train['Survived']
# Test features: everything except the identifier.
X_test = df_test.drop(columns='PassengerId').copy()
from sklearn.ensemble import RandomForestClassifier
# Fit a random forest on the training data (fit returns the estimator itself)
# and predict the target for the test features.
forest = RandomForestClassifier(random_state=1).fit(X_train, Y_train)
Y_prediction = forest.predict(X_test)
# Pair each test PassengerId with its prediction and write the result
# to submission.csv without the index column.
submission = df_test[['PassengerId']].assign(Survived=Y_prediction)
submission.to_csv('submission.csv', index=False)
# Feature importances of the fitted forest. As a bare expression this only
# displays a value when evaluated interactively (e.g. as a notebook cell).
forest.feature_importances_
# Output captured alongside this code (apparently from a notebook run),
# kept as a comment so that the file remains valid Python:
# array([ 0.13924783, 0.41004894, 0.11409453, 0.09676177, 0.07999719, 0.10232023, 0.02503838, 0.02067357, 0.01181757])

# Print every feature name next to its importance, rounded to 4 decimals.
for feature, importance in zip(X_train.columns, forest.feature_importances_):
    print(feature, round(importance, 4))
# Output captured alongside this code (apparently from a notebook run):
# Pclass 0.1392
# Sex 0.41
# Age 0.1141
# SibSp 0.0968
# Parch 0.08
# Fare 0.1023
# Embarked_0 0.025
# Embarked_1 0.0207
# Embarked_2 0.0118