# warningsを無視する
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
df_train = pd.read_csv("./titanic_csv/train.csv")
df_test = pd.readb_csv("./titanic_csv/test.csv")
df_gender_submission = pd.read_csv("./titanic_csv/gender_submission.csv")
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
# 本文にはない、レイアウト設定用
sns.set_palette("Blues_r", 3) # 青3色のスタイル
mpl.style.use('seaborn-white') # 背景が白いスタイル
# jupyter notebook用
% matplotlib inline
# 日本語表示用 ##カーネルでは日本語表示できない
plt.rcParams["font.size"] = 18
plt.rcParams['font.family'] = 'IPAPGothic'
# サイズの設定
plt.rcParams['figure.figsize'] = (8.0, 6.0)
df_train.head(5)
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
print(df_train.shape) # 学習用データ
print(df_test.shape) # 本番予測用データ
print(df_gender_submission.shape) # 提出データのサンプル
(891, 12) (418, 11) (418, 2)
print(df_train.columns) # トレーニングデータの列名
print('-'*10) # 区切りを挿入
print(df_test.columns) # テストデータの列名
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'], dtype='object') ---------- Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'], dtype='object')
df_train.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 891 entries, 0 to 890 Data columns (total 12 columns): PassengerId 891 non-null int64 Survived 891 non-null int64 Pclass 891 non-null int64 Name 891 non-null object Sex 891 non-null object Age 714 non-null float64 SibSp 891 non-null int64 Parch 891 non-null int64 Ticket 891 non-null object Fare 891 non-null float64 Cabin 204 non-null object Embarked 889 non-null object dtypes: float64(2), int64(5), object(5) memory usage: 83.6+ KB
df_test.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 418 entries, 0 to 417 Data columns (total 11 columns): PassengerId 418 non-null int64 Pclass 418 non-null int64 Name 418 non-null object Sex 418 non-null object Age 332 non-null float64 SibSp 418 non-null int64 Parch 418 non-null int64 Ticket 418 non-null object Fare 417 non-null float64 Cabin 91 non-null object Embarked 418 non-null object dtypes: float64(2), int64(4), object(5) memory usage: 36.0+ KB
df_train.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
df_train.isnull().sum()
# isnull()は、欠損値に対しTrueを返し、欠損値以外にはFalseを返す
# sum()は、Trueを1、Falseを0として合計する
# よってdf.isnull().sum()で欠損値を算出することができる
PassengerId 0 Survived 0 Pclass 0 Name 0 Sex 0 Age 177 SibSp 0 Parch 0 Ticket 0 Fare 0 Cabin 687 Embarked 2 dtype: int64
df_test.isnull().sum()
PassengerId 0 Pclass 0 Name 0 Sex 0 Age 86 SibSp 0 Parch 0 Ticket 0 Fare 1 Cabin 327 Embarked 0 dtype: int64
# df_trainとdf_Testを縦に連結
df_full = pd.concat([df_train, df_test], axis = 0, ignore_index=True)
print(df_full.shape) # df_fullの行数と列数を確認
df_full.describe() # df_fullの要約統計量
(1309, 12)
Age | Fare | Parch | PassengerId | Pclass | SibSp | Survived | |
---|---|---|---|---|---|---|---|
count | 1046.000000 | 1308.000000 | 1309.000000 | 1309.000000 | 1309.000000 | 1309.000000 | 891.000000 |
mean | 29.881138 | 33.295479 | 0.385027 | 655.000000 | 2.294882 | 0.498854 | 0.383838 |
std | 14.413493 | 51.758668 | 0.865560 | 378.020061 | 0.837836 | 1.041658 | 0.486592 |
min | 0.170000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 |
25% | NaN | NaN | 0.000000 | 328.000000 | 2.000000 | 0.000000 | NaN |
50% | NaN | NaN | 0.000000 | 655.000000 | 3.000000 | 0.000000 | NaN |
75% | NaN | NaN | 0.000000 | 982.000000 | 3.000000 | 1.000000 | NaN |
max | 80.000000 | 512.329200 | 9.000000 | 1309.000000 | 3.000000 | 8.000000 | 1.000000 |
# 男女別の生存者数を可視化
sns.countplot(x='Survived', data=df_train)
plt.xlabel("Survived", fontsize = 20)
plt.ylabel("count", fontsize = 20)
plt.title("死亡者と生存者の数", fontsize = 20)
plt.yticks(fontsize = 20)
plt.xticks([0.0,1.0], ['死亡','生存'])
plt.xticks([0,1],["死亡者", "生存者"], fontsize = 20)
plt.show()
# 男女別の生存割合を表示する
df_train[['Sex','Survived']].groupby(['Sex']).mean()
Survived | |
---|---|
Sex | |
female | 0.742038 |
male | 0.188908 |
sns.countplot(x='Survived', hue='Sex', data=df_train)
plt.xticks([0.0,1.0], ["死亡","生存"])
plt.tick_params(labelsize=20)
plt.xlabel('Survived', fontsize = 20)
plt.ylabel('count',fontsize=20)
plt.title("男女別の死亡者と生存者の数", fontsize = 20)
plt.legend(fontsize = 20)
plt.tight_layout()
# チケットクラス別の生存者数を可視化
sns.countplot(x='Survived', hue='Pclass', data=df_train)
plt.xticks([0.0,1.0], ['死亡','生存'])
plt.title("チケットクラス別の死亡者と生存者の数", fontsize = 20)
plt.tick_params(labelsize=20)
plt.xlabel('Survived', fontsize = 20)
plt.ylabel('count',fontsize=20)
plt.legend(fontsize = 20)
plt.tight_layout()
plt.show()
# チケットクラス別の生存割合を表示する
df_train[['Pclass','Survived']].groupby(['Pclass']).mean()
Survived | |
---|---|
Pclass | |
1 | 0.629630 |
2 | 0.472826 |
3 | 0.242363 |
# 全体のヒストグラム
sns.distplot(df_train['Age'].dropna(), kde=False, bins = 30 ,color = "yellow",label="全体")
plt.title("乗船者の年齢の分布", fontsize = 20)
plt.tick_params(labelsize=20)
plt.xlabel('Age', fontsize = 20)
plt.ylabel('count',fontsize=20)
# 死亡者のヒストグラム
sns.distplot(df_train[df_train["Survived"] == 0].Age.dropna(), kde = False,bins=30,color = "green", label="死亡")
# 生存者のヒストグラム
sns.distplot(df_train[df_train["Survived"] == 1].Age.dropna(), kde = False,bins=30,color = "red", label="生存")
plt.legend(fontsize = 20); # 凡例を表示
# 年齢を8等分し、CategoricalAgeという変数を作成
df_train['CategoricalAge'] = pd.cut(df_train['Age'], 8)
# CategoricalAgeでグルーピングして、Survivedを平均
df_train[['CategoricalAge', 'Survived']].groupby(['CategoricalAge'], as_index=False).mean()
CategoricalAge | Survived | |
---|---|---|
0 | (0.34, 10.367] | 0.593750 |
1 | (10.367, 20.315] | 0.382609 |
2 | (20.315, 30.263] | 0.365217 |
3 | (30.263, 40.21] | 0.445161 |
4 | (40.21, 50.157] | 0.383721 |
5 | (50.157, 60.105] | 0.404762 |
6 | (60.105, 70.0525] | 0.235294 |
7 | (70.0525, 80] | 0.200000 |
sns.countplot(x='SibSp', data = df_train, color='cornflowerblue')
plt.title("同乗している兄弟・配偶者の数", fontsize = 20)
plt.tick_params(labelsize=20)
plt.xlabel('SibSp', fontsize = 20)
plt.ylabel('count',fontsize=20)
plt.tight_layout()
# SibSpが0か1であればそのまま、2以上であれば2である特徴量SibSp_0_1_2overを作成
df_train["SibSp_0_1_2over"] = [i if i <=1 else 2 for i in df_train["SibSp"]]
# SibSp_0_1_2overごとに集計し、可視化
sns.countplot(x='Survived', hue='SibSp_0_1_2over', data=df_train)
plt.legend(["0人","1人","2人以上"],fontsize =20)
plt.xticks([0,1],["死亡","生存"],fontsize = 20)
plt.title("同乗している兄弟・配偶者の数別の死亡者と生存者の数", fontsize = 20)
plt.xlabel("Survived",fontsize = 20)
plt.ylabel("count",fontsize = 20)
plt.show()
df_train[["SibSp_0_1_2over","Survived"]].groupby(["SibSp_0_1_2over"]).mean()
Survived | |
---|---|
SibSp_0_1_2over | |
0 | 0.345395 |
1 | 0.535885 |
2 | 0.270270 |
sns.countplot(x='Parch', data = df_train ,color='cornflowerblue')
plt.title("同乗している両親・子供の数", fontsize = 20)
plt.tick_params(labelsize=20)
plt.xlabel('Parch', fontsize = 20)
plt.ylabel('count',fontsize=20)
plt.tight_layout()
sns.set_palette("Blues_r", 4)
# 2以下であればそのままの数、3以上は3という変換を行う
df_train["Parch_0_1_2_3over"] = [i if i <=2 else 3 for i in df_train["Parch"]]
# Parch_0_1_2_3overごとに集計し可視化
sns.countplot(x="Survived",hue="Parch_0_1_2_3over", data = df_train)
plt.title("同乗している両親・子供の数別の死亡者と生存者の数", fontsize = 20)
plt.legend(["0人","1人","2人","3人以上"],fontsize=20)
plt.xticks([0,1],["死亡","生存"],fontsize = 20)
plt.tick_params(labelsize=20)
plt.xlabel('Parch', fontsize = 20)
plt.ylabel('count',fontsize=20)
plt.tight_layout()
df_train[["Parch_0_1_2_3over","Survived"]].groupby(["Parch_0_1_2_3over"]).mean()
Survived | |
---|---|
Parch_0_1_2_3over | |
0 | 0.343658 |
1 | 0.550847 |
2 | 0.500000 |
3 | 0.266667 |
#SibSpとParchが同乗している家族の数。1を足すと家族の人数となる
df_train['FamilySize']=df_train['SibSp']+ df_train['Parch']+ 1
# IsAloneを0とし、2行目でFamilySizeが1であれば1にしている
df_train['IsAlone'] = 0
df_train.loc[df_train['FamilySize'] == 1, 'IsAlone'] = 1
# IsAloneごとに可視化
sns.countplot(x="Survived",hue="IsAlone",data=df_train)
plt.legend(["2人以上","1人で乗船"],fontsize = 20)
plt.xticks([0,1],["死亡","生存"],fontsize=20)
plt.xlabel("Survived",fontsize=20)
plt.ylabel("count",fontsize=20)
plt.yticks(fontsize=20)
plt.title("1人or2人以上で乗船別の死亡者と生存者の数",fontsize=20)
<matplotlib.text.Text at 0x1174b52b0>
sns.distplot(df_train['Fare'].dropna(),kde = False,hist=True);
plt.title("運賃の分布", fontsize = 20)
plt.tick_params(labelsize=20)
plt.xlabel('Fare', fontsize = 20)
plt.ylabel('count',fontsize=20)
plt.tight_layout()
sns.distplot(df_train['Fare'].dropna(), rug=False, kde = False,hist=True);
plt.title("運賃の分布", fontsize = 20)
plt.tick_params(labelsize=20)
plt.xlabel('Fare', fontsize = 20)
plt.ylabel('count',fontsize=20)
plt.tight_layout()
df_train['CategoricalFare'] = pd.qcut(df_train['Fare'], 4)
df_train[['CategoricalFare', 'Survived']].groupby(['CategoricalFare'], as_index=False).mean()
CategoricalFare | Survived | |
---|---|---|
0 | [0, 7.91] | 0.197309 |
1 | (7.91, 14.454] | 0.303571 |
2 | (14.454, 31] | 0.454955 |
3 | (31, 512.329] | 0.581081 |
df_test['Name'][0:5]
0 Kelly, Mr. James 1 Wilkes, Mrs. James (Ellen Needs) 2 Myles, Mr. Thomas Francis 3 Wirz, Mr. Albert 4 Hirvonen, Mrs. Alexander (Helga E Lindqvist) Name: Name, dtype: object
# 敬称を抽出し、重複を省く
set(df_train.Name.str.extract(' ([A-Za-z]+)\.', expand=False))
{'Capt', 'Col', 'Countess', 'Don', 'Dr', 'Jonkheer', 'Lady', 'Major', 'Master', 'Miss', 'Mlle', 'Mme', 'Mr', 'Mrs', 'Ms', 'Rev', 'Sir'}
# collections.Counterを使用して、数え上げる
import collections
collections.Counter(df_train.Name.str.extract(' ([A-Za-z]+)\.', expand=False))
Counter({'Capt': 1, 'Col': 2, 'Countess': 1, 'Don': 1, 'Dr': 7, 'Jonkheer': 1, 'Lady': 1, 'Major': 2, 'Master': 40, 'Miss': 182, 'Mlle': 2, 'Mme': 1, 'Mr': 517, 'Mrs': 125, 'Ms': 1, 'Rev': 6, 'Sir': 1})
# df_trainにTitle列を作成、Title列の値は敬称
df_train['Title'] = df_train.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
# df_testにTitle列を作成、Title列の値は敬称
df_test['Title'] = df_test.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
# df_trainのTitle列の値ごとに平均値を算出
df_train.groupby('Title').mean()['Age']
Title Capt 70.000000 Col 58.000000 Countess 33.000000 Don 40.000000 Dr 42.000000 Jonkheer 38.000000 Lady 48.000000 Major 48.500000 Master 4.574167 Miss 21.773973 Mlle 24.000000 Mme 24.000000 Mr 32.368090 Mrs 35.898148 Ms 28.000000 Rev 43.166667 Sir 49.000000 Name: Age, dtype: float64
# 変換するための関数を作成
def title_to_num(title):
if title == 'Master':
return 1
elif title == 'Miss':
return 2
elif title == 'Mr':
return 3
elif title == 'Mrs':
return 4
else:
return 5
# リスト内包表記を用いて変換
df_train['Title_num'] = [title_to_num(i) for i in df_train['Title']]
df_test['Title_num'] = [title_to_num(i) for i in df_test['Title']]