#!/usr/bin/env python
# coding: utf-8
"""Titanic survival prediction script.

Reads the train/test CSV files, imputes and encodes the features,
fits a random forest on the training data and writes the predictions for
the test data to a submission CSV.
"""

import pandas as pd

# Paths used in this repository.  On a Kaggle kernel the files live at
# '../input/train.csv' and '../input/test.csv' instead.
TRAIN_CSV = "./titanic_csv/train.csv"
TEST_CSV = "./titanic_csv/test.csv"
SUBMISSION_CSV = 'submit.csv'

# Right-inclusive bin edges: a value v gets code i when
# edges[i] < v <= edges[i + 1].
_INF = float('inf')
FARE_BIN_EDGES = [-_INF, 7.91, 14.454, 31, _INF]
AGE_BIN_EDGES = [-_INF, 16, 32, 48, _INF]

# Columns removed once the engineered features have been derived from them.
DROPPED_COLUMNS = ['Name', 'Cabin', 'Ticket', 'SibSp', 'Parch']


def impute_age(cols):
    """Return the age of one passenger, filling a missing age from Pclass.

    Args:
        cols: A two-element row holding ``(Age, Pclass)`` in that order.
            Either a pandas Series (as produced by
            ``DataFrame.apply(..., axis=1)``) or a plain sequence.

    Returns:
        The original age when it is present; otherwise 39 for class 1,
        30 for class 2 and 25 for any other class.
    """
    if isinstance(cols, pd.Series):
        # Use explicit positional access: integer keys on a label-indexed
        # Series (``cols[0]``) are deprecated in pandas 2.x.
        age, pclass = cols.iloc[0], cols.iloc[1]
    else:
        age, pclass = cols[0], cols[1]

    if not pd.isnull(age):
        return age
    if pclass == 1:
        return 39
    if pclass == 2:
        return 30
    return 25


def preprocess(df):
    """Return a feature-engineered copy of a Titanic data frame.

    The input frame is left untouched.  It must contain the columns
    ``Age``, ``Pclass``, ``Sex``, ``Embarked``, ``Fare``, ``SibSp``,
    ``Parch``, ``Name``, ``Cabin`` and ``Ticket``.

    Args:
        df: Raw passenger data with no missing ``Fare`` or ``Embarked``
            values (the integer conversions below fail on NaN).

    Returns:
        A new DataFrame with imputed/binned ``Age``, binned ``Fare``,
        integer-encoded ``Sex`` and ``Embarked``, the added columns
        ``FamilySize`` and ``IsAlone``, and ``DROPPED_COLUMNS`` removed.
    """
    out = df.copy()

    # Fill missing ages based on passenger class.
    out['Age'] = out[['Age', 'Pclass']].apply(impute_age, axis=1)

    # Encode the categorical columns as integers.
    out['Sex'] = out['Sex'].map({"male": 0, "female": 1})
    out['Embarked'] = out['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)

    # Convert Fare and Age into ordinal categories 0..3.
    out['Fare'] = pd.cut(out['Fare'], bins=FARE_BIN_EDGES, labels=False).astype(int)
    out['Age'] = pd.cut(out['Age'], bins=AGE_BIN_EDGES, labels=False).astype(int)

    # Family size counts the passenger plus siblings/spouses and
    # parents/children; IsAlone flags passengers travelling alone.
    out['FamilySize'] = out['SibSp'] + out['Parch'] + 1
    out['IsAlone'] = (out['FamilySize'] == 1).astype(int)

    # Remove the columns that are no longer needed.
    return out.drop(DROPPED_COLUMNS, axis=1)


def main():
    """Train the model on the training CSV and write the submission CSV."""
    # Imported here so the preprocessing helpers above can be imported
    # without scikit-learn being installed.
    from sklearn.ensemble import RandomForestClassifier

    df_train = pd.read_csv(TRAIN_CSV)
    df_test = pd.read_csv(TEST_CSV)

    # Impute Embarked for the two training passengers listed here.
    df_train.loc[df_train['PassengerId'].isin([62, 830]), 'Embarked'] = 'C'
    # Impute Fare for this one test passenger.
    df_test.loc[df_test['PassengerId'] == 1044, 'Fare'] = 13.675550

    df_train = preprocess(df_train)
    df_test = preprocess(df_test)

    # Build X_train, Y_train and X_test.
    X_train = df_train.drop(["PassengerId", "Survived"], axis=1)
    Y_train = df_train["Survived"]
    X_test = df_test.drop("PassengerId", axis=1).copy()

    # Fit the model and predict.
    forest = RandomForestClassifier(random_state=1)
    forest.fit(X_train, Y_train)
    Y_pred = forest.predict(X_test)

    # Create the submission file.
    submission = pd.DataFrame({
        "PassengerId": df_test["PassengerId"],
        "Survived": Y_pred,
    })
    submission.to_csv(SUBMISSION_CSV, index=False)


if __name__ == "__main__":
    main()