#!/usr/bin/env python
# coding: utf-8
"""Survival-prediction walkthrough exported from a Jupyter notebook.

The script downloads a train and a test CSV, plots the mean of the
"Survived" column grouped by "Sex", "Pclass" and binned "Age", one-hot
encodes those three features, and scores a logistic regression with both a
single 80/20 split and 10-fold cross-validation.
"""

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split


def process_age(df, cut_points, label_names):
    """Return a copy of *df* with missing ages filled and an age-band column.

    Args:
        df: DataFrame containing an "Age" column.
        cut_points: Bin edges handed to ``pd.cut``.
        label_names: Labels for the bins, one per interval between edges.

    Returns:
        A new DataFrame whose "Age" NaNs are replaced by -0.5 and which has
        an extra "Age_categories" column with the bin label of each row.
        The frame passed in is not modified.
    """
    # Work on a copy so the caller's frame is not silently altered.
    df = df.copy()
    df["Age"] = df["Age"].fillna(-0.5)
    df["Age_categories"] = pd.cut(df["Age"], cut_points, labels=label_names)
    return df


def create_dummies(df, column_name):
    """Return *df* with one-hot columns for *column_name* appended.

    Args:
        df: Source DataFrame.
        column_name: Column to encode; also used as the prefix of the new
            columns (``<column_name>_<value>``).

    Returns:
        A new DataFrame: the original columns followed by the dummy columns.
    """
    dummies = pd.get_dummies(df[column_name], prefix=column_name)
    return pd.concat([df, dummies], axis=1)


# In[2]: load the data

test = pd.read_csv("https://raw.githubusercontent.com/uzay00/KaVe/master/2018/Lecture3/data/test.csv")
train = pd.read_csv("https://raw.githubusercontent.com/uzay00/KaVe/master/2018/Lecture3/data/train.csv")

print("Dimensions of train: {}".format(train.shape))
print("Dimensions of test: {}".format(test.shape))


# In[3]:

# The notebook cell displayed this value; print it so a script run shows it.
print(train.head())


# In[4]: survival rate by sex

# `get_ipython` only exists inside an IPython/Jupyter session. Under a plain
# `python` interpreter the name is undefined, so skip the magic there instead
# of crashing with NameError.
try:
    _ipython = get_ipython()
except NameError:
    _ipython = None
if _ipython is not None:
    _ipython.run_line_magic('matplotlib', 'inline')

sex_pivot = train.pivot_table(index="Sex", values="Survived")
sex_pivot.plot.bar()
plt.show()


# In[6]: survival rate by Pclass

class_pivot = train.pivot_table(index="Pclass", values="Survived")
class_pivot.plot.bar()
plt.show()


# In[7]:

print(train["Age"].describe())


# In[8]: age histograms, survivors vs. non-survivors

survived = train[train["Survived"] == 1]
died = train[train["Survived"] == 0]
survived["Age"].plot.hist(alpha=0.5, color='red', bins=50)
died["Age"].plot.hist(alpha=0.5, color='blue', bins=50)
plt.legend(['Survived', 'Died'])
plt.show()


# In[9]: bin ages into labelled categories

# Missing ages are filled with -0.5, which lands in the first bin (-1, 0]
# and therefore receives the "Missing" label.
cut_points = [-1, 0, 5, 12, 18, 35, 60, 100]
label_names = ["Missing", "Infant", "Child", "Teenager", "Young Adult", "Adult", "Senior"]

train = process_age(train, cut_points, label_names)
test = process_age(test, cut_points, label_names)

pivot = train.pivot_table(index="Age_categories", values='Survived')
pivot.plot.bar()
plt.show()


# In[10]: one-hot encode the categorical features

for column in ["Pclass", "Sex", "Age_categories"]:
    train = create_dummies(train, column)
    test = create_dummies(test, column)


# In[11]: fit on the full training frame

columns = ['Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_female', 'Sex_male',
           'Age_categories_Missing', 'Age_categories_Infant',
           'Age_categories_Child', 'Age_categories_Teenager',
           'Age_categories_Young Adult', 'Age_categories_Adult',
           'Age_categories_Senior']

lr = LogisticRegression()
lr.fit(train[columns], train["Survived"])


# In[12]: split the training frame into train/test parts

holdout = test  # from now on we will refer to this
                # dataframe as the holdout data

all_X = train[columns]
all_y = train['Survived']

train_X, test_X, train_y, test_y = train_test_split(
    all_X, all_y, test_size=0.20, random_state=0)


# In[13]:

lr = LogisticRegression()
lr.fit(train_X, train_y)
predictions = lr.predict(test_X)


# In[14]:

accuracy = accuracy_score(test_y, predictions)


# In[15]:

print(accuracy)


# In[16]: 10-fold cross-validation on the full training frame

lr = LogisticRegression()
scores = cross_val_score(lr, all_X, all_y, cv=10)
scores.sort()  # in-place, ascending, so the printed folds read low -> high
accuracy = scores.mean()
print(scores)
print(accuracy)


# In[ ]: