from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup
# Download the passenger-and-crew listing and parse its first HTML table into
# a DataFrame.
url = "https://www.encyclopedia-titanica.org/titanic-passengers-and-crew/"
# A timeout keeps the script from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as data.
odpowiedz = requests.get(url, timeout=30)
odpowiedz.raise_for_status()
strona = odpowiedz.text
soup = BeautifulSoup(strona, "html.parser")
table = soup.find('table')
if table is None:
    # Without this check str(None) == "None" would be handed to read_html.
    raise ValueError("No <table> element found at " + url)
# read_html accepts file-like objects; passing literal HTML as a plain string
# is deprecated in recent pandas releases, so wrap the markup in StringIO.
data = pd.read_html(StringIO(str(table)), flavor='bs4')[0]
print(data.shape)
(2456, 8)
data.head()
Name | Age | Class/Dept | Ticket | Joined | Job | Boat [Body] | Unnamed: 7 | |
---|---|---|---|---|---|---|---|---|
0 | ABī-AL-MUNà, Mr Nāsīf Qāsim | 27 | 3rd Class Passenger | 2699£18 15s 9d | Cherbourg | NaN | 15 | NaN |
1 | ABBING, Mr Anthony | 42 | 3rd Class Passenger | 5547£7 11s | Southampton | Blacksmith | NaN | NaN |
2 | ABBOTT, Mrs Rhoda Mary 'Rosa' | 39 | 3rd Class Passenger | CA2673£20 5s | Southampton | NaN | A | NaN |
3 | ABBOTT, Mr Rossmore Edward | 16 | 3rd Class Passenger | CA2673£20 5s | Southampton | Jeweller | [190] | NaN |
4 | ABBOTT, Mr Eugene Joseph | 13 | 3rd Class Passenger | CA2673£20 5s | Southampton | Scholar | NaN | NaN |
# Keep only the four columns that the rest of the script reads.
data = data[["Name","Age","Class/Dept","Boat [Body]"]]
data.head()
Name | Age | Class/Dept | Boat [Body] | |
---|---|---|---|---|
0 | ABī-AL-MUNà, Mr Nāsīf Qāsim | 27 | 3rd Class Passenger | 15 |
1 | ABBING, Mr Anthony | 42 | 3rd Class Passenger | NaN |
2 | ABBOTT, Mrs Rhoda Mary 'Rosa' | 39 | 3rd Class Passenger | A |
3 | ABBOTT, Mr Rossmore Edward | 16 | 3rd Class Passenger | [190] |
4 | ABBOTT, Mr Eugene Joseph | 13 | 3rd Class Passenger | NaN |
# Replace missing "Boat [Body]" entries with an empty string so that every
# value in the column can be tested with string operations.
data["Boat [Body]"]= data["Boat [Body]"].fillna("")
data.head()
Name | Age | Class/Dept | Boat [Body] | |
---|---|---|---|---|
0 | ABī-AL-MUNà, Mr Nāsīf Qāsim | 27 | 3rd Class Passenger | 15 |
1 | ABBING, Mr Anthony | 42 | 3rd Class Passenger | |
2 | ABBOTT, Mrs Rhoda Mary 'Rosa' | 39 | 3rd Class Passenger | A |
3 | ABBOTT, Mr Rossmore Edward | 16 | 3rd Class Passenger | [190] |
4 | ABBOTT, Mr Eugene Joseph | 13 | 3rd Class Passenger |
def przetrwanie(val):
    """Return 1 if a "Boat [Body]" entry indicates survival, otherwise 0.

    An entry counts as "did not survive" when it is missing, empty (or only
    whitespace), or contains ``[`` (the bracketed form, e.g. ``"[190]"``).
    Any other entry (e.g. ``"15"`` or ``"A"``) counts as survival.

    Args:
        val: Cell value from the "Boat [Body]" column. Strings are expected;
            ``None``/NaN are treated like an empty entry and other scalars
            are converted with ``str`` instead of raising ``TypeError``.

    Returns:
        int: 1 for survival, 0 otherwise.
    """
    if pd.isna(val):
        return 0
    tekst = str(val).strip()
    if not tekst or "[" in tekst:
        return 0
    return 1
# Binary survival flag derived from the "Boat [Body]" column.
data["Przetrwanie"] = data["Boat [Body]"].apply(przetrwanie)
# Convert the whole Age column in one vectorised call rather than calling
# pd.to_numeric once per cell; values that are not plain numbers become NaN.
data["Age"] = pd.to_numeric(data["Age"], errors="coerce")
data.head()
Name | Age | Class/Dept | Boat [Body] | Przetrwanie | |
---|---|---|---|---|---|
0 | ABī-AL-MUNà, Mr Nāsīf Qāsim | 27.0 | 3rd Class Passenger | 15 | 1 |
1 | ABBING, Mr Anthony | 42.0 | 3rd Class Passenger | 0 | |
2 | ABBOTT, Mrs Rhoda Mary 'Rosa' | 39.0 | 3rd Class Passenger | A | 1 |
3 | ABBOTT, Mr Rossmore Edward | 16.0 | 3rd Class Passenger | [190] | 0 |
4 | ABBOTT, Mr Eugene Joseph | 13.0 | 3rd Class Passenger | 0 |
def jaka_klasa(klasa):
    """Map a "Class/Dept" entry to a class label.

    Entries containing "Passenger" yield the first character of their first
    word (``"3rd Class Passenger"`` -> ``"3"``); every other string yields
    ``"Załoga"``.

    Args:
        klasa: Cell value from the "Class/Dept" column.

    Returns:
        str | None: The class label, or ``None`` when the value is missing
        or not a string (previously this raised ``TypeError``).
    """
    if not isinstance(klasa, str):
        # Missing in -> missing out; an unknown entry is not labelled as crew.
        return None
    if "Passenger" in klasa:
        return klasa.split()[0][0]
    return "Załoga"
# Add the "Klasa" column by running jaka_klasa over every "Class/Dept" value.
data["Klasa"] = data["Class/Dept"].map(jaka_klasa)
data.head()
Name | Age | Class/Dept | Boat [Body] | Przetrwanie | Klasa | |
---|---|---|---|---|---|---|
0 | ABī-AL-MUNà, Mr Nāsīf Qāsim | 27.0 | 3rd Class Passenger | 15 | 1 | 3 |
1 | ABBING, Mr Anthony | 42.0 | 3rd Class Passenger | 0 | 3 | |
2 | ABBOTT, Mrs Rhoda Mary 'Rosa' | 39.0 | 3rd Class Passenger | A | 1 | 3 |
3 | ABBOTT, Mr Rossmore Edward | 16.0 | 3rd Class Passenger | [190] | 0 | 3 |
4 | ABBOTT, Mr Eugene Joseph | 13.0 | 3rd Class Passenger | 0 | 3 |
def jaka_grupa(wiek):
    """Classify an age as "Dziecko" (under 18) or "Dorosły" (18 and over).

    Args:
        wiek: Age as a number; may be NaN/None when the age is unknown.

    Returns:
        str | None: ``"Dziecko"`` or ``"Dorosły"``, or ``None`` when the age
        is missing. ``NaN < 18`` evaluates to False, so without this guard
        every unknown age would silently be labelled "Dorosły".
    """
    if pd.isna(wiek):
        return None
    return "Dziecko" if wiek < 18 else "Dorosły"
# Add the "Dziecko/Dorosły" column by running jaka_grupa over every Age value.
data["Dziecko/Dorosły"] = data["Age"].map(jaka_grupa)
data.head()
Name | Age | Class/Dept | Boat [Body] | Przetrwanie | Klasa | Dziecko/Dorosły | |
---|---|---|---|---|---|---|---|
0 | ABī-AL-MUNà, Mr Nāsīf Qāsim | 27.0 | 3rd Class Passenger | 15 | 1 | 3 | Dorosły |
1 | ABBING, Mr Anthony | 42.0 | 3rd Class Passenger | 0 | 3 | Dorosły | |
2 | ABBOTT, Mrs Rhoda Mary 'Rosa' | 39.0 | 3rd Class Passenger | A | 1 | 3 | Dorosły |
3 | ABBOTT, Mr Rossmore Edward | 16.0 | 3rd Class Passenger | [190] | 0 | 3 | Dziecko |
4 | ABBOTT, Mr Eugene Joseph | 13.0 | 3rd Class Passenger | 0 | 3 | Dziecko |
# Honorifics treated as male. "Dr" is deliberately left out because it does
# not determine gender; such names still fall through to "Kobieta".
_TYTULY_MESKIE = frozenset({
    "Mr", "Mr.", "Master", "Sig.", "Sr.",
    "Rev", "Rev.", "Revd", "Fr", "Fr.", "Father",
    "Col", "Col.", "Colonel", "Major",
    "Capt", "Capt.", "Captain", "Sir", "Don", "Don.",
})


def jaka_plec(imie):
    """Infer gender from the honorific in a "SURNAME, Title Given" name.

    The first word after the first comma is taken as the honorific. Names
    whose honorific is in the male-title set yield ``"Mężczyzna"``; all
    others yield ``"Kobieta"``.

    Args:
        imie: Full name string containing a comma after the surname.

    Returns:
        str: ``"Mężczyzna"`` or ``"Kobieta"``.

    Raises:
        ValueError: If ``imie`` contains no comma.
    """
    _, przecinek, reszta = imie.partition(",")
    if not przecinek:
        raise ValueError("name has no comma separating surname and title: %r" % (imie,))
    # split() without arguments tolerates zero or several spaces after the
    # comma, which the fixed "+2" offset did not.
    slowa = reszta.split()
    forma = slowa[0] if slowa else ""
    return "Mężczyzna" if forma in _TYTULY_MESKIE else "Kobieta"
# Add the "Płeć" column by running jaka_plec over every Name value.
data["Płeć"] = data["Name"].map(jaka_plec)
data.head()
Name | Age | Class/Dept | Boat [Body] | Przetrwanie | Klasa | Dziecko/Dorosły | Płeć | |
---|---|---|---|---|---|---|---|---|
0 | ABī-AL-MUNà, Mr Nāsīf Qāsim | 27.0 | 3rd Class Passenger | 15 | 1 | 3 | Dorosły | Mężczyzna |
1 | ABBING, Mr Anthony | 42.0 | 3rd Class Passenger | 0 | 3 | Dorosły | Mężczyzna | |
2 | ABBOTT, Mrs Rhoda Mary 'Rosa' | 39.0 | 3rd Class Passenger | A | 1 | 3 | Dorosły | Kobieta |
3 | ABBOTT, Mr Rossmore Edward | 16.0 | 3rd Class Passenger | [190] | 0 | 3 | Dziecko | Mężczyzna |
4 | ABBOTT, Mr Eugene Joseph | 13.0 | 3rd Class Passenger | 0 | 3 | Dziecko | Mężczyzna |
data.groupby(["Płeć"])["Name"].count()
Płeć Kobieta 533 Mężczyzna 1923 Name: Name, dtype: int64
data.groupby(["Płeć"])["Przetrwanie"].sum()
Płeć Kobieta 341 Mężczyzna 294 Name: Przetrwanie, dtype: int64
def porownanie_przezywalnosci(grupa, df=None):
    """Return the survival rate for each value of a grouping column.

    The rate is the mean of the 0/1 "Przetrwanie" column within each group,
    which equals sum/count but needs only a single groupby pass.

    Args:
        grupa: Name of the column to group by.
        df: DataFrame holding ``grupa`` and "Przetrwanie". Defaults to the
            module-level ``data`` frame.

    Returns:
        pandas.Series: Survival rate indexed by the values of ``grupa``.
    """
    if df is None:
        df = data
    return df.groupby([grupa])["Przetrwanie"].mean()
porownanie_przezywalnosci("Płeć")
Płeć Kobieta 0.639775 Mężczyzna 0.152886 Name: Przetrwanie, dtype: float64
porownanie_przezywalnosci("Klasa")
Klasa 1 0.574286 2 0.378840 3 0.242595 Załoga 0.136775 Name: Przetrwanie, dtype: float64
porownanie_przezywalnosci("Dziecko/Dorosły")
Dziecko/Dorosły Dorosły 0.248343 Dziecko 0.378238 Name: Przetrwanie, dtype: float64
data.head()
Name | Age | Class/Dept | Boat [Body] | Przetrwanie | Klasa | Dziecko/Dorosły | Płeć | |
---|---|---|---|---|---|---|---|---|
0 | ABī-AL-MUNà, Mr Nāsīf Qāsim | 27.0 | 3rd Class Passenger | 15 | 1 | 3 | Dorosły | Mężczyzna |
1 | ABBING, Mr Anthony | 42.0 | 3rd Class Passenger | 0 | 3 | Dorosły | Mężczyzna | |
2 | ABBOTT, Mrs Rhoda Mary 'Rosa' | 39.0 | 3rd Class Passenger | A | 1 | 3 | Dorosły | Kobieta |
3 | ABBOTT, Mr Rossmore Edward | 16.0 | 3rd Class Passenger | [190] | 0 | 3 | Dziecko | Mężczyzna |
4 | ABBOTT, Mr Eugene Joseph | 13.0 | 3rd Class Passenger | 0 | 3 | Dziecko | Mężczyzna |
# Take an explicit copy so that later column assignments on train modify an
# independent frame; assigning into a bare slice of data is what makes pandas
# emit SettingWithCopyWarning.
train = data[["Płeć","Klasa","Age","Dziecko/Dorosły","Przetrwanie"]].copy()
train.head()
Płeć | Klasa | Age | Dziecko/Dorosły | Przetrwanie | |
---|---|---|---|---|---|
0 | Mężczyzna | 3 | 27.0 | Dorosły | 1 |
1 | Mężczyzna | 3 | 42.0 | Dorosły | 0 |
2 | Kobieta | 3 | 39.0 | Dorosły | 1 |
3 | Mężczyzna | 3 | 16.0 | Dziecko | 0 |
4 | Mężczyzna | 3 | 13.0 | Dziecko | 0 |
def toNum(val):
    """Encode a Series as integer category codes.

    The Series is cast to the pandas ``category`` dtype and its
    ``.cat.codes`` are returned.
    """
    return val.astype("category").cat.codes
# Encode the three categorical columns as integer codes (toNum is applied to
# each column separately) and write the encoded columns back into train.
# NOTE(review): when train is a slice of data this assignment emits
# SettingWithCopyWarning; creating train with an explicit .copy() avoids it.
train2 = train[["Klasa","Dziecko/Dorosły","Płeć"]].apply(toNum)
train[["Klasa","Dziecko/Dorosły","Płeć"]] = train2
train.head()
/home/sorsik/.local/lib/python3.5/site-packages/pandas/core/frame.py:2440: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy self[k1] = value[k2]
Płeć | Klasa | Age | Dziecko/Dorosły | Przetrwanie | |
---|---|---|---|---|---|
0 | 1 | 2 | 27.0 | 0 | 1 |
1 | 1 | 2 | 42.0 | 0 | 0 |
2 | 0 | 2 | 39.0 | 0 | 1 |
3 | 1 | 2 | 16.0 | 1 | 0 |
4 | 1 | 2 | 13.0 | 1 | 0 |
train.isnull().sum()
Płeć 0 Klasa 0 Age 40 Dziecko/Dorosły 0 Przetrwanie 0 dtype: int64
len(train)
2456
train = train.dropna()
len(train)
2416
train.isnull().sum()
Płeć 0 Klasa 0 Age 0 Dziecko/Dorosły 0 Przetrwanie 0 dtype: int64
train.head()
Płeć | Klasa | Age | Dziecko/Dorosły | Przetrwanie | |
---|---|---|---|---|---|
0 | 1 | 2 | 27.0 | 0 | 1 |
1 | 1 | 2 | 42.0 | 0 | 0 |
2 | 0 | 2 | 39.0 | 0 | 1 |
3 | 1 | 2 | 16.0 | 1 | 0 |
4 | 1 | 2 | 13.0 | 1 | 0 |
def corrplot(df):
    """Return the pairwise correlation matrix of the numeric columns of ``df``.

    Numeric and boolean columns are selected explicitly before calling
    ``corr()``. Older pandas dropped other columns silently, while
    pandas >= 2.0 raises on them, so the explicit selection keeps the result
    the same across versions.

    Args:
        df: DataFrame to analyse.

    Returns:
        pandas.DataFrame: Correlation matrix of the numeric/bool columns.
    """
    numeryczne = df.select_dtypes(include=["number", "bool"])
    return numeryczne.corr()
corrplot(train)
Płeć | Klasa | Age | Dziecko/Dorosły | Przetrwanie | |
---|---|---|---|---|---|
Płeć | 1.000000 | 0.427418 | 0.033126 | -0.146984 | -0.468146 |
Klasa | 0.427418 | 1.000000 | -0.124860 | -0.080689 | -0.352164 |
Age | 0.033126 | -0.124860 | 1.000000 | -0.506012 | -0.055350 |
Dziecko/Dorosły | -0.146984 | -0.080689 | -0.506012 | 1.000000 | 0.080117 |
Przetrwanie | -0.468146 | -0.352164 | -0.055350 | 0.080117 | 1.000000 |
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# split, and therefore every score computed from it, reproducible between runs.
train,test = train_test_split(train, test_size=0.2, random_state=42)
train.head()
Płeć | Klasa | Age | Dziecko/Dorosły | Przetrwanie | |
---|---|---|---|---|---|
2393 | 0 | 1 | 30.0 | 0 | 1 |
984 | 1 | 3 | 28.0 | 0 | 1 |
115 | 1 | 1 | 18.0 | 0 | 0 |
2047 | 1 | 0 | 51.0 | 0 | 0 |
2150 | 1 | 3 | 20.0 | 0 | 0 |
len(train)+len(test)
2416
from sklearn.tree import DecisionTreeClassifier
drzewko = DecisionTreeClassifier(max_leaf_nodes=10)
drzewko
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None, max_features=None, max_leaf_nodes=10, min_impurity_split=1e-07, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, presort=False, random_state=None, splitter='best')
# Fit the decision tree on the four feature columns, then pair each feature
# name with its importance from the fitted model.
drzewko = drzewko.fit(train[["Klasa","Age","Dziecko/Dorosły","Płeć"]], train["Przetrwanie"])
{cecha: waznosc for cecha, waznosc in zip(["Klasa","Age","Dziecko/Dorosły","Płeć"], drzewko.feature_importances_)}
{'Age': 0.088324038125327783, 'Dziecko/Dorosły': 0.0, 'Klasa': 0.2003748631509486, 'Płeć': 0.71130109872372371}
from sklearn import tree
# Write the fitted tree to titanic.dot in Graphviz DOT format.
# The file is opened as UTF-8 because the feature names contain non-ASCII
# characters, and the return value of export_graphviz is no longer assigned
# to f, which used to overwrite the open file handle inside the with block.
with open("titanic.dot", "w", encoding="utf-8") as f:
    tree.export_graphviz(drzewko, feature_names=["Klasa","Age","Dziecko/Dorosły","Płeć"], out_file=f)
predictions = drzewko.predict(test[["Klasa","Age","Dziecko/Dorosły","Płeć"]])
from sklearn.metrics import accuracy_score
accuracy_score(test["Przetrwanie"], predictions)
0.8223140495867769
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf
def jaka_skutecznosc(rf):
    """Fit ``rf`` on the module-level ``train`` frame and score it on ``test``.

    The estimator is fitted on the four feature columns against
    "Przetrwanie", then used to predict the same columns of ``test``.

    Args:
        rf: Estimator exposing ``fit`` and ``predict``; it is fitted in place.

    Returns:
        The value of ``accuracy_score`` for the predictions on ``test``.
    """
    cechy = ["Klasa","Age","Dziecko/Dorosły","Płeć"]
    model = rf.fit(train[cechy], train["Przetrwanie"])
    przewidziane = model.predict(test[cechy])
    return accuracy_score(test["Przetrwanie"], przewidziane)
jaka_skutecznosc(rf)