Reference: scikit-learn documentation for `fetch_openml` — https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_openml.html
from sklearn.datasets import fetch_openml
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import statsmodels.api as sm
# Download version 1 of the OpenML "titanic" dataset. `return_X_y=True`
# yields a (features, target) pair and `as_frame=True` asks for pandas
# objects instead of NumPy arrays.
X, y = fetch_openml(
    name="titanic",
    version=1,
    return_X_y=True,
    as_frame=True,
)
type(X)
pandas.core.frame.DataFrame
# Summarise the feature frame: column names, non-null counts and dtypes.
X.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1309 entries, 0 to 1308 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 pclass 1309 non-null float64 1 name 1309 non-null object 2 sex 1309 non-null category 3 age 1046 non-null float64 4 sibsp 1309 non-null float64 5 parch 1309 non-null float64 6 ticket 1309 non-null object 7 fare 1308 non-null float64 8 cabin 295 non-null object 9 embarked 1307 non-null category 10 boat 486 non-null object 11 body 121 non-null float64 12 home.dest 745 non-null object dtypes: category(2), float64(6), object(5) memory usage: 115.4+ KB
# Inspect the target's dtype (a categorical with string levels '0'/'1',
# per the echoed output).
y.dtype
CategoricalDtype(categories=['0', '1'], ordered=False)
# Metadata dictionary attached to the Series; the echoed output shows it
# is empty for this dataset.
y.attrs
{}
# Underlying values of the target, without the index.
y.values
['1', '1', '0', '0', '0', ..., '0', '0', '0', '0', '0'] Length: 1309 Categories (2, object): ['0', '1']
# Echo the target Series itself (index, values, name and dtype).
y
0 1 1 1 2 0 3 0 4 0 .. 1304 0 1305 0 1306 0 1307 0 1308 0 Name: survived, Length: 1309, dtype: category Categories (2, object): ['0', '1']
# Row labels of the target Series.
y.index
RangeIndex(start=0, stop=1309, step=1)
# Check whether the target Series has no elements.
y.empty
False
# List the feature column labels.
X.columns
Index(['pclass', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket', 'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest'], dtype='object')
# Name of the target Series.
y.name
'survived'
# pandas flags set on the target Series.
y.flags
<Flags(allows_duplicate_labels=True)>
# NOTE(review): `astype` is referenced here but never called, so this only
# echoes the bound method and converts nothing. To cast the target, call it
# with a dtype, e.g. `y.astype(int)`.
y.astype
<bound method NDFrame.astype of 0 1 1 1 2 0 3 0 4 0 .. 1304 0 1305 0 1306 0 1307 0 1308 0 Name: survived, Length: 1309, dtype: category Categories (2, object): ['0', '1']>
# NOTE(review): `rank` is referenced but not called; this only echoes the
# bound method. Use `y.rank()` to actually compute ranks.
y.rank
<bound method NDFrame.rank of 0 1 1 1 2 0 3 0 4 0 .. 1304 0 1305 0 1306 0 1307 0 1308 0 Name: survived, Length: 1309, dtype: category Categories (2, object): ['0', '1']>
# NOTE(review): a pandas Series has no `type` attribute, so this raises
# AttributeError (see the recorded traceback). Use `type(y)` for the Python
# class or `y.dtype` for the element dtype instead.
y.type
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-53-2727d4ac428e> in <module>() ----> 1 y.type /usr/local/lib/python3.7/dist-packages/pandas/core/generic.py in __getattr__(self, name) 5485 ): 5486 return self[name] -> 5487 return object.__getattribute__(self, name) 5488 5489 def __setattr__(self, name: str, value) -> None: AttributeError: 'Series' object has no attribute 'type'
# NOTE(review): bare method reference again -- nothing is converted here.
# Call `y.astype(<dtype>)` to perform a cast.
y.astype
<bound method NDFrame.astype of 0 1 1 1 2 0 3 0 4 0 .. 1304 0 1305 0 1306 0 1307 0 1308 0 Name: survived, Length: 1309, dtype: category Categories (2, object): ['0', '1']>
# Confirm the Python class of the target object.
type(y)
pandas.core.series.Series
# Attach the target to the features by aligning both objects on their row
# index, giving a single frame that holds the predictors and `survived`.
df = X.merge(y, left_index=True, right_index=True)
print(df)
pclass name sex \ 0 1.0 Allen, Miss. Elisabeth Walton female 1 1.0 Allison, Master. Hudson Trevor male 2 1.0 Allison, Miss. Helen Loraine female 3 1.0 Allison, Mr. Hudson Joshua Creighton male 4 1.0 Allison, Mrs. Hudson J C (Bessie Waldo Daniels) female ... ... ... ... 1304 3.0 Zabour, Miss. Hileni female 1305 3.0 Zabour, Miss. Thamine female 1306 3.0 Zakarian, Mr. Mapriededer male 1307 3.0 Zakarian, Mr. Ortin male 1308 3.0 Zimmerman, Mr. Leo male age sibsp parch ticket fare cabin embarked boat body \ 0 29.0000 0.0 0.0 24160 211.3375 B5 S 2 NaN 1 0.9167 1.0 2.0 113781 151.5500 C22 C26 S 11 NaN 2 2.0000 1.0 2.0 113781 151.5500 C22 C26 S None NaN 3 30.0000 1.0 2.0 113781 151.5500 C22 C26 S None 135.0 4 25.0000 1.0 2.0 113781 151.5500 C22 C26 S None NaN ... ... ... ... ... ... ... ... ... ... 1304 14.5000 1.0 0.0 2665 14.4542 None C None 328.0 1305 NaN 1.0 0.0 2665 14.4542 None C None NaN 1306 26.5000 0.0 0.0 2656 7.2250 None C None 304.0 1307 27.0000 0.0 0.0 2670 7.2250 None C None NaN 1308 29.0000 0.0 0.0 315082 7.8750 None S None NaN home.dest survived 0 St Louis, MO 1 1 Montreal, PQ / Chesterville, ON 1 2 Montreal, PQ / Chesterville, ON 0 3 Montreal, PQ / Chesterville, ON 0 4 Montreal, PQ / Chesterville, ON 0 ... ... ... 1304 None 0 1305 None 0 1306 None 0 1307 None 0 1308 None 0 [1309 rows x 14 columns]
# Summarise the combined frame: column names, non-null counts and dtypes.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1309 entries, 0 to 1308 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 pclass 1309 non-null float64 1 name 1309 non-null object 2 sex 1309 non-null category 3 age 1046 non-null float64 4 sibsp 1309 non-null float64 5 parch 1309 non-null float64 6 ticket 1309 non-null object 7 fare 1308 non-null float64 8 cabin 295 non-null object 9 embarked 1307 non-null category 10 boat 486 non-null object 11 body 121 non-null float64 12 home.dest 745 non-null object 13 survived 1309 non-null category dtypes: category(3), float64(6), object(5) memory usage: 116.8+ KB
# Row labels of the combined frame.
df.index
RangeIndex(start=0, stop=1309, step=1)
Next, we specify the model with an R-style formula and fit it as a generalized linear model (GLM) through the statsmodels formula API.
# Logistic regression (binomial GLM, logit link) of survival on age and
# passenger class.
#
# `survived` is a categorical column with string levels '0' and '1'. Passed
# to the formula unchanged, it is dummy-encoded into TWO response columns
# (`survived[0]`, `survived[1]`), and the Binomial family reads a two-column
# response as (successes, failures). The first level, '0' (did not survive),
# then becomes the modelled event, so every coefficient has the opposite
# sign to the intended "probability of survival" model. Casting the
# response to 0/1 integers makes 1 (survived) the modelled event.
#
# The cast is done on a copy so `df` keeps its original categorical column.
# Rows with a missing `age` are dropped by the formula machinery.
#
# NOTE: a summary recorded before this fix lists the dependent variable as
# ['survived[0]', 'survived[1]']; after the fix it is `survived` and the
# coefficient signs are reversed.
glm_data = df.assign(survived=df["survived"].astype(int))
formula = 'survived ~ age + pclass'
model = smf.glm(formula=formula, data=glm_data, family=sm.families.Binomial())
result = model.fit()
print(result.summary())
Generalized Linear Model Regression Results ========================================================================================== Dep. Variable: ['survived[0]', 'survived[1]'] No. Observations: 1046 Model: GLM Df Residuals: 1043 Model Family: Binomial Df Model: 2 Link Function: logit Scale: 1.0000 Method: IRLS Log-Likelihood: -628.09 Date: Sun, 29 May 2022 Deviance: 1256.2 Time: 02:20:23 Pearson chi2: 1.05e+03 No. Iterations: 4 Covariance Type: nonrobust ============================================================================== coef std err z P>|z| [0.025 0.975] ------------------------------------------------------------------------------ Intercept -3.1143 0.327 -9.520 0.000 -3.755 -2.473 age 0.0370 0.005 6.770 0.000 0.026 0.048 pclass 1.1015 0.096 11.530 0.000 0.914 1.289 ==============================================================================