"""Categorical-feature encoding walkthrough on the adult census dataset.

Loads the data, selects the categorical columns, demonstrates ordinal and
one-hot encoding, then cross-validates a one-hot + logistic-regression
pipeline on the categorical features only.
"""

import pandas as pd

from sklearn.compose import make_column_selector as selector
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

adult_census = pd.read_csv("../datasets/adult-census.csv")
# drop the duplicated column `"education-num"` as stated in the first notebook
adult_census = adult_census.drop(columns="education-num")

target_name = "class"
target = adult_census[target_name]
data = adult_census.drop(columns=[target_name])

# Quick look at category frequencies and the dtypes of each column.
data["native-country"].value_counts().sort_index()
data.dtypes

# Select the categorical columns (object dtype) by name.
categorical_columns_selector = selector(dtype_include=object)
categorical_columns = categorical_columns_selector(data)
categorical_columns

data_categorical = data[categorical_columns]
data_categorical.head()
print(f"The dataset is composed of {data_categorical.shape[1]} features")

# Ordinal encoding: each category is mapped to an arbitrary integer code.
education_column = data_categorical[["education"]]
encoder = OrdinalEncoder().set_output(transform="pandas")
education_encoded = encoder.fit_transform(education_column)
education_encoded
encoder.categories_

data_encoded = encoder.fit_transform(data_categorical)
data_encoded[:5]
print(f"The dataset encoded contains {data_encoded.shape[1]} features")

# One-hot encoding: one binary indicator column per category.
encoder = OneHotEncoder(sparse_output=False).set_output(transform="pandas")
education_encoded = encoder.fit_transform(education_column)
education_encoded

print(f"The dataset is composed of {data_categorical.shape[1]} features")
data_categorical.head()

data_encoded = encoder.fit_transform(data_categorical)
data_encoded[:5]
print(f"The encoded dataset contains {data_encoded.shape[1]} features")

# Rare categories (e.g. some native countries) may not appear in every
# cross-validation fold, hence handle_unknown="ignore" below.
data["native-country"].value_counts()

model = make_pipeline(
    OneHotEncoder(handle_unknown="ignore"),
    LogisticRegression(max_iter=500),
)

cv_results = cross_validate(model, data_categorical, target)
cv_results
scores = cv_results["test_score"]
# FIX: the original f-string was split across a literal newline, which is a
# syntax error; it is rejoined into a single valid f-string here.
print(f"The accuracy is: {scores.mean():.3f} ± {scores.std():.3f}")