# Benchmark three preprocessing strategies in front of a
# HistGradientBoostingClassifier on the adult census dataset:
#   1. ordinal-encode the categorical columns, pass the others through;
#   2. same as 1, but additionally standard-scale the numerical columns;
#   3. one-hot-encode the categorical columns, pass the others through.
# For each strategy the cross-validation test scores and the elapsed time of
# the whole cross-validation run are printed.

import time

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler


def _cross_validate_and_report(model, data, target):
    """Cross-validate ``model``, print a score/timing summary, return results.

    Parameters
    ----------
    model : estimator
        Object passed as the estimator to ``cross_validate``.
    data : pandas.DataFrame
        Feature columns passed to ``cross_validate``.
    target : pandas.Series
        Target column passed to ``cross_validate``.

    Returns
    -------
    cv_results : dict
        The dictionary returned by ``cross_validate``.
    elapsed_time : float
        Seconds elapsed around the whole ``cross_validate`` call.
    """
    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments the way time.time() differences can.
    start = time.perf_counter()
    cv_results = cross_validate(model, data, target)
    elapsed_time = time.perf_counter() - start

    scores = cv_results["test_score"]
    print(
        "The mean cross-validation accuracy is: "
        f"{scores.mean():.3f} ± {scores.std():.3f} "
        f"with a fitting time of {elapsed_time:.3f}"
    )
    return cv_results, elapsed_time


adult_census = pd.read_csv("../datasets/adult-census.csv")

target_name = "class"
target = adult_census[target_name]
# "education-num" is dropped together with the target column.
data = adult_census.drop(columns=[target_name, "education-num"])

# Split the columns by dtype: object columns are treated as categorical,
# every other column as numerical.
numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)
numerical_columns = numerical_columns_selector(data)
categorical_columns = categorical_columns_selector(data)

# --- Reference pipeline: ordinal encoding of the categorical columns -------
# Categories not seen during fit are encoded as -1 instead of raising.
categorical_preprocessor = OrdinalEncoder(
    handle_unknown="use_encoded_value", unknown_value=-1
)
# Columns not listed (the numerical ones) are passed through unchanged.
preprocessor = ColumnTransformer(
    [("categorical", categorical_preprocessor, categorical_columns)],
    remainder="passthrough",
)
model = make_pipeline(preprocessor, HistGradientBoostingClassifier())

cv_results, elapsed_time = _cross_validate_and_report(model, data, target)
scores = cv_results["test_score"]

# solution
# --- Standard-scaled numerical columns + ordinal-encoded categoricals ------
preprocessor = ColumnTransformer(
    [
        ("numerical", StandardScaler(), numerical_columns),
        (
            "categorical",
            OrdinalEncoder(
                handle_unknown="use_encoded_value", unknown_value=-1
            ),
            categorical_columns,
        ),
    ]
)
model = make_pipeline(preprocessor, HistGradientBoostingClassifier())

cv_results, elapsed_time = _cross_validate_and_report(model, data, target)
scores = cv_results["test_score"]

# solution
# --- One-hot-encoded categorical columns -----------------------------------
# Unknown categories are ignored at transform time; dense output is requested.
categorical_preprocessor = OneHotEncoder(
    handle_unknown="ignore", sparse_output=False
)
preprocessor = ColumnTransformer(
    [("one-hot-encoder", categorical_preprocessor, categorical_columns)],
    remainder="passthrough",
)
model = make_pipeline(preprocessor, HistGradientBoostingClassifier())

cv_results, elapsed_time = _cross_validate_and_report(model, data, target)
scores = cv_results["test_score"]