# Solution script: compare logistic-regression models on the adult-census
# dataset using (1) numerical features only, (2) all features, and
# (3) all features plus pairwise interactions, via 10-fold cross-validation.

import pandas as pd

adult_census = pd.read_csv("../datasets/adult-census.csv")
target = adult_census["class"]
# Keep only the numerical columns for the first model.
data = adult_census.select_dtypes(["integer", "floating"])
# "education-num" is redundant with the categorical "education" column.
data = data.drop(columns=["education-num"])
data

# solution
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate

# Baseline: scaled numerical features -> logistic regression.
model = make_pipeline(StandardScaler(), LogisticRegression())
cv_results_lr = cross_validate(
    model, data, target, cv=10, return_estimator=True
)
test_score_lr = cv_results_lr["test_score"]
test_score_lr

# solution
import matplotlib.pyplot as plt

# Collect the fitted coefficients of each CV fold's estimator
# (pipeline[-1] is the LogisticRegression step).
coefs = [pipeline[-1].coef_[0] for pipeline in cv_results_lr["estimator"]]
coefs = pd.DataFrame(coefs, columns=data.columns)

color = {"whiskers": "black", "medians": "black", "caps": "black"}
_, ax = plt.subplots()
# Box plot of absolute coefficient magnitudes across CV folds.
_ = coefs.abs().plot.box(color=color, vert=False, ax=ax)

# Reload the dataset, this time keeping the categorical columns as well.
adult_census = pd.read_csv("../datasets/adult-census.csv")
target = adult_census["class"]
data = adult_census.drop(columns=["class", "education-num"])

# solution
from sklearn.compose import make_column_selector as selector
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder

# Split columns by dtype: object columns are treated as categorical.
categorical_columns = selector(dtype_include=object)(data)
numerical_columns = selector(dtype_exclude=object)(data)

# One-hot encode categoricals (grouping categories rarer than 1% into an
# infrequent bucket) and standard-scale numericals.
preprocessor = make_column_transformer(
    (
        OneHotEncoder(handle_unknown="ignore", min_frequency=0.01),
        categorical_columns,
    ),
    (StandardScaler(), numerical_columns),
)
# More features -> allow more iterations for convergence.
model = make_pipeline(preprocessor, LogisticRegression(max_iter=5_000))
cv_results_complex_lr = cross_validate(
    model, data, target, cv=10, return_estimator=True, n_jobs=2
)
test_score_complex_lr = cv_results_complex_lr["test_score"]
test_score_complex_lr

# solution
import numpy as np
import matplotlib.pyplot as plt

# Compare per-fold test accuracy of the two models.
indices = np.arange(len(test_score_lr))
plt.scatter(
    indices, test_score_lr, color="tab:blue", label="numerical features only"
)
plt.scatter(
    indices,
    test_score_complex_lr,
    color="tab:red",
    label="all features",
)
plt.ylim((0, 1))
plt.xlabel("Cross-validation iteration")
plt.ylabel("Accuracy")
_ = plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left")

print(
    "A model using all features is better than a"
    " model using only numerical features for"
    f" {sum(test_score_complex_lr > test_score_lr)} CV iterations out of 10."
)

# solution
# Fit the preprocessor alone to recover the expanded feature names
# (one-hot columns first, then the numerical columns, matching the
# ColumnTransformer output order).
preprocessor.fit(data)
feature_names = (
    preprocessor.named_transformers_["onehotencoder"].get_feature_names_out(
        categorical_columns
    )
).tolist()
feature_names += numerical_columns
feature_names

# solution
# Coefficients of the full-feature model across CV folds.
coefs = [
    pipeline[-1].coef_[0] for pipeline in cv_results_complex_lr["estimator"]
]
coefs = pd.DataFrame(coefs, columns=feature_names)

_, ax = plt.subplots(figsize=(10, 35))
_ = coefs.abs().plot.box(color=color, vert=False, ax=ax)

# solution
from sklearn.preprocessing import PolynomialFeatures

# Add pairwise interaction terms between the preprocessed features;
# a stronger regularization (C=0.01) compensates for the larger feature set.
model_with_interaction = make_pipeline(
    preprocessor,
    PolynomialFeatures(degree=2, include_bias=False, interaction_only=True),
    LogisticRegression(C=0.01, max_iter=5_000),
)
model_with_interaction

# solution
cv_results_interactions = cross_validate(
    model_with_interaction,
    data,
    target,
    cv=10,
    return_estimator=True,
    n_jobs=2,
)
test_score_interactions = cv_results_interactions["test_score"]
test_score_interactions

# solution
# Compare all three models fold by fold.
plt.scatter(
    indices, test_score_lr, color="tab:blue", label="numerical features only"
)
plt.scatter(
    indices,
    test_score_complex_lr,
    color="tab:red",
    label="all features",
)
plt.scatter(
    indices,
    test_score_interactions,
    color="black",
    label="all features and interactions",
)
plt.xlabel("Cross-validation iteration")
plt.ylabel("Accuracy")
_ = plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left")

print(
    "A model using all features and interactions is better than a model"
    " without interactions for"
    f" {sum(test_score_interactions > test_score_complex_lr)} CV iterations"
    " out of 10."
)