# Solution script: compare logistic-regression models on the adult-census
# dataset using (1) numerical features only, (2) all features, and
# (3) all features plus pairwise interactions, via 10-fold cross-validation.

import pandas as pd

adult_census = pd.read_csv("../datasets/adult-census.csv")
target = adult_census["class"]
# Keep only the numerical columns for the first model.
data = adult_census.select_dtypes(["integer", "floating"])
# "education-num" is redundant with the categorical "education" column.
data = data.drop(columns=["education-num"])
data

# solution
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate

# Baseline: scaled numerical features -> logistic regression.
model = make_pipeline(StandardScaler(), LogisticRegression())
cv_results_lr = cross_validate(
    model, data, target, cv=10, return_estimator=True
)
test_score_lr = cv_results_lr["test_score"]
test_score_lr

# solution
import matplotlib.pyplot as plt

# Collect the fitted coefficients of each CV fold's estimator
# (pipeline[-1] is the LogisticRegression step).
coefs = [pipeline[-1].coef_[0] for pipeline in cv_results_lr["estimator"]]
coefs = pd.DataFrame(coefs, columns=data.columns)

color = {"whiskers": "black", "medians": "black", "caps": "black"}
_, ax = plt.subplots()
# Box plot of absolute coefficient magnitudes across CV folds.
_ = coefs.abs().plot.box(color=color, vert=False, ax=ax)

# Reload the dataset, this time keeping the categorical columns as well.
adult_census = pd.read_csv("../datasets/adult-census.csv")
target = adult_census["class"]
data = adult_census.drop(columns=["class", "education-num"])

# solution
from sklearn.compose import make_column_selector as selector
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder

# Split columns by dtype: object columns are treated as categorical.
categorical_columns = selector(dtype_include=object)(data)
numerical_columns = selector(dtype_exclude=object)(data)

# One-hot encode categoricals (grouping categories rarer than 1% into an
# infrequent bucket) and standard-scale numericals.
preprocessor = make_column_transformer(
    (
        OneHotEncoder(handle_unknown="ignore", min_frequency=0.01),
        categorical_columns,
    ),
    (StandardScaler(), numerical_columns),
)
# More features -> allow more iterations for convergence.
model = make_pipeline(preprocessor, LogisticRegression(max_iter=5_000))
cv_results_complex_lr = cross_validate(
    model, data, target, cv=10, return_estimator=True, n_jobs=2
)
test_score_complex_lr = cv_results_complex_lr["test_score"]
test_score_complex_lr

# solution
import numpy as np
import matplotlib.pyplot as plt

# Compare per-fold test accuracy of the two models.
indices = np.arange(len(test_score_lr))
plt.scatter(
    indices, test_score_lr, color="tab:blue", label="numerical features only"
)
plt.scatter(
    indices,
    test_score_complex_lr,
    color="tab:red",
    label="all features",
)
plt.ylim((0, 1))
plt.xlabel("Cross-validation iteration")
plt.ylabel("Accuracy")
_ = plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left")

print(
    "A model using all features is better than a"
    " model using only numerical features for"
    f" {sum(test_score_complex_lr > test_score_lr)} CV iterations out of 10."
)

# solution
# Fit the preprocessor alone to recover the expanded feature names
# (one-hot columns first, then the numerical columns, matching the
# ColumnTransformer output order).
preprocessor.fit(data)
feature_names = (
    preprocessor.named_transformers_["onehotencoder"].get_feature_names_out(
        categorical_columns
    )
).tolist()
feature_names += numerical_columns
feature_names

# solution
# Coefficients of the full-feature model across CV folds.
coefs = [
    pipeline[-1].coef_[0] for pipeline in cv_results_complex_lr["estimator"]
]
coefs = pd.DataFrame(coefs, columns=feature_names)

_, ax = plt.subplots(figsize=(10, 35))
_ = coefs.abs().plot.box(color=color, vert=False, ax=ax)

# solution
from sklearn.preprocessing import PolynomialFeatures

# Add pairwise interaction terms between the preprocessed features;
# a stronger regularization (C=0.01) compensates for the larger feature set.
model_with_interaction = make_pipeline(
    preprocessor,
    PolynomialFeatures(degree=2, include_bias=False, interaction_only=True),
    LogisticRegression(C=0.01, max_iter=5_000),
)
model_with_interaction

# solution
cv_results_interactions = cross_validate(
    model_with_interaction,
    data,
    target,
    cv=10,
    return_estimator=True,
    n_jobs=2,
)
test_score_interactions = cv_results_interactions["test_score"]
test_score_interactions

# solution
# Compare all three models fold by fold.
plt.scatter(
    indices, test_score_lr, color="tab:blue", label="numerical features only"
)
plt.scatter(
    indices,
    test_score_complex_lr,
    color="tab:red",
    label="all features",
)
plt.scatter(
    indices,
    test_score_interactions,
    color="black",
    label="all features and interactions",
)
plt.xlabel("Cross-validation iteration")
plt.ylabel("Accuracy")
_ = plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left")

print(
    "A model using all features and interactions is better than a model"
    " without interactions for"
    f" {sum(test_score_interactions > test_score_complex_lr)} CV iterations"
    " out of 10."
)