import pandas as pd

penguins = pd.read_csv("../datasets/penguins_classification.csv")
culmen_columns = ["Culmen Length (mm)", "Culmen Depth (mm)"]
target_column = "Species"
data, target = penguins[culmen_columns], penguins[target_column]

import seaborn as sns
from sklearn.tree import DecisionTreeClassifier

palette = ["tab:red", "tab:blue", "black"]

# fit a shallow decision tree on the full dataset
tree = DecisionTreeClassifier(max_depth=2, random_state=0)
tree.fit(data, target)

import numpy as np

# find the samples that the shallow tree misclassifies
target_predicted = tree.predict(data)
misclassified_samples_idx = np.flatnonzero(target != target_predicted)
data_misclassified = data.iloc[misclassified_samples_idx]

import matplotlib.pyplot as plt
from sklearn.inspection import DecisionBoundaryDisplay

DecisionBoundaryDisplay.from_estimator(
    tree, data, response_method="predict", cmap="RdBu", alpha=0.5
)
# plot the original dataset
sns.scatterplot(
    data=penguins,
    x=culmen_columns[0],
    y=culmen_columns[1],
    hue=target_column,
    palette=palette,
)
# plot the misclassified samples
sns.scatterplot(
    data=data_misclassified,
    x=culmen_columns[0],
    y=culmen_columns[1],
    label="Misclassified samples",
    marker="+",
    s=150,
    color="k",
)
plt.legend(bbox_to_anchor=(1.04, 0.5), loc="center left")
_ = plt.title(
    "Decision tree predictions \nwith misclassified samples highlighted"
)

# refit the same tree, but give a non-zero weight only to the previously
# misclassified samples so that the new tree focuses on them
sample_weight = np.zeros_like(target, dtype=int)
sample_weight[misclassified_samples_idx] = 1

tree = DecisionTreeClassifier(max_depth=2, random_state=0)
tree.fit(data, target, sample_weight=sample_weight)

DecisionBoundaryDisplay.from_estimator(
    tree, data, response_method="predict", cmap="RdBu", alpha=0.5
)
sns.scatterplot(
    data=penguins,
    x=culmen_columns[0],
    y=culmen_columns[1],
    hue=target_column,
    palette=palette,
)
sns.scatterplot(
    data=data_misclassified,
    x=culmen_columns[0],
    y=culmen_columns[1],
    label="Previously misclassified samples",
    marker="+",
    s=150,
    color="k",
)
plt.legend(bbox_to_anchor=(1.04, 0.5), loc="center left")
_ = plt.title("Decision tree by changing sample weights")

# check how many of the previously misclassified samples are still
# misclassified by the reweighted tree
target_predicted = tree.predict(data)
newly_misclassified_samples_idx = np.flatnonzero(target != target_predicted)
remaining_misclassified_samples_idx = np.intersect1d(
    misclassified_samples_idx, newly_misclassified_samples_idx
)

print(
    "Number of samples previously misclassified and "
    f"still misclassified: {len(remaining_misclassified_samples_idx)}"
)

# weight each tree by its accuracy on the full dataset
ensemble_weight = [
    (target.shape[0] - len(misclassified_samples_idx)) / target.shape[0],
    (target.shape[0] - len(newly_misclassified_samples_idx)) / target.shape[0],
]
ensemble_weight

from sklearn.ensemble import AdaBoostClassifier

# repeat the reweight-and-refit idea automatically with AdaBoost
estimator = DecisionTreeClassifier(max_depth=3, random_state=0)
adaboost = AdaBoostClassifier(
    estimator=estimator, n_estimators=3, algorithm="SAMME", random_state=0
)
adaboost.fit(data, target)

# inspect the decision boundary learned at each boosting round
for boosting_round, tree in enumerate(adaboost.estimators_):
    plt.figure()
    # we convert `data` into a NumPy array to avoid a warning raised in scikit-learn
    DecisionBoundaryDisplay.from_estimator(
        tree,
        data.to_numpy(),
        response_method="predict",
        cmap="RdBu",
        alpha=0.5,
    )
    sns.scatterplot(
        x=culmen_columns[0],
        y=culmen_columns[1],
        hue=target_column,
        data=penguins,
        palette=palette,
    )
    plt.legend(bbox_to_anchor=(1.04, 0.5), loc="center left")
    _ = plt.title(f"Decision tree trained at round {boosting_round}")

print(f"Weight of each classifier: {adaboost.estimator_weights_}")
print(f"Error of each classifier: {adaboost.estimator_errors_}")
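
# ---------------------------------------------------------------------------
# The estimator weights printed above follow the SAMME update rule: for a
# problem with K classes, a weak learner with weighted error `err` receives
# the weight alpha = log((1 - err) / err) + log(K - 1). The sketch below is
# an illustration rather than scikit-learn's internal code: it recomputes the
# weights from `adaboost.estimator_errors_` (assuming the default
# learning_rate=1.0) and compares them with `adaboost.estimator_weights_`.
# ---------------------------------------------------------------------------
n_classes = len(adaboost.classes_)
recomputed_weights = np.log(
    (1 - adaboost.estimator_errors_) / adaboost.estimator_errors_
) + np.log(n_classes - 1)
print(f"Recomputed weights: {recomputed_weights}")
print(
    "Match scikit-learn's weights:"
    f" {np.allclose(recomputed_weights, adaboost.estimator_weights_)}"
)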
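
# ---------------------------------------------------------------------------
# A minimal sketch of how the final prediction combines the weak learners: a
# weighted majority vote in which each tree adds its entry of
# `estimator_weights_` to the score of the class it predicts. This mirrors
# the SAMME voting scheme but is not the exact internal implementation;
# exact ties between classes could in principle break differently.
# ---------------------------------------------------------------------------
class_scores = np.zeros((data.shape[0], len(adaboost.classes_)))
for weak_tree, alpha in zip(adaboost.estimators_, adaboost.estimator_weights_):
    round_predictions = weak_tree.predict(data.to_numpy())
    for class_idx, class_label in enumerate(adaboost.classes_):
        # add this tree's weight to the class it votes for, sample by sample
        class_scores[round_predictions == class_label, class_idx] += alpha
manual_predictions = adaboost.classes_[np.argmax(class_scores, axis=1)]
print(
    "Manual weighted vote agrees with `adaboost.predict`:"
    f" {np.all(manual_predictions == adaboost.predict(data.to_numpy()))}"
)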
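
# ---------------------------------------------------------------------------
# Optionally, `staged_predict` exposes the ensemble's prediction after each
# boosting round, which shows how the combined model improves as weak
# learners are added. This is a short illustrative check on the training set,
# reusing the `adaboost`, `data`, and `target` objects defined above.
# ---------------------------------------------------------------------------
for boosting_round, staged_predictions in enumerate(
    adaboost.staged_predict(data.to_numpy())
):
    error_rate = np.mean(staged_predictions != target)
    print(f"Training error after round {boosting_round}: {error_rate:.3f}")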