#!/usr/bin/env python
# coding: utf-8

# ## Feature Creation: Combine with reference feature
# 
# The CombineWithReferenceFeature() transformer combines a group of variables with a group of reference variables using the mathematical operations ['sub', 'div', 'add', 'mul'], returning one or more additional features as a result.
# 
# For this demonstration, we use the UCI Wine Quality Dataset.
# 
# The data is publicly available on the [UCI repository](https://archive.ics.uci.edu/ml/datasets/Wine+Quality):
# 
# P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis. Modeling wine preferences by data mining from physicochemical properties. In Decision Support Systems, Elsevier, 47(4):547-553, 2009.

# In[1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    roc_curve,
    roc_auc_score,
    classification_report,
    confusion_matrix,
)
from sklearn.pipeline import Pipeline as pipe
from sklearn.preprocessing import StandardScaler

from feature_engine.creation import CombineWithReferenceFeature
from feature_engine.imputation import MeanMedianImputer

pd.set_option('display.max_columns', None)

# In[2]:

# Read data
data = pd.read_csv('winequality-red.csv', sep=';')
data.head()

# **This dataset contains 11 features, all numerical, with no missing values.**

# In[3]:

# Let's transform the target, i.e. wine quality, into a binary classification problem:
bins = [0, 5, 10]
labels = [0, 1]  # 'low'=0, 'high'=1

data['quality_range'] = pd.cut(x=data['quality'], bins=bins, labels=labels)

data[['quality_range', 'quality']].head(5)

# In[4]:

# drop the original target
data.drop('quality', axis=1, inplace=True)

# ### Sub and Div Combinators:
# 
# Let's create two new variables:
# 
# - non_free_sulfur_dioxide = total sulfur dioxide - free sulfur dioxide
# - percentage_free_sulfur = free sulfur dioxide / total sulfur dioxide

# In[5]:

# Create the Combinators

# this transformer subtracts free sulfur from total sulfur
sub_with_reference_feature = CombineWithReferenceFeature(
    variables_to_combine=['total sulfur dioxide'],
    reference_variables=['free sulfur dioxide'],
    operations=['sub'],
    new_variables_names=['non_free_sulfur_dioxide']
)

# this transformer divides free sulfur by total sulfur
div_with_reference_feature = CombineWithReferenceFeature(
    variables_to_combine=['free sulfur dioxide'],
    reference_variables=['total sulfur dioxide'],
    operations=['div'],
    new_variables_names=['percentage_free_sulfur']
)

# Fit the Sub Combinator to the data
sub_with_reference_feature.fit(data)

# perform the subtraction
data_t = sub_with_reference_feature.transform(data)

# perform the division
# We can combine both steps in a single call with the ".fit_transform()" method
data_t = div_with_reference_feature.fit_transform(data_t)

# In[6]:

# Note the additional variables at the end of the dataframe
data_t.head()

# #### Combine with more than 1 operation
# 
# We can also combine the variables with more than one mathematical operation. In addition, the transformer has the option to create the variable names automatically (see the sketch after the cells below).
# 
# Here we will create the following variables:
# 
# - ratio_fixed_to_volatile = fixed acidity / volatile acidity
# - total_acidity = fixed acidity + volatile acidity

# In[7]:

# Create the Combinator
multiple_combinator = CombineWithReferenceFeature(
    variables_to_combine=['fixed acidity'],
    reference_variables=['volatile acidity'],
    operations=['div', 'add'],
    new_variables_names=['ratio_fixed_to_volatile', 'total_acidity']
)

# In[8]:

# Fit the Combinator to the training data
multiple_combinator.fit(data_t)

# In[9]:

# Transform the data
data_t = multiple_combinator.transform(data_t)

# In[10]:

# Note the additional variables at the end of the dataframe
data_t.head()
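# As noted above, if `new_variables_names` is left out, the transformer generates the new column names itself. Below is a minimal sketch of that option; the exact naming pattern (something like 'fixed acidity_div_volatile acidity') may vary across feature_engine versions, so inspect the resulting columns to confirm.

# In[ ]:

auto_combinator = CombineWithReferenceFeature(
    variables_to_combine=['fixed acidity'],
    reference_variables=['volatile acidity'],
    operations=['div', 'add'],
    # new_variables_names is omitted (defaults to None),
    # so the names are generated automatically
)

# inspect the two automatically named columns added at the end
auto_combinator.fit_transform(data).columns[-2:]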
# ### Pipeline Example
# 
# We can put all these transformations into a single pipeline:
# 
# Create the new variables, scale the features, and train a Logistic Regression model to predict the wine quality range.
# 
# See more on how to use Feature-engine within Scikit-learn Pipelines in these [examples](https://github.com/solegalli/feature_engine/tree/master/examples/Pipelines)

# In[11]:

X = data.drop(['quality_range'], axis=1)
y = data.quality_range

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=0, shuffle=True, stratify=y
)

X_train.shape, X_test.shape

# In[12]:

value_pipe = pipe([
    # Create the new features
    ('subtraction', CombineWithReferenceFeature(
        variables_to_combine=['total sulfur dioxide'],
        reference_variables=['free sulfur dioxide'],
        operations=['sub'],
        new_variables_names=['non_free_sulfur_dioxide']
    )),

    ('ratio', CombineWithReferenceFeature(
        variables_to_combine=['free sulfur dioxide'],
        reference_variables=['total sulfur dioxide'],
        operations=['div'],
        new_variables_names=['percentage_free_sulfur']
    )),

    ('acidity', CombineWithReferenceFeature(
        variables_to_combine=['fixed acidity'],
        reference_variables=['volatile acidity'],
        operations=['div', 'add'],
        new_variables_names=['ratio_fixed_to_volatile', 'total_acidity']
    )),

    # scale the features
    ('scaler', StandardScaler()),

    # Logistic Regression
    ('LogisticRegression', LogisticRegression())
])

# In[13]:

value_pipe.fit(X_train, y_train)

# In[14]:

pred_train = value_pipe.predict(X_train)
pred_test = value_pipe.predict(X_test)

# In[15]:

print('Logistic Regression Model train accuracy score: {}'.format(
    accuracy_score(y_train, pred_train)))
print()
print('Logistic Regression Model test accuracy score: {}'.format(
    accuracy_score(y_test, pred_test)))

# In[16]:

print('Logistic Regression Model test classification report: \n\n {}'.format(
    classification_report(y_test, pred_test)))

# In[17]:

score = round(accuracy_score(y_test, pred_test), 3)
cm = confusion_matrix(y_test, pred_test)

sns.heatmap(cm, annot=True, fmt=".0f")
plt.xlabel('Predicted Values')
plt.ylabel('Actual Values')
plt.title('Accuracy Score: {0}'.format(score), size=15)
plt.show()

# In[18]:

# Predict probabilities for the test data
probs = value_pipe.predict_proba(X_test)[:, 1]

# Get the ROC Curve
fpr, tpr, thresholds = roc_curve(y_test, probs)

# Plot the ROC curve
plt.figure(figsize=(8, 5))
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate = 1 - Specificity Score')
plt.ylabel('True Positive Rate = Recall Score')
plt.title('ROC Curve')
plt.show()
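# `roc_auc_score` is imported at the top of the notebook but never used. As a small addition, the cell below computes the area under the ROC curve plotted above, reusing `probs` from the previous cell, giving a single summary number to go with the plot.

# In[ ]:

# area under the ROC curve for the test set probabilities
roc_auc_score(y_test, probs)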