#!/usr/bin/env python
# coding: utf-8

# ## Feature Creation: Combine with reference feature
# 
# The CombineWithReferenceFeature() transformer combines a group of variables with a group of reference variables using the mathematical operations ['sub', 'div', 'add', 'mul'], returning one or more additional features as a result.
# 
# For this demonstration, we use the UCI Wine Quality Dataset.
# 
# The data is publicly available on the [UCI repository](https://archive.ics.uci.edu/ml/datasets/Wine+Quality):
# 
# P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis. Modeling wine preferences by data mining from physicochemical properties. In Decision Support Systems, Elsevier, 47(4):547-553, 2009.

# In[1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    roc_curve,
    roc_auc_score,
    classification_report,
    confusion_matrix,
)
from sklearn.pipeline import Pipeline as pipe
from sklearn.preprocessing import StandardScaler

from feature_engine.creation import CombineWithReferenceFeature
from feature_engine.imputation import MeanMedianImputer

pd.set_option('display.max_columns', None)

# In[2]:

# Read data
data = pd.read_csv('winequality-red.csv', sep=';')
data.head()

# **This dataset contains 11 features, all numerical, with no missing values.**

# In[3]:

# Let's transform the target, i.e. wine quality, into a binary classification problem:
bins = [0, 5, 10]
labels = [0, 1]  # 'low'=0, 'high'=1

data['quality_range'] = pd.cut(x=data['quality'], bins=bins, labels=labels)

data[['quality_range', 'quality']].head(5)

# In[4]:

# drop the original target
data.drop('quality', axis=1, inplace=True)

# ### Sub and Div Combinators:
# 
# Let's create two new variables:
# 
# - non_free_sulfur_dioxide = total sulfur dioxide - free sulfur dioxide
# - percentage_free_sulfur = free sulfur dioxide / total sulfur dioxide

# In[5]:

# Create the Combinators

# this transformer subtracts free sulfur from total sulfur
sub_with_reference_feature = CombineWithReferenceFeature(
    variables_to_combine=['total sulfur dioxide'],
    reference_variables=['free sulfur dioxide'],
    operations=['sub'],
    new_variables_names=['non_free_sulfur_dioxide']
)

# this transformer divides free sulfur by total sulfur
div_with_reference_feature = CombineWithReferenceFeature(
    variables_to_combine=['free sulfur dioxide'],
    reference_variables=['total sulfur dioxide'],
    operations=['div'],
    new_variables_names=['percentage_free_sulfur']
)

# Fit the Sub Combinator to the data
sub_with_reference_feature.fit(data)

# perform the subtraction
data_t = sub_with_reference_feature.transform(data)

# perform the division
# We can combine both steps in a single call with the ".fit_transform()" method
data_t = div_with_reference_feature.fit_transform(data_t)

# In[6]:

# Note the additional variables at the end of the dataframe
data_t.head()

# #### Combine with more than 1 operation
# 
# We can also combine the variables with more than one mathematical operation. In addition, the transformer has the option to create the variable names automatically (see the sketch after the cells below).
# 
# Here we will create the following variables:
# 
# - ratio_fixed_to_volatile = fixed acidity / volatile acidity
# - total_acidity = fixed acidity + volatile acidity

# In[7]:

# Create the Combinator
multiple_combinator = CombineWithReferenceFeature(
    variables_to_combine=['fixed acidity'],
    reference_variables=['volatile acidity'],
    operations=['div', 'add'],
    new_variables_names=['ratio_fixed_to_volatile', 'total_acidity']
)

# In[8]:

# Fit the Combinator to the training data
multiple_combinator.fit(data_t)

# In[9]:

# Transform the data
data_t = multiple_combinator.transform(data_t)

# In[10]:

# Note the additional variables at the end of the dataframe
data_t.head()
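# As noted above, if `new_variables_names` is left out, the transformer generates the new column names itself. Below is a minimal sketch of that option; the exact naming pattern (something like 'fixed acidity_div_volatile acidity') may vary across feature_engine versions, so inspect the resulting columns to confirm.

# In[ ]:

auto_combinator = CombineWithReferenceFeature(
    variables_to_combine=['fixed acidity'],
    reference_variables=['volatile acidity'],
    operations=['div', 'add'],
    # new_variables_names is omitted (defaults to None),
    # so the names are generated automatically
)

# inspect the two automatically named columns added at the end
auto_combinator.fit_transform(data).columns[-2:]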
# ### Pipeline Example
# 
# We can put all these transformations into a single pipeline:
# 
# Create the new variables, scale the features, and train a Logistic Regression model to predict the wine quality range.
# 
# See more on how to use Feature-engine within Scikit-learn Pipelines in these [examples](https://github.com/solegalli/feature_engine/tree/master/examples/Pipelines)

# In[11]:

X = data.drop(['quality_range'], axis=1)
y = data.quality_range

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=0, shuffle=True, stratify=y
)

X_train.shape, X_test.shape

# In[12]:

value_pipe = pipe([
    # Create the new features
    ('subtraction', CombineWithReferenceFeature(
        variables_to_combine=['total sulfur dioxide'],
        reference_variables=['free sulfur dioxide'],
        operations=['sub'],
        new_variables_names=['non_free_sulfur_dioxide']
    )),

    ('ratio', CombineWithReferenceFeature(
        variables_to_combine=['free sulfur dioxide'],
        reference_variables=['total sulfur dioxide'],
        operations=['div'],
        new_variables_names=['percentage_free_sulfur']
    )),

    ('acidity', CombineWithReferenceFeature(
        variables_to_combine=['fixed acidity'],
        reference_variables=['volatile acidity'],
        operations=['div', 'add'],
        new_variables_names=['ratio_fixed_to_volatile', 'total_acidity']
    )),

    # scale the features
    ('scaler', StandardScaler()),

    # Logistic Regression
    ('LogisticRegression', LogisticRegression())
])

# In[13]:

value_pipe.fit(X_train, y_train)

# In[14]:

pred_train = value_pipe.predict(X_train)
pred_test = value_pipe.predict(X_test)

# In[15]:

print('Logistic Regression Model train accuracy score: {}'.format(
    accuracy_score(y_train, pred_train)))
print()
print('Logistic Regression Model test accuracy score: {}'.format(
    accuracy_score(y_test, pred_test)))

# In[16]:

print('Logistic Regression Model test classification report: \n\n {}'.format(
    classification_report(y_test, pred_test)))

# In[17]:

score = round(accuracy_score(y_test, pred_test), 3)
cm = confusion_matrix(y_test, pred_test)

sns.heatmap(cm, annot=True, fmt=".0f")
plt.xlabel('Predicted Values')
plt.ylabel('Actual Values')
plt.title('Accuracy Score: {0}'.format(score), size=15)
plt.show()

# In[18]:

# Predict probabilities for the test data
probs = value_pipe.predict_proba(X_test)[:, 1]

# Get the ROC Curve
fpr, tpr, thresholds = roc_curve(y_test, probs)

# Plot the ROC curve
plt.figure(figsize=(8, 5))
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate = 1 - Specificity Score')
plt.ylabel('True Positive Rate = Recall Score')
plt.title('ROC Curve')
plt.show()
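# `roc_auc_score` is imported at the top of the notebook but never used. As a small addition, the cell below computes the area under the ROC curve plotted above, reusing `probs` from the previous cell, giving a single summary number to go with the plot.

# In[ ]:

# area under the ROC curve for the test set probabilities
roc_auc_score(y_test, probs)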