#!/usr/bin/env python
# coding: utf-8

# ## Regression
#
# In this lecture, I show how easy and practical it is to engineer features in an entire dataset utilising Feature-engine and the scikit-learn pipeline.
#
# **We use the Ames House Prices dataset produced by Professor Dean De Cock:**
#
# Dean De Cock (2011) Ames, Iowa: Alternative to the Boston Housing Data as an End of Semester Regression Project, Journal of Statistics Education, Vol. 19, No. 3
#
# http://jse.amstat.org/v19n3/decock.pdf
#
# https://www.tandfonline.com/doi/abs/10.1080/10691898.2011.11889627
#
# The version of the dataset used in this notebook can be obtained from [Kaggle](https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data)

# In[1]:


import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# for the model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

# for feature engineering
from sklearn.preprocessing import StandardScaler
from feature_engine import imputation as mdi
from feature_engine import discretisation as dsc
from feature_engine import encoding as ce


# In[2]:


# load training data
data = pd.read_csv('train.csv')


# In[3]:


# make lists of variable types
categorical = [var for var in data.columns if data[var].dtype == 'O']

year_vars = [var for var in data.columns if 'Yr' in var or 'Year' in var]

discrete = [
    var for var in data.columns
    if data[var].dtype != 'O'
    and len(data[var].unique()) < 20
    and var not in year_vars
]

numerical = [
    var for var in data.columns
    if data[var].dtype != 'O'
    and var not in discrete
    and var not in ['Id', 'SalePrice']
    and var not in year_vars
]


# In[4]:


# some plots to get familiar with the variable distributions
sns.pairplot(data=data,
             y_vars=['SalePrice'],
             x_vars=['LotFrontage', 'LotArea', 'MasVnrArea',
                     'BsmtFinSF1', 'BsmtFinSF2'])


# In[5]:


sns.pairplot(data=data,
             y_vars=['SalePrice'],
             x_vars=['BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF',
                     '2ndFlrSF', 'LowQualFinSF'])


# In[6]:


sns.pairplot(data=data,
             y_vars=['SalePrice'],
             x_vars=['GrLivArea', 'GarageArea', 'WoodDeckSF',
                     'OpenPorchSF', 'EnclosedPorch'])


# In[7]:


sns.pairplot(data=data,
             y_vars=['SalePrice'],
             x_vars=['3SsnPorch', 'ScreenPorch', 'MiscVal'])


# In[8]:


# we are going to treat the discrete variables as categorical;
# to allow Feature-engine to pick them up automatically,
# we need to re-cast them as object
data[discrete] = data[discrete].astype('O')


# In[9]:


# split training data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(['Id', 'SalePrice'], axis=1),
    data['SalePrice'],
    test_size=0.1,
    random_state=0)

X_train.shape, X_test.shape


# In[10]:


# transform year variables:
# calculate elapsed time

def elapsed_years(df, var):
    # capture the difference between the year variable and
    # the year the house was sold
    df[var] = df['YrSold'] - df[var]
    return df


for var in ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']:
    X_train = elapsed_years(X_train, var)
    X_test = elapsed_years(X_test, var)


# In[11]:


# drop YrSold
X_train.drop('YrSold', axis=1, inplace=True)
X_test.drop('YrSold', axis=1, inplace=True)
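
# In[ ]:


# (Illustrative addition, not part of the original notebook.)
# Before assembling the pipeline, it can help to confirm which variables
# the imputers will act on: when no variable list is passed, Feature-engine's
# MeanMedianImputer selects the numerical variables and CategoricalImputer
# selects the object variables. A quick look at the proportion of missing
# values per variable makes that explicit.

vars_with_na = [var for var in X_train.columns if X_train[var].isnull().sum() > 0]

X_train[vars_with_na].isnull().mean().sort_values(ascending=False)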

# In[12]:


house_pipe = Pipeline([

    # === missing data imputation =======
    # add missing indicators to the variables that show NA
    ('missing_ind', mdi.AddMissingIndicator(missing_only=True)),

    # impute numerical variables with the median - vars automatically identified
    ('imputer_num', mdi.MeanMedianImputer(imputation_method='median')),

    # impute categorical variables with a string - vars automatically identified
    # with return_object=True, the imputed variables are returned as object
    # so that the encoders can identify them automatically
    ('imputer_cat', mdi.CategoricalImputer(return_object=True)),

    # === categorical encoding =========
    # group infrequent labels into a single group called "Rare"
    # categorical variables automatically identified
    ('rare_label_enc', ce.RareLabelEncoder(tol=0.1, n_categories=1)),

    # encode categories with the predictions of a decision tree
    # categorical variables automatically identified
    ('categorical_enc', ce.DecisionTreeEncoder(
        param_grid={'max_depth': [1, 2, 3]}, random_state=2909)),

    # === discretisation =====
    # replace numerical variables with the predictions of a decision tree
    # we need to specify the variable names, because by this step all variables
    # are numerical; otherwise the transformer would transform them all
    ('discretisation', dsc.DecisionTreeDiscretiser(
        param_grid={'max_depth': [1, 2, 3]}, random_state=2909,
        variables=numerical)),

    # feature scaling
    ('scaler', StandardScaler()),

    # regression
    ('lasso', Lasso(alpha=100, random_state=0, max_iter=1000)),
])


# In[13]:


# let's fit the pipeline
house_pipe.fit(X_train, y_train)

# let's get the predictions
X_train_preds = house_pipe.predict(X_train)
X_test_preds = house_pipe.predict(X_test)


# In[14]:


# check model performance:
print('train mse: {}'.format(mean_squared_error(y_train, X_train_preds, squared=True)))
print('train rmse: {}'.format(mean_squared_error(y_train, X_train_preds, squared=False)))
print('train r2: {}'.format(r2_score(y_train, X_train_preds)))
print()
print('test mse: {}'.format(mean_squared_error(y_test, X_test_preds, squared=True)))
print('test rmse: {}'.format(mean_squared_error(y_test, X_test_preds, squared=False)))
print('test r2: {}'.format(r2_score(y_test, X_test_preds)))


# In[ ]:
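
# In[ ]:


# (Illustrative addition, not part of the original notebook.)
# Because every transformation lives inside the pipeline, the whole workflow
# can be cross-validated in a single call; each fold re-fits the imputers,
# encoders, discretiser and scaler on the training folds only, which avoids
# leakage. A minimal sketch with scikit-learn's cross_val_score; the choice
# of 5 folds and of the neg_root_mean_squared_error metric is an assumption,
# not taken from the original notebook.

from sklearn.model_selection import cross_val_score

cv_rmse = -cross_val_score(house_pipe,
                           X_train,
                           y_train,
                           cv=5,
                           scoring='neg_root_mean_squared_error')

print('cv rmse (mean +/- std): {} +/- {}'.format(cv_rmse.mean(), cv_rmse.std()))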