#!/usr/bin/env python
# coding: utf-8

# In[700]:


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error
from lifetimes import BetaGeoFitter
from lifetimes import GammaGammaFitter
from lifetimes.utils import calibration_and_holdout_data
from lifetimes.utils import summary_data_from_transaction_data
from lifetimes.plotting import plot_calibration_purchases_vs_holdout_purchases
from decimal import Decimal 
import datetime as dt


# In[701]:


# Load the raw transaction export into a DataFrame.
raw_path = 'OnlineRetail.csv'
df = pd.read_csv(raw_path)
df


# In[702]:


# Remove the 'Unnamed: 0' column (presumably a stale index column written by
# an earlier export -- confirm against the CSV).
df = df.drop(columns=['Unnamed: 0'])
df


# In[703]:


# Count the missing values in every column.
df.isnull().sum()


# In[704]:


# Discard every row that has at least one missing value.
df = df.dropna()


# In[705]:


# Dropping rows with negative quantity.
# The comparison is written explicitly: the previous `~df['Quantity'] < 0`
# parsed as `(~Quantity) < 0` because `~` binds tighter than `<`, and only
# selected `Quantity >= 0` through integer bitwise negation. It would raise
# on a float column.
# `.copy()` gives an independent frame so the column assignments that follow
# do not write into a slice of the original.
df = df[df['Quantity'] >= 0].copy()


# In[706]:


df.info()


# In[707]:


# Cast every column to the type used by the rest of the analysis.
# Identifier / label columns are stored as plain strings.
text_columns = ['InvoiceNo', 'CustomerID', 'Description', 'StockCode', 'Country']
for column in text_columns:
    df[column] = df[column].astype('str')

# Invoice timestamps are parsed into datetimes.
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

# Unit prices become Decimal objects.
# NOTE(review): if UnitPrice is parsed as float, Decimal(float) keeps the
# float's exact binary value rather than the printed decimal -- confirm
# this is acceptable.
df['UnitPrice'] = df['UnitPrice'].apply(Decimal)


# In[708]:


df.info()


# In[709]:


# Monetary value of each transaction line: quantity * unit price,
# stored as Decimal.
line_totals = df['Quantity'] * df['UnitPrice']
df['Monetary'] = line_totals.apply(Decimal)
df['Monetary']


# __Computing the summary data__.

# In[710]:


# Build the per-customer summary table (frequency, recency, T and
# monetary_value) from the transaction log.
df_rfmt = summary_data_from_transaction_data(
    df,
    'CustomerID',
    'InvoiceDate',
    monetary_value_col='Monetary',
)
df_rfmt.head()


# __Splitting the data__

# In[719]:


# Total time span covered by the transactions.
# The first 200 days are used as the calibration period; whatever remains
# up to the last invoice is the holdout period used to evaluate the model.
first_invoice = df['InvoiceDate'].min()
last_invoice = df['InvoiceDate'].max()
diff_time = last_invoice - first_invoice
diff_time


# In[720]:


# End of the calibration period, and end of the observation period
# (calibration end + the remaining part of the span).
calibration_span = dt.timedelta(days=200)
end_date_cal = first_invoice + calibration_span
end_date_obs = end_date_cal + (diff_time - calibration_span)
end_date_obs


# In[721]:


# Sanity check: the observation period must end on the last invoice date.
df['InvoiceDate'].max() == end_date_obs


# In[723]:


# Per-customer calibration / holdout summary for the two periods.
df_rfmt_cal = calibration_and_holdout_data(
    transactions=df,
    customer_id_col="CustomerID",
    datetime_col="InvoiceDate",
    calibration_period_end=end_date_cal,
    observation_period_end=end_date_obs,
)
df_rfmt_cal


# __Tuning the model__.

# In[724]:


# Candidate penalizer coefficients; one BG/NBD model is fitted per candidate
# and scored by RMSE between predicted and actual holdout frequency.
l2_coefs = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
l2_list = []
rmse_list = []

# The calibration table with CustomerID as a regular column, for merging.
cal_with_ids = df_rfmt_cal.reset_index()

for coef in l2_coefs:
    # Fit on the calibration columns only.
    model = BetaGeoFitter(penalizer_coef=coef)
    model.fit(df_rfmt_cal['frequency_cal'],
              df_rfmt_cal['recency_cal'],
              df_rfmt_cal['T_cal'])

    # Expected number of purchases per customer over the holdout duration.
    holdout_pred = model.predict(df_rfmt_cal['duration_holdout'],
                                 df_rfmt_cal['frequency_cal'],
                                 df_rfmt_cal['recency_cal'],
                                 df_rfmt_cal['T_cal'])
    pred_freq = pd.DataFrame(holdout_pred, columns=['pred_frequency']).reset_index()

    # Align predictions with actuals and discard rows containing NaN.
    new_df = cal_with_ids.merge(pred_freq, on='CustomerID').dropna()

    # Root mean squared error on the holdout period.
    rmse_score = np.sqrt(mean_squared_error(new_df['frequency_holdout'],
                                            new_df['pred_frequency']))
    l2_list.append(coef)
    rmse_list.append(rmse_score)


# In[725]:


# One row per candidate coefficient with its RMSE.
pd.DataFrame({'rmse_score': np.array(rmse_list), 'L2 coefs': np.array(l2_list)})


# In[726]:


# Fit the BG/NBD model on the calibration columns with penalizer 0.80.
cal_frequency = df_rfmt_cal['frequency_cal']
cal_recency = df_rfmt_cal['recency_cal']
cal_age = df_rfmt_cal['T_cal']

model = BetaGeoFitter(penalizer_coef=0.80)
model.fit(cal_frequency, cal_recency, cal_age)


# In[727]:


# Diagnostic plot of calibration vs. holdout purchases, saved to disk.
plot_calibration_purchases_vs_holdout_purchases(model, df_rfmt_cal)
plt.savefig('calib_hold.png')


# In[728]:


# Expected number of purchases over the next 180 days for the first customer.

# First row of the summary table, kept as a one-row DataFrame.
customer_1 = df_rfmt.head(1)

n_trans = model.predict(
    180,
    customer_1['frequency'],
    customer_1['recency'],
    customer_1['T'],
)
n_trans


# In[729]:


# Same 180-day prediction for every customer in the summary table.
all_frequency = df_rfmt['frequency']
all_recency = df_rfmt['recency']
all_age = df_rfmt['T']
df_rfmt['predicted_purchases'] = model.conditional_expected_number_of_purchases_up_to_time(
    180, all_frequency, all_recency, all_age)


# In[730]:


# Discard customers for which any column is NaN.
df_rfmt = df_rfmt.dropna()


# In[731]:


df_rfmt


# __Predicting Customer lifetime value__

# In[732]:


# Keep only customers with a strictly positive monetary value (this removes
# zero as well as negative values).
# `.copy()` gives an independent frame, so the columns added below are not
# written into a slice of the previous table.
df_rfmt = df_rfmt[df_rfmt['monetary_value'] > 0].copy()


# In[733]:


# Fitting the Gamma-Gamma model on frequency and average monetary value.

gg_model = GammaGammaFitter()
gg_model.fit(df_rfmt['frequency'], df_rfmt['monetary_value'])


# In[734]:


# Expected average transaction value per customer.
# The fitted estimator is `gg_model`; the previous code referenced an
# undefined name (`ggf`) and raised NameError.
df_rfmt['pred_monetary'] = gg_model.conditional_expected_average_profit(
        df_rfmt['frequency'],
        df_rfmt['monetary_value'])


# In[735]:


df_rfmt


# In[736]:


# Customer lifetime value over a 6-month horizon, combining the purchase
# model (`model`) with the Gamma-Gamma spend model (`gg_model`).
clv_horizon_months = 6
df_rfmt['CLV'] = gg_model.customer_lifetime_value(
    model,
    df_rfmt['frequency'],
    df_rfmt['recency'],
    df_rfmt['T'],
    df_rfmt['monetary_value'],
    time=clv_horizon_months,
)


# In[737]:


df_rfmt


# In[740]:


# Probability that each customer is still alive, from the purchase model.
df_rfmt['prob_alive'] = model.conditional_probability_alive(
    df_rfmt['frequency'],
    df_rfmt['recency'],
    df_rfmt['T'],
)


# In[741]:


df_rfmt


# __Customers segmentation__

# In[757]:


from yellowbrick.cluster import KElbowVisualizer


# Seed shared by both KMeans instances. Without it the centroid
# initialisation is random, so the integer cluster labels can differ from
# one run to the next.
KMEANS_SEED = 42

# Instantiate the clustering model and the elbow visualizer (k from 2 to 10).
km_model = KMeans(random_state=KMEANS_SEED)
visualizer = KElbowVisualizer(km_model, k=(2,10))

visualizer.fit(df_rfmt)  # Fit the data to the visualizer
visualizer.show()        # Finalize and render the figure


# In[550]:


# Training the model with the number of clusters chosen from the elbow method.
km_model = KMeans(n_clusters=4, random_state=KMEANS_SEED)
km_model.fit(df_rfmt)


# In[551]:


# Attach the cluster label of every customer to the summary table.
df_rfmt['cluster'] = km_model.labels_
df_rfmt


# In[692]:


# Per-cluster summary: average CLV and number of customers.
clv_by_cluster = df_rfmt.groupby('cluster')['CLV']
df_clusters = pd.DataFrame({
    'avg_CLV': clv_by_cluster.mean(),
    'n_customers': clv_by_cluster.count(),
}).reset_index()

# Share of all customers that falls in each cluster, in percent.
total_customers = df_clusters['n_customers'].sum()
df_clusters['perct_customers'] = (df_clusters['n_customers'] / total_customers) * 100
df_clusters 


# In[569]:


# Let's name the clusters.
# KMeans label ids are arbitrary, so the names are derived from the data
# instead of a fixed id -> name table: clusters are ranked by their mean CLV
# and named in ascending order.
# NOTE(review): assumes the intended tier order is
# Bronze < Silver < Gold < Diamond by average CLV, and that there are exactly
# four clusters -- confirm.
tier_names = ["Bronze", "Silver", "Gold", "Diamond"]
clusters_by_clv = df_rfmt.groupby('cluster')['CLV'].mean().sort_values().index
cluster_to_tier = dict(zip(clusters_by_clv, tier_names))
df_rfmt['customer_category'] = df_rfmt['cluster'].map(cluster_to_tier)


# In[673]:


# Average CLV per customer category.
df_cat = df_rfmt.groupby('customer_category')['CLV'].mean().reset_index()


# In[674]:


# Code source : https://www.geeksforgeeks.org/how-to-annotate-bars-in-barplot-with-matplotlib-in-python/
def _annotated_barplot(data, x, y, xlabel, ylabel, title, label_size=14):
    """Draw an 8x8 bar chart of column ``y`` per ``x`` and label every bar.

    Args:
        data: DataFrame holding the columns named by ``x`` and ``y``.
        x: Name of the column used for the categories on the x-axis.
        y: Name of the column used for the bar heights.
        xlabel: Text of the x-axis label.
        ylabel: Text of the y-axis label.
        title: Title of the figure.
        label_size: Font size of both axis labels.
    """
    plt.figure(figsize=(8, 8))
    axes = sns.barplot(x=x, y=y, data=data)

    # Write each bar's height (two decimals) centred on the bar and shifted
    # 8 points upwards so the text sits just above the bar top.
    for bar in axes.patches:
        axes.annotate(format(bar.get_height(), '.2f'),
                      (bar.get_x() + bar.get_width() / 2,
                       bar.get_height()), ha='center', va='center',
                      size=15, xytext=(0, 8), textcoords='offset points')

    plt.xlabel(xlabel, size=label_size)
    plt.ylabel(ylabel, size=label_size)
    plt.title(title)
    plt.show()


def _customers_per_category(customers):
    """Return a frame with the number of rows of ``customers`` per category.

    The result has the columns ``customer_category`` and ``n_customers``.
    """
    counts = customers.groupby('customer_category')['customer_category'].count()
    return counts.rename('n_customers').reset_index()


# Average CLV per customer category.
_annotated_barplot(df_cat, "customer_category", "CLV",
                   xlabel="Customer category",
                   ylabel="CLV",
                   title="CLV per category")


# In[679]:


# Computing the contribution of each category to the total CLV of the next six months.
# The share is based on the summed CLV of the customers in each category;
# the per-category averages stored in df_cat['CLV'] ignore how many
# customers a category contains.
clv_total_by_category = df_rfmt.groupby('customer_category')['CLV'].sum()
clv_share_by_category = clv_total_by_category / clv_total_by_category.sum() * 100
df_cat["contribution_to_CLV"] = df_cat['customer_category'].map(clv_share_by_category)


# In[683]:


_annotated_barplot(df_cat, "customer_category", "contribution_to_CLV",
                   xlabel="Customer category",
                   ylabel="contribution to CLV in %",
                   title="The contribution of each category to the total CLV of the next six months",
                   label_size=12)


# __Analyzing the frequency__

# In[575]:


sns.displot(df_rfmt['frequency'])
plt.show()


# In[608]:


# Average frequency per customer category.
df_freq = df_rfmt.groupby('customer_category')['frequency'].mean().reset_index()


# In[609]:


_annotated_barplot(df_freq, "customer_category", "frequency",
                   xlabel="Customer category",
                   ylabel="Frequency",
                   title="Frequency per category")


# Gold customers purchase more frequently from the company
# (observation recorded from an earlier run -- re-check against the chart
# above whenever the clustering is re-fitted).

# In[576]:


df_rfmt['frequency'].describe()


# In[649]:


# Getting the fraction of customers with a frequency less than 10.

len(df_rfmt[df_rfmt['frequency'] < 10])/len(df_rfmt)


# In[612]:


# Getting the number of customers per category for those with a frequency greater than 10.

df_freq_1 = _customers_per_category(df_rfmt[df_rfmt['frequency'] > 10])


# In[613]:


_annotated_barplot(df_freq_1, "customer_category", "n_customers",
                   xlabel="Customer category",
                   ylabel="number of customers",
                   title="Nombre of customers per category for those with frequency higher than 10")


# In[616]:


# Getting the number of customers per category for those with a frequency less than 10.
# NOTE(review): customers with a frequency of exactly 10 fall in neither
# this group nor the "greater than 10" group -- confirm this is intended.

df_freq_2 = _customers_per_category(df_rfmt[df_rfmt['frequency'] < 10])


# In[617]:


_annotated_barplot(df_freq_2, "customer_category", "n_customers",
                   xlabel="Customer category",
                   ylabel="number of customers",
                   title="Nombre of customers per category for those with frequency less than 10")


# In[582]:


df_rfmt[df_rfmt['frequency'] > 10].reset_index().groupby(['customer_category'])['CustomerID'].agg('count')


# __Analyzing the monetary values__

# In[626]:


sns.displot(df_rfmt['monetary_value'])
plt.show()


# In[633]:


df_rfmt['monetary_value'].describe()


# In[634]:


# Average monetary value per customer category.
df_mon = df_rfmt.groupby('customer_category')['monetary_value'].mean().reset_index()


# In[636]:


_annotated_barplot(df_mon, "customer_category", "monetary_value",
                   xlabel="Customer category",
                   ylabel="average monetary value",
                   title="average monetary value per category")


# In[650]:


# Getting the 80th percentile 
perct = df_rfmt['monetary_value'].quantile(q=0.8)


# In[656]:


# Getting the number of customers per category for those with a monetary value greater than the 80th percentile.

df_mon_1 = _customers_per_category(df_rfmt[df_rfmt['monetary_value'] > perct])


# In[661]:


_annotated_barplot(df_mon_1, "customer_category", "n_customers",
                   xlabel="Customer category",
                   ylabel="number of customers",
                   title="Nombre of customers per category for those with a monetary value greater than the 80th percentile.")


# In[662]:


# Getting the number of customers per category for those with a monetary value less than the 80th percentile.

df_mon_2 = _customers_per_category(df_rfmt[df_rfmt['monetary_value'] < perct])


# In[663]:


_annotated_barplot(df_mon_2, "customer_category", "n_customers",
                   xlabel="Customer category",
                   ylabel="number of customers",
                   title="Nombre of customers per category for those with a monetary value less than the 80th percentile.")


# In[ ]: