#!/usr/bin/env python
# coding: utf-8
"""Customer-lifetime-value analysis of the OnlineRetail transaction log.

The script loads the transactions, builds a recency/frequency/monetary
summary with ``lifetimes``, fits BG/NBD and Gamma-Gamma models, and then
clusters the customers with KMeans.
"""

# In[700]:

import datetime as dt
from decimal import Decimal

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from lifetimes import BetaGeoFitter
from lifetimes import GammaGammaFitter
from lifetimes.plotting import plot_calibration_purchases_vs_holdout_purchases
from lifetimes.utils import calibration_and_holdout_data
from lifetimes.utils import summary_data_from_transaction_data
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error

# In[701]:

# Load the raw transactions.
df = pd.read_csv('OnlineRetail.csv')
df

# In[702]:

# Drop the index column that was written out with the CSV.
df = df.drop(columns='Unnamed: 0')
df

# In[703]:

# Verify the existence of NaN values
df.isna().sum()

# In[704]:

df = df.dropna()

# In[705]:

# Dropping rows with negative quantity (rows with a zero quantity are kept).
# The comparison is written directly: `~df['Quantity'] < 0` parses as
# `(~Quantity) < 0`, which only happens to work on integer columns.
# `.copy()` detaches the result so the column assignments below do not
# trigger chained-assignment warnings.
df = df[df['Quantity'] >= 0].copy()

# In[706]:

df.info()

# In[707]:

# Setting data types
text_columns = ['InvoiceNo', 'CustomerID', 'Description', 'StockCode', 'Country']
df = df.astype({column: 'str' for column in text_columns})
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
# NOTE(review): Decimal(float) keeps the float's binary rounding error; use
# Decimal(str(value)) instead if exact decimal prices are required.
df['UnitPrice'] = df['UnitPrice'].apply(Decimal)

# In[708]:

df.info()

# In[709]:

# Creating the monetary value of the transactions (quantity * price)
df['Monetary'] = df['Quantity'] * df['UnitPrice']
df['Monetary'] = df['Monetary'].apply(Decimal)
df['Monetary']

# __Computing the summary data__.

# In[710]:

# Computing the summary data (Recency, Frequency, monetary and tenure)
df_rfmt = summary_data_from_transaction_data(
    transactions=df,
    customer_id_col='CustomerID',
    datetime_col='InvoiceDate',
    monetary_value_col='Monetary',
)
df_rfmt.head()

# __Splitting the data__

# In[719]:

# Size of the data
# we have 373 days of data.
# We will use 200 days as calibration data and the rest as observation data
# to evaluate the performance of the model.
# Total time span covered by the transactions.
diff_time = df['InvoiceDate'].max() - df['InvoiceDate'].min()
diff_time

# In[720]:

# Getting the ending date of the calibration period.
CALIBRATION_DAYS = 200
end_date_cal = df['InvoiceDate'].min() + dt.timedelta(days=CALIBRATION_DAYS)
# The observation period covers whatever remains after the calibration window.
end_date_obs = end_date_cal + (diff_time - dt.timedelta(days=CALIBRATION_DAYS))
end_date_obs

# In[721]:

# Verify if the calculations are correct.
df['InvoiceDate'].max() == end_date_obs

# In[723]:

df_rfmt_cal = calibration_and_holdout_data(
    transactions=df,
    customer_id_col="CustomerID",
    datetime_col="InvoiceDate",
    calibration_period_end=end_date_cal,
    observation_period_end=end_date_obs,
)
df_rfmt_cal

# __Tuning the model__.

# In[724]:


def _holdout_rmse(summary, penalizer_coef):
    """Fit a BG/NBD model on the calibration columns and score it on holdout.

    Parameters
    ----------
    summary : DataFrame produced by ``calibration_and_holdout_data``.
    penalizer_coef : penalizer coefficient passed to ``BetaGeoFitter``.

    Returns
    -------
    RMSE between ``frequency_holdout`` and the predicted holdout frequency,
    computed over the rows that contain no NaN.
    """
    candidate = BetaGeoFitter(penalizer_coef=penalizer_coef)
    candidate.fit(summary['frequency_cal'], summary['recency_cal'], summary['T_cal'])
    predicted = candidate.predict(
        summary['duration_holdout'],
        summary['frequency_cal'],
        summary['recency_cal'],
        summary['T_cal'],
    )
    # `assign` attaches the predictions by index alignment. Wrapping them in
    # pd.DataFrame(..., columns=[...]) would silently produce an all-NaN
    # column whenever the prediction comes back as a named Series.
    scored = summary.assign(pred_frequency=predicted).dropna()
    return np.sqrt(
        mean_squared_error(scored['frequency_holdout'], scored['pred_frequency'])
    )


l2_coefs = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
rmse_list = [_holdout_rmse(df_rfmt_cal, coef) for coef in l2_coefs]

# In[725]:

# Getting the results
tuning_results = pd.DataFrame({'rmse_score': rmse_list, 'L2 coefs': l2_coefs})
tuning_results

# In[726]:

# Fitting the model using the calibration dataset.
# NOTE(review): 0.80 is hard-coded; presumably it was picked by reading the
# table above — confirm it still matches the lowest rmse_score.
model = BetaGeoFitter(penalizer_coef=0.80)
model.fit(df_rfmt_cal['frequency_cal'], df_rfmt_cal['recency_cal'], df_rfmt_cal['T_cal'])

# In[727]:

# Evaluating the performance of the model.
# Compare calibration-period purchases with holdout purchases and save the plot.
plot_calibration_purchases_vs_holdout_purchases(model, df_rfmt_cal)
plt.savefig('calib_hold.png')

# In[728]:

# Forecast horizon, in days, shared by the purchase predictions below.
HORIZON_DAYS = 180

# Predicting the number of purchases in the next 180 days for the first customer.
# Getting the customer data
customer_1 = df_rfmt.iloc[0:1]
# Predicting
n_trans = model.predict(
    HORIZON_DAYS,
    customer_1['frequency'],
    customer_1['recency'],
    customer_1['T'],
)
n_trans

# In[729]:

# Predicting the number of purchases in the next 180 days for all customers.
df_rfmt['predicted_purchases'] = model.conditional_expected_number_of_purchases_up_to_time(
    HORIZON_DAYS,
    df_rfmt['frequency'],
    df_rfmt['recency'],
    df_rfmt['T'],
)

# In[730]:

df_rfmt = df_rfmt.dropna()

# In[731]:

df_rfmt

# __Predicting Customer lifetime value__

# In[732]:

# Keep only customers whose monetary value is strictly positive (zeros are
# removed as well as negatives). `.copy()` detaches the filtered frame so the
# new columns added below are written to it without chained-assignment
# warnings.
df_rfmt = df_rfmt[df_rfmt['monetary_value'] > 0].copy()

# In[733]:

# Fitting the GammaGamma model
gg_model = GammaGammaFitter()
gg_model.fit(df_rfmt['frequency'], df_rfmt['monetary_value'])

# In[734]:

# Expected average order value per customer. The fitted model is `gg_model`;
# the previous reference to `ggf` was an undefined name.
df_rfmt['pred_monetary'] = gg_model.conditional_expected_average_profit(
    df_rfmt['frequency'],
    df_rfmt['monetary_value'],
)

# In[735]:

df_rfmt

# In[736]:

# Predicting the CLV.
df_rfmt['CLV'] = gg_model.customer_lifetime_value(
    model,
    df_rfmt['frequency'],
    df_rfmt['recency'],
    df_rfmt['T'],
    df_rfmt['monetary_value'],
    time=6,  # In months
)

# In[737]:

df_rfmt

# In[740]:

# Computing the probability of being alive.
df_rfmt['prob_alive'] = model.conditional_probability_alive(
    frequency=df_rfmt['frequency'],
    recency=df_rfmt['recency'],
    T=df_rfmt['T'],
)

# In[741]:

df_rfmt

# __Customers segmentation__

# In[757]:

from yellowbrick.cluster import KElbowVisualizer

# Fixed seed: KMeans initialisation is random, so without it the cluster ids
# (and everything derived from them) change from one run to the next.
RANDOM_STATE = 42

# NOTE(review): the features are clustered on their raw scales, so the
# columns with the largest magnitudes dominate the distances — confirm that
# this is intended or standardise first.

# Instantiate the clustering model and visualizer
km_model = KMeans(random_state=RANDOM_STATE)
visualizer = KElbowVisualizer(km_model, k=(2, 10))
visualizer.fit(df_rfmt)  # Fit the data to the visualizer
visualizer.show()  # Finalize and render the figure

# In[550]:

# Training the model with the optimal number of clusters according to the
# elbow method.
km_model = KMeans(n_clusters=4, random_state=RANDOM_STATE)
km_model.fit(df_rfmt)

# In[551]:

df_rfmt['cluster'] = km_model.labels_
df_rfmt

# In[692]:

# Grouping by clusters: average CLV, head count and share of customers.
df_clusters = (
    df_rfmt.groupby('cluster')['CLV']
    .agg(avg_CLV='mean', n_customers='count')
    .reset_index()
)
df_clusters['perct_customers'] = (
    df_clusters['n_customers'] / df_clusters['n_customers'].sum()
) * 100
df_clusters

# In[569]:

# Let's name the clusters.
def _annotated_barplot(data, y, ylabel, title, label_size=14):
    """Draw one bar per customer category and print each bar's height on it.

    Parameters
    ----------
    data : DataFrame with a ``customer_category`` column and the column ``y``.
    y : name of the column plotted on the y-axis.
    ylabel : text of the y-axis label.
    title : figure title.
    label_size : font size of both axis labels.

    Adapted from
    https://www.geeksforgeeks.org/how-to-annotate-bars-in-barplot-with-matplotlib-in-python/
    """
    plt.figure(figsize=(8, 8))
    axes = sns.barplot(x="customer_category", y=y, data=data)
    for bar in axes.patches:
        height = bar.get_height()
        # Anchor the label at the top-centre of the bar, then shift it
        # 8 points upwards so it does not overlap the bar itself.
        axes.annotate(
            format(height, '.2f'),
            (bar.get_x() + bar.get_width() / 2, height),
            ha='center',
            va='center',
            size=15,
            xytext=(0, 8),
            textcoords='offset points',
        )
    plt.xlabel("Customer category", size=label_size)
    plt.ylabel(ylabel, size=label_size)
    plt.title(title)
    plt.show()


def _count_by_category(customers):
    """Return the number of rows per customer category as ``n_customers``."""
    return customers.groupby('customer_category').size().reset_index(name='n_customers')


# Name the clusters from their average CLV instead of from the raw KMeans
# ids, which are arbitrary and can change between runs.
# NOTE(review): assumes the tiers are meant to ascend with average CLV, with
# Diamond on top — confirm. The previous hard-coded mapping was
# {3: "Gold", 1: "Diamond", 2: "Silver", 0: "Bronze"}.
TIER_NAMES = ["Bronze", "Silver", "Gold", "Diamond"]
clusters_by_clv = df_rfmt.groupby('cluster')['CLV'].mean().sort_values().index
if len(clusters_by_clv) != len(TIER_NAMES):
    raise ValueError(
        f"expected {len(TIER_NAMES)} clusters, found {len(clusters_by_clv)}"
    )
cluster_to_tier = dict(zip(clusters_by_clv, TIER_NAMES))
df_rfmt['customer_category'] = df_rfmt['cluster'].map(cluster_to_tier)

# In[673]:

# Grouping by customer category
df_cat = df_rfmt.groupby('customer_category')['CLV'].mean().reset_index()

# In[674]:

_annotated_barplot(df_cat, y="CLV", ylabel="CLV", title="CLV per category")

# In[679]:

# Computing the contribution of each category to the total CLV of the next six months
df_cat["contribution_to_CLV"] = df_cat['CLV'] / df_cat['CLV'].sum() * 100

# In[683]:

_annotated_barplot(
    df_cat,
    y="contribution_to_CLV",
    ylabel="contribution to CLV in %",
    title="The contribution of each category to the total CLV of the next six months",
    label_size=12,
)

# __Analyzing the frequency__

# In[575]:

sns.displot(df_rfmt['frequency'])
plt.show()

# In[608]:

df_freq = df_rfmt.groupby('customer_category')['frequency'].mean().reset_index()

# In[609]:

_annotated_barplot(
    df_freq,
    y="frequency",
    ylabel="Frequency",
    title="Frequency per category",
)

# Observation from the original run: Gold customers purchase more frequently
# from the company.

# In[576]:

df_rfmt['frequency'].describe()

# In[649]:

# Share of customers (as a fraction, not a percentage) with a frequency less than 10.
(df_rfmt['frequency'] < 10).mean()

# In[612]:

# Getting the number of customers per category for those with a frequency greater than 10.
# NOTE(review): customers with a frequency of exactly 10 fall in neither this
# group nor the "less than 10" group below.
df_freq_1 = _count_by_category(df_rfmt[df_rfmt['frequency'] > 10])

# In[613]:

_annotated_barplot(
    df_freq_1,
    y="n_customers",
    ylabel="number of customers",
    title="Nombre of customers per category for those with frequency higher than 10",
)

# In[616]:

# Getting the number of customers per category for those with a frequency less than 10.
df_freq_2 = _count_by_category(df_rfmt[df_rfmt['frequency'] < 10])

# In[617]:

_annotated_barplot(
    df_freq_2,
    y="n_customers",
    ylabel="number of customers",
    title="Nombre of customers per category for those with frequency less than 10",
)

# In[582]:

df_rfmt[df_rfmt['frequency'] > 10].reset_index().groupby(['customer_category'])['CustomerID'].agg('count')

# __Analyzing the monetary values__

# In[626]:

sns.displot(df_rfmt['monetary_value'])
plt.show()

# In[633]:

df_rfmt['monetary_value'].describe()

# In[634]:

df_mon = df_rfmt.groupby('customer_category')['monetary_value'].mean().reset_index()

# In[636]:

_annotated_barplot(
    df_mon,
    y="monetary_value",
    ylabel="average monetary value",
    title="average monetary value per category",
)

# In[650]:

# Getting the 80th percentile
perct = df_rfmt['monetary_value'].quantile(q=0.8)

# In[656]:

# Getting the number of customers per category for those with a monetary value greater than the 80th percentile.
df_mon_1 = _count_by_category(df_rfmt[df_rfmt['monetary_value'] > perct])

# In[661]:

_annotated_barplot(
    df_mon_1,
    y="n_customers",
    ylabel="number of customers",
    title="Nombre of customers per category for those with a monetary value greater than the 80th percentile.",
)

# In[662]:

# Getting the number of customers per category for those with a monetary value less than the 80th percentile.
df_mon_2 = _count_by_category(df_rfmt[df_rfmt['monetary_value'] < perct])

# In[663]:

_annotated_barplot(
    df_mon_2,
    y="n_customers",
    ylabel="number of customers",
    title="Nombre of customers per category for those with a monetary value less than the 80th percentile.",
)

# In[ ]: