#!/usr/bin/env python
# coding: utf-8

# ## About the Data:
#
# A business consulting start-up intends to analyze the marketing effort for
# the previous quarter. This will help:
# - with an understanding of the correlation between marketing costs and sales
#   generation
# - the tech start-up allocate its marketing budget more resourcefully
# - identify which marketing channel is the least or most effective
# - generate coefficient values that can be used to predict generated sales
#   from each of the marketing efforts.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from joblib import dump, load
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

# Load the advertising spend / sales dataset.
# Columns (per the code below): Instagram, Youtube, Podcasts, generated_sales.
Ad_data = pd.read_csv(r"C:\Users\Teni\Desktop\Git-Github\Datasets\Linear Regression\Advertising.csv")
Ad_data.head()

# Define the X (features) and y (label) variables.
X = Ad_data.drop('generated_sales', axis=1)
y = Ad_data['generated_sales']

# Split the data into train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=101
)

len(X_train)  # This makes up 70% of the data
len(X_test)   # This makes up 30% of the data

# Fit the model on the training set only; the held-out test set is then used
# to check how well the model predicts data it has not seen.
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the unseen test features.
test_predictions = model.predict(X_test)
test_predictions

# The test data represents data our model has not seen, so we inspect the
# first 5 rows of the test features alongside their true labels.
X_test.head(5)
y_test.head(5)

# --- Check the performance of the model ---

Ad_data['generated_sales'].mean()
sns.histplot(data=Ad_data, x='generated_sales', bins=20)

# A low MAE indicates the variance between the predicted and real values is low.
mean_absolute_error(y_test, test_predictions)

# Likewise, a low RMSE indicates the predictions track the real values closely.
np.sqrt(mean_squared_error(y_test, test_predictions))

# --- Residual plot ---

# The lower the residuals, the higher the confidence in our model.
# An ideal residual value is 0.
residuals = y_test - test_predictions
residuals

sns.scatterplot(x=y_test, y=residuals)
plt.axhline(y=0, color='red');

# From the above:
# - The data points are well dispersed around the 0 line with no visible
#   pattern or trend in the residuals (the variance between y_test and
#   test_predictions) — i.e. they look homoskedastic, which is what a
#   well-specified linear model should produce.
# - If the residuals showed a pattern or a funnel shape (heteroskedasticity),
#   we would have to review the dataset or the model used for it.

sns.displot(residuals, bins=25, kde=True);

# From the above:
# - The residuals are distributed around 0, which indicates the model is
#   quite reliable.

# --- Deploy the model: refit on the full dataset ---

final_model = LinearRegression()
final_model.fit(X, y)
final_model.coef_

y_pred = final_model.predict(X)

# Plot the predictions (red) against the true sales (blue) for each channel.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(16, 6), dpi=200)
channel_titles = [
    ('Instagram', 'Instagram Spend'),
    ('Youtube', 'Youtube Spend'),
    ('Podcasts', 'Podcast Spend'),
]
for ax, (channel, title) in zip(axes, channel_titles):
    ax.plot(Ad_data[channel], Ad_data['generated_sales'], 'o')
    ax.plot(Ad_data[channel], y_pred, 'o', color='red')
    ax.set_ylabel('Sales')
    ax.set_title(title)
plt.show()

# From the above:
# - The graph shows our predictions (in red) are close to the true values
#   (in blue).
# - Instagram ads have a positive relationship with sales.
# - Youtube also has a positive relationship with sales.
# - Podcasts show the weakest relationship with sales.

# --- Model deployment / save ---

dump(final_model, 'final_sales_model.joblib')
loaded_model = load('final_sales_model.joblib')
loaded_model.coef_

# Applying the model to a marketing business case:
# with an ad budget of 220 on Instagram, 180 on Youtube and 100 on Podcasts,
# predict the sales value. The input must match the shape (and column order)
# of the training features, so build it as a DataFrame with the same columns
# to avoid sklearn's missing-feature-names warning.
campaign = pd.DataFrame([[220, 180, 100]], columns=X.columns)
X.shape
loaded_model.predict(campaign)

# To sum it up: by using the calculated coefficients, the startup can predict
# its sales and see how each marketing effort affects sales. Interestingly,
# it appears that Podcasts have the smallest impact on sales.