#!/usr/bin/env python
# coding: utf-8

# ## Name : ADVAIT GURUNATH CHAVAN
# ## Contact Number : +91 70214 55852
# ## Mail ID : advaitchavan135@gmail.com

# ## Oasis Infobyte Data Science Intern

# ## Task 5 : Sales Prediction using Python

# ## 1. Importing the necessary dependencies

# In[3]:

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import plotly.io as plio
plio.templates  # lists the available Plotly templates when run in a notebook
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
import joblib
from warnings import filterwarnings
filterwarnings(action='ignore')


# ## 2. Exploring the dataset

# In[4]:

data = pd.read_csv('advertising.csv')


# In[5]:

data


# In[6]:

data.info()


# ### Removing the unnecessary column Unnamed: 0

# In[7]:

data.drop('Unnamed: 0', axis=1, inplace=True)


# In[8]:

data
#data.to_csv('advertising_cleaned.csv', index=False)


# In[9]:

data.info()


# In[10]:

data.describe()


# In[11]:

data.duplicated().sum()


# ### Correlation plot to visualize the impact of each feature on the target (Sales)

# In[12]:

sns.heatmap(data.corr(), annot=True)


# #### From the above correlation plot, the features correlate with Sales as follows:
# #### 1. 'TV' has a correlation of 0.78 with Sales
# #### 2. 'Radio' has a correlation of 0.58 with Sales
# #### 3. 'Newspaper' has a correlation of 0.23 with Sales

# ### [A] Plotly scatter plot of TV (feature) vs Sales (target)

# In[13]:

# Create a scatter plot
fig = go.Figure(data=go.Scatter(x=data['TV'], y=data['Sales'],
                                mode='markers',
                                marker=dict(color='orange', size=8)))

# Customize the layout
fig.update_layout(
    title="Scatter Plot of TV (Feature) vs Sales (Target)",
    xaxis_title="TV",
    yaxis_title="Sales"
)

# Show the plot
fig.show()


# #### From the above scatter plot we can infer that
# #### as the TV advertising spend increases, Sales increase as well.

# ### [B] Plotly scatter plot of Radio (feature) vs Sales (target)

# In[16]:

# Create a scatter plot
fig = go.Figure(data=go.Scatter(x=data['Radio'], y=data['Sales'],
                                mode='markers',
                                marker=dict(color='red', size=8)))

# Customize the layout
fig.update_layout(
    title="Scatter Plot of Radio (Feature) vs Sales (Target)",
    xaxis_title="Radio",
    yaxis_title="Sales"
)

# Show the plot
fig.show()


# #### From the above scatter plot we can infer that
# #### as the Radio advertising spend increases, Sales tend to increase, though the relationship is weaker than for TV.

# ### [C] Plotly scatter plot of Newspaper (feature) vs Sales (target)

# In[17]:

# Create a scatter plot
fig = go.Figure(data=go.Scatter(x=data['Newspaper'], y=data['Sales'],
                                mode='markers',
                                marker=dict(color='blue', size=8)))

# Customize the layout
fig.update_layout(
    title="Scatter Plot of Newspaper (Feature) vs Sales (Target)",
    xaxis_title="Newspaper",
    yaxis_title="Sales"
)

# Show the plot
fig.show()


# #### Newspaper spend shows only a weak relationship with Sales, consistent with its correlation of 0.23.

# ### [i] Distribution of TV feature

# In[126]:

plt.figure(figsize=(10, 6))
ax = sns.histplot(data['TV'], bins=20, kde=True, color='orange')
plt.xlabel('TV')
plt.ylabel('Frequency')
plt.title('Distribution of TV feature')
# Annotate each bar with its count
for p in ax.patches:
    ax.annotate(f'{p.get_height()}', (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', fontsize=10, color='black',
                xytext=(0, 10), textcoords='offset points')
plt.show()


# ### [ii] Distribution of Radio feature

# In[46]:

plt.figure(figsize=(10, 6))
ax = sns.histplot(data['Radio'], bins=20, kde=True, color='red')
plt.xlabel('Radio')
plt.ylabel('Frequency')
plt.title('Distribution of Radio feature')
# Annotate each bar with its count
for p in ax.patches:
    ax.annotate(f'{p.get_height()}', (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', fontsize=10, color='black',
                xytext=(0, 10), textcoords='offset points')
plt.show()


# ### [iii] Distribution of Newspaper feature

# In[47]:

plt.figure(figsize=(10, 6))
ax = sns.histplot(data['Newspaper'], bins=20, kde=True)
plt.xlabel('Newspaper')
plt.ylabel('Frequency')
plt.title('Distribution of Newspaper feature')
# Annotate each bar with its count
for p in ax.patches:
    ax.annotate(f'{p.get_height()}', (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', fontsize=10, color='black',
                xytext=(0, 10), textcoords='offset points')
plt.show()


# ### [iv] Distribution of Sales

# In[48]:

plt.figure(figsize=(10, 6))
ax = sns.histplot(data['Sales'], bins=20, kde=True, color='darkviolet')
plt.xlabel('Sales')
plt.ylabel('Frequency')
plt.title('Distribution of Sales (target)')
# Annotate each bar with its count
for p in ax.patches:
    ax.annotate(f'{p.get_height()}', (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center', fontsize=10, color='black',
                xytext=(0, 10), textcoords='offset points')
plt.show()
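# #### The four distribution cells above repeat the same histogram-and-annotation code for each column.
# #### A small helper could express each plot as one call; the sketch below is illustrative only and assumes the `data` frame loaded above (the helper name and defaults are not part of the dataset or workflow).

# In[ ]:

def plot_distribution(frame, column, color='steelblue', bins=20):
    """Histogram with KDE and per-bar count labels for one column of `frame` (illustrative helper)."""
    plt.figure(figsize=(10, 6))
    ax = sns.histplot(frame[column], bins=bins, kde=True, color=color)
    plt.xlabel(column)
    plt.ylabel('Frequency')
    plt.title(f'Distribution of {column}')
    # Same annotation as the cells above: bar count placed just above each bar
    for p in ax.patches:
        ax.annotate(f'{p.get_height():.0f}', (p.get_x() + p.get_width() / 2., p.get_height()),
                    ha='center', va='center', fontsize=10, color='black',
                    xytext=(0, 10), textcoords='offset points')
    plt.show()

# Example (equivalent to the 'TV' cell above):
# plot_distribution(data, 'TV', color='orange')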
# ## 3. Preparing data for Training (Data Modelling)
#
# #### Since we need to predict Sales from the other features of the dataset,
# #### we treat Sales as a separate target 'y' and all other features as 'x'.

# In[54]:

x = data.iloc[:, :3]   # TV, Radio, Newspaper
y = data.iloc[:, 3:]   # Sales


# In[53]:

x


# In[55]:

y


# In[58]:

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)


# ## 4. Training Models using the above data

# #### [A] Linear Regression Model

# In[59]:

model_1 = LinearRegression()
model_1.fit(x_train, y_train)
y_pred_1 = model_1.predict(x_test)


# In[60]:

mse = mean_squared_error(y_test, y_pred_1)
r2 = r2_score(y_test, y_pred_1)
print("Mean Squared Error is :", mse)
print("R-Squared score is :", r2)


# In[61]:

plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred_1, alpha=0.5, color='darkviolet')
plt.xlabel('Actual Sales')
plt.ylabel('Predicted Sales')
plt.title('Actual vs. Predicted Sales (Linear Regression)')
plt.show()


# #### [B] Random Forest Regression Model

# In[62]:

model_2 = RandomForestRegressor()
model_2.fit(x_train, y_train)
y_pred_2 = model_2.predict(x_test)


# In[63]:

mse = mean_squared_error(y_test, y_pred_2)
r2 = r2_score(y_test, y_pred_2)
print("Mean Squared Error is :", mse)
print("R-Squared score is :", r2)


# In[65]:

plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred_2, label='Random Forest Regression Model', alpha=0.5, color='green')
plt.xlabel('Actual Sales')
plt.ylabel('Predicted Sales')
plt.title('Actual vs. Predicted Sales (Random Forest Regression)')
plt.legend()
plt.show()


# #### [C] Ridge Regression Model

# In[66]:

model_3 = Ridge()
model_3.fit(x_train, y_train)
y_pred_3 = model_3.predict(x_test)


# In[67]:

mse = mean_squared_error(y_test, y_pred_3)
r2 = r2_score(y_test, y_pred_3)
print("Mean Squared Error is :", mse)
print("R-Squared score is :", r2)


# In[68]:

plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred_3, label='Ridge Regression Model', alpha=0.5, color='darkblue')
plt.xlabel('Actual Sales')
plt.ylabel('Predicted Sales')
plt.title('Actual vs. Predicted Sales (Ridge Regression)')
plt.legend()
plt.show()
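# #### Each model section in this part repeats the same fit / predict / score steps.
# #### The sketch below is an illustrative helper (not part of the original workflow); it assumes the x_train, x_test, y_train, y_test split created above and could be reused for the remaining models in this section.

# In[ ]:

def fit_and_score(model, x_train, y_train, x_test, y_test):
    """Fit `model`, predict on the test split, print and return its MSE and R^2 (illustrative helper)."""
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"{model.__class__.__name__}: MSE = {mse:.3f}, R^2 = {r2:.3f}")
    return y_pred, mse, r2

# Example (equivalent to the Ridge cells above):
# y_pred_ridge, mse_ridge, r2_ridge = fit_and_score(Ridge(), x_train, y_train, x_test, y_test)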
# #### [D] Lasso Regression Model

# In[69]:

model_4 = Lasso()
model_4.fit(x_train, y_train)
y_pred_4 = model_4.predict(x_test)


# In[70]:

mse = mean_squared_error(y_test, y_pred_4)
r2 = r2_score(y_test, y_pred_4)
print("Mean Squared Error is :", mse)
print("R-Squared score is :", r2)


# In[71]:

plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred_4, label='Lasso Regression Model', alpha=0.5, color='black')
plt.xlabel('Actual Sales')
plt.ylabel('Predicted Sales')
plt.title('Actual vs. Predicted Sales (Lasso Regression)')
plt.legend()
plt.show()


# #### [E] ElasticNet Regression Model

# In[72]:

model_5 = ElasticNet()
model_5.fit(x_train, y_train)
y_pred_5 = model_5.predict(x_test)


# In[73]:

mse = mean_squared_error(y_test, y_pred_5)
r2 = r2_score(y_test, y_pred_5)
print("Mean Squared Error is :", mse)
print("R-Squared score is :", r2)


# In[104]:

plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred_5, label='ElasticNet Regression Model', alpha=0.5, color='gold')
plt.xlabel('Actual Sales')
plt.ylabel('Predicted Sales')
plt.title('Actual vs. Predicted Sales (ElasticNet Regression)')
plt.legend()
plt.show()


# #### [F] Gradient Boosting Regression Model

# In[75]:

model_6 = GradientBoostingRegressor()
model_6.fit(x_train, y_train)
y_pred_6 = model_6.predict(x_test)


# In[76]:

mse = mean_squared_error(y_test, y_pred_6)
r2 = r2_score(y_test, y_pred_6)
print("Mean Squared Error is :", mse)
print("R-Squared score is :", r2)


# In[77]:

plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred_6, label='Gradient Boosting Regression Model', alpha=0.5, color='red')
plt.xlabel('Actual Sales')
plt.ylabel('Predicted Sales')
plt.title('Actual vs. Predicted Sales (Gradient Boosting Regression)')
plt.legend()
plt.show()


# #### [G] XGBoost Regression Model

# In[78]:

model_7 = XGBRegressor()
model_7.fit(x_train, y_train)
y_pred_7 = model_7.predict(x_test)


# In[79]:

mse = mean_squared_error(y_test, y_pred_7)
r2 = r2_score(y_test, y_pred_7)
print("Mean Squared Error is :", mse)
print("R-Squared score is :", r2)


# In[80]:

plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred_7, label='XGBoost Regression Model', alpha=0.5, color='brown')
plt.xlabel('Actual Sales')
plt.ylabel('Predicted Sales')
plt.title('Actual vs. Predicted Sales (XGBoost Regression)')
plt.legend()
plt.show()


# ##### R^2 score of each regression model

# In[81]:

model_r2_scores = {
    "Linear Regression Model": r2_score(y_test, y_pred_1),
    "Random Forest Regression Model": r2_score(y_test, y_pred_2),
    "Ridge Regression Model": r2_score(y_test, y_pred_3),
    "Lasso Regression Model": r2_score(y_test, y_pred_4),
    "ElasticNet Regression Model": r2_score(y_test, y_pred_5),
    "Gradient Boosting Regression Model": r2_score(y_test, y_pred_6),
    "XGBoost Regression Model": r2_score(y_test, y_pred_7)
}


# In[93]:

model_r2_scores


# In[90]:

best_model_name = max(model_r2_scores, key=model_r2_scores.get)
best_r2_score = model_r2_scores[best_model_name]
print(f"Best Performing Model is {best_model_name} with an R^2 score of {best_r2_score}")
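# #### The ranking above comes from a single train/test split, so it can change from run to run.
# #### As a quick robustness check, the sketch below (illustrative, assuming the x and y frames defined in section 3) scores a fresh copy of each model family with 5-fold cross-validation on the full data.

# In[ ]:

from sklearn.model_selection import cross_val_score

cv_candidates = {
    "Linear Regression Model": LinearRegression(),
    "Ridge Regression Model": Ridge(),
    "Lasso Regression Model": Lasso(),
    "ElasticNet Regression Model": ElasticNet(),
    "Random Forest Regression Model": RandomForestRegressor(),
    "Gradient Boosting Regression Model": GradientBoostingRegressor(),
    "XGBoost Regression Model": XGBRegressor(),
}

for name, estimator in cv_candidates.items():
    # Mean R^2 across 5 folds; unfitted estimators are cloned and refit per fold
    scores = cross_val_score(estimator, x, y.values.ravel(), cv=5, scoring='r2')
    print(f"{name}: mean R^2 = {scores.mean():.3f} (+/- {scores.std():.3f})")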
# ## 5. Saving the model with the best fit

# In[83]:

# Gradient Boosting was the best-performing model on this run, so persist it
final_model = model_6
joblib.dump(final_model, 'gradient_boosting_model.pkl')


# In[89]:

feature_importances = pd.Series(final_model.feature_importances_, index=x.columns)

plt.figure(figsize=(10, 6))
features = feature_importances
features.plot(kind='bar')
plt.xlabel('Feature')
plt.ylabel('Feature Importance Score')
plt.title('Features having an impact on Sales (as ranked by the Gradient Boosting Regression Model)')
# Label each bar with its importance score
for index, value in enumerate(features):
    plt.text(index, value, f'{value:.2f}', ha='center', va='bottom')
plt.show()
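# #### The persisted model can later be loaded back with joblib and fed a frame with the same three columns.
# #### The sketch below is illustrative; the spend values are made-up examples, not rows from the dataset.

# In[ ]:

loaded_model = joblib.load('gradient_boosting_model.pkl')

# Hypothetical advertising budgets (TV, Radio, Newspaper), purely for illustration
new_budgets = pd.DataFrame({'TV': [150.0], 'Radio': [25.0], 'Newspaper': [20.0]})
predicted_sales = loaded_model.predict(new_budgets)
print("Predicted Sales:", predicted_sales[0])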