from google.colab import drive drive.mount('/content/drive') import pandas as pd # Load the data from the CSV files campaigns = pd.read_csv('/content/drive/MyDrive/Data - AtliqMart/dim_campaigns.csv') products = pd.read_csv('/content/drive/MyDrive/Data - AtliqMart/dim_products.csv') stores = pd.read_csv('/content/drive/MyDrive/Data - AtliqMart/dim_stores.csv') fact_events = pd.read_csv('/content/drive/MyDrive/Data - AtliqMart/fact_events.csv') # Display the first few rows of each dataframe (campaigns.head(), products.head(), stores.head(), fact_events.head()) # Join the tables on their respective keys data = ( fact_events.merge(campaigns, how="left", on="campaign_id") .merge(products, how="left", on="product_code") .merge(stores, how="left", on="store_id") ) # Display the first few rows of the merged DataFrame pd.set_option('display.max_columns', None) data.head() data['category'].value_counts() import pandas as pd # a. Revenue Before --> To understand baseline performance. data['revenue_before'] = data['base_price'] * data['quantity_sold(before_promo)'] # b. Quantity Sold --> Adjusts the quantity sold based on promotion type (e.g., doubling for BOGOF) to reflect actual sales impact. data['quantity_sold'] = data.apply( lambda row: row['quantity_sold(after_promo)'] * 2 if row['promo_type'] == 'BOGOF' else row['quantity_sold(after_promo)'], axis=1 ) # c. Total Discount --> Computes the total discount amount applied to each sale to analyze the financial impact of promotions. def calculate_discount(row): discount_rate = 0 if row['promo_type'] == '25% OFF': discount_rate = 0.25 elif row['promo_type'] == '50% OFF': discount_rate = 0.50 elif row['promo_type'] == '33% OFF': discount_rate = 0.33 elif row['promo_type'] == 'BOGOF': discount_rate = 0.50 elif row['promo_type'] == '500 Cashback': return 500 * row['quantity_sold'] return discount_rate * row['base_price'] * row['quantity_sold'] data['total_discount'] = data.apply(calculate_discount, axis=1) # d. Actual Revenue --> Determines the actual revenue after accounting for discounts, providing a true measure of revenue. data['actual_revenue'] = (data['base_price'] * data['quantity_sold']) - data['total_discount'] # Display the DataFrame with new calculated columns pd.set_option('display.max_columns', None) data.head() import pandas as pd import matplotlib.pyplot as plt # Calculate the number of stores in each city store_count = data.groupby('city')['store_id'].nunique().sort_values(ascending=False) # Create a bar plot for the number of stores plt.figure(figsize=(14, 8)) bars = plt.bar(store_count.index, store_count.values, color='#0096FF', alpha=0.7) # Add labels to bars for bar in bars: yval = bar.get_height() plt.text(bar.get_x() + bar.get_width()/2.0, yval, int(yval), va='bottom', ha='center', color='tab:blue', fontsize=10) # Set title and labels plt.title('Number of Stores by City') plt.xlabel('City') plt.ylabel('Number of Stores') # Show the plot plt.xticks(rotation=45) plt.show() import pandas as pd import matplotlib.pyplot as plt # Calculate the number of stores in each city store_count = data.groupby('city')['store_id'].nunique().sort_values(ascending=False) # Calculate total revenue before and after promotions for each city revenue_before_city = data.groupby('city')['revenue_before'].sum() / 1_000_000 # Convert to millions actual_revenue_city = data.groupby('city')['actual_revenue'].sum() / 1_000_000 # Convert to millions # Sort revenues based on store count order revenue_before_city = revenue_before_city.loc[store_count.index] actual_revenue_city = actual_revenue_city.loc[store_count.index] # Create a plot for the revenues plt.figure(figsize=(14, 8)) # Plotting the actual revenue line chart plt.plot(store_count.index, actual_revenue_city.values, color='tab:orange', marker='o', label='Revenue After Promo') # Plotting the revenue before promotion line chart plt.plot(store_count.index, revenue_before_city.values, color='tab:blue', marker='o', label='Revenue Before Promo') # Adding labels to lines for i, txt in enumerate(actual_revenue_city.values): plt.annotate(f'{txt:.2f}M', (store_count.index[i], actual_revenue_city.values[i]), textcoords="offset points", xytext=(0,10), ha='center', color='tab:orange') for i, txt in enumerate(revenue_before_city.values): plt.annotate(f'{txt:.2f}M', (store_count.index[i], revenue_before_city.values[i]), textcoords="offset points", xytext=(0,10), ha='center', color='tab:blue') # Adding legend plt.legend(loc='upper right') # Set title and labels plt.title('Revenue Before and After Promotions by City') plt.xlabel('City') plt.ylabel('Revenue (in Millions)') # Show the plot plt.xticks(rotation=45) plt.show() import pandas as pd import matplotlib.pyplot as plt # Calculate the number of stores in each city store_count = data.groupby('city')['store_id'].nunique().sort_values(ascending=False) # Calculate total revenue before and after promotions for each city revenue_before_city = data.groupby('city')['revenue_before'].sum() / 1_000_000 # Convert to millions revenue_after_city = data.groupby('city')['actual_revenue'].sum() / 1_000_000 # Convert to millions # Sort revenues based on store count order revenue_before_city = revenue_before_city.loc[store_count.index] revenue_after_city = revenue_after_city.loc[store_count.index] # Create a bar plot for the number of stores fig, ax1 = plt.subplots(figsize=(14, 8)) ax1.set_xlabel('City') ax1.set_ylabel('Number of Stores', color='#0096FF') # Cobalt blue shade bars = ax1.bar(store_count.index, store_count.values, color='#0096FF', alpha=0.7, label='Number of Stores') # Add labels to bars for bar in bars: yval = bar.get_height() ax1.text(bar.get_x() + bar.get_width()/2.0, yval, int(yval), va='bottom', ha='center', color='tab:blue', fontsize=10) # Create a line plot for revenue before and after promotions ax2 = ax1.twinx() # instantiate a second y-axis that shares the same x-axis ax2.set_ylabel('Revenue (in Millions)', color='tab:red') line1, = ax2.plot(revenue_before_city.index, revenue_before_city.values, color='tab:blue', marker='o', label='Revenue Before Promo (M)') line2, = ax2.plot(revenue_after_city.index, revenue_after_city.values, color='tab:orange', marker='o', label='Revenue After Promo (M)') # Add labels to lines for i, txt in enumerate(revenue_before_city.values): ax2.annotate(f'{txt:.2f}M', (revenue_before_city.index[i], revenue_before_city.values[i]), textcoords="offset points", xytext=(0,10), ha='center', color='tab:blue') for i, txt in enumerate(revenue_after_city.values): ax2.annotate(f'{txt:.2f}M', (revenue_after_city.index[i], revenue_after_city.values[i]), textcoords="offset points", xytext=(0,10), ha='center', color='tab:orange') # Adding legend lines = [line1, line2] ax2.legend(lines, [l.get_label() for l in lines], loc='upper right') # Title and labels plt.title('Performance of Stores by City') fig.tight_layout() # to ensure the right y-label is not slightly clipped # Show the plot plt.show() import pandas as pd # Calculate Incremental Sold Units (ISU) data['Incremental Sold Units (ISU)'] = data['quantity_sold(after_promo)'] - data['quantity_sold(before_promo)'] # Calculate Incremental Sold Units (ISU) % data['Incremental Sold Units (ISU) %'] = (data['Incremental Sold Units (ISU)'] / data['quantity_sold(before_promo)']) * 100 # Sort by quantity_sold(after_promo) in descending order sorted_data = data.sort_values(by='quantity_sold(after_promo)', ascending=False) # Select the relevant columns result = sorted_data[['store_id', 'city', 'quantity_sold(before_promo)', 'quantity_sold(after_promo)', 'Incremental Sold Units (ISU)', 'Incremental Sold Units (ISU) %']] # Display the result result diff_category = data['category'].value_counts() diff_category import pandas as pd import matplotlib.pyplot as plt # Assuming 'data' is your DataFrame # Use value_counts to count occurrences of each category for before and after quantity category_counts_before = data.groupby('category')['quantity_sold(before_promo)'].sum() category_counts_after = data.groupby('category')['quantity_sold'].sum() # Create a figure with two subplots fig, axs = plt.subplots(1, 2, figsize=(14, 7)) # Donut chart for quantity before axs[0].pie(category_counts_before, labels=category_counts_before.index, autopct='%1.1f%%', startangle=90) centre_circle = plt.Circle((0,0),0.70,fc='white') axs[0].add_artist(centre_circle) axs[0].set_title('Sales Dynamic Before Promo') # Pie chart for quantity after axs[1].pie(category_counts_after, labels=category_counts_after.index, autopct='%1.1f%%', startangle=90) axs[1].set_title('Sales Dynamic After Promo') # Equal aspect ratio ensures that pie is drawn as a circle. axs[0].axis('equal') axs[1].axis('equal') plt.legend(category_counts_before.index, title="Category's", loc= "lower center") plt.show() import pandas as pd # Assuming 'data' is your DataFrame table_data = { 'Store ID': data['store_id'], 'Qty Before': data['quantity_sold(before_promo)'], 'Qty After': data['quantity_sold'], 'Revenue Before': data['revenue_before'], 'Revenue After': data['actual_revenue'], 'IR': (data['quantity_sold'] - data['quantity_sold(before_promo)']), 'IR %': ((data['quantity_sold'] - data['quantity_sold(before_promo)']) / data['quantity_sold(before_promo)']) * 100, 'ISU': (data['actual_revenue'] - data['revenue_before']), 'ISU %': ((data['actual_revenue'] - data['revenue_before']) / data['revenue_before']) * 100 } # Create DataFrame from table_data table_df = pd.DataFrame(table_data) # Display the table table_df import plotly.graph_objects as go # Assuming 'data' is your DataFrame # Use value_counts to count occurrences of each promo type promo_counts = data['promo_type'].value_counts() # Create a waterfall chart fig = go.Figure(go.Waterfall( orientation = "v", measure = ["relative"] * len(promo_counts) + ["total"], x = promo_counts.index.tolist() + ["Total"], y = promo_counts.tolist() + [promo_counts.sum()], textposition = "outside", text = ["+{}".format(val) for val in promo_counts.tolist()] + ["Total"], connector = {"line":{"color":"rgb(63, 63, 63)"}}, )) fig.update_layout( title = "Promotype Frequency", xaxis_title = "Promo Type", yaxis_title = "Frequency", showlegend = False ) fig.show() import pandas as pd # Assuming 'data' is your DataFrame table_data = { 'Product Name': data['product_name'], 'Campaign Name': data['campaign_name'], 'Promo Type': data['promo_type'], 'Qty Sold': data['quantity_sold'], 'Revenue': data['actual_revenue'], 'IR%': ((data['quantity_sold'] - data['quantity_sold(before_promo)']) / data['quantity_sold(before_promo)']) * 100, 'ISU%': ((data['actual_revenue'] - data['revenue_before']) / data['revenue_before']) * 100 } # Create DataFrame from table_data table_df = pd.DataFrame(table_data) # Display the table table_df import pandas as pd import matplotlib.pyplot as plt # Assuming 'data' is your DataFrame # Group by category and sum the quantity sold category_qty = data.groupby('category')['quantity_sold'].sum() # Sort categories by quantity sold category_qty_sorted = category_qty.sort_values(ascending=False) # Plot the bar chart plt.figure(figsize=(10, 6)) bars = plt.bar(category_qty_sorted.index, category_qty_sorted.values) # Add labels to the bars for bar in bars: yval = bar.get_height() plt.text(bar.get_x() + bar.get_width()/2, yval, f'{yval/1000:.1f}k', va='bottom') # Customize the plot plt.xlabel('Category') plt.ylabel('Quantity Sold') plt.title('Quantity Sold by Category') plt.xticks(rotation=45) plt.tight_layout() plt.show() import pandas as pd import matplotlib.pyplot as plt # Assuming 'data' is your DataFrame # Use value_counts to count occurrences of each category category_counts = data['category'].value_counts() # Create a donut chart plt.figure(figsize=(8, 8)) plt.pie(category_counts, labels=category_counts.index, autopct='%1.1f%%', startangle=90) # Draw a circle at the center to make it a donut chart centre_circle = plt.Circle((0,0),0.70,fc='white') fig = plt.gcf() fig.gca().add_artist(centre_circle) # Equal aspect ratio ensures that pie is drawn as a circle. plt.axis('equal') plt.title('Donut Chart of Category Distribution') plt.legend(category_counts.index, title="Category", loc = "lower center") plt.show()