#!/usr/bin/env python
# coding: utf-8

# In[1]:


from functools import reduce

import pandas as pd
import plotly.graph_objects as go
from plotly import express as px
from plotly.colors import n_colors
from plotly.offline import init_notebook_mode
from plotly.subplots import make_subplots

init_notebook_mode()
pd.options.plotting.backend = "plotly"


# This is my attempt at analyzing the data collected by [fivethirtyeight](https://fivethirtyeight.com/) when trying to determine which candy is the best.
# You can read more about the data set, how it was collected etc. [here](https://fivethirtyeight.com/videos/the-ultimate-halloween-candy-power-ranking/).
# The data itself is available [here](https://github.com/fivethirtyeight/data/blob/master/candy-power-ranking/candy-data.csv) (don't forget to have a look at the [README](https://github.com/fivethirtyeight/data/blob/master/candy-power-ranking/README.md)).

# In[2]:


# Data taken from a fork of the original data source.
# The URL is pinned to a specific commit so the analysis stays reproducible.
df = pd.read_csv(
    "https://raw.githubusercontent.com/drorata/fivethirtyeight-data/b22a21b264162ad0b5d8954b02e0bca5ab782113/candy-power-ranking/candy-data.csv"
)
df


# In[3]:


print(f"Are there missing values? {df.isna().any().any()}")


# In[4]:


# For easier visualization (later), discretize sugarpercent based on its median.
# Another threshold can be chosen (e.g. if considering health aspects).
# A single vectorized comparison is used so the median is computed only once
# instead of once per row.
df["high_sugar"] = (df.sugarpercent > df.sugarpercent.median()).astype(int)


# ## Analysis focus
#
# The focus of the analysis is to determine what constitutes a winning candy **in terms of winning percentage**.
# The analysis is made under the following assumption:
#
# **Assumption:**
# The price of the products does not impact their winning chances.
#
# My understanding is that the survey was conducted among people who played the role of the recipients of the candies (during Halloween).
# Therefore, their decision when asked about two candies is independent of the price.
# Analyzing the price of the items will be discussed after identifying the winning composition of traits.
#
# **Question:**
#
# Assume that you want to decide which candy to include in your shop offering.
# There are at least two approaches:
# 1. Pick one or more _existing_ candies and add them to the shop.
# 2. Develop a new candy that will be used.
#
# This is an involved *business* decision that requires data while being discussed.
#
# ## Candies Traits
#
# There are 10 different binary traits of each candy.
# First, a visualization that looks into the median of the win percentage depending on the existence of a trait.
#
# On the right-hand side, the presence of the trait (as percentage) is visualized to give an indication of how trustworthy the change in the win percentage is.

# In[5]:


traits = [
    # Manually picking the traits that are taken into account.
    "chocolate",
    "fruity",
    "caramel",
    "peanutyalmondy",
    "nougat",
    "crispedricewafer",
    "hard",
    "bar",
    "pluribus",
    "high_sugar",
]

fig = make_subplots(
    rows=len(traits),
    cols=2,
    column_widths=[4, 2],
    # One (box plot, pie chart) pair per trait; mixing different viz types
    # requires explicit specs. Derived from `traits` so the grid always
    # matches the number of rows.
    specs=[[{"type": "xy"}, {"type": "pie"}] for _ in traits],
    # The title of each plot is defined here.
    # The per-trait (left, right) title pairs are flattened into a single list.
    subplot_titles=[
        title
        for trait in traits
        for title in (f"Win % vs. {trait}", f"Presence of {trait}")
    ],
)

fill_colors = {0: "rgba(25, 40, 150, 0.5)", 1: "rgba(150, 40, 25, 0.5)"}

for i, trait in enumerate(traits):
    for is_trait in [0, 1]:
        # Rows of candies without (0) / with (1) the current trait.
        subset = df[df[trait] == is_trait]
        fig.add_trace(
            go.Box(
                x=subset[trait],
                y=subset["winpercent"],
                boxmean="sd",
                showlegend=False,
                fillcolor=fill_colors[is_trait],
                line={"color": fill_colors[is_trait]},
                # Some related SO questions:
                # - https://stackoverflow.com/q/72110370/671013
                # - https://stackoverflow.com/q/72110582/671013
                # - https://stackoverflow.com/q/72801725/671013
            ),
            row=i + 1,
            col=1,
        )
    _vcount = df[trait].value_counts(normalize=False)
    # Build the pie with plotly express and reuse its single trace in the subplot grid.
    fig.add_trace(
        px.pie(df, values=_vcount.values, names=_vcount.index).data[0], row=i + 1, col=2
    )

fig.update_layout(height=2000, width=1200, title_text="Impact on Win chances")
fig


# For easier reading of the numbers, preparing a dataframe holding the important values.

# In[6]:


median_impact_df = pd.DataFrame(
    [
        [
            trait,
            df[df[trait] == 0]["winpercent"].median(),
            df[df[trait] == 1]["winpercent"].median(),
            df[df[trait] == 0].shape[0],
            df[df[trait] == 1].shape[0],
        ]
        for trait in traits
    ],
    columns=["Trait", "Median (without)", "Median (with)", "c_without", "c_with"],
)
# Compute the change of the median of the win percent when comparing with and without a trait
median_impact_df["Median change (pct)"] = 100 * (
    median_impact_df["Median (with)"] / median_impact_df["Median (without)"] - 1
)
# Compute the presence (pct) of the trait
median_impact_df["Presence (pct)"] = 100 * median_impact_df["c_with"] / df.shape[0]
# The raw counts were only needed for the presence computation.
median_impact_df = median_impact_df.drop(columns=["c_without", "c_with"])
median_impact_df.sort_values(["Presence (pct)"], ascending=False)


# **Reading this table:**
# > The higher the presence of a trait is, the more seriously the change in the median of the win percentage should be taken
#
# **Recommendations:**
# When picking a candy (one or more), the following evidence should guide the decision:
#
# - No pluribus - the win percentage drops by 12%.
# - Candy should have high sugar content (a 19% increase in the win percentage)
# - Candy should *not* be fruity (a loss of almost 24%)
# - Candy should contain chocolate (46% increase)
# - A bar candy is favorable (39% increase)
# - Soft candies win more (-25% for the hard ones)
#
# At this stage, I would be careful with the other traits as their presence drops (below 17% of the candies).
#
# Next, we can look into the correlation between the different traits:

# In[7]:


# Only the binary traits are correlated; the name and the continuous columns are left out.
corr_df = df.drop(
    ["competitorname", "pricepercent", "winpercent", "sugarpercent"], axis=1
).corr()
fig = px.imshow(corr_df, color_continuous_scale="RdBu")
fig.show()


# From the above matrix, one can conclude that a *bar* snack is a good candidate as it meets many of the guidelines mentioned above.
#
# Here is a good candidate (my favorite childhood candy):
#
#
#
# Although this is a newer version that wasn't available back in the day 😇.

# ## The Price
#
# Finally, a few points regarding the price of the products.
# When looking at the overall price distribution there is no clear story:

# In[8]:


# Compare to the behavior of the price when not drilled down by traits
px.violin(df, x="pricepercent")


# However, by looking at the prices per trait, we see that different traits correspond to different price ranges.
# This view can help when discussing the business plan and taking into account the prices of the items.

# In[9]:


# Help order the curves in an increasing order.
# Traits sorted by the median price of the candies that have the trait,
# so the violins below appear in increasing price order.
traits_ordered_by_median_of_price = (
    pd.DataFrame(
        [[trait, df[df[trait] == 1]["pricepercent"].median()] for trait in traits],
        columns=["trait", "median"],
    )
    .sort_values("median")["trait"]
    .values
)
# One color per trait, interpolated between the two end colors.
colors = n_colors("rgb(5, 200, 200)", "rgb(200, 10, 10)", len(traits), colortype="rgb")

fig = go.Figure()
for trait, color in zip(traits_ordered_by_median_of_price, colors):
    fig.add_trace(
        go.Violin(x=df[df[trait] == 1].pricepercent, line_color=color, name=trait)
    )
fig.update_annotations()
fig.update_traces(
    orientation="h",
    side="positive",
    width=3,
    points="suspectedoutliers",
)
fig.update_layout(xaxis_showgrid=False, xaxis_zeroline=False, height=800)
fig.show()


# ### Price vs. winning
#
# Note that traits linked to higher win percentage are also more expensive.

# In[10]:


# trendline="ols" relies on statsmodels (listed in the environment section below).
px.scatter(
    df,
    x="pricepercent",
    y="winpercent",
    trendline="ols",
    trendline_color_override="gray",
)


# The figure above suggests that there is a positive correlation between the price of the item and its win percent.
# From a business standpoint, there are at least two possible tracks:
# - Favor "quality"; focus on more expensive candies that aim at higher customer satisfaction. The potential tradeoff in this case can be a reduced business volume.
# - Favor "volume"; aim at "low" end products that gain less winning percentage, but are cheaper and thus yield higher volumes.
#
# Deciding which track to take is kept out of scope for this analysis.
#

# # The environment
#
# ```
# ipykernel
# pandas==1.4.2
# plotly==5.7.0
# ipywidgets==7.7.0
# statsmodels==0.13.2
# ```
#
#

# # Oh no...
#
# I just realized that the lovely plots are not rendered when served in the blog 🤯.
# The notebook used for creating this post can be found [here](https://gist.github.com/drorata/120b25ec127a31f24e54d34ddbec08c5/858150285415ac997935a01a5ac9f613e5cbec34).
#
# The notebook can be opened using Colab [here](https://colab.research.google.com/github/drorata/drorata.github.io/blob/eeb65d9cc5eca4ae2f47769b9ec94ca82c707152/content/notebooks/candy-analysis.ipynb).
# Not optimal, but works...