In [1]:

from functools import reduce

import pandas as pd
import plotly.graph_objects as go
from plotly import express as px
from plotly.colors import n_colors
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode

init_notebook_mode()
pd.options.plotting.backend = "plotly"

This is my attempt on analyzing the data collected by fivethirtyeight when trying to determine which candy is the best. You can read more about the data set, how it was collected etc. here. The data itself is available here (don't forget to have a look in the README).

In [2]:

# Data taken from a fork of the original data source.
df = pd.read_csv(
    "https://raw.githubusercontent.com/drorata/fivethirtyeight-data/b22a21b264162ad0b5d8954b02e0bca5ab782113/candy-power-ranking/candy-data.csv"
)
df

Out[2]:

	competitorname	chocolate	fruity	caramel	peanutyalmondy	nougat	crispedricewafer	hard	bar	pluribus	sugarpercent	pricepercent	winpercent
0	100 Grand	1	0	1	0	0	1	0	1	0	0.732	0.860	66.971725
1	3 Musketeers	1	0	0	0	1	0	0	1	0	0.604	0.511	67.602936
2	One dime	0	0	0	0	0	0	0	0	0	0.011	0.116	32.261086
3	One quarter	0	0	0	0	0	0	0	0	0	0.011	0.511	46.116505
4	Air Heads	0	1	0	0	0	0	0	0	0	0.906	0.511	52.341465
...	...	...	...	...	...	...	...	...	...	...	...	...	...
80	Twizzlers	0	1	0	0	0	0	0	0	0	0.220	0.116	45.466282
81	Warheads	0	1	0	0	0	0	1	0	0	0.093	0.116	39.011898
82	WelchÕs Fruit Snacks	0	1	0	0	0	0	0	0	1	0.313	0.313	44.375519
83	WertherÕs Original Caramel	0	0	1	0	0	0	1	0	0	0.186	0.267	41.904308
84	Whoppers	1	0	0	0	0	1	0	0	1	0.872	0.848	49.524113

85 rows × 13 columns

In [3]:

print(f"Are there missing values? {df.isna().any().any()}")

Are there missing values? False

In [4]:

# For easier visualization (later), discretizing the sugarpercent based on its median.
# Another threshold can be decided (e.g if considering health aspect)
df["high_sugar"] = df.sugarpercent.apply(
    lambda x: 1 if x > df.sugarpercent.median() else 0
)

Analysis focus¶

The focus of the analysis is to determine what constitutes a winning candy in terms of winning percentage. The analysis is made under the following assumption:

Assumption: The price of the products is not impacting their winning chances.

My understanding is that the survey was conducted among people who played the role of the recipients of the candies (during Halloween). Therefore, their decision when asked about two candies is independent of the price. Analyzing the price of the items will be discussed after identifying the winning composition of traits.

Question:

Assume that you want to decide which candy to include in your shop offering. There are at least two approaches:

Pick one or more existing candies and add them to the shop.
Develop a new candy that will be used.

This is an involved business decision that require data while being discussed.

Candies Traits¶

There are 10 different binary traits of each candy. First, a visualization that look into the median of the win percentage depending on the existence of a trait. On the right handside, the presence of the trait (as percentage) is visualized to give an indication how trustworthy the change in the win percentage is.

Candies Traits¶

There are 10 different binary traits of each candy. First, a visualization that look into the median of the win percentage depending on the existence of a trait.

On the right handside, the presence of the trait (as percentage) is visualized to give an indication how trustworthy the change in the win percentage is.

In [5]:

traits = [
    # Picking the manually the traits that are taken into account.
    "chocolate",
    "fruity",
    "caramel",
    "peanutyalmondy",
    "nougat",
    "crispedricewafer",
    "hard",
    "bar",
    "pluribus",
    "high_sugar",
]

fig = make_subplots(
    rows=len(traits),
    cols=2,
    column_widths=[4, 2],
    specs=10
    * [[{"type": "xy"}, {"type": "pie"}]],  # Supporting mixing of different viz types
    subplot_titles=reduce(
        # The title of each plot is defined here.
        # Needed to flatten a nested list to a single one
        lambda x, y: x + y,
        [(f"Win % vs. {trait}", f"Presence of {trait}") for trait in traits],
    ),
)

fill_colors = {0: "rgba(25, 40, 150, 0.5)", 1: "rgba(150, 40, 25, 0.5)"}

for i, trait in enumerate(traits):
    for is_trait in [0, 1]:
        fig.add_trace(
            go.Box(
                x=df[df[trait] == is_trait][trait],
                y=df[df[trait] == is_trait]["winpercent"],
                boxmean="sd",
                showlegend=False,
                fillcolor=fill_colors[is_trait],
                line={"color": fill_colors[is_trait]},
                # Some related SO questions:
                # - https://stackoverflow.com/q/72110370/671013
                # - https://stackoverflow.com/q/72110582/671013
                # - https://stackoverflow.com/q/72801725/671013
            ),
            row=i + 1,
            col=1,
        )
    _vcount = df[trait].value_counts(normalize=False)
    fig.add_trace(
        px.pie(df, values=_vcount.values, names=_vcount.index).data[0], row=i + 1, col=2
    )
fig.update_layout(height=2000, width=1200, title_text="Impact on Win chances")
fig

For easier reading of the numbers, preparing a dataframe holding the important values.

In [6]:

median_impact_df = pd.DataFrame(
    [
        [
            trait,
            df[df[trait] == 0]["winpercent"].median(),
            df[df[trait] == 1]["winpercent"].median(),
            df[df[trait] == 0].shape[0],
            df[df[trait] == 1].shape[0],
        ]
        for trait in traits
    ],
    columns=["Trait", "Median (without)", "Median (with)", "c_without", "c_with",],
)

# Compute the change of the median of the win percent when comparing with and without a trait
median_impact_df["Median change (pct)"] = 100 * (
    median_impact_df["Median (with)"] / median_impact_df["Median (without)"] - 1
)

# Computing the presence (pct) of the trait)
median_impact_df["Presence (pct)"] = 100 * median_impact_df["c_with"] / df.shape[0]
median_impact_df.drop(["c_without", "c_with"], axis=1, inplace=True)

median_impact_df.sort_values(["Presence (pct)"], ascending=False)

Out[6]:

	Trait	Median (without)	Median (with)	Median change (pct)	Presence (pct)
8	pluribus	52.341465	45.866287	-12.371028	51.764706
9	high_sugar	46.056166	54.861111	19.117842	45.882353
1	fruity	56.490501	42.969034	-23.935824	44.705882
0	chocolate	41.646933	60.800701	45.990827	43.529412
7	bar	43.732221	60.800701	39.029528	24.705882
6	hard	52.583706	39.460556	-24.956685	17.647059
2	caramel	46.411716	60.164976	29.633164	16.470588
3	peanutyalmondy	45.995827	70.109714	52.426251	16.470588
4	nougat	46.978288	60.800701	29.422980	8.235294
5	crispedricewafer	46.354157	66.470680	43.397453	8.235294

Reading this table:

The higher the presence of a trait is, the change in the median of the win percentage should be taken more seriously

Recommendations: When coming to pick a candy (one or more) following evidences should guide:

No pluribus - the win percentage drops by 12%.
Candy should have high sugar content (a 19% increase in the win percentage)
Candy should not be fruity (a loss of almost 24%)
Candy should contain chocolate (46% increase)
A bar candy is favorable (39% increase)
Soft candies wins more (-25% for the hard ones)

At this stage, I would be careful with the other traits as their presences drops (below 17% of the candies).

Next, we can look into the correlation between the different traits:

In [7]:

corr_df = df.drop(
    ["competitorname", "pricepercent", "winpercent", "sugarpercent"], axis=1
).corr()
fig = px.imshow(corr_df, color_continuous_scale="RdBu")
fig.show()

From the above matrix, one can conclude that a bar snack is a good candidate as it is meeting many of the guidings mentioned above.

Here is a good candidate (my favorite childhood's candy):

Although these are newer version that weren't available back in the days 😇.

The Price¶

Finally, a few points regarding the price of the products. When looking at the overall price distribution there is no clear story:

In [8]:

# Compare to the behavior of the price when not drilled down by traits
px.violin(df, x="pricepercent")

However, by looking at the prices per trait, we see that different traits correspond to different price ranges. This view can help when discussing the business plan and taking into account the prices of the items.

In [9]:

# Help order the cureves in an increasing order.
traits_ordered_by_median_of_price = (
    pd.DataFrame(
        [[trait, df[df[trait] == 1]["pricepercent"].median()] for trait in traits],
        columns=["trait", "median"],
    )
    .sort_values("median")["trait"]
    .values
)

colors = n_colors("rgb(5, 200, 200)", "rgb(200, 10, 10)", len(traits), colortype="rgb")

fig = go.Figure()
for trait, color in zip(traits_ordered_by_median_of_price, colors):
    fig.add_trace(
        go.Violin(x=df[df[trait] == 1].pricepercent, line_color=color, name=trait)
    )
    fig.update_annotations()

fig.update_traces(
    orientation="h", side="positive", width=3, points="suspectedoutliers",
)
fig.update_layout(xaxis_showgrid=False, xaxis_zeroline=False, height=800)
fig.show()

Price vs. winning¶

Note that traits linked to higher win percentage are also more expansive.

In [10]:

px.scatter(
    df,
    x="pricepercent",
    y="winpercent",
    trendline="ols",
    trendline_color_override="gray",
)

---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
/Users/drorata/My Drive/dev/candy-analysis/candy-analysis.ipynb Cell 19 in <cell line: 1>()
----> <a href='vscode-notebook-cell:/Users/drorata/My%20Drive/dev/candy-analysis/candy-analysis.ipynb#X24sZmlsZQ%3D%3D?line=0'>1</a> px.scatter(
      <a href='vscode-notebook-cell:/Users/drorata/My%20Drive/dev/candy-analysis/candy-analysis.ipynb#X24sZmlsZQ%3D%3D?line=1'>2</a>     df,
      <a href='vscode-notebook-cell:/Users/drorata/My%20Drive/dev/candy-analysis/candy-analysis.ipynb#X24sZmlsZQ%3D%3D?line=2'>3</a>     x="pricepercent",
      <a href='vscode-notebook-cell:/Users/drorata/My%20Drive/dev/candy-analysis/candy-analysis.ipynb#X24sZmlsZQ%3D%3D?line=3'>4</a>     y="winpercent",
      <a href='vscode-notebook-cell:/Users/drorata/My%20Drive/dev/candy-analysis/candy-analysis.ipynb#X24sZmlsZQ%3D%3D?line=4'>5</a>     trendline="ols",
      <a href='vscode-notebook-cell:/Users/drorata/My%20Drive/dev/candy-analysis/candy-analysis.ipynb#X24sZmlsZQ%3D%3D?line=5'>6</a>     trendline_color_override="gray",
      <a href='vscode-notebook-cell:/Users/drorata/My%20Drive/dev/candy-analysis/candy-analysis.ipynb#X24sZmlsZQ%3D%3D?line=6'>7</a> )

File ~/.local/share/virtualenvs/candy-analysis-2Bzwo_Fd/lib/python3.10/site-packages/plotly/express/_chart_types.py:66, in scatter(data_frame, x, y, color, symbol, size, hover_name, hover_data, custom_data, text, facet_row, facet_col, facet_col_wrap, facet_row_spacing, facet_col_spacing, error_x, error_x_minus, error_y, error_y_minus, animation_frame, animation_group, category_orders, labels, orientation, color_discrete_sequence, color_discrete_map, color_continuous_scale, range_color, color_continuous_midpoint, symbol_sequence, symbol_map, opacity, size_max, marginal_x, marginal_y, trendline, trendline_options, trendline_color_override, trendline_scope, log_x, log_y, range_x, range_y, render_mode, title, template, width, height)
     12 def scatter(
     13     data_frame=None,
     14     x=None,
   (...)
     60     height=None,
     61 ) -> go.Figure:
     62     """
     63     In a scatter plot, each row of `data_frame` is represented by a symbol
     64     mark in 2D space.
     65     """
---> 66     return make_figure(args=locals(), constructor=go.Scatter)

File ~/.local/share/virtualenvs/candy-analysis-2Bzwo_Fd/lib/python3.10/site-packages/plotly/express/_core.py:2167, in make_figure(args, constructor, trace_patch, layout_patch)
   2164     elif args["ecdfnorm"] == "percent":
   2165         group[var] = 100.0 * group[var] / group_sum
-> 2167 patch, fit_results = make_trace_kwargs(
   2168     args, trace_spec, group, mapping_labels.copy(), sizeref
   2169 )
   2170 trace.update(patch)
   2171 if fit_results is not None:

File ~/.local/share/virtualenvs/candy-analysis-2Bzwo_Fd/lib/python3.10/site-packages/plotly/express/_core.py:358, in make_trace_kwargs(args, trace_spec, trace_data, mapping_labels, sizeref)
    356 trace_patch["x"] = sorted_trace_data[args["x"]][non_missing]
    357 trendline_function = trendline_functions[attr_value]
--> 358 y_out, hover_header, fit_results = trendline_function(
    359     args["trendline_options"],
    360     sorted_trace_data[args["x"]],
    361     x,
    362     y,
    363     args["x"],
    364     args["y"],
    365     non_missing,
    366 )
    367 assert len(y_out) == len(
    368     trace_patch["x"]
    369 ), "missing-data-handling failure in trendline code"
    370 trace_patch["y"] = y_out

File ~/.local/share/virtualenvs/candy-analysis-2Bzwo_Fd/lib/python3.10/site-packages/plotly/express/trendline_functions/__init__.py:43, in ols(trendline_options, x_raw, x, y, x_label, y_label, non_missing)
     37     if k not in valid_options:
     38         raise ValueError(
     39             "OLS trendline_options keys must be one of [%s] but got '%s'"
     40             % (", ".join(valid_options), k)
     41         )
---> 43 import statsmodels.api as sm
     45 add_constant = trendline_options.get("add_constant", True)
     46 log_x = trendline_options.get("log_x", False)

ModuleNotFoundError: No module named 'statsmodels'

The figure above suggests that there is a positive correlation between the price of the item and its win percent. From a business stand point, there are at least two possible tracks:

Favor "quality"; focus on more expansive candies that aim at higher customer satisfaction. The potential tradeoff in this case can be a reduced business volume.
Favor "volume"; aim at "low" end products that gain less winning percentage, but are cheaper and thus yield higher volumes.

Deciding which track to take is kept out of scope for this analysis.

The environment¶

ipykernel
pandas==1.4.2
plotly==5.7.0
ipywidgets==7.7.0
statsmodels==0.13.2

Oh no...¶

I just realized that the lovely plots are not rendered when served in the blog 🤯. The notebook used to creating this notebook can be found here.

The notebook can be opened using Colab here. Not optimal, but works...