from functools import reduce
import pandas as pd
import plotly.graph_objects as go
from plotly import express as px
from plotly.colors import n_colors
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode
init_notebook_mode()
pd.options.plotting.backend = "plotly"
This is my attempt on analyzing the data collected by fivethirtyeight when trying to determine which candy is the best. You can read more about the data set, how it was collected etc. here. The data itself is available here (don't forget to have a look in the README).
# Data taken from a fork of the original data source.
df = pd.read_csv(
"https://raw.githubusercontent.com/drorata/fivethirtyeight-data/b22a21b264162ad0b5d8954b02e0bca5ab782113/candy-power-ranking/candy-data.csv"
)
df
competitorname | chocolate | fruity | caramel | peanutyalmondy | nougat | crispedricewafer | hard | bar | pluribus | sugarpercent | pricepercent | winpercent | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 100 Grand | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0.732 | 0.860 | 66.971725 |
1 | 3 Musketeers | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0.604 | 0.511 | 67.602936 |
2 | One dime | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.011 | 0.116 | 32.261086 |
3 | One quarter | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.011 | 0.511 | 46.116505 |
4 | Air Heads | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.906 | 0.511 | 52.341465 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
80 | Twizzlers | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.220 | 0.116 | 45.466282 |
81 | Warheads | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0.093 | 0.116 | 39.011898 |
82 | WelchÕs Fruit Snacks | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0.313 | 0.313 | 44.375519 |
83 | WertherÕs Original Caramel | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0.186 | 0.267 | 41.904308 |
84 | Whoppers | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0.872 | 0.848 | 49.524113 |
85 rows × 13 columns
print(f"Are there missing values? {df.isna().any().any()}")
Are there missing values? False
# For easier visualization (later), discretizing the sugarpercent based on its median.
# Another threshold can be decided (e.g if considering health aspect)
df["high_sugar"] = df.sugarpercent.apply(
lambda x: 1 if x > df.sugarpercent.median() else 0
)
The focus of the analysis is to determine what constitutes a winning candy in terms of winning percentage. The analysis is made under the following assumption:
Assumption: The price of the products is not impacting their winning chances.
My understanding is that the survey was conducted among people who played the role of the recipients of the candies (during Halloween). Therefore, their decision when asked about two candies is independent of the price. Analyzing the price of the items will be discussed after identifying the winning composition of traits.
Question:
Assume that you want to decide which candy to include in your shop offering. There are at least two approaches:
This is an involved business decision that require data while being discussed.
There are 10 different binary traits of each candy. First, a visualization that look into the median of the win percentage depending on the existence of a trait. On the right handside, the presence of the trait (as percentage) is visualized to give an indication how trustworthy the change in the win percentage is.
There are 10 different binary traits of each candy. First, a visualization that look into the median of the win percentage depending on the existence of a trait.
On the right handside, the presence of the trait (as percentage) is visualized to give an indication how trustworthy the change in the win percentage is.
traits = [
# Picking the manually the traits that are taken into account.
"chocolate",
"fruity",
"caramel",
"peanutyalmondy",
"nougat",
"crispedricewafer",
"hard",
"bar",
"pluribus",
"high_sugar",
]
fig = make_subplots(
rows=len(traits),
cols=2,
column_widths=[4, 2],
specs=10
* [[{"type": "xy"}, {"type": "pie"}]], # Supporting mixing of different viz types
subplot_titles=reduce(
# The title of each plot is defined here.
# Needed to flatten a nested list to a single one
lambda x, y: x + y,
[(f"Win % vs. {trait}", f"Presence of {trait}") for trait in traits],
),
)
fill_colors = {0: "rgba(25, 40, 150, 0.5)", 1: "rgba(150, 40, 25, 0.5)"}
for i, trait in enumerate(traits):
for is_trait in [0, 1]:
fig.add_trace(
go.Box(
x=df[df[trait] == is_trait][trait],
y=df[df[trait] == is_trait]["winpercent"],
boxmean="sd",
showlegend=False,
fillcolor=fill_colors[is_trait],
line={"color": fill_colors[is_trait]},
# Some related SO questions:
# - https://stackoverflow.com/q/72110370/671013
# - https://stackoverflow.com/q/72110582/671013
# - https://stackoverflow.com/q/72801725/671013
),
row=i + 1,
col=1,
)
_vcount = df[trait].value_counts(normalize=False)
fig.add_trace(
px.pie(df, values=_vcount.values, names=_vcount.index).data[0], row=i + 1, col=2
)
fig.update_layout(height=2000, width=1200, title_text="Impact on Win chances")
fig
For easier reading of the numbers, preparing a dataframe holding the important values.
median_impact_df = pd.DataFrame(
[
[
trait,
df[df[trait] == 0]["winpercent"].median(),
df[df[trait] == 1]["winpercent"].median(),
df[df[trait] == 0].shape[0],
df[df[trait] == 1].shape[0],
]
for trait in traits
],
columns=["Trait", "Median (without)", "Median (with)", "c_without", "c_with",],
)
# Compute the change of the median of the win percent when comparing with and without a trait
median_impact_df["Median change (pct)"] = 100 * (
median_impact_df["Median (with)"] / median_impact_df["Median (without)"] - 1
)
# Computing the presence (pct) of the trait)
median_impact_df["Presence (pct)"] = 100 * median_impact_df["c_with"] / df.shape[0]
median_impact_df.drop(["c_without", "c_with"], axis=1, inplace=True)
median_impact_df.sort_values(["Presence (pct)"], ascending=False)
Trait | Median (without) | Median (with) | Median change (pct) | Presence (pct) | |
---|---|---|---|---|---|
8 | pluribus | 52.341465 | 45.866287 | -12.371028 | 51.764706 |
9 | high_sugar | 46.056166 | 54.861111 | 19.117842 | 45.882353 |
1 | fruity | 56.490501 | 42.969034 | -23.935824 | 44.705882 |
0 | chocolate | 41.646933 | 60.800701 | 45.990827 | 43.529412 |
7 | bar | 43.732221 | 60.800701 | 39.029528 | 24.705882 |
6 | hard | 52.583706 | 39.460556 | -24.956685 | 17.647059 |
2 | caramel | 46.411716 | 60.164976 | 29.633164 | 16.470588 |
3 | peanutyalmondy | 45.995827 | 70.109714 | 52.426251 | 16.470588 |
4 | nougat | 46.978288 | 60.800701 | 29.422980 | 8.235294 |
5 | crispedricewafer | 46.354157 | 66.470680 | 43.397453 | 8.235294 |
Reading this table:
The higher the presence of a trait is, the change in the median of the win percentage should be taken more seriously
Recommendations: When coming to pick a candy (one or more) following evidences should guide:
At this stage, I would be careful with the other traits as their presences drops (below 17% of the candies).
Next, we can look into the correlation between the different traits:
corr_df = df.drop(
["competitorname", "pricepercent", "winpercent", "sugarpercent"], axis=1
).corr()
fig = px.imshow(corr_df, color_continuous_scale="RdBu")
fig.show()
From the above matrix, one can conclude that a bar snack is a good candidate as it is meeting many of the guidings mentioned above.
Here is a good candidate (my favorite childhood's candy):
Although these are newer version that weren't available back in the days 😇.
Finally, a few points regarding the price of the products. When looking at the overall price distribution there is no clear story:
# Compare to the behavior of the price when not drilled down by traits
px.violin(df, x="pricepercent")
However, by looking at the prices per trait, we see that different traits correspond to different price ranges. This view can help when discussing the business plan and taking into account the prices of the items.
# Help order the cureves in an increasing order.
traits_ordered_by_median_of_price = (
pd.DataFrame(
[[trait, df[df[trait] == 1]["pricepercent"].median()] for trait in traits],
columns=["trait", "median"],
)
.sort_values("median")["trait"]
.values
)
colors = n_colors("rgb(5, 200, 200)", "rgb(200, 10, 10)", len(traits), colortype="rgb")
fig = go.Figure()
for trait, color in zip(traits_ordered_by_median_of_price, colors):
fig.add_trace(
go.Violin(x=df[df[trait] == 1].pricepercent, line_color=color, name=trait)
)
fig.update_annotations()
fig.update_traces(
orientation="h", side="positive", width=3, points="suspectedoutliers",
)
fig.update_layout(xaxis_showgrid=False, xaxis_zeroline=False, height=800)
fig.show()
Note that traits linked to higher win percentage are also more expansive.
px.scatter(
df,
x="pricepercent",
y="winpercent",
trendline="ols",
trendline_color_override="gray",
)
--------------------------------------------------------------------------- ModuleNotFoundError Traceback (most recent call last) /Users/drorata/My Drive/dev/candy-analysis/candy-analysis.ipynb Cell 19 in <cell line: 1>() ----> <a href='vscode-notebook-cell:/Users/drorata/My%20Drive/dev/candy-analysis/candy-analysis.ipynb#X24sZmlsZQ%3D%3D?line=0'>1</a> px.scatter( <a href='vscode-notebook-cell:/Users/drorata/My%20Drive/dev/candy-analysis/candy-analysis.ipynb#X24sZmlsZQ%3D%3D?line=1'>2</a> df, <a href='vscode-notebook-cell:/Users/drorata/My%20Drive/dev/candy-analysis/candy-analysis.ipynb#X24sZmlsZQ%3D%3D?line=2'>3</a> x="pricepercent", <a href='vscode-notebook-cell:/Users/drorata/My%20Drive/dev/candy-analysis/candy-analysis.ipynb#X24sZmlsZQ%3D%3D?line=3'>4</a> y="winpercent", <a href='vscode-notebook-cell:/Users/drorata/My%20Drive/dev/candy-analysis/candy-analysis.ipynb#X24sZmlsZQ%3D%3D?line=4'>5</a> trendline="ols", <a href='vscode-notebook-cell:/Users/drorata/My%20Drive/dev/candy-analysis/candy-analysis.ipynb#X24sZmlsZQ%3D%3D?line=5'>6</a> trendline_color_override="gray", <a href='vscode-notebook-cell:/Users/drorata/My%20Drive/dev/candy-analysis/candy-analysis.ipynb#X24sZmlsZQ%3D%3D?line=6'>7</a> ) File ~/.local/share/virtualenvs/candy-analysis-2Bzwo_Fd/lib/python3.10/site-packages/plotly/express/_chart_types.py:66, in scatter(data_frame, x, y, color, symbol, size, hover_name, hover_data, custom_data, text, facet_row, facet_col, facet_col_wrap, facet_row_spacing, facet_col_spacing, error_x, error_x_minus, error_y, error_y_minus, animation_frame, animation_group, category_orders, labels, orientation, color_discrete_sequence, color_discrete_map, color_continuous_scale, range_color, color_continuous_midpoint, symbol_sequence, symbol_map, opacity, size_max, marginal_x, marginal_y, trendline, trendline_options, trendline_color_override, trendline_scope, log_x, log_y, range_x, range_y, render_mode, title, template, width, height) 12 def scatter( 13 data_frame=None, 14 x=None, (...) 60 height=None, 61 ) -> go.Figure: 62 """ 63 In a scatter plot, each row of `data_frame` is represented by a symbol 64 mark in 2D space. 65 """ ---> 66 return make_figure(args=locals(), constructor=go.Scatter) File ~/.local/share/virtualenvs/candy-analysis-2Bzwo_Fd/lib/python3.10/site-packages/plotly/express/_core.py:2167, in make_figure(args, constructor, trace_patch, layout_patch) 2164 elif args["ecdfnorm"] == "percent": 2165 group[var] = 100.0 * group[var] / group_sum -> 2167 patch, fit_results = make_trace_kwargs( 2168 args, trace_spec, group, mapping_labels.copy(), sizeref 2169 ) 2170 trace.update(patch) 2171 if fit_results is not None: File ~/.local/share/virtualenvs/candy-analysis-2Bzwo_Fd/lib/python3.10/site-packages/plotly/express/_core.py:358, in make_trace_kwargs(args, trace_spec, trace_data, mapping_labels, sizeref) 356 trace_patch["x"] = sorted_trace_data[args["x"]][non_missing] 357 trendline_function = trendline_functions[attr_value] --> 358 y_out, hover_header, fit_results = trendline_function( 359 args["trendline_options"], 360 sorted_trace_data[args["x"]], 361 x, 362 y, 363 args["x"], 364 args["y"], 365 non_missing, 366 ) 367 assert len(y_out) == len( 368 trace_patch["x"] 369 ), "missing-data-handling failure in trendline code" 370 trace_patch["y"] = y_out File ~/.local/share/virtualenvs/candy-analysis-2Bzwo_Fd/lib/python3.10/site-packages/plotly/express/trendline_functions/__init__.py:43, in ols(trendline_options, x_raw, x, y, x_label, y_label, non_missing) 37 if k not in valid_options: 38 raise ValueError( 39 "OLS trendline_options keys must be one of [%s] but got '%s'" 40 % (", ".join(valid_options), k) 41 ) ---> 43 import statsmodels.api as sm 45 add_constant = trendline_options.get("add_constant", True) 46 log_x = trendline_options.get("log_x", False) ModuleNotFoundError: No module named 'statsmodels'
The figure above suggests that there is a positive correlation between the price of the item and its win percent. From a business stand point, there are at least two possible tracks:
Deciding which track to take is kept out of scope for this analysis.
ipykernel
pandas==1.4.2
plotly==5.7.0
ipywidgets==7.7.0
statsmodels==0.13.2
I just realized that the lovely plots are not rendered when served in the blog 🤯. The notebook used to creating this notebook can be found here.
The notebook can be opened using Colab here. Not optimal, but works...