#!/usr/bin/env python
# coding: utf-8

# In[1]:


from functools import reduce

import pandas as pd
import plotly.graph_objects as go
from plotly import express as px
from plotly.colors import n_colors
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode

# Make plotly the plotting backend used by pandas and initialise plotly's
# offline notebook mode.
pd.set_option("plotting.backend", "plotly")
init_notebook_mode()


# This is my attempt at analyzing the data collected by [fivethirtyeight](https://fivethirtyeight.com/) when trying to determine which candy is the best.
# You can read more about the data set, how it was collected etc. [here](https://fivethirtyeight.com/videos/the-ultimate-halloween-candy-power-ranking/).
# The data itself is available [here](https://github.com/fivethirtyeight/data/blob/master/candy-power-ranking/candy-data.csv) (don't forget to have a look in the [README](https://github.com/fivethirtyeight/data/blob/master/candy-power-ranking/README.md)).

# In[2]:


# Data taken from a fork of the original data source (the URL embeds a
# commit hash of that fork).
_CANDY_DATA_URL = (
    "https://raw.githubusercontent.com/drorata/fivethirtyeight-data/"
    "b22a21b264162ad0b5d8954b02e0bca5ab782113/candy-power-ranking/candy-data.csv"
)
df = pd.read_csv(_CANDY_DATA_URL)
# Bare expression: displays the table when run as a notebook cell.
df


# In[3]:


# Report whether any cell of the table is missing.
has_missing_values = df.isna().values.any()
print(f"Are there missing values? {has_missing_values}")


# In[4]:


# For easier visualization (later), discretize sugarpercent around its median:
# 1 when strictly above the median, 0 otherwise.
# Another threshold could be chosen (e.g. when considering health aspects).
# The comparison is vectorized so the median is computed a single time
# instead of once per row.
df["high_sugar"] = (df.sugarpercent > df.sugarpercent.median()).astype("int64")


# ## Analysis focus
# 
# The focus of the analysis is to determine what constitutes a winning candy **in terms of winning percentage**.
# The analysis is made under the following assumption:
# 
# **Assumption:**
# The price of the products is not impacting their winning chances.
# 
# My understanding is that the survey was conducted among people who played the role of the recipients of the candies (during Halloween). 
# Therefore, their decision when asked about two candies is independent of the price.
# Analyzing the price of the items will be discussed after identifying the winning composition of traits.
# 
# **Question:**
# 
# Assume that you want to decide which candy to include in your shop offering.
# There are at least two approaches:
# 1. Pick one or more _existing_ candies and add them to the shop.
# 2. Develop a new candy that will be used.
# 
# This is an involved *business* decision that requires data while being discussed.
# 
# ## Candies Traits
# 
# There are 10 different binary traits of each candy.
# First, a visualization that looks into the median of the win percentage depending on the existence of a trait.
# On the right-hand side, the presence of the trait (as percentage) is visualized to give an indication of how trustworthy the change in the win percentage is.

# ## Candies Traits
# 
# There are 10 different binary traits of each candy.
# First, a visualization that looks into the median of the win percentage depending on the existence of a trait.
# 
# On the right-hand side, the presence of the trait (as percentage) is visualized to give an indication of how trustworthy the change in the win percentage is.

# In[5]:


traits = [
    # Manually picking the traits that are taken into account.
    "chocolate",
    "fruity",
    "caramel",
    "peanutyalmondy",
    "nougat",
    "crispedricewafer",
    "hard",
    "bar",
    "pluribus",
    "high_sugar",
]

# One subplot row per trait: box plots of the win percentage without/with the
# trait on the left, a pie chart of the trait's value counts on the right.
fig = make_subplots(
    rows=len(traits),
    cols=2,
    column_widths=[4, 2],
    # Supporting mixing of different viz types. One spec row per trait, derived
    # from the list so the grid stays consistent with `rows` if traits change.
    specs=[[{"type": "xy"}, {"type": "pie"}] for _ in traits],
    # Flat list of titles: (box title, pie title) for each trait, row by row.
    subplot_titles=[
        title
        for trait in traits
        for title in (f"Win % vs. {trait}", f"Presence of {trait}")
    ],
)

# Box colors keyed by the trait value (0 = without, 1 = with).
fill_colors = {0: "rgba(25, 40, 150, 0.5)", 1: "rgba(150, 40, 25, 0.5)"}

for row, trait in enumerate(traits, start=1):
    for is_trait in (0, 1):
        # Filter once and reuse the subset for both axes.
        subset = df[df[trait] == is_trait]
        fig.add_trace(
            go.Box(
                x=subset[trait],
                y=subset["winpercent"],
                boxmean="sd",
                showlegend=False,
                fillcolor=fill_colors[is_trait],
                line={"color": fill_colors[is_trait]},
                # Some related SO questions:
                # - https://stackoverflow.com/q/72110370/671013
                # - https://stackoverflow.com/q/72110582/671013
                # - https://stackoverflow.com/q/72801725/671013
            ),
            row=row,
            col=1,
        )
    _vcount = df[trait].value_counts(normalize=False)
    # Build the pie with plotly express and move its single trace into the grid.
    fig.add_trace(
        px.pie(df, values=_vcount.values, names=_vcount.index).data[0],
        row=row,
        col=2,
    )
fig.update_layout(height=2000, width=1200, title_text="Impact on Win chances")
# Bare expression: displays the figure when run as a notebook cell.
fig


# For easier reading of the numbers, preparing a dataframe holding the important values.

# In[6]:


# Per trait: median win percentage and number of candies, without (0) and
# with (1) the trait.
_impact_rows = []
for trait in traits:
    without_trait = df[df[trait] == 0]
    with_trait = df[df[trait] == 1]
    _impact_rows.append(
        {
            "Trait": trait,
            "Median (without)": without_trait["winpercent"].median(),
            "Median (with)": with_trait["winpercent"].median(),
            "c_without": without_trait.shape[0],
            "c_with": with_trait.shape[0],
        }
    )

median_impact_df = pd.DataFrame(
    _impact_rows,
    columns=["Trait", "Median (without)", "Median (with)", "c_without", "c_with"],
)

# Relative change (in percent) of the median win percentage, with vs. without the trait.
_median_ratio = (
    median_impact_df["Median (with)"] / median_impact_df["Median (without)"]
)
median_impact_df["Median change (pct)"] = 100 * (_median_ratio - 1)

# Share (in percent) of all candies that have the trait.
median_impact_df["Presence (pct)"] = 100 * median_impact_df["c_with"] / df.shape[0]

# The raw counts were only needed for the presence computation.
median_impact_df = median_impact_df.drop(columns=["c_without", "c_with"])

# Bare expression: displays the table, most common traits first, in a notebook.
median_impact_df.sort_values(by="Presence (pct)", ascending=False)


# **Reading this table:**
# > The higher the presence of a trait, the more seriously the change in the median of the win percentage should be taken.
# 
# **Recommendations:**
# When picking a candy (one or more), the following evidence should guide the decision:
# 
# - No pluribus - the win percentage drops by 12%.
# - Candy should have high sugar content (a 19% increase in the win percentage)
# - Candy should *not* be fruity (a loss of almost 24%)
# - Candy should contain chocolate (46% increase)
# - A bar candy is favorable (39% increase)
# - Soft candies win more (-25% for the hard ones)
# 
# At this stage, I would be careful with the other traits as their presence is low (below 17% of the candies).
# 
# Next, we can look into the correlation between the different traits:

# In[7]:


# Leave out the candy name and the continuous columns, then compute the
# pairwise correlation of the remaining columns and show it as a heatmap.
_excluded_columns = ["competitorname", "pricepercent", "winpercent", "sugarpercent"]
corr_df = df.drop(columns=_excluded_columns).corr()

fig = px.imshow(corr_df, color_continuous_scale="RdBu")
fig.show()


# From the above matrix, one can conclude that a *bar* snack is a good candidate as it meets many of the guidelines mentioned above.
# 
# Here is a good candidate (my favorite childhood's candy):
# 
# <img src=https://upload.wikimedia.org/wikipedia/he/9/90/%D7%A4%D7%A1%D7%A7_%D7%96%D7%9E%D7%9F.jpg></img>
# 
# Although these are newer versions that weren't available back in the day 😇.

# ## The Price
# 
# Finally, a few points regarding the price of the products.
# When looking at the overall price distribution there is no clear story:

# In[8]:


# Overall price distribution, not drilled down by traits; serves as the
# baseline to compare the per-trait view against.
px.violin(data_frame=df, x="pricepercent")


# However, by looking at the prices per trait, we see that different traits correspond to different price ranges.
# This view can help when discussing the business plan and taking into account the prices of the items.

# In[9]:


# Order the traits by the median price of the candies that have the trait, so
# the curves are drawn in increasing order of median price.
traits_ordered_by_median_of_price = (
    pd.DataFrame(
        [
            [trait, df.loc[df[trait] == 1, "pricepercent"].median()]
            for trait in traits
        ],
        columns=["trait", "median"],
    )
    .sort_values("median")["trait"]
    .values
)

# One color per trait, interpolated between the two end colors.
colors = n_colors("rgb(5, 200, 200)", "rgb(200, 10, 10)", len(traits), colortype="rgb")

fig = go.Figure()
for trait, color in zip(traits_ordered_by_median_of_price, colors):
    # Price distribution of the candies having this trait.
    fig.add_trace(
        go.Violin(x=df.loc[df[trait] == 1, "pricepercent"], line_color=color, name=trait)
    )

# Shared styling applied to all violins at once.
fig.update_traces(
    orientation="h", side="positive", width=3, points="suspectedoutliers",
)
fig.update_layout(xaxis_showgrid=False, xaxis_zeroline=False, height=800)
fig.show()


# ### Price vs. winning
# 
# Note that traits linked to higher win percentage are also more expensive.

# In[10]:


# Win percentage against price for every row of the table, with a gray OLS
# trend line drawn over the points.
px.scatter(
    data_frame=df,
    y="winpercent",
    x="pricepercent",
    trendline_color_override="gray",
    trendline="ols",
)


# The figure above suggests that there is a positive correlation between the price of the item and its win percent.
# From a business standpoint, there are at least two possible tracks:
# - Favor "quality"; focus on more expensive candies that aim at higher customer satisfaction. The potential tradeoff in this case can be a reduced business volume.
# - Favor "volume"; aim at "low" end products that gain less winning percentage, but are cheaper and thus yield higher volumes.
# 
# Deciding which track to take is kept out of scope for this analysis.
# 
# # The environment
# 
# ```
# ipykernel
# pandas==1.4.2
# plotly==5.7.0
# ipywidgets==7.7.0
# statsmodels==0.13.2
# ```
# 
# 
# # Oh no...
# 
# I just realized that the lovely plots are not rendered when served in the blog 🤯.
# The notebook used to create this post can be found [here](https://gist.github.com/drorata/120b25ec127a31f24e54d34ddbec08c5/858150285415ac997935a01a5ac9f613e5cbec34).
# 
# The notebook can be opened using Colab [here](https://colab.research.google.com/github/drorata/drorata.github.io/blob/eeb65d9cc5eca4ae2f47769b9ec94ca82c707152/content/notebooks/candy-analysis.ipynb).
# Not optimal, but works...