#!/usr/bin/env python
# coding: utf-8

# # Exploring data for the attention index
#
# The idea of the attention index is to provide a score that indicates the impact of an article, and that can easily be aggregated by subject, publisher or other axes.
#
# The index comprises two parts:
#
# - **promotion**: how important the article was to the publisher, based on the extent to which they chose to editorially promote it
# - **response**: how readers reacted to the article, based on social engagements
#
# The index will be a number between 0 and 100. 50% is driven by promotion, and 50% by response:
#
# ![Attention Index](../images/kaleida-attention-index-data-factors-chart.png)
#
# ### Promotion Score
#
# The promotion score should take into account:
#
# - whether the publisher chose to make the article a lead article on their primary front (30%)
# - how long the publisher chose to retain the article on their front (40%)
# - whether they chose to push the article on their Facebook brand page (30%)
#
# It should be scaled based on the value of that promotion, so a popular, well-visited site should score higher than one on the fringes. Similarly, a powerful, well-followed brand page should score higher than one with fewer followers.
#
# ### Response Score
#
# The response score takes into account the number of engagements on Facebook.
#
# The rest of this notebook explores how those numbers could work, starting with the response score because that is easier, I think.

# # Setup

# In[1]:


get_ipython().run_line_magic('matplotlib', 'inline')

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


# In[2]:


data = pd.read_csv("articles_2017-08-01_2017-08-31.csv", index_col="id",
                   parse_dates=["published", "discovered"])
data.head()


# # Response Score

# The response score is a number between 0 and 50 that indicates the level of response to an article.
#
# Perhaps in the future we may choose to include other factors, but for now we just include engagements on Facebook. The maximum score of 50 should be achieved by an article that does really well compared with others.

# In[3]:


pd.options.display.float_format = '{:.2f}'.format
data.fb_engagements.describe([0.5, 0.75, 0.9, 0.95, 0.99, 0.995, 0.999])


# There are a few articles with a million-plus engagements; let's just double-check those.

# In[4]:


data[data.fb_engagements > 1000000]


# In[5]:


data.fb_engagements.mode()


# Going back to the engagement counts, we see the mean is 1,542, the mode is zero, the median is 29, the 90th percentile is 2,085, the 99th percentile is 27,998 and the 99.5th percentile is 46,698. The standard deviation is 12,427, significantly higher than the mean, so this is not a normal distribution.
#
# We want to provide a sensible way of allocating this to the 50 buckets we have available. Let's just bucket linearly (equal-width bins) first:

# In[6]:


mean = data.fb_engagements.mean()
median = data.fb_engagements.median()

plt.figure(figsize=(12,4.5))
plt.hist(data.fb_engagements, bins=50)
plt.axvline(mean, linestyle=':', label=f'Mean ({mean:,.0f})', color='green')
plt.axvline(median, label=f'Median ({median:,.0f})', color='red')
leg = plt.legend()


# Well, that's not very useful. Almost everything lands in the lowest bucket if we just do that, which isn't a useful metric.
#
# Let's start by excluding zeros.
# In[7]:


non_zero_fb_engagements = data.fb_engagements[data.fb_engagements > 0]

plt.figure(figsize=(12,4.5))
plt.hist(non_zero_fb_engagements, bins=50)
plt.axvline(mean, linestyle=':', label=f'Mean ({mean:,.0f})', color='green')
plt.axvline(median, label=f'Median ({median:,.0f})', color='red')
leg = plt.legend()


# That's still a big spike at the bottom, so not a useful score.
#
# Next, we exclude the outliers: cap at the 99.9th percentile (i.e. 119,211), so that 0.1% of articles should receive the maximum score.

# In[8]:


non_zero_fb_engagements_without_outliers = non_zero_fb_engagements.clip(upper=119211)

plt.figure(figsize=(12,4.5))
plt.hist(non_zero_fb_engagements_without_outliers, bins=50)
plt.axvline(mean, linestyle=':', label=f'Mean ({mean:,.0f})', color='green')
plt.axvline(median, label=f'Median ({median:,.0f})', color='red')
leg = plt.legend()


# That's a bit better, but still way too clustered at the low end. Let's try a log transform, treating the distribution as roughly log-normal.

# In[9]:


mean = data.fb_engagements.mean()
median = data.fb_engagements.median()
ninety = data.fb_engagements.quantile(.90)
ninetyfive = data.fb_engagements.quantile(.95)
ninetynine = data.fb_engagements.quantile(.99)

plt.figure(figsize=(12,4.5))
plt.hist(np.log(non_zero_fb_engagements + median), bins=50)
plt.axvline(np.log(mean), linestyle=':', label=f'Mean ({mean:,.0f})', color='green')
plt.axvline(np.log(median), label=f'Median ({median:,.0f})', color='green')
plt.axvline(np.log(ninety), linestyle='--', label=f'90th percentile ({ninety:,.0f})', color='red')
plt.axvline(np.log(ninetyfive), linestyle='-.', label=f'95th percentile ({ninetyfive:,.0f})', color='red')
plt.axvline(np.log(ninetynine), linestyle=':', label=f'99th percentile ({ninetynine:,.0f})', color='red')
leg = plt.legend()


# That's looking a bit more interesting.
#
# After some exploration, to avoid too much emphasis on the lower end of the scale, we move the numbers to the right a bit by adding on the median.
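#
# As a quick check of why that shift matters, here's a small sketch using the median of 29 and the 119,211 cap quoted above (an assumption: these are this month's values, not the rolling figures we'd use in production). Without the offset, the step from 1 to 29 engagements alone covers almost a third of the log scale, which over-rewards the first handful of shares.

# In[ ]:


for offset in (0, 29):
    low, mid_point, high = np.log(1 + offset), np.log(29 + offset), np.log(119211 + offset)
    print(f"offset={offset}: going from 1 to 29 engagements covers "
          f"{(mid_point - low) / (high - low):.0%} of the scale")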
# In[10]:


log_engagements = (non_zero_fb_engagements
                   .clip(upper=data.fb_engagements.quantile(.999))
                   .apply(lambda x: np.log(x + median))
                   )
log_engagements.describe()


# Use standard feature scaling to bring that to a 1 to 50 range.

# In[11]:


def scale_log_engagements(engagements_logged):
    return np.ceil(
        50 * (engagements_logged - log_engagements.min()) / (log_engagements.max() - log_engagements.min())
    )

def scale_engagements(engagements):
    return scale_log_engagements(np.log(engagements + median))

scaled_non_zero_engagements = scale_log_engagements(log_engagements)
scaled_non_zero_engagements.describe()


# In[12]:


# add in the zeros, as zero
scaled_engagements = pd.concat([scaled_non_zero_engagements, data.fb_engagements[data.fb_engagements == 0]])


# In[13]:


proposed = pd.DataFrame({"fb_engagements": data.fb_engagements, "response_score": scaled_engagements})
proposed.response_score.plot.hist(bins=50)


# Now let's look at how share counts map to the score:

# In[14]:


plt.figure(figsize=(15,8))
shares = np.arange(1, 60000)
plt.plot(shares, scale_engagements(shares))
plt.xlabel("shares")
plt.ylabel("score")
plt.axhline(scale_engagements(mean), linestyle=':', label=f'Mean ({mean:,.0f})', color='green')
plt.axhline(scale_engagements(median), label=f'Median ({median:,.0f})', color='green')
plt.axhline(scale_engagements(ninety), linestyle='--', label=f'90th percentile ({ninety:,.0f})', color='red')
plt.axhline(scale_engagements(ninetyfive), linestyle='-.', label=f'95th percentile ({ninetyfive:,.0f})', color='red')
plt.axhline(scale_engagements(ninetynine), linestyle=':', label=f'99th percentile ({ninetynine:,.0f})', color='red')
plt.legend(frameon=True, shadow=True)


# In[15]:


proposed.groupby("response_score").fb_engagements.agg([np.size, np.min, np.max])


# Looks good to me, let's save that.

# In[16]:


data["response_score"] = proposed.response_score


# ### Proposal
#
# The maximum of 50 points is awarded when the engagements are greater than the 99.9th percentile, rolling over the last month.
#
# i.e. where $limit$ is the 99.9th percentile of engagements calculated over the previous month, the response score for article $a$ is:
#
# \begin{align}
# basicScore_a & =
#   \begin{cases}
#     0 & \text{if } engagements_a = 0 \\
#     \log(\min(engagements_a, limit) + median(engagements)) & \text{if } engagements_a > 0
#   \end{cases} \\
# responseScore_a & =
#   \begin{cases}
#     0 & \text{if } engagements_a = 0 \\
#     50 \cdot \frac{basicScore_a - \min(basicScore)}{\max(basicScore) - \min(basicScore)} & \text{if } engagements_a > 0
#   \end{cases} \\
# \\
# \text{The latter equation can be expanded to:} \\
# responseScore_a & =
#   \begin{cases}
#     0 & \text{if } engagements_a = 0 \\
#     50 \cdot
#       \frac{\log(\min(engagements_a, limit) + median(engagements)) - \log(1 + median(engagements))}
#            {\log(limit + median(engagements)) - \log(1 + median(engagements))} & \text{if } engagements_a > 0
#   \end{cases} \\
# \end{align}
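#
# To make the proposal concrete, here's a minimal sketch of that formula as a standalone function. It assumes $limit$ and the median would be supplied from the rolling previous month; the call below simply reuses this month's values as a stand-in.

# In[ ]:


def response_score(engagements, limit, median_engagements):
    """Sketch of the proposed response score (0-50) for a single article."""
    if engagements <= 0:
        return 0.0
    basic = np.log(min(engagements, limit) + median_engagements)
    lowest = np.log(1 + median_engagements)       # basic score of a single engagement
    highest = np.log(limit + median_engagements)  # basic score at (or above) the cap
    return 50 * (basic - lowest) / (highest - lowest)


# e.g. an article at the 90th percentile (2,085 engagements) this month
response_score(2085, limit=data.fb_engagements.quantile(.999), median_engagements=median)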
# # Promotion Score
#
# The aim of the promotion score is to indicate how important the article was to the publisher, by tracking where they chose to promote it. This is a number between 0 and 50, made up of:
#
# - 20 points based on whether the article was promoted as the "lead" story on the publisher's home page
# - 15 points based on how long the article was promoted anywhere on the publisher's home page
# - 15 points based on whether the article was promoted on the publisher's main Facebook brand page
#
# The first two should be scaled by the popularity/reach of the home page, for which we use the Alexa rank as a proxy.
#
# The last should be scaled by the popularity/reach of the brand page, for which we use the number of likes the brand page has.

# ## Lead story (20 points)

# In[17]:


data.mins_as_lead.describe([0.5, 0.75, 0.9, 0.95, 0.99, 0.995, 0.999])


# As expected, the vast majority of articles don't make it as lead. Let's explore how long publishers typically keep something as lead.

# In[18]:


lead_articles = data[data.mins_as_lead > 0]


# In[19]:


lead_articles.mins_as_lead.describe([0.25, 0.5, 0.75, 0.9, 0.95, 0.99, 0.995, 0.999])


# In[20]:


lead_articles.mins_as_lead.plot.hist(bins=50)


# Being lead at all is significant, so although we want to penalise articles that were lead for only a very short time, mostly we want to award the maximum even if an article wasn't lead for ages. So we'll give maximum points when something has been lead for an hour.

# In[21]:


lead_articles.mins_as_lead.clip(upper=60).plot.hist(bins=50)


# We also want to scale this by the Alexa rank, such that the maximum score of 20 points goes to an article that was lead for an hour on the most popular site.
#
# So let's explore the Alexa numbers.

# In[22]:


alexa_ranks = data.groupby(by="publisher_id").alexa_rank.mean().sort_values()
alexa_ranks


# In[23]:


alexa_ranks.plot.bar(figsize=[10,5])


# Let's try the simple option first: just divide the number of minutes as lead by the Alexa rank. What scale of numbers do we get then?

# In[24]:


lead_proposal_1 = lead_articles.mins_as_lead.clip(upper=60) / lead_articles.alexa_rank
lead_proposal_1.plot.hist()


# Looks like there's too much of a cluster around 0. Have we massively over-penalised the publishers with a high (i.e. less popular) Alexa rank?

# In[25]:


lead_proposal_1.groupby(data.publisher_id).mean().plot.bar(figsize=[10,5])


# Yes. Let's try taking the log of the Alexa rank and see if that looks better.

# In[26]:


lead_proposal_2 = (lead_articles.mins_as_lead.clip(upper=60) / np.log(lead_articles.alexa_rank))
lead_proposal_2.plot.hist()


# In[27]:


lead_proposal_2.groupby(data.publisher_id).describe()


# In[28]:


lead_proposal_2.groupby(data.publisher_id).min().plot.bar(figsize=[10,5])


# That looks about right, as long as the smaller publishers stay closer to zero. So let's apply feature scaling to this, to give a number between 1 and 20. (Anything not as lead will pass through as zero.)
# In[29]:


def rescale(series):
    return (series - series.min()) / (series.max() - series.min())

lead_proposal_3 = np.ceil(20 * rescale(lead_proposal_2))


# In[30]:


lead_proposal_2.min(), lead_proposal_2.max()


# In[31]:


lead_proposal_3.plot.hist()


# In[32]:


lead_proposal_3.groupby(data.publisher_id).median().plot.bar(figsize=[10,5])


# In[33]:


data["lead_score"] = pd.concat([lead_proposal_3, data.mins_as_lead[data.mins_as_lead == 0]])


# In[34]:


data.lead_score.value_counts().sort_index()


# In[35]:


data.lead_score.groupby(data.publisher_id).max()


# In summary then, the score for article $a$ is:
#
# $$
# unscaledLeadScore_a = \frac{\min(minsAsLead_a, 60)}{\log(alexaRank_a)}\\
# leadScore_a = 19 \cdot
#   \frac{unscaledLeadScore_a - \min(unscaledLeadScore)}
#        {\max(unscaledLeadScore) - \min(unscaledLeadScore)}
#   + 1
# $$
#
# Since the minimum value of $minsAsLead$ is 1, $\min(unscaledLeadScore)$ is pretty insignificant. So we can simplify this to:
#
# $$
# leadScore_a = 20 \cdot
#   \frac{unscaledLeadScore_a}
#        {\max(unscaledLeadScore)}
# $$
#
# or, since $\max(unscaledLeadScore)$ is achieved by 60 minutes as lead on the site with the lowest (best) Alexa rank:
#
# $$
# leadScore_a = 20 \cdot
#   \frac{\frac{\min(minsAsLead_a, 60)}{\log(alexaRank_a)}}
#        {\frac{60}{\log(\min(alexaRank))}}
# $$
#
# $$
# leadScore_a = \left( 20 \cdot
#   \frac{\min(minsAsLead_a, 60)}{\log(alexaRank_a)} \cdot
#   \frac{\log(\min(alexaRank))}{60} \right)
# $$

# ## Time on front score (15 points)
#
# This is similar to time as lead, so let's try doing the same calculation, except we also want to factor in the number of slots on the front:
#
# $$frontScore_a =
#   15
#   \left(\frac{\min(minsOnFront_a, 1440)}{alexaRank_a \cdot numArticlesOnFront_a}\right)
#   \left( \frac{\min(alexaRank \cdot numArticlesOnFront)}{1440} \right)$$

# In[36]:


(data.alexa_rank * data.num_articles_on_front).min() / 1440


# In[37]:


time_on_front_proposal_1 = np.ceil(data.mins_on_front.clip(upper=1440) / (data.alexa_rank * data.num_articles_on_front) * (2.45) * 15)


# In[38]:


time_on_front_proposal_1.plot.hist(figsize=(15, 7), bins=15)


# In[39]:


time_on_front_proposal_1.value_counts().sort_index()


# In[40]:


time_on_front_proposal_1.groupby(data.publisher_id).sum()


# That looks good to me.

# In[41]:


data["front_score"] = np.ceil(data.mins_on_front.clip(upper=1440) / (data.alexa_rank * data.num_articles_on_front) * (2.45) * 15).fillna(0)


# In[42]:


data.front_score


# ## Facebook brand page promotion (15 points)
#
# One way a publisher has of promoting content is to post it to their brand page. The significance of doing so is stronger when the brand page has more followers (likes).
#
# $$ facebookPromotionProposed1_a = 15 \left( \frac {brandPageLikes_a} {\max(brandPageLikes)} \right) $$
#
# Now let's explore the data to see if that makes sense. **tl;dr the formula above is incorrect**

# In[43]:


data.fb_brand_page_likes.max()


# In[44]:


facebook_promotion_proposed_1 = np.ceil((15 * (data.fb_brand_page_likes / data.fb_brand_page_likes.max())).fillna(0))


# In[45]:


facebook_promotion_proposed_1.value_counts().sort_index().plot.bar()


# In[46]:


facebook_promotion_proposed_1.groupby(data.publisher_id).describe()


# That's too much variation: sites like the Guardian, which has a respectable 7.5m likes, should not be scoring a 3. Let's try applying a log to it, and then standard feature scaling again.

# In[47]:


data.fb_brand_page_likes.groupby(data.publisher_id).max()


# In[48]:


np.log(2149)


# In[49]:


np.log(data.fb_brand_page_likes.groupby(data.publisher_id).max())


# That's more like it, but the lower numbers should be smaller.
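#
# A rough illustration of that, assuming the 2,149 figure checked above is the like count of the smallest brand page, and using the Guardian's ~7.5m likes quoted above: a straight log ratio gives the tiny page nearly half the Guardian's unscaled score, while dividing by 1,000 first pushes it much closer to zero.

# In[ ]:


print(np.log(2149) / np.log(7_500_000))                 # roughly 0.48
print(np.log(2149 / 1000) / np.log(7_500_000 / 1000))   # roughly 0.09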
# In[50]:


np.log(data.fb_brand_page_likes.groupby(data.publisher_id).max() / 1000)


# In[51]:


scaled_fb_brand_page_likes = (data.fb_brand_page_likes / 1000)
facebook_promotion_proposed_2 = np.ceil(
    15 * (np.log(scaled_fb_brand_page_likes) / np.log(scaled_fb_brand_page_likes.max()))
).fillna(0)


# In[52]:


facebook_promotion_proposed_2.groupby(data.publisher_id).max()


# LGTM. So the equation is:
#
# $$ facebookPromotion_a = 15 \left(
#   \frac {\log(\frac {brandPageLikes_a}{1000})}
#         {\log(\frac {\max(brandPageLikes)}{1000})} \right) $$
#
# Now, let's try applying the standard feature scaling approach to this, rather than using a magic number of 1,000. That equation would be:
#
# \begin{align}
# unscaledFacebookPromotion_a &=
#   \log(brandPageLikes_a) \\
# facebookPromotion_a &=
#   15 \cdot \frac{unscaledFacebookPromotion_a - \min(unscaledFacebookPromotion)}{\max(unscaledFacebookPromotion) - \min(unscaledFacebookPromotion)} \\
# \\
# \text{The scaling can be simplified to:} \\
# facebookPromotion_a &=
#   15 \cdot \frac{unscaledFacebookPromotion_a - \log(\min(brandPageLikes))}{\log(\max(brandPageLikes)) - \log(\min(brandPageLikes))} \\
# \\
# \text{Meaning the overall equation becomes:} \\
# facebookPromotion_a &=
#   15 \cdot \frac{\log(brandPageLikes_a) - \log(\min(brandPageLikes))}{\log(\max(brandPageLikes)) - \log(\min(brandPageLikes))}
# \end{align}

# In[53]:


facebook_promotion_proposed_3 = np.ceil(
    14 * (
        (np.log(data.fb_brand_page_likes) - np.log(data.fb_brand_page_likes.min()))
        / (np.log(data.fb_brand_page_likes.max()) - np.log(data.fb_brand_page_likes.min()))
    ) + 1
)


# In[54]:


facebook_promotion_proposed_3.groupby(data.publisher_id).max()


# In[55]:


data["facebook_promotion_score"] = facebook_promotion_proposed_3.fillna(0.0)


# # Review

# In[56]:


data["promotion_score"] = (data.lead_score + data.front_score + data.facebook_promotion_score)
data["attention_index"] = (data.promotion_score + data.response_score)


# In[57]:


data.promotion_score.plot.hist(bins=np.arange(50), figsize=(15,6))


# In[58]:


data.attention_index.plot.hist(bins=np.arange(100), figsize=(15,6))


# In[59]:


data.attention_index.value_counts().sort_index()


# In[60]:


# and let's see the articles with the biggest attention index
data.sort_values("attention_index", ascending=False)


# In[61]:


data["score_diff"] = data.promotion_score - data.response_score


# In[62]:


# promoted but low response
data.sort_values("score_diff", ascending=False).head(25)


# In[63]:


# high response but not promoted
data.sort_values("score_diff", ascending=True).head(25)


# Write that data to a file. Note that the scores here are provisional for two reasons:
#
# 1. they should use a rolling month based on each article's publication date to calculate the medians/min/max etc., whereas in this workbook we are just using values for the month of August
# 2. for analysis, we've rounded the numbers; we don't expect to do that for the actual scores

# In[64]:


data.to_csv("articles_with_provisional_scores_2017-08-01_2017-08-31.csv")


# # Summary

# The attention index of an article comprises four components:
#
# - *lead score* (max 20 points) based on how long an article was the lead story on the publisher's home page, scaled by the traffic to that publisher
# - *front score* (max 15 points) based on how long an article was present on the publisher's home page, scaled by the traffic to that publisher
# - *Facebook promotion score* (max 15 points) based on whether the article was promoted on the publisher's Facebook brand page, scaled by the reach of that brand page
# - *response score* (max 50 points) based on the number of Facebook engagements the article received, relative to other articles
#
# Or, in other words:
#
# \begin{align}
# attentionIndex_a &= leadScore_a + frontScore_a + facebookPromotionScore_a + responseScore_a \\
# leadScore_a &= 20 \cdot \left(\frac{\min(minsAsLead_a, 60)}{\log(alexaRank_a)}\right) \cdot \left( \frac{\log(\min(alexaRank))}{60} \right) \\
# frontScore_a &=
#   15 \cdot
#   \left(\frac{\min(minsOnFront_a, 1440)}{alexaRank_a \cdot numArticlesOnFront_a}\right) \cdot
#   \left( \frac{\min(alexaRank \cdot numArticlesOnFront)}{1440} \right) \\
# facebookPromotion_a &=
#   \begin{cases}
#     0 & \text{if not shared on brand page} \\
#     15 \cdot \frac{\log(brandPageLikes_a) - \log(\min(brandPageLikes))}{\log(\max(brandPageLikes)) - \log(\min(brandPageLikes))} & \text{otherwise}
#   \end{cases} \\
# responseScore_a &=
#   \begin{cases}
#     0 & \text{if } engagements_a = 0 \\
#     50 \cdot
#       \frac{\log(\min(engagements_a, limit) + median(engagements)) - \log(1 + median(engagements))}
#            {\log(limit + median(engagements)) - \log(1 + median(engagements))} & \text{if } engagements_a > 0
#   \end{cases} \\
# \end{align}

# In[ ]:
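# As a wrap-up, here is a minimal sketch that pulls the summary formulas above together into a single function. This is illustrative only: the corpus-level statistics (engagement limit and median, minimum Alexa rank, brand page like range, minimum of alexaRank * numArticlesOnFront) are assumed to be computed over the rolling previous month, and the parameter and key names below are made up for this sketch rather than taken from any production code.


import numpy as np


def attention_index(mins_as_lead, mins_on_front, num_articles_on_front, alexa_rank,
                    brand_page_likes, engagements, corpus):
    """Sketch of the proposed attention index (0-100) for a single article.

    `corpus` holds the rolling-month statistics (hypothetical key names):
      min_alexa_rank        - best (lowest) Alexa rank across tracked publishers
      min_rank_times_slots  - minimum of alexa_rank * num_articles_on_front
      min_brand_page_likes / max_brand_page_likes
      engagement_limit      - 99.9th percentile of Facebook engagements
      median_engagements    - median of Facebook engagements
    """
    # Lead score (max 20): an hour as lead on the most popular site gets full marks.
    lead = (20 * (min(mins_as_lead, 60) / np.log(alexa_rank))
               * (np.log(corpus["min_alexa_rank"]) / 60))

    # Front score (max 15): a full day on the front with the smallest
    # alexa_rank * slots product gets full marks.
    front = (15 * (min(mins_on_front, 1440) / (alexa_rank * num_articles_on_front))
                * (corpus["min_rank_times_slots"] / 1440))

    # Facebook promotion score (max 15): log of brand page likes, feature-scaled.
    if brand_page_likes is None or np.isnan(brand_page_likes):
        facebook = 0.0  # not shared on a brand page
    else:
        lo, hi = np.log(corpus["min_brand_page_likes"]), np.log(corpus["max_brand_page_likes"])
        facebook = 15 * (np.log(brand_page_likes) - lo) / (hi - lo)

    # Response score (max 50): log of capped engagements shifted by the median, feature-scaled.
    if engagements <= 0:
        response = 0.0
    else:
        m, limit = corpus["median_engagements"], corpus["engagement_limit"]
        response = (50 * (np.log(min(engagements, limit) + m) - np.log(1 + m))
                       / (np.log(limit + m) - np.log(1 + m)))

    return lead + front + facebook + response


# Built here from this month's data as a stand-in for the rolling previous month;
# attention_index(...) would then be called once per article.
corpus = {
    "min_alexa_rank": data.alexa_rank.min(),
    "min_rank_times_slots": (data.alexa_rank * data.num_articles_on_front).min(),
    "min_brand_page_likes": data.fb_brand_page_likes.min(),
    "max_brand_page_likes": data.fb_brand_page_likes.max(),
    "engagement_limit": data.fb_engagements.quantile(.999),
    "median_engagements": data.fb_engagements.median(),
}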