The code in this notebook shows how you can use the Python package Scattertext to explore how language used in headlines can correlate with social engagement.
For background on the term-class association scores used and semiotic squares, please see https://github.com/JasonKessler/PuPPyTalk and https://github.com/JasonKessler/SemioticSquaresTalk
This notebook makes heavy use of the library Scattertext (https://github.com/JasonKessler/scattertext) for language processing and visualizations.
The data used were scraped from Facebook by Max Woolf. Please see his original notebook at https://github.com/minimaxir/clickbait-cluster.
import pandas as pd
import numpy as np
import sys
import umap
import spacy
import scattertext as st
from gensim.models import word2vec
import re
from glob import glob
from scipy.stats import rankdata
from IPython.display import IFrame
from IPython.core.display import display, HTML
# Widen the notebook container so the wide Scattertext charts fit on screen.
display(HTML("<style>.container { width:98% !important; }</style>"))
import matplotlib.pyplot as plt

def _version_tuple(version):
    """Parse a dotted version string (e.g. '0.0.2.20') into a tuple of ints.

    Needed because plain string comparison mis-orders versions:
    '0.0.10' < '0.0.2' lexicographically.
    """
    return tuple(int(part) for part in version.split('.'))

# You need to have a recent version of Scattertext to run this notebook.
assert _version_tuple(st.__version__) >= (0, 0, 2, 20)

# Load spaCy's English model; used below to parse every headline.
nlp = spacy.load('en')
# Load every scraped Facebook headline file (tab-separated) and tag each row
# with its publication, inferred from the file name (text before the first '_').
df = pd.concat([pd.read_csv(fn, sep='\t')
.assign(publication=fn.split('/')[-1].split('_')[0])
for fn in glob('./fb_headlines/*')]).reset_index()
# Parse post timestamps so we can filter by year below.
df['status_published'] = pd.to_datetime(df.status_published)
# Peek at the first row to sanity-check the load.
df.iloc[0]
index 0 page_id BuzzFeed status_id 21898300328_10154928658355329 link_name Here's How Much The Kardashians Have Changed I... status_published 2016-08-12 21:31:00 num_reactions 349 publication BuzzFeed Name: 0, dtype: object
# How many headlines came from each publication?
df.publication.value_counts()
NYTimes 43857 CNN 27677 BuzzFeed 26551 Upworthy 18701 Name: publication, dtype: int64
# Posts per year, most recent first, to see how the data are distributed over time.
df.status_published.apply(lambda x: x.year).value_counts().sort_index(ascending=False).head()
2016 27115 2015 41507 2014 27850 2013 11844 2012 7106 Name: status_published, dtype: int64
# When does the scrape end?
df.status_published.max()
Timestamp('2016-08-12 21:55:00')
# Keep only posts from 2016 onward and drop exact duplicate rows.
df_2016 = df[df.status_published.apply(lambda x: x.year >= 2016)].drop_duplicates()
# Drop rows with a missing headline (NaN link_name).
df_2016 = df_2016.loc[df_2016['link_name'].dropna().index]
df_2016.publication.value_counts()
NYTimes 10326 CNN 9284 BuzzFeed 5386 Upworthy 824 Name: publication, dtype: int64
# Parse every headline with spaCy; Scattertext consumes these parses below.
df_2016['parse'] = df_2016['link_name'].apply(nlp)
# Restrict to headlines longer than two words
df_2016 = df_2016[df_2016['parse'].apply(len) > 2]
# Rank each headline's reaction count *within its own publication* and scale
# to a percentile in (0, 1], so publications with different audience sizes
# are comparable.  (rankdata is already imported at the top of the file;
# the redundant re-import that was here has been removed.)
# NOTE(review): relies on groupby().apply() returning a Series aligned on the
# original index — confirm against the pandas version in use.
df_2016['reaction_percentile'] = df_2016.groupby('publication')['num_reactions'].apply(lambda x: pd.Series(rankdata(x)/len(x), index=x.index))
# Bucket percentiles into engagement terciles: 'Hi' (top third),
# 'Lo' (bottom third), 'Mid' (everything in between).
df_2016['reaction_bin'] = df_2016.reaction_percentile.apply(lambda x: 'Hi' if x > 2./3 else 'Lo' if x < 1./3 else 'Mid')
# Build a Scattertext corpus from the parsed headlines, categorized by the
# engagement tercile, then shrink the term list with two compaction passes.
# NOTE(review): presumably ClassPercentageCompactor(term_count=6) keeps terms
# frequent within each class and CompactTerms(slack=3) merges redundant
# n-grams — confirm against the Scattertext docs for this version.
reaction_corpus = (st.CorpusFromParsedDocuments(df_2016, parsed_col='parse', category_col='reaction_bin')
.build()
.compact(st.ClassPercentageCompactor(term_count=6))
.compact(st.CompactTerms(slack=3)))
def get_metadata_from_corpus(corpus):
    """Build a per-document metadata Series for chart tooltips.

    Each entry reads: '<page_id>, <reaction percentile as NN%>, <YYYY-MM-DD>'.
    """
    docs = corpus.get_df()
    percentiles = docs.reaction_percentile.apply(lambda p: f'{int(p * 100)}%')
    dates = docs.status_published.apply(lambda ts: str(ts.date()))
    return docs.page_id + ', ' + percentiles + ', ' + dates
# Frequency-vs-score explorer: 'Hi'-engagement headlines against 'Lo',
# with the 'Mid' tercile as a neutral background.
html = st.produce_frequency_explorer(reaction_corpus,
                                     category='Hi',
                                     not_categories=['Lo'],
                                     neutral_categories=['Mid'],
                                     minimum_term_frequency=0,
                                     # NOTE(review): 'thresold' is the spelling this
                                     # Scattertext version exposes; later versions use
                                     # pmi_threshold_coefficient — verify before renaming.
                                     pmi_filter_thresold=0,
                                     use_full_doc=True,
                                     term_scorer=st.RankDifference(),
                                     grey_threshold=0,
                                     width_in_pixels=1000,
                                     metadata=get_metadata_from_corpus(reaction_corpus))
file_name = 'output/reaction_freq.html'
# Context manager closes (and flushes) the handle before IFrame loads the file;
# the original bare open(...).write(...) leaked it.
with open(file_name, 'wb') as out:
    out.write(html.encode('utf-8'))
IFrame(src=file_name, width=1200, height=700)
# Eliminate other categories from the dataset (e.g., Upworthy or mid-engagement)
df_2016['category'] = df_2016.publication + ' ' + df_2016.reaction_bin
# Keep only NYTimes/BuzzFeed rows in the Hi or Lo engagement terciles, giving
# four categories: 'NYTimes Hi', 'NYTimes Lo', 'BuzzFeed Hi', 'BuzzFeed Lo'.
df_2016_four_square = df_2016[df_2016.publication.isin(['BuzzFeed', 'NYTimes'])
& df_2016.reaction_bin.isin(['Hi', 'Lo'])]
# Create corpus and filter terms
four_square_corpus = (st.CorpusFromParsedDocuments(df_2016_four_square, category_col = 'category', parsed_col = 'parse')
.build()
.compact(st.CompactTerms(minimum_term_count=2, slack=5))
.compact(st.ClassPercentageCompactor(term_count=2)))
# Set up the semiotic-square chart structure: axis A contrasts highly-engaging
# NYTimes headlines with ignored BuzzFeed ones; axis B the reverse.
four_square = st.FourSquare(
    four_square_corpus,
    category_a_list=['NYTimes Hi'],
    category_b_list=['BuzzFeed Hi'],
    not_category_a_list=['BuzzFeed Lo'],
    not_category_b_list=['NYTimes Lo'],
    scorer=st.RankDifference(),
    # 'Engagement' was misspelled 'Engagment' in the original labels.
    labels={'a': 'Highbrow Engagement',
            'b': 'Lowbrow Engagement',
            'not_a_and_not_b': 'Few Facebook Reactions',
            'a_and_b': 'Many Facebook Reactions',
            'a_and_not_b': 'NYTimes',
            'b_and_not_a': 'BuzzFeed',
            'not_a': 'Lowbrow Ignored',
            'not_b': 'Highbrow Ignored'})
# Render the four-square chart: x-axis publication, y-axis engagement level.
html = st.produce_four_square_explorer(four_square=four_square,
                                       x_label='NYTimes-Buzz',
                                       y_label='Hi-Low',
                                       use_full_doc=True,
                                       pmi_threshold_coefficient=0,
                                       metadata=get_metadata_from_corpus(four_square_corpus))
file_name = 'output/reaction_semiotic_axes.html'
title = '<center><h2>The Semiotics of Clickbait: Buzzfeed vs. The New York Times, High vs. Low Engagement</h2></center>'
# Context manager closes the handle; the original bare open(...).write(...) leaked it.
with open(file_name, 'wb') as out:
    out.write(title.encode('utf-8') + html.encode('utf-8'))
IFrame(src=file_name, width=1600, height=900)
# Publication-specific engagement axes: one axis contrasts NYTimes Hi vs Lo,
# the other BuzzFeed Hi vs Lo.
four_square_axes = st.FourSquareAxes(four_square_corpus,
                                     ['NYTimes Hi'],
                                     ['NYTimes Lo'],
                                     ['BuzzFeed Hi'],
                                     ['BuzzFeed Lo'],
                                     # NOTE(review): the NY Times / BuzzFeed quadrant labels
                                     # may be swapped relative to the category lists above —
                                     # verify against FourSquareAxes' argument semantics.
                                     labels={'a': 'Appeals to all',
                                             'a_and_not_b': 'NY Times: ↑ Engagement',
                                             'b_and_not_a': 'NY Times: ↓ Engagement',
                                             'a_and_b': 'BuzzFeed: ↑ Engagement',
                                             'not_a_and_not_b': 'BuzzFeed: ↓ Engagement',
                                             'not_a': 'Ignored by all',
                                             'b': 'Ignored by elite, appeals to masses',
                                             'not_b': 'Appeals to elite, ignored by masses'})
html = st.produce_four_square_axes_explorer(
    four_square_axes=four_square_axes,
    x_label='NYT: Hi-Lo',
    y_label='Buzz: Hi-Lo',
    use_full_doc=True,
    pmi_threshold_coefficient=0,
    metadata=get_metadata_from_corpus(four_square_corpus))
# Write to a distinct file: the original reused 'output/reaction_semiotic_axes.html',
# silently overwriting the chart produced by the previous cell.
file_name = 'output/reaction_semiotic_axes_pub_specific.html'
title = '<center><h2>The Semiotics of Clickbait: Publication-Specific Engagement</h2></center>'
with open(file_name, 'wb') as out:
    out.write(title.encode('utf-8') + html.encode('utf-8'))
IFrame(src=file_name, width=1600, height=900)
# Same four-square chart, re-rendered with censor_points=False so that more
# term labels are visible at once.
html = st.produce_four_square_explorer(four_square=four_square,
                                       x_label='NYTimes-Buzz',
                                       y_label='Hi-Low',
                                       use_full_doc=True,
                                       pmi_threshold_coefficient=0,
                                       metadata=get_metadata_from_corpus(four_square_corpus),
                                       censor_points=False)
file_name = 'output/reaction_semiotic_censor.html'
title = '<center><h2>The Semiotics of Clickbait: Buzzfeed vs. The New York Times</h2></center>'
# Context manager closes the handle; the original bare open(...).write(...) leaked it.
with open(file_name, 'wb') as out:
    out.write(title.encode('utf-8') + html.encode('utf-8'))
IFrame(src=file_name, width=1600, height=900)
The chart below projects terms into two dimensions using the UMAP dimensionality-reduction algorithm (Python package: https://github.com/lmcinnes/umap).
# Project terms into two dimensions with UMAP (cosine metric), colored by
# the Hi-vs-Lo engagement score.
html = st.produce_projection_explorer(reaction_corpus,
                                      category='Hi',
                                      not_categories=['Lo'],
                                      neutral_categories=['Mid'],
                                      term_scorer=st.RankDifference(),
                                      neutral_category_name='Mid',
                                      width_in_pixels=1000,
                                      use_full_doc=True,
                                      projection_model=umap.UMAP(metric='cosine'),
                                      # Empty pattern matches every term, i.e. accept all.
                                      term_acceptance_re=re.compile(''),
                                      metadata=get_metadata_from_corpus(reaction_corpus))
file_name = 'output/reaction_umap_projection.html'
# Context manager closes the handle; the original bare open(...).write(...) leaked it.
with open(file_name, 'wb') as out:
    out.write(html.encode('utf-8'))
IFrame(src=file_name, width=1200, height=700)