#!/usr/bin/env python
# coding: utf-8
# # Data retrieval
# In[1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
get_ipython().run_line_magic('matplotlib', 'inline')
# In[429]:
seed_urls = ['https://inshorts.com/en/read/technology',
'https://inshorts.com/en/read/sports',
'https://inshorts.com/en/read/world']
def build_dataset(seed_urls):
news_data = []
for url in seed_urls:
news_category = url.split('/')[-1]
data = requests.get(url)
soup = BeautifulSoup(data.content, 'html.parser')
news_articles = [{'news_headline': headline.find('span',
attrs={"itemprop": "headline"}).string,
'news_article': article.find('div',
attrs={"itemprop": "articleBody"}).string,
'news_category': news_category}
for headline, article in
zip(soup.find_all('div',
class_=["news-card-title news-right-box"]),
soup.find_all('div',
class_=["news-card-content news-right-box"]))
]
news_data.extend(news_articles)
df = pd.DataFrame(news_data)
df = df[['news_headline', 'news_article', 'news_category']]
return df
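# Note: this scraper is tied to inshorts.com's markup at the time of writing
# (the 'news-card-title' / 'news-card-content' CSS classes and the itemprop
# attributes); if the site layout changes, the selectors above need updating.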
# In[430]:
news_df = build_dataset(seed_urls)
news_df.head(10)
# In[431]:
news_df.news_category.value_counts()
# # Text Wrangling and Pre-processing
# In[2]:
import spacy
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
import re
from bs4 import BeautifulSoup
from contractions import CONTRACTION_MAP  # local module defining a contraction -> expansion dict
import unicodedata
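# one-time setup, in case the NLTK stopword corpus is missing locally:
# nltk.download('stopwords')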
nlp = spacy.load('en_core_web_sm')  # small English pipeline; tagger, parser and NER are enabled by default
#nlp_vec = spacy.load('en_vecs', parse = True, tag=True, entity=True)
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')
stopword_list.remove('no')
stopword_list.remove('not')
# ## Remove HTML tags
# In[3]:
def strip_html_tags(text):
soup = BeautifulSoup(text, "html.parser")
stripped_text = soup.get_text()
return stripped_text
strip_html_tags('<html><h2>Some important text</h2></html>')
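# should return: 'Some important text'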
# ## Remove accented characters
# In[4]:
def remove_accented_chars(text):
text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
return text
remove_accented_chars('Sómě Áccěntěd těxt')
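# should return: 'Some Accented text'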
# ## Expand contractions
# In[5]:
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())),
flags=re.IGNORECASE|re.DOTALL)
def expand_match(contraction):
match = contraction.group(0)
first_char = match[0]
expanded_contraction = contraction_mapping.get(match)\
if contraction_mapping.get(match)\
else contraction_mapping.get(match.lower())
expanded_contraction = first_char+expanded_contraction[1:]
return expanded_contraction
expanded_text = contractions_pattern.sub(expand_match, text)
expanded_text = re.sub("'", "", expanded_text)
return expanded_text
expand_contractions("Y'all can't expand contractions I'd think")
# ## Remove special characters
# In[6]:
def remove_special_characters(text, remove_digits=False):
    pattern = r'[^a-zA-Z0-9\s]' if not remove_digits else r'[^a-zA-Z\s]'
text = re.sub(pattern, '', text)
return text
remove_special_characters("Well this was fun! What do you think? 123#@!",
remove_digits=True)
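# should return: 'Well this was fun What do you think '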
# ## Text lemmatization
# In[7]:
def lemmatize_text(text):
text = nlp(text)
text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
return text
lemmatize_text("My system keeps crashing! his crashed yesterday, ours crashes daily")
# ## Text stemming
# In[8]:
def simple_stemmer(text):
ps = nltk.porter.PorterStemmer()
text = ' '.join([ps.stem(word) for word in text.split()])
return text
simple_stemmer("My system keeps crashing his crashed yesterday, ours crashes daily")
# ## Remove stopwords
# In[9]:
def remove_stopwords(text, is_lower_case=False):
tokens = tokenizer.tokenize(text)
tokens = [token.strip() for token in tokens]
if is_lower_case:
filtered_tokens = [token for token in tokens if token not in stopword_list]
else:
filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
filtered_text = ' '.join(filtered_tokens)
return filtered_text
remove_stopwords("The, and, if are stopwords, computer is not")
# ## Building a text normalizer
# In[10]:
def normalize_corpus(corpus, html_stripping=True, contraction_expansion=True,
accented_char_removal=True, text_lower_case=True,
text_lemmatization=True, special_char_removal=True,
stopword_removal=True, remove_digits=True):
normalized_corpus = []
# normalize each document in the corpus
for doc in corpus:
# strip HTML
if html_stripping:
doc = strip_html_tags(doc)
# remove accented characters
if accented_char_removal:
doc = remove_accented_chars(doc)
# expand contractions
if contraction_expansion:
doc = expand_contractions(doc)
# lowercase the text
if text_lower_case:
doc = doc.lower()
# remove extra newlines
        doc = re.sub(r'[\r\n]+', ' ', doc)
# lemmatize text
if text_lemmatization:
doc = lemmatize_text(doc)
# remove special characters and\or digits
if special_char_removal:
# insert spaces between special characters to isolate them
special_char_pattern = re.compile(r'([{.(-)!}])')
doc = special_char_pattern.sub(" \\1 ", doc)
doc = remove_special_characters(doc, remove_digits=remove_digits)
# remove extra whitespace
doc = re.sub(' +', ' ', doc)
# remove stopwords
if stopword_removal:
doc = remove_stopwords(doc, is_lower_case=text_lower_case)
normalized_corpus.append(doc)
return normalized_corpus
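# a quick end-to-end sanity check of the full pipeline (the expected output
# below is indicative; exact tokens depend on the spaCy model version):
normalize_corpus(["Héllo! I can't wait to <b>learn</b> NLP in 2018..."])
# e.g. -> ['hello not wait learn nlp']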
# ## Pre-process and normalize news articles
# In[16]:
news_df['full_text'] = news_df['news_headline'].map(str) + '. ' + news_df['news_article']
# In[442]:
news_df['clean_text'] = normalize_corpus(news_df['full_text'])
norm_corpus = list(news_df['clean_text'])
news_df.iloc[1][['full_text', 'clean_text']].to_dict()
# # Save the news articles
# In[443]:
news_df.to_csv('news.csv', index=False, encoding='utf-8')
# # Tagging Parts of Speech
# In[11]:
news_df = pd.read_csv('news.csv')
# In[12]:
corpus = normalize_corpus(news_df['full_text'], text_lower_case=False,
text_lemmatization=False, special_char_removal=False)
sentence = str(news_df.iloc[1].news_headline)
sentence_nlp = nlp(sentence)
# In[22]:
spacy_pos_tagged = [(word, word.tag_, word.pos_) for word in sentence_nlp]
pd.DataFrame(spacy_pos_tagged, columns=['Word', 'POS tag (fine)', 'POS tag (coarse)'])
# In[24]:
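# nltk.pos_tag needs the perceptron tagger model; fetch it once if missing:
# nltk.download('averaged_perceptron_tagger')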
nltk_pos_tagged = nltk.pos_tag(sentence.split())
pd.DataFrame(nltk_pos_tagged, columns=['Word', 'POS tag'])
# # Shallow Parsing or Chunking Text
# In[132]:
from nltk.corpus import conll2000
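# one-time setup: nltk.download('conll2000')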
data = conll2000.chunked_sents()
train_data = data[:10900]
test_data = data[10900:]
print(len(train_data), len(test_data))
print(train_data[1])
# In[133]:
from nltk.chunk.util import tree2conlltags, conlltags2tree
wtc = tree2conlltags(train_data[1])
wtc
# In[134]:
tree = conlltags2tree(wtc)
print(tree)
# In[135]:
def conll_tag_chunks(chunk_sents):
tagged_sents = [tree2conlltags(tree) for tree in chunk_sents]
return [[(t, c) for (w, t, c) in sent] for sent in tagged_sents]
def combined_tagger(train_data, taggers, backoff=None):
for tagger in taggers:
backoff = tagger(train_data, backoff=backoff)
return backoff
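# combined_tagger chains the taggers via backoff: with
# [UnigramTagger, BigramTagger], the returned bigram tagger consults bigram
# contexts first and falls back to the unigram tagger whenever it has no
# evidence for a context.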
# In[136]:
from nltk.tag import UnigramTagger, BigramTagger
from nltk.chunk import ChunkParserI
class NGramTagChunker(ChunkParserI):
def __init__(self, train_sentences,
tagger_classes=[UnigramTagger, BigramTagger]):
train_sent_tags = conll_tag_chunks(train_sentences)
self.chunk_tagger = combined_tagger(train_sent_tags, tagger_classes)
def parse(self, tagged_sentence):
if not tagged_sentence:
return None
pos_tags = [tag for word, tag in tagged_sentence]
chunk_pos_tags = self.chunk_tagger.tag(pos_tags)
chunk_tags = [chunk_tag for (pos_tag, chunk_tag) in chunk_pos_tags]
wpc_tags = [(word, pos_tag, chunk_tag) for ((word, pos_tag), chunk_tag)
in zip(tagged_sentence, chunk_tags)]
return conlltags2tree(wpc_tags)
# In[137]:
ntc = NGramTagChunker(train_data)
print(ntc.evaluate(test_data))
# In[152]:
chunk_tree = ntc.parse(nltk_pos_tagged)
print(chunk_tree)
# In[153]:
from IPython.display import display
# NLTK draws trees via Ghostscript; add its bin directory to PATH (Windows-specific local install)
os.environ['PATH'] = os.environ['PATH']+";C:\\Program Files\\gs\\gs9.09\\bin\\"
display(chunk_tree)
# # Constituency parsing
# In[446]:
# set java path
import os
java_path = r'C:\Program Files\Java\jdk1.8.0_102\bin\java.exe'
os.environ['JAVAHOME'] = java_path
from nltk.parse.stanford import StanfordParser
# paths below point to a local Stanford parser install; adjust for your machine
scp = StanfordParser(path_to_jar='E:/stanford/stanford-parser-full-2015-04-20/stanford-parser.jar',
                     path_to_models_jar='E:/stanford/stanford-parser-full-2015-04-20/stanford-parser-3.5.2-models.jar')
result = list(scp.raw_parse(sentence))
print(result[0])
# In[447]:
from IPython.display import display
os.environ['PATH'] = os.environ['PATH']+";C:\\Program Files\\gs\\gs9.09\\bin\\"
display(result[0])
# # Dependency parsing
# In[448]:
dependency_pattern = '{left}<---{word}[{w_type}]--->{right}\n--------'
for token in sentence_nlp:
print(dependency_pattern.format(word=token.orth_,
w_type=token.dep_,
left=[t.orth_
for t
in token.lefts],
right=[t.orth_
for t
in token.rights]))
# In[449]:
from spacy import displacy
displacy.render(sentence_nlp, jupyter=True,
options={'distance': 110,
'arrow_stroke': 2,
'arrow_width': 8})
# In[450]:
from nltk.parse.stanford import StanfordDependencyParser
sdp = StanfordDependencyParser(path_to_jar='E:/stanford/stanford-parser-full-2015-04-20/stanford-parser.jar',
path_to_models_jar='E:/stanford/stanford-parser-full-2015-04-20/stanford-parser-3.5.2-models.jar')
result = list(sdp.raw_parse(sentence))
dep_tree = result[0].tree()
print(dep_tree)
# In[451]:
from IPython.display import display
os.environ['PATH'] = os.environ['PATH']+";C:\\Program Files\\gs\\gs9.09\\bin\\"
display(dep_tree)
# In[452]:
from graphviz import Source
dep_tree_dot_repr = result[0].to_dot()
source = Source(dep_tree_dot_repr, filename="dep_tree", format="png")
source
# # Named Entity Recognition
# In[453]:
sentence = str(news_df.iloc[1].full_text)
sentence_nlp = nlp(sentence)
# In[454]:
print([(word, word.ent_type_) for word in sentence_nlp if word.ent_type_])
# In[455]:
displacy.render(sentence_nlp, style='ent', jupyter=True)
# In[19]:
named_entities = []
for sentence in corpus:
    temp_entity_name = ''
    temp_named_entity = None
    sentence = nlp(sentence)
    for word in sentence:
        term = word.text
        tag = word.ent_type_
        if tag:
            temp_entity_name = ' '.join([temp_entity_name, term]).strip()
            temp_named_entity = (temp_entity_name, tag)
        else:
            if temp_named_entity:
                named_entities.append(temp_named_entity)
                temp_entity_name = ''
                temp_named_entity = None
    # keep an entity that runs to the end of the sentence
    if temp_named_entity:
        named_entities.append(temp_named_entity)
entity_frame = pd.DataFrame(named_entities,
columns=['Entity Name', 'Entity Type'])
# In[24]:
top_entities = (entity_frame.groupby(by=['Entity Name', 'Entity Type'])
.size()
.sort_values(ascending=False)
.reset_index().rename(columns={0 : 'Frequency'}))
top_entities.T.iloc[:,:15]
# In[26]:
top_entities = (entity_frame.groupby(by=['Entity Type'])
.size()
.sort_values(ascending=False)
.reset_index().rename(columns={0 : 'Frequency'}))
top_entities.T.iloc[:,:15]
# In[27]:
from nltk.tag import StanfordNERTagger
import os
java_path = r'C:\Program Files\Java\jdk1.8.0_102\bin\java.exe'
os.environ['JAVAHOME'] = java_path
sn = StanfordNERTagger('E:/stanford/stanford-ner-2014-08-27/classifiers/english.all.3class.distsim.crf.ser.gz',
path_to_jar='E:/stanford/stanford-ner-2014-08-27/stanford-ner.jar')
ner_tagged_sentences = [sn.tag(sent.split()) for sent in corpus]
# In[28]:
named_entities = []
for sentence in ner_tagged_sentences:
    temp_entity_name = ''
    temp_named_entity = None
    for term, tag in sentence:
        if tag != 'O':
            temp_entity_name = ' '.join([temp_entity_name, term]).strip()
            temp_named_entity = (temp_entity_name, tag)
        else:
            if temp_named_entity:
                named_entities.append(temp_named_entity)
                temp_entity_name = ''
                temp_named_entity = None
    # keep an entity that runs to the end of the sentence
    if temp_named_entity:
        named_entities.append(temp_named_entity)
#named_entities = list(set(named_entities))
entity_frame = pd.DataFrame(named_entities,
columns=['Entity Name', 'Entity Type'])
# In[30]:
top_entities = (entity_frame.groupby(by=['Entity Name', 'Entity Type'])
.size()
.sort_values(ascending=False)
.reset_index().rename(columns={0 : 'Frequency'}))
top_entities.head(15)
# In[462]:
top_entities = (entity_frame.groupby(by=['Entity Type'])
.size()
.sort_values(ascending=False)
.reset_index().rename(columns={0 : 'Frequency'}))
top_entities.head()
# # Emotion and Sentiment Analysis
# In[13]:
from afinn import Afinn
af = Afinn()
# In[14]:
sentiment_scores = [af.score(article) for article in corpus]
sentiment_category = ['positive' if score > 0
else 'negative' if score < 0
else 'neutral'
for score in sentiment_scores]
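# AFINN scores words from -5 to +5 and sums them per document; e.g., with the
# bundled AFINN-111 lexicon, af.score('good') should return 3.0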
# In[15]:
df = pd.DataFrame([list(news_df['news_category']), sentiment_scores, sentiment_category]).T
df.columns = ['news_category', 'sentiment_score', 'sentiment_category']
df['sentiment_score'] = df.sentiment_score.astype('float')
df.groupby(by=['news_category']).describe()
# In[39]:
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 4))
sp = sns.stripplot(x='news_category', y="sentiment_score",
hue='news_category', data=df, ax=ax1)
bp = sns.boxplot(x='news_category', y="sentiment_score",
hue='news_category', data=df, palette="Set2", ax=ax2)
t = f.suptitle('Visualizing News Sentiment', fontsize=14)
# In[40]:
# note: sns.factorplot was renamed to sns.catplot in seaborn >= 0.9
fc = sns.factorplot(x="news_category", hue="sentiment_category",
data=df, kind="count",
palette={"negative": "#FE2020",
"positive": "#BADD07",
"neutral": "#68BFF5"})
# In[41]:
# note: the score values (6 and -15) are specific to the articles scraped for
# this run; inspect df.sentiment_score to pick the extremes for your own data
pos_idx = df[(df.news_category=='technology') & (df.sentiment_score == 6)].index[0]
neg_idx = df[(df.news_category=='technology') & (df.sentiment_score == -15)].index[0]
print('Most Negative Tech News Article:', news_df.iloc[neg_idx]['news_article'])
print()
print('Most Positive Tech News Article:', news_df.iloc[pos_idx]['news_article'])
# In[42]:
pos_idx = df[(df.news_category=='world') & (df.sentiment_score == 16)].index[0]
neg_idx = df[(df.news_category=='world') & (df.sentiment_score == -12)].index[0]
print('Most Negative World News Article:', news_df.iloc[neg_idx]['news_article'])
print()
print('Most Positive World News Article:', news_df.iloc[pos_idx]['news_article'])
# In[16]:
from textblob import TextBlob
sentiment_scores_tb = [round(TextBlob(article).sentiment.polarity, 3) for article in news_df['clean_text']]
sentiment_category_tb = ['positive' if score > 0
else 'negative' if score < 0
else 'neutral'
for score in sentiment_scores_tb]
# In[17]:
df = pd.DataFrame([list(news_df['news_category']), sentiment_scores_tb, sentiment_category_tb]).T
df.columns = ['news_category', 'sentiment_score', 'sentiment_category']
df['sentiment_score'] = df.sentiment_score.astype('float')
df.groupby(by=['news_category']).describe()
# In[18]:
df.head()
# In[74]:
fc = sns.factorplot(x="news_category", hue="sentiment_category",
data=df, kind="count",
palette={"negative": "#FE2020",
"positive": "#BADD07",
"neutral": "#68BFF5"})
# In[75]:
pos_idx = df[(df.news_category=='world') & (df.sentiment_score == 0.7)].index[0]
neg_idx = df[(df.news_category=='world') & (df.sentiment_score == -0.296)].index[0]
print('Most Negative World News Article:', news_df.iloc[neg_idx]['news_article'])
print()
print('Most Positive World News Article:', news_df.iloc[pos_idx]['news_article'])
# In[20]:
# model_evaluation_utils is a local helper module (not a PyPI package);
# keep it alongside this notebook
import model_evaluation_utils as meu
meu.display_confusion_matrix_pretty(true_labels=sentiment_category,
                                    predicted_labels=sentiment_category_tb,
                                    classes=['negative', 'neutral', 'positive'])