#!/usr/bin/env python
# coding: utf-8

# # Import Libraries

# In[1]:

import tweepy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image  # scipy.misc.imread was removed in SciPy 1.2; PIL loads the mask instead
from wordcloud import WordCloud, STOPWORDS
from textblob import TextBlob

# # Authentication

# In[2]:

consumer_key = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
consumer_secret = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
access_token = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx-xxxxxxxxxxxxxxxxxxxxxxxxxxx'
access_token_secret = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)  # interacting with Twitter's API
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)  # creating the API object

# # Extracting Tweets

# In[3]:

# Collect up to 2,000 English tweets mentioning "millennials".
# Note: tweepy >= 4 renamed api.search to api.search_tweets.
results = []
for tweet in tweepy.Cursor(api.search, q='millennials', lang='en').items(2000):
    results.append(tweet)

print(type(results))
print(len(results))
# print(results[0].text)  # inspect a sample tweet

# # Store Data in a DataFrame

# In[4]:

def tweets_df(results):
    """Flatten the list of Status objects into a DataFrame, one row per tweet."""
    id_list = [tweet.id for tweet in results]
    data_set = pd.DataFrame(id_list, columns=["id"])
    data_set["text"] = [tweet.text for tweet in results]
    data_set["created_at"] = [tweet.created_at for tweet in results]
    data_set["retweet_count"] = [tweet.retweet_count for tweet in results]
    data_set["user_screen_name"] = [tweet.author.screen_name for tweet in results]
    data_set["user_followers_count"] = [tweet.author.followers_count for tweet in results]
    data_set["user_location"] = [tweet.author.location for tweet in results]
    data_set["Hashtags"] = [tweet.entities.get('hashtags') for tweet in results]
    return data_set

data_set = tweets_df(results)

# # Remove Duplicate Tweets

# In[5]:

# Strip URLs from each tweet, then drop rows whose remaining text is identical
# (retweets of the same status often differ only in the shortened link).
text = data_set["text"]
for i in range(0, len(text)):
    txt = ' '.join(word for word in text[i].split() if not word.startswith('https:'))
    data_set.at[i, 'text2'] = txt  # DataFrame.set_value was removed in pandas 1.0; .at replaces it

data_set.drop_duplicates('text2', inplace=True)
data_set.reset_index(drop=True, inplace=True)
data_set.drop('text', axis=1, inplace=True)
data_set.rename(columns={'text2': 'text'}, inplace=True)

# # Sentiment Analysis of Tweets

# In[6]:

# Score each tweet's polarity in [-1, 1] with TextBlob's default analyzer
# and bucket it into Negative / Neutral / Positive.
text = data_set["text"]
for i in range(0, len(text)):
    sentiment = TextBlob(text[i]).sentiment.polarity
    data_set.at[i, 'Sentiment'] = sentiment
    if sentiment < 0.00:
        data_set.at[i, 'SentimentClass'] = 'Negative'
    elif sentiment > 0.00:
        data_set.at[i, 'SentimentClass'] = 'Positive'
    else:
        data_set.at[i, 'SentimentClass'] = 'Neutral'

# In[7]:

data_set.to_csv("C:\\Users\\kdudani\\Desktop\\Personal\\Social Media Analytics\\Millennials.csv")

# # Extract All Hashtags for All Tweets

# In[8]:

# One row per hashtag occurrence across all tweets; .get(..., []) guards
# against statuses with no 'hashtags' entry.
hashtags = [h['text'] for tweet in results for h in tweet.entities.get('hashtags', [])]
Htag_df = pd.DataFrame({'Hashtag': hashtags})

# In[9]:

Htag_df

# In[16]:

Millennials_Htag_wordcloud = Htag_df.groupby('Hashtag').size()
Millennials_Htag_wordcloud.to_csv("C:\\Users\\kdudani\\Desktop\\Personal\\Social Media Analytics\\Millennials_Htag_wordcloud.csv")
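# # Plot Top Hashtags

# The word cloud below gives a qualitative view; as a quick quantitative check,
# the most frequent hashtags can be plotted first. A minimal sketch, assuming
# Htag_df is populated as above; top-10 is an arbitrary cutoff for illustration.

# In[ ]:

top_hashtags = Htag_df['Hashtag'].value_counts().head(10)  # ten most common hashtags
top_hashtags.plot(kind='barh')  # horizontal bars keep long tags readable
plt.xlabel('Number of occurrences')
plt.title('Top 10 hashtags in tweets mentioning "millennials"')
plt.tight_layout()
plt.show()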
# In[10]:

# Join all hashtags into one string, dropping the search terms themselves
# so they do not dominate the word cloud.
Hashtag_Combined = " ".join(Htag_df['Hashtag'].values.astype(str))
no_millennials = " ".join([word for word in Hashtag_Combined.split()
                           if word not in ('millennials', 'Millennials', 'Boomers', 'GenX')])

# Load the Twitter-bird mask as a grayscale array (replaces the removed
# scipy.misc.imread(..., flatten=True)).
Tweet_mask = np.array(Image.open("C:\\Users\\kdudani\\Desktop\\Personal\\Social Media Analytics\\twitter_mask.png").convert('L'))

# Create a word cloud shaped by the mask
wc = WordCloud(background_color="white", stopwords=STOPWORDS, mask=Tweet_mask)
wc.generate(no_millennials)

plt.imshow(wc)
plt.axis("off")
plt.savefig('C:\\Users\\kdudani\\Desktop\\Personal\\Social Media Analytics\\millennials_Hashtag.png', dpi=300)
plt.show()
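# # Plot Sentiment Distribution

# The sentiment labels from In[6] are written to CSV but never visualised; a bar
# chart of the class counts is a natural closing step. A minimal sketch, assuming
# data_set still holds the 'SentimentClass' column created above.

# In[ ]:

sentiment_counts = data_set['SentimentClass'].value_counts()  # tweets per class
sentiment_counts.plot(kind='bar')
plt.ylabel('Number of tweets')
plt.title('Sentiment of tweets mentioning "millennials"')
plt.tight_layout()
plt.show()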