#!/usr/bin/env python
# coding: utf-8

# # Project: Wrangling and Analysis of 'WeRateDogs' Tweet Archive Data

# ## Table of contents
#
# - Introduction
# - Data Gathering
# - Assessing Data
# - Cleaning Data
# - Storing Data
# - Analyzing and Visualizing Data
# ## Introduction
#
# This notebook documents the wrangling and analysis of the tweet archive of 'WeRateDogs', a popular dog-rating page on Twitter. It demonstrates the gathering, assessment and cleaning of the data using several methods, all as part of the data wrangling process, and then analyzes the cleaned data, documenting the resulting insights and visualizations.
#
# ### Data description
# The data used in this project includes:
#
# #### 1. A file on hand containing tweet archive data, with the following columns:
#
# ##### tweet_id:
# unique identifier of a particular Tweet.
#
# ##### in_reply_to_status_id:
# if the represented Tweet is a reply, this field contains the integer representation of the original Tweet's ID.
#
# ##### in_reply_to_user_id:
# if the represented Tweet is a reply, this field contains the integer representation of the original Tweet's author ID.
#
# ##### timestamp:
# the date and time at which the tweet was posted.
#
# ##### source:
# utility used to post the tweet, as an HTML-formatted string.
#
# ##### text:
# the actual UTF-8 text of the tweet.
#
# ##### retweeted_status_id:
# if the represented Tweet is a retweet, this field contains the integer representation of the original Tweet's ID.
#
# ##### retweeted_status_user_id:
# if the represented Tweet is a retweet, this field contains the integer representation of the original Tweet's author ID.
#
# ##### retweeted_status_timestamp:
# the date and time at which the retweet was posted.
#
# ##### expanded_urls:
# the full URL link of the tweet.
#
# ##### rating_numerator:
# the integer representation of the dog rating.
#
# ##### rating_denominator:
# the integer representation of the overall value the rating is out of.
#
# ##### name:
# the name of the dog.
#
# ##### doggo:
# a big pupper, usually older.
#
# ##### floofer:
# label given to a dog that is excessively furry.
#
# ##### pupper:
# a small doggo, usually younger.
#
# ##### puppo:
# a transitional phase between pupper and doggo.
#
# #### 2. A tweet image file containing tweet image prediction data, with the following columns:
#
# ##### tweet_id:
# unique identifier of a particular Tweet.
# ##### jpg_url:
# URL link to the image associated with the given Tweet.
# ##### img_num:
# since a tweet can have multiple images, this indicates the number of the image corresponding to the most confident prediction.
# ##### p1:
# the algorithm's #1 prediction for the image in the tweet.
# ##### p1_conf:
# how confident the algorithm is in its #1 prediction.
# ##### p1_dog:
# whether or not the #1 prediction is a breed of dog.
# ##### p2:
# the algorithm's second most likely prediction.
# ##### p2_conf:
# how confident the algorithm is in its #2 prediction.
# ##### p2_dog:
# whether or not the #2 prediction is a breed of dog.
# ##### p3:
# the algorithm's third most likely prediction.
# ##### p3_conf:
# how confident the algorithm is in its #3 prediction.
# ##### p3_dog:
# whether or not the #3 prediction is a breed of dog.
#
# #### 3. Tweet retweet count and favorite count data, with the following columns:
#
# ##### tweet_id:
# unique identifier of a particular Tweet.
# ##### retweet_count:
# the number of times a Tweet has been retweeted.
# ##### favorite_count:
# the number of times a Tweet has been favorited.

# In[1]:

# installing tweepy into the environment
get_ipython().system('pip install tweepy')

# In[46]:

# importing all the packages that will be required.
import pandas as pd
import requests
import tweepy
import json
import numpy as np
import re
import functools
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
import seaborn as sns

# ## Data Gathering

# 1. Directly downloading the WeRateDogs Twitter archive data (twitter_archive_enhanced.csv)

# In[47]:

# reading twitter-archive-enhanced.csv into a dataframe
tweet_archive = pd.read_csv('twitter-archive-enhanced.csv')
tweet_archive.head(3)

# 2. Using the Requests library to download the tweet image prediction data (image_predictions.tsv)

# In[4]:

# programmatically downloading the image_predictions.tsv file
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
response = requests.get(url)

# In[5]:

# saving the contents to the computer
with open('image_predictions.tsv', mode='wb') as file:
    file.write(response.content)

# In[48]:

image_df = pd.read_csv('image_predictions.tsv', sep='\t')
image_df.head(3)

# 3. Using the Tweepy library to query additional data via the Twitter API (tweet_json.txt)

# In[4]:

# API credentials
api_key = "YOUR API KEY HERE"
api_secrets = "YOUR API SECRET KEY HERE"
access_token = "YOUR ACCESS TOKEN KEY HERE"
access_secret = "YOUR ACCESS TOKEN SECRET HERE"

# Authenticate to Twitter
auth = tweepy.OAuthHandler(api_key, api_secrets)
auth.set_access_token(access_token, access_secret)

api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

# In[6]:

# create a list of tweet ids from the tweet_archive tweet_id column
tweet_ids = []
for id in tweet_archive.tweet_id:
    tweet_ids.append(str(id))

# In[10]:

print(len(tweet_ids))

# In[7]:

# create empty list for available tweets
tweets = []
# create empty list for unavailable tweets
unavailable_tweets = []

# gather each tweet's json data by id
for id in tweet_ids:
    try:
        tweet = (api.get_status(id))._json
        tweets.append({'tweet_id': tweet['id'],
                       'retweet_count': tweet['retweet_count'],
                       'favorite_count': tweet['favorite_count']})
    except:
        unavailable_tweets.append(id)

# write the gathered counts to a tab-separated text file
with open('tweet_json.txt', mode='w') as file:
    for i in range(len(tweets)):
        file.write(json.dumps(tweets[i]['tweet_id']))
        file.write('\t')
        file.write(json.dumps(tweets[i]['retweet_count']))
        file.write('\t')
        file.write(json.dumps(tweets[i]['favorite_count']))
        file.write('\n')

# In[34]:

# check the available attributes of the json data retrieved from the api
list(tweet.keys())

# In[49]:

# reading the 'tweet_json.txt' file into a dataframe
tweet_counts = pd.read_csv('tweet_json.txt', sep='\t', header=None,
                           names=['tweet_id', 'retweet_count', 'favorite_count'])
tweet_counts.head()

# In[9]:

tweet_counts.shape
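# The gathering loop above makes one API call per tweet id, which is slow under Twitter's rate
# limits. As an optional speed-up, Twitter's statuses/lookup endpoint accepts up to 100 ids per
# call. The cell below is a minimal sketch of that alternative, assuming Tweepy 3.x (the same
# version the authentication cell targets), where the method is exposed as API.statuses_lookup;
# note that deleted or protected tweets are simply omitted from the response rather than raising.

# In[ ]:

# batched alternative (sketch): fetch statuses 100 ids at a time
batched_tweets = []
for start in range(0, len(tweet_ids), 100):
    batch = tweet_ids[start:start + 100]
    for status in api.statuses_lookup(batch):
        t = status._json
        batched_tweets.append({'tweet_id': t['id'],
                               'retweet_count': t['retweet_count'],
                               'favorite_count': t['favorite_count']})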
# ## Assessing Data

# #### 1. tweet_archive dataframe

# In[22]:

tweet_archive

# In[18]:

tweet_archive.sample(10)

# In[24]:

tweet_archive.info()

# In[9]:

tweet_archive.isnull()

# In[13]:

tweet_archive.name

# In[14]:

tweet_archive.name.value_counts()

# In[50]:

# collect the lowercase 'names', which are not valid dog names
words = []
for n in tweet_archive.name:
    if n[0].islower():
        words.append(n)
other_words = list(np.unique(words))
print(other_words)

# In[5]:

tweet_archive.pupper.value_counts()

# In[6]:

tweet_archive.doggo.value_counts()

# In[35]:

sum(tweet_archive.duplicated())

# #### 2. image_df dataframe

# In[40]:

image_df

# In[41]:

image_df.sample(5)

# In[42]:

sum(image_df.duplicated())

# In[43]:

image_df.info()

# In[ ]:

# checking for non-original tweets' tweet ids in the image_df dataframe.

# In[7]:

img_list = list(image_df.tweet_id)

# In[8]:

# takes approximately 30 mins
# create empty list for unoriginal tweets
unoriginal_tweets = []
# create empty list for unavailable tweets
unavailable_tweets = []
# create empty list for original tweets
original_tweets = []

# gather each tweet's json data by id and classify the tweet:
# replies have a non-null in_reply_to_status_id; retweets carry a 'retweeted_status' key
for id in img_list:
    try:
        img_tweet = (api.get_status(id))._json
        if not pd.isna(img_tweet['in_reply_to_status_id']) or 'retweeted_status' in img_tweet.keys():
            unoriginal_tweets.append(id)
        else:
            original_tweets.append(id)
    except:
        unavailable_tweets.append(id)

# In[10]:

print(unoriginal_tweets)

# In[9]:

print(unavailable_tweets)

# #### 3. tweet_counts dataframe

# In[46]:

tweet_counts.head()

# In[47]:

tweet_counts.info()

# In[48]:

tweet_counts.describe()

# In[49]:

sum(tweet_counts.duplicated())

# In[7]:

# check for column names shared across the three tables
all_columns = pd.Series(list(tweet_archive) + list(tweet_counts) + list(image_df))
all_columns[all_columns.duplicated()]

# ### Quality issues
# #### `tweet_archive table`
#
# 1. 'name', 'doggo', 'floofer', 'pupper' and 'puppo' columns have missing values misrepresented as strings ('None')
#
# 2. 'name' column has invalid words (adjectives, articles, adverbs) as values in some entries
#
# 3. 'name', 'doggo', 'floofer', 'pupper' and 'puppo' columns have missing values
#
# 4. presence of some records with non-null values in the 'retweeted_status_id' column; these are not original tweets
#
# 5. presence of some records with non-null values in the 'in_reply_to_status_id' column; these are not original tweets
#
# 6. records dating beyond 2017-08-01
#
# 7. 'timestamp' column is an object datatype
#
# #### `image_df table`
#
# 8. records with 'False' values in all 3 of the dog prediction columns (p1_dog, p2_dog, p3_dog); these are not dogs
#
# 9. non-original tweet records

# ### Tidiness issues
# 1. doggo, pupper, puppo and floofer are represented as separate column headers instead of values of a single column (tweet_archive table)
#
# 2. dog breeds are spread out across separate columns (image_df table)
#
# 3. data is in separate tables.

# ## Cleaning Data

# In[109]:

# Make copies of the original pieces of data
tweetarch_clean = tweet_archive.copy()
img_clean = image_df.copy()
tweetcounts_clean = tweet_counts.copy()

# ### Quality issue #1:
# #### tweet_archive table:
# #### 'name', 'doggo', 'floofer', 'pupper' and 'puppo' columns have missing values misrepresented as strings ('None')
#
# ### Quality issue #2:
# #### tweet_archive table:
# #### 'name' column has invalid words (adjectives, articles, adverbs) as values in some entries

# #### Define:
# Use the .replace() function to replace the values identified during assessment with null values. This fixes both issues 1 and 2.

# #### Code:

# In[110]:

other_words.append('None')
for word in other_words:
    tweetarch_clean.replace(to_replace=word, value=np.nan, inplace=True)

# #### Test:

# In[53]:

tweetarch_clean.isnull().sample(10)

# In[54]:

tweetarch_clean.name.value_counts()

# In[111]:

tweetarch_clean.query(f'name == {other_words}')
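# A narrower variant of the same fix, as a sketch rather than part of the cleaning run above:
# a frame-wide replace() matches any cell that exactly equals one of the words, so the
# replacement can instead be restricted to the columns the two issues actually concern,
# assuming the same other_words list built during assessment.

# In[ ]:

# replace the 'None' placeholders only in the dog stage columns,
# and the invalid words (plus 'None') only in the name column
stage_cols = ['doggo', 'floofer', 'pupper', 'puppo']
tweetarch_clean[stage_cols] = tweetarch_clean[stage_cols].replace('None', np.nan)
tweetarch_clean['name'] = tweetarch_clean['name'].replace(other_words, np.nan)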
# ### Quality issue #3:
# #### tweet_archive table:
# #### 'name', 'doggo', 'floofer', 'pupper' and 'puppo' columns have missing values.

# #### Define:
# Use the values of the text column and regex methods to extract the required values for these columns.

# #### Code:

# In[112]:

# create a function to find and replace null values in the dog stage columns
def fill_dog_stage(stage):
    '''Loop through all the values of the specified column and search for null values.
    For each null value, search the corresponding value of the text column for the
    stage word and, if found, place it into the specified position.'''
    indices = list(range(len(tweetarch_clean[stage])))
    for i in indices:
        if pd.isna(tweetarch_clean[stage][i]):
            try:
                tweetarch_clean.loc[i, stage] = re.findall(stage, tweetarch_clean.text[i])[0]
            except:
                tweetarch_clean.loc[i, stage] = np.nan

# In[113]:

# create a list of the dog stages and loop through each one, passing it to the function
stages = ['pupper', 'puppo', 'doggo', 'floofer']
for stage in stages:
    fill_dog_stage(stage)

# #### Test:

# In[114]:

tweetarch_clean.info()

# ### Tidiness issue #1:
# #### tweet_archive table:
# #### doggo, pupper, puppo and floofer are represented as separate column headers instead of values of a single column

# #### Define:
# Create a new dog stage column and loop through each of the four stage columns for the appropriate value, unpivoting them into the new column. Drop the four columns when done.

# #### Code

# In[115]:

dog_stages = []
indices = list(range(len(tweetarch_clean)))
for i in indices:
    if not pd.isna(tweetarch_clean.floofer[i]):
        dog_stages.append('floofer')
    elif not pd.isna(tweetarch_clean.puppo[i]):
        dog_stages.append('puppo')
    elif not pd.isna(tweetarch_clean.pupper[i]):
        dog_stages.append('pupper')
    elif not pd.isna(tweetarch_clean.doggo[i]):
        dog_stages.append('doggo')
    else:
        dog_stages.append(np.nan)

# In[116]:

tweetarch_clean['dog_stage'] = dog_stages

# In[117]:

tweetarch_clean = tweetarch_clean.drop(['pupper', 'doggo', 'puppo', 'floofer'], axis=1)

# #### Test

# In[118]:

tweetarch_clean.dog_stage.value_counts()

# Note that some records had values for two stages at once: doggo & pupper, doggo & puppo, or doggo & floofer. For streamlining purposes, one had to be chosen over the other, hence the slight changes in some of the dog stage value counts. I chose the other stages over 'doggo', placing it last in the loop statement, because in my judgement this term is used more loosely than the rest.

# In[119]:

tweetarch_clean.columns

# ### Quality issue #9:
# #### image_df table:
# #### non-original tweet records

# #### Define:
# Delete all the non-original tweets using the original_tweets ids list gathered during assessment (an equivalent vectorized alternative is sketched after the test).

# #### Code:

# In[120]:

for id in img_clean.tweet_id:
    if id not in original_tweets:
        img_clean.drop(img_clean[img_clean['tweet_id'] == id].index, inplace=True)

# In[124]:

# reset the index (drop=True so the old index is not kept as a column)
img_clean.reset_index(drop=True, inplace=True)

# #### Test:

# In[125]:

# should evaluate to True
list(img_clean.tweet_id) == original_tweets
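# The loop above works but rescans the frame once per id. A vectorized equivalent (a sketch,
# assuming the same original_tweets list from assessment) keeps only the rows whose tweet_id
# appears in the list, using a boolean mask:

# In[ ]:

# keep only the original tweets in one vectorized step
img_clean = img_clean[img_clean['tweet_id'].isin(original_tweets)].reset_index(drop=True)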
# ### Tidiness issue #2:
# #### image_df table:
# #### dog breeds are spread out across separate columns
#
# ### Quality issue #8:
# #### image_df table:
# #### records with 'False' values in all 3 of the dog prediction columns (p1_dog, p2_dog, p3_dog); these are not dogs

# #### Define:
# Create a new column for dog breeds and use the p and p_dog columns to populate it, then drop all rows with null values in the dog breeds column. This also takes care of issue `#8`.
# Then drop the columns that will no longer be needed.

# #### Code:

# In[126]:

# take the most confident prediction that is actually a dog breed
breeds = []
indices = list(range(len(img_clean.tweet_id)))
for i in indices:
    if img_clean.p1_dog[i]:
        breeds.append(img_clean.p1[i])
    elif img_clean.p2_dog[i]:
        breeds.append(img_clean.p2[i])
    elif img_clean.p3_dog[i]:
        breeds.append(img_clean.p3[i])
    else:
        breeds.append(np.nan)

# In[127]:

img_clean['dog_breed'] = breeds

# In[128]:

# drop the rows where no prediction was a dog breed
img_clean.dropna(inplace=True)

# In[129]:

img_clean = img_clean.drop(['img_num', 'p1', 'p1_conf', 'p1_dog', 'p2', 'p2_conf', 'p2_dog',
                            'p3', 'p3_conf', 'p3_dog'], axis=1)

# #### Test:

# In[130]:

img_clean.columns

# ### Quality issue #4:
# #### tweet_archive table:
# #### presence of some records with non-null values in the 'retweeted_status_id' column; these are not original tweets
#
# ### Quality issue #5:
# #### tweet_archive table:
# #### presence of some records with non-null values in the 'in_reply_to_status_id' column; these are not original tweets

# #### Define:
# Filter out these records using boolean indexing. Afterwards, drop the associated columns and any other columns that will not be used further.

# #### Code:

# In[131]:

tweetarch_clean = tweetarch_clean.loc[pd.isna(tweetarch_clean['in_reply_to_status_id'])]

# In[132]:

tweetarch_clean = tweetarch_clean.loc[pd.isna(tweetarch_clean['retweeted_status_id'])]

# #### Test:

# In[133]:

tweetarch_clean.info()

# #### Code:

# In[134]:

tweetarch_clean = tweetarch_clean.drop(['in_reply_to_status_id', 'in_reply_to_user_id',
                                        'retweeted_status_id', 'retweeted_status_user_id',
                                        'retweeted_status_timestamp', 'source',
                                        'expanded_urls'], axis=1)

# #### Test:

# In[135]:

tweetarch_clean.info()

# ### Tidiness issue #3:
# #### Data is in separate tables

# #### Define:
# Merge the three dataframes on the tweet_id column to make one master dataframe.

# #### Code:

# In[136]:

# successively left-join the three tables on tweet_id, keeping img_clean's rows as the base
data_frames = [img_clean, tweetarch_clean, tweetcounts_clean]
master_df = functools.reduce(lambda left, right: pd.merge(left, right, on=['tweet_id'], how='left'), data_frames)

# #### Test:

# In[29]:

master_df

# ### Issue #6:
# #### records dating beyond 2017-08-01
#
# ### Issue #7:
# #### 'timestamp' column is an object datatype

# #### Define:
# Merging the data sets filters out all records dating beyond 2017-08-01, fixing issue `#6`: the left merge keeps only rows present in img_clean, whose predictions only cover tweets up to that date. This also makes issue `#7` a non-issue, because the column will be dropped; the main reason for converting its datatype would have been to filter out the records dating beyond the specified date.
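# As a quick sanity check before the column is dropped (a minimal sketch, assuming the
# timestamp strings parse as UTC datetimes), we can confirm that no records dated after
# 2017-08-01 survived the merge:

# In[ ]:

# should evaluate to True: the latest timestamp falls on or before 2017-08-01
pd.to_datetime(master_df['timestamp'], utc=True).max() < pd.Timestamp('2017-08-02', tz='UTC')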
# #### Code:

# In[137]:

master_df = master_df.drop(['timestamp'], axis=1)

# #### Test:

# In[138]:

master_df.columns

# ## Storing Data

# In[139]:

# storing the cleaned master dataframe in a csv file
master_df.to_csv('twitter_archive_master.csv', index=False)

# ## Analyzing and Visualizing Data

# ### Questions for analysis
#
# 1. What is the lowest dog rating on WeRateDogs?
# 2. Which tweet has the lowest dog rating on WeRateDogs?
# 3. Which tweet has the highest retweet count?
# 4. What are some of the most common dog names?
# 5. What are the most popular dog breeds by favorite count?
# 6. Are WeRateDogs tweets more likely to be favorited or retweeted?
# 7. Describe the correlation between retweet count and favorite count.
# 8. Describe the correlation between dog rating and retweet count.
# 9. Describe the correlation between dog rating and favorite count.

# In[140]:

# importing the stored data into a dataframe
tweets = pd.read_csv('twitter_archive_master.csv')

# In[141]:

# checking summary statistics for the data
tweets.describe()

# In[142]:

# checking the denominator values
tweets.rating_denominator.value_counts()

# Since the values of the denominator vary, create an additional percentage column to standardize the ratings for analysis.

# In[143]:

# creating a percentage rating column
tweets['rating'] = (tweets.rating_numerator / tweets.rating_denominator) * 100

# ### Insights:

# ##### 1. What is the lowest dog rating on WeRateDogs?

# In[144]:

tweets.rating.min()

# The lowest dog rating given on WeRateDogs is 0.

# ##### 2. Which tweet has the lowest dog rating on WeRateDogs?

# In[146]:

# tweet with the lowest dog rating
tweets.loc[tweets.rating == tweets.rating.min()]

# In[147]:

# getting the tweet text
tweets.loc[tweets.rating == tweets.rating.min()].text[1455]

# In[148]:

# getting the tweet image url for download
tweets.loc[tweets.rating == tweets.rating.min()].jpg_url[1455]

# The tweet with the lowest rating on WeRateDogs, at 0/10, is of an American Staffordshire terrier and states:
#
# "When you're so blinded by your systematic plagiarism that you forget what day it is. 0/10"
# ![Terrier.jpg](attachment:Terrier.jpg)
#
# This tweet nevertheless has a favorite count of 20,923, more than double the average favorite count, and a retweet count of 2,755, still slightly higher than the average retweet count.

# ##### 3. Which tweet has the highest retweet count?

# In[149]:

# tweet with the highest retweet count
tweets.loc[tweets.retweet_count == tweets.retweet_count.max()]

# In[150]:

# getting the tweet text
tweets.loc[tweets.retweet_count == tweets.retweet_count.max()].text[978]

# In[151]:

# getting the tweet image url for download
tweets.loc[tweets.retweet_count == tweets.retweet_count.max()].jpg_url[978]

# The tweet with the highest retweet count, at 70,429 retweets, which also happens to have the highest favorite count at 144,312 likes, is of a Labrador retriever and states:
#
# "Here's a doggo realizing you can stand in a pool. 13/10 enlightened af"
# ![Labrador_retriever.jpg](attachment:Labrador_retriever.jpg)

# ##### 4. What are some of the most common dog names?

# In[152]:

tweets.name.value_counts()

# The most common dog name on WeRateDogs is Cooper, shared by 10 dogs, followed by Charlie, Lucy, Tucker and Oliver with 9 dogs each, and Penny with 8 dogs.

# ##### 5. What are the most popular dog breeds by favorite count?

# In[153]:

tweets.groupby('dog_breed').favorite_count.sum().sort_values(ascending=False)

# The top 5 most popular dog breeds on WeRateDogs, in descending order, are:
#
# 1. Golden retrievers, with 1,639,666 total favorite counts
#
# 2. Labrador retrievers, at 1,028,020 favorite counts
#
# 3. Pembrokes, at 902,671 favorite counts
#
# 4. Chihuahuas, at 664,894 favorite counts
#
# 5. French bulldogs, at 524,718 favorite counts

# ##### 6. Are WeRateDogs tweets more likely to be retweeted or favorited?

# In[154]:

print(f'Total number of retweets: {tweets.retweet_count.sum()}')
print(f'Total number of likes: {tweets.favorite_count.sum()}')
print(f'Difference: {tweets.favorite_count.sum() - tweets.retweet_count.sum()}')

# WeRateDogs tweets are more likely to be favorited, gathering about 9.4 million more favorites than retweets in total.

# ### Visualizations

# ##### 1. Describe the correlation between retweet count and favorite count.
# In[155]:

# scatter plot to show the relationship
tweets.plot(x='retweet_count', y='favorite_count', kind='scatter');
plt.title('Correlation between tweet retweet count and tweet favorite count');

# There is generally a positive correlation between retweet count and favorite count: as a tweet's favorite count increases, its retweet count is also highly likely to increase.

# ##### 2. Describe the correlation between dog rating and retweet count.

# In[156]:

tweets.plot(x='retweet_count', y='rating', kind='scatter');
plt.title('Correlation between tweet retweet count and dog rating');

# The scatter plot suggests an essentially horizontal line of best fit, indicating there is no correlation between dog rating and retweet count. The rating given to a dog therefore does not affect the number of times that dog's tweet will be retweeted.

# ##### 3. Describe the correlation between dog rating and favorite count.

# In[157]:

tweets.plot(x='favorite_count', y='rating', kind='scatter');
plt.title('Correlation between tweet favorite count and dog rating');

# Much like with retweet count, this scatter plot also suggests an essentially horizontal line of best fit, indicating there is no correlation between dog rating and favorite count. The rating given to a dog therefore does not affect the number of times that dog's tweet will be favorited.
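# To back the visual reading with numbers, the pairwise Pearson correlation coefficients for
# the three relationships can be computed directly (a supplementary sketch; the exact values
# depend on the gathered data).

# In[ ]:

# Pearson r for each pair examined above
print(f"retweets vs favorites: {tweets.retweet_count.corr(tweets.favorite_count):.2f}")
print(f"rating vs retweets:    {tweets.rating.corr(tweets.retweet_count):.2f}")
print(f"rating vs favorites:   {tweets.rating.corr(tweets.favorite_count):.2f}")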