#!/usr/bin/env python
# coding: utf-8

# # Abstract
#
# This project is about wrangling data. There are three data sets: two are provided by Udacity ('twitter-archive-enhanced.csv' and 'image-predictions.tsv'), and one is gathered through the Twitter API and contains retweet counts and favorite counts. These data sets are not yet clean; in other words, they are dirty and messy, so we need to clean them.

# # Gather

# In[1]:

import tweepy
import pandas as pd
import numpy as np
import requests
import json
from timeit import default_timer as timer

pd.set_option('display.max_rows', 7)

# replace CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_SECRET with your own credentials
consumer_key = 'CONSUMER_KEY'
consumer_secret = 'CONSUMER_SECRET'
access_token = 'ACCESS_TOKEN'
access_secret = 'ACCESS_SECRET'

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
api = tweepy.API(auth, wait_on_rate_limit=True)

# #### Load the WeRateDogs twitter archive

# In[2]:

df_twit_archive_raw = pd.read_csv('twitter-archive-enhanced.csv')
df_twit_archive_raw.head(1)

# #### Load the tweet image predictions, i.e., what breed of dog (or other object, animal, etc.) is present in each tweet according to a neural network.

# In[3]:

image_predictions_url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
df_image_predic_raw = pd.read_csv(image_predictions_url, sep='\t')
df_image_predic_raw.head()

# #### Gather each tweet's retweet count and favorite ("like") count at minimum. Using the tweet IDs in the WeRateDogs Twitter archive, query the Twitter API for each tweet's JSON data with Python's Tweepy library and store each tweet's entire set of JSON data in a file called tweet_json.txt, with each tweet's JSON data written to its own line.

# In[4]:

'''tweet_ids = df_twit_archive_raw.tweet_id.values
fails_dict = {}
start = timer()
# Save each tweet's returned JSON as a new line in a .txt file
with open('tweet_json.txt', 'w') as outfile:
    # This loop will likely take 20-30 minutes to run because of Twitter's rate limit
    for tweet_id in tweet_ids:
        try:
            api_status = api.get_status(tweet_id, tweet_mode='extended')
            json.dump(api_status._json, outfile)
            outfile.write('\n')
        except tweepy.TweepError as e:
            print("Fail")
            fails_dict[tweet_id] = e
end = timer()
print(end - start)
print(fails_dict)'''

# In[5]:

df_twit_add_raw = pd.read_json('tweet_json.txt', lines=True)
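# Alternatively, the JSON lines can be parsed one by one with json.loads, keeping only the fields this analysis actually needs (tweet_id, retweet_count, favorite_count). This is a minimal sketch, not part of the original pipeline; df_twit_add_slim is a hypothetical name.

# In[ ]:

rows = []
with open('tweet_json.txt') as infile:
    for line in infile:
        tweet = json.loads(line)
        # 'id', 'retweet_count', and 'favorite_count' are standard fields of the Twitter API tweet object
        rows.append({'tweet_id': tweet['id'],
                     'retweet_count': tweet['retweet_count'],
                     'favorite_count': tweet['favorite_count']})
df_twit_add_slim = pd.DataFrame(rows)
df_twit_add_slim.head(1)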
# # Assess

# Before cleaning, I copied each DataFrame to preserve the originals. I addressed the quality issues first and then the tidiness issues. Some issues were found through visual assessment in Excel, others through programmatic assessment. I wrote some helper functions to clean the DataFrames effectively, finding the relevant library functions through the documentation and Stack Overflow. Finally, I exported the cleaned DataFrame to a csv file.

# copy the DataFrames

# In[6]:

df_twit_archive = df_twit_archive_raw.copy()

# In[7]:

df_twit_archive.head(1)

# In[8]:

df_image_predic = df_image_predic_raw.copy()

# In[9]:

df_image_predic.head(3)

# In[10]:

df_twit_add = df_twit_add_raw.copy()

# In[11]:

df_twit_add.head(1)

# In[12]:

df_twit_archive.info()

# In[13]:

df_image_predic.info()

# In[14]:

df_twit_add.info()

# In[15]:

df_twit_archive.name.value_counts()

# In[16]:

df_twit_archive.query("name == 'a'").text

# In[17]:

df_twit_archive.query('name in ["a", "an", "the", "such"]')

# ### Quality Issues
#
# #### df_twit_archive table
# - Null values in expanded_urls
# - There are useless rows with non-empty retweeted_status_id, retweeted_status_user_id, and retweeted_status_timestamp (retweets)
# - [in_reply_to_status_id, in_reply_to_user_id, retweeted_status_id, retweeted_status_user_id, retweeted_status_timestamp] are useless columns
# - The type of timestamp is object
# - There are rows whose name is 'a', 'an', 'the', or 'such'
# - rating_numerator is stored as an integer even when the rating in the text has a decimal point
# - rating_denominator is not always 10
# - rating_denominator and rating_numerator can be combined into one column
# - expanded_urls has rows with multiple (duplicated) URLs
#
# ### Tidiness Issues
#
# #### df_twit_archive table
# - 1 variable spread over 4 columns (doggo, floofer, pupper, puppo)
#
# #### df_twit_add table
# - Attach favorite_count and retweet_count of df_twit_add to df_twit_archive
#
# #### df_image_predic table
# - Attach p1, p1_conf, and p1_dog of df_image_predic to df_twit_archive

# # Clean

# ### Quality Issues

# **`df_twit_archive`: Null values in expanded_urls**

# ***Define***
#
# Drop the rows where expanded_urls is null.

# ***Code***

# In[18]:

df_twit_archive.dropna(subset=['tweet_id', 'expanded_urls'], inplace=True)
df_twit_archive.reset_index(drop=True, inplace=True)

# ***Test***

# In[19]:

df_twit_archive.info()

# **`df_twit_archive`: There are useless rows with non-empty retweeted_status_id, retweeted_status_user_id, and retweeted_status_timestamp**

# ***Define***
#
# Remove the rows that have a non-empty retweeted_status_id, retweeted_status_user_id, and retweeted_status_timestamp from df_twit_archive (these are retweets).

# ***Code***

# In[20]:

# rows where retweeted_status_id is not null are retweets
retweeted_index = df_twit_archive[df_twit_archive.retweeted_status_id.notnull()].index
retweeted_index

# In[21]:

df_twit_archive.drop(retweeted_index, axis=0, inplace=True)
df_twit_archive.reset_index(drop=True, inplace=True)

# ***Test***

# In[22]:

df_twit_archive[df_twit_archive.retweeted_status_id.notnull()]

# **`df_twit_archive`: [in_reply_to_status_id, in_reply_to_user_id, retweeted_status_id, retweeted_status_user_id, retweeted_status_timestamp] are useless columns**

# ***Define***
#
# Delete the useless [in_reply_to_status_id, in_reply_to_user_id, retweeted_status_id, retweeted_status_user_id, retweeted_status_timestamp] columns.

# ***Code***

# In[24]:

df_twit_archive.drop(['in_reply_to_status_id', 'in_reply_to_user_id', 'retweeted_status_id',
                      'retweeted_status_user_id', 'retweeted_status_timestamp'], axis=1, inplace=True)

# ***Test***

# In[25]:

df_twit_archive.head(1)

# **`df_twit_archive`: The type of timestamp is object**

# ***Define***
#
# Change the type of timestamp to datetime64.

# ***Code***

# In[26]:

df_twit_archive.timestamp = pd.to_datetime(df_twit_archive.timestamp)

# ***Test***

# In[27]:

df_twit_archive.info()

# **`df_twit_archive`: There are rows whose name is 'a', 'an', 'the', or 'such'**

# ***Define***
#
# First, find the text of the rows whose name is 'a', 'an', 'the', or 'such'.
#
# If the text contains the word 'named', replace the name with the word immediately after it; otherwise, replace the name with None.

# ***Code***

# In[28]:

df_twit_wrong_name = df_twit_archive.query('name in ["a", "an", "the", "such"]').copy()
df_twit_wrong_name

# In[29]:

def get_correct_name(text):
    """Return the word right after 'named' in text, or None if 'named' is absent."""
    split_list = text.split()
    if 'named' in split_list:
        name_index = split_list.index('named') + 1
        name = split_list[name_index]
        return name
    else:
        return None
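# A quick sanity check of get_correct_name on made-up texts (these examples are not rows from the archive). Note that the word after 'named' is returned as-is, so any trailing punctuation would be kept.

# In[ ]:

assert get_correct_name('such a good dog named Penny') == 'Penny'
assert get_correct_name('This is a good boy') is None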
# In[30]:

df_twit_wrong_name['correct_name'] = df_twit_wrong_name.apply(lambda x: get_correct_name(x['text']), axis=1)

# In[31]:

df_twit_wrong_name

# In[32]:

df_twit_archive.head(1)

# In[33]:

# map each tweet_id with a wrong name to its corrected name
correct_name_dic = {}
for index, row in df_twit_wrong_name.iterrows():
    correct_name_dic[row.tweet_id] = row.correct_name
correct_name_dic

# In[34]:

name_column_index = list(df_twit_archive.columns).index('name')
name_column_index

# In[35]:

# the index was reset above, so the label from iterrows matches the position used by iloc
for index, row in df_twit_archive.iterrows():
    if row['tweet_id'] in correct_name_dic.keys():
        df_twit_archive.iloc[index, name_column_index] = correct_name_dic[row.tweet_id]

# ***Test***

# In[36]:

df_twit_archive.query('name in ["a", "an", "the", "such"]')

# **`df_twit_archive`: rating_numerator is stored as an integer even when the rating in the text has a decimal point**

# ***Define***
#
# In the text column of the df_twit_archive table, extract ratings of the form 00.0/00 with a regex and fix the corresponding rating_numerator values.

# ***Code***

# In[37]:

series_with_regex = df_twit_archive.text.str.extract(r'((\d+\.)?\d+)\/(\d+)', expand=True)

# In[38]:

series_with_regex.dropna()

# In[39]:

# convert to float first so the decimal ratings are not truncated
df_twit_archive.rating_numerator = df_twit_archive.rating_numerator.astype(float)
rating_numerator_index = list(df_twit_archive.columns).index('rating_numerator')
df_twit_archive.iloc[40, rating_numerator_index] = 13.5
df_twit_archive.iloc[549, rating_numerator_index] = 9.75
df_twit_archive.iloc[605, rating_numerator_index] = 11.27
df_twit_archive.iloc[1429, rating_numerator_index] = 11.26

# ***Test***

# In[40]:

(df_twit_archive.iloc[40].rating_numerator, df_twit_archive.iloc[549].rating_numerator,
 df_twit_archive.iloc[605].rating_numerator, df_twit_archive.iloc[1429].rating_numerator)
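# The hard-coded row positions above come from inspecting series_with_regex manually, so they would break if the row ordering changed. A more robust alternative (a sketch, not the approach used above) is to assign every decimal numerator found by the regex in one step; decimal_mask is a hypothetical name.

# In[ ]:

# group 1 of the regex ('(\d+\.)?') only matches when the numerator has a decimal point
decimal_mask = series_with_regex[1].notnull()
df_twit_archive.loc[decimal_mask, 'rating_numerator'] = series_with_regex.loc[decimal_mask, 0].astype(float)
df_twit_archive.loc[decimal_mask, 'rating_numerator']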
# **`df_twit_archive`: rating_denominator is not always 10**

# ***Define***
#
# Drop the rows with rating_denominator != 10.

# ***Code***

# In[41]:

index_of_not_having_10_denominator = df_twit_archive.query('rating_denominator != 10').index
df_twit_archive.drop(index_of_not_having_10_denominator, inplace=True)

# ***Test***

# In[42]:

df_twit_archive.query('rating_denominator != 10')

# **`df_twit_archive`: rating_denominator and rating_numerator can be combined into one column**

# ***Define***
#
# Evaluate rating_numerator / rating_denominator, save it as rating, and delete rating_numerator and rating_denominator.

# ***Code***

# In[43]:

df_twit_archive['rating'] = df_twit_archive.rating_numerator / df_twit_archive.rating_denominator

# In[44]:

df_twit_archive.drop(['rating_numerator', 'rating_denominator'], axis=1, inplace=True)

# ***Test***

# In[45]:

df_twit_archive.head(1)

# **`df_twit_archive`: expanded_urls has rows with multiple (duplicated) URLs**

# ***Define***
#
# Remove the duplicated URLs from the expanded_urls column by keeping only the first URL. For example, if urls = 'a,a,a', extract only 'a'.

# ***Code***

# In[46]:

def delete_duplicated_urls(url):
    """Keep only the first URL of a comma-separated list."""
    first_url = url.split(',')[0]
    return first_url

# In[47]:

df_twit_archive['correct_expanded_urls'] = df_twit_archive.apply(lambda x: delete_duplicated_urls(x['expanded_urls']), axis=1)

# In[48]:

df_twit_archive.drop('expanded_urls', axis=1, inplace=True)

# ***Test***

# In[49]:

df_twit_archive.info()

# In[50]:

for index, row in df_twit_archive.iterrows():
    if len(row['correct_expanded_urls'].split(',')) > 1:
        print(row['correct_expanded_urls'])

# ### Tidiness Issues

# **`df_twit_archive`: 1 variable spread over 4 columns (doggo, floofer, pupper, puppo)**

# ***Define***
#
# Combine the doggo, floofer, pupper, and puppo columns into one stage column.

# ***Code***

# In[51]:

df_twit_archive.groupby(["doggo", "floofer", "pupper", "puppo"]).size().reset_index().rename(columns={0: "count"})

# In[52]:

# replace the 'None' strings and NaN values with empty strings so the columns can be concatenated
for column in ['doggo', 'floofer', 'pupper', 'puppo']:
    df_twit_archive[column].replace('None', '', inplace=True)
    df_twit_archive[column].replace(np.NaN, '', inplace=True)

# In[53]:

df_twit_archive['stage'] = df_twit_archive.doggo + df_twit_archive.floofer + df_twit_archive.pupper + df_twit_archive.puppo
# separate the rows where two stages were concatenated
df_twit_archive.loc[df_twit_archive.stage == 'doggopupper', 'stage'] = 'doggo,pupper'
df_twit_archive.loc[df_twit_archive.stage == 'doggopuppo', 'stage'] = 'doggo,puppo'
df_twit_archive.loc[df_twit_archive.stage == 'doggofloofer', 'stage'] = 'doggo,floofer'

# In[54]:

df_twit_archive.drop(['doggo', 'floofer', 'pupper', 'puppo'], axis=1, inplace=True)

# ***Test***

# In[55]:

df_twit_archive.stage.value_counts()

# In[56]:

df_twit_archive

# **`df_twit_add`: Attach favorite_count and retweet_count of df_twit_add to df_twit_archive**

# ***Define***
#
# Attach only the favorite_count and retweet_count columns of df_twit_add to df_twit_archive, matching on tweet_id.

# ***Code***

# In[57]:

df_twit_add.rename(columns={'id': 'tweet_id'}, inplace=True)

# In[58]:

df_twit_archive = df_twit_archive.merge(df_twit_add)
# drop every attached column except favorite_count and retweet_count
df_twit_add_columns = list(df_twit_add.columns)
df_twit_add_columns.remove('tweet_id')
df_twit_add_columns.remove('favorite_count')
df_twit_add_columns.remove('retweet_count')
df_twit_archive.drop(columns=df_twit_add_columns, inplace=True)

# ***Test***

# In[59]:

df_twit_archive
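# An equivalent way to attach only the needed columns is to subset df_twit_add before merging, which makes the join key explicit and avoids dropping the extra columns afterwards. This is a sketch of an alternative, kept non-executed (same convention as the API cell above), not the approach used in this notebook.

# In[ ]:

'''df_twit_archive = df_twit_archive.merge(
    df_twit_add[['tweet_id', 'favorite_count', 'retweet_count']],
    on='tweet_id', how='inner')'''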
# **`df_image_predic`: Attach p1, p1_conf, and p1_dog of df_image_predic to df_twit_archive**

# ***Define***
#
# Attach df_image_predic to df_twit_archive (p1, p1_conf, and p1_dog only), matching on tweet_id.

# ***Code***

# In[60]:

df_twit_archive = pd.merge(df_twit_archive, df_image_predic, on='tweet_id')
df_twit_archive

# In[61]:

df_image_predic.info()

# In[62]:

df_twit_archive.drop(columns=['jpg_url', 'img_num', 'p2', 'p2_conf', 'p2_dog', 'p3', 'p3_conf', 'p3_dog'], inplace=True)

# ***Test***

# In[63]:

df_twit_archive

# # Analysis

# #### The most common breeds on WeRateDogs (keeping only predictions with a confidence of at least 70% and p1_dog being True)

# In[64]:

df_believable_dog = df_twit_archive.query('p1_conf >= 0.7 and p1_dog == True')
series_breeds = df_believable_dog.p1.value_counts()
series_breeds[:15]

# In[65]:

import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

plt.figure(figsize=(12, 5));
plt.bar(x=series_breeds[:15].index, height=series_breeds[:15]);
plt.xlabel('breeds')
plt.ylabel('counts')
plt.title('The most common breeds on WeRateDogs')
plt.xticks(rotation=90);

# There are far more Golden Retrievers than any other breed. The next most common breeds are Pembroke, Labrador Retriever, and Chihuahua.

# In[66]:

df_twit_archive.describe()

# In[67]:

df_believable_famous_dog = df_twit_archive.query('p1_conf >= 0.7 and p1_dog == True and favorite_count > 10000')
series_famous_breeds = df_believable_famous_dog.p1.value_counts()
series_famous_breeds[:15]

# In[68]:

plt.figure(figsize=(12, 5));
plt.bar(x=series_famous_breeds[:15].index, height=series_famous_breeds[:15]);
plt.xlabel('breeds')
plt.ylabel('counts')
plt.title('The most common famous breeds on WeRateDogs')
plt.xticks(rotation=90);

# Among the most-favorited tweets, Golden Retrievers are again far more common than any other breed, followed by Pembroke, Labrador Retriever, and Chihuahua, just as above.

# In[69]:

df_twit_archive.stage.value_counts()

# There are 1666 puppers and 203 doggos.

# # Store Dataset
#
# Export the combined and cleaned data to a csv file.

# In[70]:

df_twit_archive.to_csv('df_twit_archive_master.csv', index=False)
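# A quick round-trip check that the exported file loads back as expected (this assumes the filename used in the cell above):

# In[ ]:

df_check = pd.read_csv('df_twit_archive_master.csv')
df_check.shape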