#!/usr/bin/env python
# coding: utf-8

# # A Python Tour of Data Science: Data Acquisition & Exploration
#
# [Michaël Defferrard](http://deff.ch), *PhD student*, [EPFL](http://epfl.ch) [LTS2](http://lts2.epfl.ch)

# # 1 Exercise: problem definition
#
# Theme of the exercise: **understand the impact of your communication on social networks**. A real-life situation: the marketing team needs help in identifying the most engaging posts they made on social platforms to prepare their next [AdWords](https://www.google.com/adwords/) campaign.
#
# As you probably don't have a company (yet?), you can either use your own social network profile as if it were the company's or choose an established entity, e.g. EPFL. You will need to be registered on Facebook or Twitter to generate access tokens. If you're not, either ask a classmate to create a token for you or create a temporary account for yourself (no need to follow other people, we can fetch public data).
#
# At the end of the exercise, you should have two datasets (Facebook & Twitter) and have used them to answer the following questions, for both Facebook and Twitter.
# 1. How many followers / friends / likes does your chosen profile have?
# 2. How many posts / tweets in the last year?
# 3. What were the 5 most liked posts / tweets?
# 4. Plot histograms of the number of likes and comments / retweets.
# 5. Plot basic statistics and a histogram of text length.
# 6. Is there any correlation between the length of the text and the number of likes?
# 7. Be curious and explore your data. Did you find anything interesting or surprising?
# 8. Create at least one interactive plot (with bokeh) to explore an intuition (e.g. does the posting time play a role?).

# # 2 Resources
#
# Here are some links you may find useful to complete this exercise.
#
# Web APIs: these are the references.
# * [Facebook Graph API](https://developers.facebook.com/docs/graph-api)
# * [Twitter REST API](https://dev.twitter.com/rest/public)
#
# Tutorials:
# * [Mining the Social Web](https://github.com/ptwobrussell/Mining-the-Social-Web-2nd-Edition)
# * [Mining Twitter data with Python](https://marcobonzanini.com/2015/03/02/mining-twitter-data-with-python-part-1/)
# * [Simple Python Facebook Scraper](http://simplebeautifuldata.com/2014/08/25/simple-python-facebook-scraper-part-1/)

# # 3 Web scraping
#
# Tasks:
# 1. Download the relevant information from Facebook and Twitter. Try to keep the quantity of collected data to the minimum required to answer the questions.
# 2. Build two SQLite databases, one for Facebook and the other for Twitter, using [pandas](http://pandas.pydata.org/) and [SQLAlchemy](http://www.sqlalchemy.org/).
#    1. For FB, each row is a post, and the columns are at least (you can include more if you want): the post id, the message (i.e. the text), the time when it was posted, the number of likes and the number of comments.
#    2. For Twitter, each row is a tweet, and the columns are at least: the tweet id, the text, the creation time, the number of likes (formerly called favorites) and the number of retweets.
#
# Note that some data cleaning is already necessary. E.g. there are some FB posts without *message*, i.e. without text. Some tweets are also just retweets without any further information. Should they be collected? (A small sketch of such a filter follows the next cell.)

# In[ ]:

# Number of posts / tweets to retrieve.
# Small value for development, then increase to collect the final data.
n = 4000  # 20
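# The cell below is only a sketch, not part of the required solution: two hypothetical helpers expressing the cleaning decisions discussed above. Posts without text simply lack the `message` key in the Graph API response, and the Twitter API marks native retweets by attaching a `retweeted_status` to the tweet (treat that attribute name as an assumption to check against the tweepy objects collected in section 3.2).

# In[ ]:

# Hypothetical cleaning helpers (for illustration only; the collection code
# below applies the same idea inline, with a try/except on 'message').

def keep_fb_post(post):
    """Keep only Facebook posts that actually carry some text."""
    return 'message' in post

def keep_tweet(tweet):
    """Drop native retweets, which carry no new text of their own."""
    return not hasattr(tweet, 'retweeted_status')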
# ## 3.1 Facebook
#
# There are two ways to scrape data from Facebook; you can choose one or combine them.
# 1. The low-level approach, sending HTTP requests to their Graph API and receiving JSON responses. That can be achieved with the json and [requests](http://python-requests.org) packages (although you can use urllib or urllib2, requests has a better API). The knowledge you'll acquire using this method will be useful to query web APIs other than Facebook's. This method is also more flexible.
# 2. The high-level approach, using a [Python SDK](http://facebook-sdk.readthedocs.io). The code you'll have to write for this method is shorter, but specific to the FB Graph API.
#
# You will need an access token, which can be created with the help of the [Graph Explorer](https://developers.facebook.com/tools/explorer). That tool may prove useful to test queries. Once you have your token, you may create a `credentials.ini` file with the following content:
# ```
# [facebook]
# token = YOUR-FB-ACCESS-TOKEN
# ```

# In[ ]:

import configparser

# Read the confidential token.
credentials = configparser.ConfigParser()
credentials.read('credentials.ini')
token = credentials.get('facebook', 'token')

# Or token = 'YOUR-FB-ACCESS-TOKEN'

# In[ ]:

import requests  # pip install requests
import facebook  # pip install facebook-sdk
import pandas as pd

# In[ ]:

page = 'EPFL.ch'

# ### 3.1.1 Scrape with HTTP requests

# #### 3.1.1.1 Get the number of likes
#
# The process has three steps:
# 1. Assemble the URL to query. The documentation of the FB Graph API is useful there. You can click on the URL to let your browser make the query and return the result.
# 2. Send an HTTP GET request, receive the result and interpret it as JSON (because Facebook sends data as JSON).
# 3. Explore the received data and extract what interests us, here the number of likes. If we don't get what we want (or if we get too much), we can modify the query URL. Note that the hierarchical JSON format is exposed as a dictionary.

# In[ ]:

# 1. Form the URL.
url = 'https://graph.facebook.com/{}?fields=likes&access_token={}'.format(page, token)
#print(url)

# 2. Get the data.
data = requests.get(url).json()
print('data:', data)

# Optionally, check for errors. Most probably the session has expired.
if 'error' in data.keys():
    raise Exception(data)

# 3. Extract the data.
print('{} has {} likes'.format(page, data['likes']))

# #### 3.1.1.2 Get posts
#
# The process is similar here, except that the query and the extraction are more involved (because we work with more data). As you may have found out, FB returns at most 100 posts at a time. To get more posts, they provide *paging*, which we use to request the next batch of posts.

# In[ ]:

# 1. Form the URL. You can click that URL and see the returned JSON in your browser.
fields = 'id,created_time,message,likes.limit(0).summary(1),comments.limit(0).summary(1)'
url = 'https://graph.facebook.com/{}/posts?fields={}&access_token={}'.format(page, fields, token)
#print(url)

# Create the pandas DataFrame, a table whose columns are post id, message, created time, #likes and #comments.
fb = pd.DataFrame(columns=['id', 'text', 'time', 'likes', 'comments'])

# The outer loop queries FB multiple times, as FB sends at most 100 posts at a time.
while len(fb) < n:

    # 2. Get the data from FB. At most 100 posts.
    posts = requests.get(url).json()

    # 3. Extract information from each of the received posts.
    for post in posts['data']:
        # The information is stored in a dictionary.
        serie = dict(id=post['id'], time=post['created_time'])
        try:
            serie['text'] = post['message']
        except KeyError:
            # Let's say we are not interested in posts without text.
            continue
        serie['likes'] = post['likes']['summary']['total_count']
        serie['comments'] = post['comments']['summary']['total_count']
        # Add the dictionary as a new row of our pandas DataFrame.
        fb = fb.append(serie, ignore_index=True)

    try:
        # That URL is returned by FB to access the next 'page', i.e. the next 100 posts.
        url = posts['paging']['next']
    except KeyError:
        # No more posts.
        break

# In[ ]:

fb[:5]
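# As a possible refinement (not applied here): the questions only concern the last year, so the paging loop above could stop as soon as a post is older than that, instead of always fetching `n` posts — in line with the advice of section 3 to collect only what is needed. A minimal sketch, assuming posts are returned newest first:

# In[ ]:

# Hypothetical early-stopping helper for the paging loop above.
from datetime import datetime, timedelta

one_year_ago = datetime.utcnow() - timedelta(days=365)

def recent_enough(post):
    # Facebook timestamps look like '2016-09-04T12:34:56+0000' (UTC).
    created = datetime.strptime(post['created_time'], '%Y-%m-%dT%H:%M:%S+0000')
    return created >= one_year_ago

# One would then `break` out of both loops as soon as `recent_enough(post)` turns False.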
# ### 3.1.2 Scrape with the Facebook SDK
#
# This method is much slower because it has to retrieve the comments and likes themselves, not only their counts, for each post. The API is not expressive enough to do otherwise.

# In[ ]:

g = facebook.GraphAPI(token, version='2.7')

# We limit to 10 posts because it's slow.
posts = g.get_connections(page, 'posts', limit=10)

if 'error' in posts.keys():
    # Most probably the session has expired.
    raise Exception(posts)

for post in posts['data']:
    pid = post['id']
    try:
        text = post['message']
    except KeyError:
        continue
    time = post['created_time']
    likes = g.get_connections(pid, 'likes', summary=True, limit=0)
    nlikes = likes['summary']['total_count']
    comments = g.get_connections(pid, 'comments', summary=True, limit=0)
    ncomments = comments['summary']['total_count']
    print('{:6d} {:6d} {} {}'.format(nlikes, ncomments, time, text[:50]))

# ## 3.2 Twitter
#
# There exists a bunch of [Python-based clients](https://dev.twitter.com/overview/api/twitter-libraries#python) for Twitter. [Tweepy](http://tweepy.readthedocs.io) is a popular choice.
#
# You will need to create a [Twitter app](https://apps.twitter.com/) and copy the four tokens and secrets into the `credentials.ini` file:
# ```
# [twitter]
# consumer_key = YOUR-CONSUMER-KEY
# consumer_secret = YOUR-CONSUMER-SECRET
# access_token = YOUR-ACCESS-TOKEN
# access_secret = YOUR-ACCESS-SECRET
# ```

# In[ ]:

import tweepy  # pip install tweepy

auth = tweepy.OAuthHandler(credentials.get('twitter', 'consumer_key'),
                           credentials.get('twitter', 'consumer_secret'))
auth.set_access_token(credentials.get('twitter', 'access_token'),
                      credentials.get('twitter', 'access_secret'))
api = tweepy.API(auth)

user = 'EPFL_en'

# In[ ]:

followers = api.get_user(user).followers_count
print('{} has {} followers'.format(user, followers))

# The code is much simpler for Twitter than for Facebook because Tweepy handles much of the dirty work, like paging.

# In[ ]:

tw = pd.DataFrame(columns=['id', 'text', 'time', 'likes', 'shares'])

for tweet in tweepy.Cursor(api.user_timeline, screen_name=user).items(n):
    serie = dict(id=tweet.id, text=tweet.text, time=tweet.created_at)
    serie.update(dict(likes=tweet.favorite_count, shares=tweet.retweet_count))
    tw = tw.append(serie, ignore_index=True)

# # 4 Prepare and save data
#
# To facilitate our analysis, we first prepare the data.
# 1. Convert floating point numbers to integers.
# 2. Convert the Facebook post time from string to datetime.
#
# That is not strictly necessary, but it will allow us to, e.g., compare posting dates with the standard comparison operators `>` and `<`.

# In[ ]:

#fb.id = fb.id.astype(int)
fb.likes = fb.likes.astype(int)
fb.comments = fb.comments.astype(int)

tw.id = tw.id.astype(int)
tw.likes = tw.likes.astype(int)
tw.shares = tw.shares.astype(int)

# In[ ]:

from datetime import datetime

def convert_time(row):
    return datetime.strptime(row['time'], '%Y-%m-%dT%H:%M:%S+0000')

fb['time'] = fb.apply(convert_time, axis=1)

# In[ ]:

from IPython.display import display
display(fb[:5])
display(tw[:5])

# Now that we have collected everything, let's save it in two SQLite databases.

# In[ ]:

import os

folder = os.path.join('..', 'data', 'social_media')
try:
    os.makedirs(folder)
except FileExistsError:
    pass

filename = os.path.join(folder, 'facebook.sqlite')
fb.to_sql('facebook', 'sqlite:///' + filename, if_exists='replace')

filename = os.path.join(folder, 'twitter.sqlite')
tw.to_sql('twitter', 'sqlite:///' + filename, if_exists='replace')
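# As a quick sanity check, the tables can be read back with pandas. This cell is only a sketch: it assumes the default behaviour of `to_sql` above, which stores the unnamed DataFrame index in a column called 'index'.

# In[ ]:

# Read the Facebook table back from SQLite (sanity check, not needed for the analysis).
fb_check = pd.read_sql('facebook', 'sqlite:///' + os.path.join(folder, 'facebook.sqlite'),
                       index_col='index', parse_dates=['time'])
fb_check[:5]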
# # 5 Data analysis
#
# Answer the questions using [pandas](http://pandas.pydata.org/), [statsmodels](http://statsmodels.sourceforge.net/), [scipy.stats](http://docs.scipy.org/doc/scipy/reference/stats.html) and [bokeh](http://bokeh.pydata.org).

# In[ ]:

import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')
get_ipython().run_line_magic('matplotlib', 'inline')

# ## 5.1 Number of posts

# In[ ]:

date = datetime(2016, 9, 4)
datestr = date.strftime('%Y-%m-%d')
print('Number of posts after {}: {}'.format(datestr, sum(fb.time > date)))
print('Number of tweets after {}: {}'.format(datestr, sum(tw.time > date)))

# ## 5.2 Most liked
#
# Looks like we're really into rankings!

# In[ ]:

display(fb.sort_values(by='likes', ascending=False)[:5])
display(tw.sort_values(by='likes', ascending=False)[:5])

# ## 5.3 Engagement: likes, comments, shares

# In[ ]:

pd.concat([fb.describe(), tw.loc[:, 'likes':'shares'].describe()], axis=1)

# In[ ]:

fig, axs = plt.subplots(1, 4, figsize=(15, 5))
fb.likes.plot(kind='box', ax=axs[0]);
fb.comments.plot(kind='box', ax=axs[1]);
tw.likes.plot(kind='box', ax=axs[2]);
tw.shares.plot(kind='box', ax=axs[3]);

# In[ ]:

fb.hist(bins=20, log=True, figsize=(15, 5));

# In[ ]:

fig, axs = plt.subplots(1, 2, figsize=(15, 5))
tw.loc[:, 'likes'].hist(bins=20, log=True, ax=axs[0]);
tw.loc[tw.shares < 200, 'shares'].hist(bins=20, log=True, ax=axs[1]);

# ## 5.4 Text length
#
# There is a striking difference here:
# 1. On Twitter, almost all tweets reach the 140-character limit.
# 2. On Facebook, the distribution is more Gaussian.

# In[ ]:

def text_length(texts):
    lengths = np.empty(len(texts), dtype=int)
    for i, text in enumerate(texts):
        lengths[i] = len(text)
    plt.figure(figsize=(15, 5))
    prop = lengths.min(), '{:.2f}'.format(lengths.mean()), lengths.max()
    plt.title('min = {}, mean = {}, max = {}'.format(*prop))
    plt.hist(lengths, bins=20)

text_length(tw.text)
text_length(fb.text)

# ## 5.5 Posting time
#
# We can clearly observe the office hours.

# In[ ]:

fb.id.groupby(fb.time.dt.hour).count().plot(kind='bar', alpha=0.4, color='y', figsize=(15, 5));
tw.id.groupby(tw.time.dt.hour).count().plot(kind='bar', alpha=0.4, color='g', figsize=(15, 5));

# Let's see whether the time of posting influences the number of likes. Do you see a peak at 5am? Do you really think we should post at 5am? What's going on here?

# In[ ]:

fb.likes.groupby(fb.time.dt.hour).mean().plot(kind='bar', figsize=(15, 5));
plt.figure()
tw.likes.groupby(tw.time.dt.hour).mean().plot(kind='bar', figsize=(15, 5));
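# Finally, question 8 asks for at least one interactive plot with bokeh. The cell below is a minimal sketch of one possibility, assuming the Facebook data collected above: an interactive scatter plot of likes against posting hour, with a hover tool showing the beginning of each post, to explore whether the posting time plays a role.

# In[ ]:

from bokeh.plotting import figure, output_notebook, show
from bokeh.models import ColumnDataSource, HoverTool

output_notebook()

# Wrap the columns of interest in a ColumnDataSource so the hover tool can access them.
source = ColumnDataSource(data=dict(
    hour=fb.time.dt.hour,
    likes=fb.likes,
    text=fb.text.str[:60],
))

p = figure(title='Facebook: likes vs. posting hour',
           x_axis_label='hour of day (UTC)', y_axis_label='number of likes')
p.circle('hour', 'likes', source=source, size=8, alpha=0.3)
p.add_tools(HoverTool(tooltips=[('likes', '@likes'), ('text', '@text')]))
show(p)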