from functools import partial
from sys import maxint
import sys
import time
from urllib2 import URLError
from httplib import BadStatusLine
import json
import twitter # pip install twitter
import pymongo # pip install pymongo


def oauth_login():
    """Return a twitter.Twitter client authenticated with OAuth credentials.

    The four credential constants below are empty placeholders and must be
    filled in before the returned client can make authorized requests.
    """

    # XXX: Go to http://twitter.com/apps/new to create an app and get values
    # for these credentials that you'll need to provide in place of these
    # empty string values that are defined as placeholders.
    # See https://dev.twitter.com/docs/auth/oauth for more information
    # on Twitter's OAuth implementation

    CONSUMER_KEY = ''
    CONSUMER_SECRET = ''
    OAUTH_TOKEN = ''
    OAUTH_TOKEN_SECRET = ''

    auth = twitter.oauth.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET,
                               CONSUMER_KEY, CONSUMER_SECRET)

    twitter_api = twitter.Twitter(auth=auth)
    return twitter_api


def make_twitter_request(twitter_api_func, max_errors=10, *args, **kw):
    """Call twitter_api_func(*args, **kw), retrying on common failures.

    Returns whatever twitter_api_func returns, or None when the API answers
    with a 401 or 404 (the caller is expected to handle that case).

    Raises the underlying twitter.api.TwitterHTTPError for unhandled status
    codes or once the back-off period exceeds an hour, and re-raises
    URLError / BadStatusLine after more than max_errors consecutive
    occurrences.

    NOTE: because max_errors precedes *args in the signature, the first extra
    positional argument binds to max_errors; pass API parameters by keyword.
    """

    # A nested helper function that handles common HTTPErrors. Return an updated value
    # for wait_period if the problem is a 500 level error. Block until the rate limit
    # is reset if a rate limiting issue (429 error). Returns None for 401 and 404 errors
    # which requires special handling by the caller.
    def handle_twitter_http_error(e, wait_period=2, sleep_when_rate_limited=True):

        if wait_period > 3600: # Seconds
            print >> sys.stderr, 'Too many retries. Quitting.'
            raise e

        # See https://dev.twitter.com/docs/error-codes-responses for common codes

        if e.e.code == 401:
            print >> sys.stderr, 'Encountered 401 Error (Not Authorized)'
            return None
        elif e.e.code == 404:
            print >> sys.stderr, 'Encountered 404 Error (Not Found)'
            return None
        elif e.e.code == 429:
            print >> sys.stderr, 'Encountered 429 Error (Rate Limit Exceeded)'
            if sleep_when_rate_limited:
                print >> sys.stderr, "Sleeping for 15 minutes, and then I'll try again...ZzZ..."
                sys.stderr.flush()
                time.sleep(60*15 + 5)
                print >> sys.stderr, '...ZzZ...Awake now and trying again.'
                # Start over with the initial back-off once the window has reset
                return 2
            else:
                raise e # Allow user to handle the rate limiting issue however they'd like
        elif e.e.code in (500, 502, 503, 504):
            print >> sys.stderr, 'Encountered %i Error. Will retry in %i seconds' % (e.e.code,
                    wait_period)
            time.sleep(wait_period)
            wait_period *= 1.5
            return wait_period
        else:
            raise e

    # End of nested helper function

    wait_period = 2
    error_count = 0

    while True:
        try:
            return twitter_api_func(*args, **kw)
        except twitter.api.TwitterHTTPError as e:
            # An HTTP-level answer means the connection itself works, so the
            # consecutive transport-error counter starts over.
            error_count = 0
            wait_period = handle_twitter_http_error(e, wait_period)
            if wait_period is None:
                return
        except URLError:
            error_count += 1
            print >> sys.stderr, "URLError encountered. Continuing."
            if error_count > max_errors:
                print >> sys.stderr, "Too many consecutive errors...bailing out."
                raise
        except BadStatusLine:
            error_count += 1
            print >> sys.stderr, "BadStatusLine encountered. Continuing."
            if error_count > max_errors:
                print >> sys.stderr, "Too many consecutive errors...bailing out."
                raise


def store_friends_followers_ids(twitter_api, screen_name=None, user_id=None,
                                friends_limit=maxint, followers_limit=maxint, database=None):
    """Page through friends/followers ids and store each page in MongoDB.

    Exactly one of screen_name / user_id must be given. Each API page is saved
    as one {"ids": [...]} document in the "friends_ids" / "followers_ids"
    collection of `database`. A limit of 0 skips that relationship entirely;
    otherwise paging stops once at least `limit` ids have been fetched, the
    cursor is exhausted, or a request returns None (401/404).
    """

    # Must have either screen_name or user_id (logical xor)
    assert (screen_name is not None) != (user_id is not None), "Must have screen_name or user_id, but not both"

    # See https://dev.twitter.com/docs/api/1.1/get/friends/ids and
    # See https://dev.twitter.com/docs/api/1.1/get/followers/ids for details on API parameters

    get_friends_ids = partial(make_twitter_request, twitter_api.friends.ids, count=5000)
    get_followers_ids = partial(make_twitter_request, twitter_api.followers.ids, count=5000)

    for twitter_api_func, limit, label in [
                                 [get_friends_ids, friends_limit, "friends"],
                                 [get_followers_ids, followers_limit, "followers"]
                             ]:

        if limit == 0: continue

        total_ids = 0
        cursor = -1
        while cursor != 0:

            # Use make_twitter_request via the partially bound callable...
            if screen_name:
                response = twitter_api_func(screen_name=screen_name, cursor=cursor)
            else: # user_id
                response = twitter_api_func(user_id=user_id, cursor=cursor)

            if response is not None:
                ids = response['ids']
                total_ids += len(ids)
                save_to_mongo({"ids" : [_id for _id in ids ]}, database, label + "_ids")
                cursor = response['next_cursor']

            print >> sys.stderr, 'Fetched {0} total {1} ids for {2}'.format(total_ids,
                    label, (user_id or screen_name))
            sys.stderr.flush()

            # Consider storing the ids to disk during each iteration to provide an
            # an additional layer of protection from exceptional circumstances

            # Test for a missing response first (there is no page to inspect in
            # that case) and compare the running total, not the size of the
            # last page, against the limit.
            if response is None or total_ids >= limit:
                break

        print >> sys.stderr, 'Last cursor', cursor
        print >> sys.stderr, 'Last response', response


def save_to_mongo(data, mongo_db, mongo_db_coll, auth=None, **mongo_conn_kw):
    """Insert `data` into collection `mongo_db_coll` of database `mongo_db`.

    `auth`, when given, is a (name, password) pair passed to
    db.authenticate(). Extra keyword arguments go to pymongo.MongoClient.
    Returns the value of coll.insert(data). The client opened here is closed
    before returning so repeated calls do not accumulate connections.
    """

    # Connects to the MongoDB server running on
    # localhost:27017 by default

    client = pymongo.MongoClient(**mongo_conn_kw)

    try:
        # Get a reference to a particular database

        db = client[mongo_db]

        if auth:
            db.authenticate(auth[0], auth[1])

        # Reference a particular collection on the database

        coll = db[mongo_db_coll]

        # Perform a bulk insert and return the ids

        return coll.insert(data)
    finally:
        client.close()


def load_from_mongo(mongo_db, mongo_db_coll, return_cursor=False,
                    criteria=None, projection=None, auth=None, **mongo_conn_kw):
    """Query collection `mongo_db_coll` of database `mongo_db`.

    Returns the find() cursor itself when return_cursor is true, otherwise a
    list of all matching documents. `criteria` defaults to {} (match all);
    `projection` is passed to find() only when it is not None.
    """

    # Optionally, use criteria and projection to limit the data that is
    # returned as documented in
    # http://docs.mongodb.org/manual/reference/method/db.collection.find/

    # Consider leveraging MongoDB's aggregations framework for more
    # sophisticated queries.

    client = pymongo.MongoClient(**mongo_conn_kw)

    # A returned cursor still needs the connection, so the client is only
    # closed here when the results are fully materialized (or on error).
    keep_open = False
    try:
        db = client[mongo_db]

        if auth:
            db.authenticate(auth[0], auth[1])

        coll = db[mongo_db_coll]

        if criteria is None:
            criteria = {}

        if projection is None:
            cursor = coll.find(criteria)
        else:
            cursor = coll.find(criteria, projection)

        # Returning a cursor is recommended for large amounts of data

        if return_cursor:
            keep_open = True
            return cursor
        else:
            return [ item for item in cursor ]
    finally:
        if not keep_open:
            client.close()


def store_user_info(twitter_api, screen_names=None, user_ids=None, database=None):
    """Look up user profiles in batches of 100 and store them in MongoDB.

    Exactly one of screen_names / user_ids must be given. Every profile
    returned by /users/lookup is saved to the 'followers_profiles' collection
    of `database`. Batches whose request returns None (401/404) are skipped.
    """

    # Must have either screen_name or user_id (logical xor)
    assert (screen_names is not None) != (user_ids is not None), "Must have screen_names or user_ids, but not both"

    # `or []` covers an empty screen_names list, which would otherwise fall
    # through to user_ids (None).
    items = screen_names or user_ids or []
    total = len(items)

    # Walk the sequence by offset instead of re-slicing the remainder on
    # every pass, which copies the whole tail each time.
    for start in xrange(0, total, 100):

        remaining = total - start
        if remaining // 100 * 100 % 1000 == 0:
            print >> sys.stderr, remaining, "remaining"

        # Process 100 items at a time per the API specifications for /users/lookup. See
        # https://dev.twitter.com/docs/api/1.1/get/users/lookup for details

        items_str = ','.join([str(item) for item in items[start:start + 100]])

        if screen_names:
            response = make_twitter_request(twitter_api.users.lookup,
                                            screen_name=items_str)
        else: # user_ids
            response = make_twitter_request(twitter_api.users.lookup,
                                            user_id=items_str)

        # make_twitter_request returns None on 401/404; skip that batch
        for profile in response or []:
            save_to_mongo(profile, database, 'followers_profiles')


# Go ahead and instantiate an instance of the Twitter API for common use
# throughout the rest of this notebook.

twitter_api = oauth_login()

# Harvest the follower IDS. Note that this process takes a bit of time since
# you are limited to harvesting 75k IDs per 15 minute window.
#
# Some example accounts that we'll look at later in this notebook:
#
# Tim O'Reilly: ~1.7M followers (~6 hours)
# Marissa Mayer: ~460k followers (~1.5 hours)
# Lady Gaga: ~40M followers (~5.5 days)
#
# All in all, it takes a little over a week to harvest all of these IDs.

# Define a simple wrapper that accepts a list of screen names for convenience
# and reusability later.
def harvest_followers_ids(screen_names=[]): for screen_name in screen_names: store_friends_followers_ids(twitter_api, screen_name=screen_name, friends_limit=0, database=screen_name) harvest_followers_ids(screen_names=[ 'timoreilly' ]) print "Done" # Iterate over the IDs for each follower of Tim O'Reilly and # lookup the profile. This process also takes some time since you are # limited to 18k profiles per 15 minute rate limit window. Resolving # all of these profiles takes a little over a day. # # Note that we could always opt to draw a random sample for a highly popular # account's followers IDs for a statistical analysis. However, be advised that # for the sample to be truly random, you do probabably need to first pull down the # totality of the follower IDs and then draw from that collection. Simply harvesting # the first N followers returned by the API and using that as a sample may not be # sufficient, since Twitter currently returns followers in the order in which the # follow occurred (though this is subject to change without notice.) # # For example, a 95% confidence interval with a 1% margin of error requires just # under 10k items be drawn for the random sample. A 99% confidence interval with # a 1% margin of error requires just under 17k items be drawn for the random sample. # Define another convenience wrapper and assume that you've already # harvested the follower IDs for the account in question def harvest_followers_profiles(screen_names=[]): for screen_name in screen_names: followers_ids = load_from_mongo(screen_name, 'followers_ids') # Flattening the IDs into memory requires a generous heap space. An # m1.xlarge AWS node (15GB of memory) should be more than sufficient # and spot prices are typically around $0.05/hr all_ids = [ _id for ids_batch in followers_ids for _id in ids_batch['ids'] ] store_user_info(twitter_api, user_ids=all_ids, database=screen_name) harvest_followers_profiles(screen_names=[ 'timoreilly' ]) print "Done." 
# Compute a list containing the number of followers for each of Tim O'Reilly's own followers.
# Sort it for convenience of slicing and plotting.

timoreilly_followers_counts = sorted([f['followers_count']
                                      for f in load_from_mongo('timoreilly', 'followers_profiles',
                                                               projection={'followers_count' : 1, '_id' : 0})])

# NOTE(review): `plt` is not imported anywhere in the visible part of this
# file -- presumably it is supplied by the notebook environment (e.g.
# matplotlib.pyplot via %pylab); confirm before running as a plain script.

# Now, let's plot the list to gain some initial intuition about the curve.

plt.loglog(timoreilly_followers_counts)
plt.ylabel("Num Followers")
plt.xlabel("Follower Rank")

# Now knowing that the curve resembles that of a "power law" and that
# most of the substance is in the "long tail", let's zoom in on some of the tail
# with a histogram.

# The resolution for a histogram displaying the full data set would be less than
# useful to look at since the x-axis would stretch all the way out to accommodate
# the outliers. (Try it.)

# plt.hist(timoreilly_followers_counts)

# So, hack the display a bit so that we only visualize 99% of the data and avoid the
# outliers on the upper extreme. It's easy enough to produce a separate plot of them

bins = [0,5,10,100,200,300,400,500,1000,4000]

# Multiply before dividing: len/100*99 floors first, which keeps noticeably
# less than 99% unless the length is a multiple of 100 (and nothing at all
# for fewer than 100 items).
cutoff_99_percent = len(timoreilly_followers_counts) * 99 // 100

plt.hist(timoreilly_followers_counts[:cutoff_99_percent], bins=bins)
plt.title("Tim O'Reilly Followers")
plt.xlabel('Bins (range of popularity for Tim\'s followers)')
plt.ylabel('Number of followers in bin')

# Let's start with a very base assumption for the time being:
# any follower with less than 10 followers of their own is "suspect" in the
# sense that this follower may be an inactive account, spambot, or abandoned account.
# Regardless, it probably offers no meaningful influence. (Although this assumption
# may be a bit naive, it seems a reasonable starting point for now.)
MIN = 10 timoreilly_suspect_followers = [f for f in load_from_mongo('timoreilly', 'followers_profiles', projection={'followers_count' : 1, 'id' : 1, '_id' : 0}) if f['followers_count'] < MIN] print "Tim O'Reilly has {0} 'suspect' followers for MIN={1}".format(len(timoreilly_suspect_followers), MIN) # What does a plot of these 'suspect' followers look like? timoreilly_suspect_followers_counts = sorted([f['followers_count'] for f in timoreilly_suspect_followers], reverse=True) plt.hist(timoreilly_suspect_followers_counts) plt.title("Tim O'Reilly Suspect Followers") plt.xlabel('Bins (range of followers)') plt.ylabel('Number of followers in each bin') # A little more analysis of how the distribution breaks down print "{0} of Tim O'Reilly's followers have 0 followers"\ .format(sum([1 for c in timoreilly_suspect_followers_counts if c < 1])) print "{0} of Tim O'Reilly's followers have 1 follower"\ .format(sum([1 for c in timoreilly_suspect_followers_counts if c <= 1])) print "{0} of Tim O'Reilly's followers have less than 3 followers"\ .format(sum([1 for c in timoreilly_suspect_followers_counts if c < 3])) print "{0} of Tim O'Reilly's followers have less than 4 followers"\ .format(sum([1 for c in timoreilly_suspect_followers_counts if c < 4])) print "{0} of Tim O'Reilly's followers have less than 5 followers"\ .format(sum([1 for c in timoreilly_suspect_followers_counts if c < 5])) # Create sets of ids for each account of interest by flattening out the lists for each # request that was stored in MongoDB. (The maximum document size in MongoDB is ~16MB so # there would have been little to gain by trying to store them in a flatter structure # from the outset since multiple documents would have inevitably been required.) 
timoreilly_followers_ids = set([fid for ids in load_from_mongo('timoreilly', 'followers_ids', projection={'ids' : 1}) for fid in ids['ids'] ]) ladygaga_followers_ids = set([fid for ids in load_from_mongo('ladygaga', 'followers_ids', projection={'ids' : 1}) for fid in ids['ids'] ]) # Now, calculate the number of followers in common between each person of interest # by using set intersections. timoreilly_ladygaga_common_followers_ids = timoreilly_followers_ids & ladygaga_followers_ids print "Tim O'Reilly and Lady Gaga have {0} followers in common."\ .format(len(timoreilly_ladygaga_common_followers_ids)) # So, how many of Tim O'Reilly's suspect followers are in common with # Lady Gaga? It would be helpful to know if the followers that they have in common # are or aren't domainted by the suspect followers. # For convenience, let's carve out just the IDs for easy setwise comparison timoreilly_suspect_followers_ids = set([f['id'] for f in timoreilly_suspect_followers]) print "{0} of Tim O'Reilly's 'suspect' followers are from the set that's in common with Lady Gaga's followers"\ .format(len(timoreilly_suspect_followers_ids & timoreilly_ladygaga_common_followers_ids)) # There are non-trivial numbers of followers in common, but computing the set # intersection doesn't take into account the size of any given set, so let's # calculate the Jaccard similarity score, which provides a sort of # normalization. See http://en.wikipedia.org/wiki/Jaccard_index for details. # In short, it's a similarity measurement. The higher the score, the more # similar two sets are in comparison to two other sets. 
def jaccard(x,y): return 1.0*len(x & y) / len(x | y) timoreilly_ladygaga_jaccard = jaccard(timoreilly_followers_ids, ladygaga_followers_ids) print "Tim O'Reilly and Lady Gaga's Jaccard Index: {0}".format(timoreilly_ladygaga_jaccard) # Need to define this variable, assuming you've pulled down the data for this account marissamayer_followers_ids = set([fid for ids in load_from_mongo('marissamayer', 'followers_ids', projection={'ids' : 1}) for fid in ids['ids'] ]) marissamayer_ladygaga_jaccard = jaccard(marissamayer_followers_ids, ladygaga_followers_ids) print "Marissa Mayer and Lady Gaga's Jaccard Index: {0}".format(marissamayer_ladygaga_jaccard) timoreilly_marissamayer_jaccard = jaccard(timoreilly_followers_ids, marissamayer_followers_ids) print "Tim O'Reilly and Marissa Mayer's Jaccard Index {0}".format(timoreilly_marissamayer_jaccard) timoreilly_followers_ids_not_suspect = timoreilly_followers_ids - timoreilly_suspect_followers_ids timoreilly_ladygaga_jaccard_not_suspect = jaccard(timoreilly_followers_ids_not_suspect, ladygaga_followers_ids) print "Tim O'Reilly and Lady Gaga's Jaccard Index adjusted for suspect followers: {0}"\ .format(timoreilly_ladygaga_jaccard_not_suspect) # Need to define this variable, assuming you've pulled down the data for this account marissamayer_followers_ids = set([fid for ids in load_from_mongo('marissamayer', 'followers_ids', projection={'ids' : 1}) for fid in ids['ids'] ]) marissamayer_followers_ids_not_suspect = marissamayer_followers_ids - marissamayer_suspect_followers_ids marissamayer_ladygaga_jaccard_not_suspect = jaccard(marissamayer_followers_ids_not_suspect, ladygaga_followers_ids) print "Marissa Mayer and Lady Gaga's Jaccard Index adjusted for suspect followers: {0}"\ .format(marissamayer_ladygaga_jaccard_not_suspect) timoreilly_marissamayer_jaccard_not_suspect = jaccard(timoreilly_followers_ids_not_suspect, marissamayer_followers_ids) print "Tim O'Reilly and Marissa Mayer's Jaccard Index adjusted for suspect followers 
{0}"\ .format(timoreilly_marissamayer_jaccard_not_suspect) # Calculate the number of followers in common between each person of interest # by using set intersections. all_common_followers_ids = marissamayer_followers_ids & timoreilly_followers_ids & ladygaga_followers_ids print "Tim O'Reilly, Lady Gaga, and Marissa Mayer have {0} followers in common."\ .format(len(all_common_followers_ids)) # Let's recycle some code to compute Marissa Mayer's suspect followers MIN = 10 marissamayer_suspect_followers = [f for f in load_from_mongo('marissamayer', 'followers_profiles', projection={'followers_count' : 1, 'id' : 1, '_id' : 0}) if f['followers_count'] < MIN] print "Marissa Mayer has {0} 'suspect' followers for MIN={1}".format(len(marissamayer_suspect_followers), MIN) marissamayer_suspect_followers_counts = sorted([f['followers_count'] for f in marissamayer_suspect_followers], reverse=True) plt.hist(marissamayer_suspect_followers_counts) plt.title("Marissa Mayer Suspect Followers") plt.xlabel('Bins (range of followers)') plt.ylabel('Number of followers in each bin') marissamayer_suspect_followers_ids = set([f['id'] for f in marissamayer_suspect_followers]) all_common_followers_ids_not_suspect = all_common_followers_ids - \ (timoreilly_suspect_followers_ids | marissamayer_suspect_followers_ids) print "Tim O'Reilly, Lady Gaga, and Marissa Mayer have {0} non-suspect followers in common."\ .format(len(all_common_followers_ids_not_suspect)) marissamayer_ladygaga_followers_ids = marissamayer_followers_ids & ladygaga_followers_ids marissamayer_ladygaga_followers_ids_not_suspect = marissamayer_ladygaga_followers_ids - marissamayer_suspect_followers_ids timoreilly_marissamayer_followers_ids = timoreilly_followers_ids & marissamayer_followers_ids timoreilly_marissamayer_followers_ids_not_suspect = timoreilly_marissamayer_followers_ids - marissamayer_suspect_followers_ids print "Marissa Mayer and Lady Gaga have {0} followers in common. 
{1} of them are not suspect."\ .format(len(marissamayer_ladygaga_followers_ids), len(marissamayer_ladygaga_followers_ids_not_suspect)) print "Tim O'Reilly and Marissa Mayer have {0} followers in common. {1} of them are not suspect."\ .format(len(timoreilly_marissamayer_followers_ids), len(timoreilly_marissamayer_followers_ids_not_suspect))