from functools import partial
from sys import maxint
import sys
import time
from urllib2 import URLError
from httplib import BadStatusLine
import json
import twitter # pip install twitter
import pymongo # pip install pymongo


def oauth_login():
    """Return a ``twitter.Twitter`` client configured with OAuth credentials.

    XXX: The four credentials below are empty-string placeholders. Go to
    http://twitter.com/apps/new to create an app and obtain real values.
    See https://dev.twitter.com/docs/auth/oauth for more information on
    Twitter's OAuth implementation.
    """

    CONSUMER_KEY = ''
    CONSUMER_SECRET = ''
    OAUTH_TOKEN = ''
    OAUTH_TOKEN_SECRET = ''

    # Note the argument order: the token pair comes before the consumer pair.
    credentials = twitter.oauth.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET,
                                      CONSUMER_KEY, CONSUMER_SECRET)

    return twitter.Twitter(auth=credentials)

def make_twitter_request(twitter_api_func, max_errors=10, *args, **kw): 
    """Call ``twitter_api_func(*args, **kw)`` and retry on common failures.

    ``max_errors`` is the number of consecutive URLError / BadStatusLine
    failures tolerated before the last one is re-raised. Because it is the
    second positional parameter, positional arguments meant for the API
    function can only be supplied after an explicit ``max_errors`` value.

    Returns whatever ``twitter_api_func`` returns, or None when Twitter
    answers with a 401 or 404, which the caller must handle itself.
    """

    def handle_twitter_http_error(e, wait_period=2, sleep_when_rate_limited=True):
        # Nested helper for twitter.api.TwitterHTTPError. Returns the next
        # wait_period for 5xx errors, sleeps through a rate-limit window and
        # returns 2 for a 429, returns None for 401/404, and re-raises
        # anything else. The outer loop never passes sleep_when_rate_limited,
        # so it is always True in practice.

        if wait_period > 3600: # Seconds
            print >> sys.stderr, 'Too many retries. Quitting.'
            raise e

        # See https://dev.twitter.com/docs/error-codes-responses for common codes
        status = e.e.code

        caller_handled = {
            401: 'Encountered 401 Error (Not Authorized)',
            404: 'Encountered 404 Error (Not Found)',
        }
        if status in caller_handled:
            print >> sys.stderr, caller_handled[status]
            return None

        if status == 429:
            print >> sys.stderr, 'Encountered 429 Error (Rate Limit Exceeded)'
            if not sleep_when_rate_limited:
                raise e # Allow user to handle the rate limiting issue however they'd like 
            print >> sys.stderr, "Sleeping for 15 minutes, and then I'll try again...ZzZ..."
            sys.stderr.flush()
            time.sleep(60*15 + 5)
            print >> sys.stderr, '...ZzZ...Awake now and trying again.'
            return 2

        if status in (500, 502, 503, 504):
            print >> sys.stderr, 'Encountered %i Error. Will retry in %i seconds' % (status,
                    wait_period)
            time.sleep(wait_period)
            return wait_period * 1.5

        raise e

    # End of nested helper function

    backoff = 2
    consecutive_failures = 0

    while True:
        try:
            return twitter_api_func(*args, **kw)
        except twitter.api.TwitterHTTPError as http_error:
            # An HTTP-level answer means the network is fine again
            consecutive_failures = 0
            backoff = handle_twitter_http_error(http_error, backoff)
            if backoff is None:
                return
        except URLError:
            consecutive_failures += 1
            print >> sys.stderr, "URLError encountered. Continuing."
            if consecutive_failures > max_errors:
                print >> sys.stderr, "Too many consecutive errors...bailing out."
                raise
        except BadStatusLine:
            consecutive_failures += 1
            print >> sys.stderr, "BadStatusLine encountered. Continuing."
            if consecutive_failures > max_errors:
                print >> sys.stderr, "Too many consecutive errors...bailing out."
                raise
                
def store_friends_followers_ids(twitter_api, screen_name=None, user_id=None,
                              friends_limit=maxint, followers_limit=maxint, database=None):
    """Page through an account's friend and follower ids and store them in MongoDB.

    Exactly one of ``screen_name`` / ``user_id`` must be given. Each page of
    ids is saved via save_to_mongo as one ``{"ids": [...]}`` document in the
    ``friends_ids`` or ``followers_ids`` collection of ``database``.

    ``friends_limit`` / ``followers_limit`` cap the total number of ids stored
    for each kind; a limit of 0 (or less) skips that kind entirely. Paging
    stops early if a request returns None (make_twitter_request does this for
    401/404 responses). Nothing is returned.
    """

    # Must have either screen_name or user_id (logical xor)
    assert (screen_name is not None) != (user_id is not None), "Must have screen_name or user_id, but not both"

    # See https://dev.twitter.com/docs/api/1.1/get/friends/ids  and
    # See https://dev.twitter.com/docs/api/1.1/get/followers/ids for details on API parameters

    get_friends_ids = partial(make_twitter_request, twitter_api.friends.ids, count=5000)
    get_followers_ids = partial(make_twitter_request, twitter_api.followers.ids, count=5000)

    # The account is identified the same way on every request
    if screen_name:
        account_kw = {'screen_name': screen_name}
    else: # user_id
        account_kw = {'user_id': user_id}

    for twitter_api_func, limit, label in [
                                 [get_friends_ids, friends_limit, "friends"],
                                 [get_followers_ids, followers_limit, "followers"]
                             ]:

        if limit <= 0: continue

        total_ids = 0
        cursor = -1
        while cursor != 0:

            # Use make_twitter_request via the partially bound callable...
            response = twitter_api_func(cursor=cursor, **account_kw)

            if response is not None:
                # Trim the page so that no more than `limit` ids are stored in total
                ids = response['ids'][:limit - total_ids]
                total_ids += len(ids)
                save_to_mongo({"ids" : list(ids)}, database, label + "_ids")
                cursor = response['next_cursor']

            print >> sys.stderr, 'Fetched {0} total {1} ids for {2}'.format(total_ids, label, (user_id or screen_name))
            sys.stderr.flush()

            # Test `response is None` first: on a failed first request there
            # are no ids yet. The limit applies to the running total, not to
            # the size of the most recent page.
            if response is None or total_ids >= limit:
                break

def save_to_mongo(data, mongo_db, mongo_db_coll, auth=None, **mongo_conn_kw):
    """Insert ``data`` into collection ``mongo_db_coll`` of database ``mongo_db``.

    ``mongo_conn_kw`` is passed straight to ``pymongo.MongoClient``; with no
    arguments it connects to the MongoDB server running on localhost:27017.
    ``auth``, when given, is a ``(user, password)`` pair used to authenticate
    against the database.

    Returns whatever ``Collection.insert`` returns (the inserted id or ids).
    """

    client = pymongo.MongoClient(**mongo_conn_kw)

    # This function is called once per page / per profile by the harvesting
    # helpers, so always release the connection instead of leaking one client
    # per call.
    try:
        # Get a reference to a particular database
        db = client[mongo_db]
        if auth:
            db.authenticate(auth[0], auth[1])

        # Reference a particular collection on the database
        coll = db[mongo_db_coll]

        # Perform a bulk insert and return the ids
        return coll.insert(data)
    finally:
        client.close()

def load_from_mongo(mongo_db, mongo_db_coll, return_cursor=False,
                    criteria=None, projection=None, auth=None, **mongo_conn_kw):
    """Query collection ``mongo_db_coll`` of database ``mongo_db``.

    ``criteria`` and ``projection`` optionally limit the data returned, as
    documented in
    http://docs.mongodb.org/manual/reference/method/db.collection.find/
    (consider MongoDB's aggregation framework for more sophisticated
    queries). ``auth`` is an optional ``(user, password)`` pair and
    ``mongo_conn_kw`` is forwarded to ``pymongo.MongoClient``.

    Returns the pymongo cursor when ``return_cursor`` is true (recommended
    for large amounts of data); otherwise a list of all matching documents.
    """

    database = pymongo.MongoClient(**mongo_conn_kw)[mongo_db]

    if auth:
        database.authenticate(auth[0], auth[1])

    # find() receives the projection only when one was actually supplied
    find_args = [{} if criteria is None else criteria]
    if projection is not None:
        find_args.append(projection)

    results = database[mongo_db_coll].find(*find_args)

    return results if return_cursor else list(results)
    
def store_user_info(twitter_api, screen_names=None, user_ids=None, database=None):
    """Look up user profiles in batches of 100 and store each one in MongoDB.

    Exactly one of ``screen_names`` / ``user_ids`` must be given (a sequence
    supporting ``len`` and slicing). Every profile returned by /users/lookup
    is saved via save_to_mongo into the ``followers_profiles`` collection of
    ``database``. A batch whose request comes back as None
    (make_twitter_request does this for 401/404 responses) is reported on
    stderr and skipped. Nothing is returned.
    """

    # Must have either screen_names or user_ids (logical xor)
    assert (screen_names is not None) != (user_ids is not None), "Must have screen_names or user_ids, but not both"

    # Decide by "is not None" so that an empty screen_names list is simply a no-op
    by_screen_name = screen_names is not None
    items = screen_names if by_screen_name else user_ids

    total = len(items)

    # Process 100 items at a time per the API specifications for /users/lookup. See
    # https://dev.twitter.com/docs/api/1.1/get/users/lookup for details.
    # Stepping an index avoids re-copying the remaining list on every batch.
    for start in range(0, total, 100):
        remaining = total - start

        # Progress report roughly once per thousand remaining items
        if remaining // 100 * 100 % 1000 == 0:
            print >> sys.stderr, remaining, "remaining"

        items_str = ','.join([str(item) for item in items[start:start + 100]])

        if by_screen_name:
            response = make_twitter_request(twitter_api.users.lookup, screen_name=items_str)
        else: # user_ids
            response = make_twitter_request(twitter_api.users.lookup, user_id=items_str)

        if response is None:
            print >> sys.stderr, "No profiles returned for this batch. Skipping it."
            continue

        for profile in response:
            save_to_mongo(profile, database, 'followers_profiles')
            
# Instantiate one Twitter API client at module level for common use
# throughout the rest of this notebook. harvest_followers_ids and
# harvest_followers_profiles below read this global directly rather than
# taking a client argument.

twitter_api = oauth_login()

# Harvest the follower IDs. Note that this process takes a bit of time since
# you are limited to harvesting 75k IDs per 15-minute window.
# 
# Some example accounts that we'll look at later in this notebook:
#
# Tim O'Reilly: ~1.7M followers (~6 hours)
# Marissa Mayer: ~460k followers (~1.5 hours)
# Lady Gaga: ~40M followers (~5.5 days)
#
# All in all, it takes a little over a week to harvest all of these IDs.


# Define a simple wrapper that accepts a list of screen names for convenience
# and reusability later.

def harvest_followers_ids(screen_names=None):
    """Store the follower ids of every account named in ``screen_names``.

    Uses the module-level ``twitter_api`` client. Friends are skipped
    (``friends_limit=0``), and each account's ids go into a MongoDB database
    named after its screen name. With no argument (or an empty sequence)
    nothing is harvested.
    """
    # The default is None rather than a shared mutable list
    for screen_name in screen_names or ():
        store_friends_followers_ids(twitter_api, screen_name=screen_name,
                                    friends_limit=0, database=screen_name)

        
# Harvest the follower ids for @timoreilly. They are stored in the
# 'timoreilly' database, in the 'followers_ids' collection.
harvest_followers_ids(screen_names=[ 'timoreilly' ])

print "Done"

# Iterate over the IDs for each follower of Tim O'Reilly and
# lookup the profile. This process also takes some time since you are 
# limited to 18k profiles per 15 minute rate limit window. Resolving 
# all of these profiles takes a little over a day.
#
# Note that we could always opt to draw a random sample for a highly popular 
# account's followers IDs for a statistical analysis. However, be advised that
# for the sample to be truly random, you do probably need to first pull down the
# totality of the follower IDs and then draw from that collection. Simply harvesting
# the first N followers returned by the API and using that as a sample may not be 
# sufficient, since Twitter currently returns followers in the order in which the
# follow occurred (though this is subject to change without notice.)
#
# For example, a 95% confidence interval with a 1% margin of error requires just
# under 10k items be drawn for the random sample. A 99% confidence interval with 
# a 1% margin of error requires just under 17k items be drawn for the random sample.


# Define another convenience wrapper and assume that you've already
# harvested the follower IDs for the account in question

def harvest_followers_profiles(screen_names=None):
    """Look up and store the profile of every stored follower id.

    For each account in ``screen_names`` this reads the ``followers_ids``
    collection from the database named after the account (so the ids must
    already have been harvested), flattens the batches into one list, and
    passes it to store_user_info together with the module-level
    ``twitter_api`` client. With no argument (or an empty sequence) nothing
    is done.
    """
    # The default is None rather than a shared mutable list
    for screen_name in screen_names or ():
        followers_ids = load_from_mongo(screen_name, 'followers_ids')

        # Flattening the IDs into memory requires a generous heap space. An
        # m1.xlarge AWS node (15GB of memory) should be more than sufficient
        # and spot prices are typically around $0.05/hr

        all_ids = [ _id for ids_batch in followers_ids for _id in ids_batch['ids'] ]

        store_user_info(twitter_api, user_ids=all_ids, database=screen_name)

        
# Resolve the profiles for @timoreilly's previously harvested follower ids.
# They are stored in the 'timoreilly' database, in the 'followers_profiles'
# collection.
harvest_followers_profiles(screen_names=[ 'timoreilly' ])

print "Done."

# Build an ascending list holding, for each of Tim O'Reilly's followers, that
# follower's own follower count. Having it sorted makes slicing and plotting
# convenient.

timoreilly_followers_counts = sorted(
    profile['followers_count']
    for profile in load_from_mongo(
        'timoreilly', 'followers_profiles',
        projection={'followers_count': 1, '_id': 0}))

# Plot the list on log-log axes to gain some initial intuition about the curve.
# NOTE(review): `plt` is not imported in this file -- presumably supplied by
# the notebook's pylab environment; confirm.

plt.loglog(timoreilly_followers_counts)
plt.ylabel("Num Followers")
plt.xlabel("Follower Rank")

# Now knowing that the curve resembles that of a "power law" and that
# most of the substance is in the "long tail", let's zoom in on some of the
# tail with a histogram.

# A histogram of the full data set would be less than useful to look at,
# since the x-axis would stretch all the way out to accommodate the
# outliers. (Try it.)
# plt.hist(timoreilly_followers_counts)

# So, hack the display a bit: the counts are sorted in ascending order, so
# slicing off the tail of the list keeps about 99% of the data and drops the
# outliers on the upper extreme. It's easy enough to produce a separate plot
# of them.

bins = [0, 5, 10, 100, 200, 300, 400, 500, 1000, 4000]
plt.hist(
    timoreilly_followers_counts[:len(timoreilly_followers_counts) // 100 * 99],
    bins=bins)

plt.title("Tim O'Reilly Followers")
plt.xlabel("Bins (range of popularity for Tim's followers)")
plt.ylabel('Number of followers in bin')

# Let's start with a very base assumption for the time being: 
# any follower with less than 10 followers of their own is "suspect" in the
# sense that this follower may be an inactive account, spambot, or abandoned account.
# Regardless, it probably offers no meaningful influence. (Although this assumption 
# may be a bit naive, it seems a reasonable starting point for now.)

MIN = 10
timoreilly_suspect_followers = [f 
                                for f in load_from_mongo('timoreilly', 'followers_profiles', 
                                                          projection={'followers_count' : 1, 'id' : 1, '_id' : 0})
                                if f['followers_count'] < MIN]

print "Tim O'Reilly has {0} 'suspect' followers for MIN={1}".format(len(timoreilly_suspect_followers), MIN)

# What does a plot of these 'suspect' followers look like?

timoreilly_suspect_followers_counts = sorted([f['followers_count'] 
                                              for f in timoreilly_suspect_followers], reverse=True)

plt.hist(timoreilly_suspect_followers_counts)
plt.title("Tim O'Reilly Suspect Followers")
plt.xlabel('Bins (range of followers)')
plt.ylabel('Number of followers in each bin')

# A little more analysis of how the distribution breaks down
print "{0} of Tim O'Reilly's followers have 0 followers"\
.format(sum([1 for c in timoreilly_suspect_followers_counts if c < 1]))

print "{0} of Tim O'Reilly's followers have 1 follower"\
.format(sum([1 for c in timoreilly_suspect_followers_counts if c <= 1]))

print "{0} of Tim O'Reilly's followers have less than 3 followers"\
.format(sum([1 for c in timoreilly_suspect_followers_counts if c < 3]))

print "{0} of Tim O'Reilly's followers have less than 4 followers"\
.format(sum([1 for c in timoreilly_suspect_followers_counts if c < 4]))

print "{0} of Tim O'Reilly's followers have less than 5 followers"\
.format(sum([1 for c in timoreilly_suspect_followers_counts if c < 5]))

# Create sets of ids for each account of interest by flattening out the lists for each
# request that was stored in MongoDB. (The maximum document size in MongoDB is ~16MB so
# there would have been little to gain by trying to store them in a flatter structure 
# from the outset since multiple documents would have inevitably been required.)

timoreilly_followers_ids = set([fid
                 for ids in load_from_mongo('timoreilly', 'followers_ids', projection={'ids' : 1})
                     for fid in ids['ids']
                 ])

ladygaga_followers_ids = set([fid
                 for ids in load_from_mongo('ladygaga', 'followers_ids', projection={'ids' : 1})
                     for fid in ids['ids']
                 ])

# Now, calculate the number of followers in common between each person of interest
# by using set intersections.

timoreilly_ladygaga_common_followers_ids = timoreilly_followers_ids & ladygaga_followers_ids

print "Tim O'Reilly and Lady Gaga have {0} followers in common."\
.format(len(timoreilly_ladygaga_common_followers_ids))

# So, how many of Tim O'Reilly's suspect followers are in common with
# Lady Gaga? It would be helpful to know if the followers that they have in common
# are or aren't domainted by the suspect followers.

# For convenience, let's carve out just the IDs for easy setwise comparison
timoreilly_suspect_followers_ids = set([f['id'] for f in timoreilly_suspect_followers])

print "{0} of Tim O'Reilly's 'suspect' followers are from the set that's in common with Lady Gaga's followers"\
.format(len(timoreilly_suspect_followers_ids & timoreilly_ladygaga_common_followers_ids))

# There are non-trivial numbers of followers in common, but computing the set
# intersection doesn't take into account the size of any given set, so let's
# calculate the Jaccard similarity score, which provides a sort of
# normalization. See http://en.wikipedia.org/wiki/Jaccard_index for details.
# In short, it's a similarity measurement. The higher the score, the more
# similar two sets are in comparison to two other sets.

def jaccard(x, y):
    """Return the Jaccard similarity ``len(x & y) / len(x | y)`` as a float.

    ``x`` and ``y`` are sets. When both are empty the union is empty too;
    0.0 is returned in that case instead of raising ZeroDivisionError.
    """
    union_size = len(x | y)
    if union_size == 0:
        return 0.0
    return 1.0 * len(x & y) / union_size

timoreilly_ladygaga_jaccard = jaccard(timoreilly_followers_ids, ladygaga_followers_ids)
print "Tim O'Reilly and Lady Gaga's Jaccard Index: {0}".format(timoreilly_ladygaga_jaccard)

# Need to define this variable, assuming you've pulled down the data for this account

marissamayer_followers_ids = set([fid
                 for ids in load_from_mongo('marissamayer', 'followers_ids', projection={'ids' : 1})
                     for fid in ids['ids']
                 ])

marissamayer_ladygaga_jaccard = jaccard(marissamayer_followers_ids, ladygaga_followers_ids)
print "Marissa Mayer and Lady Gaga's Jaccard Index: {0}".format(marissamayer_ladygaga_jaccard)

timoreilly_marissamayer_jaccard = jaccard(timoreilly_followers_ids, marissamayer_followers_ids)
print "Tim O'Reilly and Marissa Mayer's Jaccard Index {0}".format(timoreilly_marissamayer_jaccard)

timoreilly_followers_ids_not_suspect = timoreilly_followers_ids - timoreilly_suspect_followers_ids

timoreilly_ladygaga_jaccard_not_suspect = jaccard(timoreilly_followers_ids_not_suspect, ladygaga_followers_ids)
print "Tim O'Reilly and Lady Gaga's Jaccard Index adjusted for suspect followers: {0}"\
.format(timoreilly_ladygaga_jaccard_not_suspect)

# Need to define this variable, assuming you've pulled down the data for this account

marissamayer_followers_ids = set([fid
                 for ids in load_from_mongo('marissamayer', 'followers_ids', projection={'ids' : 1})
                     for fid in ids['ids']
                 ])

marissamayer_followers_ids_not_suspect = marissamayer_followers_ids - marissamayer_suspect_followers_ids

marissamayer_ladygaga_jaccard_not_suspect = jaccard(marissamayer_followers_ids_not_suspect, ladygaga_followers_ids)
print "Marissa Mayer and Lady Gaga's Jaccard Index adjusted for suspect followers: {0}"\
.format(marissamayer_ladygaga_jaccard_not_suspect)

timoreilly_marissamayer_jaccard_not_suspect = jaccard(timoreilly_followers_ids_not_suspect, marissamayer_followers_ids)
print "Tim O'Reilly and Marissa Mayer's Jaccard Index adjusted for suspect followers {0}"\
.format(timoreilly_marissamayer_jaccard_not_suspect)

# Calculate the number of followers in common between each person of interest
# by using set intersections.

all_common_followers_ids = marissamayer_followers_ids & timoreilly_followers_ids & ladygaga_followers_ids

print "Tim O'Reilly, Lady Gaga, and Marissa Mayer have {0} followers in common."\
.format(len(all_common_followers_ids))

# Let's recycle some code to compute Marissa Mayer's suspect followers

MIN = 10
marissamayer_suspect_followers = [f 
                                for f in load_from_mongo('marissamayer', 'followers_profiles', 
                                                          projection={'followers_count' : 1, 'id' : 1, '_id' : 0})
                                if f['followers_count'] < MIN]

print "Marissa Mayer has {0} 'suspect' followers for MIN={1}".format(len(marissamayer_suspect_followers), MIN)

marissamayer_suspect_followers_counts = sorted([f['followers_count'] 
                                              for f in marissamayer_suspect_followers], reverse=True)

plt.hist(marissamayer_suspect_followers_counts)
plt.title("Marissa Mayer Suspect Followers")
plt.xlabel('Bins (range of followers)')
plt.ylabel('Number of followers in each bin')

marissamayer_suspect_followers_ids = set([f['id'] for f in marissamayer_suspect_followers])

all_common_followers_ids_not_suspect = all_common_followers_ids - \
                                       (timoreilly_suspect_followers_ids | marissamayer_suspect_followers_ids)

print "Tim O'Reilly, Lady Gaga, and Marissa Mayer have {0} non-suspect followers in common."\
.format(len(all_common_followers_ids_not_suspect))

marissamayer_ladygaga_followers_ids = marissamayer_followers_ids & ladygaga_followers_ids
marissamayer_ladygaga_followers_ids_not_suspect = marissamayer_ladygaga_followers_ids - marissamayer_suspect_followers_ids
                                                  

timoreilly_marissamayer_followers_ids = timoreilly_followers_ids & marissamayer_followers_ids
timoreilly_marissamayer_followers_ids_not_suspect = timoreilly_marissamayer_followers_ids - marissamayer_suspect_followers_ids
                                                    

print "Marissa Mayer and Lady Gaga have {0} followers in common. {1} of them are not suspect."\
.format(len(marissamayer_ladygaga_followers_ids), len(marissamayer_ladygaga_followers_ids_not_suspect))    
    
print "Tim O'Reilly and Marissa Mayer have {0} followers in common. {1} of them are not suspect."\
.format(len(timoreilly_marissamayer_followers_ids), len(timoreilly_marissamayer_followers_ids_not_suspect))