from pandas import read_csv,DatetimeIndex,ols
from urllib import urlopen

def get_index(gindex, startdate=20040101, opener=None):
    """
    API wrapper for Google Domestic Trends data.
        https://www.google.com/finance/domestic_trends

    Available Indices:

       'ADVERT', 'AIRTVL', 'AUTOBY', 'AUTOFI', 'AUTO', 'BIZIND', 'BNKRPT',
       'COMLND', 'COMPUT', 'CONSTR', 'CRCARD', 'DURBLE', 'EDUCAT', 'INVEST',
       'FINPLN', 'FURNTR', 'INSUR', 'JOBS', 'LUXURY', 'MOBILE', 'MTGE',
       'RLEST', 'RENTAL', 'SHOP', 'TRAVEL', 'UNEMPL'

    Parameters
    ----------
    gindex : str
        Index code; appended to the ``GOOGLEINDEX_US:`` query and used as
        the name of the returned series.
    startdate : int or str, optional
        Inserted verbatim into the ``startdate`` query parameter
        (the default, 20040101, reads as YYYYMMDD).
    opener : callable, optional
        ``opener(url)`` must return a file-like object containing the CSV
        payload.  Defaults to the module-level ``urlopen``; pass another
        callable to use a different transport or a cached response.

    Returns
    -------
    pandas.Series
        The 'Close' values, named ``gindex``, on a DatetimeIndex sorted in
        ascending order.

    Raises
    ------
    KeyError
        If, after discarding constant columns, the data does not reduce to
        a single 'Close' column (and no column is already named ``gindex``).
    """
    from contextlib import closing

    if opener is None:
        opener = urlopen

    base_url = 'http://www.google.com/finance/historical?q=GOOGLEINDEX_US:'
    full_url = '%s%s&output=csv&startdate=%s' % (base_url, gindex, startdate)

    # closing() releases the connection even when parsing fails.
    with closing(opener(full_url)) as response:
        dframe = read_csv(response, index_col=0)

    dframe.index = DatetimeIndex(dframe.index)
    # No positional axis: pandas >= 2.0 rejects it, and axis 0 is the default.
    dframe = dframe.sort_index()

    # Collect the single-valued columns first instead of deleting while
    # iterating.  'Close' is exempt so that a one-row (or flat) series is
    # still returned rather than being thrown away with the placeholders.
    constant_cols = [col for col in dframe.columns
                     if col != 'Close' and len(dframe[col].unique()) == 1]
    if constant_cols:
        dframe = dframe.drop(constant_cols, axis=1)

    if len(dframe.columns) == 1 and dframe.columns[0] == 'Close':
        dframe.columns = [gindex]

    if gindex not in dframe.columns:
        raise KeyError(
            "no single 'Close' series found for index %r; remaining columns: %s"
            % (gindex, list(dframe.columns)))
    return dframe[gindex]

from operator import itemgetter

# Search-interest series for prospective car buyers and for auto financing.
autobuyers = get_index('AUTOBY') # https://www.google.com/finance?q=GOOGLEINDEX_US:AUTOBY
autofinancing = get_index('AUTOFI') # https://www.google.com/finance?q=GOOGLEINDEX_US:AUTOFI

# Run OLS to test whether searches related to purchasing a car are
# predictive of searches for automotive financing.
model = ols(y=autofinancing, x={'Automotive Buyers': autobuyers})
# print(...) with a single argument behaves the same on Python 2 and 3.
print(model.summary)

# Plot actual Y (green) vs. predicted Y (blue)
pred = model.predict()
pred.plot(color='b')
autofinancing.plot(color='g')

# Plot the residual (red)
err = model.resid
err.plot(color='r')

# Test a simple model of 1-period autocorrelation: regress the residual on
# a copy of itself whose index has been shifted by one day.
err_t1 = err.tshift(1, freq='D')
autocorr_model = ols(y=err, x={'err_t1': err_t1})
print(autocorr_model.summary)

# Test a model with up to 14-period autocorrelation (lags 1 through 14)
err_terms = {'err_%s' % lag: err.tshift(lag, freq='D')
             for lag in range(1, 15)}
autocorr_model = ols(y=err, x=err_terms)
print(autocorr_model.summary)

# Find the terms that are significant at the 95% level (p-value < 0.05)
significantvals = [
    (term, paramval)
    for pval, term, paramval in zip(autocorr_model.p_value,
                                    autocorr_model.beta.index,
                                    autocorr_model.beta)
    if pval < 0.05
]

# Sort them by the size of the coefficient (ascending) and report each one
sorted_significantvals = sorted(significantvals, key=itemgetter(1))
for term, paramval in sorted_significantvals:
    print('%s: %s' % (term, paramval))