# Notebook-style analysis script: OLS and Bayesian regression of yearly
# Units on the previous quarter's figure (lastqu).
import pandas as pd
import io
import statsmodels.api as sm
# BUG FIX: '%matplotlib inline' is IPython magic, not Python syntax -- it
# raises SyntaxError when this file runs as a plain script.  Re-enable it
# manually when pasting into a notebook cell.
# %matplotlib inline
import matplotlib.pyplot as plt
# Sample data (from Statewide): yearly Units sold plus the last quarter's
# figure (lastqu), tab-separated; the first, unnamed column is the year-end
# date, which pandas infers as the index (header has one fewer field than
# the data rows).
content2 = '''\
Units	lastqu
2000-12-31	 19391	 NaN
2001-12-31	 35068	 5925
2002-12-31	 39279	 8063
2003-12-31	 47517	 9473
2004-12-31	 51439	 11226
2005-12-31	 59674	 11667
2006-12-31	 58664	 14016
2007-12-31	 55698	 13186
2008-12-31	 42235	 11343
2009-12-31	 40478	 7867
2010-12-31	 38722	 8114
2011-12-31	 36965	 8361
2012-12-31	 39132	 8608
2013-12-31	 43160	 9016
2014-12-31	 NaN	 9785
'''
# BUG FIX: content2 is a str, so it must be wrapped in io.StringIO --
# io.BytesIO requires bytes and raises TypeError on Python 3.  read_csv
# with an explicit tab separator replaces the dated read_table.
df2 = pd.read_csv(io.StringIO(content2), sep='\t')
# Cast to int where defined.  Assignment realigns on the index, so the row
# excluded from each slice (last Units row, first lastqu row) stays NaN
# (which also leaves both columns as float64).
df2['Units'] = df2['Units'][:-1].astype('int')
df2['lastqu'] = df2['lastqu'][1:].astype('int')
df2  # note sample data is from Statewide

def fit_line2(x, y):
    """Fit an OLS regression of y on x (with an intercept) and return the
    fitted results object.

    The intercept/slope are available via ``.params`` on the result; rows
    with missing values are dropped by the model (``missing='drop'``).
    """
    # Prepend a column of ones so the model estimates an intercept.
    X = sm.add_constant(x, prepend=True)
    # FIX: the original recomputed add_constant(x) after fitting (dead
    # code) and carried a misplaced docstring claiming it returned
    # (slope, intercept); it returns the fitted results object.
    return sm.OLS(y, X, missing='drop').fit()

# Use lastqu to fit Units (so future Units can be forecast); skip the
# first and last rows, which hold the NaNs on either column.
ols_test = fit_line2(df2['lastqu'][1:-1], df2['Units'][1:-1])
ols_test.summary()

# Predict 2014 from the final lastqu value.  BUG FIX: add_constant skips
# adding the intercept column when the input already looks constant -- and
# a single-row input always does -- so force it with has_constant='add';
# otherwise predict sees one exog column against two fitted parameters.
nextq = ols_test.predict(sm.add_constant(df2['lastqu'][-1:], prepend=True, has_constant='add'))
nextq

# Diagnostic plots for the fitted model against the 'lastqu' regressor
# (fitted vs. observed, residuals, partial regression, CCPR).
fig = plt.figure(figsize=(12,8))
fig=sm.graphics.plot_regress_exog(ols_test,'lastqu',fig=fig)

fig.clf() #clears the fig!
# Partial-regression plots for every regressor, drawn on the cleared figure.
sm.graphics.plot_partregress_grid(ols_test, fig=fig)

#fig.clf() #clears the fig!
# NOTE(review): ols_test is already a fitted result, so refitting its model
# here just reproduces the same estimates -- `result` should equal ols_test.
result=ols_test.model.fit()
result.summary()

# Same regression through the statsmodels formula API (R-style
# 'response ~ predictors'); the estimates match the sm.OLS fit above.
import statsmodels.formula.api as smf

# Build the model from the formula, then fit it.
model = smf.ols(formula='Units ~ lastqu', data=df2)
est = model.fit()
est.summary()

fig = plt.figure(figsize=(12, 8))
fig = sm.graphics.plot_regress_exog(est, 'lastqu', fig=fig)

#Remove NaN's
# Trim the first row (NaN lastqu) and the last row (NaN Units) so the
# Bayesian model sees complete cases only.
df2=df2[1:-1]
import pymc as pm
import numpy as np
x=df2['lastqu']
y=df2['Units']
trace = None
# Bayesian linear regression: Units ~ alpha + beta * lastqu.
# NOTE(review): this is the PyMC3-era API (`sd=`, `pm.find_MAP`,
# `pm.NUTS(state=...)`, `pm.traceplot`); current PyMC releases renamed
# `sd` to `sigma` and changed the NUTS/traceplot entry points -- confirm
# against the installed version before running.
with pm.Model() as model:
    # Weakly-informative priors on intercept, slope and noise scale.
    alpha = pm.Normal('alpha', mu=0, sd=20)
    beta = pm.Normal('beta', mu=0, sd=20)
    sigma = pm.Uniform('sigma', lower=0, upper=12000)
    
    # Linear predictor for the mean of the likelihood.
    y_est = alpha + beta * x
    
    likelihood = pm.Normal('y', mu=y_est, sd=sigma, observed=y)
    
    # Start sampling from the MAP estimate; draw 2000 NUTS samples.
    start = pm.find_MAP()
    step = pm.NUTS(state=start)
    trace = pm.sample(2000, step, start=start, progressbar=False)

    pm.traceplot(trace);

def graph(formula, x_range, color='black', alpha=1):
    """Plot *formula* -- a Python expression written in terms of ``x`` --
    as a line over ``x_range`` with the given color and alpha.

    NOTE: ``eval`` executes the formula string; pass trusted strings only.
    """
    x = np.array(x_range)  # must be named `x`: the formula refers to it
    plt.plot(x, eval(formula), color=color, alpha=alpha)
    
# Scatter the data, then overlay one faint regression line per posterior
# sample to visualize the parameter uncertainty.
plt.scatter(x, y)

# BUG FIX: xrange is Python 2 only (NameError on Python 3); use range.
for i in range(0, 2000):
    point = trace.point(i)
    graph('{0} + {1}*x'.format(point['alpha'], point['beta']), range(6000,15000), color='black', alpha=.0098035)

plt.show()

# Rebuild the predictor/response pair and package them for the GLM model.
x = df2['lastqu']
y = df2['Units']
data = {'x': x, 'y': y}
data

# Same Bayesian regression expressed as a GLM formula ('y ~ x').
# NOTE(review): pm.glm.glm(...) is the old PyMC3 spelling; later releases
# moved this to pm.glm.GLM.from_formula -- confirm the installed version.
with pm.Model() as model:
    # Specify the GLM and pass in the data: the linear model, its
    # likelihood, and all its parameters are added to the model context.
    pm.glm.glm('y ~ x', data)
    step = pm.NUTS() # Instantiate MCMC sampling algorithm
    trace = pm.sample(2000, step, progressbar=False) # draw 2000 posterior samples using NUTS sampling

plt.figure(figsize=(7, 7))
pm.traceplot(trace)
plt.tight_layout();

# Inspect the parameters of a single posterior sample.
# NOTE(review): `point` is leftover loop state from the line-plotting loop
# (the final sample, index 1999) -- it is one draw, not a posterior
# summary; use trace means/quantiles for point estimates.
point['sigma']

point['beta']

point['alpha']

#!pip list