#!/usr/bin/env python
# coding: utf-8

# # Solution sketch, lab 13

# In[5]:


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as spt

import statsmodels.api as sm
import statsmodels.formula.api as smf
import seaborn as sns


# In[6]:


from cycler import cycler

plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams["axes.labelsize"]= 12
plt.rcParams["figure.facecolor"] = "#f2f2f2"
#plt.rcParams['figure.savefig.dpi'] = 100
plt.rcParams['savefig.edgecolor'] = "#f2f2f2"
plt.rcParams['savefig.facecolor'] ="#f2f2f2"
plt.rcParams["figure.figsize"] = [16,10]
plt.rcParams['savefig.bbox'] = "tight"
plt.rcParams['font.size'] = 14
greens = ['#66c2a4','#41ae76','#238b45','#006d2c','#00441b']
multi =['#66c2a4','#1f78b4','#a6cee3','#b2df8a','#33a02c','#fb9a99','#e31a1c','#fdbf6f']
plt.rcParams["axes.prop_cycle"] = cycler(color=multi)


# In[16]:


wt_data = pd.read_csv("http://jmaurit.github.io/analytics/labs/data/wt_data2.csv")


# In the *wt_data* data set we have information on net-exchange data for different countries and price areas in the Nordic exchange. That is, this indicates how much net import or export a country/area experiences over the course of an hour. Consider the net exchange series for DK1 and DK2: *DK1_nx* and *DK2_nx*. 
# 
#   a. Are these series stationary? Use and interpret with an appropriate test. Does the stationarity or non-stationarity of these series have some economic interpretation? If so, what does it say about these series? 
#   
#   b. Estimate what the effect of wind power is on net exchange in these two areas using OLS. Are there any lagged effects? Should you include any controlling variables? Such as prices, the other areas wind power, etc? Explain why or why not? Interpret the results. 
#   
#   c. Check the residuals for serial correlation (look at ACF and pACF figures). What effect does this have on our results or interpretation of results? If there are serial correlation, we should probably model the dynamics directly in our model, for example by including autoregressive (AR) terms in our regression. Will that effect how we interpret the exogenous coefficients (wind power)? [Chapter 9](https://otexts.com/fpp3/dynamic.html) parts 1-2 could be a useful reference in answering this question.  
#   
#   d. Throughout this lab, we have assumed that wind power is exogenous. That is, when we include it as an independent variable, we interpret the coefficients as causal. Is this justified? Why? What if we included price as a variable in the regresion in c. Should we interpret price as exogenous? 
#  

# In[18]:


#a
wt_data["time"] = pd.to_datetime(wt_data.time, format="%Y-%m-%d %H:%M:%S")
wt_data["date"] = pd.to_datetime(wt_data.time, format="%Y-%m-%d")


# In[22]:


NX_df = wt_data[["time", "DK1_nx", "DK2_nx"]].copy()


# In[26]:


from statsmodels.tsa.stattools import adfuller


# In[ ]:


NX_df = NX_df.loc[NX_df.DK1_nx.notna(),:]


# In[28]:


dftest = adfuller(NX_df["DK1_nx"], autolag="AIC") 
dftest


# Here we have a large-in-magnitude statistic, which gives us quite a lot of confidence in rejecting the null-hypothesis of non-stationarity. 

# In[29]:


# We could start with a simple model of net exchange on wind and some lags: 

# In[36]:


wt_data.sort_values('time', inplace=True)
wt_data.set_index('time', inplace=True)

wt_data["wind_DK1_l"] = wt_data.wind_DK1.shift(freq="1H") #shift 1 hour
wt_data["wind_DK1_l2"] = wt_data.wind_DK1.shift(freq="2H") #shift 1 hour
wt_data["wind_DK1_l3"] = wt_data.wind_DK1.shift(freq="3H") #shift 1 hour


# In[38]:


nxmod1 = smf.ols(formula="DK1_nx ~ wind_DK1 + wind_DK1_l + wind_DK1_l2 + wind_DK1_l3", data=wt_data).fit()
nxmod1.summary()


# The coefficient on the concurrent term is strongly negative, but the lagged terms are all close to zero and not significant. We could probably exclude these

# In[41]:


nxmod1_resids = nxmod1.resid


# In[30]:


#let us first look at the autocorrelations left
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf


# In[42]:


fig, ax = plt.subplots(1,2)
plot_acf(nxmod1_resids, lags=50, ax=ax[0])
plot_pacf(nxmod1_resids, lags=50, ax=ax[1])
plt.show()


# From the autocorrelation and partial autocorellation charts, we see quite a bit of autocorrelation in the residuals, which we probably want to model at least partially in our model. let us add some autoregressive terms

# In[43]:


wt_data["DK1_nx_l"] = wt_data.DK1_nx.shift(freq="1H") #shift 1 hour
wt_data["DK1_nx_l2"] = wt_data.DK1_nx.shift(freq="2H") #shift 2 hour
wt_data["DK1_nx_l3"] = wt_data.DK1_nx.shift(freq="3H") #shift 3 hour
wt_data["DK1_nx_l24"] = wt_data.DK1_nx.shift(freq="24H") #shift 24 hour


# In[46]:


nxmod1 = smf.ols(formula="DK1_nx ~ DK1_nx_l + DK1_nx_l2 + DK1_nx_l3 + DK1_nx_l24 + wind_DK1", data=wt_data).fit()
nxmod1.summary()


# adding the autocorrelation now reduces the magnitude of the our wind variable by a lot, though it is still significant. We could generally interpret this to say that 1 extra mwh of wind power increases net export (which is a negative here) by .09 MW.
# 
# In the next lab we will refine this type of modelling with ARMA modelling

# In[ ]: