#!/usr/bin/env python
# coding: utf-8

# In[1]:


import pandas as pd, numpy as np
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')


# In[4]:


p='E:/Onedrive - Lancaster University/Datarepo/szekelydata/klima/'


# In[5]:


stations=pd.read_csv(p+'stations.csv')
stations=stations.set_index('ID')
stations.to_csv('data/stations.csv')


# Simplified weather categories:
# 0: NoCloud
# 1: Cloud
# 4: Fog
# 6: Rain
# 7: Snow
# 11: Hail
# (0 is later remapped to 10, labelled YCLR in `ycolumns`, before the indicator columns are built.)

# In[6]:


weather={
    2:1, 3:7, 5:6, 8:6, 9:6,
    100:0, 101:0, 102:0, 103:0, 104:0, 105:0, 106:0, 107:0, 108:0, 109:0,
    110:4, 111:4, 112:4, 113:1, 114:6, 115:6, 116:6, 117:6, 118:6, 119:1,
    120:7, 121:6, 122:7, 123:6, 124:11, 125:6, 126:7, 127:11, 128:4, 129:6,
    130:1, 131:1, 132:1, 133:1, 134:1, 135:1, 136:7, 137:7, 138:7, 139:7,
    140:4, 141:4, 142:4, 143:4, 144:4, 145:4, 146:4, 147:4, 148:4, 149:4,
    150:6, 151:6, 152:6, 153:6, 154:6, 155:6, 156:6, 157:6, 158:6, 159:6,
    160:6, 161:6, 162:6, 163:6, 164:6, 165:6, 166:11, 167:11, 168:6, 169:6,
    170:7, 171:7, 172:7, 173:7, 174:7, 175:7, 176:7, 177:7, 178:7, 179:7,
    180:6, 181:6, 182:6, 183:7, 184:7, 185:7, 186:7, 187:7, 188:7, 189:11,
    190:11, 191:6, 192:6, 193:7, 194:7, 195:6, 196:11, 197:6, 198:1, 199:11}


# In[7]:


measures=['XTEMP','XSPD','XPCP','XSD','XVSB','YFOG','YPCP','YSNW','YHAL']
ycolumns={1:'YCLD',4:'YFOG',6:'YPCP',7:'YSNW',10:'YCLR',11:'YHAL'}


# In[8]:


def load_data(stn,d='high_res',p=p,stations=stations,verbose=True):
    if verbose: print('loading...',stn,stations.loc[int(stn)]['LOC'],d)
    df=pd.read_csv(p+'/'+d+'/export/'+stn+'.csv',dtype={' FRSHTT':str})
    df.columns=[i.strip() for i in df.columns]
    df['time']=pd.to_datetime(df['time'])
    df['XTEMP']=(pd.to_numeric(df['TEMP'], errors='coerce').replace(9999.9,np.nan)-32)*5/9 #Fahrenheit to Celsius, 9999.9 = missing
    if d=='high_res':
        df['XSPD']=pd.to_numeric(df['SPD'], errors='coerce')*1.61 #mph to km/h
        df['XVSB']=(pd.to_numeric(df['VSB'], errors='coerce')*1.61).apply(lambda x: min(x,10)) #miles to km, visibility capped at 10 km
        df['XPCP']=pd.to_numeric(df['PCP06'], errors='coerce')*25.4 #inch to mm
        df['XSD']=pd.to_numeric(df['SD'], errors='coerce')*25.4 #inch to mm
        df['PCP01']=pd.to_numeric(df['PCP01'], errors='coerce')
        df['PCP06']=pd.to_numeric(df['PCP06'], errors='coerce')
        df['PCP24']=pd.to_numeric(df['PCP24'], errors='coerce')
        df['PCPXX']=pd.to_numeric(df['PCPXX'], errors='coerce')
        #any reported precipitation amount becomes simplified code 6 (rain)
        df.loc[~df['PCP06'].isnull(),'PCP06'] = 6
        df.loc[~df['PCPXX'].isnull(),'PCPXX'] = 6
        df.loc[~df['PCP01'].isnull(),'PCP01'] = 6
        df.loc[~df['PCP24'].isnull(),'PCP24'] = 6
        #shift automated (AW) and manual (MW) present-weather codes by +100 so they hit the 100-199 keys of `weather`
        df['AW']=pd.to_numeric(df['AW'], errors='coerce')+100
        df['MW']=pd.to_numeric(df['MW'], errors='coerce')+100
        df['W']=pd.to_numeric(df['W'], errors='coerce')
        #fall back from W to MW, AW, then the precipitation indicators, and map to the simplified categories
        dz=df[['PCP01','PCP06','PCP24','PCPXX','AW','MW','W']]
        df['W']=dz.ffill(axis=1)['W'].replace(weather).replace(0,10)
        #count observations per timestamp and category, pivot into the Y... indicator columns and join back on time
        dz=df.groupby(['time','W']).count()['TEMP'].unstack().fillna(0)
        dz.columns=[ycolumns[i] for i in dz.columns]
        df=df.set_index('time').join(dz).reset_index()
    else:
        df['year']=df['time'].dt.year
        df['month']=df['time'].dt.month
        df['day']=df['time'].dt.day
        df['hour']=df['time'].dt.hour
        df['XSPD']=pd.to_numeric(df['WDSP'], errors='coerce').replace(999.9,np.nan)*1.85 #knots to km/h, 999.9 = missing
        df['XVSB']=(pd.to_numeric(df['VISIB'], errors='coerce').replace(999.9,np.nan)*1.61).apply(lambda x: min(x,10)) #miles to km, visibility capped at 10 km
        df['XPCP']=pd.to_numeric(df['PRCP'].str[:-1], errors='coerce').replace(99.99,np.nan)*25.4 #drop trailing report flag, inch to mm, 99.99 = missing
        df['XSD']=pd.to_numeric(df['SNDP'], errors='coerce').replace(999.9,np.nan)*25.4 #inch to mm, 999.9 = missing
        df['YFOG']=pd.to_numeric(df['FRSHTT'].str[0], errors='coerce')
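        #assumption: FRSHTT follows the NOAA GSOD indicator convention, one digit each for
        #Fog, Rain/drizzle, Snow/ice pellets, Hail, Thunder and Tornado; the first four
        #digits are decoded here into the YFOG, YPCP, YSNW and YHAL event flags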
        df['YPCP']=pd.to_numeric(df['FRSHTT'].str[1], errors='coerce')
        df['YSNW']=pd.to_numeric(df['FRSHTT'].str[2], errors='coerce')
        df['YHAL']=pd.to_numeric(df['FRSHTT'].str[3], errors='coerce')
    for m in measures:
        if m not in df.columns:
            df[m]=np.nan
    return df[['time','year','month','day','hour']+measures].set_index('time')


# In[9]:


# stn='154200' #aurel vlaicu
stn='151700' #mciuc
daily=load_data(stn,'daily')
hires=load_data(stn,'high_res')


# In[10]:


#keep only days with at least 4 distinct reading hours (usually 6, 12, 18 and 24), otherwise the daily means are unreliable
filt=hires.groupby(['year','month','day'])[['hour']].nunique()
filt2=filt[filt>=4].dropna().reset_index()
filt3=pd.to_datetime(dict(year=filt2['year'], month=filt2['month'], day=filt2['day'])).values


# In[11]:


def comparison_getter(measure,daily=daily,hires=hires,filt3=filt3):
    if type(measure)!=list: measure=[measure]
    d=daily[measure]
    h=hires.groupby(['year','month','day']).mean()[measure]
    #collapse the averaged event flags to 0/1: any event during the day counts as 1
    ymeasures=[m for m in measure if 'Y' in m]
    h[ymeasures]=h[ymeasures][h[ymeasures]==0].fillna(1)
    h=h.reset_index()
    h['time']=pd.to_datetime(dict(year=h['year'], month=h['month'], day=h['day'])).values
    h=h.set_index('time')[measure]
    h=h.loc[filt3] #keep only the days that pass the completeness filter
    return d,h

def comparison_plotter(measure,daily=daily,hires=hires,filt3=filt3):
    d,h=comparison_getter(measure,daily,hires,filt3)
    d.columns=['d']
    h.columns=['h']
    x=h.join(d,how='outer').dropna()
    x['diff']=(x['h']-x['d'])
    fig,ax=plt.subplots(1,3,figsize=(15,4))
    x['diff'].plot(ax=ax[0],title='diff')
    x['h'].plot(ax=ax[1],title='high res')
    x['d'].plot(ax=ax[2],title='daily')
    return x,d,h


# In[12]:


comparison_plotter('XTEMP');


# In[13]:


comparison_plotter('XSPD');


# In[14]:


x,d,h=comparison_plotter('XPCP')


# In[15]:


x,d,h=comparison_plotter('XSD');


# In[16]:


x,d,h=comparison_plotter('XVSB');


# In[17]:


x,d,h=comparison_plotter('YSNW');


# In[18]:


x,d,h=comparison_plotter('YPCP');


# In[19]:


x,d,h=comparison_plotter('YHAL');


# In[20]:


hu=['127720', '128050', '128120', '128220', '128250', '128300', '128390', '128430',
    '128510', '128600', '128820', '128920', '129100', '129150', '129420', '129600',
    '129700', '129820', '129920', '129350']
ro=['150040', '150100', '150140', '150150', '150200', '150230', '150800', '150850',
    '150900', '151080', '151200', '151450', '151500', '151700', '151970', '152000',
    '152300', '152350', '152470', '152600', '152800', '152920', '153100', '153350',
    '153460', '153500', '153600', '154100', '154200', '154210', '154500', '154600',
    '154700', '154800', '154810', '154990']


# In[21]:


hs=[]
ds=[]
for stn in ro:
    try:
        d_ok=True
        daily=load_data(stn,'daily')
    except:
        d_ok=False
        print('failed',stn,'daily')
    try:
        h_ok=True
        hires=load_data(stn,'high_res')
    except:
        h_ok=False
        print('failed',stn,'high_res')
    #keep only days with at least 4 distinct reading hours (usually 6, 12, 18 and 24), otherwise the daily means are unreliable
    filt=hires.groupby(['year','month','day'])[['hour']].nunique()
    filt2=filt[filt>=4].dropna().reset_index()
    filt3=pd.to_datetime(dict(year=filt2['year'], month=filt2['month'], day=filt2['day'])).values
    d,h=comparison_getter(measures,daily,hires,filt3)
    d['ID']=stn
    h['ID']=stn
    if d_ok: ds.append(d)
    if h_ok: hs.append(h)
ds=pd.concat(ds)
hs=pd.concat(hs)


# In[22]:


ds.to_csv('data/ro_ds.csv')
hs.to_csv('data/ro_hs.csv')


# In[23]:


hs=[]
ds=[]
for stn in hu:
    try:
        d_ok=True
        daily=load_data(stn,'daily')
    except:
        d_ok=False
        print('failed',stn,'daily')
    try:
        h_ok=True
        hires=load_data(stn,'high_res')
    except:
        h_ok=False
        print('failed',stn,'high_res')
    #keep only days with at least 4 distinct reading hours (usually 6, 12, 18 and 24), otherwise the daily means are unreliable
    filt=hires.groupby(['year','month','day'])[['hour']].nunique()
    filt2=filt[filt>=4].dropna().reset_index()
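    #filt3 (built below) holds the timestamps of the days that pass this completeness
    #filter; comparison_getter uses it to restrict the aggregated high-res series to
    #those days before comparing against the daily series for this station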
    filt3=pd.to_datetime(dict(year=filt2['year'], month=filt2['month'], day=filt2['day'])).values
    d,h=comparison_getter(measures,daily,hires,filt3)
    d['ID']=stn
    h['ID']=stn
    if d_ok: ds.append(d)
    if h_ok: hs.append(h)
ds=pd.concat(ds)
hs=pd.concat(hs)


# In[24]:


ds.to_csv('data/hu_ds.csv')
hs.to_csv('data/hu_hs.csv')
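

# In[ ]:


#A minimal cross-check sketch, not part of the original pipeline: assuming the four
#CSVs written above exist and keep the 'time' index as a column, it merges each
#country's daily and aggregated high-res exports on time and station ID and prints
#the mean absolute temperature difference (XTEMP) per station.
for country in ['ro','hu']:
    d=pd.read_csv('data/'+country+'_ds.csv',parse_dates=['time'])
    h=pd.read_csv('data/'+country+'_hs.csv',parse_dates=['time'])
    m=d.merge(h,on=['time','ID'],suffixes=('_d','_h'))
    gap=(m['XTEMP_h']-m['XTEMP_d']).abs().groupby(m['ID']).mean()
    print(country,gap.round(2).to_dict())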