#!/usr/bin/env python
# coding: utf-8

# In[1]:


import pandas as pd, numpy as np
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')


# In[4]:


p='E:/Onedrive - Lancaster University/Datarepo/szekelydata/klima/'


# In[5]:


stations=pd.read_csv(p+'stations.csv')
stations=stations.set_index('ID')
stations.to_csv('data/stations.csv')


# Simplified weather categories:
# 0: NoCloud
# 1: Cloud
# 4: Fog
# 6: Rain
# 7: Snow
# 11: Hail
# (0 is later remapped to 10, labelled YCLR in `ycolumns`, before the indicator columns are built.)

# In[6]:


weather={
    2:1, 3:7, 5:6, 8:6, 9:6,
    100:0, 101:0, 102:0, 103:0, 104:0, 105:0, 106:0, 107:0, 108:0, 109:0,
    110:4, 111:4, 112:4, 113:1, 114:6, 115:6, 116:6, 117:6, 118:6, 119:1,
    120:7, 121:6, 122:7, 123:6, 124:11, 125:6, 126:7, 127:11, 128:4, 129:6,
    130:1, 131:1, 132:1, 133:1, 134:1, 135:1, 136:7, 137:7, 138:7, 139:7,
    140:4, 141:4, 142:4, 143:4, 144:4, 145:4, 146:4, 147:4, 148:4, 149:4,
    150:6, 151:6, 152:6, 153:6, 154:6, 155:6, 156:6, 157:6, 158:6, 159:6,
    160:6, 161:6, 162:6, 163:6, 164:6, 165:6, 166:11, 167:11, 168:6, 169:6,
    170:7, 171:7, 172:7, 173:7, 174:7, 175:7, 176:7, 177:7, 178:7, 179:7,
    180:6, 181:6, 182:6, 183:7, 184:7, 185:7, 186:7, 187:7, 188:7, 189:11,
    190:11, 191:6, 192:6, 193:7, 194:7, 195:6, 196:11, 197:6, 198:1, 199:11}


# In[7]:


measures=['XTEMP','XSPD','XPCP','XSD','XVSB','YFOG','YPCP','YSNW','YHAL']
ycolumns={1:'YCLD',4:'YFOG',6:'YPCP',7:'YSNW',10:'YCLR',11:'YHAL'}


# In[8]:


def load_data(stn,d='high_res',p=p,stations=stations,verbose=True):
    if verbose: print('loading...',stn,stations.loc[int(stn)]['LOC'],d)
    df=pd.read_csv(p+'/'+d+'/export/'+stn+'.csv',dtype={' FRSHTT':str})
    df.columns=[i.strip() for i in df.columns]
    df['time']=pd.to_datetime(df['time'])
    df['XTEMP']=(pd.to_numeric(df['TEMP'], errors='coerce').replace(9999.9,np.nan)-32)*5/9 #Fahrenheit to Celsius, 9999.9 = missing
    if d=='high_res':
        df['XSPD']=pd.to_numeric(df['SPD'], errors='coerce')*1.61 #mph to km/h
        df['XVSB']=(pd.to_numeric(df['VSB'], errors='coerce')*1.61).apply(lambda x: min(x,10)) #miles to km, visibility capped at 10 km
        df['XPCP']=pd.to_numeric(df['PCP06'], errors='coerce')*25.4 #inch to mm
        df['XSD']=pd.to_numeric(df['SD'], errors='coerce')*25.4 #inch to mm
        df['PCP01']=pd.to_numeric(df['PCP01'], errors='coerce')
        df['PCP06']=pd.to_numeric(df['PCP06'], errors='coerce')
        df['PCP24']=pd.to_numeric(df['PCP24'], errors='coerce')
        df['PCPXX']=pd.to_numeric(df['PCPXX'], errors='coerce')
        #any reported precipitation amount becomes simplified code 6 (rain)
        df.loc[~df['PCP06'].isnull(),'PCP06'] = 6
        df.loc[~df['PCPXX'].isnull(),'PCPXX'] = 6
        df.loc[~df['PCP01'].isnull(),'PCP01'] = 6
        df.loc[~df['PCP24'].isnull(),'PCP24'] = 6
        #shift automated (AW) and manual (MW) present-weather codes by +100 so they hit the 100-199 keys of `weather`
        df['AW']=pd.to_numeric(df['AW'], errors='coerce')+100
        df['MW']=pd.to_numeric(df['MW'], errors='coerce')+100
        df['W']=pd.to_numeric(df['W'], errors='coerce')
        #fall back from W to MW, AW, then the precipitation indicators, and map to the simplified categories
        dz=df[['PCP01','PCP06','PCP24','PCPXX','AW','MW','W']]
        df['W']=dz.ffill(axis=1)['W'].replace(weather).replace(0,10)
        #count observations per timestamp and category, pivot into the Y... indicator columns and join back on time
        dz=df.groupby(['time','W']).count()['TEMP'].unstack().fillna(0)
        dz.columns=[ycolumns[i] for i in dz.columns]
        df=df.set_index('time').join(dz).reset_index()
    else:
        df['year']=df['time'].dt.year
        df['month']=df['time'].dt.month
        df['day']=df['time'].dt.day
        df['hour']=df['time'].dt.hour
        df['XSPD']=pd.to_numeric(df['WDSP'], errors='coerce').replace(999.9,np.nan)*1.85 #knots to km/h, 999.9 = missing
        df['XVSB']=(pd.to_numeric(df['VISIB'], errors='coerce').replace(999.9,np.nan)*1.61).apply(lambda x: min(x,10)) #miles to km, visibility capped at 10 km
        df['XPCP']=pd.to_numeric(df['PRCP'].str[:-1], errors='coerce').replace(99.99,np.nan)*25.4 #drop trailing report flag, inch to mm, 99.99 = missing
        df['XSD']=pd.to_numeric(df['SNDP'], errors='coerce').replace(999.9,np.nan)*25.4 #inch to mm, 999.9 = missing
        df['YFOG']=pd.to_numeric(df['FRSHTT'].str[0], errors='coerce')
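        #assumption: FRSHTT follows the NOAA GSOD indicator convention, one digit each for
        #Fog, Rain/drizzle, Snow/ice pellets, Hail, Thunder and Tornado; the first four
        #digits are decoded here into the YFOG, YPCP, YSNW and YHAL event flags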
        df['YPCP']=pd.to_numeric(df['FRSHTT'].str[1], errors='coerce')
        df['YSNW']=pd.to_numeric(df['FRSHTT'].str[2], errors='coerce')
        df['YHAL']=pd.to_numeric(df['FRSHTT'].str[3], errors='coerce')
    for m in measures:
        if m not in df.columns:
            df[m]=np.nan
    return df[['time','year','month','day','hour']+measures].set_index('time')


# In[9]:


# stn='154200' #aurel vlaicu
stn='151700' #mciuc
daily=load_data(stn,'daily')
hires=load_data(stn,'high_res')


# In[10]:


#keep only days with at least 4 distinct reading hours (usually 6, 12, 18 and 24), otherwise the daily means are unreliable
filt=hires.groupby(['year','month','day'])[['hour']].nunique()
filt2=filt[filt>=4].dropna().reset_index()
filt3=pd.to_datetime(dict(year=filt2['year'], month=filt2['month'], day=filt2['day'])).values


# In[11]:


def comparison_getter(measure,daily=daily,hires=hires,filt3=filt3):
    if type(measure)!=list: measure=[measure]
    d=daily[measure]
    h=hires.groupby(['year','month','day']).mean()[measure]
    #collapse the averaged event flags to 0/1: any event during the day counts as 1
    ymeasures=[m for m in measure if 'Y' in m]
    h[ymeasures]=h[ymeasures][h[ymeasures]==0].fillna(1)
    h=h.reset_index()
    h['time']=pd.to_datetime(dict(year=h['year'], month=h['month'], day=h['day'])).values
    h=h.set_index('time')[measure]
    h=h.loc[filt3] #keep only the days that pass the completeness filter
    return d,h

def comparison_plotter(measure,daily=daily,hires=hires,filt3=filt3):
    d,h=comparison_getter(measure,daily,hires,filt3)
    d.columns=['d']
    h.columns=['h']
    x=h.join(d,how='outer').dropna()
    x['diff']=(x['h']-x['d'])
    fig,ax=plt.subplots(1,3,figsize=(15,4))
    x['diff'].plot(ax=ax[0],title='diff')
    x['h'].plot(ax=ax[1],title='high res')
    x['d'].plot(ax=ax[2],title='daily')
    return x,d,h


# In[12]:


comparison_plotter('XTEMP');


# In[13]:


comparison_plotter('XSPD');


# In[14]:


x,d,h=comparison_plotter('XPCP')


# In[15]:


x,d,h=comparison_plotter('XSD');


# In[16]:


x,d,h=comparison_plotter('XVSB');


# In[17]:


x,d,h=comparison_plotter('YSNW');


# In[18]:


x,d,h=comparison_plotter('YPCP');


# In[19]:


x,d,h=comparison_plotter('YHAL');


# In[20]:


hu=['127720', '128050', '128120', '128220', '128250', '128300', '128390', '128430',
    '128510', '128600', '128820', '128920', '129100', '129150', '129420', '129600',
    '129700', '129820', '129920', '129350']
ro=['150040', '150100', '150140', '150150', '150200', '150230', '150800', '150850',
    '150900', '151080', '151200', '151450', '151500', '151700', '151970', '152000',
    '152300', '152350', '152470', '152600', '152800', '152920', '153100', '153350',
    '153460', '153500', '153600', '154100', '154200', '154210', '154500', '154600',
    '154700', '154800', '154810', '154990']


# In[21]:


hs=[]
ds=[]
for stn in ro:
    try:
        d_ok=True
        daily=load_data(stn,'daily')
    except:
        d_ok=False
        print('failed',stn,'daily')
    try:
        h_ok=True
        hires=load_data(stn,'high_res')
    except:
        h_ok=False
        print('failed',stn,'high_res')
    #keep only days with at least 4 distinct reading hours (usually 6, 12, 18 and 24), otherwise the daily means are unreliable
    filt=hires.groupby(['year','month','day'])[['hour']].nunique()
    filt2=filt[filt>=4].dropna().reset_index()
    filt3=pd.to_datetime(dict(year=filt2['year'], month=filt2['month'], day=filt2['day'])).values
    d,h=comparison_getter(measures,daily,hires,filt3)
    d['ID']=stn
    h['ID']=stn
    if d_ok: ds.append(d)
    if h_ok: hs.append(h)
ds=pd.concat(ds)
hs=pd.concat(hs)


# In[22]:


ds.to_csv('data/ro_ds.csv')
hs.to_csv('data/ro_hs.csv')


# In[23]:


hs=[]
ds=[]
for stn in hu:
    try:
        d_ok=True
        daily=load_data(stn,'daily')
    except:
        d_ok=False
        print('failed',stn,'daily')
    try:
        h_ok=True
        hires=load_data(stn,'high_res')
    except:
        h_ok=False
        print('failed',stn,'high_res')
    #keep only days with at least 4 distinct reading hours (usually 6, 12, 18 and 24), otherwise the daily means are unreliable
    filt=hires.groupby(['year','month','day'])[['hour']].nunique()
    filt2=filt[filt>=4].dropna().reset_index()
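    #filt3 (built below) holds the timestamps of the days that pass this completeness
    #filter; comparison_getter uses it to restrict the aggregated high-res series to
    #those days before comparing against the daily series for this station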
    filt3=pd.to_datetime(dict(year=filt2['year'], month=filt2['month'], day=filt2['day'])).values
    d,h=comparison_getter(measures,daily,hires,filt3)
    d['ID']=stn
    h['ID']=stn
    if d_ok: ds.append(d)
    if h_ok: hs.append(h)
ds=pd.concat(ds)
hs=pd.concat(hs)


# In[24]:


ds.to_csv('data/hu_ds.csv')
hs.to_csv('data/hu_hs.csv')
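

# In[ ]:


#A minimal cross-check sketch, not part of the original pipeline: assuming the four
#CSVs written above exist and keep the 'time' index as a column, it merges each
#country's daily and aggregated high-res exports on time and station ID and prints
#the mean absolute temperature difference (XTEMP) per station.
for country in ['ro','hu']:
    d=pd.read_csv('data/'+country+'_ds.csv',parse_dates=['time'])
    h=pd.read_csv('data/'+country+'_hs.csv',parse_dates=['time'])
    m=d.merge(h,on=['time','ID'],suffixes=('_d','_h'))
    gap=(m['XTEMP_h']-m['XTEMP_d']).abs().groupby(m['ID']).mean()
    print(country,gap.round(2).to_dict())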