import pandas as pd, numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# NOAA bundle ids for the Romanian and Hungarian raw downloads
ro = ['5765358043206', '9812808043220', '9576658043223',
      '1699958043227', '7225068043228', '265208043229']
hu = ['8073718043234', '2087988043232', '6247548043235']
# local root of the raw high-resolution climate data
p = 'C:/Users/csala/Onedrive - Lancaster University/Datarepo/szekelydata/klima/high_res/'
# Read the '+'-delimited station-metadata file of each Romanian bundle.
stations = []
for sid in ro:
    stations.append(pd.read_csv(p + 'raw/ro/' + sid + 'stn+.txt',
                                delimiter='+', skiprows=2, header=None))
    print(sid)
5765358043206 9812808043220 9576658043223 1699958043227 7225068043228 265208043229
# Merge the per-bundle metadata frames, drop repeats, and tidy the
# free-text columns (2 = station name, 3 = country — used later for
# name lookups and the HUNGARY/ROMANIA splits).
station = pd.concat(stations).drop_duplicates()
for col in (2, 3):
    station[col] = station[col].str.strip()
!!! At least 16 GB of RAM required; 64 GB recommended
# Load the hourly observation files for every Romanian bundle.
dfs = []
for i in ro:
    # low_memory=False parses each file in a single pass so mixed-type
    # columns get one consistent dtype — this silences the DtypeWarnings
    # the original run produced ("Specify dtype option on import or set
    # low_memory=False"). r'\s+' avoids the invalid-escape warning that
    # a bare '\s' triggers in newer Python versions.
    df = pd.read_csv(p + 'raw/ro/' + i + 'dat.txt',
                     delimiter=r'\s+', low_memory=False)
    dfs.append(df)
    print(i)
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:3058: DtypeWarning: Columns (4,21,22,23,25) have mixed types. Specify dtype option on import or set low_memory=False. interactivity=interactivity, compiler=compiler, result=result)
5765358043206
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:3058: DtypeWarning: Columns (21) have mixed types. Specify dtype option on import or set low_memory=False. interactivity=interactivity, compiler=compiler, result=result)
9812808043220 9576658043223 1699958043227
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:3058: DtypeWarning: Columns (4,6,11,12,21,22,23,24,25) have mixed types. Specify dtype option on import or set low_memory=False. interactivity=interactivity, compiler=compiler, result=result)
7225068043228
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:3058: DtypeWarning: Columns (4,21,22,23,24,25) have mixed types. Specify dtype option on import or set low_memory=False. interactivity=interactivity, compiler=compiler, result=result)
265208043229
# Stack all per-station hourly frames into one long table; rows are
# kept verbatim (deduplication is deliberately avoided — see the
# warning comment in the next cell).
dfz=pd.concat(dfs)
dfs=None #free memory: drop the list of per-station frames
df=None #free memory: drop the last loop frame
dfz.head()
USAF | WBAN | YR--MODAHRMN | DIR | SPD | GUS | CLG | SKC | L | M | ... | SLP | ALT | STP | MAX | MIN | PCP01 | PCP06 | PCP24 | PCPXX | SD | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 119000 | 99999 | 201205010000 | 040 | 2 | *** | *** | *** | * | * | ... | 1017.6 | ***** | 984.8 | *** | *** | ***** | 0.00 | ***** | ***** | ** |
1 | 119000 | 99999 | 201205010100 | 040 | 2 | *** | *** | *** | * | * | ... | 1017.8 | ***** | 984.9 | *** | *** | ***** | ***** | ***** | ***** | ** |
2 | 119000 | 99999 | 201205010200 | *** | 0 | *** | *** | *** | * | * | ... | 1018.1 | ***** | 985 | *** | *** | ***** | ***** | ***** | ***** | ** |
3 | 119000 | 99999 | 201205010300 | 030 | 2 | *** | *** | *** | * | * | ... | 1018.3 | ***** | 985.2 | *** | *** | ***** | ***** | ***** | ***** | ** |
4 | 119000 | 99999 | 201205010400 | *** | 0 | *** | *** | *** | * | * | ... | 1018.8 | ***** | 985.6 | *** | *** | ***** | ***** | ***** | ***** | ** |
5 rows × 33 columns
# !!!! Deliberately NOT deduplicating here — keep every raw row.
# dfz = dfz.drop_duplicates()
# Parse the combined timestamp column and break it into calendar parts.
dfz['time'] = pd.to_datetime(dfz['YR--MODAHRMN'], format='%Y%m%d%H%M')
for unit in ('year', 'month', 'day', 'hour'):
    dfz[unit] = getattr(dfz['time'].dt, unit)
#keep only months with at least 6 days to avoid anomalies (20%)
# distinct observation days per (station, year, month)
filt=dfz.groupby(['USAF','year','month'])[['day']].nunique()
# filt>5 NaNs-out the failing rows; dropna() then removes them
filt2=filt[filt>5].dropna()
#keep only years with at least 3 months to avoid anomalies (20%)
# distinct qualifying months per (station, year)
filt3=filt2.reset_index().groupby(['USAF','year'])[['month']].nunique()
filt4=filt3[filt3>3].dropna()
filt4.head()
filt4.index.unique(0)
# Export one CSV per station that survived the coverage filters.
# NOTE(review): df_avg2 is not defined anywhere in this file — it is
# presumably built in a cell not shown (per-station averages indexed by
# (USAF, year)); confirm before rerunning.
for i in filt4.index.unique(0):
    # rows of df_avg2 for station i, restricted to its qualifying years
    d=df_avg2.loc[i].loc[filt4.loc[i].index.unique()]
    # NOTE(review): dfz is keyed by station id from this point on, i.e.
    # it is being used as a per-station container rather than the big
    # concatenated frame — verify dfz was re-initialised in a hidden cell.
    dfz[i]=d
    d.to_csv('data/'+str(i)+'.csv')
    print(i)
119000 127560 127660 127720 127860 128050 128120 128150 128220 128250 128300 128305 128310 128360 128380 128390 128400 128430 128460 128470 128510 128550 128600 128603 128605 128660 128700 128820 128920 129100 129150 129200 129220 129250 129255 129300 129320 129350 129400 129410 129420 129500 129600 129700 129820 129920 150000 150001 150002 150010 150040 150070 150090 150100 150105 150140 150150 150200 150230 150235 150250 150320 150330 150400 150410 150420 150440 150470 150520 150550 150560 150630 150690 150730 150750 150800 150830 150850 150880 150890 150900 150940 150950 150990 151070 151080 151090 151110 151130 151170 151180 151190 151200 151205 151230 151240 151270 151320 151340 151360 151380 151400 151430 151450 151455 151480 151500 151540 151580 151590 151600 151620 151630 151650 151680 151700 151740 151790 151820 151840 151890 151940 151970 151990 152000 152005 152040 152060 152080 152090 152120 152150 152170 152190 152210 152300 152310 152350 152380 152410 152450 152470 152540 152590 152600 152610 152620 152640 152650 152670 152700 152730 152770 152790 152800 152820 152840 152850 152870 152890 152920 152960 152970 152980 152990 153000 153010 153020 153070 153100 153140 153150 153160 153170 153190 153200 153210 153240 153250 153280 153330 153350 153355 153360 153370 153380 153400 153410 153440 153450 153460 153470 153490 153500 153550 153560 153600 153630 153640 153660 153690 153730 153750 153770 153870 153880 153890 153950 154020 154050 154060 154080 154090 154100 154120 154160 154190 154200 154210 154215 154220 154230 154240 154250 154280 154290 154340 154430 154440 154450 154470 154500 154510 154550 154600 154620 154650 154690 154700 154750 154760 154770 154790 154800 154810 154820 154890 154900 154910 154930 154931 154940 154980 154990
Load
import os
# Reload the per-station CSVs, keyed by USAF id (file name minus '.csv').
arr = os.listdir('data/')
for fname in arr:
    if fname in ('all.csv',):
        continue
    dfz[fname[:-4]] = pd.read_csv('data/' + fname)
    print(fname)
119000.csv 127560.csv 127660.csv 127720.csv 127860.csv 128050.csv 128120.csv 128150.csv 128220.csv 128250.csv 128300.csv 128305.csv 128310.csv 128360.csv 128380.csv 128390.csv 128400.csv 128430.csv 128460.csv 128470.csv 128510.csv 128550.csv 128600.csv 128603.csv 128605.csv 128660.csv 128700.csv 128820.csv 128920.csv 129100.csv 129150.csv 129200.csv 129220.csv 129250.csv 129255.csv 129300.csv 129320.csv 129350.csv 129400.csv 129410.csv 129420.csv 129500.csv 129600.csv 129700.csv 129820.csv 129920.csv 150000.csv 150001.csv 150002.csv 150010.csv 150040.csv 150070.csv 150090.csv 150100.csv 150105.csv 150140.csv 150150.csv 150200.csv 150230.csv 150235.csv 150250.csv 150320.csv 150330.csv 150400.csv 150410.csv 150420.csv 150440.csv 150470.csv 150520.csv 150550.csv 150560.csv 150630.csv 150690.csv 150730.csv 150750.csv 150800.csv 150830.csv 150850.csv 150880.csv 150890.csv 150900.csv 150940.csv 150950.csv 150990.csv 151070.csv 151080.csv 151090.csv 151110.csv 151130.csv 151170.csv 151180.csv 151190.csv 151200.csv 151205.csv 151230.csv 151240.csv 151270.csv 151320.csv 151340.csv 151360.csv 151380.csv 151400.csv 151430.csv 151450.csv 151455.csv 151480.csv 151500.csv 151540.csv 151580.csv 151590.csv 151600.csv 151620.csv 151630.csv 151650.csv 151680.csv 151700.csv 151740.csv 151790.csv 151820.csv 151840.csv 151890.csv 151940.csv 151970.csv 151990.csv 152000.csv 152005.csv 152040.csv 152060.csv 152080.csv 152090.csv 152120.csv 152150.csv 152170.csv 152190.csv 152210.csv 152300.csv 152310.csv 152350.csv 152380.csv 152410.csv 152450.csv 152470.csv 152540.csv 152590.csv 152600.csv 152610.csv 152620.csv 152640.csv 152650.csv 152670.csv 152700.csv 152730.csv 152770.csv 152790.csv 152800.csv 152820.csv 152840.csv 152850.csv 152870.csv 152890.csv 152920.csv 152960.csv 152970.csv 152980.csv 152990.csv 153000.csv 153010.csv 153020.csv 153070.csv 153100.csv 153140.csv 153150.csv 153160.csv 153170.csv 153190.csv 153200.csv 153210.csv 153240.csv 153250.csv 153280.csv 153330.csv 
153350.csv 153355.csv 153360.csv 153370.csv 153380.csv 153400.csv 153410.csv 153440.csv 153450.csv 153460.csv 153470.csv 153490.csv 153500.csv 153550.csv 153560.csv 153600.csv 153630.csv 153640.csv 153660.csv 153690.csv 153730.csv 153750.csv 153770.csv 153870.csv 153880.csv 153890.csv 153950.csv 154020.csv 154050.csv 154060.csv 154080.csv 154090.csv 154100.csv 154120.csv 154160.csv 154190.csv 154200.csv 154210.csv 154215.csv 154220.csv 154230.csv 154240.csv 154250.csv 154280.csv 154290.csv 154340.csv 154430.csv 154440.csv 154450.csv 154470.csv 154500.csv 154510.csv 154550.csv 154600.csv 154620.csv 154650.csv 154690.csv 154700.csv 154750.csv 154760.csv 154770.csv 154790.csv 154800.csv 154810.csv 154820.csv 154890.csv 154900.csv 154910.csv 154930.csv 154931.csv 154940.csv 154980.csv 154990.csv
# USAF id -> station name lookup
stationc = station.set_index(0)[2]
for key in dfz:
    dfz[key] = dfz[key].reset_index()
# station name -> {id, number of distinct years on record}
stationx = {stationc.loc[int(key)]: {'id': key,
                                     'yrs': len(dfz[key]['year'].unique())}
            for key in dfz}
# rank stations by record length, longest first
dq = (pd.DataFrame(stationx).T
        .reset_index()
        .set_index('yrs')
        .sort_index(ascending=False))
dq.head(5)
index | id | |
---|---|---|
yrs | ||
72 | AUREL VLAICU | 154200 |
68 | GALATI | 153100 |
67 | CLUJ NAPOCA | 151200 |
67 | DROBETA TURNU SEVERIN | 154100 |
66 | CONSTANTA | 154800 |
# Build yearly temperature-delta series per station for the climate
# stripes, against a lifetime baseline and a 1980-2010 baseline.
dds = []
indicator = 'temp_avg'
for a in dq.head(500)['id'].values:
    dw = dfz[a]
    # station-lifetime (month, hour) climatology
    tmean = dw.groupby(['month', 'hour']).mean()[[indicator]]
    tmean.columns = ['temp_mean']
    # 1980-2010 baseline. BUG FIX: these frames carry a plain RangeIndex
    # (they were reset_index()-ed above), so the original
    # dw.loc[1980:].loc[:2010] sliced positional row labels 1980..2010
    # (~31 arbitrary rows), not calendar years — filter on the 'year'
    # column instead.
    dw = dw[(dw['year'] >= 1980) & (dw['year'] <= 2010)]
    tmean80 = dw.groupby(['month', 'hour']).mean()[[indicator]]
    tmean80.columns = ['temp_mean80']
    # yearly (month, hour) means joined with both baselines
    dc = dfz[a].groupby(['year', 'month', 'hour']).mean()[[indicator]].join(tmean).join(tmean80)
    dc['temp_delta'] = dc[indicator] - dc['temp_mean']
    dc['temp_delta80'] = dc[indicator] - dc['temp_mean80']
    # collapse to one row per year, rounded for export
    dd = dc.groupby(['year']).mean()
    dd = np.round(dd, 1)
    dd['varos'] = stationc[int(a)]  # varos = 'city' (Hungarian)
    dd['value'] = 1
    dds.append(dd)
ddb = pd.concat(dds)
ddb.to_csv('stripes/geo.csv')
# attach station metadata (coordinates, country) by station name
dgeo = ddb.reset_index().set_index('varos').join(station.set_index(2))
dgeo[dgeo[3] == 'HUNGARY'].to_csv('stripes/geotest.csv')
# Build stripes deltas for every station with complete (month, hour)
# coverage over the sampling grid, then export by country.
dds=[]
indicator='temp_avg'
# sampling grid a station must fully cover to qualify
# clean_months=[1,4,7,10]
clean_months=[1,2,3,4,5,6,7,8,9,10,11,12]
# clean_hours=[0,6,12,18]
clean_hours=[0,3,6,12,15,18,21]
# clean_hours=[6]
clean_slice=[(i,j) for i in clean_months for j in clean_hours]
for a in dq['id'].values:
    # station climatology restricted to the sampling grid
    dw=dfz[a].groupby(['month','hour']).mean().loc[clean_slice]
    # keep only stations with a value in every (month, hour) cell
    if dw[indicator].count()==len(clean_hours)*len(clean_months):
        dc=dfz[a].set_index(['month','hour']).loc[clean_slice].reset_index()
        # years in which every grid month and every grid hour occurs
        dx=dc.groupby(['year']).nunique()
        full_data_years=dx[((dx['month']==len(clean_months))&(dx['hour']==len(clean_hours)))].index
        # lifetime (month, hour) mean — baseline for temp_delta
        tmean=dw.groupby(['month','hour']).mean()[[indicator]]
        tmean.columns=['temp_mean']
        # same baseline computed only from fully-covered years
        dw=dfz[a].set_index(['month','hour']).loc[clean_slice].reset_index().set_index(['year']).loc[full_data_years].reset_index()
        tmean_full_years=dw.groupby(['month','hour']).mean()[[indicator]]
        tmean_full_years.columns=['temp_mean_full_years']
        dc=dc.groupby(['year','month','hour']).mean()[[indicator]].loc[full_data_years].join(tmean).join(tmean_full_years)
        dc['temp_delta']=dc[indicator]-dc['temp_mean']
        dc['temp_delta_full_years']=dc[indicator]-dc['temp_mean_full_years']
        # optional post-2009 reference, only when both endpoints exist
        if ((2018 in dw['year'].unique()) and (2010 in dw['year'].unique())):
            tmean_last10=dw[dw['year']>2009].groupby(['month','hour']).mean()[[indicator]]
            tmean_last10.columns=['temp_mean_last10']
            dc=dc.join(tmean_last10)
            dc['temp_delta_last10']=dc[indicator]-dc['temp_mean_last10']
        # one row per year, rounded for export
        dd=dc.groupby(['year']).mean()
        dd=np.round(dd,1)
        dd['varos']=stationc[int(a)]   # varos = 'city' (Hungarian)
        dd['value']=1
        dds.append(dd)
ddb=pd.concat(dds)
# attach station metadata (coordinates, country) by station name
dgeo=ddb.reset_index().set_index('varos').join(station.set_index(2))
dgeo[dgeo[3]=='HUNGARY'].to_csv('stripes/hu.csv')
dgeo[dgeo[3]=='ROMANIA'].to_csv('stripes/ro.csv')
dgeo.to_csv('stripes/huro.csv')
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:10: FutureWarning: Passing list-likes to .loc or [] with any missing label will raise KeyError in the future, you can use .reindex() as an alternative. See the documentation here: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike # Remove the CWD from sys.path while we load stuff. C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:38: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version of pandas will change to not sort by default. To accept the future behavior, pass 'sort=False'. To retain the current behavior and silence the warning, pass 'sort=True'.
# Same pipeline as the previous cell, but with a much sparser sampling
# grid (January/May/September at 06:00 only) so more stations qualify;
# exported under the *_mini names.
dds=[]
indicator='temp_avg'
clean_months=[1,5,9]
# clean_months=[1,2,3,4,5,6,7,8,9,10,11,12]
# clean_hours=[0,6,12,18]
# clean_hours=[0,3,6,12,15,18,21]
clean_hours=[6]
clean_slice=[(i,j) for i in clean_months for j in clean_hours]
for a in dq['id'].values:
    # station climatology restricted to the sampling grid
    dw=dfz[a].groupby(['month','hour']).mean().loc[clean_slice]
    # keep only stations with a value in every (month, hour) cell
    if dw[indicator].count()==len(clean_hours)*len(clean_months):
        dc=dfz[a].set_index(['month','hour']).loc[clean_slice].reset_index()
        # years in which every grid month and every grid hour occurs
        dx=dc.groupby(['year']).nunique()
        full_data_years=dx[((dx['month']==len(clean_months))&(dx['hour']==len(clean_hours)))].index
        # lifetime (month, hour) mean — baseline for temp_delta
        tmean=dw.groupby(['month','hour']).mean()[[indicator]]
        tmean.columns=['temp_mean']
        # same baseline computed only from fully-covered years
        dw=dfz[a].set_index(['month','hour']).loc[clean_slice].reset_index().set_index(['year']).loc[full_data_years].reset_index()
        tmean_full_years=dw.groupby(['month','hour']).mean()[[indicator]]
        tmean_full_years.columns=['temp_mean_full_years']
        dc=dc.groupby(['year','month','hour']).mean()[[indicator]].loc[full_data_years].join(tmean).join(tmean_full_years)
        dc['temp_delta']=dc[indicator]-dc['temp_mean']
        dc['temp_delta_full_years']=dc[indicator]-dc['temp_mean_full_years']
        # optional post-2009 reference, only when both endpoints exist
        if ((2018 in dw['year'].unique()) and (2010 in dw['year'].unique())):
            tmean_last10=dw[dw['year']>2009].groupby(['month','hour']).mean()[[indicator]]
            tmean_last10.columns=['temp_mean_last10']
            dc=dc.join(tmean_last10)
            dc['temp_delta_last10']=dc[indicator]-dc['temp_mean_last10']
        # one row per year, rounded for export
        dd=dc.groupby(['year']).mean()
        dd=np.round(dd,1)
        dd['varos']=stationc[int(a)]   # varos = 'city' (Hungarian)
        dd['value']=1
        dds.append(dd)
ddb=pd.concat(dds)
# attach station metadata (coordinates, country) by station name
dgeo=ddb.reset_index().set_index('varos').join(station.set_index(2))
dgeo[dgeo[3]=='HUNGARY'].to_csv('stripes/hu_mini.csv')
dgeo[dgeo[3]=='ROMANIA'].to_csv('stripes/ro_mini.csv')
dgeo.to_csv('stripes/huro_mini.csv')
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:10: FutureWarning: Passing list-likes to .loc or [] with any missing label will raise KeyError in the future, you can use .reindex() as an alternative. See the documentation here: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike # Remove the CWD from sys.path while we load stuff. C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:38: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version of pandas will change to not sort by default. To accept the future behavior, pass 'sort=False'. To retain the current behavior and silence the warning, pass 'sort=True'.
# Metadata row for the Debrecen station (column 2 holds the name).
station.loc[station[2] == 'DEBRECEN']
0 | 1 | 2 | 3 | 4 | 5 | 6 | |
---|---|---|---|---|---|---|---|
28 | 128820 | 99999 | DEBRECEN | HUNGARY | 47.489 | 21.615 | 110.0 |
# Station name (as it appears in the metadata) -> Hungarian display name.
namer={"AGARD":"Agárd",
"AUREL VLAICU":"Bukarest - Aurel Vlaicu",
"BACAU":"Bákó",
"BAJA":"Baja",
"BEKESCSABA":"Békéscsaba",
"BOBOC AIR BASE":"Boboc légi bázis",
"BOTOSANI":"Botoșani",
"BUDAPEST/PESTSZENTLORINC":"Budapest - Pestszentlőrinc",
"BUZAU":"Buzău",
"CALARASI":"Călărasi",
"CAMPIA TURZII":"Aranyosgyéres",
"CARANSEBES":"Káránszebes",
"CATALOI":"Tulcea - Cataloi",
"CEAHLAU TOACA":"Csalhó",
"CLUJ NAPOCA":"Kolozsvár",
"CONSTANTA":"Konstanca",
"DEBRECEN":"Debrecen",
"DEVA":"Déva",
"DROBETA TURNU SEVERIN":"Szörényvár",
"EGER":"Eger",
"FERIHEGY":"Budapest - Ferihegy",
"GALATI":"Galac",
"GYOR":"Győr",
"HENRI COANDA":"Bukarest - Henri Coandă",
"IASI":"Jászvásár",
"JOSVAFO":"Jósvafő",
"KECSKEMET":"Kecskemét",
"KEKESTETO":"Kékestető",
"MIERCUREA CIUC":"Csíkszereda",
"MIHAIL KOGALNICEANU":"Konstanca - Mihail Kogălniceanu",
"MISKOLC":"Miskolc",
"MOSONMAGYAROVAR":"Mosonmagyaróvár",
"NAGYKANIZSA":"Nagykanizsa",
"NYIREGYHAZA":"Nyíregyháza",
"OCNA SUGATAG":"Aknasugatag",
"ORADEA":"Nagyvárad",
"PAKS":"Paks",
"PAPA":"Pápa",
"PECS SOUTH":"Pécs",
"POROSZLO":"Poroszló",
"RAMNICU VALCEA":"Râmnicu Vâlcea",
"ROSIORII DE VEDE":"Roșiorii De Vede",
"SARMELLEK":"Balaton - Sármellék",
"SATU MARE":"Szatmár",
"SIBIU":"Nagyszeben",
"SIOFOK":"Siófok",
"SOPRON":"Sopron",
"STEFAN CEL MARE":"Suceava - Ștefan Cel Mare",
"SULINA":"Sulina",
"SZECSENY":"Szécsény",
"SZEGED (AUT)":"Szeged",
"SZENTGOTTHARD/FARKASFA":"Szentgotthárd",
"SZOLNOK":"Szolnok",
"SZOMBATHELY ARPT / VAS":"Szombathely",
"TAT":"Tát",
"TATA":"Tata",
"TAUTII MAGHERAUS":"Nagybánya - Miszmogyorós",
"TRAIAN VUIA":"Temesvár - Traian Vuia",
"VARFU OMU":"Bucsecs - Omu csúcs",
"VESZPREM/SZENTKIRALYSZABADJA":"Veszprém",
"VIDRASAU":"Marosvásárhely - Vidrátszeg",
"ZAHONY":"Záhony",
"ZALAEGERSZEG/ANDRASHIDA":"Zalaegerszeg"}
import json
# FIX: the original open('namer.json','w').write(...) never closed the
# file handle; the with-block guarantees it is flushed and closed even
# if the write raises.
with open('namer.json', 'w') as f:
    f.write(json.dumps(namer))
2143
# The raw files mark missing values with runs of 1-6 '*' characters.
df = df.replace({'*' * k: np.nan for k in range(6, 0, -1)})
df['time'] = pd.to_datetime(df['YR--MODAHRMN'], format='%Y%m%d%H%M')
# Convert the US units to metric.
df['TEMP'] = (pd.to_numeric(df['TEMP'], errors='coerce') - 32) * 5 / 9  # °F -> °C
df['SPD'] = pd.to_numeric(df['SPD'], errors='coerce') * 1.61            # mph -> km/h
df['PCP06'] = pd.to_numeric(df['PCP06'], errors='coerce') * 25.4        # inches -> mm
df['SD'] = pd.to_numeric(df['SD'], errors='coerce') * 25.4              # inches -> mm
# FIX(review): the original read df['YR--MODAHRMN'] while assigning into
# dfs — every other line of this cell operates on dfs, so the RHS frame
# was almost certainly a copy-paste slip; made the cell consistently use
# dfs. NOTE: dfs was set to None earlier in this file, so a hidden cell
# presumably rebinds it to a DataFrame — confirm before rerunning.
dfs['time'] = pd.to_datetime(dfs['YR--MODAHRMN'], format='%Y%m%d%H%M')
dfs['year'] = dfs['time'].dt.year
dfs['month'] = dfs['time'].dt.month
dfs['day'] = dfs['time'].dt.day
dfs['hour'] = dfs['time'].dt.hour