import pandas as pd, numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
ro=['5765358043206','9812808043220','9576658043223','1699958043227','7225068043228','265208043229']
hu=['8073718043234','2087988043232','6247548043235']
p='C:/Users/csala/Onedrive - Lancaster University/Datarepo/szekelydata/klima/'
stations=[]
for i in ro:
stations.append(pd.read_csv(p+'high_res/raw/ro/'+i+'stn+.txt',delimiter= '+',skiprows=2,header=None))
print(i)
5765358043206 9812808043220 9576658043223 1699958043227 7225068043228 265208043229
for i in hu:
stations.append(pd.read_csv(p+'high_res/raw/hu/'+i+'stn+.txt',delimiter= '+',skiprows=2,header=None))
print(i)
8073718043234 2087988043232 6247548043235
station=pd.concat(stations)
station=station.drop_duplicates()
station[2]=station[2].str.strip()
station[3]=station[3].str.strip()
station=station[[0,2,3,4,5,6]]
station.columns=['ID','LOC','COUNTRY','LAT','LON','ELEVATION']
station.to_csv(p+'stations.csv')
!!! 16G memory required at least, 64G recommended
RO
dfs=[]
for i in ro:
df=pd.read_csv(p+'high_res/raw/ro/'+i+'dat.txt',delimiter= '\s+')
dfs.append(df)
print(i)
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:3058: DtypeWarning: Columns (4,21,22,23,25) have mixed types. Specify dtype option on import or set low_memory=False. interactivity=interactivity, compiler=compiler, result=result)
5765358043206
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:3058: DtypeWarning: Columns (21) have mixed types. Specify dtype option on import or set low_memory=False. interactivity=interactivity, compiler=compiler, result=result)
9812808043220 9576658043223 1699958043227
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:3058: DtypeWarning: Columns (4,6,11,12,21,22,23,24,25) have mixed types. Specify dtype option on import or set low_memory=False. interactivity=interactivity, compiler=compiler, result=result)
7225068043228
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:3058: DtypeWarning: Columns (4,21,22,23,24,25) have mixed types. Specify dtype option on import or set low_memory=False. interactivity=interactivity, compiler=compiler, result=result)
265208043229
dfz=pd.concat(dfs)
dfs=None #free memory
df=None #free memory
dfz.head()
USAF | WBAN | YR--MODAHRMN | DIR | SPD | GUS | CLG | SKC | L | M | ... | SLP | ALT | STP | MAX | MIN | PCP01 | PCP06 | PCP24 | PCPXX | SD | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 119000 | 99999 | 201205010000 | 040 | 2 | *** | *** | *** | * | * | ... | 1017.6 | ***** | 984.8 | *** | *** | ***** | 0.00 | ***** | ***** | ** |
1 | 119000 | 99999 | 201205010100 | 040 | 2 | *** | *** | *** | * | * | ... | 1017.8 | ***** | 984.9 | *** | *** | ***** | ***** | ***** | ***** | ** |
2 | 119000 | 99999 | 201205010200 | *** | 0 | *** | *** | *** | * | * | ... | 1018.1 | ***** | 985 | *** | *** | ***** | ***** | ***** | ***** | ** |
3 | 119000 | 99999 | 201205010300 | 030 | 2 | *** | *** | *** | * | * | ... | 1018.3 | ***** | 985.2 | *** | *** | ***** | ***** | ***** | ***** | ** |
4 | 119000 | 99999 | 201205010400 | *** | 0 | *** | *** | *** | * | * | ... | 1018.8 | ***** | 985.6 | *** | *** | ***** | ***** | ***** | ***** | ** |
5 rows × 33 columns
#!!!! DO NOT DO THIS
#dfz=dfz.drop_duplicates()
dfz['time']=pd.to_datetime(dfz['YR--MODAHRMN'],format='%Y%m%d%H%M')
dfz['year']=dfz['time'].dt.year
dfz['month']=dfz['time'].dt.month
dfz['day']=dfz['time'].dt.day
dfz['hour']=dfz['time'].dt.hour
#keep only months with at least 6 days to avoid anomalies (20%)
filt=dfz.groupby(['USAF','year','month'])[['day']].nunique()
filt2=filt[filt>5].dropna()
#keep only years with at least 3 months to avoid anomalies (20%)
filt3=filt2.reset_index().groupby(['USAF','year'])[['month']].nunique()
filt4=filt3[filt3>3].dropna()
filt4.head()
month | ||
---|---|---|
USAF | year | |
119000 | 2012 | 8.0 |
2013 | 12.0 | |
2014 | 12.0 | |
2015 | 12.0 | |
2016 | 4.0 |
for stn in filt4.index.unique(0):
years=filt4.loc[stn].index.unique()
d=dfz[dfz['USAF']==stn]
d=d[d['year'].isin(years)]
d.to_csv(p+'high_res/export/'+str(stn)+'.csv')
print(stn)
119000 150000 150001 150002 150010 150040 150070 150090 150100 150105 150140 150150 150200 150230 150235 150250 150320 150330 150400 150410 150420 150440 150470 150520 150550 150560 150630 150690 150730 150750 150800 150830 150850 150880 150890 150900 150940 150950 150990 151070 151080 151090 151110 151130 151170 151180 151190 151200 151205 151230 151240 151270 151320 151340 151360 151380 151400 151430 151450 151455 151480 151500 151540 151580 151590 151600 151620 151630 151650 151680 151700 151740 151790 151820 151840 151890 151940 151970 151990 152000 152005 152040 152060 152080 152090 152120 152150 152170 152190 152210 152300 152310 152350 152380 152410 152450 152470 152540 152590 152600 152610 152620 152640 152650 152670 152700 152730 152770 152790 152800 152820 152840 152850 152870 152890 152920 152960 152970 152980 152990 153000 153010 153020 153070 153100 153140 153150 153160 153170 153190 153200 153210 153240 153250 153280 153330 153350 153355 153360 153370 153380 153400 153410 153440 153450 153460 153470 153490 153500 153550 153560 153600 153630 153640 153660 153690 153730 153750 153770 153870 153880 153890 153950 154020 154050 154060 154080 154090 154100 154120 154160 154190
HU
dfs=[]
for i in hu:
df=pd.read_csv(p+'high_res/raw/hu/'+i+'dat.txt',delimiter= '\s+')
dfs.append(df)
print(i)
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:3058: DtypeWarning: Columns (4,11,12,21,22,23,25) have mixed types. Specify dtype option on import or set low_memory=False. interactivity=interactivity, compiler=compiler, result=result)
8073718043234 2087988043232
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:3058: DtypeWarning: Columns (4,11,12,20,21,22,23,24,25) have mixed types. Specify dtype option on import or set low_memory=False. interactivity=interactivity, compiler=compiler, result=result)
6247548043235
dfz=pd.concat(dfs)
dfs=None #free memory
df=None #free memory
dfz['time']=pd.to_datetime(dfz['YR--MODAHRMN'],format='%Y%m%d%H%M')
dfz['year']=dfz['time'].dt.year
dfz['month']=dfz['time'].dt.month
dfz['day']=dfz['time'].dt.day
dfz['hour']=dfz['time'].dt.hour
#keep only months with at least 6 days to avoid anomalies (20%)
filt=dfz.groupby(['USAF','year','month'])[['day']].nunique()
filt2=filt[filt>5].dropna()
#keep only years with at least 3 months to avoid anomalies (20%)
filt3=filt2.reset_index().groupby(['USAF','year'])[['month']].nunique()
filt4=filt3[filt3>3].dropna()
for stn in filt4.index.unique(0):
years=filt4.loc[stn].index.unique()
d=dfz[dfz['USAF']==stn]
d=d[d['year'].isin(years)]
d.to_csv(p+'high_res/export/'+str(stn)+'.csv')
print(stn)
127560 127660 127720 127860 128050 128120