from cw.io import read_cw_data
import matplotlib.pylab as plt
import numpy as np
%matplotlib inline
# Load the readings through the project's reader and report how many rows
# came back before any filtering is applied.
cw_data = read_cw_data()
print("Total number of cw reading", len(cw_data))
Total number of cw reading 34415
/home/esowc31/anand/CW4Floods/cw/io.py:45: DtypeWarning: Columns (2,3,7,11,16,28,36,38,46,48) have mixed types. Specify dtype option on import or set low_memory=False. data = pd.read_csv(file_name)
# Keep only the rows whose WATER_LEVEL entry is present (not NA).
cw_data = cw_data.loc[cw_data["WATER_LEVEL"].notna()]
print("Total number of cw reading without NA", len(cw_data))
Total number of cw reading without NA 7321
# Discard rows where WATER_LEVEL holds the literal string "false"
# (presumably a placeholder written by the data source -- TODO confirm).
cw_data = cw_data.loc[cw_data["WATER_LEVEL"].ne("false")]
print("Total number of cw reading without NA and false", len(cw_data))
Total number of cw reading without NA and false 7320
# List every column available in the filtered table.
print("All the columns in cw_data", cw_data.columns)
All the columns in cw_data Index(['ID', 'ROOT_ID', 'LATITUDE', 'LONGITUDE', 'CATEGORY', 'IMAGE', 'FLOW_TYPE', 'SNOW_ICE_PRESENT', 'MOISTURE', 'WATER_LEVEL', 'PP_TYPE', 'PP_RIVER_STAGNENT', 'PP_STREAM_OBSERVATION_TIME', 'PP_STREAM_PROPORTIONS', 'PP_SHORE_PLOTSIZE', 'PP_AMOUNT', 'WL_ADVANCED', 'WL_WIDTH', 'WL_DEPTH', 'STREAMTYPE_TYPE', 'STREAMTYPE_DRINK_WATER', 'STREAMTYPE_SWIMMING', 'STREAMTYPE_BUILTIN', 'WL_MATERIAL', 'STREAMTYPE_WATERCOLOR', 'STREAMTYPE_GROUNDVISIBLE', 'STREAMTYPE_ANIMALS', 'STREAMTYPE_POLLUTION', 'STREAMTYPE_DRIESUP', 'STREAMTYPE_NAME', 'WL_METHOD', 'WL_FLOW_VELOCITY', 'WL_DISTANCE', 'WL_TIME_A', 'WL_TIME_B', 'WL_TIME_C', 'WL_DISTANCE_B', 'WL_DISTANCE_C', 'PP_ADVANCED', 'PP_ADV_PET', 'PP_ADV_POSOFT', 'PP_ADV_POHARD', 'PP_ADV_PS', 'PP_ADV_PSE', 'PP_ADV_MULTILAYER', 'PP_ADV_OTHER', 'PP_PLASTIC_REMOVED_CHECK', 'PHYSICAL_SCALE_UNIT', 'PHYSICAL_SCALE_LEVEL', 'DESCRIPTION', 'SPOTTED_AT'], dtype='object')
# Rank stations (ROOT_ID) by their number of readings, most frequent first.
# np.unique returns (unique ids, count per id) when return_counts=True.
reading_frequency = np.unique(cw_data.ROOT_ID, return_counts=True)
station_ids, reading_counts = reading_frequency

# Sort (count, id) pairs in descending order; stations with equal counts are
# ordered by descending id because tuples compare element by element.
sorted_freq = sorted(zip(reading_counts, station_ids), reverse=True)

# Unzip the sorted pairs into two parallel lists. With no readings at all the
# unzip yields nothing, so fall back to empty lists instead of failing on the
# two-target unpack.
if sorted_freq:
    freq, station_id = (list(column) for column in zip(*sorted_freq))
else:
    freq, station_id = [], []
# Readings-per-station distribution, stations ordered from most to fewest
# readings: the full ranking on the left, a zoom on ranks 20-99 on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
ax1.plot(freq)

# Take the slice first and size the x values from it, so that a dataset with
# fewer than 100 stations does not trigger an x/y length mismatch in plot().
zoomed_freq = freq[20:100]
ax2.plot(np.arange(20, 20 + len(zoomed_freq)), zoomed_freq)
ax2.axhline(20)  # reference line at 20 readings per station

# pyplot-level call: acts on the current axes only (presumably ax2, the last
# subplot created -- verify if the grid is wanted on both panels).
plt.grid()
plt.show()
print("They are atleast 50 stations with 20 or more readings, \n this includes the stations on entire globe and not only to europe")
They are atleast 50 stations with 20 or more readings, this includes the stations on entire globe and not only to europe
# Let's see how the water-level values are distributed.
# NOTE(review): WATER_LEVEL contained the string "false" before filtering, so
# the column may still hold non-numeric (object) values -- confirm that the
# histogram is binning numbers and not categories.
fig, ax = plt.subplots(1, 1, figsize=(10, 4))
# Draw on the axes created above rather than through the implicit pyplot state.
ax.hist(cw_data.WATER_LEVEL, bins=13)
print("Most of the values are in the range -1 to +1")
Most of the values are in the range -1 to +1