%%bash
pip install numpy
pip install pandas
pip install seaborn
pip install mlxtend
pip install arch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
def print_full(x):
    pd.set_option('display.max_rows', len(x))
    print(x)
    pd.reset_option('display.max_rows')
def add_year(df):
    # Extract the two-digit year from the YYYY-MM-DD date string
    df['year'] = df['Date'].apply(lambda x: int(x[2:4]))
    return df
#Load rain and hotel occupancy data
r16 = pd.read_csv("data/rain2016.csv")
r17 = pd.read_csv("data/rain2017.csv")
r20 = pd.read_csv("data/rain2020.csv")
# Reset the index so positional lookups in calcRain line up across the concatenated years
R = pd.concat([r16, r17, r20], ignore_index=True)[['Date', 'Rain']]
R=add_year(R)
R.to_csv('data/rain_all.csv', index=False)
H = pd.read_csv("data/hotel-nights.csv")
H['per_day'] = H['Rooms']/H['Days']
# Attach the average hotel rooms occupied per day for the matching month to each rain record
R['rooms'] = 0.0
for i, row in R.iterrows():
    R.at[i, "rooms"] = H.loc[H['Month'] == row["Date"][:7], 'per_day'].values[0]
def expand_columns(ds, n=3):
    # Convert the wide per-plate columns (P1_*, P2_*, P3_*) to long format:
    # one copy of the frame per plate, tagged with its Plate_number
    Z = []
    for i in range(n):
        ds['Plate_number'] = i + 1
        Z.append(ds.copy())
    plate_prefix = [f"P{i+1}_" for i in range(n)]
    plates = [s for s in ds.columns.tolist() if any(xs in s for xs in plate_prefix)]
    for p in plates:
        colname = p[3:]                    # drop the 'Pk_' prefix
        Z[int(p[1]) - 1][colname] = ds[p]  # assign the column to plate k's copy
    e = pd.concat(Z)
    return e
def map_columns(ds, col_dict):
    # Rename year-specific column names to the shared names and keep only those columns
    col_dict = {**common_cols, **col_dict}
    x = ds.rename(columns=col_dict)
    return x[list(col_dict.values())]
def calcRain(w):
    # For sampling date w, sum 'Rain' over the two preceding rows and 'rooms'
    # over the six preceding rows of the global rain table R
    i = R.loc[R['Date'] == w].index.values[0]
    r = R.iloc[i - 2:i].agg('sum')['Rain']
    h = R.iloc[i - 6:i].agg('sum')['rooms']
    return (h, r)
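# Usage sketch: calcRain returns (rooms, rain) in that order; e.g. for one of the
# 2016 sampling dates listed in weeks16 below (illustrative, assumes the date is in R):
# rooms_6d, rain_2d = calcRain('2016-07-05')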
def annotate_weeks(ds, weeks):
    # Tag each sampling date with its week index, festival phase ('before'/'during'/'after'),
    # and the rain and hotel-occupancy covariates returned by calcRain
    ds['week'] = 0
    ds['festival'] = ''
    ds['rain'] = 0.0
    ds['rooms'] = 0.0
    for i, w in enumerate(weeks):
        I = ds['Date'] == w
        ds.loc[I, 'week'] = i + 1
        ds.loc[I, 'festival'] = weeks[w]
        h, r = calcRain(w)
        ds.loc[I, 'rain'] = r
        ds.loc[I, 'rooms'] = h
    return ds
def prepare_year(ds, col_dict, all_indicators, weeks, locations, medium):
    # Filter to the selected sites and growth medium, map columns to the shared names,
    # compute the total colony count, and annotate the festival weeks
    ds = ds.query(' or '.join([f"(Location=='{u}')" for u in locations])) \
           .query(f"medium=='{medium}'")
    x = map_columns(ds, col_dict)
    x['total'] = ds[all_indicators].agg("sum", axis="columns")
    z = annotate_weeks(x, weeks)
    z = z.sort_values(['Date', 'Location', 'Plate_number'], ignore_index=True)
    return z
def normalize_cols(ds, cols, factor):
    # Rescale counts to counts per 100 units of the normalizing column (the sample volume)
    for col in cols:
        ds[col] = 100 * ds[col] / ds[factor]
    return ds
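# Minimal illustration of the normalization (toy numbers, purely illustrative):
# 5 colonies in a sample of volume 50 become 10 colonies per 100 volume units
toy = pd.DataFrame({'Bioindicator': [5.0], 'volume': [50.0]})
print(normalize_cols(toy, ['Bioindicator'], 'volume'))  # Bioindicator -> 10.0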
sites=['VNX','MRD','SVT']
common_cols={'Date' : 'Date',
'Location' : 'Location' ,
'Plate_number' : 'Plate_number',
'medium' : 'medium'}
ds16 = pd.read_csv("data/2016_Data.csv")
ds16['medium']='easy_gel'
ds16['P2_qty_sample'] = ds16['P1_qty_sample']
ds16['P3_qty_sample'] = ds16['P1_qty_sample']
#ds16=ds16.sort_values(['Date', 'Location'],ignore_index=True)
ds17 = pd.read_csv("data/2017_Data.csv")
ds17['P2_qty_sample'] = ds17['P1_qty_sample']
ds17['P3_qty_sample'] = ds17['P1_qty_sample']
s17=ds17.sort_values(['Date', 'Location'],ignore_index=True)
ds20 = pd.read_csv("data/2020_Data.csv").query("medium=='levine'").sort_values(['Date', 'Location', 'Plate_number'])
# 2016
weeks16= {'2016-06-21' : 'before',
'2016-06-28' : 'before',
'2016-07-05' : 'during',
'2016-07-12' : 'during',
'2016-07-19' : 'after',
'2016-07-26' : 'after',
'2016-08-02' : 'after',
'2016-08-09' : 'after'}
Y16 = prepare_year(expand_columns(ds16,3), {
'24h_big_blue' : 'Bioindicator',
'24h_med_blue' : 'Coliform',
'qty_sample' : 'volume'
}, ['24h_big_blue', '24h_med_blue', '24h_pink', '24h_turq', '24h_other'] ,
weeks16, sites, 'easy_gel')#.sort_values(['Date', 'Location', 'Plate_number'])
Y16 = normalize_cols(Y16, ['Bioindicator', 'Coliform', 'total'], 'volume')
# 2017
weeks17 = {'2017-06-12' : 'before',
'2017-06-19' : 'before',
'2017-06-26' : 'before',
'2017-07-03' : 'during',
'2017-07-10' : 'during',
'2017-07-17' : 'after',
'2017-07-24' : 'after',
'2017-07-31' : 'after',}
Y17 = prepare_year(expand_columns(ds17,3), {
'24h_big_blue' : 'Bioindicator',
'fluo_halo_colonies' : 'Bioindicator UV+',
'24h_med_blue' : 'Coliform',
'qty_sample' : 'volume'
}, ['fluo_halo_colonies', '24h_med_blue', '24h_pink', '24h_turq', '24h_other'],
weeks17, sites, 'easy_gel')#.sort_values(['Date', 'Location', 'Plate_number'])
Y17 = normalize_cols(Y17, ['Bioindicator', 'Coliform', 'Bioindicator UV+', 'total'], 'volume')
# 2020
weeks20 = {'2020-06-11' : 'before',
'2020-06-18' : 'before',
'2020-06-25' : 'before',
'2020-07-01' : 'before',
'2020-07-09' : 'before',
'2020-07-16' : 'before',
'2020-07-22' : 'before',
'2020-08-06' : 'before'}
Y20=prepare_year(ds20, { 'green_met' : 'Bioindicator',
'purple' : 'Coliform',
'volume' : 'volume'
}, ['green_met', 'purple', 'mauve', 'pink', 'other'] ,
weeks20 , sites, 'levine')
Y20 = normalize_cols(Y20, ['Bioindicator', 'Coliform', 'total'], 'volume').fillna(0)
D=pd.concat([Y16,Y17,Y20]).drop(columns=['volume','medium'])
D=add_year(D)
D.to_pickle("data/allyears.pkl")
#Take average over plates
A=D.groupby(['Date','Location','festival','rooms','year','rain'],as_index=False).mean().drop(columns=['Plate_number'])
A.to_pickle("data/allyears-avg_plate.pkl")
vars_cols=['Bioindicator', 'Coliform', 'total']
#A[[f"{col}_std" for col in vars_cols]] = D.groupby(grouped_cols,as_index=False).std()[vars_cols]
#A[[f"{col}_max" for col in vars_cols]] = D.groupby(grouped_cols,as_index=False).max()[vars_cols]
R = pd.read_csv("data/rain_all_cumulative_48h.csv")
display(R)
H = pd.read_csv("data/hotel-nights.csv")
H['per_day'] = H['Rooms']/H['Days']
R['rooms'] = 0.0
for i, row in R.iterrows():
    R.at[i, "rooms"] = H.loc[H['Month'] == row["Date"][:7], 'per_day'].values[0]
sites=['VNX','MRD','SVT']
common_cols={'Date' : 'Date',
'Location' : 'Location' ,
'Plate_number' : 'Plate_number',
'medium' : 'medium'}
ds16 = pd.read_csv("data/2016_Data.csv")
ds16['medium']='easy_gel'
ds16['P2_qty_sample'] = ds16['P1_qty_sample']
ds16['P3_qty_sample'] = ds16['P1_qty_sample']
#ds16=ds16.sort_values(['Date', 'Location'],ignore_index=True)
ds17 = pd.read_csv("data/2017_Data.csv")
ds17['P2_qty_sample'] = ds17['P1_qty_sample']
ds17['P3_qty_sample'] = ds17['P1_qty_sample']
s17=ds17.sort_values(['Date', 'Location'],ignore_index=True)
ds20 = pd.read_csv("data/2020_Data.csv").query("medium=='levine'").sort_values(['Date', 'Location', 'Plate_number'])
# 2016
weeks16= {'2016-06-21' : 'before',
'2016-06-28' : 'before',
'2016-07-05' : 'during',
'2016-07-12' : 'during',
'2016-07-19' : 'after',
'2016-07-26' : 'after',
'2016-08-02' : 'after',
'2016-08-09' : 'after'}
Y16 = prepare_year(expand_columns(ds16,3), {
'24h_big_blue' : 'Bioindicator',
'24h_med_blue' : 'Coliform',
'qty_sample' : 'volume'
}, ['24h_big_blue', '24h_med_blue', '24h_pink', '24h_turq', '24h_other'] ,
weeks16, sites, 'easy_gel')#.sort_values(['Date', 'Location', 'Plate_number'])
Y16 = normalize_cols(Y16, ['Bioindicator', 'Coliform', 'total'], 'volume')
# 2017
weeks17 = {'2017-06-12' : 'before',
'2017-06-19' : 'before',
'2017-06-26' : 'before',
'2017-07-03' : 'during',
'2017-07-10' : 'during',
'2017-07-17' : 'after',
'2017-07-24' : 'after',
'2017-07-31' : 'after',}
Y17 = prepare_year(expand_columns(ds17,3), {
'24h_big_blue' : 'Bioindicator',
'fluo_halo_colonies' : 'Bioindicator UV+',
'24h_med_blue' : 'Coliform',
'qty_sample' : 'volume'
}, ['fluo_halo_colonies', '24h_med_blue', '24h_pink', '24h_turq', '24h_other'],
weeks17, sites, 'easy_gel')#.sort_values(['Date', 'Location', 'Plate_number'])
Y17 = normalize_cols(Y17, ['Bioindicator', 'Coliform', 'Bioindicator UV+', 'total'], 'volume')
# 2020
weeks20 = {'2020-06-11' : 'before',
'2020-06-18' : 'before',
'2020-06-25' : 'before',
'2020-07-01' : 'before',
'2020-07-09' : 'before',
'2020-07-16' : 'before',
'2020-07-22' : 'before',
'2020-08-06' : 'before'}
Y20=prepare_year(ds20, { 'green_met' : 'Bioindicator',
'purple' : 'Coliform',
'volume' : 'volume'
}, ['green_met', 'purple', 'mauve', 'pink', 'other'] ,
weeks20 , sites, 'levine')
Y20 = normalize_cols(Y20, ['Bioindicator', 'Coliform', 'total'], 'volume').fillna(0)
D=pd.concat([Y16,Y17,Y20]).drop(columns=['volume','medium'])
D=add_year(D)
D.to_pickle("data/allyears_48h_cumulative.pkl")
#Take average over plates
A=D.groupby(['Date','Location','festival','rooms','year','rain'],as_index=False).mean().drop(columns=['Plate_number'])
A.to_pickle("data/allyears-avg_plate_48h_cumulative.pkl")
vars_cols=['Bioindicator', 'Coliform', 'total']
#A[[f"{col}_std" for col in vars_cols]] = D.groupby(grouped_cols,as_index=False).std()[vars_cols]
#A[[f"{col}_max" for col in vars_cols]] = D.groupby(grouped_cols,as_index=False).max()[vars_cols]
|     | Date       | Rain | year | rain_same_day | rain_48h | rain_48 |
|-----|------------|------|------|---------------|----------|---------|
| 0   | 2016-06-13 | 0.0  | 16   | 0.0           | 0.0      | 0.0     |
| 1   | 2016-06-14 | 0.0  | 16   | 0.0           | 0.0      | 0.0     |
| 2   | 2016-06-15 | 0.0  | 16   | 0.0           | 0.0      | 0.0     |
| 3   | 2016-06-16 | 0.0  | 16   | 0.0           | 0.0      | 0.0     |
| 4   | 2016-06-17 | 0.0  | 16   | 0.0           | 0.0      | 0.0     |
| ... | ...        | ...  | ...  | ...           | ...      | ...     |
| 168 | 2020-08-02 | 16.2 | 20   | 14.8          | 1.4      | 0.5     |
| 169 | 2020-08-03 | 29.1 | 20   | 14.3          | 14.8     | 1.4     |
| 170 | 2020-08-04 | 18.6 | 20   | 4.3           | 14.3     | 14.8    |
| 171 | 2020-08-05 | 4.3  | 20   | 0.0           | 4.3      | 14.3    |
| 172 | 2020-08-06 | 0.0  | 20   | 0.0           | 0.0      | 4.3     |

173 rows × 6 columns
R = pd.read_csv("data/rain_all_cumulative_72h.csv")
display(R)
H = pd.read_csv("data/hotel-nights.csv")
H['per_day'] = H['Rooms']/H['Days']
R['rooms'] = 0.0
for i, row in R.iterrows():
    R.at[i, "rooms"] = H.loc[H['Month'] == row["Date"][:7], 'per_day'].values[0]
sites=['VNX','MRD','SVT']
common_cols={'Date' : 'Date',
'Location' : 'Location' ,
'Plate_number' : 'Plate_number',
'medium' : 'medium'}
ds16 = pd.read_csv("data/2016_Data.csv")
ds16['medium']='easy_gel'
ds16['P2_qty_sample'] = ds16['P1_qty_sample']
ds16['P3_qty_sample'] = ds16['P1_qty_sample']
#ds16=ds16.sort_values(['Date', 'Location'],ignore_index=True)
ds17 = pd.read_csv("data/2017_Data.csv")
ds17['P2_qty_sample'] = ds17['P1_qty_sample']
ds17['P3_qty_sample'] = ds17['P1_qty_sample']
s17=ds17.sort_values(['Date', 'Location'],ignore_index=True)
ds20 = pd.read_csv("data/2020_Data.csv").query("medium=='levine'").sort_values(['Date', 'Location', 'Plate_number'])
# 2016
weeks16= {'2016-06-21' : 'before',
'2016-06-28' : 'before',
'2016-07-05' : 'during',
'2016-07-12' : 'during',
'2016-07-19' : 'after',
'2016-07-26' : 'after',
'2016-08-02' : 'after',
'2016-08-09' : 'after'}
Y16 = prepare_year(expand_columns(ds16,3), {
'24h_big_blue' : 'Bioindicator',
'24h_med_blue' : 'Coliform',
'qty_sample' : 'volume'
}, ['24h_big_blue', '24h_med_blue', '24h_pink', '24h_turq', '24h_other'] ,
weeks16, sites, 'easy_gel')#.sort_values(['Date', 'Location', 'Plate_number'])
Y16 = normalize_cols(Y16, ['Bioindicator', 'Coliform', 'total'], 'volume')
# 2017
weeks17 = {'2017-06-12' : 'before',
'2017-06-19' : 'before',
'2017-06-26' : 'before',
'2017-07-03' : 'during',
'2017-07-10' : 'during',
'2017-07-17' : 'after',
'2017-07-24' : 'after',
'2017-07-31' : 'after',}
Y17 = prepare_year(expand_columns(ds17,3), {
'24h_big_blue' : 'Bioindicator',
'fluo_halo_colonies' : 'Bioindicator UV+',
'24h_med_blue' : 'Coliform',
'qty_sample' : 'volume'
}, ['fluo_halo_colonies', '24h_med_blue', '24h_pink', '24h_turq', '24h_other'],
weeks17, sites, 'easy_gel')#.sort_values(['Date', 'Location', 'Plate_number'])
Y17 = normalize_cols(Y17, ['Bioindicator', 'Coliform', 'Bioindicator UV+', 'total'], 'volume')
# 2020
weeks20 = {'2020-06-11' : 'before',
'2020-06-18' : 'before',
'2020-06-25' : 'before',
'2020-07-01' : 'before',
'2020-07-09' : 'before',
'2020-07-16' : 'before',
'2020-07-22' : 'before',
'2020-08-06' : 'before'}
Y20=prepare_year(ds20, { 'green_met' : 'Bioindicator',
'purple' : 'Coliform',
'volume' : 'volume'
}, ['green_met', 'purple', 'mauve', 'pink', 'other'] ,
weeks20 , sites, 'levine')
Y20 = normalize_cols(Y20, ['Bioindicator', 'Coliform', 'total'], 'volume').fillna(0)
D=pd.concat([Y16,Y17,Y20]).drop(columns=['volume','medium'])
D=add_year(D)
D.to_pickle("data/allyears_72h_cumulative.pkl")
#Take average over plates
A=D.groupby(['Date','Location','festival','rooms','year','rain'],as_index=False).mean().drop(columns=['Plate_number'])
A.to_pickle("data/allyears-avg_plate_72h_cumulative.pkl")
vars_cols=['Bioindicator', 'Coliform', 'total']
#A[[f"{col}_std" for col in vars_cols]] = D.groupby(grouped_cols,as_index=False).std()[vars_cols]
#A[[f"{col}_max" for col in vars_cols]] = D.groupby(grouped_cols,as_index=False).max()[vars_cols]
|     | Date       | Rain | year | rain_same_day | rain_48h | rain_48 |
|-----|------------|------|------|---------------|----------|---------|
| 0   | 2016-06-13 | 0.0  | 16   | 0.0           | 0.0      | 0.0     |
| 1   | 2016-06-14 | 0.0  | 16   | 0.0           | 0.0      | 0.0     |
| 2   | 2016-06-15 | 0.0  | 16   | 0.0           | 0.0      | 0.0     |
| 3   | 2016-06-16 | 0.0  | 16   | 0.0           | 0.0      | 0.0     |
| 4   | 2016-06-17 | 0.0  | 16   | 0.0           | 0.0      | 0.0     |
| ... | ...        | ...  | ...  | ...           | ...      | ...     |
| 168 | 2020-08-02 | 16.7 | 20   | 14.8          | 1.4      | 0.5     |
| 169 | 2020-08-03 | 30.5 | 20   | 14.3          | 14.8     | 1.4     |
| 170 | 2020-08-04 | 33.4 | 20   | 4.3           | 14.3     | 14.8    |
| 171 | 2020-08-05 | 18.6 | 20   | 0.0           | 4.3      | 14.3    |
| 172 | 2020-08-06 | 4.3  | 20   | 0.0           | 0.0      | 4.3     |

173 rows × 6 columns