# Setup: pull the Corona Data Scraper timeseries and install Weights & Biases.
#!git clone https://github.com/CoronaWhy/task-geo.git
#!os.chdir('task-geo')
import pandas as pd
!wget -O coronavirus_timeseries.csv https://coronadatascraper.com/timeseries.csv
!pip install wandb
!wandb login

import wandb
from tensorflow import keras
from wandb.keras import WandbCallback
from google.colab import auth
from datetime import datetime

auth.authenticate_user()
df = pd.read_csv("coronavirus_timeseries.csv")
# Back the raw CSV up to GCS for reproducibility.
!gsutil cp coronavirus_timeseries.csv gs://coronaviruspublicdata/coronavirus_timeseries.csv

# Add simple calendar features.
df['month'] = pd.to_datetime(df['date']).map(lambda x: x.month)
df['weekday'] = pd.to_datetime(df['date']).map(lambda x: x.weekday())
unique_counties = df['county'].unique()
print(len(unique_counties))

def loop_through_geo_codes(df, column='full_county'):
    """Split the global frame into one frame per county, keeping only
    counties with more than 60 days of data."""
    df_county_list = []
    # County names repeat across states, so key on state + county.
    df[column] = df['state'] + "_" + df['county']
    for code in df[column].unique():
        df_code = df[df[column] == code]
        if len(df_code) > 60:
            df_county_list.append(df_code)
    return df_county_list

county_info = loop_through_geo_codes(df)
print(len(county_info))
county_info[0]
county_info[8]['full_county'].values

antwerp_df = df[df['county'] == 'Antwerp'].fillna(0)
antwerp_relevant = antwerp_df[['cases', 'deaths', 'recovered', 'population', 'lat', 'long']].values
antwerp_df['new_cases'] = antwerp_df.cases.diff()
print(len(antwerp_df))

# Optional data augmentation with tsaug.
!pip install tsaug
from tsaug.visualization import plot
from tsaug import TimeWarp, Crop, Quantize, Drift, Reverse

my_augmenter = (
    TimeWarp() * 5                       # random time warping, 5 augmented copies in parallel
    + Crop(size=300)                     # random crop of subsequences with length 300
    + Quantize(n_levels=[10, 20, 30])    # random quantize to 10-, 20-, or 30-level sets
    + Drift(max_drift=(0.1, 0.5)) @ 0.8  # with 80% probability, random drift up to 10%-50%
    + Reverse() @ 0.5                    # with 50% probability, reverse the sequence
)
# X_aug = my_augmenter.augment(antwerp_relevant)
print(antwerp_relevant.shape)
# TimeWarp is a transform object, not a function of the data; call .augment()
# on an array shaped (n_series, n_timesteps, n_channels).
X_aug = TimeWarp().augment(antwerp_relevant[None, :70, :])

import numpy as np
import tensorflow as tf
from sklearn.preprocessing import RobustScaler

def create_dataset(X, y, time_steps=1):
    """Slide a window of length `time_steps` over X; the label is the row
    of y immediately after each window."""
    Xs, ys = [], []
    for i in range(len(X) - time_steps):
        Xs.append(X.iloc[i:(i + time_steps)].values)
        ys.append(y.iloc[i + time_steps])
    return np.array(Xs), np.array(ys)

sweep_config = {
    "name": "Default sweep",
    "method": "grid",
    "parameters": {
        "batch_size": {"values": [2, 3, 4, 5]},
        "learn": {"values": [0.001, 0.002, 0.004, 0.01]},
        "seq_len": {"values": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}
    }
}
# sweep_id = wandb.sweep(sweep_config, project="covid-forecast")

import matplotlib.pyplot as plt

def train(x_train_full, y_train_full, test, test_first, scaler, config_default):
    run = wandb.init(project="covid-forecast", config=config_default, magic=True)
    config = wandb.config
    X_train, Y_train = create_dataset(x_train_full, y_train_full, config["seq_len"])
    X_test, y_test = create_dataset(test, test, config["seq_len"])
    opt = keras.optimizers.Adam(learning_rate=config["learn"], beta_1=config["beta"],
                                beta_2=0.999, amsgrad=False)
    model = keras.Sequential()
    model.add(
        keras.layers.Bidirectional(
            keras.layers.LSTM(
                units=128,
                input_shape=(X_train.shape[1], X_train.shape[2])
            )
        )
    )
    model.add(keras.layers.Dropout(rate=0.2))
    model.add(keras.layers.Dense(units=2))
    model.compile(loss=config["loss"], optimizer=opt)
    history = model.fit(
        X_train, Y_train,
        epochs=config["epochs"],
        batch_size=config["batch_size"],
        validation_split=config["validation_split"],
        callbacks=[WandbCallback()],
        shuffle=False  # never shuffle time series data
    )
    evaluate_single(model, X_test, y_test, scaler)
    evaluate_plot_multi(model, test_first, config, X_test, scaler)
    return model
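# A quick sanity check of what create_dataset produces: with a toy frame of
# 5 rows and 2 features and time_steps=2, we should get 3 windows of shape
# (2, 2) and 3 two-element labels. (Illustrative only; the toy frame below
# is made up, not COVID data.)
_toy = pd.DataFrame({"deaths": [0, 1, 2, 3, 4], "new_cases": [5, 6, 7, 8, 9]})
_X_demo, _y_demo = create_dataset(_toy, _toy, time_steps=2)
print(_X_demo.shape, _y_demo.shape)  # (3, 2, 2) (3, 2)
print(_X_demo[0])  # rows 0-1 form the window that predicts row 2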
def evaluate_single(model, x_test, y_test, scaler):
    """One-step-ahead test error, reported in the original (unscaled) units."""
    y_preds = model.predict(x_test)
    y_preds = scaler.inverse_transform(y_preds)
    y_test = scaler.inverse_transform(y_test)
    complete_mse = tf.keras.losses.MSE(y_preds[:, 1], y_test[:, 1])
    wandb.run.summary["test_mse"] = complete_mse
    return complete_mse

def evaluate_plot_multi(model, test_orig, config, x_test, scaler, predictor="new_cases"):
    """Multi-step (autoregressive) test error, with plots logged to W&B."""
    arr = predict_multi(model, len(test_orig) - config["seq_len"], x_test[0, :, :], config)
    test_orig['predicted_cases'] = 0
    # Positional write to avoid pandas' chained-assignment trap.
    test_orig.iloc[config["seq_len"]:, test_orig.columns.get_loc('predicted_cases')] = \
        scaler.inverse_transform(arr.squeeze(0))[:, 1]
    # Plot .values so the x-axis is positional and lines up with the axvline.
    plt.plot(test_orig['predicted_cases'].values, label='predicted_cases')
    plt.plot(test_orig[predictor].values, label='actual_cases')
    plt.axvline(x=config["seq_len"], label='Predictions start (t={})'.format(config["seq_len"]))
    plt.legend()
    wandb.log({"test": plt})
    plt.plot(test_orig['predicted_cases'].values, label='predicted_cases')
    plt.plot(test_orig[predictor].values, label='actual_cases')
    plt.legend()
    wandb.Image(plt, caption="Plot")
    large_mse = tf.keras.losses.MSE(
        scaler.inverse_transform(arr.squeeze(0))[:, 1],
        test_orig[predictor][config["seq_len"]:].values
    )
    wandb.run.summary["test_mse_full"] = large_mse
    return large_mse

def predict_multi(model, time_steps, start_rows, config, trg=None, known_column_indices=None):
    """Autoregressively roll the model forward `time_steps` steps.

    start_rows: np.array of shape (seq_len, n_features), the seed window.
    trg / known_column_indices: optionally overwrite columns whose future
    values are known (e.g. calendar features) instead of using predictions.
    """
    start_rows = np.expand_dims(start_rows, axis=0)
    if trg is not None:
        trg = np.expand_dims(trg, axis=0)
    for i in range(time_steps):
        out = model.predict(start_rows[:, i:, :])
        out = out[np.newaxis, ...]
        if trg is not None and known_column_indices:
            for target in known_column_indices:
                out[:, :, target] = trg[:, i, target]
        # Append the prediction so it becomes part of the next input window.
        start_rows = np.concatenate((start_rows, out), axis=1)
    return start_rows[:, config["seq_len"]:, :]

%matplotlib inline
%config InlineBackend.figure_format='retina'
# Loss curves from the most recent fit; `history` is only in scope if the
# body of train() was run interactively rather than via the sweep agent.
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()

def run_geo_sweeps(county_info, start_idx=0, end_idx=1, test_columns=[]):
    """Launch one W&B sweep per county frame."""
    for s in range(start_idx, end_idx):
        county = county_info[s].fillna(0)
        county_name = county['full_county'].values[0]
        config_default = {
            "epochs": 30,
            "validation_split": 0.1,
            "loss": "mean_squared_error",
            "optimizer": 'adam',
            "geo_segment": county_name,
            "seq_len": 7,
            "train_steps": 60,
            "test_steps": 27,
            "scaler": "RobustScaler",
            "new_cases": True,
            "beta": 0.899,
            "additional_features": ["none"]
        }
        county['new_cases'] = county.cases.diff()
        # Fit one scaler on the train split and a separate one on the test split.
        r = RobustScaler()
        x_train_full = county[['deaths', 'new_cases']][1:config_default["train_steps"]]
        x_train_full = pd.DataFrame(r.fit_transform(x_train_full))
        y_train_full = x_train_full
        r_test = RobustScaler()
        test_orig = county[['deaths', 'new_cases']][config_default["train_steps"]:]
        test = pd.DataFrame(r_test.fit_transform(test_orig), columns=["deaths", "new_cases"])
        sweep_id = wandb.sweep(sweep_config, project="covid-forecast")
        wandb.agent(sweep_id, lambda: train(x_train_full, y_train_full, test,
                                            test_orig, r_test, config_default))

run_geo_sweeps(county_info, 8, 28)

# Inspect the last run's predictions in original units (model, X_test,
# y_test, and r_test come from the training cell's scope).
res = model.predict(X_test)
res = r_test.inverse_transform(res)
res
y_true = r_test.inverse_transform(y_test)
y_true
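# predict_multi above implements walk-forward (autoregressive) forecasting:
# predict one step, append the prediction to the input window, repeat. A
# minimal self-contained sketch of that loop, using a stand-in `toy_model`
# (a plain moving-average function, not the LSTM above) purely to show the
# mechanics:
def toy_model(window):
    # Pretend "model": predict the mean of the window's last 3 rows.
    return window[-3:].mean(axis=0, keepdims=True)

window = np.arange(10, dtype=float).reshape(5, 2)  # seed: 5 steps, 2 features
preds = []
for _ in range(4):  # roll forward 4 steps
    step = toy_model(window)                             # shape (1, 2)
    preds.append(step)
    window = np.concatenate([window, step], axis=0)[1:]  # slide the window
print(np.concatenate(preds, axis=0))  # the 4 autoregressive predictions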
# Part 2: the flow-forecast (PyTorch) pipeline.
import os
!gcloud source repos clone github_aistream-peelout_flow-forecast --project=gmap-997
os.chdir('/content/github_aistream-peelout_flow-forecast')
!git checkout -t origin/branch_fixes
!python setup.py develop
!pip install -r requirements.txt
!mkdir data
from flood_forecast.trainer import train_function

def make_config_file(file_path, df_len):
    """Build a flow-forecast config, pulling the swept hyperparameters
    (forecast_history, out_seq_length, batch_size, lr) from wandb.config."""
    run = wandb.init(project="covid-forecast")
    wandb_config = wandb.config
    train_number = df_len * .7       # first 70% for training
    validation_number = df_len * .9  # next 20% for validation
    config_default = {
        "model_name": "MultiAttnHeadSimple",
        "model_type": "PyTorch",
        "model_params": {
            "number_time_series": 3,
            "seq_len": wandb_config["forecast_history"],
            "output_seq_len": wandb_config["out_seq_length"],
            "forecast_length": wandb_config["out_seq_length"]
        },
        "dataset_params": {
            "class": "default",
            "training_path": file_path,
            "validation_path": file_path,
            "test_path": file_path,
            "batch_size": wandb_config["batch_size"],
            "forecast_history": wandb_config["forecast_history"],
            "forecast_length": wandb_config["out_seq_length"],
            "train_end": int(train_number),
            "valid_start": int(train_number + 1),
            "valid_end": int(validation_number),
            "target_col": ["new_cases"],
            "relevant_cols": ["new_cases", "month", "weekday"],
            "scaler": "StandardScaler",
            "interpolate": False
        },
        "training_params": {
            "criterion": "MSE",
            "optimizer": "Adam",
            "optim_params": {},
            "lr": wandb_config["lr"],
            "epochs": 10,
            "batch_size": wandb_config["batch_size"]
        },
        "GCS": False,
        "wandb": {
            "name": "multihead_pytorch_antwerp",
            "tags": ["covid_run", "circleci"],
            "project": "covid-forecast"
        },
        "forward_params": {},
        "metrics": ["MSE"],
        "inference_params": {
            "datetime_start": "2020-04-21",
            "hours_to_forecast": 10,
            "test_csv_path": file_path,
            "decoder_params": {
                "decoder_function": "simple_decode",
                "unsqueeze_dim": 1
            },
            "dataset_params": {
                "file_path": file_path,
                "forecast_history": wandb_config["forecast_history"],
                "forecast_length": wandb_config["out_seq_length"],
                "relevant_cols": ["new_cases", "month", "weekday"],
                "target_col": ["new_cases"],
                "scaling": "StandardScaler",
                "interpolate_param": False
            }
        }
    }
    print(config_default)
    wandb.config = config_default
    return config_default

# Prepare a single-county CSV in the format flow-forecast expects.
county_info[0] = county_info[0].copy()  # copy the slice so column writes are safe
county_info[0]['datetime'] = county_info[0]['date']
county_info[0]['precip'] = 0
county_info[0]['temp'] = 0
county_info[0] = county_info[0].fillna(0)
county_info[0]['new_cases'] = county_info[0]['cases'].diff()
# diff() leaves a NaN in the first row; zero it with a positional write
# (a chained .iloc[0]['new_cases'] assignment would silently do nothing).
county_info[0].iloc[0, county_info[0].columns.get_loc('new_cases')] = 0
county_info[0] = county_info[0].fillna(0)
county_info[0].to_csv("antwerp.csv")

sweep_config = {
    "name": "Default sweep",
    "method": "grid",
    "parameters": {
        "batch_size": {"values": [2, 3, 4, 5]},
        "lr": {"values": [0.001, 0.002, 0.004, 0.01]},
        "forecast_history": {"values": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
        "out_seq_length": {"values": [1, 2, 3]}
    }
}

len_csv = len(county_info[0])
sweep_id = wandb.sweep(sweep_config, project="covid-forecast")
wandb.agent(sweep_id, lambda: train_function("PyTorch", make_config_file("antwerp.csv", len_csv)))
pd.read_csv("antwerp.csv")
wandb.config

# Part 3: richer data (including mobility) via the CoronaWhy task-ts utilities.
!git clone https://github.com/CoronaWhy/task-ts -b add_mobility_data
os.chdir('task-ts')
!pip install loguru
from corona_ts.data_utils.data_crawler import load_data
!git clone https://github.com/CoronaWhy/task-geo.git
os.chdir('task-geo')
!make install
os.chdir('..')
df = load_data()
df[df['region'] == 'Florida'].sort_values('date')  # pandas has no sort_by(); order by date
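# From here the same sweep can be pointed at any region in the crawled
# frame. A sketch of reusing the antwerp.csv preparation for Florida; the
# 'region', 'date', and 'cases' column names are assumed from the filter
# above and may differ in the task-ts schema:
florida = df[df['region'] == 'Florida'].sort_values('date').copy()
florida['datetime'] = florida['date']
florida['new_cases'] = florida['cases'].diff().fillna(0)
florida.to_csv("florida.csv")
# wandb.agent(sweep_id, lambda: train_function(
#     "PyTorch", make_config_file("florida.csv", len(florida))))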