import os
import pandas as pd
from google.colab import auth
from datetime import datetime
auth.authenticate_user()
!gcloud source repos clone github_aistream-peelout_flow-forecast --project=gmap-997
os.chdir('/content/github_aistream-peelout_flow-forecast')
!git checkout -t origin/covid_fixes
!python setup.py develop
!pip install -r requirements.txt
!mkdir data
from flood_forecast.trainer import train_function
!pip install git+https://github.com/CoronaWhy/task-geo.git
!wandb login
# Pretrained solar data
!mkdir weights
!gsutil cp -r gs://coronaviruspublicdata/pretrained/model_save weights/
def make_config_file(file_path, df_len, weight_path=None):
    run = wandb.init(project="pretrain-counties")
    wandb_config = wandb.config
    train_number = df_len * 0.7
    validation_number = df_len * 0.9
    config_default = {
        "model_name": "MultiAttnHeadSimple",
        "model_type": "PyTorch",
        "model_params": {
            "number_time_series": 3,
            "seq_len": wandb_config["forecast_history"],
            "output_seq_len": wandb_config["out_seq_length"],
            "forecast_length": wandb_config["out_seq_length"]
        },
        "weight_path_add": {
            "excluded_layers": ["last_layer.weight", "last_layer.bias"]
        },
        "dataset_params": {
            "class": "default",
            "training_path": file_path,
            "validation_path": file_path,
            "test_path": file_path,
            "batch_size": wandb_config["batch_size"],
            "forecast_history": wandb_config["forecast_history"],
            "forecast_length": wandb_config["out_seq_length"],
            "train_end": int(train_number),
            "valid_start": int(train_number + 1),
            "valid_end": int(validation_number),
            "target_col": ["new_cases"],
            "relevant_cols": ["new_cases", "month", "weekday"],
            "scaler": "StandardScaler",
            "interpolate": False
        },
        "training_params": {
            "criterion": "MSE",
            "optimizer": "Adam",
            "optim_params": {},
            "lr": wandb_config["lr"],
            "epochs": 10,
            "batch_size": wandb_config["batch_size"]
        },
        "GCS": False,
        "sweep": True,
        "wandb": False,
        "forward_params": {},
        "metrics": ["MSE"],
        "inference_params": {
            "datetime_start": "2020-04-21",
            "hours_to_forecast": 10,
            "test_csv_path": file_path,
            "decoder_params": {
                "decoder_function": "simple_decode",
                "unsqueeze_dim": 1
            },
            "dataset_params": {
                "file_path": file_path,
                "forecast_history": wandb_config["forecast_history"],
                "forecast_length": wandb_config["out_seq_length"],
                "relevant_cols": ["new_cases", "month", "weekday"],
                "target_col": ["new_cases"],
                "scaling": "StandardScaler",
                "interpolate_param": False
            }
        }
    }
    if weight_path:
        config_default["weight_path"] = weight_path
    wandb.config.update(config_default)
    return config_default
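# Not part of the original notebook: a small sanity check of the chronological
# split that make_config_file encodes -- roughly the first 70% of rows train,
# the next 20% validate, and the remaining 10% are left for testing/inference.
# The series length of 100 below is a hypothetical example value.
example_len = 100
example_train_end = int(example_len * 0.7)        # 70
example_valid_start = int(example_len * 0.7 + 1)  # 71
example_valid_end = int(example_len * 0.9)        # 90
print(example_train_end, example_valid_start, example_valid_end)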
sweep_config = {
    "name": "Default sweep",
    "method": "grid",
    "parameters": {
        "batch_size": {"values": [2, 3]},
        "lr": {"values": [0.001, 0.01]},
        "forecast_history": {"values": [1, 2, 3]},
        "out_seq_length": {"values": [1, 2, 3]}
    }
}
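# Not in the original notebook: the "grid" method enumerates every combination
# of the parameter values above, so each county's sweep launches
# 2 (batch_size) * 2 (lr) * 3 (forecast_history) * 3 (out_seq_length) = 36 runs.
n_grid_runs = 1
for param in sweep_config["parameters"].values():
    n_grid_runs *= len(param["values"])
print(n_grid_runs)  # 36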
def format_corona_data(region_df: pd.DataFrame, region_name: str):
    """
    Format data for a specific region into
    a format that can be used with flow forecast.
    """
    region_df = region_df.copy()  # work on a copy to avoid SettingWithCopyWarning
    if region_name == 'county':
        region_name = region_df['full_county'].iloc[0]
    else:
        region_name = region_df['state'].iloc[0]
    # else:
    #     region_name = region_df['country'].iloc[0]
    print(region_name)
    region_df['datetime'] = region_df['date']
    region_df['precip'] = 0
    region_df['temp'] = 0
    region_df = region_df.fillna(0)
    region_df['new_cases'] = region_df['cases'].diff()
    # .iloc[0]['new_cases'] = 0 would assign to a temporary copy; use .loc instead
    region_df.loc[region_df.index[0], 'new_cases'] = 0
    region_df = region_df.fillna(0)
    region_df.to_csv(region_name + ".csv")
    return region_df, len(region_df), region_name + ".csv"
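# Illustration only (not from the original notebook): format_corona_data expects
# a per-region slice with at least 'full_county' (or 'state'), 'date', and a
# cumulative 'cases' column; the toy frame below is hypothetical.
toy_region = pd.DataFrame({
    "full_county": ["Demo_County"] * 3,
    "state": ["Demo", "Demo", "Demo"],
    "date": ["2020-03-07", "2020-03-08", "2020-03-09"],
    "cases": [2, 2, 4],
})
toy_df, toy_len, toy_csv = format_corona_data(toy_region, "county")
# toy_df['new_cases'] -> [0.0, 0.0, 2.0]; toy_csv == "Demo_County.csv"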
def loop_through_geo_codes(df, column='full_county'):
    """Split the national DataFrame into per-region frames, keeping those with more than 60 days of data."""
    df_county_list = []
    df['full_county'] = df['state'] + "_" + df['county']
    for code in df[column].unique():
        mask = df[column] == code
        df_code = df[mask]
        if len(df_code) > 60:
            df_county_list.append(df_code)
    return df_county_list
import urllib.request

def fetch_time_series() -> pd.DataFrame:
    """Fetch raw time series data from coronadatascraper.com

    Returns:
        pd.DataFrame: raw timeseries data at the county/sub-region level
    """
    url = "https://coronadatascraper.com/timeseries.csv"
    urllib.request.urlretrieve(url, "timeseries.csv")
    return pd.read_csv("timeseries.csv", low_memory=False)  # avoid mixed-dtype DtypeWarning
df = fetch_time_series()
dates = pd.to_datetime(df['date'])
df['month'] = dates.dt.month
df['weekday'] = dates.dt.weekday
df['year'] = dates.dt.year
df_list = loop_through_geo_codes(df)
region_df, full_len, file_path = format_corona_data(df_list[9], 'county')
region_df.head()
Washington, D.C._District of Columbia
 | name | level | city | county | state | country | population | lat | long | url | aggregate | tz | cases | deaths | recovered | active | tested | hospitalized | discharged | icu | growthFactor | date | month | weekday | year | full_county | datetime | precip | temp | new_cases |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
13412 | District of Columbia, Washington, D.C., United... | county | 0 | District of Columbia | Washington, D.C. | United States | 705749.0 | 38.894 | -77.0145 | https://coronavirus.dc.gov/page/coronavirus-data | 0 | America/New_York | 2.0 | 0.0 | 0.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2020-03-07 | 3 | 5 | 2020 | Washington, D.C._District of Columbia | 2020-03-07 | 0 | 0 | 0.0 |
13413 | District of Columbia, Washington, D.C., United... | county | 0 | District of Columbia | Washington, D.C. | United States | 705749.0 | 38.894 | -77.0145 | https://coronavirus.dc.gov/page/coronavirus-data | 0 | America/New_York | 2.0 | 0.0 | 0.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 2020-03-08 | 3 | 6 | 2020 | Washington, D.C._District of Columbia | 2020-03-08 | 0 | 0 | 0.0 |
13414 | District of Columbia, Washington, D.C., United... | county | 0 | District of Columbia | Washington, D.C. | United States | 705749.0 | 38.894 | -77.0145 | https://coronavirus.dc.gov/page/coronavirus-data | 0 | America/New_York | 4.0 | 0.0 | 0.0 | 4.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 2020-03-09 | 3 | 0 | 2020 | Washington, D.C._District of Columbia | 2020-03-09 | 0 | 0 | 2.0 |
13415 | District of Columbia, Washington, D.C., United... | county | 0 | District of Columbia | Washington, D.C. | United States | 705749.0 | 38.894 | -77.0145 | https://coronavirus.dc.gov/page/coronavirus-data | 0 | America/New_York | 4.0 | 0.0 | 0.0 | 4.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 2020-03-10 | 3 | 1 | 2020 | Washington, D.C._District of Columbia | 2020-03-10 | 0 | 0 | 0.0 |
13416 | District of Columbia, Washington, D.C., United... | county | 0 | District of Columbia | Washington, D.C. | United States | 705749.0 | 38.894 | -77.0145 | https://coronavirus.dc.gov/page/coronavirus-data | 0 | America/New_York | 10.0 | 0.0 | 0.0 | 10.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.5 | 2020-03-11 | 3 | 2 | 2020 | Washington, D.C._District of Columbia | 2020-03-11 | 0 | 0 | 6.0 |
special_city_list1 = [
    "California_Los Angeles County", "Illinois_Cook County", "Arizona_Maricopa County",
    "Massachusetts_Middlesex County", "Texas_Dallas County", "Texas_Harris County",
    "Florida_Miami Dade County", "California_Riverside County", "Colorado_Denver County",
    "Ohio_Cuyahoga County", "New York_Queens County", "New York_Bronx County"
]
selected_list = {}
for dfs in df_list:
    if dfs['full_county'].iloc[0] in special_city_list1:
        selected_list[dfs['full_county'].iloc[0]] = dfs
import wandb
# 12_May_202004_39AM_model.pth <- solar pretrained model
for county in selected_list.values():
    region_df, full_len, file_path = format_corona_data(county, 'county')
    sweep_id = wandb.sweep(sweep_config, project="pretrain-counties")
    wandb.agent(sweep_id, lambda: train_function(
        "PyTorch",
        make_config_file(file_path, full_len, weight_path="12_May_202004_39AM_model.pth")))
!gsutil cp -r -n model_save gs://coronaviruspublicdata/pretrained/
Check out the sweeps here: https://app.wandb.ai/pranjalya/pretrain-counties