import os
import pandas as pd
from google.colab import auth
from datetime import datetime
auth.authenticate_user()
!gcloud source repos clone github_aistream-peelout_flow-forecast --project=gmap-997
os.chdir('/content/github_aistream-peelout_flow-forecast')
!git checkout -t origin/covid_fixes
!python setup.py develop
!pip install -r requirements.txt
!mkdir data
from flood_forecast.trainer import train_function
!pip install git+https://github.com/CoronaWhy/task-geo.git
!wandb login
# Pretrained solar data
!mkdir weights
!gsutil cp -r gs://coronaviruspublicdata/pretrained/model_save weights/
def make_config_file(file_path, df_len, weight_path=None):
    run = wandb.init(project="pretrain-counties")
    wandb_config = wandb.config
    train_number = df_len * 0.7
    validation_number = df_len * 0.9
    config_default = {
        "model_name": "MultiAttnHeadSimple",
        "model_type": "PyTorch",
        "model_params": {
            "number_time_series": 3,
            "seq_len": wandb_config["forecast_history"],
            "output_seq_len": wandb_config["out_seq_length"],
            "forecast_length": wandb_config["out_seq_length"]
        },
        "weight_path_add": {
            "excluded_layers": ["last_layer.weight", "last_layer.bias"]
        },
        "dataset_params": {
            "class": "default",
            "training_path": file_path,
            "validation_path": file_path,
            "test_path": file_path,
            "batch_size": wandb_config["batch_size"],
            "forecast_history": wandb_config["forecast_history"],
            "forecast_length": wandb_config["out_seq_length"],
            "train_end": int(train_number),
            "valid_start": int(train_number + 1),
            "valid_end": int(validation_number),
            "target_col": ["new_cases"],
            "relevant_cols": ["new_cases", "month", "weekday"],
            "scaler": "StandardScaler",
            "interpolate": False
        },
        "training_params": {
            "criterion": "MSE",
            "optimizer": "Adam",
            "optim_params": {},
            "lr": wandb_config["lr"],
            "epochs": 10,
            "batch_size": wandb_config["batch_size"]
        },
        "GCS": False,
        "sweep": True,
        "wandb": False,
        "forward_params": {},
        "metrics": ["MSE"],
        "inference_params": {
            "datetime_start": "2020-04-21",
            "hours_to_forecast": 10,
            "test_csv_path": file_path,
            "decoder_params": {
                "decoder_function": "simple_decode",
                "unsqueeze_dim": 1
            },
            "dataset_params": {
                "file_path": file_path,
                "forecast_history": wandb_config["forecast_history"],
                "forecast_length": wandb_config["out_seq_length"],
                "relevant_cols": ["new_cases", "month", "weekday"],
                "target_col": ["new_cases"],
                "scaling": "StandardScaler",
                "interpolate_param": False
            }
        }
    }
    if weight_path:
        config_default["weight_path"] = weight_path
    wandb.config.update(config_default)
    return config_default
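# Not part of the original notebook: a small sanity check of the chronological
# split that make_config_file encodes -- roughly the first 70% of rows train,
# the next 20% validate, and the remaining 10% are left for testing/inference.
# The series length of 100 below is a hypothetical example value.
example_len = 100
example_train_end = int(example_len * 0.7)        # 70
example_valid_start = int(example_len * 0.7 + 1)  # 71
example_valid_end = int(example_len * 0.9)        # 90
print(example_train_end, example_valid_start, example_valid_end)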
sweep_config = {
    "name": "Default sweep",
    "method": "grid",
    "parameters": {
        "batch_size": {"values": [2, 3]},
        "lr": {"values": [0.001, 0.01]},
        "forecast_history": {"values": [1, 2, 3]},
        "out_seq_length": {"values": [1, 2, 3]}
    }
}
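# Not in the original notebook: the "grid" method enumerates every combination
# of the parameter values above, so each county's sweep launches
# 2 (batch_size) * 2 (lr) * 3 (forecast_history) * 3 (out_seq_length) = 36 runs.
n_grid_runs = 1
for param in sweep_config["parameters"].values():
    n_grid_runs *= len(param["values"])
print(n_grid_runs)  # 36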
def format_corona_data(region_df: pd.DataFrame, region_name: str):
    """
    Format data for a specific region into
    a format that can be used with flow forecast.
    """
    region_df = region_df.copy()  # work on a copy to avoid SettingWithCopyWarning
    if region_name == 'county':
        region_name = region_df['full_county'].iloc[0]
    else:
        region_name = region_df['state'].iloc[0]
    # else:
    #     region_name = region_df['country'].iloc[0]
    print(region_name)
    region_df['datetime'] = region_df['date']
    region_df['precip'] = 0
    region_df['temp'] = 0
    region_df = region_df.fillna(0)
    region_df['new_cases'] = region_df['cases'].diff()
    # .iloc[0]['new_cases'] = 0 would assign to a temporary copy; use .loc instead
    region_df.loc[region_df.index[0], 'new_cases'] = 0
    region_df = region_df.fillna(0)
    region_df.to_csv(region_name + ".csv")
    return region_df, len(region_df), region_name + ".csv"
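# Illustration only (not from the original notebook): format_corona_data expects
# a per-region slice with at least 'full_county' (or 'state'), 'date', and a
# cumulative 'cases' column; the toy frame below is hypothetical.
toy_region = pd.DataFrame({
    "full_county": ["Demo_County"] * 3,
    "state": ["Demo", "Demo", "Demo"],
    "date": ["2020-03-07", "2020-03-08", "2020-03-09"],
    "cases": [2, 2, 4],
})
toy_df, toy_len, toy_csv = format_corona_data(toy_region, "county")
# toy_df['new_cases'] -> [0.0, 0.0, 2.0]; toy_csv == "Demo_County.csv"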
def loop_through_geo_codes(df, column='full_county'):
    """Split the national DataFrame into per-region frames, keeping those with more than 60 days of data."""
    df_county_list = []
    df['full_county'] = df['state'] + "_" + df['county']
    for code in df[column].unique():
        mask = df[column] == code
        df_code = df[mask]
        if len(df_code) > 60:
            df_county_list.append(df_code)
    return df_county_list
import urllib.request

def fetch_time_series() -> pd.DataFrame:
    """Fetch raw time series data from coronadatascraper.com

    Returns:
        pd.DataFrame: raw timeseries data at the county/sub-region level
    """
    url = "https://coronadatascraper.com/timeseries.csv"
    urllib.request.urlretrieve(url, "timeseries.csv")
    return pd.read_csv("timeseries.csv", low_memory=False)  # avoid mixed-dtype DtypeWarning
df = fetch_time_series()
dates = pd.to_datetime(df['date'])
df['month'] = dates.dt.month
df['weekday'] = dates.dt.weekday
df['year'] = dates.dt.year
df_list = loop_through_geo_codes(df)
region_df, full_len, file_path = format_corona_data(df_list[9], 'county')
region_df.head()
Washington, D.C._District of Columbia
 | name | level | city | county | state | country | population | lat | long | url | aggregate | tz | cases | deaths | recovered | active | tested | hospitalized | discharged | icu | growthFactor | date | month | weekday | year | full_county | datetime | precip | temp | new_cases |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
13412 | District of Columbia, Washington, D.C., United... | county | 0 | District of Columbia | Washington, D.C. | United States | 705749.0 | 38.894 | -77.0145 | https://coronavirus.dc.gov/page/coronavirus-data | 0 | America/New_York | 2.0 | 0.0 | 0.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2020-03-07 | 3 | 5 | 2020 | Washington, D.C._District of Columbia | 2020-03-07 | 0 | 0 | 0.0 |
13413 | District of Columbia, Washington, D.C., United... | county | 0 | District of Columbia | Washington, D.C. | United States | 705749.0 | 38.894 | -77.0145 | https://coronavirus.dc.gov/page/coronavirus-data | 0 | America/New_York | 2.0 | 0.0 | 0.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 2020-03-08 | 3 | 6 | 2020 | Washington, D.C._District of Columbia | 2020-03-08 | 0 | 0 | 0.0 |
13414 | District of Columbia, Washington, D.C., United... | county | 0 | District of Columbia | Washington, D.C. | United States | 705749.0 | 38.894 | -77.0145 | https://coronavirus.dc.gov/page/coronavirus-data | 0 | America/New_York | 4.0 | 0.0 | 0.0 | 4.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.0 | 2020-03-09 | 3 | 0 | 2020 | Washington, D.C._District of Columbia | 2020-03-09 | 0 | 0 | 2.0 |
13415 | District of Columbia, Washington, D.C., United... | county | 0 | District of Columbia | Washington, D.C. | United States | 705749.0 | 38.894 | -77.0145 | https://coronavirus.dc.gov/page/coronavirus-data | 0 | America/New_York | 4.0 | 0.0 | 0.0 | 4.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 2020-03-10 | 3 | 1 | 2020 | Washington, D.C._District of Columbia | 2020-03-10 | 0 | 0 | 0.0 |
13416 | District of Columbia, Washington, D.C., United... | county | 0 | District of Columbia | Washington, D.C. | United States | 705749.0 | 38.894 | -77.0145 | https://coronavirus.dc.gov/page/coronavirus-data | 0 | America/New_York | 10.0 | 0.0 | 0.0 | 10.0 | 0.0 | 0.0 | 0.0 | 0.0 | 2.5 | 2020-03-11 | 3 | 2 | 2020 | Washington, D.C._District of Columbia | 2020-03-11 | 0 | 0 | 6.0 |
special_city_list1 = [
    "California_Los Angeles County", "Illinois_Cook County", "Arizona_Maricopa County",
    "Massachusetts_Middlesex County", "Texas_Dallas County", "Texas_Harris County",
    "Florida_Miami Dade County", "California_Riverside County", "Colorado_Denver County",
    "Ohio_Cuyahoga County", "New York_Queens County", "New York_Bronx County"
]
selected_list = {}
for dfs in df_list:
    if dfs['full_county'].iloc[0] in special_city_list1:
        selected_list[dfs['full_county'].iloc[0]] = dfs
import wandb
# 12_May_202004_39AM_model.pth <- solar pretrained model
for county in selected_list.values():
    region_df, full_len, file_path = format_corona_data(county, 'county')
    sweep_id = wandb.sweep(sweep_config, project="pretrain-counties")
    wandb.agent(sweep_id, lambda: train_function(
        "PyTorch",
        make_config_file(file_path, full_len, weight_path="12_May_202004_39AM_model.pth")))
!gsutil cp -r -n model_save gs://coronaviruspublicdata/pretrained/
Check out the sweeps here: https://app.wandb.ai/pranjalya/pretrain-counties