!pip install Pillow
!pip install gdown
!pip install darts
# https://exerror.com/typeerror-load-missing-1-required-positional-argument-loader/
!pip install pyyaml==5.4.1

import pandas as pd
import numpy as np
import os
from google.colab import drive
import matplotlib.pyplot as plt
from io import BytesIO
from PIL import Image
import requests
import gc
import math
from itertools import product
import ast

# from IPython.display import display, HTML
# CSS = """
# .output {
#     flex-direction: row;
# }
# """
# HTML(''.format(CSS))

import torch
from darts import TimeSeries
from darts.models import (
    RNNModel,
    TCNModel,
    TransformerModel,
    NBEATSModel,
    BlockRNNModel,
)
from darts.metrics import mape, smape
from darts.dataprocessing.transformers import Scaler
from darts.utils.timeseries_generation import datetime_attribute_timeseries

try:
    # Dan
    drive.mount('/gdrive/')
    # !ls /gdrive
    project_path = '/gdrive/MyDrive/Northwestern/CompSci/de200/data'
    my_data_path = os.path.join(project_path, "data/my_data")
    os.chdir(project_path)
except Exception as e:
    # Jason
    !gdown https://drive.google.com/drive/u/1/folders/1j0O8ft4HYvFLyT86VEIjnSCaLVPLMXg6 -O de200data --folder &> /dev/null
    # drive.mount("/gdrive/", force_remount=True)
    os.chdir('de200data')

os.listdir()

ctaL = pd.read_csv('CTA_L.csv')
ctaL['Location'] = ctaL['Location'].apply(ast.literal_eval)  # from string of tuple to tuple
ctaL['Line'] = ctaL.loc[:, "RED":"O"].idxmax(axis=1)  # for color
ctaL['lats'], ctaL['lgts'] = zip(*ctaL['Location'])
# L_lats, L_lgts = zip(*ctaL['Location'])
ctaL.head()  # identify by "STATION_NAME"; has useful "Location"

ctaL_colormap = {"RED": "red", "BLUE": "blue", "G": "green", "BRN": "brown",
                 "P": "purple", "Pexp": "purple", "Y": "yellow", "Pnk": "pink", "O": "orange"}

stations = pd.read_csv('Divvy_Bicycle_Stations.csv')
stations.head()
stations[stations.Status != "In Service"]
stations[stations['Docks in Service'] != stations['Total Docks']]

for f in os.listdir():
    print(f)
    print(pd.read_csv(f).columns)

d = pd.read_csv('202112-divvy-tripdata.csv')
d.head()
d.start_station_id.unique()
d.shape, d.dropna().shape

df = pd.read_csv('Divvy_Trips_2019_Q2.csv')
df.head()
df.shape, df.dropna().shape

dff = pd.read_csv('Divvy_Trips_2019_Q1.csv')
dff.head()

f2019s = ['Divvy_Trips_2019_Q1.csv', 'Divvy_Trips_2019_Q2.csv',
          'Divvy_Trips_2019_Q3.csv', 'Divvy_Trips_2019_Q4.csv']

def build_2019():
    for f in f2019s:
        _df = pd.read_csv(f)
        if f == "Divvy_Trips_2019_Q2.csv":
            # Q2 ships with different column names; align them with the other quarters
            _df.set_axis(['trip_id', 'start_time', 'end_time', 'bikeid', 'tripduration',
                          'from_station_id', 'from_station_name', 'to_station_id',
                          'to_station_name', 'usertype', 'gender', 'birthyear'],
                         axis=1, inplace=True)
        # subset every quarter (Q2 included) to the shared columns so concat stays rectangular
        _df = _df[["bikeid", "start_time", "end_time", "from_station_name", "to_station_name", "usertype"]]
        _df[['start_time', 'end_time']] = _df[['start_time', 'end_time']].apply(pd.to_datetime, errors='coerce')
        _df.dropna(inplace=True)
        yield _df

if os.path.exists('df_2019.csv'):
    df_2019 = pd.read_csv('df_2019.csv', parse_dates=["start_time", "end_time"], infer_datetime_format=True)
else:
    df_2019 = pd.concat(build_2019())
    df_2019.to_csv('df_2019.csv', index=False)

df_2019.head()
df_2019.dtypes
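# Hedged side note (illustrative, not part of the pipeline): build_2019 above (and
# build_2020s below) lean on pd.to_datetime(errors='coerce') turning unparseable
# timestamps into NaT, which the following dropna() then removes. A minimal sketch
# of that behavior on toy data:
_demo = pd.DataFrame({"start_time": ["2019-04-01 08:00:00", "not a date"]})
_demo["start_time"] = pd.to_datetime(_demo["start_time"], errors="coerce")
print(_demo)            # second row becomes NaT...
print(_demo.dropna())   # ...and is dropped here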
f2020s = ['Divvy_Trips_2020_Q1.csv', '202004-divvy-tripdata.csv', '202006-divvy-tripdata.csv',
          '202005-divvy-tripdata.csv', '202007-divvy-tripdata.csv', '202008-divvy-tripdata.csv',
          '202009-divvy-tripdata.csv', '202011-divvy-tripdata.csv', '202010-divvy-tripdata.csv',
          '202012-divvy-tripdata.csv', '202101-divvy-tripdata.csv', '202104-divvy-tripdata.csv',
          '202103-divvy-tripdata.csv', '202102-divvy-tripdata.csv', '202105-divvy-tripdata.csv',
          '202106-divvy-tripdata.csv', '202107-divvy-tripdata.csv', '202108-divvy-tripdata.csv',
          '202109-divvy-tripdata.csv', '202110-divvy-tripdata.csv', '202112-divvy-tripdata.csv',
          '202111-divvy-tripdata.csv']

def build_2020s():
    for f in f2020s:
        if "2020" in f or "2021" in f:
            _df = pd.read_csv(f)
            # (lat, lgt)
            # _df['start_coord'] = list(zip(_df["start_lat"], _df["start_lng"]))
            # _df['end_coord'] = list(zip(_df["end_lat"], _df["end_lng"]))
            _df = _df[["started_at", "ended_at", "start_station_name", "end_station_name",
                       "member_casual", "rideable_type", "start_lat", "end_lat",
                       "start_lng", "end_lng"]].copy()
            _df.set_axis(["start_time", "end_time", "from_station_name", "to_station_name",
                          "usertype", "bike_type", "start_lat", "end_lat",
                          "start_lgt", "end_lgt"], axis=1, inplace=True)
            _df[['start_time', 'end_time']] = _df[['start_time', 'end_time']].apply(pd.to_datetime, errors='coerce')
            _df.dropna(inplace=True)
            yield _df

if os.path.exists('df_2020s.csv'):
    df_2020s = pd.read_csv('df_2020s.csv', parse_dates=["start_time", "end_time"], infer_datetime_format=True)
else:
    df_2020s = pd.concat(build_2020s())
    df_2020s.to_csv('df_2020s.csv', index=False)

df_2020s.head()
df_2020s.dtypes
df_2020s.shape
df_2020s.bike_type.unique()

try:
    shared_cols = ["start_time", "end_time", "from_station_name", "to_station_name", "usertype"]
    df = pd.concat([df_2019[shared_cols], df_2020s[shared_cols]])
except NameError:
    # rebuild whichever of the two frames is missing, then retry the concat
    try:
        isinstance(df_2019, pd.DataFrame)
    except NameError:
        if os.path.exists('df_2019.csv'):
            df_2019 = pd.read_csv('df_2019.csv')
        else:
            df_2019 = pd.concat(build_2019())
            df_2019.to_csv('df_2019.csv', index=False)
    try:
        isinstance(df_2020s, pd.DataFrame)
    except NameError:
        if os.path.exists('df_2020s.csv'):
            df_2020s = pd.read_csv('df_2020s.csv')
        else:
            df_2020s = pd.concat(build_2020s())
            df_2020s.to_csv('df_2020s.csv', index=False)
    df = pd.concat([df_2019[shared_cols], df_2020s[shared_cols]])

assert df.shape[0] == df_2019.shape[0] + df_2020s.shape[0]
df.head()
df.dtypes

# Simple Plot
def splot(x, y, xlab, ylab, title):
    fig = plt.figure(figsize=(8, 8))
    plt.plot(x, y)
    fig.suptitle(title)
    plt.xticks(x)
    plt.grid()
    plt.xlabel(xlab)
    plt.ylabel(ylab)

URL = "https://tile.openstreetmap.org/{z}/{x}/{y}.png".format
TILE_SIZE = 256

def point_to_pixels(lon, lat, zoom):
    """convert gps coordinates to web mercator pixel coordinates"""
    r = math.pow(2, zoom) * TILE_SIZE
    lat = math.radians(lat)
    x = int((lon + 180.0) / 360.0 * r)
    y = int((1.0 - math.log(math.tan(lat) + (1.0 / math.cos(lat))) / math.pi) / 2.0 * r)
    return x, y
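# Quick sanity check of point_to_pixels (illustrative): in the web-mercator pixel
# grid, x grows eastward and y grows southward, so a point north of another should
# get a smaller y. Coordinates below are approximate Chicago locations.
_loop_x, _loop_y = point_to_pixels(-87.63, 41.88, zoom=13)           # downtown Loop
_evanston_x, _evanston_y = point_to_pixels(-87.68, 42.05, zoom=13)   # Evanston, to the north
assert _evanston_y < _loop_y
print(_loop_x, _loop_y, _evanston_x, _evanston_y)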
# Easy interface for quick subsetting and plotting
def plot_map(lats, lgts, L=False, zoom=13, overload=False):
    top, bot = lats.max(), lats.min()
    lef, rgt = lgts.min(), lgts.max()

    x0, y0 = point_to_pixels(lef, top, zoom)
    x1, y1 = point_to_pixels(rgt, bot, zoom)

    x0_tile, y0_tile = int(x0 / TILE_SIZE), int(y0 / TILE_SIZE)
    x1_tile, y1_tile = math.ceil(x1 / TILE_SIZE), math.ceil(y1 / TILE_SIZE)
    n_tiles = (x1_tile - x0_tile) * (y1_tile - y0_tile)
    print("# of tiles to be downloaded", n_tiles)

    # size and ax
    fig, ax = plt.subplots(figsize=(20, 20))

    # full size image we'll add tiles to
    img = Image.new('RGB', (
        (x1_tile - x0_tile) * TILE_SIZE,
        (y1_tile - y0_tile) * TILE_SIZE))

    # loop through every tile inside our bounding box
    for x_tile, y_tile in product(range(x0_tile, x1_tile), range(y0_tile, y1_tile)):
        with requests.get(URL(x=x_tile, y=y_tile, z=zoom)) as resp:
            tile_img = Image.open(BytesIO(resp.content))
        # add each tile to the full size image
        img.paste(
            im=tile_img,
            box=((x_tile - x0_tile) * TILE_SIZE, (y_tile - y0_tile) * TILE_SIZE))

    # cropping from tileset to fit our bounding box
    x, y = x0_tile * TILE_SIZE, y0_tile * TILE_SIZE
    # was erroring with "tile cannot extend outside image" because lgt is negative,
    # so the crop offsets should use abs()
    img = img.crop((
        abs(int(x - x0)),   # left
        abs(int(y - y0)),   # top
        abs(int(x - x1)),   # right
        abs(int(y - y1))))  # bottom

    ax.imshow(img, extent=(lef, rgt, bot, top))

    # (lats, lgts), here fed in as (lgts, lats)
    if overload:
        ax.scatter(lgts, lats, alpha=1, c='purple', s=200, marker='v')
    else:
        ax.scatter(lgts, lats, alpha=1, c='salmon', s=5)

    # plot train stations
    if L:
        print(top, bot, rgt, lef)
        # https://stackoverflow.com/questions/26139423/plot-different-color-for-different-categorical-levels-using-matplotlib
        ctaL_ss = ctaL.loc[(ctaL.lats < top) & (ctaL.lats > bot) & (ctaL.lgts < rgt) & (ctaL.lgts > lef)]
        ax.scatter(ctaL_ss.lgts, ctaL_ss.lats, alpha=1, c=ctaL_ss['Line'].map(ctaL_colormap), s=200, marker='*')

# side by side: https://stackoverflow.com/questions/66209719/ipython-display-pandas-dataframe-and-matplotlib-plot-side-by-side

# given a series whose index is station names, plot the stations on the map
# zoom defaults to plot_map's 13 (can be overridden when very zoomed in)
# aiya, so elegant :D
def show_locs(series, zoom=13):
    series = list(series.index)

    # find lats/lgts for the given stations
    def _inner():
        for loc in series:
            yield stations.loc[stations['Station Name'] == loc][["Latitude", "Longitude"]]

    a = pd.concat(_inner())
    plot_map(a['Latitude'], a['Longitude'], zoom=zoom, overload=True)

# For a period of time at a given station, get its +1/0/-1 usage history;
# returns the datetimes and List[1/0/-1], and plots the cumulative sum
def usage_history(station: str, start: pd.Timestamp, end: pd.Timestamp, mksize=10, suppress=False):
    subset = df.loc[(df['start_time'] >= start) & (df['start_time'] <= end)
                    & ((df['from_station_name'] == station) | (df['to_station_name'] == station))
                    ].sort_values(by='start_time')
    N = subset.shape[0]
    history = [0] * N  # slight optimization
    for i, row in enumerate(subset.itertuples(index=False)):
        if row.from_station_name == station and row.to_station_name == station:
            # 0: same-station round trip, don't do anything
            pass
        elif row.from_station_name == station:
            # -1: a bike left this station
            history[i] = -1
        else:
            # +1: a bike arrived at this station
            history[i] = 1

    if not suppress:
        # plotting
        fig, ax = plt.subplots(figsize=(10, 10))
        fig.suptitle("Usage at {} from {} to {}".format(station, start, end))
        ax.set_xlabel("Time")
        ax.set_ylabel("Change in available bikes")
        ax.scatter(subset.start_time, np.cumsum(history), alpha=1, c='purple', s=mksize, marker='.')
        ax.grid()
    return subset.start_time, history
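# Hedged toy check of usage_history's bookkeeping (illustrative only, on a fake
# 3-trip frame shaped like df): a departure counts -1, an arrival +1, and a
# same-station round trip nets 0, so the cumsum tracks the change in available bikes.
_toy = pd.DataFrame({
    "start_time": pd.to_datetime(["2021-01-01 08:00", "2021-01-01 09:00", "2021-01-01 10:00"]),
    "end_time": pd.to_datetime(["2021-01-01 08:30", "2021-01-01 09:30", "2021-01-01 10:30"]),
    "from_station_name": ["A", "B", "A"],
    "to_station_name": ["B", "A", "A"],
    "usertype": ["member"] * 3,
})
# replicate the rule for station "A": departure, arrival, round trip
_hist = [0 if (f == "A" and t == "A") else (-1 if f == "A" else 1)
         for f, t in zip(_toy.from_station_name, _toy.to_station_name)]
print(_hist, list(np.cumsum(_hist)))  # [-1, 1, 0] -> cumsum [-1, 0, 0]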
plot_map(df_2020s.start_lat, df_2020s.start_lgt, L=True)

chi_dt = df_2020s.loc[(df_2020s.start_lat < 41.9) & (df_2020s.start_lat > 41.87)
                      & (df_2020s.start_lgt > -87.65) & (df_2020s.start_lgt < -87.1)][["start_lat", "start_lgt"]]
plot_map(chi_dt.start_lat, chi_dt.start_lgt, zoom=16, L=True)

pre_pandemic = df.loc[df['start_time'] <= pd.Timestamp('2020-01-31')]
fsc_prep = pre_pandemic.groupby('from_station_name')['from_station_name'].count().sort_values(ascending=False)
tsc_prep = pre_pandemic.groupby('to_station_name')['to_station_name'].count().sort_values(ascending=False)

# Top 25 FROM stations
fsc_prep.head(25)
show_locs(fsc_prep.head(25), zoom=15)

# Top 25 TO stations
tsc_prep.head(25)
show_locs(tsc_prep.head(25), zoom=15)

# code used to compare
pd.concat([fsc_prep.head(25), tsc_prep.head(25)], axis=1)

# Bottom 25 FROM
fsc_prep.tail(25)
show_locs(fsc_prep.tail(25))

# Bottom 25 TO
tsc_prep.tail(25)
show_locs(tsc_prep.tail(25))

pd.concat([fsc_prep.tail(25), tsc_prep.tail(25)], axis=1)

post_pandemic = df_2020s.loc[df_2020s['start_time'] > pd.Timestamp('2020-01-31')]
post_pandemic.head(10).dtypes
fsc_postp = post_pandemic.groupby('from_station_name')['from_station_name'].count().sort_values(ascending=False)
tsc_postp = post_pandemic.groupby('to_station_name')['to_station_name'].count().sort_values(ascending=False)

# TOP FROM stations (table shows top 10, map shows top 25)
fsc_postp.head(10)
show_locs(fsc_postp.head(25), zoom=15)

# TOP 10 TO stations
tsc_postp.head(10)
show_locs(tsc_postp.head(10), zoom=16)

trips_by_hour_start_postp = post_pandemic.groupby(post_pandemic['start_time'].dt.hour)['start_time'].count()
# trips_by_hour_end_postp = post_pandemic.groupby(post_pandemic['end_time'].dt.hour)['end_time'].count()
# ^ almost the same shape, which is to be expected: binned by hour, most trips aren't that long

# Weekday-only and weekend-only trips by hour
trips_by_hour_start_postp_weekdays = post_pandemic[post_pandemic['start_time'].dt.day_of_week.isin([0, 1, 2, 3, 4])].groupby(post_pandemic['start_time'].dt.hour)['start_time'].count()
trips_by_hour_start_postp_weekends = post_pandemic[post_pandemic['start_time'].dt.day_of_week.isin([5, 6])].groupby(post_pandemic['start_time'].dt.hour)['start_time'].count()

trips_by_min_start_postp = post_pandemic.groupby(post_pandemic['start_time'].dt.minute)['start_time'].count()
trips_by_dow_start_postp = post_pandemic.groupby(post_pandemic['start_time'].dt.day_of_week)['start_time'].count()
trips_by_month_start_postp = post_pandemic.groupby(post_pandemic['start_time'].dt.month)['start_time'].count()
trips_by_woy_start_postp = post_pandemic.groupby(post_pandemic['start_time'].dt.isocalendar().week)['start_time'].count()

splot(range(24), trips_by_hour_start_postp, "Hour", "Trips", "# of Trips in 2020-2021 by hour")
splot(range(24), trips_by_hour_start_postp_weekdays, "Hour", "Trips", "# of Trips in 2020-2021 by hour (weekdays)")
splot(range(24), trips_by_hour_start_postp_weekends, "Hour", "Trips", "# of Trips in 2020-2021 by hour (weekends)")
splot(range(60), trips_by_min_start_postp, "Minute", "Trips", "# of Trips in 2020-2021 by minute")
splot(range(7), trips_by_dow_start_postp, "Day of Week (Monday=0, Sunday=6)", "Trips", "# of Trips in 2020-2021 by day of week")
# use the groupby's own index here: months run 1-12 and ISO weeks 1-53, not 0-based
splot(trips_by_month_start_postp.index, trips_by_month_start_postp, "Month", "Trips", "# of Trips in 2020-2021 by month")
splot(trips_by_woy_start_postp.index, trips_by_woy_start_postp, "Week in Year", "Trips", "# of Trips in 2020-2021 by week in year")

# IDK HOW, and no time
post_pandemic.head()

# omg what a save: https://stackoverflow.com/questions/60140400/pandas-group-by-and-calculate-ratio-of-two-columns
member_ratio = post_pandemic.groupby("from_station_name")["usertype"].value_counts(normalize=True).mul(100)
# member_ratio = post_pandemic.groupby(['from_station_name', 'usertype'])

# TOP K stations with the highest percentage of "member" riders (loyal stations)
member_ratio.unstack()['member'].sort_values(ascending=False).head(50)
show_locs(member_ratio.unstack()['member'].sort_values(ascending=False).head(50))

# There are stations with no member riders, more of them than purely loyal stations
member_ratio.unstack()['member'].sort_values(ascending=False).tail(30)

# sanity check
assert len(member_ratio.unstack()['member'].sort_values(ascending=False)) == len(member_ratio.unstack()['casual'].sort_values(ascending=False))

# TOP K stations with the highest percentage of "casual" riders (non-loyal stations)
member_ratio.unstack()['casual'].sort_values(ascending=False).head(50)

# There exist stations with no casual riders (all member)
member_ratio.unstack()['casual'].sort_values(ascending=False).tail(10)
show_locs(member_ratio.unstack()['casual'].sort_values(ascending=False).head(50))
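# Hedged aside on the value_counts(normalize=True) trick used above (toy data,
# illustrative only): within each group, value_counts normalizes to fractions
# that sum to 1, and unstack() pivots the usertype level into columns.
_toy_ratio = pd.DataFrame({"from_station_name": ["A", "A", "A", "B"],
                           "usertype": ["member", "member", "casual", "casual"]})
print(_toy_ratio.groupby("from_station_name")["usertype"]
      .value_counts(normalize=True).mul(100).unstack())
# station A -> member ~66.7 / casual ~33.3; station B -> casual 100.0, member NaN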
biketype_ratio = post_pandemic.groupby("from_station_name")["bike_type"].value_counts(normalize=True).mul(100)
biketype_counts = post_pandemic.groupby("from_station_name")["bike_type"].value_counts()
biketype_ratio
biketype_counts

# by raw count
post_pandemic.groupby('to_station_name')['bike_type'].count().sort_values(ascending=False).head(10)

tsc_postp = post_pandemic.groupby('to_station_name')['to_station_name'].count().sort_values(ascending=False)
fsc_postp = post_pandemic.groupby('from_station_name')['from_station_name'].count().sort_values(ascending=False)
fsc_postp.head(25)
fsc_postp.tail(25)

pd.concat([fsc_postp.head(25), tsc_postp.head(25)], axis=1)
# Bottom 25
pd.concat([fsc_postp.tail(25), tsc_postp.tail(25)], axis=1)

_, _ = usage_history("Chicago Ave & Sheridan Rd", pd.Timestamp('2021-01-01'), pd.Timestamp('2021-12-31'), mksize=25)
_, _ = usage_history("University Library (NU)", pd.Timestamp('2021-01-01'), pd.Timestamp('2021-12-31'), mksize=25)
_, _ = usage_history("Sheridan Rd & Noyes St (NU)", pd.Timestamp('2021-01-01'), pd.Timestamp('2021-12-31'), mksize=25)

stations[stations['Station Name'] == "Streeter Dr & Grand Ave"]
post_pandemic.head()

_, _ = usage_history("Streeter Dr & Grand Ave", pd.Timestamp('2021-03-01'), pd.Timestamp('2021-03-08'))
_, _ = usage_history("Streeter Dr & Grand Ave", pd.Timestamp('2020-01-01'), pd.Timestamp('2021-12-31'))
_, _ = usage_history("Streeter Dr & Grand Ave", pd.Timestamp('2021-07-01'), pd.Timestamp('2021-07-10'), mksize=10)
_, _ = usage_history("Canal St & Adams St", pd.Timestamp('2020-01-01'), pd.Timestamp('2021-12-31'))
xs, delta = usage_history("Canal St & Adams St", pd.Timestamp('2021-08-01'), pd.Timestamp('2021-08-02'), mksize=25)

# https://stackoverflow.com/questions/1060279/iterating-through-a-range-of-dates-in-python
def delta1day(station, start, end):
    # yield one cumulative-usage TimeSeries per day in [start, end]
    for day in pd.date_range(start, end):
        _, delta = usage_history(station, day, day + pd.Timedelta(days=1), suppress=True)
        yield TimeSeries.from_values(np.cumsum(delta))

model_interpretable = NBEATSModel(
    input_chunk_length=30,
    output_chunk_length=7,
    generic_architecture=False,  # we use the interpretable model to get trend and seasonality
    num_blocks=3,
    num_layers=4,
    layer_widths=512,
    n_epochs=50,
    nr_epochs_val_period=1,
    # batch_size=800,
    model_name="nbeats_interpretable_run",
)

model_milk = NBEATSModel(input_chunk_length=24, output_chunk_length=12, n_epochs=50)
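# Hedged inspection sketch (illustrative): peek at the per-day windows delta1day
# yields before handing them to a model. The station and date range here are
# assumptions, mirroring the Canal St & Adams St experiments below.
_windows = list(delta1day("Canal St & Adams St", pd.Timestamp('2021-08-01'), pd.Timestamp('2021-08-03')))
print(len(_windows), [len(w) for w in _windows])  # one series per day; lengths vary with trip volume
_windows[0].plot()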
# plot demand forecasting. Currently only supports NBEATS forecasting with fixed parameters
def plot_forecast(model, start: pd.Timestamp, end: pd.Timestamp, station, forecast_horizon=1):
    """
    Assumes start, end, and the split are valid. Trains over 1-day windows of the
    station's usage, then forecasts the horizon (end - split days) in one go,
    starting from the last training day onwards.
    [start-------split---end] where end - split = forecast_horizon, assuming end - split > 0
    """
    split = pd.Timedelta(forecast_horizon, unit='D')
    if forecast_horizon > 1:
        # TODO: split into windows with the prediction side getting the whole
        # horizon in one go rather than 1-day windows
        pass
    else:
        # next-day forecast
        alldays = list(delta1day(station, start, end))
        mod = model.fit(alldays[:-1])
        pred_x_len = len(alldays[-1].values())
        # condition the prediction on the last *training* day so it forecasts the
        # held-out final day (conditioning on alldays[-1] itself would leak the target)
        pred = model.predict(n=pred_x_len, series=alldays[-2])
        # shift the x range so the forecast lines up with the real values
        xranges = range(len(alldays[-2].values()), len(alldays[-2].values()) + len(alldays[-1].values()))
        plt.plot(xranges, alldays[-1].values(), label="actual demand")
        plt.plot(alldays[-2].values(), label="last day's usage")
        plt.plot(xranges, pred.values(), label="prediction")
        plt.legend()
        return mod

res_milk = plot_forecast(model_milk, pd.Timestamp('2021-08-01'), pd.Timestamp('2021-08-11'), "Canal St & Adams St")
res_interp = plot_forecast(model_interpretable, pd.Timestamp('2021-08-01'), pd.Timestamp('2021-09-10'), "Canal St & Adams St")
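# Hedged evaluation sketch (illustrative): plot_forecast only plots, so to put a
# number on the forecast we can recompute the windows and score with darts' smape
# (imported at the top but unused so far). Station/dates mirror the res_milk call
# above. Caveat: sMAPE is undefined at points where both series are exactly 0.
_alldays = list(delta1day("Canal St & Adams St", pd.Timestamp('2021-08-01'), pd.Timestamp('2021-08-11')))
_pred = model_milk.predict(n=len(_alldays[-1]), series=_alldays[-2])
# re-anchor both series at t=0 so their integer time indices overlap for the metric
_actual_ts = TimeSeries.from_values(_alldays[-1].values())
_pred_ts = TimeSeries.from_values(_pred.values())
print("sMAPE on the held-out day:", smape(_actual_ts, _pred_ts))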