#!/usr/bin/env python # coding: utf-8 # # How Many Transit Riders in California have Autos? # # Questioner: Gillian Gillett # March 23, 2022 # ## Setup Environment # # ! Warning: will install libraries into current environment. # In[1]: import logging logging.basicConfig(level=logging.DEBUG) logging.debug("test") # In[100]: try: import pandas as pd import seaborn as sns import matplotlib.pyplot as plt except: logging.info('pandas seaborn not found. Will try and install into current environment') get_ipython().system(' conda install pandas seaborn') import pandas as pd import seaborn as sns import matplotlib.pyplot as plt pd.set_option("display.max.columns", None) # In[3]: import os WORKING_DIR = os.path.dirname(os.getcwd()) logging.info(f"Working directory: {WORKING_DIR}") # ## National Household Travel Survey # # Data Codebook: [https://nhts.ornl.gov/tables09/CodebookBrowser.aspx](https://nhts.ornl.gov/tables09/CodebookBrowser.aspx) # # Relevant variables: # # - HOUSEID Household Identifier # - HBHUR Urban / Rural indicator - Block group # - HHFAMINC Household income # - HHSIZE Count of household members # - HHSTATE Household state # - HHVEHCNT Count of Household vehicles # - WRKCOUNT Number of workers in household # - WTHHFIN Final HH weight # # - CAR Frequency of Personal Vehicle Use for Travel # - BUS Frequency of Bus Use for Travel # - PARA Frequency of Paratransit Use for Travel # - TAXI Frequency of Taxi Service or Rideshare Use for Travel # - [WALK](https://nhts.ornl.gov/tables09/CodebookPage.aspx?id=1365) Frequency of Walk Use for Travel # - TRAIN Frequency of Train Use for Travel # # - PLACE Travel is a Financial Burden # - PTRANS Public Transportation to Reduce Financial Burden of Travel # - WALK2SAVE Walk to Reduce Financial Burden of Travel # ### Import Data # # Assumes you have downloaded and unzipped NHTS data and weights into `csv` and `ReplicatesCSV` folders respectfully. # In[150]: NHTS_DATA_DIR = os.path.join(os.path.dirname(WORKING_DIR), 'NHTS') logging.info(f"NHTS Data directory: {NHTS_DATA_DIR}") # In[81]: cols_to_keep = [ 'HOUSEID', 'HHSIZE', 'HHVEHCNT', 'HHFAMINC', 'BUS', 'TRAIN', 'WTHHFIN', 'PLACE', 'HHSTATE', ] hh_all_df = pd.read_csv( os.path.join(NHTS_DATA_DIR, 'csv','hhpub.csv'), usecols=cols_to_keep, ) hh_all_df = hh_all_df[hh_all_df['HHSTATE'] == 'CA'] hh_all_df.head() # ### Analyze NHTS Data # # #### Recode Variables # In[130]: def is_transit_user(x): _FREQ_TRANSIT = [1,2,3] _INFREQ_TRANSIT = [4,5] # UNKOWN if int(x['BUS']) < 0 or int(x['TRAIN']) < 0: return "Unknown" # NO if int(x['BUS']) == 5 and int(x['TRAIN']) == 5: return "No Transit Use" # YES if int(x['BUS']) in _FREQ_TRANSIT or int(x['TRAIN']) in _FREQ_TRANSIT: return "Frequent Transit" if int(x['BUS']) in _FREQ_TRANSIT+_INFREQ_TRANSIT or int(x['TRAIN']) in _FREQ_TRANSIT+_INFREQ_TRANSIT: return "Infrequent Transit" else: logging.debug(f"Unable to process row for is_transit_user:\n {x}") raise Exception(f'Unable to determine if transit user for row: {x}') def has_hh_veh(x): # UNKOWN if int(x['HHVEHCNT']) < 0: return "Unknown" # NO if int(x['HHVEHCNT']) == 0: return "No Vehicle" # YES if int(x['HHVEHCNT']) > 0: return "Has vehicle" else: logging.debug(f"Unable to process row for has_hh_veh:\n {x}") raise Exception(f'Unable to determine if household has vehicles for row: {x}') def travel_burden_hh(x): BURDEN = [1,2] NOT_BURDEN = [3,4,5] # UNKOWN if int(x['PLACE']) < 0: return "Unknown" # NO if int(x['PLACE']) in NOT_BURDEN: return "Not Burdened" # YES if int(x['PLACE']) in BURDEN: return "Burdened" else: logging.debug(f"Unable to process row for travel_burden_hh:\n {x}") raise Exception(f'Unable to determine if household has financial burden to travel for row: {x}') def filter_recs(x): FILTER_COLS = ['transit_hh','vehicle_hh','burden_hh'] FILTER_VALUE = "Unknown" if FILTER_VALUE in [x[c] for c in FILTER_COLS]: return -1 return 1 # In[131]: analysis_cols = ['transit_hh','vehicle_hh','burden_hh'] hh_all_df['transit_hh'] = hh_all_df.apply(lambda x: is_transit_user(x), axis=1) hh_all_df['vehicle_hh'] = hh_all_df.apply(lambda x: has_hh_veh(x), axis=1) hh_all_df['burden_hh'] = hh_all_df.apply(lambda x: travel_burden_hh(x), axis=1) hh_all_df['keep'] = hh_all_df.apply(lambda x: filter_recs(x) ,axis=1) logging.debug(f"Excluded records:\n{hh_all_df[hh_all_df['keep']<0][analysis_cols].head()}") hh_df = hh_all_df[hh_all_df['keep']>0] for c in analysis_cols: hh_df[c]= pd.Categorical( hh_df[c], ordered = True, ) logging.debug(f"Cleaned records:\n{hh_df[analysis_cols].head()}") recs_exc = len(hh_all_df)-len(hh_df) logging.info(f"Cleaned records filtered to exclude {recs_exc}({round(100*recs_exc/len(hh_all_df),1)}%) of {len(hh_all_df)} records") hh_all_df # In[194]: hh_df_3way_ct = pd.crosstab( [hh_df['burden_hh'],hh_df['vehicle_hh']], hh_df['transit_hh'], values = hh_df['WTHHFIN'], aggfunc = sum, normalize='index', margins = True, rownames=['HHold Transportation Burden','HHold Vehicles'], colnames=['HHold Transit Use in California (Source: NHTS 2017)'], ) s_hh_df_3way_ct=hh_df_3way_ct.style\ .background_gradient()\ .format("{:.1%}") s_hh_df_3way_ct # In[195]: hh_df_ct = pd.crosstab( hh_df['vehicle_hh'], hh_df['transit_hh'], values = hh_df['WTHHFIN'], aggfunc = sum, normalize='index', margins = True, rownames=['HHold Vehicles'], colnames=['HHold Transit Use in California (Source: NHTS 2017)'], ) s_hh_df_ct=hh_df_ct.style\ .background_gradient()\ .format("{:.1%}") s_hh_df_ct # In[196]: hh_df_ct_r = pd.crosstab( hh_df['transit_hh'], hh_df['vehicle_hh'], values = hh_df['WTHHFIN'], aggfunc = sum, normalize='index', margins = True, colnames=['HHold Vehicles'], rownames=['HHold Transit Use in California (Source: NHTS 2017)'], ) s_hh_df_ct_r=hh_df_ct_r.style\ .background_gradient()\ .format("{:.1%}") s_hh_df_ct_r # ## Export Results # In[207]: # Export to markdown try: import tabulate except: get_ipython().system('pip install tabulate') with open("results.md","w") as f: f.write("\n\n**Which transit-using households have vehicles?**\n\n") f.write(hh_df_ct.to_markdown(floatfmt=".1%")) f.write("\n\n**Which vehicle-owning households ride transit?**\n\n") f.write(hh_df_ct_r.to_markdown(floatfmt=".1%")) f.write("\n\n**Does it vary among households who are financially burdened by transportation?**\n\n") f.write(hh_df_3way_ct.to_markdown(floatfmt=".1%")) # In[209]: ## Exporting to images try: import dataframe_image as dfi except: get_ipython().system(' pip install dataframe_image') import dataframe_image as dfi dfi.export(s_hh_df_ct,"hh_df_ct.png") dfi.export(s_hh_df_ct_r,"hh_df_ct_r.png") dfi.export(s_hh_df_3way_ct,"hh_df_3way_ct.png") # In[113]: # Plot Chord Chart # Currently getting an error with this >:( try: from chord import Chord except: get_ipython().system(' pip install chord') from chord import Chord names=list(hh_df_ct.columns) matrix = hh_df_ct.values.tolist() ch=Chord(hh_df_ct,names) # In[125]: # Plot Tree Map # This is really ugly >:( try: import squarify except: get_ipython().system(' pip install squarify') import squarify matrix = hh_df_ct.values.tolist() values = [item for sublist in matrix for item in sublist] names=list(hh_df_ct.columns) fig, ax = plt.subplots(1, figsize = (12,12)) squarify.plot(sizes=values, label=names, alpha=.8 ) plt.axis('off') plt.show()