"""Explore and clean the 'Incident Zip' column of the 311 service-requests CSV.

Running this file as a script walks through the cleanup step by step and
prints each intermediate result.  The reusable pieces are `fix_zip_codes`
and `is_far_zip`, which can be imported without touching the data file.
"""

# The usual preamble
import numpy as np
import pandas as pd

# NOTE: the original preamble also set pandas' 'display.mpl_style' option and
# called the pylab-only figsize(15, 5).  That option was removed from pandas,
# figsize is undefined outside a %pylab session, and nothing in this script
# draws a plot, so both are left out.

# Always display all the columns
# ('display.width' is the current name of the old 'display.line_width' option.)
pd.set_option('display.width', 5000)
pd.set_option('display.max_columns', 60)

DATA_PATH = '../data/311-service-requests.csv'
ZIP_COLUMN = 'Incident Zip'
# Placeholder strings that should be read as missing values.
NA_VALUES = ['NO CLUE', 'N/A', '0']


def _read_requests(path=DATA_PATH):
    """Read the CSV with placeholder values as NaN and zip codes kept as strings."""
    return pd.read_csv(path, na_values=NA_VALUES, dtype={ZIP_COLUMN: str})


def fix_zip_codes(zips):
    """Return a cleaned copy of a Series of zip-code strings.

    Parameters
    ----------
    zips : pandas.Series
        Zip codes as strings (missing values allowed).

    Returns
    -------
    pandas.Series
        A new Series in which every value is truncated to its first five
        characters and the resulting '00000' entries are replaced by NaN.
        The input Series is not modified.
    """
    # Truncate everything to length 5
    zips = zips.str.slice(0, 5)
    # Set 00000 zip codes to nan
    return zips.mask(zips == '00000', np.nan)


def is_far_zip(zips):
    """Return a boolean mask of zip codes that start with neither '0' nor '1'.

    Let's say the zips starting with '0' and '1' are okay, for now.
    (this isn't actually true -- 13221 is in Syracuse, and why?)

    Missing values are not interesting here, so they count as "close" and
    come out as False.  `na=True` is passed to `startswith` directly: filling
    NaNs only after combining the two masks with `|` is too late, because
    `|` already turns NaN into False on object-dtype input.
    """
    is_close = (zips.str.startswith('0', na=True)
                | zips.str.startswith('1', na=True))
    return ~is_close


def main():
    """Walk through the zip-code cleanup, printing each intermediate result."""
    # First look: read with pandas' defaults to see what the column contains.
    requests = pd.read_csv(DATA_PATH)
    print(requests[ZIP_COLUMN].unique())

    # Re-read with the NA placeholders and with zip codes forced to strings.
    requests = _read_requests()
    print(requests[ZIP_COLUMN].unique())

    # Zip codes containing a dash.
    rows_with_dashes = requests[ZIP_COLUMN].str.contains('-', na=False)
    print(len(requests[rows_with_dashes]))
    print(requests[rows_with_dashes])

    # Zip codes longer than five characters, then truncate the whole column.
    long_zip_codes = requests[ZIP_COLUMN].str.len() > 5
    print(requests.loc[long_zip_codes, ZIP_COLUMN].unique())
    requests[ZIP_COLUMN] = requests[ZIP_COLUMN].str.slice(0, 5)

    # '00000' is not a real zip code: inspect those rows, then blank them out.
    zero_zips = requests[ZIP_COLUMN] == '00000'
    print(requests[zero_zips])
    # .loc writes into the frame itself; chained indexing may hit a copy.
    requests.loc[zero_zips, ZIP_COLUMN] = np.nan

    # NaN is dropped before sorting: str and float do not compare on Python 3.
    unique_zips = sorted(requests[ZIP_COLUMN].dropna().unique())
    print(unique_zips)

    zips = requests[ZIP_COLUMN]
    is_far = is_far_zip(zips)
    print(zips[is_far])
    far_columns = [ZIP_COLUMN, 'Descriptor', 'City']
    print(requests.loc[is_far, far_columns].sort_values(ZIP_COLUMN))

    print(requests['City'].str.upper().value_counts())

    # Putting it together: start from a fresh read and apply the cleanup once.
    requests = _read_requests()
    requests[ZIP_COLUMN] = fix_zip_codes(requests[ZIP_COLUMN])
    print(requests[ZIP_COLUMN].unique())


if __name__ == '__main__':
    main()