# You should have this downloaded & extracted already, so I've commented it out
# !curl -O http://www.cs.cornell.edu/home/llee/data/convote/convote_v1.1.tar.gz
# !tar -zxvf convote_v1.1.tar.gz

import glob
import os
import re

import pandas as pd

def tokenize_speech(contents):
    """Return the list of word tokens in *contents*.

    Every character that is neither a word character nor whitespace is
    removed, then the remaining text is split on runs of whitespace.
    """
    # Keep all whitespace (not only spaces) while stripping punctuation, so a
    # line break still separates the last word of one line from the first word
    # of the next instead of gluing them into a single token.
    without_punctuation = re.sub(r"[^\s\w]", '', contents)
    # split() with no argument collapses whitespace runs and returns [] for
    # empty text, so an empty speech gets a word count of 0 rather than 1.
    return without_punctuation.split()


def load_speech(path):
    """Read one speech file and return a dict describing it.

    The metadata fields are sliced out of the file's base name by position:
    'bill no' is characters 0-2, 'speaker no' is characters 4-9, 'party' is
    the 7th character from the end and 'bill vote' the 5th from the end.
    NOTE(review): this assumes the fixed-width convote file naming scheme.
    """
    filename = os.path.basename(path)

    # The context manager closes the file handle as soon as it has been read.
    with open(path, 'r') as speech_file:
        contents = speech_file.read()

    tokens = tokenize_speech(contents)
    return {
        'filename': filename,
        'bill no': filename[:3],
        'speaker no': filename[4:10],
        'bill vote': filename[-5],
        'party': filename[-7],
        'contents': contents,
        'tokenized contents': tokens,
        'word count': len(tokens),
    }


paths = glob.glob('convote_v1.1/data_stage_one/development_set/*')

# One dict per speech file; each dict becomes one row of the DataFrame.
speeches = [load_speech(path) for path in paths]

speeches_df = pd.DataFrame(speeches)
speeches_df[:5]


# If you get an "n_samples=1 should be >= n_clusters=4" error,
# make sure you're using *two sets of square brackets*
# around the column name


# Download zipcode.zip into the current directory, then extract it.
# The leading ! makes IPython/Jupyter run the line as a shell command.
!curl -O http://www.boutell.com/zipcodes/zipcode.zip
!unzip zipcode.zip


# Boolean mask: True wherever latitude is missing (None or NaN)
latitude_missing = pd.isnull(zipcodes["latitude"])

# Show the rows that have no latitude
zipcodes[latitude_missing]

# ~ inverts the mask, keeping only the rows that do have a latitude
cleaned_zipcodes = zipcodes[~latitude_missing]