# The Convote corpus should already be downloaded and extracted, so the
# commands that fetch it are left commented out:
# !curl -O http://www.cs.cornell.edu/home/llee/data/convote/convote_v1.1.tar.gz
# !tar -zxvf convote_v1.1.tar.gz

import glob
import os
import re
import subprocess

import pandas as pd

# Every file in the development set becomes one row of `speeches_df`.
paths = glob.glob('convote_v1.1/data_stage_one/development_set/*')

speeches = []
for path in paths:
    speech = {}

    # The metadata fields are sliced from fixed positions in the file name.
    # Using the base name (rather than the last 26 characters of the path)
    # yields the same value for 26-character names without depending on
    # that exact length.
    filename = os.path.basename(path)
    speech['filename'] = filename
    speech['bill no'] = filename[:3]
    speech['speaker no'] = filename[4:10]
    speech['bill vote'] = filename[-5]
    speech['party'] = filename[-7]

    # Read inside a context manager so the file handle is always closed.
    with open(path, 'r') as speech_file:
        speech['contents'] = speech_file.read()

    # Drop everything that is neither whitespace nor a word character, then
    # collapse whitespace runs. Newlines and tabs are kept as separators so
    # that words on adjacent lines are not glued together.
    cleaned_contents = re.sub(r"[^\s\w]", '', speech['contents'])
    cleaned_contents = re.sub(r"\s+", ' ', cleaned_contents)
    cleaned_contents = cleaned_contents.strip()

    # split() with no argument returns [] for an empty string, so a speech
    # with no words gets a word count of 0 instead of a phantom '' token.
    tokens = cleaned_contents.split()

    speech['tokenized contents'] = tokens
    speech['word count'] = len(tokens)
    speeches.append(speech)

speeches_df = pd.DataFrame(speeches)
speeches_df[:5]

# If you get an "n_samples=1 should be >= n_clusters=4" error, make sure
# you are using *two sets of square brackets* around the column name.

# Download and extract the zipcode archive. These were IPython shell escapes
# (`!curl ...`, `!unzip ...`), which are not valid in a plain Python file.
# As with the shell escapes, a non-zero exit status is not treated as an error.
subprocess.run(['curl', '-O', 'http://www.boutell.com/zipcodes/zipcode.zip'])
subprocess.run(['unzip', 'zipcode.zip'])

# NOTE(review): `zipcodes` is not defined anywhere in this section; it is
# presumably a DataFrame loaded from the extracted archive elsewhere. Confirm.

# pd.isnull checks whether latitude is None or NaN.
zipcodes[pd.isnull(zipcodes["latitude"])]

# The ~ means 'not', so this keeps the zipcodes whose latitude is not null.
cleaned_zipcodes = zipcodes[~pd.isnull(zipcodes["latitude"])]