import re
import itertools
import string
import csv

# Open the raw text and split over newlines
FILE = open('/bigdrive/Documents/MSAN622_Data_Visualization/msan622/project-prototype/Catch-22.txt')
data = FILE.read()
FILE.close()
data = data.split("\n")

# Find chapters, skipping the intro and appendix
chapters = {}
key = False
for line in data:
    # Find chapter markers and make a new dictionary entry
    if re.match(r'^[0-9]+ [A-Za-z0-9-&\'. ]+$', line) is not None:
        key = int(line[0:2])
        chapters[key] = []
    # If we are inside a chapter, append the lowercased, punctuation-stripped words
    elif key:
        chapters[key].append(line.lower().translate(string.maketrans("", ""), string.punctuation).split())
    # Stop at the end of the book
    if line == 'APPENDIX':
        break

# Flatten each chapter's broken lists into one total word list per chapter
for chapter in chapters:
    chapters[chapter] = list(itertools.chain(*chapters[chapter]))

# Now look for occurrences of the main characters in the book
char_names = {'yossarian': "Yossarian", 'chaplain': "Chaplain Tappman",
              'milo': "Milo Minderbinder", 'cathcart': "Colonel Cathcart",
              'korn': "Colonel Korn", 'nately': "Nately", 'orr': "Orr",
              'major': "Major Major Major Major", 'dunbar': "Dunbar",
              'daneeka': "Doc Daneeka", 'joe': "Hungry Joe",
              'clevinger': "Clevinger", 'aarfy': "Aarfy",
              'dreedle': "General Dreedle", 'danby': "Major Danby",
              'mcwatt': "McWatt", 'scheisskopf': "General Scheisskopf",
              'peckem': "General Peckem", 'dobbs': "Dobbs",
              'whitcomb': "Corporal Whitcomb", 'black': "Captain Black",
              'halfoat': "Chief White Halfoat", 'duckett': "Nurse Duckett",
              'coverley': "Major — de Coverley",
              'wintergreen': "ex-P.F.C. Wintergreen", 'appleby': "Appleby",
              'havermeyer': "Havermeyer", 'snowden': "Snowden"}
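
# Illustrative sanity check of the parsing above (not part of the pipeline).
# '1 THE TEXAN' stands in for a chapter heading and the sample sentence is the
# novel's opening line; the exact formatting of the source text file is an
# assumption here.
assert re.match(r'^[0-9]+ [A-Za-z0-9-&\'. ]+$', '1 THE TEXAN') is not None
assert re.match(r'^[0-9]+ [A-Za-z0-9-&\'. ]+$', 'It was love at first sight.') is None
sample = 'It was love at first sight.'
print sample.lower().translate(string.maketrans("", ""), string.punctuation).split()
# -> ['it', 'was', 'love', 'at', 'first', 'sight']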

# Loop through characters and chapters, indexing an appearance by its
# percentile position within a chapter, i.e. the 1st percentile of
# chapter 2 is encoded as 2.01
characters = {character: [] for character in char_names}
for character in characters:
    for chapter in chapters:
        length = len(chapters[chapter])
        # Special handling for Major Major Major Major: match the bigram
        # ('major', 'major'), then keep only the last index of each
        # consecutive run so the full name counts as one appearance
        if character == 'major':
            b = ['major', 'major']
            location = [i for i in range(len(chapters[chapter])) if chapters[chapter][i:i + len(b)] == b]
            location.append(0)
            location = [location[i] for i in range(len(location) - 1) if location[i] != location[i + 1] - 1]
            location = [(chapter + (float(x) / length)) for x in location]
        # Special handling for Captain Black: match the bigram ('captain', 'black')
        elif character == 'black':
            b = ['captain', 'black']
            location = [(chapter + (float(i) / length)) for i in range(len(chapters[chapter])) if chapters[chapter][i:i + len(b)] == b]
        else:
            location = [(chapter + (float(i) / length)) for i, x in enumerate(chapters[chapter]) if x == character]
        characters[character].append(location)
    # Flatten the broken lists, remove duplicates (only relevant if binning locations) and sort
    characters[character] = sorted(list(set(list(itertools.chain(*characters[character])))))

# Print a summary of the number of appearances; this can be used to limit
# the character dictionary to those appearing 50+ times
for char in sorted(characters):
    print char, len(characters[char])

# Now load it into a melted CSV file with characters and their appearance times
with open('catch22.csv', 'wb') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    headers = ['Character', 'Chapter']
    csvwriter.writerow(headers)
    for character in characters:
        for location in characters[character]:
            this_row = [char_names[character], location]
            csvwriter.writerow(this_row)

# Now look for occurrences of the main locations visited in the book
locations = {'pianosa': 'Pianosa, Italy', 'rome': 'Rome, Italy',
             'smyrna': 'Smyrna, Turkey', 'corsica': 'Corsica, France',
             'parma': 'Parma, Italy', 'salerno': 'Salerno, Italy',
             'marrakech': 'Marrakech, Morocco', 'malta': 'Valletta, Malta',
             'cairo': 'Cairo, Egypt', 'sicily': 'Sicily, Italy',
             'istanbul': 'Istanbul, Turkey', 'etna': 'Mt Etna, Italy',
             'vesuvius': 'Mt Vesuvius, Italy', 'palermo': 'Palermo, Italy',
             'catania': 'Catania, Italy', 'oran': 'Oran, Algeria',
             'beirut': 'Beirut, Lebanon', 'bengasi': 'Bengasi, Libya',
             'sardinia': 'Sardinia, Italy', 'barcelona': 'Barcelona, Spain',
             'leghorn': 'Livorno, Italy', 'marseilles': 'Marseilles, France',
             'spezia': 'Spezia, Italy', 'majorca': 'Majorca, Spain',
             'elba': 'Elba, Italy', 'ferrara': 'Ferrara, Italy',
             'bologna': 'Bologna, Italy', 'arezzo': 'Arezzo, Italy',
             'avignon': 'Avignon, France'}

# Use OpenStreetMap's Nominatim service to geocode the cities
from geopy.geocoders import Nominatim
geolocator = Nominatim(timeout=10)
loc_geo = {}
for locale in sorted(locations):
    address, (latitude, longitude) = geolocator.geocode(locations[locale])
    loc_geo[locale] = (latitude, longitude)
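
# Nominatim is a network service, so lookups can time out, rate-limit, or
# return None for unmatched queries; the loop above would crash on a miss.
# A hedged sketch of a more defensive lookup (the retry count and back-off
# are arbitrary choices, not part of the original pipeline):
import time

def geocode_with_retry(geolocator, query, retries=2, wait=2):
    """Return (lat, lon) for query, or None if every attempt fails."""
    for _ in range(retries + 1):
        result = geolocator.geocode(query)
        if result is not None:
            address, (latitude, longitude) = result
            return (latitude, longitude)
        time.sleep(wait)  # back off before retrying
    return None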

# Loop through locations and chapters, indexing a mention by its percentile
# position within a chapter, i.e. the 1st percentile of chapter 2 is
# encoded as 2.01
loc_times = {locale: [] for locale in locations}
for locale in locations:
    for chapter in chapters:
        length = len(chapters[chapter])
        location = [(chapter + (float(i) / length)) for i, x in enumerate(chapters[chapter]) if x == locale]
        loc_times[locale].append(location)
    # Flatten the broken lists, remove duplicates (only relevant if binning locations) and sort
    loc_times[locale] = sorted(list(set(list(itertools.chain(*loc_times[locale])))))

# Now load it into a melted CSV file with locations, their mention times,
# and the geocoded coordinates
with open('catch22geo.csv', 'wb') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    headers = ['Location', 'Time', 'Lat', 'Lon']
    csvwriter.writerow(headers)
    for locale in sorted(locations):
        for t in loc_times[locale]:
            this_line = [locale, t, loc_geo[locale][0], loc_geo[locale][1]]
            csvwriter.writerow(this_line)

import nltk
from nltk.tag.simplify import simplify_wsj_tag  # NLTK 2.x; removed in NLTK 3

# Now look for the words surrounding our main character
yo_words = {'words': [], 'locs': []}
for chapter in chapters:
    length = len(chapters[chapter])
    location = [i for i, x in enumerate(chapters[chapter]) if x == 'yossarian']
    # Expand each hit to a window of 25 words on either side; this just gets indexes
    locations = [range(max(0, (i - 25)), min(len(chapters[chapter]), (i + 26))) for i in location]
    # Remove duplicates from overlapping windows
    locations = list(set(list(itertools.chain(*locations))))
    # Grab the words and store them to the dictionary
    words = [chapters[chapter][i] for i in locations]
    locations = [(chapter + (float(x) / length)) for x in locations]
    yo_words['words'].append(words)
    yo_words['locs'].append(locations)

# Clean up the broken lists
yo_words['words'] = list(itertools.chain(*yo_words['words']))
yo_words['locs'] = list(itertools.chain(*yo_words['locs']))

# POS-tag the words and simplify the Penn Treebank tags
yo_words['words'] = nltk.pos_tag(yo_words['words'])
yo_words['words'] = [(word, simplify_wsj_tag(tag)) for word, tag in yo_words['words']]

# Filter out stopwords plus a few common conversational fillers
from nltk.corpus import stopwords
stop = stopwords.words('english')
stop.extend(('said', 'thats', 'im', 'dont', 'got', 'get', 'say', 'youre'))

# Now load it into a melted CSV file with word, POS type and mention times
with open('catch22pos.csv', 'wb') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    headers = ['Word', 'Time', 'POS']
    csvwriter.writerow(headers)
    for i in range(len(yo_words['locs'])):
        if yo_words['words'][i][0] not in stop and yo_words['words'][i][0] not in char_names:
            this_line = [yo_words['words'][i][0], yo_words['locs'][i], yo_words['words'][i][1]]
            csvwriter.writerow(this_line)
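
# Optional read-back check of one of the CSVs written above (file names match
# those used earlier; pandas would work just as well for this):
with open('catch22.csv', 'rb') as csvfile:
    reader = csv.reader(csvfile)
    header = next(reader)  # ['Character', 'Chapter']
    rows = list(reader)
print header, len(rows)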