import re
import itertools
import string
import csv

# Open the raw text and split over newlines
FILE = open('/bigdrive/Documents/MSAN622_Data_Visualization/msan622/project-prototype/Catch-22.txt')
data = FILE.read()
FILE.close()
data = data.split("\n")

# Find chapters, skipping the intro and appendix
chapters = {}
key = False
for line in data:
    # Find chapter markers and make a new dictionary entry
    if re.match(r'^[0-9]+ [A-Za-z0-9-&\'. ]+$', line) is not None:
        key = int(line[0:2])
        chapters[key] = []
    # If we are inside a chapter, append the lowercased, punctuation-stripped words
    elif key:
        chapters[key].append(line.lower().translate(string.maketrans("", ""), string.punctuation).split())
    # Stop at the end of the book
    if line == 'APPENDIX':
        break

# Flatten each chapter's broken lists into one total word list per chapter
for chapter in chapters:
    chapters[chapter] = list(itertools.chain(*chapters[chapter]))

# Now look for occurrences of the main characters in the book
char_names = {'yossarian': "Yossarian", 'chaplain': "Chaplain Tappman",
              'milo': "Milo Minderbinder", 'cathcart': "Colonel Cathcart",
              'korn': "Colonel Korn", 'nately': "Nately", 'orr': "Orr",
              'major': "Major Major Major Major", 'dunbar': "Dunbar",
              'daneeka': "Doc Daneeka", 'joe': "Hungry Joe",
              'clevinger': "Clevinger", 'aarfy': "Aarfy",
              'dreedle': "General Dreedle", 'danby': "Major Danby",
              'mcwatt': "McWatt", 'scheisskopf': "General Scheisskopf",
              'peckem': "General Peckem", 'dobbs': "Dobbs",
              'whitcomb': "Corporal Whitcomb", 'black': "Captain Black",
              'halfoat': "Chief White Halfoat", 'duckett': "Nurse Duckett",
              'coverley': "Major — de Coverley",
              'wintergreen': "ex-P.F.C. Wintergreen", 'appleby': "Appleby",
              'havermeyer': "Havermeyer", 'snowden': "Snowden"}
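
# Illustrative sanity check of the parsing above (not part of the pipeline).
# '1 THE TEXAN' stands in for a chapter heading and the sample sentence is the
# novel's opening line; the exact formatting of the source text file is an
# assumption here.
assert re.match(r'^[0-9]+ [A-Za-z0-9-&\'. ]+$', '1 THE TEXAN') is not None
assert re.match(r'^[0-9]+ [A-Za-z0-9-&\'. ]+$', 'It was love at first sight.') is None
sample = 'It was love at first sight.'
print sample.lower().translate(string.maketrans("", ""), string.punctuation).split()
# -> ['it', 'was', 'love', 'at', 'first', 'sight']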

# Loop through characters and chapters, indexing an appearance by its
# percentile position within a chapter, i.e. the 1st percentile of
# chapter 2 is encoded as 2.01
characters = {character: [] for character in char_names}
for character in characters:
    for chapter in chapters:
        length = len(chapters[chapter])
        # Special handling for Major Major Major Major: match the bigram
        # ('major', 'major'), then keep only the last index of each
        # consecutive run so the full name counts as one appearance
        if character == 'major':
            b = ['major', 'major']
            location = [i for i in range(len(chapters[chapter])) if chapters[chapter][i:i + len(b)] == b]
            location.append(0)
            location = [location[i] for i in range(len(location) - 1) if location[i] != location[i + 1] - 1]
            location = [(chapter + (float(x) / length)) for x in location]
        # Special handling for Captain Black: match the bigram ('captain', 'black')
        elif character == 'black':
            b = ['captain', 'black']
            location = [(chapter + (float(i) / length)) for i in range(len(chapters[chapter])) if chapters[chapter][i:i + len(b)] == b]
        else:
            location = [(chapter + (float(i) / length)) for i, x in enumerate(chapters[chapter]) if x == character]
        characters[character].append(location)
    # Flatten the broken lists, remove duplicates (only relevant if binning locations) and sort
    characters[character] = sorted(list(set(list(itertools.chain(*characters[character])))))

# Print a summary of the number of appearances; this can be used to limit
# the character dictionary to those appearing 50+ times
for char in sorted(characters):
    print char, len(characters[char])

# Now load it into a melted CSV file with characters and their appearance times
with open('catch22.csv', 'wb') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    headers = ['Character', 'Chapter']
    csvwriter.writerow(headers)
    for character in characters:
        for location in characters[character]:
            this_row = [char_names[character], location]
            csvwriter.writerow(this_row)

# Now look for occurrences of the main locations visited in the book
locations = {'pianosa': 'Pianosa, Italy', 'rome': 'Rome, Italy',
             'smyrna': 'Smyrna, Turkey', 'corsica': 'Corsica, France',
             'parma': 'Parma, Italy', 'salerno': 'Salerno, Italy',
             'marrakech': 'Marrakech, Morocco', 'malta': 'Valletta, Malta',
             'cairo': 'Cairo, Egypt', 'sicily': 'Sicily, Italy',
             'istanbul': 'Istanbul, Turkey', 'etna': 'Mt Etna, Italy',
             'vesuvius': 'Mt Vesuvius, Italy', 'palermo': 'Palermo, Italy',
             'catania': 'Catania, Italy', 'oran': 'Oran, Algeria',
             'beirut': 'Beirut, Lebanon', 'bengasi': 'Bengasi, Libya',
             'sardinia': 'Sardinia, Italy', 'barcelona': 'Barcelona, Spain',
             'leghorn': 'Livorno, Italy', 'marseilles': 'Marseilles, France',
             'spezia': 'Spezia, Italy', 'majorca': 'Majorca, Spain',
             'elba': 'Elba, Italy', 'ferrara': 'Ferrara, Italy',
             'bologna': 'Bologna, Italy', 'arezzo': 'Arezzo, Italy',
             'avignon': 'Avignon, France'}

# Use OpenStreetMap's Nominatim service to geocode the cities
from geopy.geocoders import Nominatim
geolocator = Nominatim(timeout=10)
loc_geo = {}
for locale in sorted(locations):
    address, (latitude, longitude) = geolocator.geocode(locations[locale])
    loc_geo[locale] = (latitude, longitude)
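
# Nominatim is a network service, so lookups can time out, rate-limit, or
# return None for unmatched queries; the loop above would crash on a miss.
# A hedged sketch of a more defensive lookup (the retry count and back-off
# are arbitrary choices, not part of the original pipeline):
import time

def geocode_with_retry(geolocator, query, retries=2, wait=2):
    """Return (lat, lon) for query, or None if every attempt fails."""
    for _ in range(retries + 1):
        result = geolocator.geocode(query)
        if result is not None:
            address, (latitude, longitude) = result
            return (latitude, longitude)
        time.sleep(wait)  # back off before retrying
    return None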

# Loop through locations and chapters, indexing a mention by its percentile
# position within a chapter, i.e. the 1st percentile of chapter 2 is
# encoded as 2.01
loc_times = {locale: [] for locale in locations}
for locale in locations:
    for chapter in chapters:
        length = len(chapters[chapter])
        location = [(chapter + (float(i) / length)) for i, x in enumerate(chapters[chapter]) if x == locale]
        loc_times[locale].append(location)
    # Flatten the broken lists, remove duplicates (only relevant if binning locations) and sort
    loc_times[locale] = sorted(list(set(list(itertools.chain(*loc_times[locale])))))

# Now load it into a melted CSV file with locations, their mention times,
# and the geocoded coordinates
with open('catch22geo.csv', 'wb') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    headers = ['Location', 'Time', 'Lat', 'Lon']
    csvwriter.writerow(headers)
    for locale in sorted(locations):
        for t in loc_times[locale]:
            this_line = [locale, t, loc_geo[locale][0], loc_geo[locale][1]]
            csvwriter.writerow(this_line)

import nltk
from nltk.tag.simplify import simplify_wsj_tag  # NLTK 2.x; removed in NLTK 3

# Now look for the words surrounding our main character
yo_words = {'words': [], 'locs': []}
for chapter in chapters:
    length = len(chapters[chapter])
    location = [i for i, x in enumerate(chapters[chapter]) if x == 'yossarian']
    # Expand each hit to a window of 25 words on either side; this just gets indexes
    locations = [range(max(0, (i - 25)), min(len(chapters[chapter]), (i + 26))) for i in location]
    # Remove duplicates from overlapping windows
    locations = list(set(list(itertools.chain(*locations))))
    # Grab the words and store them to the dictionary
    words = [chapters[chapter][i] for i in locations]
    locations = [(chapter + (float(x) / length)) for x in locations]
    yo_words['words'].append(words)
    yo_words['locs'].append(locations)

# Clean up the broken lists
yo_words['words'] = list(itertools.chain(*yo_words['words']))
yo_words['locs'] = list(itertools.chain(*yo_words['locs']))

# POS-tag the words and simplify the Penn Treebank tags
yo_words['words'] = nltk.pos_tag(yo_words['words'])
yo_words['words'] = [(word, simplify_wsj_tag(tag)) for word, tag in yo_words['words']]

# Filter out stopwords plus a few common conversational fillers
from nltk.corpus import stopwords
stop = stopwords.words('english')
stop.extend(('said', 'thats', 'im', 'dont', 'got', 'get', 'say', 'youre'))

# Now load it into a melted CSV file with word, POS type and mention times
with open('catch22pos.csv', 'wb') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    headers = ['Word', 'Time', 'POS']
    csvwriter.writerow(headers)
    for i in range(len(yo_words['locs'])):
        if yo_words['words'][i][0] not in stop and yo_words['words'][i][0] not in char_names:
            this_line = [yo_words['words'][i][0], yo_words['locs'][i], yo_words['words'][i][1]]
            csvwriter.writerow(this_line)
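
# Optional read-back check of one of the CSVs written above (file names match
# those used earlier; pandas would work just as well for this):
with open('catch22.csv', 'rb') as csvfile:
    reader = csv.reader(csvfile)
    header = next(reader)  # ['Character', 'Chapter']
    rows = list(reader)
print header, len(rows)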