import os
import sys
import csv
import codecs
import collections
import glob
import graf
import scipy.sparse
import networkx

os.chdir("/Users/pbouda/Projects/git-github/notebooks/polysemy")

# Download and unpack the QuantHistLing GrAF/XML data if it is not there yet.
if not os.path.exists("sources.csv"):
    import requests
    import zipfile
    r = requests.get(
        "http://www.quanthistling.info/data/downloads/xml/data.zip")
    with open("data.zip", "wb") as f:
        f.write(r.content)
    z = zipfile.ZipFile("data.zip")
    z.extractall()

# Collect the QLC IDs of all sources that are dictionaries.
sources = csv.reader(open("sources.csv", "rU"), delimiter="\t")
dict_sources = list()
for source in sources:
    if source[0] != "QLCID" and source[1] == "dictionary":
        dict_sources.append(source[0])

# Translation table that strips all Unicode punctuation and symbol
# characters (Python 2: xrange/unichr).
import unicodedata
tbl = dict.fromkeys(i for i in xrange(sys.maxunicode)
                    if unicodedata.category(unichr(i)).startswith('P')
                    or unicodedata.category(unichr(i)).startswith('S'))

def remove_punctuation(text):
    return text.translate(tbl)

# Download the NLTK stopword lists if they are not there yet.
if not os.path.exists(os.path.join("stopwords", "spanish")):
    import requests
    import zipfile
    r = requests.get(
        "https://github.com/nltk/nltk_data/blob/gh-pages/packages/corpora/stopwords.zip?raw=true")
    with open("stopwords.zip", "wb") as f:
        f.write(r.content)
    z = zipfile.ZipFile("stopwords.zip")
    z.extractall()

stopwords = list()
with codecs.open(os.path.join("stopwords", "spanish"), "r", "utf-8") as f:
    for line in f:
        stopwords.append(line.strip())

# Parse every dictionary's GrAF file and collect, per entry, the Spanish
# words (from heads and translations) together with the indigenous-language
# strings they co-occur with.
parser = graf.GraphParser()
all_dicts_frame = None   # (leftovers from an earlier version; unused below)
parsed_first = False

spa_to_indi = collections.defaultdict(set)
indi = set()
spa = set()
spa_to_spa = []

for d in dict_sources:
    for f in glob.glob(os.path.join(d, "dict-*-dictinterpretation.xml")):
        #print("Parsing {0}...".format(f))
        graf_graph = parser.parse(f)
        for (node_id, node) in graf_graph.nodes.items():
            if node_id.endswith("..entry"):
                entry_spa = set()
                spa_to_spa_tmp = list()
                others = set()
                for e in node.out_edges:
                    if e.annotations.get_first().label == "head" or \
                            e.annotations.get_first().label == "translation":
                        # Get the language of the head/translation node.
                        for n in e.to_node.links[0][0].nodes:
                            if n.annotations.get_first().label == "iso-639-3":
                                if n.annotations.get_first().features.get_value("substring") == "spa":
                                    # Spanish side: split into words, drop stopwords.
                                    substr = remove_punctuation(
                                        e.to_node.annotations.get_first().features.get_value("substring"))
                                    collo = set()
                                    for w in substr.split(" "):
                                        if w not in stopwords:
                                            entry_spa.add(w)
                                            collo.add(w)
                                    # Words that appear together in one
                                    # multi-word Spanish string are collocates.
                                    if len(collo) > 1:
                                        spa_to_spa_tmp.append(list(collo))
                                    break
                                else:
                                    # Indigenous side: keep the string, tagged
                                    # with its source dictionary.
                                    trans = u"{0}|{1}".format(
                                        e.to_node.annotations.get_first().features.get_value("substring"), d)
                                    others.add(trans)
                                    break
                if len(entry_spa) > 0 and len(others) > 0:
                    #spa_to_spa.append(list(entry_spa))
                    spa_to_spa.extend(spa_to_spa_tmp)
                    for head in entry_spa:
                        for translation in others:
                            spa_to_indi[head].add(translation)
                            spa.add(head)
                            indi.add(translation)

import gc
gc.collect()

# Index both vocabularies.
spa = list(spa)
indi = list(indi)
indi_indices = { w: i for i, w in enumerate(indi) }
spa_indices = { w: i for i, w in enumerate(spa) }

# Sparse co-occurrence matrix: one row per indigenous string,
# one column per Spanish word.
all_dicts_cooc = scipy.sparse.lil_matrix((len(indi), len(spa)))
#all_dicts_cooc = numpy.zeros((len(indi), len(spa)))

len(spa)  # (notebook inspection cell)

for i, head in enumerate(spa):
    for trans in spa_to_indi[head]:
        all_dicts_cooc[indi_indices[trans], i] = 1

# Count how often two Spanish words appeared together in the same
# multi-word translation string.
all_dicts_spa_collo = scipy.sparse.lil_matrix((len(spa), len(spa)))
for j, p in enumerate(spa_to_spa):
    for i in range(len(p)-1):
        for w in p[i+1:]:
            all_dicts_spa_collo[spa_indices[p[i]], spa_indices[w]] += 1

spa_to_spa[19764]  # (notebook inspection cell: one example collocation)

# Spanish-Spanish similarity: two Spanish words are similar if they share
# indigenous translations. Subtracting the collocation counts removes
# "similarity" that only stems from the words appearing in one phrase.
all_dicts_cooc = scipy.sparse.csc_matrix(all_dicts_cooc)
all_dicts_spa_collo = scipy.sparse.csc_matrix(all_dicts_spa_collo)
spa_similarity = all_dicts_cooc.T * all_dicts_cooc
spa_similarity_without_collo = spa_similarity - all_dicts_spa_collo
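# Aside (not in the original notebook): why cooc.T * cooc measures
# similarity. If C is the indigenous-by-Spanish matrix built above, then
# (C.T * C)[i, j] counts the indigenous strings that translate both Spanish
# word i and Spanish word j. A minimal sketch with made-up toy data:
C_toy = scipy.sparse.csc_matrix([[1, 1],
                                 [1, 1],
                                 [1, 0]])
print((C_toy.T * C_toy).toarray())
# [[3 2]
#  [2 2]]  -> the off-diagonal 2: two toy rows are shared by both columns.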
# Build the similarity graph. Note that the graph below uses spa_similarity,
# not the collocation-corrected spa_similarity_without_collo computed above.
g = networkx.Graph(spa_similarity)
#solitary = [ n for n, d in g.degree_iter() if d==2 ]
#g.remove_nodes_from(solitary)

# Relabel the integer node IDs with the Spanish words.
labels = dict(zip(range(len(spa)), spa))
#labels = { k: v for k,v in enumerate(spa) if k in g }
g2 = networkx.relabel_nodes(g, labels)

# Extract the neighbourhood of one word, keeping only edges whose weight
# reaches the cutoff. (The "comer" variable names are leftovers from an
# earlier run of the notebook with a different word.)
word = u"casa"
cutoff = 50
comer_nodes = g2[word]
comer_graph = networkx.Graph()
comer_graph.add_node(word)
for n in comer_nodes:
    if comer_nodes[n]['weight'] >= cutoff:
        comer_graph.add_node(n)
        comer_graph.add_edge(word, n, weight=comer_nodes[n]['weight'])

len(comer_graph)  # (notebook inspection cell)

from networkx.readwrite import json_graph
import json
comer_json = json_graph.node_link_data(comer_graph)
#json.dump(bodyparts_json, codecs.open("bodyparts_graph.json", "w", "utf-8"))

# Render the neighbourhood graph with a D3 force layout inside the notebook.
from IPython.display import HTML, Javascript
from IPython.core.display import display

# Container for the force-layout svg (the D3 code below selects "#nav").
html = """
<div id="nav"></div>
"""

javascript = """
var color = d3.scale.category20();

var width = 500,
    height = 400;

var svg = d3.select("#nav").append("svg")
    .attr("width", width)
    .attr("height", height);

var force = d3.layout.force()
    .gravity(.05)
    .distance(100)
    .charge(-250)
    .size([width, height]);

var json = """ + json.dumps(comer_json) + """;
//d3.json("bodyparts_graph.json", function(error, json) {
force
    .nodes(json.nodes)
    .links(json.links)
    .start();

var link = svg.selectAll("line.link")
    .data(json.links)
    .enter().append("line")
    .attr("class", "link")
    .style("stroke-width", function(d) { return d.weight/""" + str(cutoff) + """; });

var node = svg.selectAll("circle.node")
    .data(json.nodes)
    .enter().append("g")
    .attr("class", "node")
    .call(force.drag);

node.append("circle")
    .attr("r", 5);
    //.style("fill", function(d) { return color(d.group); })

node.append("text")
    .attr("dx", 12)
    .attr("dy", ".35em")
    .text(function(d) { return d.id });

force.on("tick", function() {
    link.attr("x1", function(d) { return d.source.x; })
        .attr("y1", function(d) { return d.source.y; })
        .attr("x2", function(d) { return d.target.x; })
        .attr("y2", function(d) { return d.target.y; });

    node.attr("transform", function(d) { return "translate(" + d.x + "," + d.y + ")"; });
});
//});
"""

display(HTML(html))
display(Javascript(javascript))
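# Aside (not in the original notebook): a sketch of how the node-link JSON
# could be saved for reuse outside the notebook, mirroring the commented-out
# json.dump() call above; "comer_graph.json" is an assumed file name.
with codecs.open("comer_graph.json", "w", "utf-8") as f:
    json.dump(comer_json, f)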