import os
import sys
import csv
import codecs
import collections
import glob
import graf
import scipy.sparse
import networkx

os.chdir("/Users/pbouda/Projects/git-github/notebooks/polysemy")

# Download and unpack the QuantHistLing GrAF/XML data if it is not there yet.
if not os.path.exists("sources.csv"):
    import requests
    import zipfile
    r = requests.get(
        "http://www.quanthistling.info/data/downloads/xml/data.zip")
    with open("data.zip", "wb") as f:
        f.write(r.content)
    z = zipfile.ZipFile("data.zip")
    z.extractall()

# Collect the QLC IDs of all sources that are dictionaries.
sources = csv.reader(open("sources.csv", "rU"), delimiter="\t")
dict_sources = list()
for source in sources:
    if source[0] != "QLCID" and source[1] == "dictionary":
        dict_sources.append(source[0])

# Translation table that strips all Unicode punctuation and symbol
# characters (Python 2: xrange/unichr).
import unicodedata
tbl = dict.fromkeys(i for i in xrange(sys.maxunicode)
                    if unicodedata.category(unichr(i)).startswith('P')
                    or unicodedata.category(unichr(i)).startswith('S'))

def remove_punctuation(text):
    return text.translate(tbl)

# Download the NLTK stopword lists if they are not there yet.
if not os.path.exists(os.path.join("stopwords", "spanish")):
    import requests
    import zipfile
    r = requests.get(
        "https://github.com/nltk/nltk_data/blob/gh-pages/packages/corpora/stopwords.zip?raw=true")
    with open("stopwords.zip", "wb") as f:
        f.write(r.content)
    z = zipfile.ZipFile("stopwords.zip")
    z.extractall()

stopwords = list()
with codecs.open(os.path.join("stopwords", "spanish"), "r", "utf-8") as f:
    for line in f:
        stopwords.append(line.strip())

# Parse every dictionary's GrAF file and collect, per entry, the Spanish
# words (from heads and translations) together with the indigenous-language
# strings they co-occur with.
parser = graf.GraphParser()
all_dicts_frame = None   # (leftovers from an earlier version; unused below)
parsed_first = False

spa_to_indi = collections.defaultdict(set)
indi = set()
spa = set()
spa_to_spa = []

for d in dict_sources:
    for f in glob.glob(os.path.join(d, "dict-*-dictinterpretation.xml")):
        #print("Parsing {0}...".format(f))
        graf_graph = parser.parse(f)
        for (node_id, node) in graf_graph.nodes.items():
            if node_id.endswith("..entry"):
                entry_spa = set()
                spa_to_spa_tmp = list()
                others = set()
                for e in node.out_edges:
                    if e.annotations.get_first().label == "head" or \
                            e.annotations.get_first().label == "translation":
                        # Get the language of the head/translation node.
                        for n in e.to_node.links[0][0].nodes:
                            if n.annotations.get_first().label == "iso-639-3":
                                if n.annotations.get_first().features.get_value("substring") == "spa":
                                    # Spanish side: split into words, drop stopwords.
                                    substr = remove_punctuation(
                                        e.to_node.annotations.get_first().features.get_value("substring"))
                                    collo = set()
                                    for w in substr.split(" "):
                                        if w not in stopwords:
                                            entry_spa.add(w)
                                            collo.add(w)
                                    # Words that appear together in one
                                    # multi-word Spanish string are collocates.
                                    if len(collo) > 1:
                                        spa_to_spa_tmp.append(list(collo))
                                    break
                                else:
                                    # Indigenous side: keep the string, tagged
                                    # with its source dictionary.
                                    trans = u"{0}|{1}".format(
                                        e.to_node.annotations.get_first().features.get_value("substring"), d)
                                    others.add(trans)
                                    break
                if len(entry_spa) > 0 and len(others) > 0:
                    #spa_to_spa.append(list(entry_spa))
                    spa_to_spa.extend(spa_to_spa_tmp)
                    for head in entry_spa:
                        for translation in others:
                            spa_to_indi[head].add(translation)
                            spa.add(head)
                            indi.add(translation)

import gc
gc.collect()

# Index both vocabularies.
spa = list(spa)
indi = list(indi)
indi_indices = { w: i for i, w in enumerate(indi) }
spa_indices = { w: i for i, w in enumerate(spa) }

# Sparse co-occurrence matrix: one row per indigenous string,
# one column per Spanish word.
all_dicts_cooc = scipy.sparse.lil_matrix((len(indi), len(spa)))
#all_dicts_cooc = numpy.zeros((len(indi), len(spa)))

len(spa)  # (notebook inspection cell)

for i, head in enumerate(spa):
    for trans in spa_to_indi[head]:
        all_dicts_cooc[indi_indices[trans], i] = 1

# Count how often two Spanish words appeared together in the same
# multi-word translation string.
all_dicts_spa_collo = scipy.sparse.lil_matrix((len(spa), len(spa)))
for j, p in enumerate(spa_to_spa):
    for i in range(len(p)-1):
        for w in p[i+1:]:
            all_dicts_spa_collo[spa_indices[p[i]], spa_indices[w]] += 1

spa_to_spa[19764]  # (notebook inspection cell: one example collocation)

# Spanish-Spanish similarity: two Spanish words are similar if they share
# indigenous translations. Subtracting the collocation counts removes
# "similarity" that only stems from the words appearing in one phrase.
all_dicts_cooc = scipy.sparse.csc_matrix(all_dicts_cooc)
all_dicts_spa_collo = scipy.sparse.csc_matrix(all_dicts_spa_collo)
spa_similarity = all_dicts_cooc.T * all_dicts_cooc
spa_similarity_without_collo = spa_similarity - all_dicts_spa_collo
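# Aside (not in the original notebook): why cooc.T * cooc measures
# similarity. If C is the indigenous-by-Spanish matrix built above, then
# (C.T * C)[i, j] counts the indigenous strings that translate both Spanish
# word i and Spanish word j. A minimal sketch with made-up toy data:
C_toy = scipy.sparse.csc_matrix([[1, 1],
                                 [1, 1],
                                 [1, 0]])
print((C_toy.T * C_toy).toarray())
# [[3 2]
#  [2 2]]  -> the off-diagonal 2: two toy rows are shared by both columns.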
# Build the similarity graph. Note that the graph below uses spa_similarity,
# not the collocation-corrected spa_similarity_without_collo computed above.
g = networkx.Graph(spa_similarity)
#solitary = [ n for n, d in g.degree_iter() if d==2 ]
#g.remove_nodes_from(solitary)

# Relabel the integer node IDs with the Spanish words.
labels = dict(zip(range(len(spa)), spa))
#labels = { k: v for k,v in enumerate(spa) if k in g }
g2 = networkx.relabel_nodes(g, labels)

# Extract the neighbourhood of one word, keeping only edges whose weight
# reaches the cutoff. (The "comer" variable names are leftovers from an
# earlier run of the notebook with a different word.)
word = u"casa"
cutoff = 50
comer_nodes = g2[word]
comer_graph = networkx.Graph()
comer_graph.add_node(word)
for n in comer_nodes:
    if comer_nodes[n]['weight'] >= cutoff:
        comer_graph.add_node(n)
        comer_graph.add_edge(word, n, weight=comer_nodes[n]['weight'])

len(comer_graph)  # (notebook inspection cell)

from networkx.readwrite import json_graph
import json
comer_json = json_graph.node_link_data(comer_graph)
#json.dump(bodyparts_json, codecs.open("bodyparts_graph.json", "w", "utf-8"))

# Render the neighbourhood graph with a D3 force layout inside the notebook.
from IPython.display import HTML, Javascript
from IPython.core.display import display

# Container for the force-layout svg (the D3 code below selects "#nav").
html = """
<div id="nav"></div>
"""

javascript = """
var color = d3.scale.category20();

var width = 500,
    height = 400;

var svg = d3.select("#nav").append("svg")
    .attr("width", width)
    .attr("height", height);

var force = d3.layout.force()
    .gravity(.05)
    .distance(100)
    .charge(-250)
    .size([width, height]);

var json = """ + json.dumps(comer_json) + """;
//d3.json("bodyparts_graph.json", function(error, json) {
force
    .nodes(json.nodes)
    .links(json.links)
    .start();

var link = svg.selectAll("line.link")
    .data(json.links)
    .enter().append("line")
    .attr("class", "link")
    .style("stroke-width", function(d) { return d.weight/""" + str(cutoff) + """; });

var node = svg.selectAll("circle.node")
    .data(json.nodes)
    .enter().append("g")
    .attr("class", "node")
    .call(force.drag);

node.append("circle")
    .attr("r", 5);
    //.style("fill", function(d) { return color(d.group); })

node.append("text")
    .attr("dx", 12)
    .attr("dy", ".35em")
    .text(function(d) { return d.id });

force.on("tick", function() {
    link.attr("x1", function(d) { return d.source.x; })
        .attr("y1", function(d) { return d.source.y; })
        .attr("x2", function(d) { return d.target.x; })
        .attr("y2", function(d) { return d.target.y; });

    node.attr("transform", function(d) { return "translate(" + d.x + "," + d.y + ")"; });
});
//});
"""

display(HTML(html))
display(Javascript(javascript))
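# Aside (not in the original notebook): a sketch of how the node-link JSON
# could be saved for reuse outside the notebook, mirroring the commented-out
# json.dump() call above; "comer_graph.json" is an assumed file name.
with codecs.open("comer_graph.json", "w", "utf-8") as f:
    json.dump(comer_json, f)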