from nltk import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer as wnl
import nltk, string, glob
import gensim
import itertools
import re
import csv
import scipy
import warnings
import numpy as np
import networkx as nx
warnings.simplefilter(action='ignore', category=FutureWarning)
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
# Load pre-trained word vectors (word2vec text format)
model_path = "/home/jovyan/enwiki_5_ner.txt"
word_vectors = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=False)
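# Example query on the loaded vectors (illustrative only; assumes both terms are in the vocabulary):
#   word_vectors.similarity('network', 'graph')  # cosine similarity in [-1, 1]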
#################################################
# Initialize, config & define helpful functions #
#################################################
translator = str.maketrans('', '', string.punctuation.replace('-', '')) #filters punctuation except dash
lemmatizeCondition = 1
lemmatizer = wnl()
stop_words = nltk.corpus.stopwords.words('english')
# Function for finding the indices of sentences that contain a word of interest, like 'references'
def find(target):
    for i, sent in enumerate(sents):  # relies on the global `sents` list of tokenized sentences
        try:
            sent.index(target)  # raises ValueError if the target word is absent
        except ValueError:
            continue
        yield i
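# Usage (illustrative): list(find('references')) yields the indices of the sentences
# containing 'references', once `sents` has been built by the cleaning loop below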
# Iterator class for streaming a comma-separated corpus file (one sentence per line) into gensim word2vec
class FileToSent(object):
    def __init__(self, filename):
        self.filename = filename

    def __iter__(self):
        for line in open(self.filename, 'r'):
            ll = line.strip().split(",")  # split the line into comma-separated tokens
            ll = [''.join(c for c in s if c not in string.punctuation) for s in ll]  # strip punctuation
            ll = [num.strip() for num in ll]  # strip surrounding whitespace
            yield ll
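# Usage (illustrative only; this script loads pre-trained vectors rather than training its own):
#   sentences = FileToSent('/home/jovyan/corpus.csv')  # hypothetical path
#   w2v = gensim.models.Word2Vec(sentences, vector_size=100, min_count=5)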
###################################################
# Read in .txt file(s) from a specified directory #
###################################################
IDs = sorted(glob.glob('/home/jovyan/data/*.txt'))
####################
# Clean, lemmatize #
####################
totalWords = []  # collect the cleaned words from every paper
for ID in IDs:  # loop through papers
    print(ID)
    with open(ID, newline='') as csvfile:
        text = csvfile.read()
    text = re.sub("\u2013|\u2014", "-", str(text))  # replace en- and em-dashes with hyphens
    sents = sent_tokenize(text)  # split into sentences
    sents = [word_tokenize(s) for s in sents]  # tokenize each sentence into words
    sents = [[w.translate(translator) for w in s] for s in sents]  # filter punctuation (dashes kept)
    sents = [[re.sub(r'^[-+]?[0-9]*[\.\-]?[0-9]+$', 'numeric', w) for w in s] for s in sents]  # replace numerals with the placeholder "numeric"
    sents = [[w for w in s if re.search('[^a-zA-Z0-9-]+', w) is None] for s in sents]  # strip tokens that are not alphanumeric or hyphenated
    sents = [[w.lower() for w in s] for s in sents]  # make lower case
    sents = [s for s in sents if len(s) > 0]  # remove empty sentences
    sents = [[w for w in s if w not in stop_words] for s in sents]  # filter stop words
    sents = [[w for w in s if len(w) > 3] for s in sents]  # drop short tokens (variables, abbreviations)
    sents = [s for s in sents if len(s) > 0]  # remove empty sentences
    words = [[lemmatizer.lemmatize(w) if lemmatizeCondition == 1 else w for w in s] for s in sents]  # lemmatize
    words = list(itertools.chain.from_iterable(words))  # join list of lists
    totalWords.append(words)
# Write cleaned text to file (space-separated tokens from all papers)
with open('/home/jovyan/cleanedText.txt', 'w') as f:
    for paper_words in totalWords:
        for word in paper_words:
            f.write(str(word) + ' ')
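# Quick sanity check (illustrative only):
#   with open('/home/jovyan/cleanedText.txt') as f:
#       print(f.read()[:200])  # first 200 characters of the cleaned, space-separated lemmas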
###############################
# Construct semantic networks #
###############################
"""
Code to make a network out of the shortest N cosine-distances (or, equivalently, the strongest N associations)
between a set of words in a gensim word2vec model.
"""
model = word_vectors  # use the pre-loaded KeyedVectors
#################
# Specify words #
#################
with open('/home/jovyan/cleanedText.txt') as f:
    text = f.read()
my_words = word_tokenize(text)
# filter out words not in the model's vocabulary
my_words = [word for word in my_words if word in model.key_to_index]
# The number of connections we want: either as a factor of the number of words or a set number
num_top_conns = len(my_words) * 15
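# e.g. if 200 words survive the vocabulary filter, this keeps the 200 * 15 = 3,000 shortest distances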
# Make a list of all word-to-word distances [each as a tuple of (word1,word2,dist)]
dists = []
# Find similarity distances between each word pair
for i1, word1 in enumerate(my_words):
    for i2, word2 in enumerate(my_words):
        if i1 >= i2: continue
        cosine_similarity = model.similarity(word1, word2)
        cosine_distance = 1 - cosine_similarity
        dist = (word1, word2, cosine_distance)
        dists.append(dist)
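# The i1 >= i2 guard enumerates each unordered pair exactly once: n * (n - 1) / 2 comparisons for n words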
# Sort the list by ascending distance
dists.sort(key=lambda _tuple: _tuple[-1])
# Get the top connections
top_conns = dists[:num_top_conns]
# Make a network
g = nx.Graph()
g.add_nodes_from(my_words)
for word1, word2, dist in top_conns:
    weight = 1 - dist  # convert distance back to cosine similarity for the edge weight
    g.add_edge(word1, word2, weight=float(weight))
# Write the network
nx.write_graphml(g, "/home/jovyan/network.graphml") # Readable by Gephi
nx.write_edgelist(g, "/home/jovyan/edgeList.txt", delimiter=' ', data=['weight'])
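# network.graphml opens directly in Gephi; edgeList.txt has one "word1 word2 weight" line per edge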
# Append each node's index in my_words to the edge-list lines it appears in
with open('/home/jovyan/edgeList.txt', 'r') as f:
    lines = f.readlines()
with open('/home/jovyan/networkDynamicsLabels.txt', 'w') as f:
    for word1 in my_words:
        for line in lines:
            if word1 in line.split()[:2]:  # match whole node tokens, not substrings
                for i in [i for i, x in enumerate(my_words) if x == word1]:
                    f.write(line.replace('\n', ' ' + str(i) + '\n'))
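# Each line of networkDynamicsLabels.txt is an edge-list line with the my_words index of the matched node appended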
# A = nx.adjacency_matrix(g, nodelist=my_words, weight='weight')
# adjmat = A.todense()
#
# np.savetxt("./semanticNetwork/semanticNetworkAdjmat.txt", adjmat, delimiter=' ')
#
# with open('./semanticNetwork/semanticNetworkNodeLabels.txt', 'w') as f:
#     print(g.nodes, file=f)