from nltk import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer as wnl
import nltk, string, glob
import gensim
import itertools
import re
import csv
import scipy
import warnings
import numpy as np
import networkx as nx
warnings.simplefilter(action='ignore', category=FutureWarning)
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
# Load pre-trained word vectors (word2vec text format)
model_path = "/home/jovyan/enwiki_5_ner.txt"
word_vectors = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=False)
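# Example query on the loaded vectors (illustrative only; assumes both terms are in the vocabulary):
#   word_vectors.similarity('network', 'graph')  # cosine similarity in [-1, 1]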
#################################################
# Initialize, config & define helpful functions #
#################################################
translator = str.maketrans('', '', string.punctuation.replace('-', '')) #filters punctuation except dash
lemmatizeCondition = 1
lemmatizer = wnl()
stop_words = nltk.corpus.stopwords.words('english')
# Function for finding the indices of sentences that contain a word of interest, like 'references'
def find(target):
    for i, sent in enumerate(sents):  # relies on the global `sents` list of tokenized sentences
        try:
            sent.index(target)  # raises ValueError if the target word is absent
        except ValueError:
            continue
        yield i
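# Usage (illustrative): list(find('references')) yields the indices of the sentences
# containing 'references', once `sents` has been built by the cleaning loop below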
# Iterator class for streaming a comma-separated corpus file (one sentence per line) into gensim word2vec
class FileToSent(object):
    def __init__(self, filename):
        self.filename = filename

    def __iter__(self):
        for line in open(self.filename, 'r'):
            ll = line.strip().split(",")  # split the line into comma-separated tokens
            ll = [''.join(c for c in s if c not in string.punctuation) for s in ll]  # strip punctuation
            ll = [num.strip() for num in ll]  # strip surrounding whitespace
            yield ll
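# Usage (illustrative only; this script loads pre-trained vectors rather than training its own):
#   sentences = FileToSent('/home/jovyan/corpus.csv')  # hypothetical path
#   w2v = gensim.models.Word2Vec(sentences, vector_size=100, min_count=5)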
###################################################
# Read in .txt file(s) from a specified directory #
###################################################
IDs = sorted(glob.glob('/home/jovyan/data/*.txt'))
####################
# Clean, lemmatize #
####################
totalWords = []  # collect the cleaned words from every paper
for ID in IDs:  # loop through papers
    print(ID)
    with open(ID, newline='') as csvfile:
        text = csvfile.read()
    text = re.sub("\u2013|\u2014", "-", str(text))  # replace en- and em-dashes with hyphens
    sents = sent_tokenize(text)  # split into sentences
    sents = [word_tokenize(s) for s in sents]  # tokenize each sentence into words
    sents = [[w.translate(translator) for w in s] for s in sents]  # filter punctuation (dashes kept)
    sents = [[re.sub(r'^[-+]?[0-9]*[\.\-]?[0-9]+$', 'numeric', w) for w in s] for s in sents]  # replace numerals with the placeholder "numeric"
    sents = [[w for w in s if re.search('[^a-zA-Z0-9-]+', w) is None] for s in sents]  # strip tokens that are not alphanumeric or hyphenated
    sents = [[w.lower() for w in s] for s in sents]  # make lower case
    sents = [s for s in sents if len(s) > 0]  # remove empty sentences
    sents = [[w for w in s if w not in stop_words] for s in sents]  # filter stop words
    sents = [[w for w in s if len(w) > 3] for s in sents]  # drop short tokens (variables, abbreviations)
    sents = [s for s in sents if len(s) > 0]  # remove empty sentences
    words = [[lemmatizer.lemmatize(w) if lemmatizeCondition == 1 else w for w in s] for s in sents]  # lemmatize
    words = list(itertools.chain.from_iterable(words))  # join list of lists
    totalWords.append(words)
# Write cleaned text to file (space-separated tokens from all papers)
with open('/home/jovyan/cleanedText.txt', 'w') as f:
    for paper_words in totalWords:
        for word in paper_words:
            f.write(str(word) + ' ')
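# Quick sanity check (illustrative only):
#   with open('/home/jovyan/cleanedText.txt') as f:
#       print(f.read()[:200])  # first 200 characters of the cleaned, space-separated lemmas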
###############################
# Construct semantic networks #
###############################
"""
Code to make a network out of the shortest N cosine-distances (or, equivalently, the strongest N associations)
between a set of words in a gensim word2vec model.
"""
model = word_vectors  # use the pre-loaded KeyedVectors
#################
# Specify words #
#################
with open('/home/jovyan/cleanedText.txt') as f:
    text = f.read()
my_words = word_tokenize(text)
# filter out words not in the model's vocabulary
my_words = [word for word in my_words if word in model.key_to_index]
# The number of connections we want: either as a factor of the number of words or a set number
num_top_conns = len(my_words) * 15
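# e.g. if 200 words survive the vocabulary filter, this keeps the 200 * 15 = 3,000 shortest distances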
# Make a list of all word-to-word distances [each as a tuple of (word1,word2,dist)]
dists = []
# Find similarity distances between each word pair
for i1, word1 in enumerate(my_words):
    for i2, word2 in enumerate(my_words):
        if i1 >= i2: continue
        cosine_similarity = model.similarity(word1, word2)
        cosine_distance = 1 - cosine_similarity
        dist = (word1, word2, cosine_distance)
        dists.append(dist)
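# The i1 >= i2 guard enumerates each unordered pair exactly once: n * (n - 1) / 2 comparisons for n words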
# Sort the list by ascending distance
dists.sort(key=lambda _tuple: _tuple[-1])
# Get the top connections
top_conns = dists[:num_top_conns]
# Make a network
g = nx.Graph()
g.add_nodes_from(my_words)
for word1, word2, dist in top_conns:
    weight = 1 - dist  # convert distance back to cosine similarity for the edge weight
    g.add_edge(word1, word2, weight=float(weight))
# Write the network
nx.write_graphml(g, "/home/jovyan/network.graphml") # Readable by Gephi
nx.write_edgelist(g, "/home/jovyan/edgeList.txt", delimiter=' ', data=['weight'])
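# network.graphml opens directly in Gephi; edgeList.txt has one "word1 word2 weight" line per edge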
# Append each node's index in my_words to the edge-list lines it appears in
with open('/home/jovyan/edgeList.txt', 'r') as f:
    lines = f.readlines()
with open('/home/jovyan/networkDynamicsLabels.txt', 'w') as f:
    for word1 in my_words:
        for line in lines:
            if word1 in line.split()[:2]:  # match whole node tokens, not substrings
                for i in [i for i, x in enumerate(my_words) if x == word1]:
                    f.write(line.replace('\n', ' ' + str(i) + '\n'))
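# Each line of networkDynamicsLabels.txt is an edge-list line with the my_words index of the matched node appended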
# A = nx.adjacency_matrix(g, nodelist=my_words, weight='weight')
# adjmat = A.todense()
#
# np.savetxt("./semanticNetwork/semanticNetworkAdjmat.txt", adjmat, delimiter=' ')
#
# with open('./semanticNetwork/semanticNetworkNodeLabels.txt', 'w') as f:
#     print(g.nodes, file=f)