#!/usr/bin/env python
# coding: utf-8

# # Computation of embeddings of Wikipedia texts
#
# Note: Embeddings computation for 11,020x2 texts takes 10,822 seconds on the EML4U experiment server.
#
# In **1 hour** you can process around 60x60 = **3600 text-pairs**.

# In[ ]:


# Configuration: input directories and output files for the two snapshots
# (A = older date, B = newer date) of one Wikipedia category.

# Directory of the current script; appended to sys.path further below so
# that the local `embedding` module can be imported.
baseDir = "/home/eml4u/EML4U/notebooks/wikipedia-embeddings"

# File IDs (for input and output)
#title = "american-films"
#title = "british-films"
#title = "indian-films"
title = "living-people"
dateA = "20100408"
dateB = "20201101"
idA = dateA + "-" + title
idB = dateB + "-" + title

# Input directories
dataDirA = "/home/eml4u/EML4U/data/corpus/2021-02-10-wikipedia-texts/" + idA + "/"
dataDirB = "/home/eml4u/EML4U/data/corpus/2021-02-10-wikipedia-texts/" + idB + "/"

# Output files
outDir = "/home/eml4u/EML4U/data/wikipedia-embeddings/"
fileEmbeddingsA = outDir + idA + ".txt"
fileEmbeddingsB = outDir + idB + ".txt"
fileIds = outDir + title + ".txt"

print(dataDirA)
print(dataDirB)
print(fileEmbeddingsA)
print(fileEmbeddingsB)
print(fileIds)


# In[ ]:


# Get file paths
import glob

# glob does not guarantee any particular order, but the two lists are paired
# by position below. Sorting makes the pairing deterministic.
filesA = sorted(glob.glob(dataDirA + '*.txt'))
filesB = sorted(glob.glob(dataDirB + '*.txt'))


# In[ ]:


# Development

# Limit number of file paths
if False:
    filesA = filesA[:20]
    filesB = filesB[:20]

# Print file paths
if False:
    print('\n'.join(map(str, filesA)))
    print()
    print('\n'.join(map(str, filesB)))


# In[ ]:


# Read files
def _read_texts(paths):
    """Return the full text content of every file in *paths*, in order."""
    texts = []
    for path in paths:
        # The context manager closes each file; the previous version wrote
        # `fileobject.close` without parentheses, so files were never closed.
        with open(path, "r") as fileobject:
            texts.append(fileobject.read())
    return texts


textsA = _read_texts(filesA)
textsB = _read_texts(filesB)


# In[ ]:


# Print text sizes / texts
print("len(textsA):", len(textsA))
print("len(textsB):", len(textsB))
if False:
    print(textsA[0])
    print(textsB[0])


# In[ ]:


# Ensure similar filenames in both points of time
import ntpath

# The IDs written to the output file are the base names of the A files.
filenames = [ntpath.basename(path) for path in filesA]

# Pairing by position only makes sense if both snapshots have the same
# number of files; report a difference instead of failing on an index error.
if len(filesA) != len(filesB):
    print("WARNING: number of files differs:", len(filesA), len(filesB))

# Print every position at which the two snapshots disagree on the file name.
for x, (pathA, pathB) in enumerate(zip(filesA, filesB)):
    nameA = ntpath.basename(pathA)
    nameB = ntpath.basename(pathB)
    if nameA != nameB:
        print(x, nameA, nameB)

print("len(filenames):", len(filenames))


# In[ ]:


# Prepare embeddings
import sys
import os

# Make the project-local `embedding` module importable.
sys.path.append(os.path.abspath(baseDir))
from embedding import BertHuggingface

NUM_CLASSES = 8  # irrelevant if you don't want to retrain
bert = BertHuggingface(NUM_CLASSES)


# In[ ]:


# Create embeddings
import time

print(time.asctime())
startTime = time.time()
embeddingsA = bert.embed(textsA)
embeddingsB = bert.embed(textsB)
print("Runtime: %s seconds" % (time.time() - startTime))

print("embeddingsA.shape:", embeddingsA.shape)
print("embeddingsB.shape:", embeddingsB.shape)


# In[ ]:


# Write embeddings/arrays to files
print(fileEmbeddingsA)
print(fileEmbeddingsB)
print(fileIds)

import numpy

numpy.savetxt(fileEmbeddingsA, embeddingsA)
numpy.savetxt(fileEmbeddingsB, embeddingsB)
# One file name per line, in the same order as the texts passed to bert.embed.
with open(fileIds, "w") as outfile:
    outfile.write("\n".join(filenames))


# In[ ]:


# Check: Load arrays
if True:
    loadedA = numpy.loadtxt(fileEmbeddingsA)
    loadedB = numpy.loadtxt(fileEmbeddingsB)
    with open(fileIds) as f:
        loadedFilenames = f.read().splitlines()

    # Compare what was written with what is read back.
    print(numpy.array_equal(embeddingsA, loadedA))
    print(numpy.array_equal(embeddingsB, loadedB))
    print(numpy.array_equal(filenames, loadedFilenames))

    print(type(embeddingsA))
    print(type(loadedA))
    print(type(loadedFilenames))


# In[ ]: