In this notebook, we make a collection of all the embeddings which we use to do retrofitting. These embeddings are then evaluated for their similarity based on the evaluation benchmark datasets.

In [1]:

import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import cosine_similarity
import scipy.stats as stats
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from itertools import combinations
from math import comb
from sklearn.ensemble import RandomForestClassifier
import os
import h5py
import json
import gzip

In [2]:

# DWD V2 files
# https://drive.google.com/drive/u/3/folders/1OIZegxxrs_Hv2ZhDsSO-zLVARCR60P01
# SITELINKS_FILE = "../source_dataset_files/wikidata-20210215-dwd-v2/sitelinks.en.tsv.gz"
CLAIMS_FILE = "../source_dataset_files/wikidata-20210215-dwd-v2/claims.tsv.gz"
LABELS_FILE = "../source_dataset_files/wikidata-20210215-dwd-v2/labels.en.tsv.gz"
DESCRIPTIONS_FILE = "../source_dataset_files/wikidata-20210215-dwd-v2/descriptions.en.tsv.gz"

In [3]:

# wikidata-20210215 files
# https://drive.google.com/drive/u/3/folders/1NGtob1BFQ03sXf4yQyYvP13ly3u1Ul5u
# SITELINKS_FILE_V1 = "../source_dataset_files/wikidata-20210215/sitelinks.en.tsv.gz"

In [4]:

# wikidata-20201208 files
# https://drive.google.com/drive/u/3/folders/1qbbgjo7pddMdDvQzOSeSaL6lYwj_f5gi
SITELINKS_FILE_V2 = "../source_dataset_files/wikidata-20201208/sitelinks.en.tsv.gz"

In [5]:

# Embedding Related Files
DBPEDIA_SHORT_ABSTRACTS_TTL = "../data/evaluation/source_files/short-abstracts_lang=en.ttl"
DBPEDIA_SHORT_ABSTRACTS_CSV = "../data/evaluation/source_files/short-abstracts_lang=en.csv"
ABSTRACTS_INTERMEDIATE_FILE = "../data/embeddings/intermediate_files/abstracts.csv"

COMPLEX_EMB_SOURCE_FILE = "../source_dataset_files/wikidata-20210215-dwd-v2/wikidatadwd.complEx.graph-embeddings.txt"
TRANSE_EMB_SOURCE_FILE = "../source_dataset_files/wikidata-20210215-dwd-v2/wikidatadwd.transE.graph-embeddings.txt"
TEXT_EMB_SOURCE_FILE = "../source_dataset_files/wikidata-20210215-dwd-v2/text-embeddings-concatenated.tsv.gz"

COMPLEX_EMB_FINAL_FILE = "../data/embeddings/complex_orig_embedding_dict.json"
TRANSE_EMB_FINAL_FILE = "../data/embeddings/transe_orig_embedding_dict.json"
TEXT_EMB_FINAL_FILE = "../data/embeddings/text_7_props_orig_embedding_dict.json"
ABS_EMB_FINAL_FILE = "../data/embeddings/abstract_orig_embedding_dict.json"
ABS_FIRST_SENT_EMB_FINAL_FILE = "../data/embeddings/abstract_first_sent_orig_embedding_dict.json"

LABELS_EMB_FINAL_FILE = "../data/embeddings/labels_orig_embedding_dict.json"
LABELS_DESC_EMB_FINAL_FILE = "../data/embeddings/labels_n_desc_orig_embedding_dict.json"

In [6]:

# HAS Embedding Related Files
A_SOURCE_FILE = "../source_dataset_files/A_walks_analysis/a_embeddings_10x10,min_count=0.kv"
A_OP_FILE = "../data/embeddings/has_a_orig_embedding_dict.json"

H_SOURCE_FILE = "../source_dataset_files/H_walks_analysis/h_embeddings_5x8,min_count=21.kv"
H_OP_FILE = "../data/embeddings/has_h_orig_embedding_dict.json"

S_SOURCE_FILE = "../source_dataset_files/S_walks_analysis/s_embeddings_5x10,min_count=0.kv"
S_OP_FILE = "../data/embeddings/has_s_orig_embedding_dict.json"

In [28]:

WORDSIM_CLASS_SIM_FILE = '../data/embeddings/wordsim_class_sim.csv'
WORDSIM_JC_SIM_FILE = '../data/embeddings/wordsim_jc_sim.csv'
WORDSIM_TOP_SIM_FILE = '../data/embeddings/wordsim_top_sim.csv'

WORDSIM_OLD_CLASS_SIM_FILE = '../data/embeddings/wordsim_old_class_sim.csv'
WORDSIM_OLD_JC_SIM_FILE = '../data/embeddings/wordsim_old_jc_sim.csv'
WORDSIM_OLD_TOP_SIM_FILE = '../data/embeddings/wordsim_old_top_sim.csv'

DBPEDIA_MC_30_CLASS_SIM_FILE = '../data/embeddings/dbpedia_mc_30_class_sim.csv'
DBPEDIA_MC_30_JC_SIM_FILE = '../data/embeddings/dbpedia_mc_30_jc_sim.csv'
DBPEDIA_MC_30_TOP_SIM_FILE = '../data/embeddings/dbpedia_mc_30_top_sim.csv'

DBPEDIA_RG_65_CLASS_SIM_FILE = '../data/embeddings/dbpedia_rg_65_class_sim.csv'
DBPEDIA_RG_65_JC_SIM_FILE = '../data/embeddings/dbpedia_rg_65_jc_sim.csv'
DBPEDIA_RG_65_TOP_SIM_FILE = '../data/embeddings/dbpedia_rg_65_top_sim.csv'

In [29]:

P279_CHILD_PAR_DISTILBERT_COSSIM_FILE = "../data/basis/P279_ChildPar.all-distilroberta-v1.csv"
WORDSIM_FILE = "../data/evaluation/wordsim353_with_r3.csv"
WORDSIM_OLD_FILE = "../data/evaluation/wordsim_old.csv"
DBPEDIA_MC_30_FINAL_FILE = "../data/evaluation/mc-30_DBpedia.csv"
DBPEDIA_RG_65_FINAL_FILE = "../data/evaluation/rg-65_DBpedia.csv"

Common Code¶

In [9]:

def get_all_nodes():
    """
    This function generates the set of all nodes needed for execution
    """
    p279ChildPar = pd.read_csv(P279_CHILD_PAR_DISTILBERT_COSSIM_FILE)
    wordsim_df = pd.read_csv(WORDSIM_FILE)
    dbpedia_mc_30_df = pd.read_csv(DBPEDIA_MC_30_FINAL_FILE)
    dbpedia_rg_65_df = pd.read_csv(DBPEDIA_RG_65_FINAL_FILE)
#     wiki_cs_df = pd.read_csv('../data/wikidata-cs_categorized.csv')
#     concept_net_df = pd.read_csv('../data/kgtk_conceptnet_evaluation.csv')
    p279QnodesList = set(p279ChildPar.node1.to_list() 
                        + p279ChildPar.node2.to_list()
                        + wordsim_df['word1_kg_id'].to_list() 
                        + wordsim_df['word2_kg_id'].to_list()
                        + dbpedia_mc_30_df['word1_kg_id'].to_list()
                        + dbpedia_mc_30_df['word2_kg_id'].to_list()
                        + dbpedia_rg_65_df['word1_kg_id'].to_list()
                        + dbpedia_rg_65_df['word2_kg_id'].to_list())
#                         + wiki_cs_df['word1_kg_id'].to_list() 
#                         + wiki_cs_df['word2_kg_id'].to_list()
#                         + concept_net_df['word1_kg_id'].to_list()
#                         + concept_net_df['word2_kg_id'].to_list())
    print(len(p279QnodesList))
    return p279QnodesList

allNodes = get_all_nodes()

In [10]:

def fillCoverage(embedDict):
    wordSim353AnnotDF_New = pd.read_csv(WORDSIM_FILE)
    wordSim353AnnotDF_set = set(wordSim353AnnotDF_New['word1_kg_id'].to_list() + wordSim353AnnotDF_New['word2_kg_id'].to_list())
    embed_size = len(embedDict[next(iter(embedDict))])
#     print(embed_size)
    count = 0
    for word in wordSim353AnnotDF_set:
        if word not in embedDict:
            embedDict[word] = np.zeros((embed_size))
            count += 1
    print(f"Added {count} corrections")
    return embedDict

def deserializeEmbeddingDict(embedDict):
    for key2 in embedDict.keys():
        embedDict[key2] = np.array(embedDict[key2])
    return embedDict

def serializeEmbeddingDict(embedDict):
    for key2 in embedDict.keys():
        embedDict[key2] = embedDict[key2].tolist() if type(embedDict[key2]) != list else embedDict[key2]
    return embedDict

In [11]:

def get_labels(node_set):
    labels_dict = {}
    first_line = True
    with gzip.open(LABELS_FILE, 'r') as labelsFile:
        firstLine = True
        for line in tqdm(labelsFile, total=41845781):
            if firstLine:
                firstLine = False
                continue
            line = line.decode('utf-8').strip().split('\t')
            line[3] = line[3][1:-5]
            qnode, label = line[1], line[3]
    #         print(qnode, label)
            if qnode in node_set:
                labels_dict[qnode] = label
    return labels_dict

In [12]:

def get_labels_n_desc(node_set):
    labels_dict = get_labels(node_set)
    first_line = True
    with gzip.open(DESCRIPTIONS_FILE, 'r') as labelsFile:
        firstLine = True
        for line in tqdm(labelsFile, total=34700043):
            if firstLine:
                firstLine = False
                continue
            line = line.decode('utf-8').strip().split('\t')
            line[3] = line[3][1:-5]
            qnode, label = line[1], line[3]
    #         print(qnode, label)
            if qnode in node_set:
                if qnode in labels_dict:
                    labels_dict[qnode] += ' ' + label
                else:
                    raise "Label not present"
    return labels_dict

Complex + Transe Embeddings Generation¶

In [20]:

complex_emb_dict = json.load(open(COMPLEX_EMB_FINAL_FILE))

In [21]:

first_line = True
complex_emb_dict = {}
with open(COMPLEX_EMB_SOURCE_FILE) as complex_file:
    for line in tqdm(complex_file, total=53002671):
        if first_line:
            first_line = False
            continue
        line = line.strip().split()
        if line[0] in allNodes and line[0] not in complex_emb_dict:
            complex_emb_dict[line[0]] = [float(elem) for elem in line[1:]]

0it [00:00, ?it/s]

In [22]:

len(complex_emb_dict)

Out[22]:

In [23]:

json.dump(complex_emb_dict, open(COMPLEX_EMB_FINAL_FILE, 'w'))

In [24]:

transe_emb_dict = json.load(open(TRANSE_EMB_FINAL_FILE))

In [25]:

first_line = True
transe_emb_dict = {}
with open(TRANSE_EMB_SOURCE_FILE) as complex_file:
    for line in tqdm(complex_file, total=53002671):
        if first_line:
            first_line = False
            continue
        line = line.strip().split()
        if line[0] in allNodes and line[0] not in transe_emb_dict:
            transe_emb_dict[line[0]] = [float(elem) for elem in line[1:]]
len(transe_emb_dict)

0it [00:00, ?it/s]

Out[25]:

In [26]:

json.dump(transe_emb_dict, open(TRANSE_EMB_FINAL_FILE, 'w'))

In [ ]:

In [5]:

# p279ChildPar = pd.read_csv('../data/Master_P279_dataset/P279ChildPar_transP279_filtered_baremetal.csv')
# print(len(set(p279ChildPar.node1.to_list() 
#                         + p279ChildPar.node2.to_list())))

# # Load complex, transe embedding files and entity names file
# compf = h5py.File('../data/complTrans/complEx.h5','r')
# transf = h5py.File('../data/complTrans/transE.h5','r')
# ent_names = json.load(open('../data/complTrans/entity_names_all_0.json'))
# allNodes = get_all_nodes()
# # json.dump(list(p279QnodesList), open('../data/Master_P279_dataset/P279ChildPar_transP279_filtered_nodes.json', 'w'))

# complexEmb = {qnode: emb for emb, qnode in zip(compf['embeddings'], ent_names) if qnode in allNodes}
# transeEmb = {qnode: emb for emb, qnode in zip(transf['embeddings'], ent_names) if qnode in allNodes}
# print(f"Out of {len(ent_names)} embeddings, retaining {len(transeEmb)} embeddings")

# def serialize_embedding_dict(embed_dict):
#     for key2 in embed_dict.keys():
#         embed_dict[key2] = embed_dict[key2].tolist() if type(embed_dict[key2]) != list else embed_dict[key2]
#     return embed_dict

# json.dump(serialize_embedding_dict(complexEmb),open('../data/Master_P279_dataset/embeddings/complex_orig_embedding_dict.json','w'))
# json.dump(serialize_embedding_dict(transeEmb),open('../data/Master_P279_dataset/embeddings/transe_orig_embedding_dict.json','w'))
# # complexEmb = json.load(open('../data/Master_P279_dataset/embeddings/complex_orig_embedding_dict.json'))
# # transeEmb = json.load(open('../data/Master_P279_dataset/embeddings/transe_orig_embedding_dict.json'))

Text Embedding¶

In [34]:

first_line = True
text_emb_dict = {}
with gzip.open(TEXT_EMB_SOURCE_FILE) as file:
    for line in tqdm(file):
        if first_line:
            first_line = False
            continue
        line = line.decode('utf-8').strip().split('\t')
        if line[1] == 'text_embedding' and line[0] in allNodes:
            text_emb_dict[line[0]] = [float(elem) for elem in line[2].split(',')]

0it [00:00, ?it/s]

In [37]:

json.dump(text_emb_dict, open(TEXT_EMB_FINAL_FILE, 'w'))

In [20]:

# text_emb_dict = json.load(open('../data/embeddings/archived/text_7_props_orig_embedding_dict.json.old'))

In [40]:

missing_nodes = []
for node in tqdm(allNodes):
    if node not in text_emb_dict:
        missing_nodes.append(node)

  0%|          | 0/241698 [00:00<?, ?it/s]

In [41]:

missing_nodes_set = set(missing_nodes)
new_file = []
with gzip.open(CLAIMS_FILE, 'r') as all_claims_file:
    firstLine = True
    for ogline in tqdm(all_claims_file, total=491297976):
        if firstLine:
            firstLine = False
            continue
        line = ogline.decode('utf-8').strip().split('\t')
        line[3] = line[3][1:-5]
        qnode, label = line[1], line[3]
#         print(qnode, label)
        if qnode in missing_nodes_set:
            new_file.append(ogline)

0it [00:00, ?it/s]

In [52]:

allowed_props = set(['P31', 'P279', 'P106', 'P39', 'P1382', 'P373', 'P452'])
new_file1 = []
for line in new_file:
    line1 = line.decode('utf-8').strip().split('\t')
    if line1[2] in allowed_props:
        new_file1.append(line.decode('utf-8'))
new_file1 = ['id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n'] + new_file1

In [53]:

with open('../output/text-embeddings/missing_nodes.tsv', 'w') as f:
    f.writelines(new_file1)

In [14]:

#     --model sentence-transformers/roberta-large-nli-mean-tokens   \
q1 = "~/miniconda3/envs/kgtkEnv2/bin/kgtk text-embedding -i ../output/text-embeddings/missing_nodes.tsv   \
    --model roberta-large-nli-mean-tokens   \
    --property-labels-file " + LABELS_FILE + "  --debug   \
    --isa-properties P31 P279 P106 P39 P1382 P373 P452   \
    --save-embedding-sentence > ../output/text-embeddings/P279-text-embedding-7-props-missing-qnodes.tsv"
os.system(q1 + " ")

Out[14]:

In [15]:

text7_missingnodes = pd.read_csv("../output/text-embeddings/P279-text-embedding-7-props-missing-qnodes.tsv", sep='\t')
text7_missingnodes = text7_missingnodes[text7_missingnodes.property == 'text_embedding']

In [16]:

text7_missingnodes['value'] = text7_missingnodes['value'].apply(lambda p: [float(p1) for p1 in p.split(',')])

In [17]:

text7EmbDict = {row['node']: row['value'] for _,row in text7_missingnodes.iterrows()}

In [21]:

for key in text7EmbDict.keys():
    if key not in text_emb_dict:
        text_emb_dict[key] = text7EmbDict[key]

In [22]:

json.dump(text_emb_dict, open(TEXT_EMB_FINAL_FILE, 'w'))

Old technique follows¶

In [2]:

p279ChildPar = pd.read_csv('../data/Master_P279_dataset/P279ChildPar_transP279_filtered_baremetal.csv')
p279QnodesList = list(set(p279ChildPar.node1.to_list() + p279ChildPar.node2.to_list()))

In [5]:

missingNodes = allNodes - set(p279ChildPar.node1.to_list() + p279ChildPar.node2.to_list())

In [6]:

len(missingNodes)

Out[6]:

In [8]:

# Split main file into sub-files for groups of properties for multi-processing

# bsize = len(p279QnodesList) // 250
# cnt = 1
# for i in range(0, len(p279QnodesList), bsize):
#     q1 = "kgtk filter -i ../data/wikidataos.for.text-embedding.tsv --word-separator '|' -p '" + '|'.join(p279QnodesList[i:i+bsize]) + ";;' -o ../data/Master_P279_dataset/output/wikidataos.for.text-embedding-ext-" + str(cnt) + ".tsv -v True"
# #     print(len(q1))
#     cnt += 1
# #     print(q1)
#     os.system("screen -dm " + q1)

  0%|          | 0/38 [00:00<?, ?it/s]

In [11]:

for cnt in range(290,503):
    os.system("rm ../data/Master_P279_dataset/output/wikidataos.for.text-embedding-ext-" + str(cnt) + ".tsv")

In [8]:

# # Split main file into sub-files for groups of properties for multi-processing
missingNodes = list(missingNodes)
bsize = 1000
cnt = 252
for i in tqdm(range(0, len(missingNodes), bsize)):
    q1 = "kgtk filter -i ../data/wikidataos.for.text-embedding.tsv --word-separator '|' -p '" + '|'.join(missingNodes[i:i+bsize]) + ";;' -o ../data/Master_P279_dataset/output/wikidataos.for.text-embedding-ext-" + str(cnt) + ".tsv -v True"
#     print(len(q1))
    cnt += 1
#     print(q1)
    os.system("screen -dm " + q1)

  0%|          | 0/38 [00:00<?, ?it/s]

In [12]:

def checkIfFileContainsLines(file):
    with open(file) as f:
        for line in f:
            return True
    return False

In [13]:

def countFileLines(file):
    count = 0
    with open(file) as f:
        for line in f:
            count += 1
    return count

In [ ]:

import time
from os.path import exists

runCommCnt = 1
# 252
for cnt in tqdm(range(252,290)):
    if exists("../data/Master_P279_dataset/output/P279-text-embedding-7-props-part-" + str(cnt) + ".tsv") and countFileLines("../data/Master_P279_dataset/output/P279-text-embedding-7-props-part-" + str(cnt) + ".tsv") == 4097:
        continue
    q1 = ""
#     if cnt % 10 == 0:
#         q1 += "sleep 20m; "
    q1 = "~/miniconda3/envs/kgtkEnv2/bin/kgtk text-embedding -i ../data/Master_P279_dataset/output/wikidataos.for.text-embedding-ext-" + str(cnt) + ".tsv   \
        --model sentence-transformers/all-distilroberta-v1   \
        --property-labels-file ../data/labels.en.tsv  --debug   \
        --isa-properties P31 P279 P106 P39 P1382 P373 P452   \
        --save-embedding-sentence > ../data/Master_P279_dataset/output/P279-text-embedding-7-props-part-" + str(cnt) + ".tsv"
    print(cnt)
    runCommCnt += 1
    os.system(q1 + " &amp;")
    if runCommCnt % 15 == 0:
        time.sleep(11*60)
    

In [ ]:

for cnt in tqdm(range(1,290)):
    if countFileLines("../data/Master_P279_dataset/output/P279-text-embedding-7-props-part-" + str(cnt) + ".tsv") != 4097:
        print(cnt)

In [ ]:

import time
from os.path import exists

# roberta-large-nli-mean-tokens
runCommCnt = 0
for cnt in tqdm(range(252,290)):
    if exists("../data/Master_P279_dataset/output/P279-text-embedding-2-props-part-" + str(cnt) + ".tsv") and countFileLines("../data/Master_P279_dataset/output/P279-text-embedding-2-props-part-" + str(cnt) + ".tsv") == 4097:
        continue
    q1 = ""
#     if cnt % 10 == 0:
#         q1 += "sleep 20m; "
    q1 += "~/miniconda3/envs/kgtkEnv2/bin/kgtk text-embedding -i ../data/Master_P279_dataset/output/wikidataos.for.text-embedding-ext-" + str(cnt) + ".tsv   \
        --model sentence-transformers/all-distilroberta-v1   \
        --property-labels-file ../data/labels.en.tsv  --debug   \
        --isa-properties P31 P279   \
        --save-embedding-sentence > ../data/Master_P279_dataset/output/P279-text-embedding-2-props-part-" + str(cnt) + ".tsv"
    print(cnt)
    runCommCnt += 1
    os.system(q1 + " &amp;")
    if runCommCnt % 15 == 0:
        time.sleep(13*60)
    

In [71]:

p279QnodesList = set(json.load(open('../data/Master_P279_dataset/P279ChildPar_transP279_filtered_nodes.json')))

In [38]:

# temp = pd.read_csv('../data/Master_P279_dataset/output/P279-text-embedding-2-props-part-1.tsv', sep='\t')

In [39]:

# temp.head()

Out[39]:

	node	property	value
0	Q99738027	text_embedding	0.74755263,1.6350263,-0.73952675,1.0463063,-0....
1	Q99738027	embedding_sentence	night shift, work shift during nighttime hours...
2	Q99228502	text_embedding	0.25261465,0.06285462,0.029052094,0.50796187,0...
3	Q99228502	embedding_sentence	avenue, thoroughfare named \"avenue\" is thoro...
4	Q98970128	text_embedding	0.11887096,0.8598291,0.4446009,-0.5038472,-0.9...

In [ ]:

text2EmbArr = []
for i in tqdm(range(1, 290)):
    if not(checkIfFileContainsLines('../data/Master_P279_dataset/output/P279-text-embedding-2-props-part-'+str(i)+'.tsv')):
        continue
    temp = pd.read_csv('../data/Master_P279_dataset/output/P279-text-embedding-2-props-part-'+str(i)+'.tsv', sep='\t')
    temp = temp[temp.property == 'text_embedding']
    text2EmbArr.append(temp)
text2Emb = pd.concat(text2EmbArr)

In [ ]:

text2Emb.head()

In [ ]:

text7EmbArr = []
for i in tqdm(range(1, 290)):
    if not(checkIfFileContainsLines('../data/Master_P279_dataset/output/P279-text-embedding-7-props-part-'+str(i)+'.tsv')):
        continue
    temp = pd.read_csv('../data/Master_P279_dataset/output/P279-text-embedding-7-props-part-'+str(i)+'.tsv', sep='\t')
    temp = temp[temp.property == 'text_embedding']
    text7EmbArr.append(temp)
text7Emb = pd.concat(text7EmbArr)

In [21]:

text2Emb = text2Emb[text2Emb.node.apply(lambda p: p in allNodes)]
text7Emb = text7Emb[text7Emb.node.apply(lambda p: p in allNodes)]

In [22]:

print(f"We have 2prop text embeddings for {len(text2Emb)} nodes and 7prop for {len(text7Emb)} nodes")

We have 2prop text embeddings for 278467 nodes and 7prop for 277587 nodes

In [23]:

text2Emb['value'] = text2Emb['value'].apply(lambda p: [float(p1) for p1 in p.split(',')])
text7Emb['value'] = text7Emb['value'].apply(lambda p: [float(p1) for p1 in p.split(',')])

In [24]:

text2EmbDict = {row['node']: row['value'] for _,row in text2Emb.iterrows()}
text7EmbDict = {row['node']: row['value'] for _,row in text7Emb.iterrows()}

In [25]:

json.dump(text2EmbDict, open('../data/Master_P279_dataset/embeddings/text_2_props_orig_embedding_dict.json', 'w'))
json.dump(text7EmbDict, open('../data/Master_P279_dataset/embeddings/text_7_props_orig_embedding_dict.json', 'w'))

In [ ]:

Abstract Embeddings Generation¶

Downloaded short abstracts file from DBPedia Short Abstracts - 2020.07.01

Then, we extract the abstracts file from the bz2 file using: bzip2 -d short-abstracts_lang=en.ttl.bz2

In [5]:

# cnt = 0
# p1s = []
# p11s = []
# p2s = []
# lines = []
# with open(DBPEDIA_SHORT_ABSTRACTS_TTL, 'r', encoding='utf-8') as f:
#     for line in tqdm(f):
#         p1 = line[:line.find(" ")]
#         p11 = p1[len("<http://dbpedia.org/resource/"):][:-1]
#         line = line[line.find(" ")+1:]
#         p2 = line[:line.find(" ")]
#         line = line[line.find(" ")+1:line.rfind(" ")][1:-4]
#         p1s.append(p1)
#         p11s.append(p11)
#         p2s.append(p2)
#         lines.append(line)
# df1 = pd.DataFrame({'urlComp': p11s, 'url':p1s, 'p2': p2s, 'abstract': lines})
# df1.to_csv(DBPEDIA_SHORT_ABSTRACTS_CSV)

0it [00:00, ?it/s]

In [13]:

df1 = pd.read_csv(DBPEDIA_SHORT_ABSTRACTS_CSV, skiprows=1, skipfooter=1, header=None, engine='python')
df1.columns = ['ignore', 'node1', 'url', 'ignore2', 'abstract']
df1 = df1.set_index('node1')
df1[df1.url.apply(lambda p: 'http://dbpedia.org/resource' not in p)]
print(f"DBPedia dataset has {len(df1)} records with unique {df1.index.nunique()} index values")
df1.loc[df1[df1.index.duplicated()].index]
sitelinksDF = pd.read_csv(SITELINKS_FILE_V2, sep='\t')
sitelinksDF['trimmedNode2'] = sitelinksDF.node2.apply(lambda p: p.split("/")[-1] if p.split("/")[-1] != '' else p.split("/")[-2])
sitelinksDF1 = sitelinksDF[sitelinksDF.label == 'wikipedia_sitelink']
sitelinksDF2 = sitelinksDF1.set_index('trimmedNode2')
print(f"There are {len(sitelinksDF2)} sitelinks present in the dataset corresponding to {sitelinksDF2.node1.nunique()} unique node1s (Qxxx), {sitelinksDF2.index.nunique()} unique labels (text)")

sitelinksDF2.loc[sitelinksDF2[sitelinksDF2.index.duplicated()].index]
sitelinksDF2 = sitelinksDF2[sitelinksDF2.node1.apply(lambda p: p in allNodes)]
labelsDF = pd.read_csv(LABELS_FILE, sep='\t')
labelsDF = labelsDF[labelsDF.node1.apply(lambda p: p in allNodes)]
labelsDict = {row['node1']: row['node2'] for _, row in labelsDF.iterrows()}
descriptionsDF = pd.read_csv(DESCRIPTIONS_FILE, compression='gzip', sep='\t')
descriptionsDF = descriptionsDF[descriptionsDF.node1.apply(lambda p: p in allNodes)]
descDict = {row['node1']: row['node2'] for _, row in descriptionsDF.iterrows()}

sdf_set = set(sitelinksDF2.index.to_list())
df1 = df1[df1.index.map(lambda p: p in sdf_set)]
abstractsDF2 = sitelinksDF2.join(df1).reset_index()
print(f"From {len(abstractsDF2)} Qnodes, there are {abstractsDF2.ignore2.isna().sum()} sitelink Qnodes which do not have a short abstract i.e {len(abstractsDF2) - abstractsDF2.ignore2.isna().sum()} have a short abstract")

abstractsDF2['node1_label'] = abstractsDF2.node1.apply(lambda p: labelsDict[p][1:-4] if p in labelsDict else "")
abstractsDF2['node1_desc'] = abstractsDF2.node1.apply(lambda p: descDict[p][1:-4] if p in descDict else "")
from nltk.tokenize import sent_tokenize
abstractsDF2['abstract_firstSent'] = abstractsDF2.abstract.apply(lambda p: sent_tokenize(str(p))[0] if p else None)

currNodes = set(abstractsDF2.node1.tolist())

correctedRows = []
for key in tqdm(allNodes):
    if key not in currNodes:
        correctedRows.append([None, None, key, None, None, None, None, None, None, labelsDict[key][1:-4] if key in labelsDict else None, descDict[key][1:-4] if key in descDict else None, None])
        
abstractsDF3 = pd.concat([abstractsDF2, pd.DataFrame(correctedRows, columns=abstractsDF2.columns)])

DBPedia dataset has 5732949 records with unique 5732947 index values
There are 8472828 sitelinks present in the dataset corresponding to 8472828 unique node1s (Qxxx), 8401883 unique labels (text)
From 116088 Qnodes, there are 5134 sitelink Qnodes which do not have a short abstract i.e 110954 have a short abstract

  0%|          | 0/241698 [00:00<?, ?it/s]

DBPedia dataset has 5732949 records with unique 5732947 index values There are 8637003 sitelinks present in the dataset corresponding to 8637003 unique node1s (Qxxx), 8563928 unique labels (text) From 116430 Qnodes, there are 5707 sitelink Qnodes which do not have a short abstract i.e 110723 have a short abstract

DBPedia dataset has 5732949 records with unique 5732947 index values There are 8472828 sitelinks present in the dataset corresponding to 8472828 unique node1s (Qxxx), 8401883 unique labels (text)

From 122585 Qnodes, there are 5661 sitelink Qnodes which do not have a short abstract i.e 116924 have a short abstract

In [14]:

def combineAbsLabDesc(row, parameter):
    if not(pd.isna(row[parameter])) and row[parameter] != 'nan' and row[parameter] != "":
        return row[parameter]
    elif row['node1_label'] != '' and row['node1_desc'] != '' and not(pd.isna(row['node1_label'])) and not(pd.isna(row['node1_desc'])):
        return row['node1_label'] + ' ' + row['node1_desc']
    elif row['node1_label'] != '' and not(pd.isna(row['node1_label'])):
        return row['node1_label']
    else:
        return None
    
abstractsDF3['abstract'] = abstractsDF3.apply(combineAbsLabDesc, axis=1, args=('abstract',))
abstractsDF3['abstract_firstSent'] = abstractsDF3.apply(combineAbsLabDesc, axis=1, args=('abstract_firstSent',))
abstractsDF3 = abstractsDF3[~abstractsDF3.abstract.isna()]
abstractsDF3 = abstractsDF3.reset_index()
abstractsDF3 = abstractsDF3.drop(columns=['level_0']).reset_index()
abstractsDF3 = abstractsDF3.drop(columns=['level_0'])
print(len(abstractsDF3))
abstractsDF3.to_csv(ABSTRACTS_INTERMEDIATE_FILE)

In [ ]:

In [15]:

abstractsDF2 = pd.read_csv(ABSTRACTS_INTERMEDIATE_FILE)
len(abstractsDF2)

/nas/home/kshenoy/miniconda3/envs/kgtkEnv/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3165: DtypeWarning: Columns (1,2,4,5,7,8) have mixed types.Specify dtype option on import or set low_memory=False.
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,

Out[15]:

In [17]:

# abstractsDF2[abstractsDF2.abstract == (abstractsDF2.node1_label +  ' ' + abstractsDF2.node1_desc)].to_csv('../data/Master_P279_dataset/temppppp.csv')

In [15]:

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from time import time
import pandas as pd

def getSentEmbeddings(valSeries, modelName):
    model = SentenceTransformer(modelName, device='cuda:2')
    start = time()
    encodings = model.encode(valSeries.to_list(), show_progress_bar=True, batch_size=1000)
    print(time()-start,'s')
    return encodings

In [19]:

modelName = 'sentence-transformers/all-distilroberta-v1'

In [20]:

absEmbSeries = getSentEmbeddings(abstractsDF2.abstract, modelName)
absFirstSentEmbSeries = getSentEmbeddings(abstractsDF2.abstract_firstSent, modelName)

Exception when trying to download https://sbert.net/models/sentence-transformers/all-distilroberta-v1.zip. Response 404
SentenceTransformer-Model https://sbert.net/models/sentence-transformers/all-distilroberta-v1.zip not found. Try to create it from scratch
Try to create Transformer Model sentence-transformers/all-distilroberta-v1 with mean pooling

Batches:   0%|          | 0/242 [00:00<?, ?it/s]

316.05097579956055 s

Exception when trying to download https://sbert.net/models/sentence-transformers/all-distilroberta-v1.zip. Response 404
SentenceTransformer-Model https://sbert.net/models/sentence-transformers/all-distilroberta-v1.zip not found. Try to create it from scratch
Try to create Transformer Model sentence-transformers/all-distilroberta-v1 with mean pooling

Batches:   0%|          | 0/242 [00:00<?, ?it/s]

260.16796946525574 s

In [21]:

absEmbDict = {node: emb.tolist() for node, emb in zip(abstractsDF2.node1.to_list(), absEmbSeries)}
absFirstSentEmbDict = {node: emb.tolist() for node, emb in zip(abstractsDF2.node1.to_list(), absFirstSentEmbSeries)}

In [22]:

json.dump(absEmbDict, open(ABS_EMB_FINAL_FILE, 'w'))
json.dump(absFirstSentEmbDict, open(ABS_FIRST_SENT_EMB_FINAL_FILE, 'w'))

H,A,S embeddings¶

Fetched from sita /data02/profiling/kgtk/entity_profiling/output/wikidata-20210215-dwd/

In [23]:

def store_relevant_embeddings(wvec, fname):
    tempEmb = {key: wvec[key] for key in wvec.index_to_key if key in allNodes}
    print(f"Original Length: {len(wvec.index_to_key)}, No. of keys stored: {len(tempEmb)}")
    json.dump(serializeEmbeddingDict(tempEmb),open(fname, 'w'))

In [24]:

from gensim.models import KeyedVectors, Word2Vec

In [25]:

%%time

a_key_vec = KeyedVectors.load(A_SOURCE_FILE)
store_relevant_embeddings(a_key_vec, A_OP_FILE)

h_key_vec = KeyedVectors.load(H_SOURCE_FILE)
store_relevant_embeddings(h_key_vec, H_OP_FILE)

s_key_vec = KeyedVectors.load(S_SOURCE_FILE)
store_relevant_embeddings(s_key_vec, S_OP_FILE)

Original Length: 12106870, No. of keys stored: 27876
Original Length: 19593942, No. of keys stored: 166201
Original Length: 39030788, No. of keys stored: 116993
CPU times: user 5min 54s, sys: 2min 1s, total: 7min 56s
Wall time: 18min 51s

Label Embeddings¶

In [ ]:

labels_dict = get_labels(allNodes)

In [16]:

modelName = 'sentence-transformers/all-distilroberta-v1'
embs = getSentEmbeddings(pd.Series(labels_dict.values()), modelName)

Exception when trying to download https://sbert.net/models/sentence-transformers/all-distilroberta-v1.zip. Response 404
SentenceTransformer-Model https://sbert.net/models/sentence-transformers/all-distilroberta-v1.zip not found. Try to create it from scratch
Try to create Transformer Model sentence-transformers/all-distilroberta-v1 with mean pooling

Batches:   0%|          | 0/242 [00:00<?, ?it/s]

98.77918219566345 s

In [17]:

labels_emb_dict = {k:v.tolist() for k, v in (zip(labels_dict.keys(), embs))}

In [18]:

json.dump(labels_emb_dict, open(LABELS_EMB_FINAL_FILE, 'w'))

In [ ]:

Label + Desc Embeddings¶

In [27]:

labels_desc_dict = get_labels_n_desc(allNodes)
modelName = 'sentence-transformers/all-distilroberta-v1'
embs = getSentEmbeddings(pd.Series(labels_desc_dict.values()), modelName)

  0%|          | 0/41845781 [00:00<?, ?it/s]

  0%|          | 0/34700043 [00:00<?, ?it/s]

Exception when trying to download https://sbert.net/models/sentence-transformers/all-distilroberta-v1.zip. Response 404
SentenceTransformer-Model https://sbert.net/models/sentence-transformers/all-distilroberta-v1.zip not found. Try to create it from scratch
Try to create Transformer Model sentence-transformers/all-distilroberta-v1 with mean pooling

Batches:   0%|          | 0/242 [00:00<?, ?it/s]

203.93888425827026 s

In [28]:

labels_desc_emb_dict = {k:v.tolist() for k, v in (zip(labels_desc_dict.keys(), embs))}

In [29]:

json.dump(labels_desc_emb_dict, open(LABELS_DESC_EMB_FINAL_FILE, 'w'))

Direct sim files¶

In [46]:

import requests
from tqdm.notebook import tqdm
import json
from joblib import Parallel, delayed
import sys

word_sim_df = pd.read_csv(WORDSIM_FILE)

def fetchSim(row, similarity_type):
    resp = requests.get("https://kgtk.isi.edu/similarity_api?q1="+row['word1_kg_id']+"&q2="+row['word2_kg_id']+"&embedding_type="+similarity_type)
    try:
        row['embedding_cos_sim'] = float(resp.json()['similarity']) if resp else None
    except Exception as exc:
        print(exc)
        print(f"Resp not found for {row['node1']}, {row['node2']}")
        row['embedding_cos_sim'] = None
    row['Resp_code'] = resp
    return row

word_sim_class_sim_df = pd.DataFrame(Parallel(n_jobs=5)(delayed(fetchSim)(row, 'class') for _, row in tqdm(word_sim_df.iterrows(), total=len(word_sim_df))))
word_sim_jc_sim_df = pd.DataFrame(Parallel(n_jobs=5)(delayed(fetchSim)(row, 'jc') for _, row in tqdm(word_sim_df.iterrows(), total=len(word_sim_df))))
word_sim_top_sim_df = pd.DataFrame(Parallel(n_jobs=5)(delayed(fetchSim)(row, 'topsim') for _, row in tqdm(word_sim_df.iterrows(), total=len(word_sim_df))))

word_sim_class_sim_df['embedding_cos_sim'] = word_sim_class_sim_df['embedding_cos_sim'].apply(lambda p: 4 - 3 * abs(p) if p is not None else p)
word_sim_class_sim_df['embedding_na'] = word_sim_class_sim_df['embedding_cos_sim'].isna()
word_sim_class_sim_df['embedding_cos_sim'].fillna(word_sim_class_sim_df['embedding_cos_sim'].mean(skipna=True), inplace=True)

word_sim_jc_sim_df['embedding_cos_sim'] = word_sim_jc_sim_df['embedding_cos_sim'].apply(lambda p: 4 - 3 * abs(p) if p is not None else p)
word_sim_jc_sim_df['embedding_na'] = word_sim_jc_sim_df['embedding_cos_sim'].isna()
word_sim_jc_sim_df['embedding_cos_sim'].fillna(word_sim_jc_sim_df['embedding_cos_sim'].mean(skipna=True), inplace=True)

word_sim_top_sim_df['embedding_cos_sim'] = word_sim_top_sim_df['embedding_cos_sim'].apply(lambda p: 4 - 3 * abs(p) if p is not None else p)
word_sim_top_sim_df['embedding_na'] = word_sim_top_sim_df['embedding_cos_sim'].isna()
word_sim_top_sim_df['embedding_cos_sim'].fillna(word_sim_top_sim_df['embedding_cos_sim'].mean(skipna=True), inplace=True)

word_sim_class_sim_df.to_csv(WORDSIM_CLASS_SIM_FILE, index=None)
word_sim_jc_sim_df.to_csv(WORDSIM_JC_SIM_FILE, index=None)
word_sim_top_sim_df.to_csv(WORDSIM_TOP_SIM_FILE, index=None)

  0%|          | 0/334 [00:00<?, ?it/s]

  0%|          | 0/334 [00:00<?, ?it/s]

  0%|          | 0/334 [00:00<?, ?it/s]

In [25]:

word_sim_class_sim_df.head()

Out[25]:

	Word 1	Word 2	ID	H_Sim	H_Dim	F_Sim	F_Dim	N_Sim	N_Dim	D_Sim	...	P_Dim	Avg	Stdev	H_orig	H_reversed	word1_kg_id	word2_kg_id	category	embedding_cos_sim	Resp_code
0	Arafat	peace	8	3	D	4	NaN	3	U	4	...	NaN	3.6	0.547723	2.1250	7.8750	Q34211	Q454	U	3.982734	<Response [200]>
1	Arafat	terror	9	3	D	4	NaN	3	U	4	...	NaN	3.6	0.547723	3.0625	6.9375	Q34211	Q13648784	U	3.969884	<Response [200]>
2	FBI	fingerprint	109	3	D	4	NaN	4	NaN	3	...	NaN	3.6	0.547723	4.0625	5.9375	Q8333	Q178022	U	4.000000	<Response [200]>
3	FBI	investigation	110	3	U	3	U	3	U	3	...	u	3.0	0.000000	5.0625	4.9375	Q8333	Q21004260	M	3.951077	<Response [200]>
4	Harvard	Yale	137	2	S	3	S	2	S	2	...	s	2.2	0.447214	4.8750	5.1250	Q13371	Q49112	M	1.264601	<Response [200]>

5 rows × 22 columns

In [41]:

import requests
from tqdm.notebook import tqdm
import json
from joblib import Parallel, delayed
import sys

word_sim_df = pd.read_csv(WORDSIM_OLD_FILE)

def fetchSim(row, similarity_type):
    resp = requests.get("https://kgtk.isi.edu/similarity_api?q1="+row['word1_kg_id']+"&q2="+row['word2_kg_id']+"&embedding_type="+similarity_type)
    try:
        row['embedding_cos_sim'] = float(resp.json()['similarity']) if resp else None
    except Exception as exc:
        print(exc)
        print(f"Resp not found for {row['node1']}, {row['node2']}")
        row['embedding_cos_sim'] = None
    row['Resp_code'] = resp
    return row

word_sim_class_sim_df = pd.DataFrame(Parallel(n_jobs=5)(delayed(fetchSim)(row, 'class') for _, row in tqdm(word_sim_df.iterrows(), total=len(word_sim_df))))
word_sim_jc_sim_df = pd.DataFrame(Parallel(n_jobs=5)(delayed(fetchSim)(row, 'jc') for _, row in tqdm(word_sim_df.iterrows(), total=len(word_sim_df))))
word_sim_top_sim_df = pd.DataFrame(Parallel(n_jobs=5)(delayed(fetchSim)(row, 'topsim') for _, row in tqdm(word_sim_df.iterrows(), total=len(word_sim_df))))

word_sim_class_sim_df['embedding_cos_sim'] = word_sim_class_sim_df['embedding_cos_sim'].apply(lambda p: 4 - 3 * abs(p) if p is not None else p)
word_sim_class_sim_df['embedding_na'] = word_sim_class_sim_df['embedding_cos_sim'].isna()
word_sim_class_sim_df['embedding_cos_sim'].fillna(word_sim_class_sim_df['embedding_cos_sim'].mean(skipna=True), inplace=True)

word_sim_jc_sim_df['embedding_cos_sim'] = word_sim_jc_sim_df['embedding_cos_sim'].apply(lambda p: 4 - 3 * abs(p) if p is not None else p)
word_sim_jc_sim_df['embedding_na'] = word_sim_jc_sim_df['embedding_cos_sim'].isna()
word_sim_jc_sim_df['embedding_cos_sim'].fillna(word_sim_jc_sim_df['embedding_cos_sim'].mean(skipna=True), inplace=True)

word_sim_top_sim_df['embedding_cos_sim'] = word_sim_top_sim_df['embedding_cos_sim'].apply(lambda p: 4 - 3 * abs(p) if p is not None else p)
word_sim_top_sim_df['embedding_na'] = word_sim_top_sim_df['embedding_cos_sim'].isna()
word_sim_top_sim_df['embedding_cos_sim'].fillna(word_sim_top_sim_df['embedding_cos_sim'].mean(skipna=True), inplace=True)


word_sim_class_sim_df.to_csv(WORDSIM_OLD_CLASS_SIM_FILE, index=None)
word_sim_jc_sim_df.to_csv(WORDSIM_OLD_JC_SIM_FILE, index=None)
word_sim_top_sim_df.to_csv(WORDSIM_OLD_TOP_SIM_FILE, index=None)

  0%|          | 0/349 [00:00<?, ?it/s]

  0%|          | 0/349 [00:00<?, ?it/s]

  0%|          | 0/349 [00:00<?, ?it/s]

In [44]:

import requests
from tqdm.notebook import tqdm
import json
from joblib import Parallel, delayed
import sys

word_sim_df = pd.read_csv(DBPEDIA_MC_30_FINAL_FILE)

def fetchSim(row, similarity_type):
    resp = requests.get("https://kgtk.isi.edu/similarity_api?q1="+row['word1_kg_id']+"&q2="+row['word2_kg_id']+"&embedding_type="+similarity_type)
    try:
        row['embedding_cos_sim'] = float(resp.json()['similarity']) if resp else None
    except Exception as exc:
        print(exc)
        print(f"Resp not found for {row['node1']}, {row['node2']}")
        row['embedding_cos_sim'] = None
    row['Resp_code'] = resp
    return row

word_sim_class_sim_df = pd.DataFrame(Parallel(n_jobs=5)(delayed(fetchSim)(row, 'class') for _, row in tqdm(word_sim_df.iterrows(), total=len(word_sim_df))))
word_sim_jc_sim_df = pd.DataFrame(Parallel(n_jobs=5)(delayed(fetchSim)(row, 'jc') for _, row in tqdm(word_sim_df.iterrows(), total=len(word_sim_df))))
word_sim_top_sim_df = pd.DataFrame(Parallel(n_jobs=5)(delayed(fetchSim)(row, 'topsim') for _, row in tqdm(word_sim_df.iterrows(), total=len(word_sim_df))))

word_sim_class_sim_df['embedding_cos_sim'] = word_sim_class_sim_df['embedding_cos_sim'].apply(lambda p: 4 - 3 * abs(p) if p is not None else p)
word_sim_class_sim_df['embedding_na'] = word_sim_class_sim_df['embedding_cos_sim'].isna()
word_sim_class_sim_df['embedding_cos_sim'].fillna(word_sim_class_sim_df['embedding_cos_sim'].mean(skipna=True), inplace=True)

word_sim_jc_sim_df['embedding_cos_sim'] = word_sim_jc_sim_df['embedding_cos_sim'].apply(lambda p: 4 - 3 * abs(p) if p is not None else p)
word_sim_jc_sim_df['embedding_na'] = word_sim_jc_sim_df['embedding_cos_sim'].isna()
word_sim_jc_sim_df['embedding_cos_sim'].fillna(word_sim_jc_sim_df['embedding_cos_sim'].mean(skipna=True), inplace=True)

word_sim_top_sim_df['embedding_cos_sim'] = word_sim_top_sim_df['embedding_cos_sim'].apply(lambda p: 4 - 3 * abs(p) if p is not None else p)
word_sim_top_sim_df['embedding_na'] = word_sim_top_sim_df['embedding_cos_sim'].isna()
word_sim_top_sim_df['embedding_cos_sim'].fillna(word_sim_top_sim_df['embedding_cos_sim'].mean(skipna=True), inplace=True)


word_sim_class_sim_df.to_csv(DBPEDIA_MC_30_CLASS_SIM_FILE, index=None)
word_sim_jc_sim_df.to_csv(DBPEDIA_MC_30_JC_SIM_FILE, index=None)
word_sim_top_sim_df.to_csv(DBPEDIA_MC_30_TOP_SIM_FILE, index=None)

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

In [45]:

import requests
from tqdm.notebook import tqdm
import json
from joblib import Parallel, delayed
import sys

word_sim_df = pd.read_csv(DBPEDIA_RG_65_FINAL_FILE)

def fetchSim(row, similarity_type):
    resp = requests.get("https://kgtk.isi.edu/similarity_api?q1="+row['word1_kg_id']+"&q2="+row['word2_kg_id']+"&embedding_type="+similarity_type)
    try:
        row['embedding_cos_sim'] = float(resp.json()['similarity']) if resp else None
    except Exception as exc:
        print(exc)
        print(f"Resp not found for {row['node1']}, {row['node2']}")
        row['embedding_cos_sim'] = None
    row['Resp_code'] = resp
    return row

word_sim_class_sim_df = pd.DataFrame(Parallel(n_jobs=5)(delayed(fetchSim)(row, 'class') for _, row in tqdm(word_sim_df.iterrows(), total=len(word_sim_df))))
word_sim_jc_sim_df = pd.DataFrame(Parallel(n_jobs=5)(delayed(fetchSim)(row, 'jc') for _, row in tqdm(word_sim_df.iterrows(), total=len(word_sim_df))))
word_sim_top_sim_df = pd.DataFrame(Parallel(n_jobs=5)(delayed(fetchSim)(row, 'topsim') for _, row in tqdm(word_sim_df.iterrows(), total=len(word_sim_df))))

word_sim_class_sim_df['embedding_cos_sim'] = word_sim_class_sim_df['embedding_cos_sim'].apply(lambda p: 4 - 3 * abs(p) if p is not None else p)
word_sim_class_sim_df['embedding_na'] = word_sim_class_sim_df['embedding_cos_sim'].isna()
word_sim_class_sim_df['embedding_cos_sim'].fillna(word_sim_class_sim_df['embedding_cos_sim'].mean(skipna=True), inplace=True)

word_sim_jc_sim_df['embedding_cos_sim'] = word_sim_jc_sim_df['embedding_cos_sim'].apply(lambda p: 4 - 3 * abs(p) if p is not None else p)
word_sim_jc_sim_df['embedding_na'] = word_sim_jc_sim_df['embedding_cos_sim'].isna()
word_sim_jc_sim_df['embedding_cos_sim'].fillna(word_sim_jc_sim_df['embedding_cos_sim'].mean(skipna=True), inplace=True)

word_sim_top_sim_df['embedding_cos_sim'] = word_sim_top_sim_df['embedding_cos_sim'].apply(lambda p: 4 - 3 * abs(p) if p is not None else p)
word_sim_top_sim_df['embedding_na'] = word_sim_top_sim_df['embedding_cos_sim'].isna()
word_sim_top_sim_df['embedding_cos_sim'].fillna(word_sim_top_sim_df['embedding_cos_sim'].mean(skipna=True), inplace=True)


word_sim_class_sim_df.to_csv(DBPEDIA_RG_65_CLASS_SIM_FILE, index=None)
word_sim_jc_sim_df.to_csv(DBPEDIA_RG_65_JC_SIM_FILE, index=None)
word_sim_top_sim_df.to_csv(DBPEDIA_RG_65_TOP_SIM_FILE, index=None)

  0%|          | 0/34 [00:00<?, ?it/s]

  0%|          | 0/34 [00:00<?, ?it/s]

  0%|          | 0/34 [00:00<?, ?it/s]

Summary of embeddings¶

In [12]:

p279QnodesList = set(json.load(open('../data/Master_P279_dataset/P279ChildPar_transP279_filtered_nodes.json')))

complexEmb = json.load(open('../data/Master_P279_dataset/masterComplexEmb.json'))
transeEmb = json.load(open('../data/Master_P279_dataset/masterTranseEmb.json'))

text2Emb = json.load(open('../data/Master_P279_dataset/text2Emb.json'))
text7Emb = json.load(open('../data/Master_P279_dataset/text7Emb.json'))

abstractEmb = json.load(open('../data/Master_P279_dataset/abstractEmb.json'))
abstractFirstSentEmb = json.load(open('../data/Master_P279_dataset/abstractFirstSentEmb.json'))

In [18]:

json.dump({key:val for key, val in complexEmb.items() if key in p279QnodesList}, open('../data/Master_P279_dataset/embeddings/complex_orig_embedding_dict.json', 'w'))
json.dump({key:val for key, val in transeEmb.items() if key in p279QnodesList}, open('../data/Master_P279_dataset/embeddings/transe_orig_embedding_dict.json', 'w'))
json.dump({key:val for key, val in text2Emb.items() if key in p279QnodesList}, open('../data/Master_P279_dataset/embeddings/text_2_props_orig_embedding_dict.json', 'w'))
json.dump({key:val for key, val in text7Emb.items() if key in p279QnodesList}, open('../data/Master_P279_dataset/embeddings/text_7_props_orig_embedding_dict.json', 'w'))
json.dump({key:val for key, val in abstractEmb.items() if key in p279QnodesList}, open('../data/Master_P279_dataset/embeddings/abstract_orig_embedding_dict.json', 'w'))
json.dump({key:val for key, val in abstractFirstSentEmb.items() if key in p279QnodesList}, open('../data/Master_P279_dataset/embeddings/abstract_first_sent_orig_embedding_dict.json', 'w'))

In [13]:

def countOverlap(source, target):
    cnt = 0
    for key1 in source:
        if key1 in target:
            cnt += 1
    return cnt

In [14]:

summArr = []
cnt = countOverlap(complexEmb, p279QnodesList)
summArr.append(['complex', cnt, cnt / len(p279QnodesList) * 100])

cnt = countOverlap(transeEmb, p279QnodesList)
summArr.append(['transe', cnt, cnt / len(p279QnodesList) * 100])

cnt = countOverlap(text2Emb, p279QnodesList)
summArr.append(['text2', cnt, cnt / len(p279QnodesList) * 100])

cnt = countOverlap(text7Emb, p279QnodesList)
summArr.append(['text7', cnt, cnt / len(p279QnodesList) * 100])

cnt = countOverlap(abstractEmb, p279QnodesList)
summArr.append(['abstract', cnt, cnt / len(p279QnodesList) * 100])

cnt = countOverlap(abstractFirstSentEmb, p279QnodesList)
summArr.append(['abstractFirstSent', cnt, cnt / len(p279QnodesList) * 100])

In [16]:

len(p279QnodesList)

Out[16]:

In [15]:

pd.DataFrame(summArr, columns=['embedding', 'count', 'Coverage Percentage'])

Out[15]:

	embedding	count	Coverage Percentage
0	complex	238448	99.815395
1	transe	238448	99.815395
2	text2	238889	100.000000
3	text7	238889	100.000000
4	abstract	105828	44.300072
5	abstractFirstSent	105828	44.300072

Embeddings correction¶

In [6]:

masterEmbedDictMaster = {}
subsetEmbedDictMaster = {}

In [7]:

masterEmbedKeys = ['text_7_props', 'text_2_props', 'complex', 'transe', 'abstract', 'abstract_first_sent']
for key1 in masterEmbedKeys:
    masterEmbedDictMaster[key1] = json.load(open('../data/Master_P279_dataset/embeddings/'+key1+'_orig_embedding_dict.json'))

In [6]:

subsetEmbedKeys = ['text_7props', 'text_2props', 'complex', 'transe', 'abstract', 'abstract_first_sent']
for key1 in subsetEmbedKeys:
    subsetEmbedDictMaster[key1] = json.load(open('../data/orig_embeddings/'+key1+'_original_embeddings_dict.json'))

In [8]:

wordSim353AnnotDF_New = pd.read_csv('../data/wordsim353_with_r3.csv')

In [9]:

wordsim_pairs = {(row['word1_kg_id'], row['word2_kg_id']) for _, row in wordSim353AnnotDF_New.iterrows()}

In [15]:

for key1 in subsetEmbedKeys:
    print(f"Pair Coverage by {key1} embeddings created for 19k retrofitting: {sum([row[0] in subsetEmbedDictMaster[key1] and row[1] in subsetEmbedDictMaster[key1] for row in wordsim_pairs])}")

Pair Coverage by text_7props embeddings created for 19k retrofitting: 325
Pair Coverage by text_2props embeddings created for 19k retrofitting: 325
Pair Coverage by complex embeddings created for 19k retrofitting: 342
Pair Coverage by transe embeddings created for 19k retrofitting: 342
Pair Coverage by abstract embeddings created for 19k retrofitting: 343
Pair Coverage by abstract_first_sent embeddings created for 19k retrofitting: 343

In [17]:

for key1 in masterEmbedKeys:
    print(f"Pair Coverage by old {key1} embeddings created for 19k retrofitting: {sum([row[0] in masterEmbedDictMaster[key1] and row[1] in masterEmbedDictMaster[key1] for row in wordsim_pairs])}")

Pair Coverage by old text_7_props embeddings created for 19k retrofitting: 278
Pair Coverage by old text_2_props embeddings created for 19k retrofitting: 278
Pair Coverage by old complex embeddings created for 19k retrofitting: 278
Pair Coverage by old transe embeddings created for 19k retrofitting: 278
Pair Coverage by old abstract embeddings created for 19k retrofitting: 183
Pair Coverage by old abstract_first_sent embeddings created for 19k retrofitting: 183

In [10]:

wordSim353AnnotDF_New_set = set(wordSim353AnnotDF_New.word1_kg_id.to_list() + wordSim353AnnotDF_New.word2_kg_id.to_list())

In [11]:

from collections import defaultdict
masterEmbCorrections = defaultdict(list)
for node in wordSim353AnnotDF_New_set:
    for i in range(len(masterEmbedKeys)):
        if node not in masterEmbedDictMaster[masterEmbedKeys[i]] and node in wordSim353AnnotDF_New_set:
            masterEmbCorrections[masterEmbedKeys[i]].append(node)

In [21]:

masterEmbCorrections.keys()

Out[21]:

dict_keys(['abstract', 'abstract_first_sent', 'text_7_props', 'text_2_props', 'complex', 'transe'])

Complex, Transe¶

In [142]:

# import requests
# correctedComplexEmb = {}
# correctedTranseEmb = {}
# for wordID in masterEmbCorrections['complex']:
#     try:
#         resp = requests.get("http://ckg07:9200/wikidatadwd-augmented/_doc/"+wordID).json()['_source']
#         correctedComplexEmb[wordID] = [float(p) for p in resp['graph_embedding_complex'].split(',')]
#         correctedTranseEmb[wordID] = [float(p) for p in resp['graph_embeddings_transe'].split(',')]
#     except:
#         print("Failure returned for http://ckg07:9200/wikidatadwd-augmented/_doc/"+wordID)

In [15]:

%%time
correctedComplexEmb = {qnode: emb for emb, qnode in tqdm(zip(f['embeddings'], ent_names), total=len(ent_names)) if qnode in masterEmbCorrections['complex']}
correctedTranseEmb = {qnode: emb for emb, qnode in tqdm(zip(transf['embeddings'], ent_names), total=len(ent_names)) if qnode in masterEmbCorrections['complex']}

  0%|          | 0/42575933 [00:00<?, ?it/s]

  0%|          | 0/42575933 [00:00<?, ?it/s]

CPU times: user 1h 33min 17s, sys: 2min 38s, total: 1h 35min 56s
Wall time: 1h 35min 28s

In [16]:

len(list(correctedComplexEmb.items())[0][1])

Out[16]:

In [ ]:

len(masterEmbedDictMaster['complex'][list(masterEmbedDictMaster['complex'].keys())[0]])

In [17]:

for node, emb in correctedComplexEmb.items():
    masterEmbedDictMaster['complex'][node] = emb
for node, emb in correctedTranseEmb.items():
    masterEmbedDictMaster['transe'][node] = emb

In [ ]:

Text Embeddings Correction file generation¶

In [28]:

q1 = "kgtk filter -i ../data/wikidataos.for.text-embedding.tsv --word-separator '|' -p '" + '|'.join(masterEmbCorrections['text_7_props']) + ";;' -o ../data/Master_P279_dataset/output/wikidataos.for.text-embedding-ext-corrections.tsv -v True"
os.system("screen -dm " + q1)

Out[28]:

In [ ]:

q1 = "kgtk text-embedding -i ../data/Master_P279_dataset/output/wikidataos.for.text-embedding-ext-corrections.tsv   \
    --model roberta-large-nli-mean-tokens   \
    --property-labels-file ../data/labels.en.tsv  --debug   \
    --isa-properties P31 P279 P106 P39 P1382 P373 P452   \
    --save-embedding-sentence > ../data/Master_P279_dataset/output/P279-text-embedding-7-props-part-corrections.tsv"
#     print(q1)
os.system(q1 + " &amp;")

In [30]:

q1 = "kgtk text-embedding -i ../data/Master_P279_dataset/output/wikidataos.for.text-embedding-ext-corrections.tsv   \
    --model roberta-large-nli-mean-tokens   \
    --property-labels-file ../data/labels.en.tsv  --debug   \
    --isa-properties P31 P279   \
    --save-embedding-sentence > ../data/Master_P279_dataset/output/P279-text-embedding-2-props-part-corrections.tsv"
#     print(q1)
os.system(q1 + " &amp;")

Out[30]:

In [110]:

corrected7Emb = pd.read_csv('../data/Master_P279_dataset/output/P279-text-embedding-7-props-part-corrections.tsv', sep='\t')
corrected2Emb = pd.read_csv('../data/Master_P279_dataset/output/P279-text-embedding-2-props-part-corrections.tsv', sep='\t')

In [111]:

corrected7Emb = corrected7Emb[corrected7Emb.property == 'text_embedding']
corrected7Emb['value'] = corrected7Emb.value.apply(lambda p: [float(p1) for p1 in p.split(',')])

corrected2Emb = corrected2Emb[corrected2Emb.property == 'text_embedding']
corrected2Emb['value'] = corrected2Emb.value.apply(lambda p: [float(p1) for p1 in p.split(',')])

In [112]:

for _, row in corrected7Emb.iterrows():
    masterEmbedDictMaster['text_7_props'][row['node']] = row['value']
for _, row in corrected2Emb.iterrows():
    masterEmbedDictMaster['text_2_props'][row['node']] = row['value']

In [ ]:

Abstract¶

In [34]:

df1 = pd.read_csv("../data/short-abstracts_lang=en.csv", skiprows=1, skipfooter=1, header=None, engine='python')
df1.columns = ['ignore', 'node1', 'url', 'ignore2', 'abstract']
df1 = df1.set_index('node1')
df1[df1.url.apply(lambda p: 'http://dbpedia.org/resource' not in p)]
print(f"DBPedia dataset has {len(df1)} records with unique {df1.index.nunique()} index values")
sitelinksDF = pd.read_csv("../data/sitelinks.en.tsv.gz", sep='\t')
sitelinksDF['trimmedNode2'] = sitelinksDF.node2.apply(lambda p: p.split("/")[-1] if p.split("/")[-1] != '' else p.split("/")[-2])
sitelinksDF1 = sitelinksDF[sitelinksDF.label == 'wikipedia_sitelink']
sitelinksDF2 = sitelinksDF1.set_index('trimmedNode2')
print(f"There are {len(sitelinksDF2)} sitelinks present in the dataset corresponding to {sitelinksDF2.node1.nunique()} unique node1s (Qxxx), {sitelinksDF2.index.nunique()} unique labels (text)")
sitelinksDF2.loc[sitelinksDF2[sitelinksDF2.index.duplicated()].index]
masterEmbCorrections_abs_set = set(masterEmbCorrections['abstract'])
sitelinksDF2 = sitelinksDF2[sitelinksDF2.node1.apply(lambda p: p in masterEmbCorrections_abs_set)]

DBPedia dataset has 5732949 records with unique 5732947 index values
There are 8472828 sitelinks present in the dataset corresponding to 8472828 unique node1s (Qxxx), 8401883 unique labels (text)

In [49]:

labelsDF = pd.read_csv('../data/labels.en.tsv', sep='\t')
labelsDF = labelsDF[labelsDF.node1.apply(lambda p: p in masterEmbCorrections_abs_set)]
labelsDict = {row['node1']: row['node2'] for _, row in labelsDF.iterrows()}
descriptionsDF = pd.read_csv('../../wd-correctness/gdrive-kgtk-dump-2020-12-07/descriptions.en.tsv.gz', compression='gzip', sep='\t')
descriptionsDF = descriptionsDF[descriptionsDF.node1.apply(lambda p: p in masterEmbCorrections_abs_set)]
descDict = {row['node1']: row['node2'] for _, row in descriptionsDF.iterrows()}

In [88]:

sdf_set = set(sitelinksDF2.index.to_list())
df1 = df1[df1.index.map(lambda p: p in sdf_set)]
abstractsDF2 = sitelinksDF2.join(df1).reset_index()
print(f"From {len(abstractsDF2)} Qnodes, there are {abstractsDF2.ignore2.isna().sum()} sitelink Qnodes which do not have a short abstract i.e {len(abstractsDF2) - abstractsDF2.ignore2.isna().sum()} have a short abstract")
# abstractsDF2 = abstractsDF2[~abstractsDF2.abstract.isna()]

From 58 Qnodes, there are 16 sitelink Qnodes which do not have a short abstract i.e 42 have a short abstract

In [89]:

abstractsDF2['node1_label'] = abstractsDF2.node1.apply(lambda p: labelsDict[p][1:-4] if p in labelsDict else "")
abstractsDF2['node1_desc'] = abstractsDF2.node1.apply(lambda p: descDict[p][1:-4] if p in descDict else "")
def combineAbsLabDesc(row, parameter):
    if not(pd.isna(row[parameter])) and row[parameter] != 'nan' and row[parameter] != "":
        return row[parameter]
    elif row['node1_label'] == "" and row['node1_desc'] == "":
        return None
    else:
        return row['node1_label'] + ' ' + row['node1_desc']

In [90]:

from nltk.tokenize import sent_tokenize
abstractsDF2['abstract_firstSent'] = abstractsDF2.abstract.apply(lambda p: sent_tokenize(str(p))[0] if p else None)
abstractsDF2 = abstractsDF2.reset_index()

In [92]:

abstractsDF2['abstract'] = abstractsDF2.apply(combineAbsLabDesc, axis=1, args=('abstract',))
abstractsDF2['abstract_firstSent'] = abstractsDF2.apply(combineAbsLabDesc, axis=1, args=('abstract_firstSent',))

In [93]:

len(abstractsDF2)

Out[93]:

In [95]:

abstractsDF2 = abstractsDF2[~abstractsDF2.abstract.isna()]

In [96]:

len(abstractsDF2)

Out[96]:

In [97]:

abstractsDF2 = abstractsDF2.drop(columns=['index']).reset_index()

In [71]:

abstractsDF2.head()

Out[71]:

	level_0	index	trimmedNode2	id	node1	label	node2	ignore	url	ignore2	abstract	node1_label	node1_desc	abstract_firstSent
0	0	0	Luxuries	Q10953913-wikipedia_sitelink-538fe3-0	Q10953913	wikipedia_sitelink	http://en.wikipedia.org/wiki/Luxuries	NaN	NaN	NaN	luxuryBehavior, expenses or equipment that far...	luxury	Behavior, expenses or equipment that far excee...	nan
1	1	1	Potato	Q10998-wikipedia_sitelink-56b85c-0	Q10998	wikipedia_sitelink	http://en.wikipedia.org/wiki/Potato	10709.0	<http://dbpedia.org/resource/Potato>	<http://www.w3.org/2000/01/rdf-schema#comment>	The potato is a root vegetable native to the A...	potato	species of plant	The potato is a root vegetable native to the A...
2	2	2	Mars	Q111-wikipedia_sitelink-9ff296-0	Q111	wikipedia_sitelink	http://en.wikipedia.org/wiki/Mars	1803088.0	<http://dbpedia.org/resource/Mars>	<http://www.w3.org/2000/01/rdf-schema#comment>	Mars is the fourth planet from the Sun and the...	Mars	fourth planet from the Sun	Mars is the fourth planet from the Sun and the...
3	3	3	Dawn	Q11326182-wikipedia_sitelink-ae2918-0	Q11326182	wikipedia_sitelink	http://en.wikipedia.org/wiki/Dawn	97544.0	<http://dbpedia.org/resource/Dawn>	<http://www.w3.org/2000/01/rdf-schema#comment>	Dawn is the time that marks the beginning of t...	dawn	time that marks the beginning of the twilight ...	Dawn is the time that marks the beginning of t...
4	4	4	Change_(philosophy)	Q1150070-wikipedia_sitelink-81cf5f-0	Q1150070	wikipedia_sitelink	http://en.wikipedia.org/wiki/Change_(philosophy)	NaN	NaN	NaN	changeprocess, event or action that deviates f...	change	process, event or action that deviates from th...	nan

In [117]:

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from time import time
import pandas as pd

def getSentEmbeddings(valSeries, modelName):
    model = SentenceTransformer(modelName)
    start = time()
    encodings = model.encode(valSeries.to_list())
    print(time()-start,'s')
    return encodings

def getIndSentEmbeddings(sent, modelName):
    model = SentenceTransformer(modelName)
    start = time()
    encodings = model.encode([sent])
    print(time()-start,'s')
    return encodings

In [102]:

abstractsDF2['abs_emb'] = pd.Series(list(getSentEmbeddings(abstractsDF2.abstract, 'bert-base-nli-mean-tokens')))
abstractsDF2['abs_firstSent_emb'] = pd.Series(list(getSentEmbeddings(abstractsDF2.abstract_firstSent, 'bert-base-nli-mean-tokens')))

0.6419482231140137 s
0.5260367393493652 s

In [104]:

for _, row in abstractsDF2.iterrows():
    masterEmbedDictMaster['abstract'][row['node1']] = row['abs_emb']
    masterEmbedDictMaster['abstract_first_sent'][row['node1']] = row['abs_firstSent_emb']

In [124]:

for node in masterEmbCorrections_abs_set:
    if node not in masterEmbedDictMaster['abstract']:
        if node in labelsDict and node in descDict:
            masterEmbedDictMaster['abstract'][node] = getIndSentEmbeddings(labelsDict[node] + ' ' + descDict[node], 'bert-base-nli-mean-tokens')[0]
        elif node in labelsDict:
            masterEmbedDictMaster['abstract'][node] = getIndSentEmbeddings(labelsDict[node], 'bert-base-nli-mean-tokens')[0]

0.37706875801086426 s
0.3001420497894287 s
0.370746374130249 s
0.6896324157714844 s
0.33779358863830566 s
0.3965473175048828 s
0.3200962543487549 s
0.3489806652069092 s
0.3413431644439697 s
0.32114505767822266 s
0.3811838626861572 s
0.34630370140075684 s
0.37790727615356445 s
0.26860570907592773 s
0.3601953983306885 s
0.3713240623474121 s
0.34137582778930664 s
0.33736181259155273 s
0.37023448944091797 s
0.31382036209106445 s
0.35136938095092773 s
0.37309718132019043 s
0.33543896675109863 s
0.38199710845947266 s
0.3740067481994629 s
0.3278031349182129 s
0.32283997535705566 s
0.34000563621520996 s
0.31502628326416016 s
0.34996771812438965 s
0.3871273994445801 s
0.3487060070037842 s
0.35172486305236816 s
0.3280646800994873 s
0.3519773483276367 s
0.3354451656341553 s
0.3633551597595215 s
0.3226644992828369 s
0.33882975578308105 s
0.36072254180908203 s
0.3833494186401367 s
0.2929043769836426 s
0.32875680923461914 s
0.36334872245788574 s
0.34148168563842773 s
0.3569769859313965 s
0.37468576431274414 s
0.399524450302124 s
0.3516504764556885 s
0.333402156829834 s
0.3851203918457031 s
0.34867238998413086 s
0.3607771396636963 s
0.38669753074645996 s
0.33347272872924805 s
0.36278390884399414 s
0.3602781295776367 s
0.3322322368621826 s
0.36807823181152344 s
0.3407411575317383 s
0.3837134838104248 s
0.38958096504211426 s
0.3332521915435791 s
0.3331124782562256 s
0.35001134872436523 s
0.32433485984802246 s
0.36315059661865234 s
0.34323906898498535 s
0.3112339973449707 s
0.30588483810424805 s
0.30704236030578613 s
0.31201720237731934 s

In [125]:

for node in masterEmbCorrections_abs_set:
    if node not in masterEmbedDictMaster['abstract_first_sent']:
        if node in labelsDict and node in descDict:
            masterEmbedDictMaster['abstract_first_sent'][node] = getIndSentEmbeddings(labelsDict[node] + ' ' + descDict[node], 'bert-base-nli-mean-tokens')[0]
        elif node in labelsDict:
            masterEmbedDictMaster['abstract_first_sent'][node] = getIndSentEmbeddings(labelsDict[node], 'bert-base-nli-mean-tokens')[0]

0.32213783264160156 s
0.357776403427124 s
0.37949395179748535 s
0.35210466384887695 s
0.28103041648864746 s
0.3626406192779541 s
0.35109710693359375 s
0.34203338623046875 s
0.32386112213134766 s
0.3354361057281494 s
0.3063056468963623 s
0.3441202640533447 s
0.32869935035705566 s
0.42442989349365234 s
0.37239527702331543 s
0.38650059700012207 s
0.3191685676574707 s
0.3609733581542969 s
0.3115823268890381 s
0.36015963554382324 s
0.3338603973388672 s
0.3487727642059326 s
0.3250617980957031 s
0.35145044326782227 s
0.33944034576416016 s
0.31502413749694824 s
0.3611795902252197 s
0.35285043716430664 s
0.3575010299682617 s
0.304781436920166 s
0.4003562927246094 s
0.3315858840942383 s
0.36008763313293457 s
0.36187100410461426 s
0.32981252670288086 s
0.3378865718841553 s
0.31662964820861816 s
0.32143092155456543 s
0.3152732849121094 s
0.38222813606262207 s
0.3846759796142578 s
0.33153700828552246 s
0.37013936042785645 s
0.33272790908813477 s
0.29526567459106445 s
0.3218040466308594 s
0.3795340061187744 s
0.3576061725616455 s
0.35764193534851074 s
0.36867713928222656 s
0.3807237148284912 s
0.33266758918762207 s
0.33878159523010254 s
0.38289546966552734 s
0.38695788383483887 s
0.33074188232421875 s
0.32749414443969727 s
0.33860039710998535 s
0.36585235595703125 s
0.33011841773986816 s
0.3293156623840332 s
0.3491702079772949 s
0.3720529079437256 s
0.3078622817993164 s
0.3844125270843506 s
0.32468104362487793 s
0.3186354637145996 s
0.3438723087310791 s
0.36643028259277344 s
0.34279680252075195 s
0.3625810146331787 s
0.35865354537963867 s
0.3503103256225586 s
0.37160682678222656 s
0.3268110752105713 s
0.2564544677734375 s
0.37343525886535645 s
0.33298277854919434 s

Updated coverage details¶

In [145]:

for key1 in masterEmbedKeys:
    print(f"Pair Coverage by new {key1} embeddings created for 19k retrofitting: {sum([row[0] in masterEmbedDictMaster[key1] and row[1] in masterEmbedDictMaster[key1] for row in wordsim_pairs])}")

Pair Coverage by new text_7_props embeddings created for 19k retrofitting: 325
Pair Coverage by new text_2_props embeddings created for 19k retrofitting: 325
Pair Coverage by new complex embeddings created for 19k retrofitting: 343
Pair Coverage by new transe embeddings created for 19k retrofitting: 343
Pair Coverage by new abstract embeddings created for 19k retrofitting: 339
Pair Coverage by new abstract_first_sent embeddings created for 19k retrofitting: 339

In [19]:

for key1 in masterEmbedDictMaster.keys():
    for key2 in masterEmbedDictMaster[key1].keys():
        if type(masterEmbedDictMaster[key1][key2]) != list:
            masterEmbedDictMaster[key1][key2] = masterEmbedDictMaster[key1][key2].tolist()

In [20]:

for key1 in ['complex', 'transe']:
    json.dump(masterEmbedDictMaster[key1], open('../data/Master_P279_dataset/embeddings/'+key1+'_orig_embedding_dict_updated.json', 'w'))

In [134]:

def countOverlap(source, target):
    cnt = 0
    for key1 in source:
        if key1 in target:
            cnt += 1
    return cnt
p279QnodesList = set(json.load(open('../data/Master_P279_dataset/P279ChildPar_transP279_filtered_nodes.json')))

In [147]:

summArr = []
for key1 in masterEmbedDictMaster:
    cnt = countOverlap(masterEmbedDictMaster[key1], p279QnodesList)
    summArr.append([key1, len(masterEmbedDictMaster[key1]), cnt, cnt / len(p279QnodesList) * 100])
pd.DataFrame(summArr, columns=['embedding', 'total count', 'overlap count', 'Coverage Percentage'])

Out[147]:

	embedding	total count	overlap count	Coverage Percentage
0	text_7_props	238930	238889	100.000000
1	text_2_props	238930	238889	100.000000
2	complex	238500	238448	99.815395
3	transe	238500	238448	99.815395
4	abstract	105964	105916	44.336910
5	abstract_first_sent	105964	105916	44.336910

Concatenated Embeddings¶

In [9]:

import json
embedDictMaster = {}
for key1 in ['text_7_props', 'text_2_props', 'complex', 'transe', 'abstract', 'abstract_first_sent']:
    embedDictMaster[key1] = json.load(open('../data/Master_P279_dataset/embeddings/'+key1+'_orig_embedding_dict_updated.json'))
    

In [13]:

def determineEmbeddingLengths(embedDictMaster):
    for key in embedDictMaster.keys():
        embed_size = len(next(iter(embedDictMaster[key].values())))
        print(key,": ",embed_size)
determineEmbeddingLengths(embedDictMaster)
        

text_7_props :  1024
text_2_props :  1024
complex :  200
transe :  200
abstract :  768
abstract_first_sent :  768

In [6]:

for key1 in embedDictMaster.keys():
    embedDictMaster[key1] = deserializeEmbeddingDict(embedDictMaster[key1])
# Fill Coverage of embedding dictionaries
for key1 in embedDictMaster.keys():
    embedDictMaster[key1] = fillCoverage(embedDictMaster[key1])

Added 11 corrections
Added 11 corrections
Added 0 corrections
Added 0 corrections
Added 4 corrections
Added 4 corrections

In [7]:

for key1 in embedDictMaster.keys():
    print(key1, len(next(iter(embedDictMaster.values()))))

text_7_props 238941
text_2_props 238941
complex 238941
transe 238941
abstract 238941
abstract_first_sent 238941

In [ ]:

Retrofitting sample¶

In [8]:

def fetchNeighbours(df):
    neighboursDict = {}
    for _, row in df.iterrows():
        if row.node1 not in neighboursDict:
            neighboursDict[row.node1] = []
        neighboursDict[row.node1].append((row.node2, row.bert2SentSim))
        
        if row.node2 not in neighboursDict:
            neighboursDict[row.node2] = []
        neighboursDict[row.node2].append((row.node1, row.bert2SentSim))
    print(max([len(neigh) for neigh in neighboursDict.values()]))
    return neighboursDict

In [9]:

def retrofit(embedDict, neighDict, weightCase, weightAssignment=False):
    newEmbedDict = {}
    for word in embedDict.keys():
        if word in neighDict:
            neighbs = neighDict[word]
            neighbs = list(filter(lambda p: p[0] in embedDict, neighbs))
            if len(neighbs) == 0:
                newEmbedDict[word] = embedDict[word]
                continue
#             assert len(neighbs) == 1
            if weightAssignment:
                sumOfSims = sum([neighb[1] for neighb in neighbs])
                sumOfEmbs = sum([embedDict[neighb[0]] * float(neighb[1]) for neighb in neighbs])
            else:
                sumOfSims = sum([1 for neighb in neighbs])
                sumOfEmbs = sum([embedDict[neighb[0]] for neighb in neighbs])
                
            if weightCase == 1:
                newEmbedDict[word] = (embedDict[word] * (len(neighbs)) + sumOfEmbs) / ((len(neighbs)) + sumOfSims)
            elif weightCase == 2:
                newEmbedDict[word] = (embedDict[word] * (len(neighbs))**2 + sumOfEmbs) / ((len(neighbs))**2 + sumOfSims)
            elif weightCase == 0.5:
                newEmbedDict[word] = (embedDict[word] * (len(neighbs))**0.5 + sumOfEmbs) / ((len(neighbs))**0.5 + sumOfSims)
            else:
                raise
        else:
            newEmbedDict[word] = embedDict[word]
    return newEmbedDict

In [11]:

from sklearn.metrics import classification_report
def labelSamples(score):
    return 'I' if score <= 1.75 else 'U' if score >= 3.5 else 'M'
LABELS = ['I','U','M']
def fetchCorrelationResults(embedDict, newEmbedDict):
    wordSim353AnnotDF_New = pd.read_csv('../data/wordsim353_with_r3.csv')
#     print(f"Length of wordsim dataset: {len(wordSim353AnnotDF_New)}")
    assert wordSim353AnnotDF_New.word1_kg_id.isna().sum() == 0
    assert wordSim353AnnotDF_New.word2_kg_id.isna().sum() == 0
    wordSim353AnnotDF_New['category'] = wordSim353AnnotDF_New.Avg.apply(labelSamples)
#     wordSim353AnnotDF_New2 = wordSim353AnnotDF_New
    wordSim353AnnotDF_New2 = wordSim353AnnotDF_New[wordSim353AnnotDF_New.apply(lambda p: p['word1_kg_id'] in embedDict and p['word2_kg_id'] in embedDict, axis=1)]
    wordSimMissingSet = set(wordSim353AnnotDF_New[wordSim353AnnotDF_New.word1_kg_id.apply(lambda p: p not in embedDict)].word1_kg_id.to_list() + wordSim353AnnotDF_New[wordSim353AnnotDF_New.word2_kg_id.apply(lambda p: p not in embedDict)].word2_kg_id.to_list())
    responseDict = {}
    responseDict['wordSimMissingSet'] = wordSimMissingSet
    responseDict['coveredPairs'] = len(wordSim353AnnotDF_New2)
    responseDict['totalPairs'] = len(wordSim353AnnotDF_New)
    
#     wordSimMissingSet
#     print(f"No. of pairs with some value for embeddings: {len(wordSim353AnnotDF_New2)}")
    wordSim353AnnotDF_New['textOld'] = wordSim353AnnotDF_New.apply(lambda p: cosine_similarity(np.array(embedDict[p['word1_kg_id']]).reshape(1,-1), np.array(embedDict[p['word2_kg_id']]).reshape(1,-1))[0][0] if p['word1_kg_id'] in embedDict and p['word2_kg_id'] in embedDict else -1, axis=1)
    wordSim353AnnotDF_New['textNew'] = wordSim353AnnotDF_New.apply(lambda p: cosine_similarity(np.array(newEmbedDict[p['word1_kg_id']]).reshape(1,-1), np.array(newEmbedDict[p['word2_kg_id']]).reshape(1,-1))[0][0] if p['word1_kg_id'] in embedDict and p['word2_kg_id'] in embedDict else -1, axis=1)
    wordSim353AnnotDF_New.loc[wordSim353AnnotDF_New['textOld'] == -1, 'textOld'] = wordSim353AnnotDF_New[wordSim353AnnotDF_New['textOld'] != -1]['textOld'].mean()
    wordSim353AnnotDF_New.loc[wordSim353AnnotDF_New['textNew'] == -1, 'textNew'] = wordSim353AnnotDF_New[wordSim353AnnotDF_New['textNew'] != -1]['textNew'].mean()
    
    # Logic 1: Scale min,max value to 1,4 strictly
#     min1, max1 = wordSim353AnnotDF_New['textOld'].min(), wordSim353AnnotDF_New['textOld'].max()
#     min2, max2 = wordSim353AnnotDF_New['textNew'].min(), wordSim353AnnotDF_New['textNew'].max()
#     wordSim353AnnotDF_New['textOld'] = wordSim353AnnotDF_New.textOld.apply(lambda p: 4 - 3 * (p - min1) / (max1 - min1))
#     wordSim353AnnotDF_New['textNew'] = wordSim353AnnotDF_New.textNew.apply(lambda p: 4 - 3 * (p - min2) / (max2 - min2))
    
    # Logic 2: Scale abs value to 1,4 strictly
    wordSim353AnnotDF_New['textOld'] = wordSim353AnnotDF_New.textOld.apply(lambda p: 4 - 3 * abs(p))
    wordSim353AnnotDF_New['textNew'] = wordSim353AnnotDF_New.textNew.apply(lambda p: 4 - 3 * abs(p))

    
#     print(f"KT Corr of old emb with Annotated Avg: {stats.kendalltau(wordSim353AnnotDF_New2['textOld'], wordSim353AnnotDF_New2['Avg'])}")
#     print(f"KT Corr of new emb with Annotated Avg: {stats.kendalltau(wordSim353AnnotDF_New2['textNew'], wordSim353AnnotDF_New2['Avg'])}")
#     print(f"KT Corr of old emb with Human Avg Reversed: {stats.kendalltau(wordSim353AnnotDF_New2['textOld'], wordSim353AnnotDF_New2['H_reversed'])}")
#     print(f"KT Corr of new emb with Human Avg Reversed: {stats.kendalltau(wordSim353AnnotDF_New2['textNew'], wordSim353AnnotDF_New2['H_reversed'])}")
    
#     print(f"Classification Accuracy of old embeddings categories vs annotated averages categories: {accuracy_score(wordSim353AnnotDF_New2['textOld'].apply(labelSamples), wordSim353AnnotDF_New2['category'])}")
#     print(f"Classification Accuracy of new embeddings categories vs annotated averages categories: {accuracy_score(wordSim353AnnotDF_New2['textNew'].apply(labelSamples), wordSim353AnnotDF_New2['category'])}")
    responseDict['KT_old_vs_Avg'] = stats.kendalltau(wordSim353AnnotDF_New['textOld'], wordSim353AnnotDF_New['Avg'])
    responseDict['KT_new_vs_Avg'] = stats.kendalltau(wordSim353AnnotDF_New['textNew'], wordSim353AnnotDF_New['Avg'])
    responseDict['KT_old_vs_Human'] = stats.kendalltau(wordSim353AnnotDF_New['textOld'], wordSim353AnnotDF_New['H_reversed'])
    responseDict['KT_new_vs_Human'] = stats.kendalltau(wordSim353AnnotDF_New['textNew'], wordSim353AnnotDF_New['H_reversed'])
    responseDict['old_acc'] = accuracy_score(wordSim353AnnotDF_New['textOld'].apply(labelSamples), wordSim353AnnotDF_New['category'])
    responseDict['new_acc'] = accuracy_score(wordSim353AnnotDF_New['textNew'].apply(labelSamples), wordSim353AnnotDF_New['category'])
    
    responseDict['class_rep_old'] = classification_report(wordSim353AnnotDF_New['category'], wordSim353AnnotDF_New['textOld'].apply(labelSamples), output_dict=True)
    responseDict['class_rep_new'] = classification_report(wordSim353AnnotDF_New['category'], wordSim353AnnotDF_New['textNew'].apply(labelSamples), output_dict=True)
    
    cm_old = confusion_matrix(wordSim353AnnotDF_New['category'], wordSim353AnnotDF_New['textOld'].apply(labelSamples), labels=LABELS)
    cm_new = confusion_matrix(wordSim353AnnotDF_New['category'], wordSim353AnnotDF_New['textNew'].apply(labelSamples), labels=LABELS)
    
    responseDict['cm_old'] = cm_old
    responseDict['cm_new'] = cm_new
    
    return responseDict

In [12]:

neighDictMaster, embedDictMaster = {}, {}

In [13]:

neighDictMaster['19k_childPar'] = fetchNeighbours(p279ChildPar)

In [14]:

embedDictMaster['complex'] = complexEmb
embedDictMaster['transe'] = transeEmb

In [17]:

for key1 in embedDictMaster.keys():
    for key2 in embedDictMaster[key1].keys():
        embedDictMaster[key1][key2] = np.array(embedDictMaster[key1][key2])

In [18]:

embList = list(embedDictMaster.keys())

In [19]:

basisList = list(neighDictMaster.keys())

In [20]:

neighDictMaster.keys()

Out[20]:

dict_keys(['19k_childPar'])

In [21]:

newEmbedDictMaster, responsesDictMaster = {}, {}

In [25]:

import numpy as np
results = []
NUM_ITERS = 10
for basis in tqdm(basisList):
    for emb in embList:
        for weightedNess in [True]:
            groupResults = []
            for weightCase in [1,2]:
                embedDict = embedDictMaster[emb]
                if weightedNess:
                    caseName = emb + '_' + basis + '_' + str(weightCase) + '_weighted'
                else:
                    caseName = emb + '_' + basis + '_' + str(weightCase) + '_unweighted'
                for iterNum in range(1,NUM_ITERS+1):
                    newEmbedDict = retrofit(embedDict, neighDictMaster[basis], weightCase, weightedNess)
        #             dists = determineDistances(embedDict, newEmbedDict)
                    responsesDict = fetchCorrelationResults(embedDict, newEmbedDict)
        #                 print(responsesDict.keys())
                    groupResults.append([emb, basis, weightCase, weightedNess, iterNum, \
                                    responsesDict['old_acc']*100, \
                                    responsesDict['new_acc']*100, \
                                    (responsesDict['new_acc'] - responsesDict['old_acc'])*100, \
                                    responsesDict['coveredPairs'], \
                                     responsesDict['class_rep_old']['I']['precision'], \
                                     responsesDict['class_rep_old']['I']['recall'], \
                                     responsesDict['class_rep_old']['I']['f1-score'], \
                                     responsesDict['class_rep_old']['U']['precision'], \
                                     responsesDict['class_rep_old']['U']['recall'], \
                                     responsesDict['class_rep_old']['U']['f1-score'], \
                                     responsesDict['class_rep_new']['I']['precision'], \
                                     responsesDict['class_rep_new']['I']['recall'], \
                                     responsesDict['class_rep_new']['I']['f1-score'], \
                                     responsesDict['class_rep_new']['U']['precision'], \
                                     responsesDict['class_rep_new']['U']['recall'], \
                                     responsesDict['class_rep_new']['U']['f1-score'], \
                                    ])
                    embedDict = newEmbedDict

                newEmbedDictMaster[caseName] = newEmbedDict
                responsesDictMaster[caseName] = responsesDict
            for gR, rank in zip(groupResults, np.argsort([-p[6] for p in groupResults])):
                results.append(gR+[rank])
        

  0%|          | 0/1 [00:00<?, ?it/s]

In [26]:

resultsDF = pd.DataFrame(results, columns=['Embedding', 'Basis', 'Weight', 'Weightedness', 'Iteration Num', 'Old Acc', 'New Acc', 'Increase', 'Pairs Covered', \
                                           'Old I Precision', 'Old I Recall', 'Old I F1-Score', \
                                           'Old U Precision', 'Old U Recall', 'Old U F1-Score', \
                                           'New I Precision', 'New I Recall', 'New I F1-Score', \
                                           'New U Precision', 'New U Recall', 'New U F1-Score', \
                                           'Rank'])

In [27]:

resultsDF.sort_values(by=['Increase'], ascending=False)

Out[27]:

	Embedding	Basis	Weight	Weightedness	Iteration Num	Old Acc	New Acc	Increase	Pairs Covered	Old I Precision	...	Old U Precision	Old U Recall	Old U F1-Score	New I Precision	New I Recall	New I F1-Score	New U Precision	New U Recall	New U F1-Score	Rank
0	complex	19k_childPar	1	True	1	60.755814	64.244186	3.488372	291	1.000000	...	0.433121	0.660194	0.523077	1.000000	0.40	0.571429	0.463415	0.553398	0.504425	2
20	transe	19k_childPar	1	True	1	62.500000	65.697674	3.197674	291	0.888889	...	0.397059	0.262136	0.315789	0.833333	0.50	0.625000	0.450000	0.174757	0.251748	0
1	complex	19k_childPar	1	True	2	64.244186	67.151163	2.906977	291	1.000000	...	0.463415	0.553398	0.504425	1.000000	0.50	0.666667	0.495050	0.485437	0.490196	4
11	complex	19k_childPar	2	True	2	61.918605	63.662791	1.744186	291	1.000000	...	0.444444	0.660194	0.531250	1.000000	0.45	0.620690	0.458904	0.650485	0.538153	13
10	complex	19k_childPar	2	True	1	60.755814	61.918605	1.162791	291	1.000000	...	0.433121	0.660194	0.523077	1.000000	0.40	0.571429	0.444444	0.660194	0.531250	12
2	complex	19k_childPar	1	True	3	67.151163	67.732558	0.581395	291	1.000000	...	0.495050	0.485437	0.490196	0.909091	0.50	0.645161	0.511905	0.417476	0.459893	1
4	complex	19k_childPar	1	True	5	67.151163	67.732558	0.581395	291	0.916667	...	0.492958	0.339806	0.402299	0.916667	0.55	0.687500	0.507937	0.310680	0.385542	5
36	transe	19k_childPar	2	True	7	62.500000	63.081395	0.581395	291	0.846154	...	0.351852	0.184466	0.242038	0.846154	0.55	0.666667	0.365385	0.184466	0.245161	6
30	transe	19k_childPar	2	True	1	62.500000	63.081395	0.581395	291	0.888889	...	0.397059	0.262136	0.315789	0.900000	0.45	0.600000	0.400000	0.252427	0.309524	17
22	transe	19k_childPar	1	True	3	64.825581	65.406977	0.581395	291	0.750000	...	0.393939	0.126214	0.191176	0.750000	0.75	0.750000	0.400000	0.116505	0.180451	3
38	transe	19k_childPar	2	True	9	63.081395	63.372093	0.290698	291	0.785714	...	0.372549	0.184466	0.246753	0.785714	0.55	0.647059	0.380000	0.184466	0.248366	8
33	transe	19k_childPar	2	True	4	62.790698	63.081395	0.290698	291	0.900000	...	0.383333	0.223301	0.282209	0.846154	0.55	0.666667	0.375000	0.203883	0.264151	14
18	complex	19k_childPar	2	True	9	63.081395	63.081395	0.000000	291	0.909091	...	0.444444	0.543689	0.489083	0.909091	0.50	0.645161	0.444444	0.543689	0.489083	19
37	transe	19k_childPar	2	True	8	63.081395	63.081395	0.000000	291	0.846154	...	0.365385	0.184466	0.245161	0.785714	0.55	0.647059	0.372549	0.184466	0.246753	7
32	transe	19k_childPar	2	True	3	62.790698	62.790698	0.000000	291	0.900000	...	0.387097	0.233010	0.290909	0.900000	0.45	0.600000	0.383333	0.223301	0.282209	12
23	transe	19k_childPar	1	True	4	65.406977	65.406977	0.000000	291	0.750000	...	0.400000	0.116505	0.180451	0.652174	0.75	0.697674	0.444444	0.116505	0.184615	1
39	transe	19k_childPar	2	True	10	63.372093	63.372093	0.000000	291	0.785714	...	0.380000	0.184466	0.248366	0.785714	0.55	0.647059	0.380000	0.184466	0.248366	9
15	complex	19k_childPar	2	True	6	63.662791	63.662791	0.000000	291	1.000000	...	0.455224	0.592233	0.514768	1.000000	0.45	0.620690	0.453846	0.572816	0.506438	17
14	complex	19k_childPar	2	True	5	63.662791	63.662791	0.000000	291	1.000000	...	0.457143	0.621359	0.526749	1.000000	0.45	0.620690	0.455224	0.592233	0.514768	16
13	complex	19k_childPar	2	True	4	63.662791	63.662791	0.000000	291	1.000000	...	0.457746	0.631068	0.530612	1.000000	0.45	0.620690	0.457143	0.621359	0.526749	15
12	complex	19k_childPar	2	True	3	63.662791	63.662791	0.000000	291	1.000000	...	0.458904	0.650485	0.538153	1.000000	0.45	0.620690	0.457746	0.631068	0.530612	14
17	complex	19k_childPar	2	True	8	63.372093	63.081395	-0.290698	291	0.909091	...	0.448819	0.553398	0.495652	0.909091	0.50	0.645161	0.444444	0.543689	0.489083	18
16	complex	19k_childPar	2	True	7	63.662791	63.372093	-0.290698	291	1.000000	...	0.453846	0.572816	0.506438	0.909091	0.50	0.645161	0.448819	0.553398	0.495652	9
35	transe	19k_childPar	2	True	6	62.790698	62.500000	-0.290698	291	0.846154	...	0.363636	0.194175	0.253165	0.846154	0.55	0.666667	0.351852	0.184466	0.242038	5
34	transe	19k_childPar	2	True	5	63.081395	62.790698	-0.290698	291	0.846154	...	0.375000	0.203883	0.264151	0.846154	0.55	0.666667	0.363636	0.194175	0.253165	15
31	transe	19k_childPar	2	True	2	63.081395	62.790698	-0.290698	291	0.900000	...	0.400000	0.252427	0.309524	0.900000	0.45	0.600000	0.387097	0.233010	0.290909	11
28	transe	19k_childPar	1	True	9	60.755814	60.465116	-0.290698	291	0.394737	...	0.368421	0.067961	0.114754	0.365854	0.75	0.491803	0.388889	0.067961	0.115702	13
19	complex	19k_childPar	2	True	10	63.081395	62.790698	-0.290698	291	0.909091	...	0.444444	0.543689	0.489083	0.900000	0.45	0.600000	0.444444	0.543689	0.489083	10
26	transe	19k_childPar	1	True	7	62.209302	61.627907	-0.581395	291	0.428571	...	0.428571	0.087379	0.145161	0.416667	0.75	0.535714	0.400000	0.077670	0.130081	4
29	transe	19k_childPar	1	True	10	60.465116	59.883721	-0.581395	291	0.365854	...	0.388889	0.067961	0.115702	0.333333	0.75	0.461538	0.411765	0.067961	0.116667	16
8	complex	19k_childPar	1	True	9	65.116279	64.534884	-0.581395	291	0.785714	...	0.442308	0.223301	0.296774	0.785714	0.55	0.647059	0.416667	0.194175	0.264901	0
3	complex	19k_childPar	1	True	4	67.732558	67.151163	-0.581395	291	0.909091	...	0.511905	0.417476	0.459893	0.916667	0.55	0.687500	0.492958	0.339806	0.402299	3
6	complex	19k_childPar	1	True	7	66.569767	65.988372	-0.581395	291	0.916667	...	0.475410	0.281553	0.353659	0.846154	0.55	0.666667	0.464286	0.252427	0.327044	7
7	complex	19k_childPar	1	True	8	65.988372	65.116279	-0.872093	291	0.846154	...	0.464286	0.252427	0.327044	0.785714	0.55	0.647059	0.442308	0.223301	0.296774	8
21	transe	19k_childPar	1	True	2	65.697674	64.825581	-0.872093	291	0.833333	...	0.450000	0.174757	0.251748	0.750000	0.60	0.666667	0.393939	0.126214	0.191176	2
27	transe	19k_childPar	1	True	8	61.627907	60.755814	-0.872093	291	0.416667	...	0.400000	0.077670	0.130081	0.394737	0.75	0.517241	0.368421	0.067961	0.114754	10
25	transe	19k_childPar	1	True	6	63.372093	62.209302	-1.162791	291	0.468750	...	0.454545	0.097087	0.160000	0.428571	0.75	0.545455	0.428571	0.087379	0.145161	19
5	complex	19k_childPar	1	True	6	67.732558	66.569767	-1.162791	291	0.916667	...	0.507937	0.310680	0.385542	0.916667	0.55	0.687500	0.475410	0.281553	0.353659	6
9	complex	19k_childPar	1	True	10	64.534884	63.081395	-1.453488	291	0.785714	...	0.416667	0.194175	0.264901	0.687500	0.55	0.611111	0.377778	0.165049	0.229730	11
24	transe	19k_childPar	1	True	5	65.406977	63.372093	-2.034884	291	0.652174	...	0.444444	0.116505	0.184615	0.468750	0.75	0.576923	0.454545	0.097087	0.160000	18

40 rows × 22 columns

In [28]:

resultsDF.to_csv('../data/retrofitting/masterRetro_Aug20_2021.csv', index=False)

In [ ]: