In this notebook, we make a collection of all the embeddings which we use to do retrofitting. These embeddings are then evaluated for their similarity based on the evaluation benchmark datasets.
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import cosine_similarity
import scipy.stats as stats
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from itertools import combinations
from math import comb
from sklearn.ensemble import RandomForestClassifier
import os
import h5py
import json
import gzip
# DWD V2 files
# https://drive.google.com/drive/u/3/folders/1OIZegxxrs_Hv2ZhDsSO-zLVARCR60P01
# SITELINKS_FILE = "../source_dataset_files/wikidata-20210215-dwd-v2/sitelinks.en.tsv.gz"
CLAIMS_FILE = "../source_dataset_files/wikidata-20210215-dwd-v2/claims.tsv.gz"
LABELS_FILE = "../source_dataset_files/wikidata-20210215-dwd-v2/labels.en.tsv.gz"
DESCRIPTIONS_FILE = "../source_dataset_files/wikidata-20210215-dwd-v2/descriptions.en.tsv.gz"
# wikidata-20210215 files
# https://drive.google.com/drive/u/3/folders/1NGtob1BFQ03sXf4yQyYvP13ly3u1Ul5u
# SITELINKS_FILE_V1 = "../source_dataset_files/wikidata-20210215/sitelinks.en.tsv.gz"
# wikidata-20201208 files
# https://drive.google.com/drive/u/3/folders/1qbbgjo7pddMdDvQzOSeSaL6lYwj_f5gi
SITELINKS_FILE_V2 = "../source_dataset_files/wikidata-20201208/sitelinks.en.tsv.gz"
# Embedding Related Files
DBPEDIA_SHORT_ABSTRACTS_TTL = "../data/evaluation/source_files/short-abstracts_lang=en.ttl"
DBPEDIA_SHORT_ABSTRACTS_CSV = "../data/evaluation/source_files/short-abstracts_lang=en.csv"
ABSTRACTS_INTERMEDIATE_FILE = "../data/embeddings/intermediate_files/abstracts.csv"
COMPLEX_EMB_SOURCE_FILE = "../source_dataset_files/wikidata-20210215-dwd-v2/wikidatadwd.complEx.graph-embeddings.txt"
TRANSE_EMB_SOURCE_FILE = "../source_dataset_files/wikidata-20210215-dwd-v2/wikidatadwd.transE.graph-embeddings.txt"
TEXT_EMB_SOURCE_FILE = "../source_dataset_files/wikidata-20210215-dwd-v2/text-embeddings-concatenated.tsv.gz"
COMPLEX_EMB_FINAL_FILE = "../data/embeddings/complex_orig_embedding_dict.json"
TRANSE_EMB_FINAL_FILE = "../data/embeddings/transe_orig_embedding_dict.json"
TEXT_EMB_FINAL_FILE = "../data/embeddings/text_7_props_orig_embedding_dict.json"
ABS_EMB_FINAL_FILE = "../data/embeddings/abstract_orig_embedding_dict.json"
ABS_FIRST_SENT_EMB_FINAL_FILE = "../data/embeddings/abstract_first_sent_orig_embedding_dict.json"
LABELS_EMB_FINAL_FILE = "../data/embeddings/labels_orig_embedding_dict.json"
LABELS_DESC_EMB_FINAL_FILE = "../data/embeddings/labels_n_desc_orig_embedding_dict.json"
# HAS Embedding Related Files
A_SOURCE_FILE = "../source_dataset_files/A_walks_analysis/a_embeddings_10x10,min_count=0.kv"
A_OP_FILE = "../data/embeddings/has_a_orig_embedding_dict.json"
H_SOURCE_FILE = "../source_dataset_files/H_walks_analysis/h_embeddings_5x8,min_count=21.kv"
H_OP_FILE = "../data/embeddings/has_h_orig_embedding_dict.json"
S_SOURCE_FILE = "../source_dataset_files/S_walks_analysis/s_embeddings_5x10,min_count=0.kv"
S_OP_FILE = "../data/embeddings/has_s_orig_embedding_dict.json"
WORDSIM_CLASS_SIM_FILE = '../data/embeddings/wordsim_class_sim.csv'
WORDSIM_JC_SIM_FILE = '../data/embeddings/wordsim_jc_sim.csv'
WORDSIM_TOP_SIM_FILE = '../data/embeddings/wordsim_top_sim.csv'
WORDSIM_OLD_CLASS_SIM_FILE = '../data/embeddings/wordsim_old_class_sim.csv'
WORDSIM_OLD_JC_SIM_FILE = '../data/embeddings/wordsim_old_jc_sim.csv'
WORDSIM_OLD_TOP_SIM_FILE = '../data/embeddings/wordsim_old_top_sim.csv'
DBPEDIA_MC_30_CLASS_SIM_FILE = '../data/embeddings/dbpedia_mc_30_class_sim.csv'
DBPEDIA_MC_30_JC_SIM_FILE = '../data/embeddings/dbpedia_mc_30_jc_sim.csv'
DBPEDIA_MC_30_TOP_SIM_FILE = '../data/embeddings/dbpedia_mc_30_top_sim.csv'
DBPEDIA_RG_65_CLASS_SIM_FILE = '../data/embeddings/dbpedia_rg_65_class_sim.csv'
DBPEDIA_RG_65_JC_SIM_FILE = '../data/embeddings/dbpedia_rg_65_jc_sim.csv'
DBPEDIA_RG_65_TOP_SIM_FILE = '../data/embeddings/dbpedia_rg_65_top_sim.csv'
P279_CHILD_PAR_DISTILBERT_COSSIM_FILE = "../data/basis/P279_ChildPar.all-distilroberta-v1.csv"
WORDSIM_FILE = "../data/evaluation/wordsim353_with_r3.csv"
WORDSIM_OLD_FILE = "../data/evaluation/wordsim_old.csv"
DBPEDIA_MC_30_FINAL_FILE = "../data/evaluation/mc-30_DBpedia.csv"
DBPEDIA_RG_65_FINAL_FILE = "../data/evaluation/rg-65_DBpedia.csv"
def get_all_nodes():
"""
This function generates the set of all nodes needed for execution
"""
p279ChildPar = pd.read_csv(P279_CHILD_PAR_DISTILBERT_COSSIM_FILE)
wordsim_df = pd.read_csv(WORDSIM_FILE)
dbpedia_mc_30_df = pd.read_csv(DBPEDIA_MC_30_FINAL_FILE)
dbpedia_rg_65_df = pd.read_csv(DBPEDIA_RG_65_FINAL_FILE)
# wiki_cs_df = pd.read_csv('../data/wikidata-cs_categorized.csv')
# concept_net_df = pd.read_csv('../data/kgtk_conceptnet_evaluation.csv')
p279QnodesList = set(p279ChildPar.node1.to_list()
+ p279ChildPar.node2.to_list()
+ wordsim_df['word1_kg_id'].to_list()
+ wordsim_df['word2_kg_id'].to_list()
+ dbpedia_mc_30_df['word1_kg_id'].to_list()
+ dbpedia_mc_30_df['word2_kg_id'].to_list()
+ dbpedia_rg_65_df['word1_kg_id'].to_list()
+ dbpedia_rg_65_df['word2_kg_id'].to_list())
# + wiki_cs_df['word1_kg_id'].to_list()
# + wiki_cs_df['word2_kg_id'].to_list()
# + concept_net_df['word1_kg_id'].to_list()
# + concept_net_df['word2_kg_id'].to_list())
print(len(p279QnodesList))
return p279QnodesList
allNodes = get_all_nodes()
241698
def fillCoverage(embedDict):
wordSim353AnnotDF_New = pd.read_csv(WORDSIM_FILE)
wordSim353AnnotDF_set = set(wordSim353AnnotDF_New['word1_kg_id'].to_list() + wordSim353AnnotDF_New['word2_kg_id'].to_list())
embed_size = len(embedDict[next(iter(embedDict))])
# print(embed_size)
count = 0
for word in wordSim353AnnotDF_set:
if word not in embedDict:
embedDict[word] = np.zeros((embed_size))
count += 1
print(f"Added {count} corrections")
return embedDict
def deserializeEmbeddingDict(embedDict):
for key2 in embedDict.keys():
embedDict[key2] = np.array(embedDict[key2])
return embedDict
def serializeEmbeddingDict(embedDict):
for key2 in embedDict.keys():
embedDict[key2] = embedDict[key2].tolist() if type(embedDict[key2]) != list else embedDict[key2]
return embedDict
def get_labels(node_set):
labels_dict = {}
first_line = True
with gzip.open(LABELS_FILE, 'r') as labelsFile:
firstLine = True
for line in tqdm(labelsFile, total=41845781):
if firstLine:
firstLine = False
continue
line = line.decode('utf-8').strip().split('\t')
line[3] = line[3][1:-5]
qnode, label = line[1], line[3]
# print(qnode, label)
if qnode in node_set:
labels_dict[qnode] = label
return labels_dict
def get_labels_n_desc(node_set):
labels_dict = get_labels(node_set)
first_line = True
with gzip.open(DESCRIPTIONS_FILE, 'r') as labelsFile:
firstLine = True
for line in tqdm(labelsFile, total=34700043):
if firstLine:
firstLine = False
continue
line = line.decode('utf-8').strip().split('\t')
line[3] = line[3][1:-5]
qnode, label = line[1], line[3]
# print(qnode, label)
if qnode in node_set:
if qnode in labels_dict:
labels_dict[qnode] += ' ' + label
else:
raise "Label not present"
return labels_dict
complex_emb_dict = json.load(open(COMPLEX_EMB_FINAL_FILE))
first_line = True
complex_emb_dict = {}
with open(COMPLEX_EMB_SOURCE_FILE) as complex_file:
for line in tqdm(complex_file, total=53002671):
if first_line:
first_line = False
continue
line = line.strip().split()
if line[0] in allNodes and line[0] not in complex_emb_dict:
complex_emb_dict[line[0]] = [float(elem) for elem in line[1:]]
0it [00:00, ?it/s]
len(complex_emb_dict)
241698
json.dump(complex_emb_dict, open(COMPLEX_EMB_FINAL_FILE, 'w'))
transe_emb_dict = json.load(open(TRANSE_EMB_FINAL_FILE))
first_line = True
transe_emb_dict = {}
with open(TRANSE_EMB_SOURCE_FILE) as complex_file:
for line in tqdm(complex_file, total=53002671):
if first_line:
first_line = False
continue
line = line.strip().split()
if line[0] in allNodes and line[0] not in transe_emb_dict:
transe_emb_dict[line[0]] = [float(elem) for elem in line[1:]]
len(transe_emb_dict)
0it [00:00, ?it/s]
241698
json.dump(transe_emb_dict, open(TRANSE_EMB_FINAL_FILE, 'w'))
# p279ChildPar = pd.read_csv('../data/Master_P279_dataset/P279ChildPar_transP279_filtered_baremetal.csv')
# print(len(set(p279ChildPar.node1.to_list()
# + p279ChildPar.node2.to_list())))
# # Load complex, transe embedding files and entity names file
# compf = h5py.File('../data/complTrans/complEx.h5','r')
# transf = h5py.File('../data/complTrans/transE.h5','r')
# ent_names = json.load(open('../data/complTrans/entity_names_all_0.json'))
# allNodes = get_all_nodes()
# # json.dump(list(p279QnodesList), open('../data/Master_P279_dataset/P279ChildPar_transP279_filtered_nodes.json', 'w'))
# complexEmb = {qnode: emb for emb, qnode in zip(compf['embeddings'], ent_names) if qnode in allNodes}
# transeEmb = {qnode: emb for emb, qnode in zip(transf['embeddings'], ent_names) if qnode in allNodes}
# print(f"Out of {len(ent_names)} embeddings, retaining {len(transeEmb)} embeddings")
# def serialize_embedding_dict(embed_dict):
# for key2 in embed_dict.keys():
# embed_dict[key2] = embed_dict[key2].tolist() if type(embed_dict[key2]) != list else embed_dict[key2]
# return embed_dict
# json.dump(serialize_embedding_dict(complexEmb),open('../data/Master_P279_dataset/embeddings/complex_orig_embedding_dict.json','w'))
# json.dump(serialize_embedding_dict(transeEmb),open('../data/Master_P279_dataset/embeddings/transe_orig_embedding_dict.json','w'))
# # complexEmb = json.load(open('../data/Master_P279_dataset/embeddings/complex_orig_embedding_dict.json'))
# # transeEmb = json.load(open('../data/Master_P279_dataset/embeddings/transe_orig_embedding_dict.json'))
238889
first_line = True
text_emb_dict = {}
with gzip.open(TEXT_EMB_SOURCE_FILE) as file:
for line in tqdm(file):
if first_line:
first_line = False
continue
line = line.decode('utf-8').strip().split('\t')
if line[1] == 'text_embedding' and line[0] in allNodes:
text_emb_dict[line[0]] = [float(elem) for elem in line[2].split(',')]
0it [00:00, ?it/s]
json.dump(text_emb_dict, open(TEXT_EMB_FINAL_FILE, 'w'))
# text_emb_dict = json.load(open('../data/embeddings/archived/text_7_props_orig_embedding_dict.json.old'))
missing_nodes = []
for node in tqdm(allNodes):
if node not in text_emb_dict:
missing_nodes.append(node)
0%| | 0/241698 [00:00<?, ?it/s]
missing_nodes_set = set(missing_nodes)
new_file = []
with gzip.open(CLAIMS_FILE, 'r') as all_claims_file:
firstLine = True
for ogline in tqdm(all_claims_file, total=491297976):
if firstLine:
firstLine = False
continue
line = ogline.decode('utf-8').strip().split('\t')
line[3] = line[3][1:-5]
qnode, label = line[1], line[3]
# print(qnode, label)
if qnode in missing_nodes_set:
new_file.append(ogline)
0it [00:00, ?it/s]
allowed_props = set(['P31', 'P279', 'P106', 'P39', 'P1382', 'P373', 'P452'])
new_file1 = []
for line in new_file:
line1 = line.decode('utf-8').strip().split('\t')
if line1[2] in allowed_props:
new_file1.append(line.decode('utf-8'))
new_file1 = ['id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\n'] + new_file1
with open('../output/text-embeddings/missing_nodes.tsv', 'w') as f:
f.writelines(new_file1)
# --model sentence-transformers/roberta-large-nli-mean-tokens \
q1 = "~/miniconda3/envs/kgtkEnv2/bin/kgtk text-embedding -i ../output/text-embeddings/missing_nodes.tsv \
--model roberta-large-nli-mean-tokens \
--property-labels-file " + LABELS_FILE + " --debug \
--isa-properties P31 P279 P106 P39 P1382 P373 P452 \
--save-embedding-sentence > ../output/text-embeddings/P279-text-embedding-7-props-missing-qnodes.tsv"
os.system(q1 + " ")
0
text7_missingnodes = pd.read_csv("../output/text-embeddings/P279-text-embedding-7-props-missing-qnodes.tsv", sep='\t')
text7_missingnodes = text7_missingnodes[text7_missingnodes.property == 'text_embedding']
text7_missingnodes['value'] = text7_missingnodes['value'].apply(lambda p: [float(p1) for p1 in p.split(',')])
text7EmbDict = {row['node']: row['value'] for _,row in text7_missingnodes.iterrows()}
for key in text7EmbDict.keys():
if key not in text_emb_dict:
text_emb_dict[key] = text7EmbDict[key]
json.dump(text_emb_dict, open(TEXT_EMB_FINAL_FILE, 'w'))
p279ChildPar = pd.read_csv('../data/Master_P279_dataset/P279ChildPar_transP279_filtered_baremetal.csv')
p279QnodesList = list(set(p279ChildPar.node1.to_list() + p279ChildPar.node2.to_list()))
missingNodes = allNodes - set(p279ChildPar.node1.to_list() + p279ChildPar.node2.to_list())
len(missingNodes)
37038
# Split main file into sub-files for groups of properties for multi-processing
# bsize = len(p279QnodesList) // 250
# cnt = 1
# for i in range(0, len(p279QnodesList), bsize):
# q1 = "kgtk filter -i ../data/wikidataos.for.text-embedding.tsv --word-separator '|' -p '" + '|'.join(p279QnodesList[i:i+bsize]) + ";;' -o ../data/Master_P279_dataset/output/wikidataos.for.text-embedding-ext-" + str(cnt) + ".tsv -v True"
# # print(len(q1))
# cnt += 1
# # print(q1)
# os.system("screen -dm " + q1)
0%| | 0/38 [00:00<?, ?it/s]
for cnt in range(290,503):
os.system("rm ../data/Master_P279_dataset/output/wikidataos.for.text-embedding-ext-" + str(cnt) + ".tsv")
# # Split main file into sub-files for groups of properties for multi-processing
missingNodes = list(missingNodes)
bsize = 1000
cnt = 252
for i in tqdm(range(0, len(missingNodes), bsize)):
q1 = "kgtk filter -i ../data/wikidataos.for.text-embedding.tsv --word-separator '|' -p '" + '|'.join(missingNodes[i:i+bsize]) + ";;' -o ../data/Master_P279_dataset/output/wikidataos.for.text-embedding-ext-" + str(cnt) + ".tsv -v True"
# print(len(q1))
cnt += 1
# print(q1)
os.system("screen -dm " + q1)
0%| | 0/38 [00:00<?, ?it/s]
def checkIfFileContainsLines(file):
with open(file) as f:
for line in f:
return True
return False
def countFileLines(file):
count = 0
with open(file) as f:
for line in f:
count += 1
return count
import time
from os.path import exists
runCommCnt = 1
# 252
for cnt in tqdm(range(252,290)):
if exists("../data/Master_P279_dataset/output/P279-text-embedding-7-props-part-" + str(cnt) + ".tsv") and countFileLines("../data/Master_P279_dataset/output/P279-text-embedding-7-props-part-" + str(cnt) + ".tsv") == 4097:
continue
q1 = ""
# if cnt % 10 == 0:
# q1 += "sleep 20m; "
q1 = "~/miniconda3/envs/kgtkEnv2/bin/kgtk text-embedding -i ../data/Master_P279_dataset/output/wikidataos.for.text-embedding-ext-" + str(cnt) + ".tsv \
--model sentence-transformers/all-distilroberta-v1 \
--property-labels-file ../data/labels.en.tsv --debug \
--isa-properties P31 P279 P106 P39 P1382 P373 P452 \
--save-embedding-sentence > ../data/Master_P279_dataset/output/P279-text-embedding-7-props-part-" + str(cnt) + ".tsv"
print(cnt)
runCommCnt += 1
os.system(q1 + " &")
if runCommCnt % 15 == 0:
time.sleep(11*60)
for cnt in tqdm(range(1,290)):
if countFileLines("../data/Master_P279_dataset/output/P279-text-embedding-7-props-part-" + str(cnt) + ".tsv") != 4097:
print(cnt)
import time
from os.path import exists
# roberta-large-nli-mean-tokens
runCommCnt = 0
for cnt in tqdm(range(252,290)):
if exists("../data/Master_P279_dataset/output/P279-text-embedding-2-props-part-" + str(cnt) + ".tsv") and countFileLines("../data/Master_P279_dataset/output/P279-text-embedding-2-props-part-" + str(cnt) + ".tsv") == 4097:
continue
q1 = ""
# if cnt % 10 == 0:
# q1 += "sleep 20m; "
q1 += "~/miniconda3/envs/kgtkEnv2/bin/kgtk text-embedding -i ../data/Master_P279_dataset/output/wikidataos.for.text-embedding-ext-" + str(cnt) + ".tsv \
--model sentence-transformers/all-distilroberta-v1 \
--property-labels-file ../data/labels.en.tsv --debug \
--isa-properties P31 P279 \
--save-embedding-sentence > ../data/Master_P279_dataset/output/P279-text-embedding-2-props-part-" + str(cnt) + ".tsv"
print(cnt)
runCommCnt += 1
os.system(q1 + " &")
if runCommCnt % 15 == 0:
time.sleep(13*60)
p279QnodesList = set(json.load(open('../data/Master_P279_dataset/P279ChildPar_transP279_filtered_nodes.json')))
# temp = pd.read_csv('../data/Master_P279_dataset/output/P279-text-embedding-2-props-part-1.tsv', sep='\t')
# temp.head()
node | property | value | |
---|---|---|---|
0 | Q99738027 | text_embedding | 0.74755263,1.6350263,-0.73952675,1.0463063,-0.... |
1 | Q99738027 | embedding_sentence | night shift, work shift during nighttime hours... |
2 | Q99228502 | text_embedding | 0.25261465,0.06285462,0.029052094,0.50796187,0... |
3 | Q99228502 | embedding_sentence | avenue, thoroughfare named \"avenue\" is thoro... |
4 | Q98970128 | text_embedding | 0.11887096,0.8598291,0.4446009,-0.5038472,-0.9... |
text2EmbArr = []
for i in tqdm(range(1, 290)):
if not(checkIfFileContainsLines('../data/Master_P279_dataset/output/P279-text-embedding-2-props-part-'+str(i)+'.tsv')):
continue
temp = pd.read_csv('../data/Master_P279_dataset/output/P279-text-embedding-2-props-part-'+str(i)+'.tsv', sep='\t')
temp = temp[temp.property == 'text_embedding']
text2EmbArr.append(temp)
text2Emb = pd.concat(text2EmbArr)
text2Emb.head()
text7EmbArr = []
for i in tqdm(range(1, 290)):
if not(checkIfFileContainsLines('../data/Master_P279_dataset/output/P279-text-embedding-7-props-part-'+str(i)+'.tsv')):
continue
temp = pd.read_csv('../data/Master_P279_dataset/output/P279-text-embedding-7-props-part-'+str(i)+'.tsv', sep='\t')
temp = temp[temp.property == 'text_embedding']
text7EmbArr.append(temp)
text7Emb = pd.concat(text7EmbArr)
text2Emb = text2Emb[text2Emb.node.apply(lambda p: p in allNodes)]
text7Emb = text7Emb[text7Emb.node.apply(lambda p: p in allNodes)]
print(f"We have 2prop text embeddings for {len(text2Emb)} nodes and 7prop for {len(text7Emb)} nodes")
We have 2prop text embeddings for 278467 nodes and 7prop for 277587 nodes
text2Emb['value'] = text2Emb['value'].apply(lambda p: [float(p1) for p1 in p.split(',')])
text7Emb['value'] = text7Emb['value'].apply(lambda p: [float(p1) for p1 in p.split(',')])
text2EmbDict = {row['node']: row['value'] for _,row in text2Emb.iterrows()}
text7EmbDict = {row['node']: row['value'] for _,row in text7Emb.iterrows()}
json.dump(text2EmbDict, open('../data/Master_P279_dataset/embeddings/text_2_props_orig_embedding_dict.json', 'w'))
json.dump(text7EmbDict, open('../data/Master_P279_dataset/embeddings/text_7_props_orig_embedding_dict.json', 'w'))
Downloaded short abstracts file from DBPedia Short Abstracts - 2020.07.01
Then, we extract the abstracts file from the bz2 file using: bzip2 -d short-abstracts_lang=en.ttl.bz2
# cnt = 0
# p1s = []
# p11s = []
# p2s = []
# lines = []
# with open(DBPEDIA_SHORT_ABSTRACTS_TTL, 'r', encoding='utf-8') as f:
# for line in tqdm(f):
# p1 = line[:line.find(" ")]
# p11 = p1[len("<http://dbpedia.org/resource/"):][:-1]
# line = line[line.find(" ")+1:]
# p2 = line[:line.find(" ")]
# line = line[line.find(" ")+1:line.rfind(" ")][1:-4]
# p1s.append(p1)
# p11s.append(p11)
# p2s.append(p2)
# lines.append(line)
# df1 = pd.DataFrame({'urlComp': p11s, 'url':p1s, 'p2': p2s, 'abstract': lines})
# df1.to_csv(DBPEDIA_SHORT_ABSTRACTS_CSV)
0it [00:00, ?it/s]
df1 = pd.read_csv(DBPEDIA_SHORT_ABSTRACTS_CSV, skiprows=1, skipfooter=1, header=None, engine='python')
df1.columns = ['ignore', 'node1', 'url', 'ignore2', 'abstract']
df1 = df1.set_index('node1')
df1[df1.url.apply(lambda p: 'http://dbpedia.org/resource' not in p)]
print(f"DBPedia dataset has {len(df1)} records with unique {df1.index.nunique()} index values")
df1.loc[df1[df1.index.duplicated()].index]
sitelinksDF = pd.read_csv(SITELINKS_FILE_V2, sep='\t')
sitelinksDF['trimmedNode2'] = sitelinksDF.node2.apply(lambda p: p.split("/")[-1] if p.split("/")[-1] != '' else p.split("/")[-2])
sitelinksDF1 = sitelinksDF[sitelinksDF.label == 'wikipedia_sitelink']
sitelinksDF2 = sitelinksDF1.set_index('trimmedNode2')
print(f"There are {len(sitelinksDF2)} sitelinks present in the dataset corresponding to {sitelinksDF2.node1.nunique()} unique node1s (Qxxx), {sitelinksDF2.index.nunique()} unique labels (text)")
sitelinksDF2.loc[sitelinksDF2[sitelinksDF2.index.duplicated()].index]
sitelinksDF2 = sitelinksDF2[sitelinksDF2.node1.apply(lambda p: p in allNodes)]
labelsDF = pd.read_csv(LABELS_FILE, sep='\t')
labelsDF = labelsDF[labelsDF.node1.apply(lambda p: p in allNodes)]
labelsDict = {row['node1']: row['node2'] for _, row in labelsDF.iterrows()}
descriptionsDF = pd.read_csv(DESCRIPTIONS_FILE, compression='gzip', sep='\t')
descriptionsDF = descriptionsDF[descriptionsDF.node1.apply(lambda p: p in allNodes)]
descDict = {row['node1']: row['node2'] for _, row in descriptionsDF.iterrows()}
sdf_set = set(sitelinksDF2.index.to_list())
df1 = df1[df1.index.map(lambda p: p in sdf_set)]
abstractsDF2 = sitelinksDF2.join(df1).reset_index()
print(f"From {len(abstractsDF2)} Qnodes, there are {abstractsDF2.ignore2.isna().sum()} sitelink Qnodes which do not have a short abstract i.e {len(abstractsDF2) - abstractsDF2.ignore2.isna().sum()} have a short abstract")
abstractsDF2['node1_label'] = abstractsDF2.node1.apply(lambda p: labelsDict[p][1:-4] if p in labelsDict else "")
abstractsDF2['node1_desc'] = abstractsDF2.node1.apply(lambda p: descDict[p][1:-4] if p in descDict else "")
from nltk.tokenize import sent_tokenize
abstractsDF2['abstract_firstSent'] = abstractsDF2.abstract.apply(lambda p: sent_tokenize(str(p))[0] if p else None)
currNodes = set(abstractsDF2.node1.tolist())
correctedRows = []
for key in tqdm(allNodes):
if key not in currNodes:
correctedRows.append([None, None, key, None, None, None, None, None, None, labelsDict[key][1:-4] if key in labelsDict else None, descDict[key][1:-4] if key in descDict else None, None])
abstractsDF3 = pd.concat([abstractsDF2, pd.DataFrame(correctedRows, columns=abstractsDF2.columns)])
DBPedia dataset has 5732949 records with unique 5732947 index values There are 8472828 sitelinks present in the dataset corresponding to 8472828 unique node1s (Qxxx), 8401883 unique labels (text) From 116088 Qnodes, there are 5134 sitelink Qnodes which do not have a short abstract i.e 110954 have a short abstract
0%| | 0/241698 [00:00<?, ?it/s]
DBPedia dataset has 5732949 records with unique 5732947 index values There are 8637003 sitelinks present in the dataset corresponding to 8637003 unique node1s (Qxxx), 8563928 unique labels (text) From 116430 Qnodes, there are 5707 sitelink Qnodes which do not have a short abstract i.e 110723 have a short abstract
DBPedia dataset has 5732949 records with unique 5732947 index values There are 8472828 sitelinks present in the dataset corresponding to 8472828 unique node1s (Qxxx), 8401883 unique labels (text)
From 122585 Qnodes, there are 5661 sitelink Qnodes which do not have a short abstract i.e 116924 have a short abstract
def combineAbsLabDesc(row, parameter):
if not(pd.isna(row[parameter])) and row[parameter] != 'nan' and row[parameter] != "":
return row[parameter]
elif row['node1_label'] != '' and row['node1_desc'] != '' and not(pd.isna(row['node1_label'])) and not(pd.isna(row['node1_desc'])):
return row['node1_label'] + ' ' + row['node1_desc']
elif row['node1_label'] != '' and not(pd.isna(row['node1_label'])):
return row['node1_label']
else:
return None
abstractsDF3['abstract'] = abstractsDF3.apply(combineAbsLabDesc, axis=1, args=('abstract',))
abstractsDF3['abstract_firstSent'] = abstractsDF3.apply(combineAbsLabDesc, axis=1, args=('abstract_firstSent',))
abstractsDF3 = abstractsDF3[~abstractsDF3.abstract.isna()]
abstractsDF3 = abstractsDF3.reset_index()
abstractsDF3 = abstractsDF3.drop(columns=['level_0']).reset_index()
abstractsDF3 = abstractsDF3.drop(columns=['level_0'])
print(len(abstractsDF3))
abstractsDF3.to_csv(ABSTRACTS_INTERMEDIATE_FILE)
241698
abstractsDF2 = pd.read_csv(ABSTRACTS_INTERMEDIATE_FILE)
len(abstractsDF2)
/nas/home/kshenoy/miniconda3/envs/kgtkEnv/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3165: DtypeWarning: Columns (1,2,4,5,7,8) have mixed types.Specify dtype option on import or set low_memory=False. has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
241698
# abstractsDF2[abstractsDF2.abstract == (abstractsDF2.node1_label + ' ' + abstractsDF2.node1_desc)].to_csv('../data/Master_P279_dataset/temppppp.csv')
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from time import time
import pandas as pd
def getSentEmbeddings(valSeries, modelName):
model = SentenceTransformer(modelName, device='cuda:2')
start = time()
encodings = model.encode(valSeries.to_list(), show_progress_bar=True, batch_size=1000)
print(time()-start,'s')
return encodings
modelName = 'sentence-transformers/all-distilroberta-v1'
absEmbSeries = getSentEmbeddings(abstractsDF2.abstract, modelName)
absFirstSentEmbSeries = getSentEmbeddings(abstractsDF2.abstract_firstSent, modelName)
Exception when trying to download https://sbert.net/models/sentence-transformers/all-distilroberta-v1.zip. Response 404 SentenceTransformer-Model https://sbert.net/models/sentence-transformers/all-distilroberta-v1.zip not found. Try to create it from scratch Try to create Transformer Model sentence-transformers/all-distilroberta-v1 with mean pooling
Batches: 0%| | 0/242 [00:00<?, ?it/s]
316.05097579956055 s
Exception when trying to download https://sbert.net/models/sentence-transformers/all-distilroberta-v1.zip. Response 404 SentenceTransformer-Model https://sbert.net/models/sentence-transformers/all-distilroberta-v1.zip not found. Try to create it from scratch Try to create Transformer Model sentence-transformers/all-distilroberta-v1 with mean pooling
Batches: 0%| | 0/242 [00:00<?, ?it/s]
260.16796946525574 s
absEmbDict = {node: emb.tolist() for node, emb in zip(abstractsDF2.node1.to_list(), absEmbSeries)}
absFirstSentEmbDict = {node: emb.tolist() for node, emb in zip(abstractsDF2.node1.to_list(), absFirstSentEmbSeries)}
json.dump(absEmbDict, open(ABS_EMB_FINAL_FILE, 'w'))
json.dump(absFirstSentEmbDict, open(ABS_FIRST_SENT_EMB_FINAL_FILE, 'w'))
Fetched from sita
/data02/profiling/kgtk/entity_profiling/output/wikidata-20210215-dwd/
def store_relevant_embeddings(wvec, fname):
tempEmb = {key: wvec[key] for key in wvec.index_to_key if key in allNodes}
print(f"Original Length: {len(wvec.index_to_key)}, No. of keys stored: {len(tempEmb)}")
json.dump(serializeEmbeddingDict(tempEmb),open(fname, 'w'))
from gensim.models import KeyedVectors, Word2Vec
%%time
a_key_vec = KeyedVectors.load(A_SOURCE_FILE)
store_relevant_embeddings(a_key_vec, A_OP_FILE)
h_key_vec = KeyedVectors.load(H_SOURCE_FILE)
store_relevant_embeddings(h_key_vec, H_OP_FILE)
s_key_vec = KeyedVectors.load(S_SOURCE_FILE)
store_relevant_embeddings(s_key_vec, S_OP_FILE)
Original Length: 12106870, No. of keys stored: 27876 Original Length: 19593942, No. of keys stored: 166201 Original Length: 39030788, No. of keys stored: 116993 CPU times: user 5min 54s, sys: 2min 1s, total: 7min 56s Wall time: 18min 51s
labels_dict = get_labels(allNodes)
modelName = 'sentence-transformers/all-distilroberta-v1'
embs = getSentEmbeddings(pd.Series(labels_dict.values()), modelName)
Exception when trying to download https://sbert.net/models/sentence-transformers/all-distilroberta-v1.zip. Response 404 SentenceTransformer-Model https://sbert.net/models/sentence-transformers/all-distilroberta-v1.zip not found. Try to create it from scratch Try to create Transformer Model sentence-transformers/all-distilroberta-v1 with mean pooling
Batches: 0%| | 0/242 [00:00<?, ?it/s]
98.77918219566345 s
labels_emb_dict = {k:v.tolist() for k, v in (zip(labels_dict.keys(), embs))}
json.dump(labels_emb_dict, open(LABELS_EMB_FINAL_FILE, 'w'))
labels_desc_dict = get_labels_n_desc(allNodes)
modelName = 'sentence-transformers/all-distilroberta-v1'
embs = getSentEmbeddings(pd.Series(labels_desc_dict.values()), modelName)
0%| | 0/41845781 [00:00<?, ?it/s]
0%| | 0/34700043 [00:00<?, ?it/s]
Exception when trying to download https://sbert.net/models/sentence-transformers/all-distilroberta-v1.zip. Response 404 SentenceTransformer-Model https://sbert.net/models/sentence-transformers/all-distilroberta-v1.zip not found. Try to create it from scratch Try to create Transformer Model sentence-transformers/all-distilroberta-v1 with mean pooling
Batches: 0%| | 0/242 [00:00<?, ?it/s]
203.93888425827026 s
labels_desc_emb_dict = {k:v.tolist() for k, v in (zip(labels_desc_dict.keys(), embs))}
json.dump(labels_desc_emb_dict, open(LABELS_DESC_EMB_FINAL_FILE, 'w'))
import requests
from tqdm.notebook import tqdm
import json
from joblib import Parallel, delayed
import sys
word_sim_df = pd.read_csv(WORDSIM_FILE)
def fetchSim(row, similarity_type):
resp = requests.get("https://kgtk.isi.edu/similarity_api?q1="+row['word1_kg_id']+"&q2="+row['word2_kg_id']+"&embedding_type="+similarity_type)
try:
row['embedding_cos_sim'] = float(resp.json()['similarity']) if resp else None
except Exception as exc:
print(exc)
print(f"Resp not found for {row['node1']}, {row['node2']}")
row['embedding_cos_sim'] = None
row['Resp_code'] = resp
return row
word_sim_class_sim_df = pd.DataFrame(Parallel(n_jobs=5)(delayed(fetchSim)(row, 'class') for _, row in tqdm(word_sim_df.iterrows(), total=len(word_sim_df))))
word_sim_jc_sim_df = pd.DataFrame(Parallel(n_jobs=5)(delayed(fetchSim)(row, 'jc') for _, row in tqdm(word_sim_df.iterrows(), total=len(word_sim_df))))
word_sim_top_sim_df = pd.DataFrame(Parallel(n_jobs=5)(delayed(fetchSim)(row, 'topsim') for _, row in tqdm(word_sim_df.iterrows(), total=len(word_sim_df))))
word_sim_class_sim_df['embedding_cos_sim'] = word_sim_class_sim_df['embedding_cos_sim'].apply(lambda p: 4 - 3 * abs(p) if p is not None else p)
word_sim_class_sim_df['embedding_na'] = word_sim_class_sim_df['embedding_cos_sim'].isna()
word_sim_class_sim_df['embedding_cos_sim'].fillna(word_sim_class_sim_df['embedding_cos_sim'].mean(skipna=True), inplace=True)
word_sim_jc_sim_df['embedding_cos_sim'] = word_sim_jc_sim_df['embedding_cos_sim'].apply(lambda p: 4 - 3 * abs(p) if p is not None else p)
word_sim_jc_sim_df['embedding_na'] = word_sim_jc_sim_df['embedding_cos_sim'].isna()
word_sim_jc_sim_df['embedding_cos_sim'].fillna(word_sim_jc_sim_df['embedding_cos_sim'].mean(skipna=True), inplace=True)
word_sim_top_sim_df['embedding_cos_sim'] = word_sim_top_sim_df['embedding_cos_sim'].apply(lambda p: 4 - 3 * abs(p) if p is not None else p)
word_sim_top_sim_df['embedding_na'] = word_sim_top_sim_df['embedding_cos_sim'].isna()
word_sim_top_sim_df['embedding_cos_sim'].fillna(word_sim_top_sim_df['embedding_cos_sim'].mean(skipna=True), inplace=True)
word_sim_class_sim_df.to_csv(WORDSIM_CLASS_SIM_FILE, index=None)
word_sim_jc_sim_df.to_csv(WORDSIM_JC_SIM_FILE, index=None)
word_sim_top_sim_df.to_csv(WORDSIM_TOP_SIM_FILE, index=None)
0%| | 0/334 [00:00<?, ?it/s]
0%| | 0/334 [00:00<?, ?it/s]
0%| | 0/334 [00:00<?, ?it/s]
word_sim_class_sim_df.head()
Word 1 | Word 2 | ID | H_Sim | H_Dim | F_Sim | F_Dim | N_Sim | N_Dim | D_Sim | ... | P_Dim | Avg | Stdev | H_orig | H_reversed | word1_kg_id | word2_kg_id | category | embedding_cos_sim | Resp_code | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Arafat | peace | 8 | 3 | D | 4 | NaN | 3 | U | 4 | ... | NaN | 3.6 | 0.547723 | 2.1250 | 7.8750 | Q34211 | Q454 | U | 3.982734 | <Response [200]> |
1 | Arafat | terror | 9 | 3 | D | 4 | NaN | 3 | U | 4 | ... | NaN | 3.6 | 0.547723 | 3.0625 | 6.9375 | Q34211 | Q13648784 | U | 3.969884 | <Response [200]> |
2 | FBI | fingerprint | 109 | 3 | D | 4 | NaN | 4 | NaN | 3 | ... | NaN | 3.6 | 0.547723 | 4.0625 | 5.9375 | Q8333 | Q178022 | U | 4.000000 | <Response [200]> |
3 | FBI | investigation | 110 | 3 | U | 3 | U | 3 | U | 3 | ... | u | 3.0 | 0.000000 | 5.0625 | 4.9375 | Q8333 | Q21004260 | M | 3.951077 | <Response [200]> |
4 | Harvard | Yale | 137 | 2 | S | 3 | S | 2 | S | 2 | ... | s | 2.2 | 0.447214 | 4.8750 | 5.1250 | Q13371 | Q49112 | M | 1.264601 | <Response [200]> |
5 rows × 22 columns
import requests
from tqdm.notebook import tqdm
import json
from joblib import Parallel, delayed
import sys
word_sim_df = pd.read_csv(WORDSIM_OLD_FILE)
def fetchSim(row, similarity_type):
resp = requests.get("https://kgtk.isi.edu/similarity_api?q1="+row['word1_kg_id']+"&q2="+row['word2_kg_id']+"&embedding_type="+similarity_type)
try:
row['embedding_cos_sim'] = float(resp.json()['similarity']) if resp else None
except Exception as exc:
print(exc)
print(f"Resp not found for {row['node1']}, {row['node2']}")
row['embedding_cos_sim'] = None
row['Resp_code'] = resp
return row
word_sim_class_sim_df = pd.DataFrame(Parallel(n_jobs=5)(delayed(fetchSim)(row, 'class') for _, row in tqdm(word_sim_df.iterrows(), total=len(word_sim_df))))
word_sim_jc_sim_df = pd.DataFrame(Parallel(n_jobs=5)(delayed(fetchSim)(row, 'jc') for _, row in tqdm(word_sim_df.iterrows(), total=len(word_sim_df))))
word_sim_top_sim_df = pd.DataFrame(Parallel(n_jobs=5)(delayed(fetchSim)(row, 'topsim') for _, row in tqdm(word_sim_df.iterrows(), total=len(word_sim_df))))
word_sim_class_sim_df['embedding_cos_sim'] = word_sim_class_sim_df['embedding_cos_sim'].apply(lambda p: 4 - 3 * abs(p) if p is not None else p)
word_sim_class_sim_df['embedding_na'] = word_sim_class_sim_df['embedding_cos_sim'].isna()
word_sim_class_sim_df['embedding_cos_sim'].fillna(word_sim_class_sim_df['embedding_cos_sim'].mean(skipna=True), inplace=True)
word_sim_jc_sim_df['embedding_cos_sim'] = word_sim_jc_sim_df['embedding_cos_sim'].apply(lambda p: 4 - 3 * abs(p) if p is not None else p)
word_sim_jc_sim_df['embedding_na'] = word_sim_jc_sim_df['embedding_cos_sim'].isna()
word_sim_jc_sim_df['embedding_cos_sim'].fillna(word_sim_jc_sim_df['embedding_cos_sim'].mean(skipna=True), inplace=True)
word_sim_top_sim_df['embedding_cos_sim'] = word_sim_top_sim_df['embedding_cos_sim'].apply(lambda p: 4 - 3 * abs(p) if p is not None else p)
word_sim_top_sim_df['embedding_na'] = word_sim_top_sim_df['embedding_cos_sim'].isna()
word_sim_top_sim_df['embedding_cos_sim'].fillna(word_sim_top_sim_df['embedding_cos_sim'].mean(skipna=True), inplace=True)
word_sim_class_sim_df.to_csv(WORDSIM_OLD_CLASS_SIM_FILE, index=None)
word_sim_jc_sim_df.to_csv(WORDSIM_OLD_JC_SIM_FILE, index=None)
word_sim_top_sim_df.to_csv(WORDSIM_OLD_TOP_SIM_FILE, index=None)
0%| | 0/349 [00:00<?, ?it/s]
0%| | 0/349 [00:00<?, ?it/s]
0%| | 0/349 [00:00<?, ?it/s]
import requests
from tqdm.notebook import tqdm
import json
from joblib import Parallel, delayed
import sys
word_sim_df = pd.read_csv(DBPEDIA_MC_30_FINAL_FILE)
def fetchSim(row, similarity_type):
resp = requests.get("https://kgtk.isi.edu/similarity_api?q1="+row['word1_kg_id']+"&q2="+row['word2_kg_id']+"&embedding_type="+similarity_type)
try:
row['embedding_cos_sim'] = float(resp.json()['similarity']) if resp else None
except Exception as exc:
print(exc)
print(f"Resp not found for {row['node1']}, {row['node2']}")
row['embedding_cos_sim'] = None
row['Resp_code'] = resp
return row
word_sim_class_sim_df = pd.DataFrame(Parallel(n_jobs=5)(delayed(fetchSim)(row, 'class') for _, row in tqdm(word_sim_df.iterrows(), total=len(word_sim_df))))
word_sim_jc_sim_df = pd.DataFrame(Parallel(n_jobs=5)(delayed(fetchSim)(row, 'jc') for _, row in tqdm(word_sim_df.iterrows(), total=len(word_sim_df))))
word_sim_top_sim_df = pd.DataFrame(Parallel(n_jobs=5)(delayed(fetchSim)(row, 'topsim') for _, row in tqdm(word_sim_df.iterrows(), total=len(word_sim_df))))
word_sim_class_sim_df['embedding_cos_sim'] = word_sim_class_sim_df['embedding_cos_sim'].apply(lambda p: 4 - 3 * abs(p) if p is not None else p)
word_sim_class_sim_df['embedding_na'] = word_sim_class_sim_df['embedding_cos_sim'].isna()
word_sim_class_sim_df['embedding_cos_sim'].fillna(word_sim_class_sim_df['embedding_cos_sim'].mean(skipna=True), inplace=True)
word_sim_jc_sim_df['embedding_cos_sim'] = word_sim_jc_sim_df['embedding_cos_sim'].apply(lambda p: 4 - 3 * abs(p) if p is not None else p)
word_sim_jc_sim_df['embedding_na'] = word_sim_jc_sim_df['embedding_cos_sim'].isna()
word_sim_jc_sim_df['embedding_cos_sim'].fillna(word_sim_jc_sim_df['embedding_cos_sim'].mean(skipna=True), inplace=True)
word_sim_top_sim_df['embedding_cos_sim'] = word_sim_top_sim_df['embedding_cos_sim'].apply(lambda p: 4 - 3 * abs(p) if p is not None else p)
word_sim_top_sim_df['embedding_na'] = word_sim_top_sim_df['embedding_cos_sim'].isna()
word_sim_top_sim_df['embedding_cos_sim'].fillna(word_sim_top_sim_df['embedding_cos_sim'].mean(skipna=True), inplace=True)
word_sim_class_sim_df.to_csv(DBPEDIA_MC_30_CLASS_SIM_FILE, index=None)
word_sim_jc_sim_df.to_csv(DBPEDIA_MC_30_JC_SIM_FILE, index=None)
word_sim_top_sim_df.to_csv(DBPEDIA_MC_30_TOP_SIM_FILE, index=None)
0%| | 0/16 [00:00<?, ?it/s]
0%| | 0/16 [00:00<?, ?it/s]
0%| | 0/16 [00:00<?, ?it/s]
import requests
from tqdm.notebook import tqdm
import json
from joblib import Parallel, delayed
import sys
word_sim_df = pd.read_csv(DBPEDIA_RG_65_FINAL_FILE)
def fetchSim(row, similarity_type):
resp = requests.get("https://kgtk.isi.edu/similarity_api?q1="+row['word1_kg_id']+"&q2="+row['word2_kg_id']+"&embedding_type="+similarity_type)
try:
row['embedding_cos_sim'] = float(resp.json()['similarity']) if resp else None
except Exception as exc:
print(exc)
print(f"Resp not found for {row['node1']}, {row['node2']}")
row['embedding_cos_sim'] = None
row['Resp_code'] = resp
return row
word_sim_class_sim_df = pd.DataFrame(Parallel(n_jobs=5)(delayed(fetchSim)(row, 'class') for _, row in tqdm(word_sim_df.iterrows(), total=len(word_sim_df))))
word_sim_jc_sim_df = pd.DataFrame(Parallel(n_jobs=5)(delayed(fetchSim)(row, 'jc') for _, row in tqdm(word_sim_df.iterrows(), total=len(word_sim_df))))
word_sim_top_sim_df = pd.DataFrame(Parallel(n_jobs=5)(delayed(fetchSim)(row, 'topsim') for _, row in tqdm(word_sim_df.iterrows(), total=len(word_sim_df))))
word_sim_class_sim_df['embedding_cos_sim'] = word_sim_class_sim_df['embedding_cos_sim'].apply(lambda p: 4 - 3 * abs(p) if p is not None else p)
word_sim_class_sim_df['embedding_na'] = word_sim_class_sim_df['embedding_cos_sim'].isna()
word_sim_class_sim_df['embedding_cos_sim'].fillna(word_sim_class_sim_df['embedding_cos_sim'].mean(skipna=True), inplace=True)
word_sim_jc_sim_df['embedding_cos_sim'] = word_sim_jc_sim_df['embedding_cos_sim'].apply(lambda p: 4 - 3 * abs(p) if p is not None else p)
word_sim_jc_sim_df['embedding_na'] = word_sim_jc_sim_df['embedding_cos_sim'].isna()
word_sim_jc_sim_df['embedding_cos_sim'].fillna(word_sim_jc_sim_df['embedding_cos_sim'].mean(skipna=True), inplace=True)
word_sim_top_sim_df['embedding_cos_sim'] = word_sim_top_sim_df['embedding_cos_sim'].apply(lambda p: 4 - 3 * abs(p) if p is not None else p)
word_sim_top_sim_df['embedding_na'] = word_sim_top_sim_df['embedding_cos_sim'].isna()
word_sim_top_sim_df['embedding_cos_sim'].fillna(word_sim_top_sim_df['embedding_cos_sim'].mean(skipna=True), inplace=True)
word_sim_class_sim_df.to_csv(DBPEDIA_RG_65_CLASS_SIM_FILE, index=None)
word_sim_jc_sim_df.to_csv(DBPEDIA_RG_65_JC_SIM_FILE, index=None)
word_sim_top_sim_df.to_csv(DBPEDIA_RG_65_TOP_SIM_FILE, index=None)
0%| | 0/34 [00:00<?, ?it/s]
0%| | 0/34 [00:00<?, ?it/s]
0%| | 0/34 [00:00<?, ?it/s]
p279QnodesList = set(json.load(open('../data/Master_P279_dataset/P279ChildPar_transP279_filtered_nodes.json')))
complexEmb = json.load(open('../data/Master_P279_dataset/masterComplexEmb.json'))
transeEmb = json.load(open('../data/Master_P279_dataset/masterTranseEmb.json'))
text2Emb = json.load(open('../data/Master_P279_dataset/text2Emb.json'))
text7Emb = json.load(open('../data/Master_P279_dataset/text7Emb.json'))
abstractEmb = json.load(open('../data/Master_P279_dataset/abstractEmb.json'))
abstractFirstSentEmb = json.load(open('../data/Master_P279_dataset/abstractFirstSentEmb.json'))
json.dump({key:val for key, val in complexEmb.items() if key in p279QnodesList}, open('../data/Master_P279_dataset/embeddings/complex_orig_embedding_dict.json', 'w'))
json.dump({key:val for key, val in transeEmb.items() if key in p279QnodesList}, open('../data/Master_P279_dataset/embeddings/transe_orig_embedding_dict.json', 'w'))
json.dump({key:val for key, val in text2Emb.items() if key in p279QnodesList}, open('../data/Master_P279_dataset/embeddings/text_2_props_orig_embedding_dict.json', 'w'))
json.dump({key:val for key, val in text7Emb.items() if key in p279QnodesList}, open('../data/Master_P279_dataset/embeddings/text_7_props_orig_embedding_dict.json', 'w'))
json.dump({key:val for key, val in abstractEmb.items() if key in p279QnodesList}, open('../data/Master_P279_dataset/embeddings/abstract_orig_embedding_dict.json', 'w'))
json.dump({key:val for key, val in abstractFirstSentEmb.items() if key in p279QnodesList}, open('../data/Master_P279_dataset/embeddings/abstract_first_sent_orig_embedding_dict.json', 'w'))
def countOverlap(source, target):
cnt = 0
for key1 in source:
if key1 in target:
cnt += 1
return cnt
summArr = []
cnt = countOverlap(complexEmb, p279QnodesList)
summArr.append(['complex', cnt, cnt / len(p279QnodesList) * 100])
cnt = countOverlap(transeEmb, p279QnodesList)
summArr.append(['transe', cnt, cnt / len(p279QnodesList) * 100])
cnt = countOverlap(text2Emb, p279QnodesList)
summArr.append(['text2', cnt, cnt / len(p279QnodesList) * 100])
cnt = countOverlap(text7Emb, p279QnodesList)
summArr.append(['text7', cnt, cnt / len(p279QnodesList) * 100])
cnt = countOverlap(abstractEmb, p279QnodesList)
summArr.append(['abstract', cnt, cnt / len(p279QnodesList) * 100])
cnt = countOverlap(abstractFirstSentEmb, p279QnodesList)
summArr.append(['abstractFirstSent', cnt, cnt / len(p279QnodesList) * 100])
len(p279QnodesList)
238889
pd.DataFrame(summArr, columns=['embedding', 'count', 'Coverage Percentage'])
embedding | count | Coverage Percentage | |
---|---|---|---|
0 | complex | 238448 | 99.815395 |
1 | transe | 238448 | 99.815395 |
2 | text2 | 238889 | 100.000000 |
3 | text7 | 238889 | 100.000000 |
4 | abstract | 105828 | 44.300072 |
5 | abstractFirstSent | 105828 | 44.300072 |
masterEmbedDictMaster = {}
subsetEmbedDictMaster = {}
masterEmbedKeys = ['text_7_props', 'text_2_props', 'complex', 'transe', 'abstract', 'abstract_first_sent']
for key1 in masterEmbedKeys:
masterEmbedDictMaster[key1] = json.load(open('../data/Master_P279_dataset/embeddings/'+key1+'_orig_embedding_dict.json'))
subsetEmbedKeys = ['text_7props', 'text_2props', 'complex', 'transe', 'abstract', 'abstract_first_sent']
for key1 in subsetEmbedKeys:
subsetEmbedDictMaster[key1] = json.load(open('../data/orig_embeddings/'+key1+'_original_embeddings_dict.json'))
wordSim353AnnotDF_New = pd.read_csv('../data/wordsim353_with_r3.csv')
wordsim_pairs = {(row['word1_kg_id'], row['word2_kg_id']) for _, row in wordSim353AnnotDF_New.iterrows()}
for key1 in subsetEmbedKeys:
print(f"Pair Coverage by {key1} embeddings created for 19k retrofitting: {sum([row[0] in subsetEmbedDictMaster[key1] and row[1] in subsetEmbedDictMaster[key1] for row in wordsim_pairs])}")
Pair Coverage by text_7props embeddings created for 19k retrofitting: 325 Pair Coverage by text_2props embeddings created for 19k retrofitting: 325 Pair Coverage by complex embeddings created for 19k retrofitting: 342 Pair Coverage by transe embeddings created for 19k retrofitting: 342 Pair Coverage by abstract embeddings created for 19k retrofitting: 343 Pair Coverage by abstract_first_sent embeddings created for 19k retrofitting: 343
for key1 in masterEmbedKeys:
print(f"Pair Coverage by old {key1} embeddings created for 19k retrofitting: {sum([row[0] in masterEmbedDictMaster[key1] and row[1] in masterEmbedDictMaster[key1] for row in wordsim_pairs])}")
Pair Coverage by old text_7_props embeddings created for 19k retrofitting: 278 Pair Coverage by old text_2_props embeddings created for 19k retrofitting: 278 Pair Coverage by old complex embeddings created for 19k retrofitting: 278 Pair Coverage by old transe embeddings created for 19k retrofitting: 278 Pair Coverage by old abstract embeddings created for 19k retrofitting: 183 Pair Coverage by old abstract_first_sent embeddings created for 19k retrofitting: 183
wordSim353AnnotDF_New_set = set(wordSim353AnnotDF_New.word1_kg_id.to_list() + wordSim353AnnotDF_New.word2_kg_id.to_list())
from collections import defaultdict
masterEmbCorrections = defaultdict(list)
for node in wordSim353AnnotDF_New_set:
for i in range(len(masterEmbedKeys)):
if node not in masterEmbedDictMaster[masterEmbedKeys[i]] and node in wordSim353AnnotDF_New_set:
masterEmbCorrections[masterEmbedKeys[i]].append(node)
masterEmbCorrections.keys()
dict_keys(['abstract', 'abstract_first_sent', 'text_7_props', 'text_2_props', 'complex', 'transe'])
# import requests
# correctedComplexEmb = {}
# correctedTranseEmb = {}
# for wordID in masterEmbCorrections['complex']:
# try:
# resp = requests.get("http://ckg07:9200/wikidatadwd-augmented/_doc/"+wordID).json()['_source']
# correctedComplexEmb[wordID] = [float(p) for p in resp['graph_embedding_complex'].split(',')]
# correctedTranseEmb[wordID] = [float(p) for p in resp['graph_embeddings_transe'].split(',')]
# except:
# print("Failure returned for http://ckg07:9200/wikidatadwd-augmented/_doc/"+wordID)
%%time
correctedComplexEmb = {qnode: emb for emb, qnode in tqdm(zip(f['embeddings'], ent_names), total=len(ent_names)) if qnode in masterEmbCorrections['complex']}
correctedTranseEmb = {qnode: emb for emb, qnode in tqdm(zip(transf['embeddings'], ent_names), total=len(ent_names)) if qnode in masterEmbCorrections['complex']}
0%| | 0/42575933 [00:00<?, ?it/s]
0%| | 0/42575933 [00:00<?, ?it/s]
CPU times: user 1h 33min 17s, sys: 2min 38s, total: 1h 35min 56s Wall time: 1h 35min 28s
len(list(correctedComplexEmb.items())[0][1])
200
len(masterEmbedDictMaster['complex'][list(masterEmbedDictMaster['complex'].keys())[0]])
for node, emb in correctedComplexEmb.items():
masterEmbedDictMaster['complex'][node] = emb
for node, emb in correctedTranseEmb.items():
masterEmbedDictMaster['transe'][node] = emb
q1 = "kgtk filter -i ../data/wikidataos.for.text-embedding.tsv --word-separator '|' -p '" + '|'.join(masterEmbCorrections['text_7_props']) + ";;' -o ../data/Master_P279_dataset/output/wikidataos.for.text-embedding-ext-corrections.tsv -v True"
os.system("screen -dm " + q1)
0
q1 = "kgtk text-embedding -i ../data/Master_P279_dataset/output/wikidataos.for.text-embedding-ext-corrections.tsv \
--model roberta-large-nli-mean-tokens \
--property-labels-file ../data/labels.en.tsv --debug \
--isa-properties P31 P279 P106 P39 P1382 P373 P452 \
--save-embedding-sentence > ../data/Master_P279_dataset/output/P279-text-embedding-7-props-part-corrections.tsv"
# print(q1)
os.system(q1 + " &")
q1 = "kgtk text-embedding -i ../data/Master_P279_dataset/output/wikidataos.for.text-embedding-ext-corrections.tsv \
--model roberta-large-nli-mean-tokens \
--property-labels-file ../data/labels.en.tsv --debug \
--isa-properties P31 P279 \
--save-embedding-sentence > ../data/Master_P279_dataset/output/P279-text-embedding-2-props-part-corrections.tsv"
# print(q1)
os.system(q1 + " &")
32512
corrected7Emb = pd.read_csv('../data/Master_P279_dataset/output/P279-text-embedding-7-props-part-corrections.tsv', sep='\t')
corrected2Emb = pd.read_csv('../data/Master_P279_dataset/output/P279-text-embedding-2-props-part-corrections.tsv', sep='\t')
corrected7Emb = corrected7Emb[corrected7Emb.property == 'text_embedding']
corrected7Emb['value'] = corrected7Emb.value.apply(lambda p: [float(p1) for p1 in p.split(',')])
corrected2Emb = corrected2Emb[corrected2Emb.property == 'text_embedding']
corrected2Emb['value'] = corrected2Emb.value.apply(lambda p: [float(p1) for p1 in p.split(',')])
for _, row in corrected7Emb.iterrows():
masterEmbedDictMaster['text_7_props'][row['node']] = row['value']
for _, row in corrected2Emb.iterrows():
masterEmbedDictMaster['text_2_props'][row['node']] = row['value']
df1 = pd.read_csv("../data/short-abstracts_lang=en.csv", skiprows=1, skipfooter=1, header=None, engine='python')
df1.columns = ['ignore', 'node1', 'url', 'ignore2', 'abstract']
df1 = df1.set_index('node1')
df1[df1.url.apply(lambda p: 'http://dbpedia.org/resource' not in p)]
print(f"DBPedia dataset has {len(df1)} records with unique {df1.index.nunique()} index values")
sitelinksDF = pd.read_csv("../data/sitelinks.en.tsv.gz", sep='\t')
sitelinksDF['trimmedNode2'] = sitelinksDF.node2.apply(lambda p: p.split("/")[-1] if p.split("/")[-1] != '' else p.split("/")[-2])
sitelinksDF1 = sitelinksDF[sitelinksDF.label == 'wikipedia_sitelink']
sitelinksDF2 = sitelinksDF1.set_index('trimmedNode2')
print(f"There are {len(sitelinksDF2)} sitelinks present in the dataset corresponding to {sitelinksDF2.node1.nunique()} unique node1s (Qxxx), {sitelinksDF2.index.nunique()} unique labels (text)")
sitelinksDF2.loc[sitelinksDF2[sitelinksDF2.index.duplicated()].index]
masterEmbCorrections_abs_set = set(masterEmbCorrections['abstract'])
sitelinksDF2 = sitelinksDF2[sitelinksDF2.node1.apply(lambda p: p in masterEmbCorrections_abs_set)]
DBPedia dataset has 5732949 records with unique 5732947 index values There are 8472828 sitelinks present in the dataset corresponding to 8472828 unique node1s (Qxxx), 8401883 unique labels (text)
labelsDF = pd.read_csv('../data/labels.en.tsv', sep='\t')
labelsDF = labelsDF[labelsDF.node1.apply(lambda p: p in masterEmbCorrections_abs_set)]
labelsDict = {row['node1']: row['node2'] for _, row in labelsDF.iterrows()}
descriptionsDF = pd.read_csv('../../wd-correctness/gdrive-kgtk-dump-2020-12-07/descriptions.en.tsv.gz', compression='gzip', sep='\t')
descriptionsDF = descriptionsDF[descriptionsDF.node1.apply(lambda p: p in masterEmbCorrections_abs_set)]
descDict = {row['node1']: row['node2'] for _, row in descriptionsDF.iterrows()}
sdf_set = set(sitelinksDF2.index.to_list())
df1 = df1[df1.index.map(lambda p: p in sdf_set)]
abstractsDF2 = sitelinksDF2.join(df1).reset_index()
print(f"From {len(abstractsDF2)} Qnodes, there are {abstractsDF2.ignore2.isna().sum()} sitelink Qnodes which do not have a short abstract i.e {len(abstractsDF2) - abstractsDF2.ignore2.isna().sum()} have a short abstract")
# abstractsDF2 = abstractsDF2[~abstractsDF2.abstract.isna()]
From 58 Qnodes, there are 16 sitelink Qnodes which do not have a short abstract i.e 42 have a short abstract
abstractsDF2['node1_label'] = abstractsDF2.node1.apply(lambda p: labelsDict[p][1:-4] if p in labelsDict else "")
abstractsDF2['node1_desc'] = abstractsDF2.node1.apply(lambda p: descDict[p][1:-4] if p in descDict else "")
def combineAbsLabDesc(row, parameter):
if not(pd.isna(row[parameter])) and row[parameter] != 'nan' and row[parameter] != "":
return row[parameter]
elif row['node1_label'] == "" and row['node1_desc'] == "":
return None
else:
return row['node1_label'] + ' ' + row['node1_desc']
from nltk.tokenize import sent_tokenize
abstractsDF2['abstract_firstSent'] = abstractsDF2.abstract.apply(lambda p: sent_tokenize(str(p))[0] if p else None)
abstractsDF2 = abstractsDF2.reset_index()
abstractsDF2['abstract'] = abstractsDF2.apply(combineAbsLabDesc, axis=1, args=('abstract',))
abstractsDF2['abstract_firstSent'] = abstractsDF2.apply(combineAbsLabDesc, axis=1, args=('abstract_firstSent',))
len(abstractsDF2)
58
abstractsDF2 = abstractsDF2[~abstractsDF2.abstract.isna()]
len(abstractsDF2)
58
abstractsDF2 = abstractsDF2.drop(columns=['index']).reset_index()
abstractsDF2.head()
level_0 | index | trimmedNode2 | id | node1 | label | node2 | ignore | url | ignore2 | abstract | node1_label | node1_desc | abstract_firstSent | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | Luxuries | Q10953913-wikipedia_sitelink-538fe3-0 | Q10953913 | wikipedia_sitelink | http://en.wikipedia.org/wiki/Luxuries | NaN | NaN | NaN | luxuryBehavior, expenses or equipment that far... | luxury | Behavior, expenses or equipment that far excee... | nan |
1 | 1 | 1 | Potato | Q10998-wikipedia_sitelink-56b85c-0 | Q10998 | wikipedia_sitelink | http://en.wikipedia.org/wiki/Potato | 10709.0 | <http://dbpedia.org/resource/Potato> | <http://www.w3.org/2000/01/rdf-schema#comment> | The potato is a root vegetable native to the A... | potato | species of plant | The potato is a root vegetable native to the A... |
2 | 2 | 2 | Mars | Q111-wikipedia_sitelink-9ff296-0 | Q111 | wikipedia_sitelink | http://en.wikipedia.org/wiki/Mars | 1803088.0 | <http://dbpedia.org/resource/Mars> | <http://www.w3.org/2000/01/rdf-schema#comment> | Mars is the fourth planet from the Sun and the... | Mars | fourth planet from the Sun | Mars is the fourth planet from the Sun and the... |
3 | 3 | 3 | Dawn | Q11326182-wikipedia_sitelink-ae2918-0 | Q11326182 | wikipedia_sitelink | http://en.wikipedia.org/wiki/Dawn | 97544.0 | <http://dbpedia.org/resource/Dawn> | <http://www.w3.org/2000/01/rdf-schema#comment> | Dawn is the time that marks the beginning of t... | dawn | time that marks the beginning of the twilight ... | Dawn is the time that marks the beginning of t... |
4 | 4 | 4 | Change_(philosophy) | Q1150070-wikipedia_sitelink-81cf5f-0 | Q1150070 | wikipedia_sitelink | http://en.wikipedia.org/wiki/Change_(philosophy) | NaN | NaN | NaN | changeprocess, event or action that deviates f... | change | process, event or action that deviates from th... | nan |
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from time import time
import pandas as pd
def getSentEmbeddings(valSeries, modelName):
model = SentenceTransformer(modelName)
start = time()
encodings = model.encode(valSeries.to_list())
print(time()-start,'s')
return encodings
def getIndSentEmbeddings(sent, modelName):
model = SentenceTransformer(modelName)
start = time()
encodings = model.encode([sent])
print(time()-start,'s')
return encodings
abstractsDF2['abs_emb'] = pd.Series(list(getSentEmbeddings(abstractsDF2.abstract, 'bert-base-nli-mean-tokens')))
abstractsDF2['abs_firstSent_emb'] = pd.Series(list(getSentEmbeddings(abstractsDF2.abstract_firstSent, 'bert-base-nli-mean-tokens')))
0.6419482231140137 s 0.5260367393493652 s
for _, row in abstractsDF2.iterrows():
masterEmbedDictMaster['abstract'][row['node1']] = row['abs_emb']
masterEmbedDictMaster['abstract_first_sent'][row['node1']] = row['abs_firstSent_emb']
for node in masterEmbCorrections_abs_set:
if node not in masterEmbedDictMaster['abstract']:
if node in labelsDict and node in descDict:
masterEmbedDictMaster['abstract'][node] = getIndSentEmbeddings(labelsDict[node] + ' ' + descDict[node], 'bert-base-nli-mean-tokens')[0]
elif node in labelsDict:
masterEmbedDictMaster['abstract'][node] = getIndSentEmbeddings(labelsDict[node], 'bert-base-nli-mean-tokens')[0]
0.37706875801086426 s 0.3001420497894287 s 0.370746374130249 s 0.6896324157714844 s 0.33779358863830566 s 0.3965473175048828 s 0.3200962543487549 s 0.3489806652069092 s 0.3413431644439697 s 0.32114505767822266 s 0.3811838626861572 s 0.34630370140075684 s 0.37790727615356445 s 0.26860570907592773 s 0.3601953983306885 s 0.3713240623474121 s 0.34137582778930664 s 0.33736181259155273 s 0.37023448944091797 s 0.31382036209106445 s 0.35136938095092773 s 0.37309718132019043 s 0.33543896675109863 s 0.38199710845947266 s 0.3740067481994629 s 0.3278031349182129 s 0.32283997535705566 s 0.34000563621520996 s 0.31502628326416016 s 0.34996771812438965 s 0.3871273994445801 s 0.3487060070037842 s 0.35172486305236816 s 0.3280646800994873 s 0.3519773483276367 s 0.3354451656341553 s 0.3633551597595215 s 0.3226644992828369 s 0.33882975578308105 s 0.36072254180908203 s 0.3833494186401367 s 0.2929043769836426 s 0.32875680923461914 s 0.36334872245788574 s 0.34148168563842773 s 0.3569769859313965 s 0.37468576431274414 s 0.399524450302124 s 0.3516504764556885 s 0.333402156829834 s 0.3851203918457031 s 0.34867238998413086 s 0.3607771396636963 s 0.38669753074645996 s 0.33347272872924805 s 0.36278390884399414 s 0.3602781295776367 s 0.3322322368621826 s 0.36807823181152344 s 0.3407411575317383 s 0.3837134838104248 s 0.38958096504211426 s 0.3332521915435791 s 0.3331124782562256 s 0.35001134872436523 s 0.32433485984802246 s 0.36315059661865234 s 0.34323906898498535 s 0.3112339973449707 s 0.30588483810424805 s 0.30704236030578613 s 0.31201720237731934 s
for node in masterEmbCorrections_abs_set:
if node not in masterEmbedDictMaster['abstract_first_sent']:
if node in labelsDict and node in descDict:
masterEmbedDictMaster['abstract_first_sent'][node] = getIndSentEmbeddings(labelsDict[node] + ' ' + descDict[node], 'bert-base-nli-mean-tokens')[0]
elif node in labelsDict:
masterEmbedDictMaster['abstract_first_sent'][node] = getIndSentEmbeddings(labelsDict[node], 'bert-base-nli-mean-tokens')[0]
0.32213783264160156 s 0.357776403427124 s 0.37949395179748535 s 0.35210466384887695 s 0.28103041648864746 s 0.3626406192779541 s 0.35109710693359375 s 0.34203338623046875 s 0.32386112213134766 s 0.3354361057281494 s 0.3063056468963623 s 0.3441202640533447 s 0.32869935035705566 s 0.42442989349365234 s 0.37239527702331543 s 0.38650059700012207 s 0.3191685676574707 s 0.3609733581542969 s 0.3115823268890381 s 0.36015963554382324 s 0.3338603973388672 s 0.3487727642059326 s 0.3250617980957031 s 0.35145044326782227 s 0.33944034576416016 s 0.31502413749694824 s 0.3611795902252197 s 0.35285043716430664 s 0.3575010299682617 s 0.304781436920166 s 0.4003562927246094 s 0.3315858840942383 s 0.36008763313293457 s 0.36187100410461426 s 0.32981252670288086 s 0.3378865718841553 s 0.31662964820861816 s 0.32143092155456543 s 0.3152732849121094 s 0.38222813606262207 s 0.3846759796142578 s 0.33153700828552246 s 0.37013936042785645 s 0.33272790908813477 s 0.29526567459106445 s 0.3218040466308594 s 0.3795340061187744 s 0.3576061725616455 s 0.35764193534851074 s 0.36867713928222656 s 0.3807237148284912 s 0.33266758918762207 s 0.33878159523010254 s 0.38289546966552734 s 0.38695788383483887 s 0.33074188232421875 s 0.32749414443969727 s 0.33860039710998535 s 0.36585235595703125 s 0.33011841773986816 s 0.3293156623840332 s 0.3491702079772949 s 0.3720529079437256 s 0.3078622817993164 s 0.3844125270843506 s 0.32468104362487793 s 0.3186354637145996 s 0.3438723087310791 s 0.36643028259277344 s 0.34279680252075195 s 0.3625810146331787 s 0.35865354537963867 s 0.3503103256225586 s 0.37160682678222656 s 0.3268110752105713 s 0.2564544677734375 s 0.37343525886535645 s 0.33298277854919434 s
for key1 in masterEmbedKeys:
print(f"Pair Coverage by new {key1} embeddings created for 19k retrofitting: {sum([row[0] in masterEmbedDictMaster[key1] and row[1] in masterEmbedDictMaster[key1] for row in wordsim_pairs])}")
Pair Coverage by new text_7_props embeddings created for 19k retrofitting: 325 Pair Coverage by new text_2_props embeddings created for 19k retrofitting: 325 Pair Coverage by new complex embeddings created for 19k retrofitting: 343 Pair Coverage by new transe embeddings created for 19k retrofitting: 343 Pair Coverage by new abstract embeddings created for 19k retrofitting: 339 Pair Coverage by new abstract_first_sent embeddings created for 19k retrofitting: 339
for key1 in masterEmbedDictMaster.keys():
for key2 in masterEmbedDictMaster[key1].keys():
if type(masterEmbedDictMaster[key1][key2]) != list:
masterEmbedDictMaster[key1][key2] = masterEmbedDictMaster[key1][key2].tolist()
for key1 in ['complex', 'transe']:
json.dump(masterEmbedDictMaster[key1], open('../data/Master_P279_dataset/embeddings/'+key1+'_orig_embedding_dict_updated.json', 'w'))
def countOverlap(source, target):
cnt = 0
for key1 in source:
if key1 in target:
cnt += 1
return cnt
p279QnodesList = set(json.load(open('../data/Master_P279_dataset/P279ChildPar_transP279_filtered_nodes.json')))
summArr = []
for key1 in masterEmbedDictMaster:
cnt = countOverlap(masterEmbedDictMaster[key1], p279QnodesList)
summArr.append([key1, len(masterEmbedDictMaster[key1]), cnt, cnt / len(p279QnodesList) * 100])
pd.DataFrame(summArr, columns=['embedding', 'total count', 'overlap count', 'Coverage Percentage'])
embedding | total count | overlap count | Coverage Percentage | |
---|---|---|---|---|
0 | text_7_props | 238930 | 238889 | 100.000000 |
1 | text_2_props | 238930 | 238889 | 100.000000 |
2 | complex | 238500 | 238448 | 99.815395 |
3 | transe | 238500 | 238448 | 99.815395 |
4 | abstract | 105964 | 105916 | 44.336910 |
5 | abstract_first_sent | 105964 | 105916 | 44.336910 |
import json
embedDictMaster = {}
for key1 in ['text_7_props', 'text_2_props', 'complex', 'transe', 'abstract', 'abstract_first_sent']:
embedDictMaster[key1] = json.load(open('../data/Master_P279_dataset/embeddings/'+key1+'_orig_embedding_dict_updated.json'))
def determineEmbeddingLengths(embedDictMaster):
for key in embedDictMaster.keys():
embed_size = len(next(iter(embedDictMaster[key].values())))
print(key,": ",embed_size)
determineEmbeddingLengths(embedDictMaster)
text_7_props : 1024 text_2_props : 1024 complex : 200 transe : 200 abstract : 768 abstract_first_sent : 768
for key1 in embedDictMaster.keys():
embedDictMaster[key1] = deserializeEmbeddingDict(embedDictMaster[key1])
# Fill Coverage of embedding dictionaries
for key1 in embedDictMaster.keys():
embedDictMaster[key1] = fillCoverage(embedDictMaster[key1])
Added 11 corrections Added 11 corrections Added 0 corrections Added 0 corrections Added 4 corrections Added 4 corrections
for key1 in embedDictMaster.keys():
print(key1, len(next(iter(embedDictMaster.values()))))
text_7_props 238941 text_2_props 238941 complex 238941 transe 238941 abstract 238941 abstract_first_sent 238941
def fetchNeighbours(df):
neighboursDict = {}
for _, row in df.iterrows():
if row.node1 not in neighboursDict:
neighboursDict[row.node1] = []
neighboursDict[row.node1].append((row.node2, row.bert2SentSim))
if row.node2 not in neighboursDict:
neighboursDict[row.node2] = []
neighboursDict[row.node2].append((row.node1, row.bert2SentSim))
print(max([len(neigh) for neigh in neighboursDict.values()]))
return neighboursDict
def retrofit(embedDict, neighDict, weightCase, weightAssignment=False):
newEmbedDict = {}
for word in embedDict.keys():
if word in neighDict:
neighbs = neighDict[word]
neighbs = list(filter(lambda p: p[0] in embedDict, neighbs))
if len(neighbs) == 0:
newEmbedDict[word] = embedDict[word]
continue
# assert len(neighbs) == 1
if weightAssignment:
sumOfSims = sum([neighb[1] for neighb in neighbs])
sumOfEmbs = sum([embedDict[neighb[0]] * float(neighb[1]) for neighb in neighbs])
else:
sumOfSims = sum([1 for neighb in neighbs])
sumOfEmbs = sum([embedDict[neighb[0]] for neighb in neighbs])
if weightCase == 1:
newEmbedDict[word] = (embedDict[word] * (len(neighbs)) + sumOfEmbs) / ((len(neighbs)) + sumOfSims)
elif weightCase == 2:
newEmbedDict[word] = (embedDict[word] * (len(neighbs))**2 + sumOfEmbs) / ((len(neighbs))**2 + sumOfSims)
elif weightCase == 0.5:
newEmbedDict[word] = (embedDict[word] * (len(neighbs))**0.5 + sumOfEmbs) / ((len(neighbs))**0.5 + sumOfSims)
else:
raise
else:
newEmbedDict[word] = embedDict[word]
return newEmbedDict
from sklearn.metrics import classification_report
def labelSamples(score):
return 'I' if score <= 1.75 else 'U' if score >= 3.5 else 'M'
LABELS = ['I','U','M']
def fetchCorrelationResults(embedDict, newEmbedDict):
wordSim353AnnotDF_New = pd.read_csv('../data/wordsim353_with_r3.csv')
# print(f"Length of wordsim dataset: {len(wordSim353AnnotDF_New)}")
assert wordSim353AnnotDF_New.word1_kg_id.isna().sum() == 0
assert wordSim353AnnotDF_New.word2_kg_id.isna().sum() == 0
wordSim353AnnotDF_New['category'] = wordSim353AnnotDF_New.Avg.apply(labelSamples)
# wordSim353AnnotDF_New2 = wordSim353AnnotDF_New
wordSim353AnnotDF_New2 = wordSim353AnnotDF_New[wordSim353AnnotDF_New.apply(lambda p: p['word1_kg_id'] in embedDict and p['word2_kg_id'] in embedDict, axis=1)]
wordSimMissingSet = set(wordSim353AnnotDF_New[wordSim353AnnotDF_New.word1_kg_id.apply(lambda p: p not in embedDict)].word1_kg_id.to_list() + wordSim353AnnotDF_New[wordSim353AnnotDF_New.word2_kg_id.apply(lambda p: p not in embedDict)].word2_kg_id.to_list())
responseDict = {}
responseDict['wordSimMissingSet'] = wordSimMissingSet
responseDict['coveredPairs'] = len(wordSim353AnnotDF_New2)
responseDict['totalPairs'] = len(wordSim353AnnotDF_New)
# wordSimMissingSet
# print(f"No. of pairs with some value for embeddings: {len(wordSim353AnnotDF_New2)}")
wordSim353AnnotDF_New['textOld'] = wordSim353AnnotDF_New.apply(lambda p: cosine_similarity(np.array(embedDict[p['word1_kg_id']]).reshape(1,-1), np.array(embedDict[p['word2_kg_id']]).reshape(1,-1))[0][0] if p['word1_kg_id'] in embedDict and p['word2_kg_id'] in embedDict else -1, axis=1)
wordSim353AnnotDF_New['textNew'] = wordSim353AnnotDF_New.apply(lambda p: cosine_similarity(np.array(newEmbedDict[p['word1_kg_id']]).reshape(1,-1), np.array(newEmbedDict[p['word2_kg_id']]).reshape(1,-1))[0][0] if p['word1_kg_id'] in embedDict and p['word2_kg_id'] in embedDict else -1, axis=1)
wordSim353AnnotDF_New.loc[wordSim353AnnotDF_New['textOld'] == -1, 'textOld'] = wordSim353AnnotDF_New[wordSim353AnnotDF_New['textOld'] != -1]['textOld'].mean()
wordSim353AnnotDF_New.loc[wordSim353AnnotDF_New['textNew'] == -1, 'textNew'] = wordSim353AnnotDF_New[wordSim353AnnotDF_New['textNew'] != -1]['textNew'].mean()
# Logic 1: Scale min,max value to 1,4 strictly
# min1, max1 = wordSim353AnnotDF_New['textOld'].min(), wordSim353AnnotDF_New['textOld'].max()
# min2, max2 = wordSim353AnnotDF_New['textNew'].min(), wordSim353AnnotDF_New['textNew'].max()
# wordSim353AnnotDF_New['textOld'] = wordSim353AnnotDF_New.textOld.apply(lambda p: 4 - 3 * (p - min1) / (max1 - min1))
# wordSim353AnnotDF_New['textNew'] = wordSim353AnnotDF_New.textNew.apply(lambda p: 4 - 3 * (p - min2) / (max2 - min2))
# Logic 2: Scale abs value to 1,4 strictly
wordSim353AnnotDF_New['textOld'] = wordSim353AnnotDF_New.textOld.apply(lambda p: 4 - 3 * abs(p))
wordSim353AnnotDF_New['textNew'] = wordSim353AnnotDF_New.textNew.apply(lambda p: 4 - 3 * abs(p))
# print(f"KT Corr of old emb with Annotated Avg: {stats.kendalltau(wordSim353AnnotDF_New2['textOld'], wordSim353AnnotDF_New2['Avg'])}")
# print(f"KT Corr of new emb with Annotated Avg: {stats.kendalltau(wordSim353AnnotDF_New2['textNew'], wordSim353AnnotDF_New2['Avg'])}")
# print(f"KT Corr of old emb with Human Avg Reversed: {stats.kendalltau(wordSim353AnnotDF_New2['textOld'], wordSim353AnnotDF_New2['H_reversed'])}")
# print(f"KT Corr of new emb with Human Avg Reversed: {stats.kendalltau(wordSim353AnnotDF_New2['textNew'], wordSim353AnnotDF_New2['H_reversed'])}")
# print(f"Classification Accuracy of old embeddings categories vs annotated averages categories: {accuracy_score(wordSim353AnnotDF_New2['textOld'].apply(labelSamples), wordSim353AnnotDF_New2['category'])}")
# print(f"Classification Accuracy of new embeddings categories vs annotated averages categories: {accuracy_score(wordSim353AnnotDF_New2['textNew'].apply(labelSamples), wordSim353AnnotDF_New2['category'])}")
responseDict['KT_old_vs_Avg'] = stats.kendalltau(wordSim353AnnotDF_New['textOld'], wordSim353AnnotDF_New['Avg'])
responseDict['KT_new_vs_Avg'] = stats.kendalltau(wordSim353AnnotDF_New['textNew'], wordSim353AnnotDF_New['Avg'])
responseDict['KT_old_vs_Human'] = stats.kendalltau(wordSim353AnnotDF_New['textOld'], wordSim353AnnotDF_New['H_reversed'])
responseDict['KT_new_vs_Human'] = stats.kendalltau(wordSim353AnnotDF_New['textNew'], wordSim353AnnotDF_New['H_reversed'])
responseDict['old_acc'] = accuracy_score(wordSim353AnnotDF_New['textOld'].apply(labelSamples), wordSim353AnnotDF_New['category'])
responseDict['new_acc'] = accuracy_score(wordSim353AnnotDF_New['textNew'].apply(labelSamples), wordSim353AnnotDF_New['category'])
responseDict['class_rep_old'] = classification_report(wordSim353AnnotDF_New['category'], wordSim353AnnotDF_New['textOld'].apply(labelSamples), output_dict=True)
responseDict['class_rep_new'] = classification_report(wordSim353AnnotDF_New['category'], wordSim353AnnotDF_New['textNew'].apply(labelSamples), output_dict=True)
cm_old = confusion_matrix(wordSim353AnnotDF_New['category'], wordSim353AnnotDF_New['textOld'].apply(labelSamples), labels=LABELS)
cm_new = confusion_matrix(wordSim353AnnotDF_New['category'], wordSim353AnnotDF_New['textNew'].apply(labelSamples), labels=LABELS)
responseDict['cm_old'] = cm_old
responseDict['cm_new'] = cm_new
return responseDict
neighDictMaster, embedDictMaster = {}, {}
neighDictMaster['19k_childPar'] = fetchNeighbours(p279ChildPar)
39218
embedDictMaster['complex'] = complexEmb
embedDictMaster['transe'] = transeEmb
for key1 in embedDictMaster.keys():
for key2 in embedDictMaster[key1].keys():
embedDictMaster[key1][key2] = np.array(embedDictMaster[key1][key2])
embList = list(embedDictMaster.keys())
basisList = list(neighDictMaster.keys())
neighDictMaster.keys()
dict_keys(['19k_childPar'])
newEmbedDictMaster, responsesDictMaster = {}, {}
import numpy as np
results = []
NUM_ITERS = 10
for basis in tqdm(basisList):
for emb in embList:
for weightedNess in [True]:
groupResults = []
for weightCase in [1,2]:
embedDict = embedDictMaster[emb]
if weightedNess:
caseName = emb + '_' + basis + '_' + str(weightCase) + '_weighted'
else:
caseName = emb + '_' + basis + '_' + str(weightCase) + '_unweighted'
for iterNum in range(1,NUM_ITERS+1):
newEmbedDict = retrofit(embedDict, neighDictMaster[basis], weightCase, weightedNess)
# dists = determineDistances(embedDict, newEmbedDict)
responsesDict = fetchCorrelationResults(embedDict, newEmbedDict)
# print(responsesDict.keys())
groupResults.append([emb, basis, weightCase, weightedNess, iterNum, \
responsesDict['old_acc']*100, \
responsesDict['new_acc']*100, \
(responsesDict['new_acc'] - responsesDict['old_acc'])*100, \
responsesDict['coveredPairs'], \
responsesDict['class_rep_old']['I']['precision'], \
responsesDict['class_rep_old']['I']['recall'], \
responsesDict['class_rep_old']['I']['f1-score'], \
responsesDict['class_rep_old']['U']['precision'], \
responsesDict['class_rep_old']['U']['recall'], \
responsesDict['class_rep_old']['U']['f1-score'], \
responsesDict['class_rep_new']['I']['precision'], \
responsesDict['class_rep_new']['I']['recall'], \
responsesDict['class_rep_new']['I']['f1-score'], \
responsesDict['class_rep_new']['U']['precision'], \
responsesDict['class_rep_new']['U']['recall'], \
responsesDict['class_rep_new']['U']['f1-score'], \
])
embedDict = newEmbedDict
newEmbedDictMaster[caseName] = newEmbedDict
responsesDictMaster[caseName] = responsesDict
for gR, rank in zip(groupResults, np.argsort([-p[6] for p in groupResults])):
results.append(gR+[rank])
0%| | 0/1 [00:00<?, ?it/s]
resultsDF = pd.DataFrame(results, columns=['Embedding', 'Basis', 'Weight', 'Weightedness', 'Iteration Num', 'Old Acc', 'New Acc', 'Increase', 'Pairs Covered', \
'Old I Precision', 'Old I Recall', 'Old I F1-Score', \
'Old U Precision', 'Old U Recall', 'Old U F1-Score', \
'New I Precision', 'New I Recall', 'New I F1-Score', \
'New U Precision', 'New U Recall', 'New U F1-Score', \
'Rank'])
resultsDF.sort_values(by=['Increase'], ascending=False)
Embedding | Basis | Weight | Weightedness | Iteration Num | Old Acc | New Acc | Increase | Pairs Covered | Old I Precision | ... | Old U Precision | Old U Recall | Old U F1-Score | New I Precision | New I Recall | New I F1-Score | New U Precision | New U Recall | New U F1-Score | Rank | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | complex | 19k_childPar | 1 | True | 1 | 60.755814 | 64.244186 | 3.488372 | 291 | 1.000000 | ... | 0.433121 | 0.660194 | 0.523077 | 1.000000 | 0.40 | 0.571429 | 0.463415 | 0.553398 | 0.504425 | 2 |
20 | transe | 19k_childPar | 1 | True | 1 | 62.500000 | 65.697674 | 3.197674 | 291 | 0.888889 | ... | 0.397059 | 0.262136 | 0.315789 | 0.833333 | 0.50 | 0.625000 | 0.450000 | 0.174757 | 0.251748 | 0 |
1 | complex | 19k_childPar | 1 | True | 2 | 64.244186 | 67.151163 | 2.906977 | 291 | 1.000000 | ... | 0.463415 | 0.553398 | 0.504425 | 1.000000 | 0.50 | 0.666667 | 0.495050 | 0.485437 | 0.490196 | 4 |
11 | complex | 19k_childPar | 2 | True | 2 | 61.918605 | 63.662791 | 1.744186 | 291 | 1.000000 | ... | 0.444444 | 0.660194 | 0.531250 | 1.000000 | 0.45 | 0.620690 | 0.458904 | 0.650485 | 0.538153 | 13 |
10 | complex | 19k_childPar | 2 | True | 1 | 60.755814 | 61.918605 | 1.162791 | 291 | 1.000000 | ... | 0.433121 | 0.660194 | 0.523077 | 1.000000 | 0.40 | 0.571429 | 0.444444 | 0.660194 | 0.531250 | 12 |
2 | complex | 19k_childPar | 1 | True | 3 | 67.151163 | 67.732558 | 0.581395 | 291 | 1.000000 | ... | 0.495050 | 0.485437 | 0.490196 | 0.909091 | 0.50 | 0.645161 | 0.511905 | 0.417476 | 0.459893 | 1 |
4 | complex | 19k_childPar | 1 | True | 5 | 67.151163 | 67.732558 | 0.581395 | 291 | 0.916667 | ... | 0.492958 | 0.339806 | 0.402299 | 0.916667 | 0.55 | 0.687500 | 0.507937 | 0.310680 | 0.385542 | 5 |
36 | transe | 19k_childPar | 2 | True | 7 | 62.500000 | 63.081395 | 0.581395 | 291 | 0.846154 | ... | 0.351852 | 0.184466 | 0.242038 | 0.846154 | 0.55 | 0.666667 | 0.365385 | 0.184466 | 0.245161 | 6 |
30 | transe | 19k_childPar | 2 | True | 1 | 62.500000 | 63.081395 | 0.581395 | 291 | 0.888889 | ... | 0.397059 | 0.262136 | 0.315789 | 0.900000 | 0.45 | 0.600000 | 0.400000 | 0.252427 | 0.309524 | 17 |
22 | transe | 19k_childPar | 1 | True | 3 | 64.825581 | 65.406977 | 0.581395 | 291 | 0.750000 | ... | 0.393939 | 0.126214 | 0.191176 | 0.750000 | 0.75 | 0.750000 | 0.400000 | 0.116505 | 0.180451 | 3 |
38 | transe | 19k_childPar | 2 | True | 9 | 63.081395 | 63.372093 | 0.290698 | 291 | 0.785714 | ... | 0.372549 | 0.184466 | 0.246753 | 0.785714 | 0.55 | 0.647059 | 0.380000 | 0.184466 | 0.248366 | 8 |
33 | transe | 19k_childPar | 2 | True | 4 | 62.790698 | 63.081395 | 0.290698 | 291 | 0.900000 | ... | 0.383333 | 0.223301 | 0.282209 | 0.846154 | 0.55 | 0.666667 | 0.375000 | 0.203883 | 0.264151 | 14 |
18 | complex | 19k_childPar | 2 | True | 9 | 63.081395 | 63.081395 | 0.000000 | 291 | 0.909091 | ... | 0.444444 | 0.543689 | 0.489083 | 0.909091 | 0.50 | 0.645161 | 0.444444 | 0.543689 | 0.489083 | 19 |
37 | transe | 19k_childPar | 2 | True | 8 | 63.081395 | 63.081395 | 0.000000 | 291 | 0.846154 | ... | 0.365385 | 0.184466 | 0.245161 | 0.785714 | 0.55 | 0.647059 | 0.372549 | 0.184466 | 0.246753 | 7 |
32 | transe | 19k_childPar | 2 | True | 3 | 62.790698 | 62.790698 | 0.000000 | 291 | 0.900000 | ... | 0.387097 | 0.233010 | 0.290909 | 0.900000 | 0.45 | 0.600000 | 0.383333 | 0.223301 | 0.282209 | 12 |
23 | transe | 19k_childPar | 1 | True | 4 | 65.406977 | 65.406977 | 0.000000 | 291 | 0.750000 | ... | 0.400000 | 0.116505 | 0.180451 | 0.652174 | 0.75 | 0.697674 | 0.444444 | 0.116505 | 0.184615 | 1 |
39 | transe | 19k_childPar | 2 | True | 10 | 63.372093 | 63.372093 | 0.000000 | 291 | 0.785714 | ... | 0.380000 | 0.184466 | 0.248366 | 0.785714 | 0.55 | 0.647059 | 0.380000 | 0.184466 | 0.248366 | 9 |
15 | complex | 19k_childPar | 2 | True | 6 | 63.662791 | 63.662791 | 0.000000 | 291 | 1.000000 | ... | 0.455224 | 0.592233 | 0.514768 | 1.000000 | 0.45 | 0.620690 | 0.453846 | 0.572816 | 0.506438 | 17 |
14 | complex | 19k_childPar | 2 | True | 5 | 63.662791 | 63.662791 | 0.000000 | 291 | 1.000000 | ... | 0.457143 | 0.621359 | 0.526749 | 1.000000 | 0.45 | 0.620690 | 0.455224 | 0.592233 | 0.514768 | 16 |
13 | complex | 19k_childPar | 2 | True | 4 | 63.662791 | 63.662791 | 0.000000 | 291 | 1.000000 | ... | 0.457746 | 0.631068 | 0.530612 | 1.000000 | 0.45 | 0.620690 | 0.457143 | 0.621359 | 0.526749 | 15 |
12 | complex | 19k_childPar | 2 | True | 3 | 63.662791 | 63.662791 | 0.000000 | 291 | 1.000000 | ... | 0.458904 | 0.650485 | 0.538153 | 1.000000 | 0.45 | 0.620690 | 0.457746 | 0.631068 | 0.530612 | 14 |
17 | complex | 19k_childPar | 2 | True | 8 | 63.372093 | 63.081395 | -0.290698 | 291 | 0.909091 | ... | 0.448819 | 0.553398 | 0.495652 | 0.909091 | 0.50 | 0.645161 | 0.444444 | 0.543689 | 0.489083 | 18 |
16 | complex | 19k_childPar | 2 | True | 7 | 63.662791 | 63.372093 | -0.290698 | 291 | 1.000000 | ... | 0.453846 | 0.572816 | 0.506438 | 0.909091 | 0.50 | 0.645161 | 0.448819 | 0.553398 | 0.495652 | 9 |
35 | transe | 19k_childPar | 2 | True | 6 | 62.790698 | 62.500000 | -0.290698 | 291 | 0.846154 | ... | 0.363636 | 0.194175 | 0.253165 | 0.846154 | 0.55 | 0.666667 | 0.351852 | 0.184466 | 0.242038 | 5 |
34 | transe | 19k_childPar | 2 | True | 5 | 63.081395 | 62.790698 | -0.290698 | 291 | 0.846154 | ... | 0.375000 | 0.203883 | 0.264151 | 0.846154 | 0.55 | 0.666667 | 0.363636 | 0.194175 | 0.253165 | 15 |
31 | transe | 19k_childPar | 2 | True | 2 | 63.081395 | 62.790698 | -0.290698 | 291 | 0.900000 | ... | 0.400000 | 0.252427 | 0.309524 | 0.900000 | 0.45 | 0.600000 | 0.387097 | 0.233010 | 0.290909 | 11 |
28 | transe | 19k_childPar | 1 | True | 9 | 60.755814 | 60.465116 | -0.290698 | 291 | 0.394737 | ... | 0.368421 | 0.067961 | 0.114754 | 0.365854 | 0.75 | 0.491803 | 0.388889 | 0.067961 | 0.115702 | 13 |
19 | complex | 19k_childPar | 2 | True | 10 | 63.081395 | 62.790698 | -0.290698 | 291 | 0.909091 | ... | 0.444444 | 0.543689 | 0.489083 | 0.900000 | 0.45 | 0.600000 | 0.444444 | 0.543689 | 0.489083 | 10 |
26 | transe | 19k_childPar | 1 | True | 7 | 62.209302 | 61.627907 | -0.581395 | 291 | 0.428571 | ... | 0.428571 | 0.087379 | 0.145161 | 0.416667 | 0.75 | 0.535714 | 0.400000 | 0.077670 | 0.130081 | 4 |
29 | transe | 19k_childPar | 1 | True | 10 | 60.465116 | 59.883721 | -0.581395 | 291 | 0.365854 | ... | 0.388889 | 0.067961 | 0.115702 | 0.333333 | 0.75 | 0.461538 | 0.411765 | 0.067961 | 0.116667 | 16 |
8 | complex | 19k_childPar | 1 | True | 9 | 65.116279 | 64.534884 | -0.581395 | 291 | 0.785714 | ... | 0.442308 | 0.223301 | 0.296774 | 0.785714 | 0.55 | 0.647059 | 0.416667 | 0.194175 | 0.264901 | 0 |
3 | complex | 19k_childPar | 1 | True | 4 | 67.732558 | 67.151163 | -0.581395 | 291 | 0.909091 | ... | 0.511905 | 0.417476 | 0.459893 | 0.916667 | 0.55 | 0.687500 | 0.492958 | 0.339806 | 0.402299 | 3 |
6 | complex | 19k_childPar | 1 | True | 7 | 66.569767 | 65.988372 | -0.581395 | 291 | 0.916667 | ... | 0.475410 | 0.281553 | 0.353659 | 0.846154 | 0.55 | 0.666667 | 0.464286 | 0.252427 | 0.327044 | 7 |
7 | complex | 19k_childPar | 1 | True | 8 | 65.988372 | 65.116279 | -0.872093 | 291 | 0.846154 | ... | 0.464286 | 0.252427 | 0.327044 | 0.785714 | 0.55 | 0.647059 | 0.442308 | 0.223301 | 0.296774 | 8 |
21 | transe | 19k_childPar | 1 | True | 2 | 65.697674 | 64.825581 | -0.872093 | 291 | 0.833333 | ... | 0.450000 | 0.174757 | 0.251748 | 0.750000 | 0.60 | 0.666667 | 0.393939 | 0.126214 | 0.191176 | 2 |
27 | transe | 19k_childPar | 1 | True | 8 | 61.627907 | 60.755814 | -0.872093 | 291 | 0.416667 | ... | 0.400000 | 0.077670 | 0.130081 | 0.394737 | 0.75 | 0.517241 | 0.368421 | 0.067961 | 0.114754 | 10 |
25 | transe | 19k_childPar | 1 | True | 6 | 63.372093 | 62.209302 | -1.162791 | 291 | 0.468750 | ... | 0.454545 | 0.097087 | 0.160000 | 0.428571 | 0.75 | 0.545455 | 0.428571 | 0.087379 | 0.145161 | 19 |
5 | complex | 19k_childPar | 1 | True | 6 | 67.732558 | 66.569767 | -1.162791 | 291 | 0.916667 | ... | 0.507937 | 0.310680 | 0.385542 | 0.916667 | 0.55 | 0.687500 | 0.475410 | 0.281553 | 0.353659 | 6 |
9 | complex | 19k_childPar | 1 | True | 10 | 64.534884 | 63.081395 | -1.453488 | 291 | 0.785714 | ... | 0.416667 | 0.194175 | 0.264901 | 0.687500 | 0.55 | 0.611111 | 0.377778 | 0.165049 | 0.229730 | 11 |
24 | transe | 19k_childPar | 1 | True | 5 | 65.406977 | 63.372093 | -2.034884 | 291 | 0.652174 | ... | 0.444444 | 0.116505 | 0.184615 | 0.468750 | 0.75 | 0.576923 | 0.454545 | 0.097087 | 0.160000 | 18 |
40 rows × 22 columns
resultsDF.to_csv('../data/retrofitting/masterRetro_Aug20_2021.csv', index=False)