This note book will create the SQLite DB Cache and the required indices for KGTK Browser.
The required input parameters are:
The files metadata.pagerank.undirected.tsv.gz
and derived.isastar.tsv.gz
are created by running this notebook
output_path
where the required files and cache will be createdlocalhost
9200
wikidata-dwd-kgtk-search-01
yes|no
, default yes
yes|no
, defaut no
yes|no
, default yes
Cache file location: <output_path>/<project_name>/temp.<project_name>/wikidata.sqlite3.db
import os
import pandas as pd
from kgtk.configure_kgtk_notebooks import ConfigureKGTK
import kgtk.kypher.api as kapi
from kgtk.functions import kgtk, kypher
input_path = "/data/amandeep/wikidata-20211027-dwd-v3"
output_path = "/data/amandeep/wikidata-20211027-dwd-v3"
project_name = "kgtk-browser-files"
files = 'label,pagerank_undirected,alias,description,claims,datatypes,qualifiers'
es_host = "http://localhost"
es_port = 9200
es_index = "wikidata-dwd-kgtk-search-01"
create_db = 'yes'
create_es = 'no'
create_class_viz = 'yes'
files = files.split(',')
create_sqlite_cache = create_db.lower() == 'yes'
create_es_index = create_es.lower() == 'yes'
create_class_viz_tables = create_class_viz.lower() == 'yes'
additional_files = {
'classvizedge': 'class-visualization.edge.tsv.gz',
'classviznode': 'class-visualization.node.tsv.gz'
}
ck = ConfigureKGTK(files)
if create_class_viz_tables:
ck.configure_kgtk(input_graph_path=input_path,
output_path=output_path,
project_name=project_name,
additional_files=additional_files)
else:
ck.configure_kgtk(input_graph_path=input_path,
output_path=output_path,
project_name=project_name)
User home: /nas/home/amandeep Current dir: /data/amandeep/github/kgtk-browser KGTK dir: /data/amandeep/github Use-cases dir: /data/amandeep/github/use-cases
ck.print_env_variables()
USE_CASES_DIR: /data/amandeep/github/use-cases KGTK_OPTION_DEBUG: false kypher: kgtk query --graph-cache /data/amandeep/wikidata-20211027-dwd-v3/kgtk-browser-files/temp.kgtk-browser-files/wikidata.sqlite3.db TEMP: /data/amandeep/wikidata-20211027-dwd-v3/kgtk-browser-files/temp.kgtk-browser-files GRAPH: /data/amandeep/wikidata-20211027-dwd-v3 kgtk: kgtk EXAMPLES_DIR: /data/amandeep/github/examples STORE: /data/amandeep/wikidata-20211027-dwd-v3/kgtk-browser-files/temp.kgtk-browser-files/wikidata.sqlite3.db KGTK_GRAPH_CACHE: /data/amandeep/wikidata-20211027-dwd-v3/kgtk-browser-files/temp.kgtk-browser-files/wikidata.sqlite3.db OUT: /data/amandeep/wikidata-20211027-dwd-v3/kgtk-browser-files KGTK_LABEL_FILE: /data/amandeep/wikidata-20211027-dwd-v3/labels.en.tsv.gz label: /data/amandeep/wikidata-20211027-dwd-v3/labels.en.tsv.gz pagerank_undirected: /data/amandeep/wikidata-20211027-dwd-v3/metadata.pagerank.undirected.tsv.gz alias: /data/amandeep/wikidata-20211027-dwd-v3/aliases.en.tsv.gz description: /data/amandeep/wikidata-20211027-dwd-v3/descriptions.en.tsv.gz claims: /data/amandeep/wikidata-20211027-dwd-v3/claims.tsv.gz datatypes: /data/amandeep/wikidata-20211027-dwd-v3/metadata.property.datatypes.tsv.gz qualifiers: /data/amandeep/wikidata-20211027-dwd-v3/qualifiers.tsv.gz classvizedge: /data/amandeep/wikidata-20211027-dwd-v3/class-visualization.edge.tsv.gz classviznode: /data/amandeep/wikidata-20211027-dwd-v3/class-visualization.node.tsv.gz
if create_sqlite_cache:
ck.load_files_into_cache()
kgtk query --graph-cache /data/amandeep/wikidata-20211027-dwd-v3/kgtk-browser-files/temp.kgtk-browser-files/wikidata.sqlite3.db -i "/data/amandeep/wikidata-20211027-dwd-v3/labels.en.tsv.gz" --as label -i "/data/amandeep/wikidata-20211027-dwd-v3/metadata.pagerank.undirected.tsv.gz" --as pagerank_undirected -i "/data/amandeep/wikidata-20211027-dwd-v3/aliases.en.tsv.gz" --as alias -i "/data/amandeep/wikidata-20211027-dwd-v3/descriptions.en.tsv.gz" --as description -i "/data/amandeep/wikidata-20211027-dwd-v3/claims.tsv.gz" --as claims -i "/data/amandeep/wikidata-20211027-dwd-v3/metadata.property.datatypes.tsv.gz" --as datatypes -i "/data/amandeep/wikidata-20211027-dwd-v3/qualifiers.tsv.gz" --as qualifiers -i "/data/amandeep/wikidata-20211027-dwd-v3/class-visualization.edge.tsv.gz" --as classvizedge -i "/data/amandeep/wikidata-20211027-dwd-v3/class-visualization.node.tsv.gz" --as classvizenode --limit 3 id node1 label node2 lang rank node2;wikidatatype P10-label-en P10 label 'video'@en en P1000-label-en P1000 label 'record held'@en en P10000-label-en P10000 label 'Research Vocabularies Australia ID'@en en
_kapi2 = kapi.KypherApi(graphcache=os.environ['STORE'], loglevel=1, index='auto',
maxresults=100, maxcache=0)
label
, undirected_pagerank
and description
¶if create_sqlite_cache:
!kgtk query --gc $STORE \
-i label pagerank_undirected description\
--match 'label: (qnode)-[l]->(y), pagerank: (qnode)-[:Pundirected_pagerank]->(pr)' \
--opt 'description: (qnode)-[:description]->(d)' \
--return 'qnode as node1, l.label as label, y as node2, upper(y) as `node2;upper`, pr as `node1;pagerank`, ifnull(d, "") as `node1;description`' \
--order-by 'qnode' \
-o $OUT/label_pagerank_undirected_description.tsv.gz
if create_sqlite_cache:
!kgtk query --gc $STORE -i $OUT/label_pagerank_undirected_description.tsv.gz --as l_d_pgr_ud --limit 10
node1 label node2 node2;upper node1;pagerank node1;description P10 label 'video'@en 'VIDEO'@EN 1.9177407615866818e-08 'relevant video. For images, use the property P18. For film trailers, qualify with \"object has role\" (P3831)=\"trailer\" (Q622550)'@en P1000 label 'record held'@en 'RECORD HELD'@EN 1.4103331270065626e-08 'notable record achieved by a person or entity, include qualifiers for dates held'@en P10000 label 'Research Vocabularies Australia ID'@en 'RESEARCH VOCABULARIES AUSTRALIA ID'@EN 2.586054821150431e-08 'Identifier of a vocabulary in Research Vocabularies Australia'@en P10001 label 'Austrian Football Association player ID'@en 'AUSTRIAN FOOTBALL ASSOCIATION PLAYER ID'@EN 3.152198892346769e-08 'identifier for an association football player on the Austrian Football Association website'@en P10002 label 'Dewan Negara ID'@en 'DEWAN NEGARA ID'@EN 2.3820406824266033e-08 'numerical identifier for members or former members of the Dewan Negara of Malaysia on the official website'@en P10003 label 'Arachne.org.au ID'@en 'ARACHNE.ORG.AU ID'@EN 2.355944886245409e-08 'numerical identifier for an arachnid taxon in the Arachne.org.au database'@en P10004 label 'Dewan Rakyat ID'@en 'DEWAN RAKYAT ID'@EN 2.3399165925456584e-08 'numerical identifier for members or former members of the Dewan Rakyat of Malaysia on the official website'@en P10005 label 'Remontees-mecaniques.net ID'@en 'REMONTEES-MECANIQUES.NET ID'@EN 2.1853431037247363e-08 'numerical identifier for a cableway on the French website Remontees-mecaniques.net'@en P10006 label 'AllSides ID'@en 'ALLSIDES ID'@EN 1.8609454548988348e-08 'alphabetical identifier of a publication, organization, or person at AllSides'@en P10007 label 'Birdata ID'@en 'BIRDATA ID'@EN 2.1809764677402115e-08 'identifier for an bird species in the Birdata database'@en
%%time
if create_sqlite_cache:
!kgtk --debug query -i l_d_pgr_ud --idx node1 "node2;upper" label text:node2//name=ldpgridx --gc $STORE --limit 5
[2022-01-13 16:57:25 query]: SQL Translation: --------------------------------------------- SELECT * FROM graph_10 AS graph_10_c1 LIMIT ? PARAS: [5] --------------------------------------------- [2022-01-13 16:57:25 sqlstore]: CREATE INDEX "graph_10_node1_idx" ON "graph_10" ("node1") [2022-01-13 16:57:54 sqlstore]: ANALYZE "graph_10_node1_idx" [2022-01-13 16:57:58 sqlstore]: CREATE INDEX "graph_10_node2;upper_idx" ON "graph_10" ("node2;upper") [2022-01-13 16:59:15 sqlstore]: ANALYZE "graph_10_node2;upper_idx" [2022-01-13 16:59:21 sqlstore]: CREATE INDEX "graph_10_label_idx" ON "graph_10" ("label") [2022-01-13 16:59:45 sqlstore]: ANALYZE "graph_10_label_idx" [2022-01-13 16:59:49 sqlstore]: CREATE VIRTUAL TABLE "graph_10_txtidx_ldpgridx" USING FTS5 ("node2", tokenize="trigram", content="graph_10") [2022-01-13 16:59:49 sqlstore]: INSERT INTO "graph_10_txtidx_ldpgridx" ("node2") SELECT "node2" FROM graph_10 node1 label node2 node2;upper node1;pagerank node1;description P10 label 'video'@en 'VIDEO'@EN 1.9177407615866818e-08 'relevant video. For images, use the property P18. For film trailers, qualify with \"object has role\" (P3831)=\"trailer\" (Q622550)'@en P1000 label 'record held'@en 'RECORD HELD'@EN 1.4103331270065626e-08 'notable record achieved by a person or entity, include qualifiers for dates held'@en P10000 label 'Research Vocabularies Australia ID'@en 'RESEARCH VOCABULARIES AUSTRALIA ID'@EN 2.586054821150431e-08 'Identifier of a vocabulary in Research Vocabularies Australia'@en P10001 label 'Austrian Football Association player ID'@en 'AUSTRIAN FOOTBALL ASSOCIATION PLAYER ID'@EN 3.152198892346769e-08 'identifier for an association football player on the Austrian Football Association website'@en P10002 label 'Dewan Negara ID'@en 'DEWAN NEGARA ID'@EN 2.3820406824266033e-08 'numerical identifier for members or former members of the Dewan Negara of Malaysia on the official website'@en CPU times: user 12.3 s, sys: 2.48 s, total: 14.8 s Wall time: 11min 51s
%%time
if create_sqlite_cache:
!kgtk --debug query -i label --idx label --gc $STORE --limit 5
[2022-01-13 17:09:18 query]: SQL Translation: --------------------------------------------- SELECT * FROM graph_1 AS graph_1_c1 LIMIT ? PARAS: [5] --------------------------------------------- [2022-01-13 17:09:18 sqlstore]: CREATE INDEX "graph_1_label_idx" ON "graph_1" ("label") [2022-01-13 17:09:41 sqlstore]: ANALYZE "graph_1_label_idx" id node1 label node2 lang rank node2;wikidatatype P10-label-en P10 label 'video'@en en P1000-label-en P1000 label 'record held'@en en P10000-label-en P10000 label 'Research Vocabularies Australia ID'@en en P10001-label-en P10001 label 'Austrian Football Association player ID'@en en P10002-label-en P10002 label 'Dewan Negara ID'@en en CPU times: user 665 ms, sys: 138 ms, total: 803 ms Wall time: 37 s
%%time
if create_sqlite_cache:
!kgtk --debug query -i alias --idx label --gc $STORE --limit 5
[2022-01-13 17:09:51 query]: SQL Translation: --------------------------------------------- SELECT * FROM graph_3 AS graph_3_c1 LIMIT ? PARAS: [5] --------------------------------------------- [2022-01-13 17:09:51 sqlstore]: CREATE INDEX "graph_3_label_idx" ON "graph_3" ("label") [2022-01-13 17:10:00 sqlstore]: ANALYZE "graph_3_label_idx" id node1 label node2 lang rank node2;wikidatatype P10-alias-en-282226-0 P10 alias 'gif'@en en P10-alias-en-2f86d8-0 P10 alias 'animation'@en en P10-alias-en-c1427e-0 P10 alias 'media'@en en P10-alias-en-c61ab1-0 P10 alias 'trailer (Commons)'@en en P10000-alias-en-0df7f5-0 P10000 alias 'Australian Research Vocabularies ID'@en en CPU times: user 294 ms, sys: 62.9 ms, total: 357 ms Wall time: 16.3 s
%%time
if create_sqlite_cache:
!kgtk --debug query -i description --idx id --gc $STORE --limit 5
[2022-01-13 17:10:07 query]: SQL Translation: --------------------------------------------- SELECT * FROM graph_4 AS graph_4_c1 LIMIT ? PARAS: [5] --------------------------------------------- [2022-01-13 17:10:07 sqlstore]: CREATE INDEX "graph_4_id_idx" ON "graph_4" ("id") [2022-01-13 17:10:41 sqlstore]: ANALYZE "graph_4_id_idx" id node1 label node2 lang rank node2;wikidatatype P10-description-en P10 description 'relevant video. For images, use the property P18. For film trailers, qualify with \"object has role\" (P3831)=\"trailer\" (Q622550)'@en en P1000-description-en P1000 description 'notable record achieved by a person or entity, include qualifiers for dates held'@en en P10000-description-en P10000 description 'Identifier of a vocabulary in Research Vocabularies Australia'@en en P10001-description-en P10001 description 'identifier for an association football player on the Austrian Football Association website'@en en P10002-description-en P10002 description 'numerical identifier for members or former members of the Dewan Negara of Malaysia on the official website'@en en CPU times: user 746 ms, sys: 196 ms, total: 942 ms Wall time: 46.4 s
%%time
if create_sqlite_cache:
!kgtk --debug query -i claims --idx label node1 node2 id --gc $STORE --limit 5
[2022-01-13 17:10:55 query]: SQL Translation: --------------------------------------------- SELECT * FROM graph_5 AS graph_5_c1 LIMIT ? PARAS: [5] --------------------------------------------- [2022-01-13 17:10:55 sqlstore]: CREATE INDEX "graph_5_label_idx" ON "graph_5" ("label") [2022-01-13 17:25:12 sqlstore]: ANALYZE "graph_5_label_idx" [2022-01-13 17:25:48 sqlstore]: CREATE INDEX "graph_5_node1_idx" ON "graph_5" ("node1") [2022-01-13 17:31:45 sqlstore]: ANALYZE "graph_5_node1_idx" [2022-01-13 17:32:25 sqlstore]: CREATE INDEX "graph_5_node2_idx" ON "graph_5" ("node2") [2022-01-13 17:46:35 sqlstore]: ANALYZE "graph_5_node2_idx" [2022-01-13 17:47:20 sqlstore]: CREATE INDEX "graph_5_id_idx" ON "graph_5" ("id") [2022-01-13 17:56:45 sqlstore]: ANALYZE "graph_5_id_idx" id node1 label node2 lang rank node2;wikidatatype P10-P1628-32b85d-7927ece6-0 P10 P1628 "http://www.w3.org/2006/vcard/ns#Video" normal url P10-P1628-acf60d-b8950832-0 P10 P1628 "https://schema.org/video" normal url P10-P1629-Q34508-bcc39400-0 P10 P1629 Q34508 normal wikibase-item P10-P1630-53947a-fbe9093e-0 P10 P1630 "https://commons.wikimedia.org/wiki/File:$1" normal string P10-P1659-P1651-c4068028-0 P10 P1659 P1651 normal wikibase-property CPU times: user 50.2 s, sys: 9.39 s, total: 59.6 s Wall time: 46min 58s
%%time
if create_sqlite_cache:
!kgtk --debug query -i datatypes --idx label node1 --gc $STORE --limit 5
[2022-01-13 17:58:10 query]: SQL Translation: --------------------------------------------- SELECT * FROM graph_6 AS graph_6_c1 LIMIT ? PARAS: [5] --------------------------------------------- [2022-01-13 17:58:10 sqlstore]: CREATE INDEX "graph_6_label_idx" ON "graph_6" ("label") [2022-01-13 17:58:10 sqlstore]: ANALYZE "graph_6_label_idx" [2022-01-13 17:58:10 sqlstore]: CREATE INDEX "graph_6_node1_idx" ON "graph_6" ("node1") [2022-01-13 17:58:10 sqlstore]: ANALYZE "graph_6_node1_idx" id node1 label node2 lang rank node2;wikidatatype P10-datatype P10 datatype commonsMedia P1000-datatype P1000 datatype wikibase-item P10000-datatype P10000 datatype external-id P10001-datatype P10001 datatype external-id P10002-datatype P10002 datatype external-id CPU times: user 474 ms, sys: 86.5 ms, total: 561 ms Wall time: 24.3 s
%%time
if create_sqlite_cache:
!kgtk --debug query -i qualifiers --idx node2 node1 label --gc $STORE --limit 5
[2022-01-13 17:58:16 query]: SQL Translation: --------------------------------------------- SELECT * FROM graph_7 AS graph_7_c1 LIMIT ? PARAS: [5] --------------------------------------------- [2022-01-13 17:58:16 sqlstore]: CREATE INDEX "graph_7_node2_idx" ON "graph_7" ("node2") [2022-01-13 18:03:02 sqlstore]: ANALYZE "graph_7_node2_idx" [2022-01-13 18:03:13 sqlstore]: CREATE INDEX "graph_7_node1_idx" ON "graph_7" ("node1") [2022-01-13 18:05:21 sqlstore]: ANALYZE "graph_7_node1_idx" [2022-01-13 18:05:36 sqlstore]: CREATE INDEX "graph_7_label_idx" ON "graph_7" ("label") [2022-01-13 18:07:25 sqlstore]: ANALYZE "graph_7_label_idx" id node1 label node2 lang rank node2;wikidatatype P10-P1855-Q15075950-7eff6d65-0-P10-54b214-0 P10-P1855-Q15075950-7eff6d65-0 P10 "Smoorverliefd 12 september.webm" commonsMedia P10-P1855-Q15075950-7eff6d65-0-P3831-Q622550-0 P10-P1855-Q15075950-7eff6d65-0 P3831 Q622550 wikibase-item P10-P1855-Q4504-a69d2c73-0-P10-bef003-0 P10-P1855-Q4504-a69d2c73-0 P10 "Komodo dragons video.ogv" commonsMedia P10-P1855-Q69063653-c8cdb04c-0-P10-6fb08f-0 P10-P1855-Q69063653-c8cdb04c-0 P10 "Couch Commander.webm" commonsMedia P10-P1855-Q7378-555592a4-0-P10-8a982d-0 P10-P1855-Q7378-555592a4-0 P10 "Elephants Dream (2006).webm" commonsMedia CPU times: user 10.2 s, sys: 1.85 s, total: 12.1 s Wall time: 9min 24s
%%time
if create_sqlite_cache and create_class_viz_tables:
!kgtk --debug query -i classvizedge --idx label --gc $STORE --limit 5
[2022-01-13 18:07:44 query]: SQL Translation: --------------------------------------------- SELECT * FROM graph_8 AS graph_8_c1 LIMIT ? PARAS: [5] --------------------------------------------- [2022-01-13 18:07:44 sqlstore]: CREATE INDEX "graph_8_label_idx" ON "graph_8" ("label") [2022-01-13 18:09:31 sqlstore]: ANALYZE "graph_8_label_idx" node1 label node2 graph edge_type Q10267817 P279 Q18553442 Q1225194 subclass Q107715 P279 Q309314 Q246672 subclass Q107715 P279 Q309314 Q937228 subclass Q107715 P279 Q309314 Q7184903 subclass Q107715 P279 Q309314 Q35120 subclass CPU times: user 2.18 s, sys: 463 ms, total: 2.64 s Wall time: 2min 5s
%%time
if create_sqlite_cache and create_class_viz_tables:
!kgtk --debug query -i classvizedge --idx index:graph --gc $STORE --limit 5
%%time
if create_sqlite_cache and create_class_viz_tables:
!kgtk --debug query -i classviznode --idx label --gc $STORE --limit 5
%%time
if create_sqlite_cache and create_class_viz_tables:
!kgtk --debug query -i classviznode --idx index:graph --gc $STORE --limit 5
!kgtk query --gc $STORE --show-cache
Graph Cache: DB file: /data/amandeep/wikidata-20211027-dwd-v3/kgtk-browser-files/temp.kgtk-browser-files/wikidata.sqlite3.db size: 193.02 GB free: 0 Bytes modified: 2022-01-13 18:27:42 KGTK File Information: alias: size: 156.76 MB modified: 2021-11-20 07:45:05 graph: graph_3 claims: size: 10.17 GB modified: 2021-11-20 07:29:41 graph: graph_5 classvizedge: size: 416.38 MB modified: 2021-12-31 08:30:22 graph: graph_8 classvizenode: size: 383.09 MB modified: 2021-12-31 09:03:51 graph: graph_9 classviznode: size: 383.09 MB modified: 2021-12-31 09:03:51 graph: graph_11 datatypes: size: 48.81 KB modified: 2021-11-20 03:43:23 graph: graph_6 description: size: 371.43 MB modified: 2021-11-20 09:00:26 graph: graph_4 l_d_pgr_ud: size: 1.27 GB modified: 2022-01-13 16:54:45 graph: graph_10 label: size: 614.97 MB modified: 2021-11-20 09:27:57 graph: graph_1 pagerank_undirected: size: 740.37 MB modified: 2021-11-20 14:56:23 graph: graph_2 qualifiers: size: 2.30 GB modified: 2021-11-20 07:38:37 graph: graph_7 Graph Table Information: graph_1: size: 4.62 GB created: 2022-01-13 15:49:06 header: ['id', 'node1', 'label', 'node2', 'lang', 'rank', 'node2;wikidatatype'] graph_10: size: 12.36 GB created: 2022-01-13 16:56:45 header: ['node1', 'label', 'node2', 'node2;upper', 'node1;pagerank', 'node1;description'] graph_11: size: 17.50 GB created: 2022-01-13 18:25:50 header: ['node1', 'graph', 'instance_count', 'node_type', 'label', 'tooltip'] graph_2: size: 7.82 GB created: 2022-01-13 15:51:06 header: ['node1', 'label', 'node2', 'id'] graph_3: size: 896.11 MB created: 2022-01-13 15:51:27 header: ['id', 'node1', 'label', 'node2', 'lang', 'rank', 'node2;wikidatatype'] graph_4: size: 6.29 GB created: 2022-01-13 15:52:55 header: ['id', 'node1', 'label', 'node2', 'lang', 'rank', 'node2;wikidatatype'] graph_5: size: 95.24 GB created: 2022-01-13 16:13:20 header: ['id', 'node1', 'label', 'node2', 'lang', 'rank', 'node2;wikidatatype'] graph_6: size: 744.00 KB created: 2022-01-13 16:13:21 header: ['id', 'node1', 'label', 'node2', 'lang', 'rank', 'node2;wikidatatype'] graph_7: size: 25.23 GB created: 2022-01-13 16:19:19 header: ['id', 'node1', 'label', 'node2', 'lang', 'rank', 'node2;wikidatatype'] graph_8: size: 8.52 GB created: 2022-01-13 16:22:30 header: ['node1', 'label', 'node2', 'graph', 'edge_type'] graph_9: size: 14.55 GB created: 2022-01-13 16:27:25 header: ['node1', 'graph', 'instance_count', 'node_type', 'label', 'tooltip']
textmatch
search¶def text_search_labels(search_text, limit=20):
text_search_labels_query = _kapi2.get_query(
doc="Doc string here",
name=f"text_search_labels_{search_text}",
inputs='l_d_pgr_ud',
match='l_d_pgr_ud: (qnode)-[l:label]->(y)',
where=f'textmatch(y, "{search_text}")',
ret='distinct qnode as node1, y as label, 10*matchscore(y) as score, cast(l.`node1;pagerank`, float) as prank, l.`node1;description` as description',
order='score*prank',
limit=limit
)
results = list([list(x) for x in text_search_labels_query.execute()])
df = pd.DataFrame(results, columns=['node1', 'label', 'score', 'pagerank', 'description'])
print(len(df))
return df
text_search_labels('arn sch')
[2022-01-13 18:29:32 query]: SQL Translation: --------------------------------------------- SELECT DISTINCT graph_10_c1."node1" "_aLias.node1", graph_10_c1."node2" "_aLias.label", (? * BM25(txtidx_1.graph_10_txtidx_ldpgridx)) "_aLias.score", CAST(graph_10_c1."node1;pagerank" AS float) "_aLias.prank", graph_10_c1."node1;description" "_aLias.description" FROM graph_10 AS graph_10_c1, graph_10_txtidx_ldpgridx AS txtidx_1 WHERE graph_10_c1."label" = ? AND txtidx_1."node2" MATCH ? and txtidx_1.rowid = graph_10_c1.rowid ORDER BY ("_aLias.score" * "_aLias.prank") ASC LIMIT ? PARAS: [10, 'label', 'arn sch', 20] ---------------------------------------------
20
node1 | label | score | pagerank | description | |
---|---|---|---|---|---|
0 | Q154770 | 'Arnold Schoenberg'@en | -107.405619 | 2.983258e-07 | 'Austrian-American composer (1874-1951)'@en |
1 | Q2685 | 'Arnold Schwarzenegger'@en | -100.426118 | 2.794892e-07 | 'Austrian-American actor, businessman, bodybui... |
2 | Q668095 | 'Scharnstein'@en | -119.905570 | 1.907455e-07 | 'municipality in Austria'@en |
3 | Q506494 | 'Schäftlarn'@en | -122.277359 | 1.862084e-07 | 'municipality of Germany'@en |
4 | Q723261 | 'Scharn'@en | -132.783438 | 1.588464e-07 | 'neighborhood in Maastricht, Netherlands'@en |
5 | Q664826 | 'Scharnitz'@en | -124.744872 | 1.239395e-07 | 'municipality in Austria'@en |
6 | Q522320 | 'Arnschwang'@en | -122.277359 | 1.133833e-07 | 'municipality of Germany'@en |
7 | Q5659790 | 'Harnischia'@en | -122.277359 | 1.014908e-07 | 'genus of insects'@en |
8 | Q2667816 | 'Scharnegoutum'@en | -115.427713 | 1.051748e-07 | 'human settlement in the Netherlands'@en |
9 | Q77325 | 'Arno Schmidt'@en | -117.624040 | 9.288887e-08 | 'German poet'@en |
10 | Q509737 | 'Scharnhorst Order'@en | -107.405619 | 8.702578e-08 | 'award'@en |
11 | Q678395 | 'Scharndorf'@en | -122.277359 | 7.542544e-08 | 'municipality in Austria'@en |
12 | Q18931623 | 'Adelbert van Scharnlaan'@en | -97.265823 | 8.584699e-08 | 'street in Maastricht, the Netherlands'@en |
13 | Q27672057 | 'Arno Schley'@en | -119.905570 | 6.687831e-08 | 'badminton player'@en |
14 | Q4861418 | 'Barnard Castle School'@en | -100.426118 | 7.964473e-08 | 'school in County Durham, UK'@en |
15 | Q632660 | 'Scharnebeck'@en | -119.905570 | 6.320210e-08 | 'municipality of Germany'@en |
16 | Q19508777 | 'Scharnerweg'@en | -119.905570 | 5.883063e-08 | 'street in Maastricht, the Netherlands'@en |
17 | Q432443 | 'Jan Arnoldus Schouten'@en | -100.426118 | 6.910608e-08 | 'Dutch politician (1883-1971)'@en |
18 | Q27530806 | 'Arnd Rösch'@en | -122.277359 | 5.620266e-08 | 'mathematician, chess player'@en |
19 | Q78106 | 'Arnold Schönhage'@en | -109.304758 | 5.492092e-08 | 'German mathematician'@en |
def exact_search_items(search_text, limit=20):
search_text = search_text.upper()
text_search_labels_query = _kapi2.get_query(
doc="""
Create the Kypher query used by 'BrowserBackend.get_node_labels()'
for case_independent searches.
Given parameters 'NODE' and 'LANG' retrieve labels for 'NODE' in
the specified language (using 'any' for 'LANG' retrieves all labels).
Return distinct 'node1', 'node_label' pairs as the result (we include
'NODE' as an output to make it easier to union result frames).
""",
name=f'exact_search_items{search_text}',
inputs='l_d_pgr_ud',
match='l_d_pgr_ud: (n)-[r:label]->(l)',
where=f'n="{search_text}"',
ret='distinct n as node1, l as node_label, r.`node1;description` as description',
)
results = list([list(x) for x in text_search_labels_query.execute()])
df = pd.DataFrame(results, columns=['node1', 'label', 'description'])
print(len(df))
return df
%%time
exact_search_items('q30')
[2022-01-13 18:30:09 query]: SQL Translation: --------------------------------------------- SELECT DISTINCT graph_10_c1."node1" "_aLias.node1", graph_10_c1."node2" "_aLias.node_label", graph_10_c1."node1;description" "_aLias.description" FROM graph_10 AS graph_10_c1 WHERE graph_10_c1."label" = ? AND (graph_10_c1."node1" = ?) LIMIT ? PARAS: ['label', 'Q30', 100] ---------------------------------------------
1 CPU times: user 34 ms, sys: 1.83 ms, total: 35.8 ms Wall time: 59.7 ms
node1 | label | description | |
---|---|---|---|
0 | Q30 | 'United States of America'@en | 'sovereign state in North America'@en |
%%time
exact_search_items('Q140')
[2022-01-13 18:30:10 query]: SQL Translation: --------------------------------------------- SELECT DISTINCT graph_10_c1."node1" "_aLias.node1", graph_10_c1."node2" "_aLias.node_label", graph_10_c1."node1;description" "_aLias.description" FROM graph_10 AS graph_10_c1 WHERE graph_10_c1."label" = ? AND (graph_10_c1."node1" = ?) LIMIT ? PARAS: ['label', 'Q140', 100] ---------------------------------------------
1 CPU times: user 33.3 ms, sys: 5.52 ms, total: 38.8 ms Wall time: 63.1 ms
node1 | label | description | |
---|---|---|---|
0 | Q140 | 'lion'@en | 'species of big cat'@en |
def exact_search_labels(search_text, limit=20):
search_text = f"'{search_text.upper()}'@EN"
text_search_labels_query = _kapi2.get_query(
doc="""
Exact Match case insensitive query
""",
name=f'exact_search_labels{search_text}',
inputs='l_d_pgr_ud',
match=f'l_d_pgr_ud: (n)-[r:label]->(l)',
where=f'r.`node2;upper`="{search_text}"',
ret='distinct n as node1, l as node_label, cast("-1.0", float) as score, cast(r.`node1;pagerank`, float) as prank, r.`node1;description` as description',
order='score*prank',
limit=limit
)
results = list([list(x) for x in text_search_labels_query.execute()])
df = pd.DataFrame(results, columns=['node1', 'label', 'score', 'prank', 'description'])
print(len(df))
return df
exact_search_labels('canada')
[2022-01-13 18:30:13 query]: SQL Translation: --------------------------------------------- SELECT DISTINCT graph_10_c1."node1" "_aLias.node1", graph_10_c1."node2" "_aLias.node_label", CAST(? AS float) "_aLias.score", CAST(graph_10_c1."node1;pagerank" AS float) "_aLias.prank", graph_10_c1."node1;description" "_aLias.description" FROM graph_10 AS graph_10_c1 WHERE graph_10_c1."label" = ? AND (graph_10_c1."node2;upper" = ?) ORDER BY ("_aLias.score" * "_aLias.prank") ASC LIMIT ? PARAS: ['-1.0', 'label', "'CANADA'@EN", 20] ---------------------------------------------
20
node1 | label | score | prank | description | |
---|---|---|---|---|---|
0 | Q16 | 'Canada'@en | -1.0 | 1.052877e-03 | 'sovereign state in North America'@en |
1 | Q2569593 | 'Canada'@en | -1.0 | 8.219015e-08 | 'French colony in New France'@en |
2 | Q13265725 | 'Canada'@en | -1.0 | 2.816562e-08 | 'family name'@en |
3 | Q106501608 | 'Canada'@en | -1.0 | 1.475482e-08 | 'Vanity Fair caricature of The Rt Hon Sir Wilf... |
4 | Q13265795 | 'Canada'@en | -1.0 | 1.402956e-08 | '2012 novel by Richard Ford'@en |
5 | Q2608363 | 'Canada'@en | -1.0 | 1.286213e-08 | 'village in Appelscha, in the Netherlands'@en |
6 | Q18612153 | 'Canada'@en | -1.0 | 1.278788e-08 | '180th strip of the webcomic xkcd'@en |
7 | Q257304 | 'Canada'@en | -1.0 | 1.266340e-08 | 'Wikimedia disambiguation page'@en |
8 | Q103921530 | 'Canada'@en | -1.0 | 1.258331e-08 | 'Shipwreck off the Scottish Coast, imported fr... |
9 | Q14624136 | 'Canada'@en | -1.0 | 1.196157e-08 | 'moth genus of Pteromalidae'@en |
10 | Q5029265 | 'Canada'@en | -1.0 | 1.163423e-08 | 'unincorporated community in Kansas, United St... |
11 | Q99292858 | 'Canada'@en | -1.0 | 1.138884e-08 | 'the country of Canada as depicted in Star Tre... |
12 | Q107159786 | 'Canada'@en | -1.0 | 9.762842e-09 | 'article in Problems of Empire'@en |
13 | Q1031917 | 'Canada'@en | -1.0 | 9.468616e-09 | 'unincorporated community in Pike County, Kent... |
14 | Q18982533 | 'Canada'@en | -1.0 | 9.246475e-09 | 'street in Elsloo, the Netherlands'@en |
15 | Q5029289 | 'Canada'@en | -1.0 | 9.143933e-09 | 'music group from Ann Arbor, Michigan, U.S.'@en |
16 | Q93287969 | 'Canada'@en | -1.0 | 8.596045e-09 | 'hamlet in Hampshire, UK'@en |
17 | Q5029279 | 'Canada'@en | -1.0 | 8.290497e-09 | 'full rigged ship built in 1891'@en |
18 | Q28457929 | 'Canada'@en | -1.0 | 7.668673e-09 | '1786 ship'@en |
19 | Q27722117 | 'Canada'@en | -1.0 | 7.489188e-09 | 'scientific journal'@en |
textlike
search¶def text_like_search_labels(search_text, limit=20):
search_label = f"%{'%'.join(search_text.split(' '))}%"
print(search_text)
text_search_labels_query = _kapi2.get_query(
doc="Doc string here",
name=f"text_like_search_labels_{search_text}",
inputs='l_d_pgr_ud',
match='l_d_pgr_ud: (qnode)-[l:label]->(y)',
where=f'textlike(y, "{search_label}")',
ret='distinct qnode as node1, y as label, matchscore(y) as score, cast(l.`node1;pagerank`, float) as prank, l.`node1;description` as description',
order='score*prank',
limit=limit
)
results = list([list(x) for x in text_search_labels_query.execute()])
df = pd.DataFrame(results, columns=['node1', 'label', 'score', 'pagerank', 'description'])
print(len(df))
return df
text_like_search_labels("fifa group b")
[2022-01-13 18:30:18 query]: SQL Translation: --------------------------------------------- SELECT DISTINCT graph_10_c1."node1" "_aLias.node1", graph_10_c1."node2" "_aLias.label", BM25(txtidx_1.graph_10_txtidx_ldpgridx) "_aLias.score", CAST(graph_10_c1."node1;pagerank" AS float) "_aLias.prank", graph_10_c1."node1;description" "_aLias.description" FROM graph_10 AS graph_10_c1, graph_10_txtidx_ldpgridx AS txtidx_1 WHERE graph_10_c1."label" = ? AND txtidx_1."node2" LIKE ? and txtidx_1.rowid = graph_10_c1.rowid ORDER BY ("_aLias.score" * "_aLias.prank") ASC LIMIT ? PARAS: ['label', '%fifa%group%b%', 20] ---------------------------------------------
fifa group b 20
node1 | label | score | pagerank | description | |
---|---|---|---|---|---|
0 | Q31189406 | '2018 FIFA World Cup Group B'@en | -14.908334 | 3.970111e-08 | |
1 | Q10260332 | '2014 FIFA World Cup Group B'@en | -14.908334 | 2.652110e-08 | |
2 | Q17115977 | 'Category:2014 FIFA World Cup group table temp... | -11.121773 | 2.265241e-08 | 'Wikimedia category'@en |
3 | Q18608402 | 'Category:2015 FIFA Women\'s World Cup group t... | -10.116338 | 1.991870e-08 | 'Wikimedia category'@en |
4 | Q20730977 | '2018 FIFA World Cup qualification – UEFA Grou... | -11.372956 | 1.526389e-08 | |
5 | Q187411 | '2010 FIFA World Cup Group B'@en | -14.908334 | 1.102419e-08 | 'football tournament'@en |
6 | Q601980 | '2014 FIFA World Cup qualification – UEFA Grou... | -11.372956 | 1.365065e-08 | |
7 | Q39134591 | 'Template:2018 FIFA World Cup Group B table'@en | -12.199530 | 1.267818e-08 | 'Wikimedia template'@en |
8 | Q39134633 | 'Template:2018 FIFA World Cup Group C table'@en | -12.199530 | 1.266194e-08 | 'Wikimedia template'@en |
9 | Q39395683 | 'Template:2018 FIFA World Cup Group D table'@en | -12.199530 | 1.264030e-08 | 'Wikimedia template'@en |
10 | Q39395642 | 'Template:2018 FIFA World Cup Group E table'@en | -12.199530 | 1.263154e-08 | 'Wikimedia template'@en |
11 | Q39134549 | 'Template:2018 FIFA World Cup Group A table'@en | -12.199530 | 1.262999e-08 | 'Wikimedia template'@en |
12 | Q39395551 | 'Template:2018 FIFA World Cup Group G table'@en | -12.199530 | 1.262856e-08 | 'Wikimedia template'@en |
13 | Q39395510 | 'Template:2018 FIFA World Cup Group H table'@en | -12.199530 | 1.262313e-08 | 'Wikimedia template'@en |
14 | Q39395600 | 'Template:2018 FIFA World Cup Group F table'@en | -12.199530 | 1.261995e-08 | 'Wikimedia template'@en |
15 | Q86680144 | '2022 FIFA World Cup qualification – UEFA Grou... | -11.372956 | 1.225984e-08 | |
16 | Q12415572 | 'Template:2006 FIFA World Cup Group B'@en | -13.155670 | 1.017848e-08 | 'Wikimedia template'@en |
17 | Q15838739 | 'Template:1994 FIFA World Cup Group B'@en | -13.155670 | 1.017848e-08 | 'Wikimedia template'@en |
18 | Q17348977 | 'Template:2002 FIFA World Cup Group B'@en | -13.155670 | 1.017848e-08 | 'Wikimedia template'@en |
19 | Q25735803 | 'Template:1998 FIFA World Cup Group B'@en | -13.155670 | 1.017848e-08 | 'Wikimedia template'@en |
if create_es_index:
kgtk(""" cat
-i "$GRAPH/labels.en.tsv.gz"
-i "$GRAPH/aliases.en.tsv.gz"
-i "$GRAPH/descriptions.en.tsv.gz"
-i "$GRAPH/claims.tsv.gz"
-i "$GRAPH/qualifiers.tsv.gz"
-i "$GRAPH/metadata.pagerank.undirected.tsv.gz"
-i "$GRAPH/derived.isastar.tsv.gz"
-o "$OUT"/kgtk.search.unsorted.tsv.gz""")
if create_es_index:
kgtk(f"""sort --column node1
-X "--parallel 8 --buffer-size 60% --temporary-directory {os.environ['TEMP']}"
-o "$OUT/kgtk.search.sorted.tsv.gz" """)
kgtk build-kgtk-search-input
command to create the json lines file¶if create_es_index:
kgtk("""build-kgtk-search-input --input-file "$OUT/kgtk.search.sorted.tsv.gz"
--output-file "$OUT"/kgtk.search.sorted.jl
--label-properties label
--alias-properties alias
--extra-alias-properties P1448,P1705,P1477,P1810,P742,P1449
--description-properties description
--pagerank-properties Pundirected_pagerank
--mapping-file "$OUT"/wikidata_es_kgtk_search_mapping.json
--property-datatype-file "$GRAPH"/metadata.property.datatypes.tsv.gz""")
if create_es_index:
os.environ['ESURL'] = f"{es_host}:{es_port}"
os.environ['ESINDEX'] = es_index
if create_es_index:
!curl -H "Content-Type: application/json" -XPUT $ESURL/$ESINDEX -d @"$OUT/wikidata_es_kgtk_search_mapping.json"
The following command requires table-linker to be installed in the current virtual environment.
if create_es_index:
!tl load-elasticsearch-index --es-url $ESURL --es-index $ESINDEX --es-version 7 --kgtk-jl-path "$OUT"/kgtk.search.sorted.jl
!kgtk --debug query --graph-cache /Volumes/saggu-ssd/wikidata-dwd-v2/kgtk-search-6/temp.kgtk-search-6/wikidata.sqlite3.db \
-i "/Volumes/saggu-ssd/class-viz/class-visualization.edge.tsv.gz" --as classvizedge \
-i "/Volumes/saggu-ssd/class-viz/class-visualization.node.tsv.gz" --as classviznode --limit 3
[2022-01-11 15:46:03 sqlstore]: IMPORT graph directly into table graph_9 from /Volumes/saggu-ssd/class-viz/class-visualization.edge.tsv.gz ... [2022-01-11 16:05:10 sqlstore]: IMPORT graph directly into table graph_10 from /Volumes/saggu-ssd/class-viz/class-visualization.node.tsv.gz ... [2022-01-11 17:49:19 query]: SQL Translation: --------------------------------------------- SELECT * FROM graph_9 AS graph_9_c1 LIMIT ? PARAS: [3] --------------------------------------------- node1 label node2 graph edge_type Q10267817 P279 Q18553442 Q1225194 subclass Q107715 P279 Q309314 Q246672 subclass Q107715 P279 Q309314 Q937228 subclass
!kgtk --debug query --graph-cache /data/amandeep/wikidata-20211027-dwd-v3/kgtk-browser-files/temp.kgtk-browser-files/wikidata.sqlite3.db \
-i "/data/amandeep/wikidata-20211027-dwd-v3/class-visualization.node.tsv.gz" --as classviznode --limit 3
[2022-01-13 18:20:57 sqlstore]: IMPORT graph directly into table graph_11 from /data/amandeep/wikidata-20211027-dwd-v3/class-visualization.node.tsv.gz ... [2022-01-13 18:25:51 query]: SQL Translation: --------------------------------------------- SELECT * FROM graph_11 AS graph_11_c1 LIMIT ? PARAS: [3] --------------------------------------------- node1 graph instance_count node_type label tooltip Q10267817 Q1225194 1 many_subclasses 'autosomal recessive disease'@en autosomal recessive disease (Q10267817)<BR/>instance count: 1<BR/>node type: many_subclasses Q107715 Q246672 93 many_subclasses 'physical quantity'@en physical quantity (Q107715)<BR/>instance count: 93<BR/>node type: many_subclasses Q107715 Q937228 93 many_subclasses 'physical quantity'@en physical quantity (Q107715)<BR/>instance count: 93<BR/>node type: many_subclasses
!kgtk --debug query -i classvizedge --idx label --gc $STORE --limit 5
!kgtk --debug query -i classviznode --idx label --gc $STORE --limit 5
[2022-01-13 18:26:08 query]: SQL Translation: --------------------------------------------- SELECT * FROM graph_11 AS graph_11_c1 LIMIT ? PARAS: [5] --------------------------------------------- [2022-01-13 18:26:08 sqlstore]: CREATE INDEX "graph_11_label_idx" ON "graph_11" ("label") [2022-01-13 18:27:34 sqlstore]: ANALYZE "graph_11_label_idx" node1 graph instance_count node_type label tooltip Q10267817 Q1225194 1 many_subclasses 'autosomal recessive disease'@en autosomal recessive disease (Q10267817)<BR/>instance count: 1<BR/>node type: many_subclasses Q107715 Q246672 93 many_subclasses 'physical quantity'@en physical quantity (Q107715)<BR/>instance count: 93<BR/>node type: many_subclasses Q107715 Q937228 93 many_subclasses 'physical quantity'@en physical quantity (Q107715)<BR/>instance count: 93<BR/>node type: many_subclasses Q107715 Q7184903 93 many_subclasses 'physical quantity'@en physical quantity (Q107715)<BR/>instance count: 93<BR/>node type: many_subclasses Q107715 Q35120 93 many_subclasses 'physical quantity'@en physical quantity (Q107715)<BR/>instance count: 93<BR/>node type: many_subclasses
!kgtk query --gc $STORE --show-cache
Graph Cache: DB file: /Volumes/saggu-ssd/wikidata-dwd-v2/kgtk-search-6/temp.kgtk-search-6/wikidata.sqlite3.db size: 173.57 GB free: 0 Bytes modified: 2022-01-11 17:49:19 KGTK File Information: alias: size: 130.57 MB modified: 2021-11-17 14:57:10 graph: graph_3 claims: size: 9.59 GB modified: 2021-11-19 12:20:27 graph: graph_5 classvizedge: size: 416.38 MB modified: 2022-01-11 10:18:54 graph: graph_9 classviznode: size: 383.09 MB modified: 2022-01-11 10:18:59 graph: graph_10 datatypes: size: 45.00 KB modified: 2020-12-11 10:51:06 graph: graph_6 description: size: 341.84 MB modified: 2021-11-19 12:11:44 graph: graph_4 l_d_pgr_ud: size: 1.18 GB modified: 2021-12-02 20:48:25 graph: graph_8 label: size: 590.02 MB modified: 2021-11-17 14:57:46 graph: graph_1 pagerank_undirected: size: 1.53 GB modified: 2021-11-17 14:58:47 graph: graph_2 qualifiers: size: 2.09 GB modified: 2021-11-19 12:14:21 graph: graph_7 Graph Table Information: graph_1: size: 4.15 GB created: 2021-12-02 18:36:14 header: ['id', 'node1', 'label', 'node2'] graph_10: size: 14.55 GB created: 2022-01-11 17:49:19 header: ['node1', 'graph', 'instance_count', 'node_type', 'label', 'tooltip'] graph_2: size: 19.31 GB created: 2021-12-02 18:53:02 header: ['node1', 'label', 'node2', 'id'] graph_3: size: 711.41 MB created: 2021-12-02 18:53:47 header: ['id', 'node1', 'label', 'node2'] graph_4: size: 5.27 GB created: 2021-12-02 18:57:40 header: ['id', 'node1', 'label', 'node2'] graph_5: size: 87.81 GB created: 2021-12-02 19:52:02 header: ['id', 'node1', 'label', 'node2', 'rank', 'node2;wikidatatype'] graph_6: size: 632.00 KB created: 2021-12-02 19:52:02 header: ['id', 'node1', 'label', 'node2'] graph_7: size: 23.57 GB created: 2021-12-02 20:10:50 header: ['id', 'node1', 'label', 'node2', 'node2;wikidatatype'] graph_8: size: 11.50 GB created: 2021-12-02 20:55:01 header: ['node1', 'label', 'node2', 'node2;upper', 'node1;pagerank', 'node1;description'] graph_9: size: 6.71 GB created: 2022-01-11 16:05:10 header: ['node1', 'label', 'node2', 'graph', 'edge_type']
roots = ['Q1420']
for root in roots:
kgtk(f"""
query -i classvizedge
--match '(class)-[{{label: property, graph: "{root}", edge_type: edge_type}}]->(superclass)'
-o $TEMP/{root}.graph.low.tsv
""")
for root in roots:
kgtk(f"""--debug
query -i classviznode
--match '(class)-[{{graph: "{root}", instance_count: instance_count, label: label}}]->()'
-o $TEMP/{root}.node.graph.low.tsv
""")
[2022-01-13 18:31:29 query]: SQL Translation: --------------------------------------------- SELECT * FROM graph_11 AS graph_11_c1 WHERE graph_11_c1."graph" = ? AND graph_11_c1."instance_count" = graph_11_c1."instance_count" AND graph_11_c1."label" = graph_11_c1."label" PARAS: ['Q1420'] ---------------------------------------------
roots = ['Q1420']
for root in roots:
kgtk(f"""
query -i classvizedge
--match '(class)-[{{label: property, graph: "{root}", edge_type: edge_type}}]->(superclass)'
-o $TEMP/{root}.graph.low.tsv
""")