import os
import sys
sys.path.insert(0,'..')
from kgtk.configure_kgtk_notebooks import ConfigureKGTK
from kgtk.functions import kgtk, kypher
# Parameters
# Location of the local KGTK checkout.
kgtk_path = "/Users/filipilievski/mcs/kgtk"

# Root folder of the tutorial datasets.
tutorial_deployment_path = "/Users/filipilievski/mcs/kgtk-tutorial-files/datasets"
# NOTE(review): not referenced anywhere in the visible part of this notebook
# and named after a different project -- confirm it is still needed.
project_deployment_path = f"{tutorial_deployment_path}/arnold-network-analysis"

# Folder holding the raw common-sense source files.
input_path = "/Users/filipilievski/mcs/kgtk-tutorial-files/datasets/common-sense"
# Folder on the local machine where the output and temporary folders are created.
output_path = "/Users/filipilievski/mcs/kgtk-projects"
project_name = "building-commonsense-knowledge-graph"

# Aliases of the input files used by this notebook.
files = [
    "conceptnet",
    "vg_graphs",
    "vg_synsets",
    "atomic",
    "mapping_lex",
    "mapping_cnfn",
]
# Configure KGTK for this notebook. Later cells read the GRAPH, OUT, TEMP and
# STORE environment variables, which are presumably exported by
# configure_kgtk() -- confirm against ConfigureKGTK.
ck = ConfigureKGTK(files, kgtk_path=kgtk_path)
ck.configure_kgtk(
    input_graph_path=input_path,
    output_path=output_path,
    project_name=project_name,
)
User home: /Users/filipilievski Current dir: /Users/filipilievski/mcs/kgtk-projects/building-commonsense-knowledge-graph KGTK dir: /Users/filipilievski/mcs/kgtk Use-cases dir: /Users/filipilievski/mcs/kgtk/use-cases
# Publish the location of every raw input file as an environment variable so
# the shell cells below can refer to it as $<alias>. Each template is filled
# in with the dataset folder stored in $GRAPH; entries are set one by one, in
# the order listed.
os.environ.update(
    (alias, template % os.environ["GRAPH"])
    for alias, template in (
        ("conceptnet", "%s/conceptnet-assertions-5.7.0.csv"),
        ("vg_graphs", "%s/visualgenome/scene_graphs.json"),
        ("vg_synsets", "%s/visualgenome/attribute_synsets.json"),
        ("atomic", "%s/v4_atomic_all_agg.csv"),
        ("mapping_lex", "%s/mappings/lexical_mappings.tsv"),
        ("mapping_cnfn", "%s/mappings/mapping_fn_cn.tsv"),
    )
)

# KGTK settings: checkout location, graph cache (same database file as
# $STORE), and debug output switched off.
os.environ["kgtk_path"] = kgtk_path
os.environ["KGTK_GRAPH_CACHE"] = os.environ["STORE"]
os.environ["KGTK_OPTION_DEBUG"] = "false"

# Print the resulting variables for reference.
ck.print_env_variables()
USE_CASES_DIR: /Users/filipilievski/mcs/kgtk/use-cases GRAPH: /Users/filipilievski/mcs/kgtk-tutorial-files/datasets/common-sense TEMP: /Users/filipilievski/mcs/kgtk-projects/building-commonsense-knowledge-graph/temp.building-commonsense-knowledge-graph kgtk: kgtk EXAMPLES_DIR: /Users/filipilievski/mcs/kgtk/examples OUT: /Users/filipilievski/mcs/kgtk-projects/building-commonsense-knowledge-graph kypher: kgtk query --graph-cache /Users/filipilievski/mcs/kgtk-projects/building-commonsense-knowledge-graph/temp.building-commonsense-knowledge-graph/wikidata.sqlite3.db STORE: /Users/filipilievski/mcs/kgtk-projects/building-commonsense-knowledge-graph/temp.building-commonsense-knowledge-graph/wikidata.sqlite3.db conceptnet: /Users/filipilievski/mcs/kgtk-tutorial-files/datasets/common-sense/conceptnet-assertions-5.7.0.csv vg_graphs: /Users/filipilievski/mcs/kgtk-tutorial-files/datasets/common-sense/visualgenome/scene_graphs.json vg_synsets: /Users/filipilievski/mcs/kgtk-tutorial-files/datasets/common-sense/visualgenome/attribute_synsets.json atomic: /Users/filipilievski/mcs/kgtk-tutorial-files/datasets/common-sense/v4_atomic_all_agg.csv mapping_lex: /Users/filipilievski/mcs/kgtk-tutorial-files/datasets/common-sense/mappings/lexical_mappings.tsv mapping_cnfn: /Users/filipilievski/mcs/kgtk-tutorial-files/datasets/common-sense/mappings/mapping_fn_cn.tsv
Load all input files into the Kypher graph cache so that all graph aliases are defined (the call below is currently commented out, so this step is skipped):
%%time
#ck.load_files_into_cache()
CPU times: user 3 µs, sys: 0 ns, total: 3 µs Wall time: 6.2 µs
%cd {os.environ['OUT']}
/Users/filipilievski/mcs/kgtk-projects/building-commonsense-knowledge-graph
We will first import the individual resources in KGTK format:
%%time
# Import ConceptNet
!kgtk import_conceptnet --english_only -i $conceptnet -o $TEMP/kgtk_conceptnet.tsv
CPU times: user 1.92 s, sys: 572 ms, total: 2.49 s Wall time: 3min 33s
%%time
# Import FrameNet
!kgtk import-framenet -o $TEMP/kgtk_framenet.tsv
[nltk_data] Downloading package framenet_v17 to [nltk_data] /Users/filipilievski/nltk_data... [nltk_data] Package framenet_v17 is already up-to-date! CPU times: user 290 ms, sys: 94.8 ms, total: 384 ms Wall time: 27.1 s
%%time
# Import Visual Genome
!kgtk import-visualgenome -i $vg_graphs --attr-synsets $vg_synsets \
-o $TEMP/kgtk_visualgenome.tsv
CPU times: user 442 ms, sys: 141 ms, total: 583 ms Wall time: 40.7 s
%%time
# Import ATOMIC
!kgtk import_atomic -i $atomic -o $TEMP/kgtk_atomic.tsv
CPU times: user 149 ms, sys: 54.7 ms, total: 204 ms Wall time: 14.1 s
Next, we concatenate the imported sources to create `cskg_base.tsv`:
%%time
# Concatenate the four imported sources, sort the edges by node1/relation/node2,
# generate an id for every edge (node1-label-node2-num style), and move the id
# column to the front before writing $TEMP/cskg_base.tsv.
!kgtk cat -i $TEMP/kgtk_atomic.tsv $TEMP/kgtk_conceptnet.tsv $TEMP/kgtk_framenet.tsv $TEMP/kgtk_visualgenome.tsv \
/ sort -c 'node1,relation,node2' \
/ add_id --id-style node1-label-node2-num \
/ reorder_columns --columns id ... -o $TEMP/cskg_base.tsv
CPU times: user 896 ms, sys: 280 ms, total: 1.18 s Wall time: 1min 18s
Let's see what we get from simple concatenation:
%%time
# Display the concatenated graph; the output below is a table with one row
# per edge of $TEMP/cskg_base.tsv.
kgtk("""
cat -i $TEMP/cskg_base.tsv
""")
/Users/filipilievski/opt/anaconda3/envs/scenegen/lib/python3.8/site-packages/IPython/core/magic.py:187: DtypeWarning: Columns (7,9) have mixed types.Specify dtype option on import or set low_memory=False. call = lambda f, *a, **k: f(*a, **k)
CPU times: user 1min 12s, sys: 21.9 s, total: 1min 34s Wall time: 1min 25s
id | node1 | relation | node2 | node1;label | node2;label | relation;label | relation;dimension | source | sentence | |
---|---|---|---|---|---|---|---|---|---|---|
0 | /c/en/0-/r/DefinedAs-/c/en/empty_set-0000 | /c/en/0 | /r/DefinedAs | /c/en/empty_set | 0 | empty set | defined as | NaN | CN | [[0]] is the [[empty set]]. |
1 | /c/en/0-/r/DefinedAs-/c/en/first_limit_ordinal... | /c/en/0 | /r/DefinedAs | /c/en/first_limit_ordinal | 0 | first limit ordinal | defined as | NaN | CN | [[0]] is the [[first limit ordinal]]. |
2 | /c/en/0-/r/DefinedAs-/c/en/number_zero-0000 | /c/en/0 | /r/DefinedAs | /c/en/number_zero | 0 | number zero | defined as | NaN | CN | [[0]] is the [[number zero]]. |
3 | /c/en/0-/r/HasContext-/c/en/internet_slang-0000 | /c/en/0 | /r/HasContext | /c/en/internet_slang | 0 | internet slang | has context | NaN | CN | NaN |
4 | /c/en/0-/r/HasProperty-/c/en/pronounced_zero-0000 | /c/en/0 | /r/HasProperty | /c/en/pronounced_zero | 0 | pronounced zero | has property | NaN | CN | [[\0\"]] is [[pronounced zero]]" |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
6773218 | wn:zucchini.n.01-mw:MayHaveProperty-wn:steamed... | wn:zucchini.n.01 | mw:MayHaveProperty | wn:steamed.s.01 | zucchini | steamed | may have property | NaN | VG | NaN |
6773219 | wn:zucchini.n.01-mw:MayHaveProperty-wn:yellow.... | wn:zucchini.n.01 | mw:MayHaveProperty | wn:yellow.s.01 | zucchini squash | yellow | may have property | NaN | VG | NaN |
6773220 | wn:zucchini.n.01-mw:MayHaveProperty-wn:yellow.... | wn:zucchini.n.01 | mw:MayHaveProperty | wn:yellow.s.01 | zucchini | yellow | may have property | NaN | VG | NaN |
6773221 | wn:zucchini.n.01-mw:MayHaveProperty-wn:yellow.... | wn:zucchini.n.01 | mw:MayHaveProperty | wn:yellow.s.01 | zucchini | yellow | may have property | NaN | VG | NaN |
6773222 | wn:zwieback.n.01-/r/LocatedNear-wn:elephant.n.... | wn:zwieback.n.01 | /r/LocatedNear | wn:elephant.n.01 | rusk | elephant | of | NaN | VG | NaN |
6773223 rows × 10 columns
As a second attempt, we add mappings between nodes and use them to deduplicate the graph:
%%time
## Concatenate the four imported sources together with the two mapping files
## ($mapping_lex and $mapping_cnfn), sort the edges by node1/relation/node2,
## and compact the rows keyed on node1, relation and node2 into
## $TEMP/kgtk_compact_quoted.tsv
!kgtk cat -i $TEMP/kgtk_atomic.tsv $TEMP/kgtk_conceptnet.tsv $TEMP/kgtk_framenet.tsv $TEMP/kgtk_visualgenome.tsv \
$mapping_lex $mapping_cnfn \
/ sort -c 'node1,relation,node2' \
/ compact --columns node1 relation node2 -o $TEMP/kgtk_compact_quoted.tsv
CPU times: user 1.57 s, sys: 506 ms, total: 2.08 s Wall time: 2min 26s
!head $TEMP/kgtk_compact_quoted.tsv
node1 relation node2 node1;label node2;label relation;label relation;dimension source sentence id /c/en/0 /r/DefinedAs /c/en/empty_set "0" "empty set" "defined as" "CN" "[[0]] is the [[empty set]]." /c/en/0 /r/DefinedAs /c/en/first_limit_ordinal "0" "first limit ordinal" "defined as" "CN" "[[0]] is the [[first limit ordinal]]." /c/en/0 /r/DefinedAs /c/en/number_zero "0" "number zero" "defined as" "CN" "[[0]] is the [[number zero]]." /c/en/0 /r/HasContext /c/en/internet_slang "0" "internet slang" "has context" "CN" /c/en/0 /r/HasProperty /c/en/pronounced_zero "0" "pronounced zero" "has property" "CN" "[[\"0\"]] is [[pronounced zero]]" /c/en/0 /r/IsA /c/en/set_containing_one_element "0" "set containing one element" "is a" "CN" "[[{0}]] is a type of [[set containing one element]]." /c/en/0 /r/RelatedTo /c/en/1 "0" "1" "related to" "CN" /c/en/0 /r/RelatedTo /c/en/2 "0" "2" "related to" "CN" /c/en/0.22_inch_calibre /r/IsA /c/en/5.6_millimetres "0.22 inch calibre" "5.6 millimetres" "is a" "CN" "[[0.22 inch calibre]] is [[5.6 millimetres]]"
%%time
# Deduplicate nodes using the mw:SameAs mapping edges:
#   1. connected_components: cluster the nodes linked by mw:SameAs edges,
#      naming each cluster with the "lowest" method.
#   2. lift: rewrite node1/node2 of the compacted graph using the cluster
#      edges piped in as the label file (the lift suffix is empty, so
#      presumably the columns are overwritten in place -- confirm against
#      the `kgtk lift` documentation).
#   3. filter --invert: drop the mw:SameAs edges themselves.
#   4. compact again (the input is declared as not presorted), add edge ids,
#      move the id column first, and write the final graph to $OUT/cskg.tsv.gz.
!kgtk connected_components -i $TEMP/kgtk_compact_quoted.tsv \
--properties mw:SameAs --cluster-name-method lowest \
/ lift --columns-to-lift node1 node2 --lift-suffix= \
--input-file $TEMP/kgtk_compact_quoted.tsv --label-file - \
--label-select-value connected_component \
/ filter --invert -p ';mw:SameAs;' \
/ compact --columns node1 relation node2 --presorted False \
/ add_id --id-style node1-label-node2-num / \
reorder_columns --columns id ... -o $OUT/cskg.tsv.gz
CPU times: user 1.82 s, sys: 590 ms, total: 2.41 s Wall time: 2min 30s
We first compute statistics of the graph:
%%time
# Compute degree, HITS and PageRank statistics for the final graph. The
# human-readable summary is written to $TEMP/summary.txt (--log) and the
# per-node values to $TEMP/statistics.tsv.
!kgtk graph_statistics -i $OUT/cskg.tsv.gz \
--degrees --hits --pagerank --statistics-only \
--log $TEMP/summary.txt -o $TEMP/statistics.tsv
objc[24619]: Class GNotificationCenterDelegate is implemented in both /Users/filipilievski/opt/anaconda3/envs/scenegen/lib/libgio-2.0.0.dylib (0x12bdd09b0) and /usr/local/opt/glib/lib/libgio-2.0.0.dylib (0x14f2f22f8). One of the two will be used. Which one is undefined. CPU times: user 1.41 s, sys: 478 ms, total: 1.89 s Wall time: 2min 1s
Let's see the summarized information about the graph nodes, relations, and centrality metrics.
!cat $TEMP/summary.txt
graph loaded! It has 2102795 nodes and 4481705 edges ###Top relations: /r/RelatedTo 1703951 /r/FormOf 378859 /r/DerivedFrom 325374 /r/HasContext 232935 /r/IsA 231424 /r/Synonym 222155 /r/LocatedNear 157204 at:xAttr 133281 at:xWant 129171 at:xEffect 100307 ###Degrees: in degree stats: mean=2.131309, std=0.020009, max=1 out degree stats: mean=2.131309, std=0.007436, max=1 total degree stats: mean=4.262617, std=0.022866, max=1 ###PageRank Max pageranks 14095 /c/en/entity/n/wn 0.004315 13917 /c/en/abstraction/n/wn 0.003083 319950 /c/en/physical_entity/n/wn 0.001992 210 /c/en/organic_compound 0.001859 126760 /c/en/whole/n/wn 0.001584 ###HITS HITS hubs 2091640 wn:white.a.01 0.134129 2091604 wn:black.a.01 0.126603 2091632 wn:red.s.01 0.119914 2091546 wn:small.a.01 0.114502 2091608 wn:brown.s.01 0.114064 HITS auth 2091583 wn:man.n.01 0.114008 2091600 wn:woman.n.01 0.098821 2091726 wn:person.n.01 0.098466 2091532 wn:sign.n.02 0.095097 2091854 wn:tree.n.01 0.086341
!head $TEMP/statistics.tsv
node1 label node2 id /c/en/0 vertex_in_degree 21 /c/en/0-vertex_in_degree-0 /c/en/0 vertex_out_degree 8 /c/en/0-vertex_out_degree-1 /c/en/0 vertex_pagerank 2.1814036156011614e-06 /c/en/0-vertex_pagerank-2 /c/en/0 vertex_hubs 7.957265660837697e-14 /c/en/0-vertex_hubs-3 /c/en/0 vertex_auth 3.18954781666107e-15 /c/en/0-vertex_auth-4 /c/en/empty_set vertex_in_degree 21 /c/en/empty_set-vertex_in_degree-5 /c/en/empty_set vertex_out_degree 5 /c/en/empty_set-vertex_out_degree-6 /c/en/empty_set vertex_pagerank 1.743039400606911e-06 /c/en/empty_set-vertex_pagerank-7 /c/en/empty_set vertex_hubs 3.6071192796944516e-15 /c/en/empty_set-vertex_hubs-8
Let's find paths between some node pairs:
%%bash
cat <<EOF >$TEMP/path-query.tsv
node1 node2 label
/c/en/politician /c/en/lie path
/c/en/politician /c/en/actor path
/c/en/politician /c/en/film path
EOF
%%time
# Search the final graph for paths of at most 2 hops between the node pairs
# listed in $TEMP/path-query.tsv and write the path edges to
# $TEMP/path-results.tsv.
kgtk("""
paths -i $OUT/cskg.tsv.gz
--verbose False
--max_hops 2
--statistics-only True
--path-file $TEMP/path-query.tsv
-o $TEMP/path-results.tsv
""")
objc[24659]: Class GNotificationCenterDelegate is implemented in both /Users/filipilievski/opt/anaconda3/envs/scenegen/lib/libgio-2.0.0.dylib (0x126b339b0) and /usr/local/opt/glib/lib/libgio-2.0.0.dylib (0x14a0792f8). One of the two will be used. Which one is undefined. CPU times: user 19.7 ms, sys: 31.4 ms, total: 51 ms Wall time: 51 s
!head $TEMP/path-results.tsv
node1 label node2 id p0 0 /c/en/politician-/r/Antonym-/c/en/honest-0000 p0-0-0 p0 1 /c/en/honest-/r/Antonym-/c/en/lie-0000 p0-1-1 p1 0 /c/en/politician-/r/Antonym-/c/en/honest-0000 p1-0-2 p1 1 /c/en/honest-/r/DistinctFrom-/c/en/lie-0000 p1-1-3 p2 0 /c/en/politician-/r/CapableOf-/c/en/lie-0000 p2-0-4 p3 0 /c/en/politician-/r/IsA-/c/en/human-0000 p3-0-5 p3 1 /c/en/human-/r/AtLocation-/c/en/lie-0000 p3-1-6 p4 0 /c/en/politician-/r/RelatedTo-/c/en/liar-0000 p4-0-7 p4 1 /c/en/liar-/r/EtymologicallyRelatedTo-/c/en/lie-0000 p4-1-8
The next command uses a query to retrieve node1, label, and node2 for each edge in the paths. The query returns the label columns of the nodes and the relation (rather than their identifiers) to help us interpret the results.
# Join the path results with the graph: each row of path-results.tsv links a
# path (node1) to the id of a graph edge (node2), and the second match clause
# looks that edge up in cskg.tsv.gz by its id. The labels of both nodes and of
# the relation are returned, ordered by path and segment.
kgtk("""
query -i $OUT/cskg.tsv.gz -i $TEMP/path-results.tsv
--match '
path: (path)-[segment]->(edge),
cskg: (n1)-[edge {relation: property}]->(n2)'
--return 'n1.label as node1, property.label as label, n2.label as node2, path as path, segment as segment'
--order-by 'path, segment'
""")
node1 | label | node2 | path | segment | |
---|---|---|---|---|---|
0 | politician | antonym | honest | p0 | p0-0-0 |
1 | honest | antonym | lie | p0 | p0-1-1 |
2 | politician | antonym | honest | p1 | p1-0-2 |
3 | honest | distinct from | lie | p1 | p1-1-3 |
4 | politician | capable of | lie | p2 | p2-0-4 |
5 | politician | is a | human | p3 | p3-0-5 |
6 | human | at location | lie | p3 | p3-1-6 |
7 | politician | related to | liar | p4 | p4-0-7 |
8 | liar | etymologically related to | lie | p4 | p4-1-8 |
9 | politician | related to | lying | p5 | p5-0-9 |
10 | lying | etymologically related to | lie | p5 | p5-1-10 |