from kgforge.core import KnowledgeGraphForge
A configuration file is needed in order to create a KnowledgeGraphForge session. A configuration can be generated using the notebook 00-Initialization.ipynb.
Note: DemoStore doesn't implement file operations yet. Use the BlueBrainNexus store instead when creating a config file.
forge = KnowledgeGraphForge("../../configurations/forge.yml")
from kgforge.core import Resource
from kgforge.specializations.resources import Dataset
import pandas as pd
! ls -p ../../data | egrep -v /$
associations.tsv my_data.xwz my_data_derived.txt persons-with-id.csv persons.csv tfidfvectorizer_model_schemaorg_linking
# Create a Dataset named "Interesting Persons", add persons.csv to it as a file part
# (it shows up under 'hasPart' in the JSON output below), then register it in the store.
persons = Dataset(forge, name="Interesting Persons")
persons.add_files("../../data/persons.csv")
forge.register(persons)
<action> _register_one <succeeded> True
forge.as_json(persons)
{'id': 'https://bbp.epfl.ch/nexus/v1/resources/dke/kgforge/_/980a7cd9-36ef-4fc9-95b8-cbf622b49fd8', 'type': 'Dataset', 'hasPart': {'distribution': {'type': 'DataDownload', 'atLocation': {'type': 'Location', 'store': {'id': 'https://bluebrain.github.io/nexus/vocabulary/diskStorageDefault', 'type': 'DiskStorage', '_rev': 1}}, 'contentSize': {'unitCode': 'bytes', 'value': 52}, 'contentUrl': 'https://bbp.epfl.ch/nexus/v1/files/dke/kgforge/2737f7f0-950a-471d-ae60-80b79c7451bd', 'digest': {'algorithm': 'SHA-256', 'value': '1dacd765946963fda4949753659089c5f532714b418d30788bedddfec47a389f'}, 'encodingFormat': 'text/csv', 'name': 'persons.csv'}}, 'name': 'Interesting Persons'}
# Create a second Dataset holding associations.tsv as a file part and record that it was
# derived from the already registered `persons` dataset (see 'derivation' in the JSON output below).
associations = Dataset(forge, name="Associations data")
associations.add_files("../../data/associations.tsv")
associations.add_derivation(persons)
forge.register(associations)
<action> _register_one <succeeded> True
forge.as_json(associations)
{'id': 'https://bbp.epfl.ch/nexus/v1/resources/dke/kgforge/_/80bd2bcb-b84f-4418-9d2e-42712a59fbfb', 'type': 'Dataset', 'derivation': {'type': 'Derivation', 'entity': {'id': 'https://bbp.epfl.ch/nexus/v1/resources/dke/kgforge/_/980a7cd9-36ef-4fc9-95b8-cbf622b49fd8?rev=1', 'type': 'Dataset', 'name': 'Interesting Persons'}}, 'hasPart': {'distribution': {'type': 'DataDownload', 'atLocation': {'type': 'Location', 'store': {'id': 'https://bluebrain.github.io/nexus/vocabulary/diskStorageDefault', 'type': 'DiskStorage', '_rev': 1}}, 'contentSize': {'unitCode': 'bytes', 'value': 477}, 'contentUrl': 'https://bbp.epfl.ch/nexus/v1/files/dke/kgforge/d6b03d5b-4007-48d7-9432-5bf302c62999', 'digest': {'algorithm': 'SHA-256', 'value': '789aa07948683fe036ac29811814a826b703b562f7d168eb70dee1fabde26859'}, 'encodingFormat': 'text/tab-separated-values', 'name': 'associations.tsv'}}, 'name': 'Associations data'}
# By default, files are downloaded to the current directory (path="."). The URLs of the files to download can be collected from a
# different JSON path (the calls below select it with "source", e.g. source="parts"), and the files can be written to a different
# directory (by setting a value for "path").
# NOTE(review): this comment previously named the argument "follow"; the calls below use "source" - confirm against the Dataset.download signature.
# The boolean "overwrite" argument decides whether existing files with the same name are overwritten (True) or
# new files are created with a timestamp suffix appended to their names (False).
# A "cross_bucket" argument can be provided to download data from the configured bucket (cross_bucket=False, the default value)
# or from a bucket other than the configured one (cross_bucket=True). The configured store must support crossing buckets for this to work.
associations.download(source="parts")
# A specific path can be provided.
associations.download(path="./downloaded/", source="parts")
# A specific content type can be downloaded.
associations.download(path="./downloaded/", source="parts", content_type="text/tab-separated-values")
! ls -l ./downloaded
total 8 -rw-r--r-- 1 mfsy staff 477 Apr 12 17:13 associations.tsv
# ! rm -R ./downloaded/
# Here the file is added as a distribution of the dataset itself rather than as a part:
# the JSON output below has a top-level 'distribution' key instead of 'hasPart'.
# NOTE(review): the dataset is named "Interesting Persons" but the attached file is associations.tsv - confirm this is intended.
persons = Dataset(forge, name="Interesting Persons")
persons.add_distribution("../../data/associations.tsv")
forge.register(persons)
<action> _register_one <succeeded> True
forge.as_json(persons)
{'id': 'https://bbp.epfl.ch/nexus/v1/resources/dke/kgforge/_/3579d0f7-dbf4-40be-90e5-cd641704dfb3', 'type': 'Dataset', 'distribution': {'type': 'DataDownload', 'atLocation': {'type': 'Location', 'store': {'id': 'https://bluebrain.github.io/nexus/vocabulary/diskStorageDefault', 'type': 'DiskStorage', '_rev': 1}}, 'contentSize': {'unitCode': 'bytes', 'value': 477}, 'contentUrl': 'https://bbp.epfl.ch/nexus/v1/files/dke/kgforge/dbb6814c-ef9c-4320-b8a0-bf8190dd510a', 'digest': {'algorithm': 'SHA-256', 'value': '789aa07948683fe036ac29811814a826b703b562f7d168eb70dee1fabde26859'}, 'encodingFormat': 'text/tab-separated-values', 'name': 'associations.tsv'}, 'name': 'Interesting Persons'}
# When files are added as distributions, they can be downloaded directly, without specifying which JSON path to use
# to collect the downloadable URLs. The content type and path arguments can still be provided.
persons.download()
# Attach one local file to each Person resource through its `distribution` property.
distribution_1 = forge.attach("../../data/associations.tsv")
distribution_2 = forge.attach("../../data/persons.csv")
jane = Resource(type="Person", name="Jane Doe", distribution=distribution_1)
john = Resource(type="Person", name="John Smith", distribution=distribution_2)
# `persons` is rebound here: it was a Dataset above and is now a plain list of Resources.
persons = [jane, john]
# Registering a list registers all of its resources in one call (reported as _register_many below).
forge.register(persons)
<count> 2 <action> _register_many <succeeded> True
# Build a Dataset whose parts are the registered Person resources; in the JSON output below each
# entry of 'hasPart' carries the part's id (with its revision), type, name and distribution contentUrl.
dataset = Dataset(forge, name="Interesting people")
dataset.add_parts(persons)
forge.register(dataset)
<action> _register_one <succeeded> True
forge.as_json(dataset)
{'id': 'https://bbp.epfl.ch/nexus/v1/resources/dke/kgforge/_/46d34055-c662-4a7d-90f4-2c866f89cf57', 'type': 'Dataset', 'hasPart': [{'id': 'https://bbp.epfl.ch/nexus/v1/resources/dke/kgforge/_/0a2041a9-12f7-49aa-b302-48bb09450832?rev=1', 'type': 'Person', 'distribution': {'contentUrl': 'https://bbp.epfl.ch/nexus/v1/files/dke/kgforge/901d4b2e-2b67-4504-aca7-3ab93966dbad'}, 'name': 'Jane Doe'}, {'id': 'https://bbp.epfl.ch/nexus/v1/resources/dke/kgforge/_/87652108-38ca-4fe2-8e41-9ba5ef22f32b?rev=1', 'type': 'Person', 'distribution': {'contentUrl': 'https://bbp.epfl.ch/nexus/v1/files/dke/kgforge/86eb143e-89cb-462d-af0e-48afb7172f2d'}, 'name': 'John Smith'}], 'name': 'Interesting people'}
# Download the files attached to the dataset's parts into ./downloaded/ and list that directory.
dataset.download(path="./downloaded/", source="parts")
! ls -l ./downloaded
total 32 -rw-r--r-- 1 mfsy staff 477 Apr 12 17:14 associations.tsv -rw-r--r-- 1 mfsy staff 477 Apr 12 17:14 associations.tsv.20220412171438 -rw-r--r-- 1 mfsy staff 52 Apr 12 17:14 persons.csv -rw-r--r-- 1 mfsy staff 52 Apr 12 17:14 persons.csv.20220412171438
# ! rm -R ./downloaded/
# Build dataset objects from the existing Person resources. A list is passed in, and the result is
# unpacked by print() so that each item is shown separately.
# NOTE(review): presumably store_metadata=True carries over the store metadata of each resource - confirm.
dataset = Dataset.from_resource(forge, [jane, john], store_metadata=True)
print(*dataset, sep="\n")
{ id: https://bbp.epfl.ch/nexus/v1/resources/dke/kgforge/_/0a2041a9-12f7-49aa-b302-48bb09450832 type: Person distribution: { type: DataDownload atLocation: { type: Location store: { id: https://bluebrain.github.io/nexus/vocabulary/diskStorageDefault type: DiskStorage _rev: 1 } } contentSize: { unitCode: bytes value: 477 } contentUrl: https://bbp.epfl.ch/nexus/v1/files/dke/kgforge/901d4b2e-2b67-4504-aca7-3ab93966dbad digest: { algorithm: SHA-256 value: 789aa07948683fe036ac29811814a826b703b562f7d168eb70dee1fabde26859 } encodingFormat: text/tab-separated-values name: associations.tsv } name: Jane Doe } { id: https://bbp.epfl.ch/nexus/v1/resources/dke/kgforge/_/87652108-38ca-4fe2-8e41-9ba5ef22f32b type: Person distribution: { type: DataDownload atLocation: { type: Location store: { id: https://bluebrain.github.io/nexus/vocabulary/diskStorageDefault type: DiskStorage _rev: 1 } } contentSize: { unitCode: bytes value: 52 } contentUrl: https://bbp.epfl.ch/nexus/v1/files/dke/kgforge/86eb143e-89cb-462d-af0e-48afb7172f2d digest: { algorithm: SHA-256 value: 1dacd765946963fda4949753659089c5f532714b418d30788bedddfec47a389f } encodingFormat: text/csv name: persons.csv } name: John Smith }
See notebook 07 DataFrame IO.ipynb
for details on converting a pandas DataFrame into instances of Resource.
dataframe = pd.read_csv("../../data/persons.csv")
dataframe
type | name | |
---|---|---|
0 | Person | Marie Curie |
1 | Person | Albert Einstein |
# Convert the rows of the DataFrame into resources (two rows here) and register them together.
persons = forge.from_dataframe(dataframe)
forge.register(persons)
<count> 2 <action> _register_many <succeeded> True
# Group the resources created from the DataFrame as parts of a new Dataset and register it.
dataset = Dataset(forge, name="Interesting people")
dataset.add_parts(persons)
forge.register(dataset)
<action> _register_one <succeeded> True
forge.as_json(dataset)
{'id': 'https://bbp.epfl.ch/nexus/v1/resources/dke/kgforge/_/5e1118bc-70b6-4b1d-b8ba-060a6f684230', 'type': 'Dataset', 'hasPart': [{'id': 'https://bbp.epfl.ch/nexus/v1/resources/dke/kgforge/_/58635f85-c6cc-4a7f-bf16-1558a5713080?rev=1', 'type': 'Person', 'name': 'Marie Curie'}, {'id': 'https://bbp.epfl.ch/nexus/v1/resources/dke/kgforge/_/a9c2740b-2ef9-4557-841c-6ab425d35906?rev=1', 'type': 'Person', 'name': 'Albert Einstein'}], 'name': 'Interesting people'}
dataframe = pd.read_csv("../../data/associations.tsv", sep="\t")
dataframe
id | name | type | agent__type | agent__name | agent__gender__id | agent__gender__type | agent__gender__label | distribution | |
---|---|---|---|---|---|---|---|---|---|
0 | (missing) | Curie Association | Association | Person | Marie Curie | http://purl.obolibrary.org/obo/PATO_0000383 | LabeledOntologyEntity | female | ../../data/scientists-database/marie_curie.txt |
1 | (missing) | Einstein Association | Association | Person | Albert Einstein | http://purl.obolibrary.org/obo/PATO_0000384 | LabeledOntologyEntity | male | ../../data/scientists-database/albert_einstein... |
# Replace each file path in the "distribution" column with the result of forge.attach(path)
# (shown as a LazyAction wrapping Store.upload in the printed resources below).
dataframe["distribution"] = dataframe["distribution"].map(forge.attach)
# na="(missing)": cells holding that marker are treated as missing, so the resources below have no `id`.
# nesting="__": column names such as agent__gender__id become nested properties (agent -> gender -> id).
associations = forge.from_dataframe(dataframe, na="(missing)", nesting="__")
print(*associations, sep="\n")
{ type: Association agent: { type: Person gender: { id: http://purl.obolibrary.org/obo/PATO_0000383 type: LabeledOntologyEntity label: female } name: Marie Curie } distribution: LazyAction(operation=Store.upload, args=['../../data/scientists-database/marie_curie.txt', None]) name: Curie Association } { type: Association agent: { type: Person gender: { id: http://purl.obolibrary.org/obo/PATO_0000384 type: LabeledOntologyEntity label: male } name: Albert Einstein } distribution: LazyAction(operation=Store.upload, args=['../../data/scientists-database/albert_einstein.txt', None]) name: Einstein Association }
forge.register(associations)
<count> 2 <action> _register_many <succeeded> True
# Group the registered Association resources as parts of a new Dataset and register it.
dataset = Dataset(forge, name="Interesting associations")
dataset.add_parts(associations)
forge.register(dataset)
<action> _register_one <succeeded> True
forge.as_json(dataset)
{'id': 'https://bbp.epfl.ch/nexus/v1/resources/dke/kgforge/_/ae57773c-9e71-4ec0-85b9-6e5f52a04349', 'type': 'Dataset', 'hasPart': [{'id': 'https://bbp.epfl.ch/nexus/v1/resources/dke/kgforge/_/8fc91342-42f7-4946-8a5d-a21be3448684?rev=1', 'type': 'Association', 'distribution': {'contentUrl': 'https://bbp.epfl.ch/nexus/v1/files/dke/kgforge/5e4e7cb5-707a-4d4a-9682-36e5b993fe40'}, 'name': 'Curie Association'}, {'id': 'https://bbp.epfl.ch/nexus/v1/resources/dke/kgforge/_/d78d842a-89f9-4273-bc70-9666c1f72781?rev=1', 'type': 'Association', 'distribution': {'contentUrl': 'https://bbp.epfl.ch/nexus/v1/files/dke/kgforge/1dc1c1a2-f13c-47d6-9533-92c97bf5a5d6'}, 'name': 'Einstein Association'}], 'name': 'Interesting associations'}