A Dataset is a specialization of a Resource
designed to register (upload) files together with their metadata.
Note: the commented-out lines rely on features that are not implemented in the Demo Store.
from kgforge.core import KnowledgeGraphForge
# Instantiate the forge from the demo YAML configuration; every cell below uses it.
forge = KnowledgeGraphForge("../../configurations/demo-forge.yml")
from kgforge.core import Resource
from kgforge.specializations.resources import Dataset
import pandas as pd
# `ls -p` appends "/" to directory names; the inverted egrep drops those
# entries, so only the plain files in ../../data are listed.
! ls -p ../../data | egrep -v /$
associations.tsv persons.csv
# Person resource used as the contributor of the datasets built in this notebook.
jane = Resource(type="Person", name="Jane Doe")
# Dataset describing the persons file.
persons = Dataset(forge, name="Interesting Persons")
# The printed dataset shows the file under `hasPart` as a
# LazyAction(operation=Store.upload, ...) -- presumably the upload is deferred
# until registration; confirm against the Store implementation.
persons.add_files("../../data/persons.csv")
# Appears in the printed dataset as a Contribution whose agent wraps `jane`.
persons.add_contribution(jane)
# Left commented out; see the note on commented lines at the top of the notebook.
# forge.register(persons)
print(persons)
{ type: Dataset contribution: { type: Contribution agent: { id: { type: Person name: Jane Doe } type: Agent } } hasPart: LazyAction(operation=Store.upload, args=['../../data/persons.csv']) name: Interesting Persons }
# Second dataset, built like the persons one but for the associations file.
associations = Dataset(forge, name="Associations data")
# The printed dataset shows the file under `hasPart` as a
# LazyAction(operation=Store.upload, ...).
associations.add_files("../../data/associations.tsv")
# Left commented out; see the note on commented lines at the top of the notebook.
# associations.add_derivation(persons)
# Reuses the `jane` Person created earlier as the contributor.
associations.add_contribution(jane)
# forge.register(associations)
print(associations)
{ type: Dataset contribution: { type: Contribution agent: { id: { type: Person name: Jane Doe } type: Agent } } hasPart: LazyAction(operation=Store.upload, args=['../../data/associations.tsv']) name: Associations data }
# associations.download("files", "./downloaded/")
# ! ls ./downloaded
# ! rm -R ./downloaded
# The results of forge.attach() are passed as the `distribution` of the two
# Person resources created below.
distribution_1 = forge.attach("../../data/associations.tsv")
distribution_2 = forge.attach("../../data/persons.csv")
# NOTE: `jane` and `persons` are rebound here; earlier cells used `jane` without
# a distribution and `persons` as a Dataset.
jane = Resource(type="Person", name="Jane Doe", distribution=distribution_1)
john = Resource(type="Person", name="John Smith", distribution=distribution_2)
persons = [jane, john]
# Left commented out; see the note on commented lines at the top of the notebook.
# forge.register(persons)
dataset = Dataset(forge, name="Interesting people")
# dataset.add_parts(persons)
# print(dataset)
# forge.register(dataset)
# dataset.download("parts", "./downloaded/")
# ! ls ./downloaded
See the notebook DataFrame IO.ipynb
for details on converting a pandas DataFrame into instances of Resource.
# Load the persons table; the bare `dataframe` expression displays it as the
# cell output.
dataframe = pd.read_csv("../../data/persons.csv")
dataframe
type | name | distribution | |
---|---|---|---|
0 | Person | Marie Curie | ../../data/scientists-database/marie_curie.txt |
1 | Person | Albert Einstein | ../../data/scientists-database/albert_einstein... |
# Build resources from the DataFrame and register them; the output below
# reports a count of 2 with `succeeded` True.
persons = forge.from_dataframe(dataframe)
forge.register(persons)
<count> 2 <action> _register_one <succeeded> True
# Add the registered persons as parts of a new dataset; the printed dataset
# lists them under `hasPart` with their ids.
dataset = Dataset(forge, name="Interesting people")
dataset.add_parts(persons)
print(dataset)
{ type: Dataset hasPart: [ { id: edbc5599-cd7c-4825-b97e-97da45bc0b8c_version=1 type: Person distribution: ../../data/scientists-database/marie_curie.txt name: Marie Curie } { id: 6571c60b-e0a2-4d80-846e-0f116a44921a_version=1 type: Person distribution: ../../data/scientists-database/albert_einstein.txt name: Albert Einstein } ] name: Interesting people }
# Register the dataset itself; its parts were registered in an earlier cell.
forge.register(dataset)
<action> _register_one <succeeded> True
# The associations file is tab-separated, hence sep="\t"; the bare `dataframe`
# expression displays the table as the cell output.
dataframe = pd.read_csv("../../data/associations.tsv", sep="\t")
dataframe
id | name | type | agent__type | agent__name | agent__gender__id | agent__gender__type | agent__gender__label | distribution | |
---|---|---|---|---|---|---|---|---|---|
0 | https://kg.example.ch/associations/123 | Curie Association | Association | Person | Marie Curie | http://purl.obolibrary.org/obo/PATO_0000383 | LabeledOntologyEntity | female | ../../data/scientists-database/marie_curie.txt |
1 | (missing) | Einstein Association | Association | Person | Albert Einstein | http://purl.obolibrary.org/obo/PATO_0000384 | LabeledOntologyEntity | male | ../../data/scientists-database/albert_einstein... |
# Replace each path in the `distribution` column with the result of
# forge.attach() for that path. The bound method is passed directly to map():
# wrapping it in a lambda adds nothing.
dataframe["distribution"] = dataframe["distribution"].map(forge.attach)
# In the table above, "(missing)" marks an absent value and "__" separates the
# levels of nested property names in the column headers (e.g. agent__gender__id).
associations = forge.from_dataframe(dataframe, na="(missing)", nesting="__")
# forge.register(associations)
# dataset = Dataset(forge, name="Interesting associations")
# dataset.add_parts(associations)
# print(dataset)
# forge.register(dataset)