Datasets¶

A Dataset is a specialization of Resource designed to register (upload) files together with their metadata.

Note: the commented-out lines rely on features that are not implemented in the Demo Store.

In [1]:

from kgforge.core import KnowledgeGraphForge

In [2]:

# Instantiate the forge from the demo YAML configuration. The cells below pass
# this instance to Dataset(...) and call its attach(), from_dataframe() and
# register() methods.
forge = KnowledgeGraphForge("../../configurations/demo-forge.yml")

Imports¶

In [3]:

from kgforge.core import Resource

In [4]:

from kgforge.specializations.resources import Dataset

In [5]:

import pandas as pd

Creation with files¶

In [6]:

! ls -p ../../data | egrep -v /$

associations.tsv
persons.csv

In [7]:

# A plain Resource describing a person; it is used below as the contributor
# of both the "persons" and the "associations" datasets.
jane = Resource(type="Person", name="Jane Doe")

In [8]:

# A Dataset receives the forge instance as first argument, followed by its
# properties (here only a name).
persons = Dataset(forge, name="Interesting Persons")

In [9]:

# Attach a file to the dataset. The printed dataset below shows it under
# `hasPart` as a LazyAction wrapping Store.upload, so presumably nothing is
# uploaded until the dataset is registered -- TODO confirm.
persons.add_files("../../data/persons.csv")

In [10]:

# Record jane as a contributor: the printed dataset below shows a
# `contribution` of type Contribution whose agent wraps the Person resource.
persons.add_contribution(jane)

In [11]:

# forge.register(persons)

In [12]:

# print() renders the dataset as a nested, human-readable property tree.
print(persons)

{
    type: Dataset
    contribution:
    {
        type: Contribution
        agent:
        {
            id:
            {
                type: Person
                name: Jane Doe
            }
            type: Agent
        }
    }
    hasPart: LazyAction(operation=Store.upload, args=['../../data/persons.csv'])
    name: Interesting Persons
}

In [13]:

# Second dataset, built the same way: forge instance first, then properties.
associations = Dataset(forge, name="Associations data")

In [14]:

# Attach the TSV file; the printed dataset below lists it under `hasPart` as
# a LazyAction wrapping Store.upload.
associations.add_files("../../data/associations.tsv")

In [15]:

# associations.add_derivation(persons)

In [16]:

# Reuse the same `jane` Resource as contributor of this dataset.
associations.add_contribution(jane)

In [17]:

# forge.register(associations)

In [18]:

# Show the resulting dataset: contribution, pending file upload and name.
print(associations)

{
    type: Dataset
    contribution:
    {
        type: Contribution
        agent:
        {
            id:
            {
                type: Person
                name: Jane Doe
            }
            type: Agent
        }
    }
    hasPart: LazyAction(operation=Store.upload, args=['../../data/associations.tsv'])
    name: Associations data
}

In [19]:

# associations.download("files", "./downloaded/")

In [20]:

# ! ls ./downloaded

In [21]:

# ! rm -R ./downloaded

Creation with resources¶

In [22]:

# forge.attach() takes a file path; its return value is used below as the
# `distribution` of a Resource (presumably a deferred upload -- TODO confirm).
distribution_1 = forge.attach("../../data/associations.tsv")

In [23]:

# Second attachment, used below as the `distribution` of the `john` Resource.
distribution_2 = forge.attach("../../data/persons.csv")

In [24]:

# NOTE: rebinds `jane` (created earlier without a distribution) to a new
# Resource that carries the attached associations file.
jane = Resource(type="Person", name="Jane Doe", distribution=distribution_1)

In [25]:

# Second person Resource, carrying the attached persons file.
john = Resource(type="Person", name="John Smith", distribution=distribution_2)

In [26]:

# NOTE: rebinds `persons`, which held a Dataset in the previous section, to a
# plain list of Resources.
persons = [jane, john]

In [27]:

# forge.register(persons)

In [28]:

# Dataset meant to collect the two persons as parts; the add_parts / register
# / download calls that follow are commented out in this notebook.
dataset = Dataset(forge, name="Interesting people")

In [29]:

# dataset.add_parts(persons)

In [30]:

# print(dataset)

In [31]:

# forge.register(dataset)

In [32]:

# dataset.download("parts", "./downloaded/")

In [33]:

# ! ls ./downloaded

Creation from a dataframe¶

See the notebook `DataFrame IO.ipynb` for details on converting a pandas DataFrame into instances of Resource.

basics¶

In [34]:

# Load the persons table (comma-separated, pandas defaults).
dataframe = pd.read_csv("../../data/persons.csv")

In [35]:

# Display the frame: two rows with the columns type, name and distribution
# (a file path).
dataframe

Out[35]:

	type	name	distribution
0	Person	Marie Curie	../../data/scientists-database/marie_curie.txt
1	Person	Albert Einstein	../../data/scientists-database/albert_einstein...

In [36]:

# Convert the DataFrame rows into Resources (two here, per the register
# output below). NOTE: rebinds `persons` once more.
persons = forge.from_dataframe(dataframe)

In [37]:

# Register the resources in the store. The parts printed further down carry
# ids with a `_version=1` suffix, presumably assigned by this registration.
forge.register(persons)

<count> 2
<action> _register_one
<succeeded> True

In [38]:

# Fresh Dataset that will hold the resources created from the DataFrame.
dataset = Dataset(forge, name="Interesting people")

In [39]:

# Add the registered resources as parts; the printed dataset below shows them
# as a list under `hasPart`.
dataset.add_parts(persons)

In [40]:

# Show the dataset together with its parts.
print(dataset)

{
    type: Dataset
    hasPart:
    [
        {
            id: edbc5599-cd7c-4825-b97e-97da45bc0b8c_version=1
            type: Person
            distribution: ../../data/scientists-database/marie_curie.txt
            name: Marie Curie
        }
        {
            id: 6571c60b-e0a2-4d80-846e-0f116a44921a_version=1
            type: Person
            distribution: ../../data/scientists-database/albert_einstein.txt
            name: Albert Einstein
        }
    ]
    name: Interesting people
}

In [41]:

# Register the dataset itself; the output below reports one successful
# _register_one action.
forge.register(dataset)

<action> _register_one
<succeeded> True

advanced¶

In [42]:

# The associations file is tab-separated, hence sep="\t".
# NOTE: rebinds `dataframe` from the persons table to the associations table.
dataframe = pd.read_csv("../../data/associations.tsv", sep="\t")

In [43]:

# Display the frame. Column names such as agent__gender__id use "__" as a
# separator, and the id of the second row holds the placeholder "(missing)".
dataframe

Out[43]:

	id	name	type	agent__type	agent__name	agent__gender__id	agent__gender__type	agent__gender__label	distribution
0	https://kg.example.ch/associations/123	Curie Association	Association	Person	Marie Curie	http://purl.obolibrary.org/obo/PATO_0000383	LabeledOntologyEntity	female	../../data/scientists-database/marie_curie.txt
1	(missing)	Einstein Association	Association	Person	Albert Einstein	http://purl.obolibrary.org/obo/PATO_0000384	LabeledOntologyEntity	male	../../data/scientists-database/albert_einstein...

In [44]:

# Replace each file path in the "distribution" column with the value returned
# by forge.attach(). The bound method is passed directly because wrapping it
# in a lambda adds nothing.
# NOTE(review): this overwrites the column in place, so re-running this cell
# without re-running the read_csv cell above would call forge.attach() on
# values that are no longer file paths.
dataframe["distribution"] = dataframe["distribution"].map(forge.attach)

In [45]:

# Convert the rows into Resources. `na` is set to the "(missing)" placeholder
# seen in the id column and `nesting` to the "__" separator used in the column
# names -- presumably so that placeholders are skipped and columns such as
# agent__gender__id become nested properties (TODO confirm).
# NOTE: rebinds `associations`, which held a Dataset in the first section.
associations = forge.from_dataframe(dataframe, na="(missing)", nesting="__")

In [46]:

# forge.register(associations)

In [47]:

# dataset = Dataset(forge, name="Interesting associations")

In [48]:

# dataset.add_parts(associations)

In [49]:

# print(dataset)

In [50]:

# forge.register(dataset)

In [ ]: