#!/usr/bin/env python
# coding: utf-8

# # Datasets
#
# A [Dataset](https://nexus-forge.readthedocs.io/en/latest/interaction.html#dataset) is a specialization of a
# [Resource](https://nexus-forge.readthedocs.io/en/latest/interaction.html#resource) that provides users with
# operations to handle files, record their provenance and describe them with metadata.

# In[1]:

from kgforge.core import KnowledgeGraphForge

# A configuration file is needed in order to create a KnowledgeGraphForge session.
# A configuration can be generated using the notebook [00-Initialization.ipynb](00%20-%20Initialization.ipynb).
#
# Note: DemoStore doesn't implement file operations yet. Use the BlueBrainNexus store instead
# when creating a config file.

# In[58]:

forge = KnowledgeGraphForge("../../configurations/forge.yml")

# ## Imports

# In[3]:

from kgforge.core import Resource

# In[4]:

from kgforge.specializations.resources import Dataset

# In[5]:

import pandas as pd

# ## Creation with files added as parts

# In[6]:

# List the sample data files shipped with the tutorial (directories excluded).
get_ipython().system(' ls -p ../../data | egrep -v /$')

# In[7]:

persons = Dataset(forge, name="Interesting Persons")

# In[8]:

persons.add_files("../../data/persons.csv")

# In[9]:

forge.register(persons)

# In[10]:

forge.as_json(persons)

# In[11]:

associations = Dataset(forge, name="Associations data")

# In[12]:

associations.add_files("../../data/associations.tsv")

# In[13]:

# Record that the associations dataset was derived from the persons dataset (provenance).
associations.add_derivation(persons)

# In[14]:

forge.register(associations)

# In[15]:

forge.as_json(associations)

# In[ ]:

# By default the files are downloaded in the current path (path="."). The urls or the files to download can be
# collected from a different json path (by setting a value for "follow") and the files downloaded to a different
# path (by setting a value for "path").
# The argument overwrite: bool can be provided to decide whether to overwrite (True) existing files with the
# same name or to create new ones (False) with their names suffixed with a timestamp.
# A cross_bucket argument can be provided to download data from the configured bucket
# (cross_bucket=False - the default value) or from a bucket different than the configured one
# (cross_bucket=True). The configured store should support crossing buckets for this to work.
associations.download(source="parts")

# In[19]:

# A specific path can be provided.
associations.download(path="./downloaded/", source="parts")

# In[ ]:

# A specific content type can be downloaded.
associations.download(path="./downloaded/", source="parts", content_type="text/tab-separated-values")

# In[20]:

get_ipython().system(' ls -l ./downloaded')

# In[18]:

# ! rm -R ./downloaded/

# ## Creation with files added as distribution

# In[59]:

persons = Dataset(forge, name="Interesting Persons")

# In[60]:

persons.add_distribution("../../data/associations.tsv")

# In[61]:

forge.register(persons)

# In[62]:

forge.as_json(persons)

# In[ ]:

# When files are added as distributions, they can be directly downloaded without specifying which
# json path to use to collect the downloadable urls. In addition, content type and path arguments
# can still be provided.
persons.download()

# ## Creation with resources added as parts

# In[21]:

distribution_1 = forge.attach("../../data/associations.tsv")

# In[22]:

distribution_2 = forge.attach("../../data/persons.csv")

# In[23]:

jane = Resource(type="Person", name="Jane Doe", distribution=distribution_1)

# In[24]:

john = Resource(type="Person", name="John Smith", distribution=distribution_2)

# In[25]:

persons = [jane, john]

# In[26]:

forge.register(persons)

# In[27]:

dataset = Dataset(forge, name="Interesting people")

# In[28]:

# Link the registered person resources as parts (hasPart) of the dataset.
dataset.add_parts(persons)

# In[29]:

forge.register(dataset)

# In[30]:

forge.as_json(dataset)

# In[34]:

dataset.download(path="./downloaded/", source="parts")

# In[35]:

get_ipython().system(' ls -l ./downloaded')

# In[31]:

# !
# rm -R ./downloaded/

# ### Creation from resources converted as Dataset objects

# In[63]:

# Convert already-registered Resource objects into Dataset objects, keeping their store metadata.
dataset = Dataset.from_resource(forge, [jane, john], store_metadata=True)
print(*dataset, sep="\n")

# ## Creation from a dataframe
#
# See notebook `07 DataFrame IO.ipynb` for details on conversions of instances of Resource
# from a Pandas DataFrame.

# ### basics

# In[37]:

dataframe = pd.read_csv("../../data/persons.csv")

# In[38]:

dataframe

# In[39]:

persons = forge.from_dataframe(dataframe)

# In[40]:

forge.register(persons)

# In[41]:

dataset = Dataset(forge, name="Interesting people")

# In[42]:

dataset.add_parts(persons)

# In[43]:

forge.register(dataset)

# In[44]:

forge.as_json(dataset)

# ### advanced

# In[45]:

dataframe = pd.read_csv("../../data/associations.tsv", sep="\t")

# In[46]:

dataframe

# In[47]:

# Replace each file path in the "distribution" column with a LazyAction that will upload the file
# at registration time.
dataframe["distribution"] = dataframe["distribution"].map(lambda x: forge.attach(x))

# In[48]:

# na: value to interpret as missing; nesting: separator used to rebuild nested properties
# from flattened column names.
associations = forge.from_dataframe(dataframe, na="(missing)", nesting="__")

# In[49]:

print(*associations, sep="\n")

# In[50]:

forge.register(associations)

# In[51]:

dataset = Dataset(forge, name="Interesting associations")

# In[52]:

dataset.add_parts(associations)

# In[53]:

forge.register(dataset)

# In[54]:

forge.as_json(dataset)