#!/usr/bin/env python
# coding: utf-8

# # Datasets
#
# A [Dataset](https://nexus-forge.readthedocs.io/en/latest/interaction.html#dataset) is a specialization of a
# [Resource](https://nexus-forge.readthedocs.io/en/latest/interaction.html#resource) that provides users with
# operations to handle files, record their provenance and describe them with metadata.

# In[1]:

from kgforge.core import KnowledgeGraphForge

# A configuration file is needed in order to create a KnowledgeGraphForge session.
# A configuration can be generated using the notebook [00-Initialization.ipynb](00%20-%20Initialization.ipynb).
#
# Note: DemoStore doesn't implement file operations yet. Use the BlueBrainNexus store instead
# when creating a config file.

# In[58]:

forge = KnowledgeGraphForge("../../configurations/forge.yml")

# ## Imports

# In[3]:

from kgforge.core import Resource

# In[4]:

from kgforge.specializations.resources import Dataset

# In[5]:

import pandas as pd

# ## Creation with files added as parts

# In[6]:

# List the sample data files shipped with the tutorial (directories excluded).
get_ipython().system(' ls -p ../../data | egrep -v /$')

# In[7]:

persons = Dataset(forge, name="Interesting Persons")

# In[8]:

persons.add_files("../../data/persons.csv")

# In[9]:

forge.register(persons)

# In[10]:

forge.as_json(persons)

# In[11]:

associations = Dataset(forge, name="Associations data")

# In[12]:

associations.add_files("../../data/associations.tsv")

# In[13]:

# Record that the associations dataset was derived from the persons dataset (provenance).
associations.add_derivation(persons)

# In[14]:

forge.register(associations)

# In[15]:

forge.as_json(associations)

# In[ ]:

# By default the files are downloaded in the current path (path="."). The urls or the files to download can be
# collected from a different json path (by setting a value for "follow") and the files downloaded to a different
# path (by setting a value for "path").
# The argument overwrite: bool can be provided to decide whether to overwrite (True) existing files with the
# same name or to create new ones (False) with their names suffixed with a timestamp.
# A cross_bucket argument can be provided to download data from the configured bucket
# (cross_bucket=False - the default value) or from a bucket different than the configured one
# (cross_bucket=True). The configured store should support crossing buckets for this to work.
associations.download(source="parts")

# In[19]:

# A specific path can be provided.
associations.download(path="./downloaded/", source="parts")

# In[ ]:

# A specific content type can be downloaded.
associations.download(path="./downloaded/", source="parts", content_type="text/tab-separated-values")

# In[20]:

get_ipython().system(' ls -l ./downloaded')

# In[18]:

# ! rm -R ./downloaded/

# ## Creation with files added as distribution

# In[59]:

persons = Dataset(forge, name="Interesting Persons")

# In[60]:

persons.add_distribution("../../data/associations.tsv")

# In[61]:

forge.register(persons)

# In[62]:

forge.as_json(persons)

# In[ ]:

# When files are added as distributions, they can be directly downloaded without specifying which
# json path to use to collect the downloadable urls. In addition, content type and path arguments
# can still be provided.
persons.download()

# ## Creation with resources added as parts

# In[21]:

distribution_1 = forge.attach("../../data/associations.tsv")

# In[22]:

distribution_2 = forge.attach("../../data/persons.csv")

# In[23]:

jane = Resource(type="Person", name="Jane Doe", distribution=distribution_1)

# In[24]:

john = Resource(type="Person", name="John Smith", distribution=distribution_2)

# In[25]:

persons = [jane, john]

# In[26]:

forge.register(persons)

# In[27]:

dataset = Dataset(forge, name="Interesting people")

# In[28]:

# Link the registered person resources as parts (hasPart) of the dataset.
dataset.add_parts(persons)

# In[29]:

forge.register(dataset)

# In[30]:

forge.as_json(dataset)

# In[34]:

dataset.download(path="./downloaded/", source="parts")

# In[35]:

get_ipython().system(' ls -l ./downloaded')

# In[31]:

# !
# rm -R ./downloaded/

# ### Creation from resources converted as Dataset objects

# In[63]:

# Convert already-registered Resource objects into Dataset objects, keeping their store metadata.
dataset = Dataset.from_resource(forge, [jane, john], store_metadata=True)
print(*dataset, sep="\n")

# ## Creation from a dataframe
#
# See notebook `07 DataFrame IO.ipynb` for details on conversions of instances of Resource
# from a Pandas DataFrame.

# ### basics

# In[37]:

dataframe = pd.read_csv("../../data/persons.csv")

# In[38]:

dataframe

# In[39]:

persons = forge.from_dataframe(dataframe)

# In[40]:

forge.register(persons)

# In[41]:

dataset = Dataset(forge, name="Interesting people")

# In[42]:

dataset.add_parts(persons)

# In[43]:

forge.register(dataset)

# In[44]:

forge.as_json(dataset)

# ### advanced

# In[45]:

dataframe = pd.read_csv("../../data/associations.tsv", sep="\t")

# In[46]:

dataframe

# In[47]:

# Replace each file path in the "distribution" column with a LazyAction that will upload the file
# at registration time.
dataframe["distribution"] = dataframe["distribution"].map(lambda x: forge.attach(x))

# In[48]:

# na: value to interpret as missing; nesting: separator used to rebuild nested properties
# from flattened column names.
associations = forge.from_dataframe(dataframe, na="(missing)", nesting="__")

# In[49]:

print(*associations, sep="\n")

# In[50]:

forge.register(associations)

# In[51]:

dataset = Dataset(forge, name="Interesting associations")

# In[52]:

dataset.add_parts(associations)

# In[53]:

forge.register(dataset)

# In[54]:

forge.as_json(dataset)