#!/usr/bin/env python
# coding: utf-8

# # Register Datasets within the Blue Brain Knowledge Graph
#
# This notebook presents a step by step approach for registering datasets
# (any resource with files attached), optionally with metadata and provenance,
# in a configured project.

# ## Nexus Forge installation
# Installation instructions can be found at:
# https://nexus-forge.readthedocs.io/en/latest/#installation

# ## Get a token
# The Nexus production deployment (https://bbp.epfl.ch/nexus/web/) can be used
# to login and get a token.
#
# - Step 1: From the opened web page, click on the login button on the right
#   corner and follow the instructions.
#   ![login-ui](https://raw.githubusercontent.com/BlueBrain/nexus-forge/master/examples/notebooks/use-cases/login-ui.png)
#
# - Step 2: At the end you'll see a token button on the right corner.
#   Click on it to copy the token.
#   ![copy-token](https://raw.githubusercontent.com/BlueBrain/nexus-forge/master/examples/notebooks/use-cases/copy-token.png)

# In[ ]:

import getpass

# Prompt for the Nexus access token without echoing it to the screen.
TOKEN = getpass.getpass()

# ## Set the Nexus deployment to work with

# In[11]:

# Use the staging deployment to try and test.
nexus_staging_endpoint = "https://staging.nise.bbp.epfl.ch/nexus/v1"
nexus_endpoint = nexus_staging_endpoint

# ## Set the Nexus project to work with
# ### The project already exists?
# In production the existing BBP projects can be found at
# https://bbp.epfl.ch/nexus/web/admin/bbp and the list of available studios at
# https://bbp.epfl.ch/nexus/web/studios

# In[13]:

ORG = "bbp"
PROJECT = "MyProject"

# ## Create a KnowledgeGraphForge session

# In[14]:

from kgforge.core import KnowledgeGraphForge
from kgforge.core import Resource
from kgforge.specializations.resources import Dataset
import pandas as pd

# A KnowledgeGraphForge session is a python object that exposes all necessary
# functions to register with metadata, search and download datasets.
# A configuration file is needed in order to create a KnowledgeGraphForge
# session; a ready to use configuration file is available at:
# https://raw.githubusercontent.com/BlueBrain/nexus-forge/master/examples/notebooks/use-cases/prod-forge-nexus.yml

# In[ ]:

forge = KnowledgeGraphForge(
    "https://raw.githubusercontent.com/BlueBrain/nexus-forge/master/examples/notebooks/use-cases/prod-forge-nexus.yml",
    endpoint=nexus_endpoint,
    bucket=f"{ORG}/{PROJECT}",
    token=TOKEN,
)

# You are all set up!

# ## Create resources with some metadata
#
# A resource is anything that can be identified and that can have metadata
# associated. The following cell creates two resources, of type Person and
# Agent, each with a name as metadata.
#
# ### Using forge Resource object
# Any 'property=value' can be given here as metadata.

# In[23]:

jane = Resource(type="Person", name="Jane Doe", givenName="Jane", familyName="Doe")
john = Resource(type=["Person", "Agent"], name="John Smith", givenName="John", familyName="Smith")
persons = [jane, john]

# In[25]:

forge.register(persons)

# In[26]:

# A resource can be retrieved by its id.
result = forge.retrieve(id=john.id)

# In[27]:

# Blue Brain Nexus has automatically generated ids (id property) for the
# resources. The generated ids are unique in the selected project.
forge.as_json(result)

# In[28]:

# Add store_metadata=True to see extra metadata added by Blue Brain Nexus
# (e.g. _rev, _createdBy, _updatedAt, _deprecated, ...).
forge.as_json(result, store_metadata=True)

# ### Using pandas dataframe

# In[35]:

# Each person has a file attached.
# We'll use forge.attach to be able to register the files in the knowledge graph.
scientists_df = pd.read_csv("../../data/persons-with-id.csv")
scientists_df

# In[36]:

# Resources can be created from a pandas dataframe.
scientists = forge.from_dataframe(scientists_df)

# In[37]:

forge.as_json(scientists[0])

# In[38]:

# Registering an already existing resource in a given project will throw a
# 'RegistrationError: resource already exists' error.
# forge.retrieve(id=...) can be used to fetch the registered resource, as
# shown in the next cell.
forge.register(scientists)

# In[39]:

scientists = []
Marie_Curie = forge.retrieve(id="https://www.wikidata.org/wiki/Q7186")  # retrieve the Marie Curie resource
Albert_Einstein = forge.retrieve(id="https://www.wikidata.org/wiki/Q937")  # retrieve the Albert Einstein resource
scientists.append(Marie_Curie)
scientists.append(Albert_Einstein)

# In[40]:

# Note that the Blue Brain Nexus Store has kept the provided id.
forge.as_json(Marie_Curie)

# See the notebook "07 - DataFrame IO.ipynb" in the nexus-forge getting-started
# examples for more details on converting a pandas DataFrame to forge
# Resources and the other way around.

# Even though any type can be provided for a Resource, there is a set of
# available types that can be obtained programmatically with the following
# command or by looking at the schemas doc:
# https://bbp-nexus.epfl.ch/datamodels/entities-az.html
# (Note these schemas may change in the future.)

# In[ ]:

forge.types()

# ## Create a dataset from files
#
# This use case is about registering files with metadata in the knowledge
# graph. A specific type of Resource, called Dataset, will be used. Since
# Dataset is also a Resource, everything that applies to Resource also
# applies to Dataset.
# In[42]:

# Let's list the files that will be used and capture the start time for this part.
import time

# FIX: use ISO 8601 so the value is a valid xsd:dateTime literal for the PROV
# startedAtTime property (the original compact "%Y%m%d%H%M%S" format is not).
startedAtTime = time.strftime("%Y-%m-%dT%H:%M:%S")

get_ipython().system(' ls -p ../../data | egrep -v /$')

# Any 'property=value' can be given here as metadata. We recommend to use
# properties from the Dataset schema:
# https://bbp-nexus.epfl.ch/datamodels/class-schemadataset.html
# A Dataset is a Resource with a distribution property
# (https://schema.org/distribution) to account for where the data (files)
# are stored and where they can be accessed.

# In[43]:

# The file content type can be provided by setting the content_type argument
# of forge.attach; it is omitted here since ".xwz" has no standard MIME type.
my_data_distribution = forge.attach("../../data/my_data.xwz")
my_dataset = Dataset(
    forge,
    type=["Entity", "Dataset", "MyOtherType"],
    name="Interesting Dataset",
    distribution=my_data_distribution,
)

# ### Register

# In[44]:

forge.register(my_dataset)

# In[45]:

# Visualise the metadata. Note the distribution property with file related
# metadata automatically added (contentSize, digest, encodingFormat, ...).
forge.as_json(my_dataset)

# ### Retrieve by id

# In[46]:

result = forge.retrieve(id=my_dataset.id)

# In[47]:

forge.as_json(result)

# ### Search by metadata
# See the notebook "BBP KG Search and Download.ipynb" in the nexus-forge
# use-cases examples for more search details and options.
# In[48]:

filters = {"type": "Dataset", "name": "Interesting Dataset"}
results = forge.search(filters, limit=3)
print(f"{len(results)} results found")

# In[49]:

forge.as_json(results[0])

# In[50]:

# A list of resources can be transformed into a pandas dataframe.
forge.as_dataframe(results)

# ### Download

# In[54]:

# The argument overwrite: bool can be provided to decide whether to overwrite
# (True) existing files with the same name or to create new ones (False) with
# their names suffixed with a timestamp.
my_dataset.download(path="./downloaded/", source="distributions")

# In[55]:

get_ipython().system(' ls -l ./downloaded')

# In[53]:

#! rm -R ./downloaded/

# ### Get storage path
# In case the dataset files are stored in an external storage (e.g. GPFS),
# it is possible to get their location.

# In[56]:

forge.as_json(my_dataset.distribution.atLocation)

# In[ ]:

# This will break when in staging as no gpfs storage is used.
my_dataset.distribution.atLocation.location

# ### Add provenance information to the dataset
#
# Provenance is specific metadata accounting for (among other things) data
# lineage (derivation), who contributed to the generation of the dataset
# (contribution), how the dataset was generated (generation), and the subject
# of the dataset if any (subject).

# #### Add derivation (from which datasets a given dataset derived from)
# Let's consider the file ../../data/my_data_derived.txt to derive from
# ../../data/my_data.xwz

# In[58]:

# The file content type can be provided by setting the content_type.
# FIX: use the registered IANA media type for plain text files;
# "application/txt" is not a valid MIME type.
my_derived_data_distribution = forge.attach(
    "../../data/my_data_derived.txt", content_type="text/plain"
)
my_derived_dataset = Dataset(
    forge,
    name="Derived Dataset from my_dataset",
    distribution=my_derived_data_distribution,
)

# In[59]:

forge.register(my_derived_dataset)

# In[60]:

result = forge.retrieve(id=my_derived_dataset.id)

# In[61]:

# Note the added distribution property.
forge.as_json(result)

# In[62]:

# my_derived_dataset derived from my_dataset.
my_derived_dataset.add_derivation(my_dataset)

# In[63]:

# Since my_derived_dataset is already registered, it can be updated to store
# its derivation information. If no change occurs (i.e. there is nothing to
# update), then forge.update(...) will throw an
# "UpdatingError: resource should not be synchronized" error.
forge.update(my_derived_dataset)

# In[64]:

# Note the increased _rev number because of the update.
forge.as_json(my_derived_dataset, store_metadata=True)

# #### Add contribution (which Person, Organization or Software contributed
# to the generation of the data)
# Adding contributors to the dataset. The contributors are john, jane and the
# persons stored in the ../../data/persons.csv file. All persons from the
# file will be resources in the knowledge graph so they can be referenced as
# contributors.

# In[65]:

# An id can also be provided to add_contribution(). By default, ids are
# versioned when referenced to avoid being impacted by further changes and
# keep the state at which they were when referenced.
for contributor in scientists:
    my_derived_dataset.add_contribution(contributor)
my_derived_dataset.add_contribution(john.id, versioned=False)
my_derived_dataset.add_contribution(jane)

# In[66]:

forge.update(my_derived_dataset)

# In[67]:

result = forge.retrieve(id=my_derived_dataset.id)

# In[68]:

forge.as_json(result)

# In[69]:

# By adding store_metadata=True, the revision number of a resource can be
# introspected.
forge.as_json(result, store_metadata=True)

# #### Add generation (which activity led to the generation of my_derived_dataset)
# An activity used some entities to generate new ones and can potentially
# follow a Protocol. It has a start and end time and is associated with some
# agents (Person, Organization and/or SoftwareAgent).

# In[70]:

# Was a protocol followed?
protocol = Resource(
    type="Protocol",
    name="Protocol used to generate the dataset",
    description="Description of the protocol",
)
activity = Resource(
    type=["Activity", "MyCustomActivity"],
    description="Activity",
    # The value here can be an array of any dataset or entity (e.g. config
    # files) that was used to generate my_derived_dataset.
    used=Resource(id=my_dataset.id, type=my_dataset.type),
    hadProtocol=protocol,
    startedAtTime=startedAtTime,
    # FIX: use ISO 8601 so the value is a valid xsd:dateTime literal for the
    # PROV endedAtTime property (the compact "%Y%m%d%H%M%S" form is not).
    endedAtTime=time.strftime("%Y-%m-%dT%H:%M:%S"),
    # The value here can be an array of any agents.
    wasAssociatedWith=Resource(id=jane.id, type=jane.type),
)

# In[71]:

forge.register(activity)

# In[72]:

forge.as_json(activity)

# In[73]:

my_derived_dataset.add_generation(activity)

# In[74]:

forge.update(my_derived_dataset)

# In[75]:

forge.as_json(my_derived_dataset)

# #### Add Subject
#
# The subject on which the study was performed can be added if any. The
# subject schema (https://bbp-nexus.epfl.ch/datamodels/class-subject.html)
# can be used for more information.
# In[77]:

# Note that a Resource can be used as the value of a property.
my_derived_dataset.subject = Resource(
    type=["Subject", "Entity"],
    name="P14-12 Rattus norvegicus Wistar Han",
    species=Resource(id="http://purl.obolibrary.org/obo/NCBITaxon_10116", label="Rattus norvegicus"),
    strain=Resource(id="http://purl.obolibrary.org/obo/RS_0001833", label="Wistar Han"),
    age=Resource(period="Post-natal", value=14, unitCode="days"),
    sex=Resource(id="http://purl.obolibrary.org/obo/PATO_0000384", label="male"),
)

# In[78]:

forge.update(my_derived_dataset)

# In[79]:

forge.as_json(my_derived_dataset)

# #### Add license

# In[80]:

# This is just an example license.
my_derived_dataset.license = Resource(
    id="https://creativecommons.org/licenses/by/4.0",
    label="CC BY 4.0",
    description="You must give appropriate credit, provide a link to the license, and indicate if changes were made. You may do so in any reasonable manner, but not in any way that suggests the licensor endorses you or your use.",
)

# In[81]:

forge.update(my_derived_dataset)

# In[82]:

forge.as_json(my_derived_dataset)

# ### Tag the dataset
# Tagging a dataset is equivalent to git tag. It allows to version a dataset.

# In[83]:

forge.tag(my_derived_dataset, value="releaseV112")

# In[84]:

my_derived_dataset.description = "Derived Dataset description"

# In[85]:

forge.update(my_derived_dataset)

# In[86]:

forge.as_json(my_derived_dataset)

# In[87]:

# The version argument can be specified to retrieve the dataset at a given tag.
result = forge.retrieve(id=my_derived_dataset.id, version="releaseV112")

# In[89]:

result != my_derived_dataset

# In[88]:

# Note that the description is not retrieved as it was added after the tag.
forge.as_json(result)