(fcb-notebook-1)=
{panels}
:container: container-lg pb-3
:column: col-lg-3 col-md-4 col-sm-6 col-xs-12 p-1
:card: rounded
<i class='fa fa-qrcode fa-2x' style='color:#7e0038;'></i><br>
^^^
<h4><b>Recipe metadata</b></h4>
identifier: <a href="http://w3id.org/faircookbook/FCB038">FCB038</a>
version: <a href="">v1.0</a>
---
<i class='fa fa-fire fa-2x' style='color:#7e0038;'></i>
^^^
<h4><b>Difficulty level</b></h4>
<i class='fa fa-fire fa-lg' style='color:#7e0038;'></i>
<i class='fa fa-fire fa-lg' style='color:#7e0038;'></i>
<i class='fa fa-fire fa-lg' style='color:#7e0038;'></i>
<i class='fa fa-fire fa-lg' style='color:#7e0038;'></i>
<i class='fa fa-fire fa-lg' style='color:lightgrey'></i>
---
<i class='fas fa-clock fa-2x' style='color:#7e0038;'></i>
^^^
<h4><b>Reading Time</b></h4>
<i class='fa fa-clock fa-lg' style='color:#7e0038;'></i> 30 minutes
<h4><b>Recipe Type</b></h4>
<i class='fa fa-globe fa-lg' style='color:#7e0038;'></i> Hands-on
<h4><b>Executable Code</b></h4>
<i class='fa fa-play-circle fa-lg' style='color:#7e0038;'></i> Yes
---
<i class='fa fa-users fa-2x' style='color:#7e0038;'></i>
^^^
<h4><b>Intended Audience</b></h4>
<!-- <p> <i class='fa fa-user-md fa-lg' style='color:#7e0038;'></i> Principal Investigator </p> -->
<p> <i class='fa fa-database fa-lg' style='color:#7e0038;'></i> Data Manager </p>
<p> <i class='fa fa-wrench fa-lg' style='color:#7e0038;'></i> Data Scientist </p>
Background:
Experimental results such as metabolite profiling data published in [1,2] can be straightforwardly reported using OKFN Data Packages. Such components can be easily parsed as data frames and exploited for data visualization purposes using libraries implementing graphical grammar concepts. Here, we show how to use a set of Python libraries to create a tabular data package from an Excel file, annotate it with ontologies (CHEBI, PO, NCBITax) and validate the results against the JSON definition of the data table. A few lines of code allow us to structure information around key study design descriptors: the independent variables and their levels are clearly and unambiguously declared in the Tabular Data Package itself.
import os
import libchebipy
import re
import pandas as pd
from datapackage import Package
from goodtables import validate
We now simply read in the Excel file corresponding to the Nature Genetics Supplementary Table from the Zenodo archive.
# Load the Nature Genetics supplementary table directly from the Zenodo
# archive; the commented local-file variant is kept for offline use.
#df = pd.read_excel('Supplementary Data 3.xlsx', sheet_name='Feuil1')
zenodo_url = 'https://zenodo.org/api/files/91a610cb-8f1f-4ec5-9818-767a75a7a820/Supplementary%20Data%203.xlsx'
df = pd.read_excel(zenodo_url, sheet_name='Feuil1')
df.head(25)
# Row 13 of the raw sheet holds the per-treatment header information.
header_treatment = df.take([13], axis=0)
# Keep only columns 3..19, which hold the chemical name and measurements.
data_full = df.take(list(range(3, 20)), axis=1)
# Trim the leading rows (0..15), which carry no data: keep rows 16..76.
data_slice = data_full.take(list(range(16, 77)), axis=0)
# Give the unnamed Excel columns meaningful identifiers: column "Unnamed: 3"
# is the chemical name, then alternating (sample_mean_k, sem_k) pairs for the
# eight treatment groups occupy "Unnamed: 4".."Unnamed: 19".
column_map = {"Unnamed: 3": "chemical_name"}
for k in range(1, 9):
    column_map["Unnamed: {}".format(2 * k + 2)] = "sample_mean_{}".format(k)
    column_map["Unnamed: {}".format(2 * k + 3)] = "sem_{}".format(k)
data_slice.rename(columns=column_map, inplace=True)
# Placeholder columns to receive the ChEBI annotations added later.
data_slice.insert(loc=1, column='inchi', value='')
data_slice.insert(loc=2, column='chebi_identifier', value='')
data_slice = data_slice.reset_index(drop=True)
Note: in this call, we retrieve only values for which an exact match on the chemical name is found in ChEBI. The libchebi API does not allow easy searching on synonyms, so we fail to retrieve all relevant information; this is merely to showcase how to use libchebi.
# Annotate each row with an InChI and a ChEBI identifier via an exact-name
# ChEBI search (synonyms are not matched, so some rows remain blank).
# Iterate over every row of the slice rather than a hard-coded count: the
# original `range(0, 60)` silently skipped the last row of the 61-row slice
# (rows 16..76 were kept above, re-indexed 0..60).
for i in range(len(data_slice)):
    hit = libchebipy.search(data_slice.loc[i, 'chemical_name'], True)
    if len(hit) > 0:
        print("HIT: ", data_slice.loc[i, 'chemical_name'], ":", hit[0].get_inchi(), "|", hit[0].get_id())
        data_slice.loc[i, 'inchi'] = hit[0].get_inchi()
        data_slice.loc[i, 'chebi_identifier'] = hit[0].get_id()
    else:
        print("Nothing found: ", data_slice.loc[i, 'chemical_name'])
        data_slice.loc[i, 'inchi'] = ''
        data_slice.loc[i, 'chebi_identifier'] = ''
Step1: obtain all the different 'dimensions' measured for a given condition (i.e. repeating fields with an increment suffix).
# Step 1: collect the repeated measurement columns (sample_mean_k / sem_k)
# and reduce them to their stub names.
feature_models = [col for col in data_slice.columns if re.match("(sample_mean|sem)_[0-9]", col) is not None]
# Use sorted() rather than list(set(...)): set iteration order depends on
# string-hash randomisation, so the stubname (and hence output column) order
# would otherwise vary between runs.
features = sorted({re.sub("_[0-9]", "", feature_model) for feature_model in feature_models})
# Pivot from wide (one column pair per treatment) to long format (one row per
# chemical x treatment), then round-trip through a TSV so the MultiIndex
# produced by wide_to_long becomes plain columns.
long_df = pd.wide_to_long(data_slice, i=['chemical_name'], j='treatment', stubnames=features, sep="_")
long_df.to_csv("long.txt", sep='\t', encoding='utf-8')
long_df_from_file = pd.read_csv("long.txt", sep="\t")
long_df_from_file.head()
# Remove the temporary long-format dump; report (but tolerate) any failure.
try:
    os.remove("long.txt")
except OSError as err:  # IOError is an alias of OSError in Python 3
    print(err)
# Add a unit placeholder plus the factor-level/URI columns required by the
# target data-package schema.  The four derived columns start as copies of
# the numeric treatment code and are recoded to labels and ontology URIs
# in the next step.
long_df_from_file.insert(loc=3, column='unit', value='')
for derived in ('var1_levels', 'var1_uri', 'var2_levels', 'var2_uri'):
    long_df_from_file[derived] = long_df_from_file['treatment']
# adding a new field for 'sample size' and setting the value to n=3
long_df_from_file['sample_size'] = 3
# Recode the numeric treatment codes (1-8) into human-readable labels and
# ontology URIs.  The original spelled this out as 40 near-identical lines,
# an error-prone copy-paste pattern; a single code->values table keeps the
# label/URI pairs together and in one place.
# Each code maps to:
#   (treatment label, organism, NCBITaxon URI, organism part, PO URI)
# Each assignment masks on its own column (all still holding the numeric
# code), so the recoding steps are independent of one another.
OBO = 'http://purl.obolibrary.org/obo/'
treatment_codes = {
    1: ('R. chinensis \'Old Blush\' sepals', 'R. chinensis \'Old Blush\'',
        OBO + 'NCBITaxon_74649', 'sepals', OBO + 'PO_0009031'),
    2: ('R. chinensis \'Old Blush\' stamens', 'R. chinensis \'Old Blush\'',
        OBO + 'NCBITaxon_74649', 'stamens', OBO + 'PO_0009029'),
    3: ('R. chinensis \'Old Blush\' petals', 'R. chinensis \'Old Blush\'',
        OBO + 'NCBITaxon_74649', 'petals', OBO + 'PO_0009032'),
    4: ('R. gigantea petals', 'R. gigantea',
        OBO + 'NCBITaxon_74650', 'petals', OBO + 'PO_0009032'),
    5: ('R. Damascena petals', 'R. Damascena',
        OBO + 'NCBITaxon_3765', 'petals', OBO + 'PO_0009032'),
    6: ('R. Gallica petals', 'R. Gallica',
        OBO + 'NCBITaxon_74632', 'petals', OBO + 'PO_0009032'),
    7: ('R. moschata petals', 'R. moschata',
        OBO + 'NCBITaxon_74646', 'petals', OBO + 'PO_0009032'),
    8: ('R. wichurana petals', 'R. wichurana',
        OBO + 'NCBITaxon_2094184', 'petals', OBO + 'PO_0009032'),
}
for code, (label, organism, organism_uri, part, part_uri) in treatment_codes.items():
    long_df_from_file.loc[long_df_from_file['treatment'] == code, 'treatment'] = label
    long_df_from_file.loc[long_df_from_file['var1_levels'] == code, 'var1_levels'] = organism
    long_df_from_file.loc[long_df_from_file['var1_uri'] == code, 'var1_uri'] = organism_uri
    long_df_from_file.loc[long_df_from_file['var2_levels'] == code, 'var2_levels'] = part
    long_df_from_file.loc[long_df_from_file['var2_uri'] == code, 'var2_uri'] = part_uri
# Missing measurements are reported as "0" (a string, matching the
# text-typed CSV the data-package definition expects).
long_df_from_file['sample_mean'] = long_df_from_file['sample_mean'].fillna("0")
# BUG FIX: the original filled 'sem' from the 'sample_mean' column
# (copy-paste error), overwriting every standard error with the mean.
long_df_from_file['sem'] = long_df_from_file['sem'].fillna("0")
# Order the columns as declared in the tabular data-package definition.
long_df_from_file = long_df_from_file[['chemical_name', 'inchi', 'chebi_identifier', 'var1_levels', 'var1_uri',
                                       'var2_levels', 'var2_uri', 'treatment', 'sample_size', 'sample_mean',
                                       'unit', 'sem']]
long_df_from_file.head()
# Write the long-format table to ../data/processed/denovo, creating the
# directory if needed, and leave the CWD inside it (the following cell
# relies on that).
# BUG FIX: the original wrote the CSV only when the directory did NOT yet
# exist, so on any re-run (directory already present) the file was silently
# never (re)written.
try:
    HOME = os.getcwd()
    target_dir = os.path.join(HOME, '../data/processed/denovo')
    os.makedirs(target_dir, exist_ok=True)
    os.chdir(target_dir)
    long_df_from_file.to_csv("rose-aroma-naturegenetics2018-treatment-group-mean-sem-report-table-example.csv",
                             quoting=1,
                             doublequote=True, sep=',',
                             encoding='utf-8', index=False)
except OSError as e:  # IOError is an alias of OSError in Python 3
    print(e)
# Move back up to the notebook's starting directory.
# NOTE(review): this assumes the previous cell left the CWD in
# .../data/processed/denovo (three levels down) — confirm before reordering cells.
os.chdir('./../../../')
LOCAL = os.getcwd()
print("moving to directory: ", os.getcwd())
# JSON Tabular Data Package descriptor (the schema to validate against).
package_definition = os.path.join(LOCAL,'./rose-metabo-JSON-DP-validated/rose-aroma-naturegenetics2018-treatment-group-mean-sem-report-datapackage.json')
# The CSV produced by the previous step, to be checked against that schema.
file_to_test = os.path.join(LOCAL,'../data/processed/denovo/rose-aroma-naturegenetics2018-treatment-group-mean-sem-report-table-example.csv')
print ("JSON data package definition:", package_definition)
print("csv file to evaluate:", file_to_test)
# Validate the generated CSV against the Frictionless Tabular Data Package
# definition and report the outcome.
try:
    pack = Package(package_definition)
    # Report structural problems with the package descriptor itself.
    # (The original evaluated `pack.valid` and `pack.errors` as bare
    # expressions, which have no effect outside an interactive notebook.)
    if not pack.valid:
        for e in pack.errors:
            print(e)
    report = validate(file_to_test)
    if report['valid']:
        print("Success! \n")
        print("\'"+file_to_test + "\'"+ " is a valid Frictionless Tabular Data Package\n" + "It complies with the 'rose-aroma-naturegenetics2018-treatment-group-mean-sem-report-datapackage.json' definition\n")
    else:
        print("hmmm, something went wrong. Please, see the validation report for tracing the fault")
except IOError as e:
    print(e)
Bibliographic References