The aim of this notebook is to show how to create an ISA document for depositing Stable Isotope Resolved Metabolomics (SIRM) study metadata using the ISA API.
This notebook highlights key steps of the deposition, including:
Stable Isotope Resolved Metabolomics studies are a type of study that uses MS and NMR acquisition techniques to decipher biochemical reactions using tracer molecules
, i.e. molecules for which certain positions carry an isotope (e.g. 13C, 15N). Specific data acquisition and data processing techniques are required, and dedicated software is used to make sense of the data. Software such as IsoSolve
[1], Ramid
[2] (for primary processing of 13C mass isotopomer data obtained with GC-MS) or midcor
[3] (for natural abundance correction of 13C mass isotopomer spectra) may be used to accomplish those tasks. The outputs of such tools are tables, which may comply with a new specification devised to better support the reporting of SIRM study results.
import os
from hashlib import md5, sha1, sha256, blake2b
from isatools.model import (
Comment,
Investigation,
Study,
StudyFactor,
FactorValue,
OntologyAnnotation,
Characteristic,
OntologySource,
Material,
Sample,
Source,
Protocol,
ProtocolParameter,
ParameterValue,
Process,
Publication,
Person,
Assay,
DataFile,
plink
)
HASH_FUNCTIONS = {
"md5": md5,
"sha1": sha1,
"sha256": sha256,
"blake2": blake2b,
}
def compute_hash(file_path, file, hash_func):
    """Feed the content of a file to a hashlib object and return its hex digest.

    :param file_path: directory containing the file
    :param file: name of the file, joined onto ``file_path``
    :param hash_func: an instantiated hash object exposing ``update`` and
        ``hexdigest`` (e.g. ``hashlib.md5()``); it is updated in place
    :return: the hexadecimal digest reported by ``hash_func`` once the whole
        file has been consumed
    """
    target = os.path.join(file_path, file)
    with open(target, "rb") as stream:
        # Read in 4096-byte chunks so the file is never held in memory at once.
        while chunk := stream.read(4096):
            hash_func.update(chunk)
    return hash_func.hexdigest()
def update_checksum(file_path, isa_file_object: DataFile, checksum_type):
    """Annotate an ISA data file with the checksum of the file it names.

    Two comments are appended to ``isa_file_object``, in this order:
    ``checksum type`` (the requested algorithm name) and ``checksum`` (the
    hex digest of the file found at ``file_path``/``isa_file_object.filename``).

    :param file_path: directory holding the file named by the data file object
    :param isa_file_object: the ISA data file to annotate; modified in place
    :param checksum_type: one of the keys of ``HASH_FUNCTIONS``
    :return: the same ``isa_file_object`` that was passed in
    :raises ValueError: when ``checksum_type`` is not a key of ``HASH_FUNCTIONS``
    """
    if checksum_type not in HASH_FUNCTIONS:
        raise ValueError("Invalid checksum type")

    hasher = HASH_FUNCTIONS[checksum_type]()
    digest = compute_hash(file_path, isa_file_object.filename, hasher)
    # Comments are only attached once the digest has been computed successfully.
    isa_file_object.comments.append(Comment(name="checksum type", value=checksum_type))
    isa_file_object.comments.append(Comment(name="checksum", value=digest))
    return isa_file_object
def create_directories() -> None:
    """Create the output directory tree required by the notebook.

    Everything is created below ``<current working directory>/output/ISA-BH2023-ALL``:
    the ``TAB``, ``JSON``, ``DERIVED_FILES`` and ``RAW_FILES`` directories plus
    the sub-directories listed for each of them. Directories that already
    exist are left untouched, so the function may be called repeatedly.
    """
    output_root: str = os.path.join(os.getcwd(), "output", "ISA-BH2023-ALL")
    layout: dict[str, list[str]] = {
        'TAB': ['BH23-ISATAB_FROM_TAB'],
        'JSON': ['BH23-ISATAB', 'BH23-ISATAB_FROM_JSON'],
        'DERIVED_FILES': [],
        'RAW_FILES': [],
    }
    for directory, subdirectories in layout.items():
        directory_path: str = os.path.join(output_root, directory)
        # exist_ok=True replaces the exists()-then-makedirs() pair, which can
        # fail if the directory appears between the check and the creation.
        os.makedirs(directory_path, exist_ok=True)
        for subdirectory in subdirectories:
            os.makedirs(os.path.join(directory_path, subdirectory), exist_ok=True)
# Make sure the output tree exists before any file is written.
create_directories()

# Root ISA object; the study is attached to it further down.
investigation = Investigation()

# Ontology sources referenced by the annotations used in this notebook.
chebi = OntologySource(
    name="CHEBI",
    description="Chemical Entity of Biological Interest",
    version="1.0",
    file="https://www.example.org/CHEBI",
)
efo = OntologySource(name="EFO", description="Experimental Factor Ontology")
msio = OntologySource(name="MSIO", description="Metabolomics Standards Initiative Ontology")
obi = OntologySource(name='OBI', description="Ontology for Biomedical Investigations")
pato = OntologySource(name='PATO', description="Phenotype and Trait Ontology")
uo = OntologySource(name="UO", description="Unit Ontology")
# The source name is spelled "NCBITaxon", matching the NCBITaxon_9606 accession
# used for the organism annotation (it was previously misspelled "NCIBTaxon").
ncbitaxon = OntologySource(name="NCBITaxon", description="NCBI Taxonomy")
ncbitaxon.comments.append(Comment(name="onto-test", value="onto-value"))

# NOTE: the order is significant; this list is indexed by position later on.
investigation.ontology_source_references = [chebi, efo, obi, pato, ncbitaxon, msio, uo]

# Units reused by parameter values.
mag_field_unit = OntologyAnnotation(term="Tesla", term_source=uo, term_accession="https://purl.org/")
mass_unit = OntologyAnnotation(term="mg", term_source=uo, term_accession="https://purl.org/")
# The ISA Study object and its descriptive metadata.
study = Study(filename="s_BH2023-study.txt")
study.identifier = "BH2023"
study.title = "[U-13C6]-D-glucose labeling experiment in MCF7 cancer cell line"
study.description = "Probing cancer pathways of MCF7 cell line using 13C stable isotope resolved metabolomics study using isotopologue distribution analysis with mass spectrometry and isotopomer analysis by 1D 1H NMR."
study.submission_date = "2021-08-15"
study.public_release_date = "2021-08-15"

# These EMBL-EBI Metabolights (MTBLS) related ISA Comments fields may be used for deposition to EMBL-EBI
SRA_comments = [
    {"name": "EMBL Broker Name", "value": "OXFORD"},
    {"name": "EMBL Center Name", "value": "OXFORD"},
    {"name": "EMBL Center Project Name", "value": "OXFORD"},
    {"name": "EMBL Lab Name", "value": "Oxford e-Research Centre"},
    {"name": "EMBL Submission Action", "value": "ADD"},
]
Funders_comments = [
    {"name": "Study Funding Agency", "value": ""},
    {"name": "Study Grant Number", "value": ""},
]

# Attach the SRA comments first, then the funder comments, as ISA Comment objects.
for entry in SRA_comments + Funders_comments:
    study.comments.append(Comment(name=entry["name"], value=entry["value"]))

# Adding a Study Design descriptor to the ISA Study object
intervention_design = OntologyAnnotation(
    term="intervention design",
    term_source=obi,
    term_accession="http://purl.obolibrary.org/obo/OBI_0000115",
)
study_design = OntologyAnnotation(
    term="stable isotope resolved metabolomics study",
    term_source=msio,
    term_accession="http://purl.obolibrary.org/obo/MSIO_0000096",
)
study.design_descriptors.extend([intervention_design, study_design])
# Declaring the Study Factors: one ontology annotation per factor type.
agent_ft_annot = OntologyAnnotation(
    term="chemical substance",
    term_accession="http://purl.obolibrary.org/obo/CHEBI_59999",
    term_source=chebi,
)
intensity_ft_annot = OntologyAnnotation(
    term="dose",
    term_accession="http://www.ebi.ac.uk/efo/EFO_0000428",
    term_source=efo,
)
duration_ft_annot = OntologyAnnotation(
    term="time",
    term_accession="http://purl.obolibrary.org/obo/PATO_0000165",
    term_source=pato,
)
study.factors = [
    StudyFactor(name=factor_name, factor_type=factor_type)
    for factor_name, factor_type in (
        ("compound", agent_ft_annot),
        ("dose", intensity_ft_annot),
        ("duration", duration_ft_annot),
    )
]
# Read the factors back from the study so the factor values reference the
# very objects the study holds.
compound_factor, dose_factor, duration_factor = study.factors

# Associating the levels to each of the Study Factor.
agent_fvalue_annot = OntologyAnnotation(term="dioxygen", term_source=obi, term_accession="https://purl.org/")
intensity_fvalue_annot1 = OntologyAnnotation(term="high", term_source=obi, term_accession="https://purl.org/")
intensity_fvalue_annot2 = OntologyAnnotation(term="normal", term_source=obi, term_accession="https://purl.org/")
duration_fvalue_annot = OntologyAnnotation(term="hour", term_source=obi, term_accession="https://purl.org/")

fv1 = FactorValue(factor_name=compound_factor, value=agent_fvalue_annot)
fv2 = FactorValue(factor_name=dose_factor, value=intensity_fvalue_annot1)
fv3 = FactorValue(factor_name=dose_factor, value=intensity_fvalue_annot2)
fv4 = FactorValue(factor_name=duration_factor, value=duration_fvalue_annot)
# Publication attached to the study, with its status annotation.
status_annot_value = OntologyAnnotation(term="indexed in PubMed", term_source=obi, term_accession="https://purl.org/")
sirm_publication = Publication(
    doi="10.1371/journal.pone.0000000",
    pubmed_id="36007233",
    title="Decyphering new cancer pathways with stable isotope resolved metabolomics in MCF7 cell lines",
    status=status_annot_value,
    author_list="Min,W. and Everest H",
)
study.publications = [sirm_publication]

# Study contacts; each person carries an (empty) "Study Person REF" comment
# and gets its own role annotation objects.
contact_min = Person(
    first_name="Weng",
    last_name="Min",
    affiliation="Beijing Institute of Metabolism",
    email="weng.min@bim.edu.cn",
    address="Prospect Street, Beijing, People's Republic of China",
    comments=[Comment(name="Study Person REF", value="")],
    roles=[
        OntologyAnnotation(term="principal investigator role"),
        OntologyAnnotation(term="SRA Inform On Status"),
        OntologyAnnotation(term="SRA Inform On Error"),
    ],
)
contact_everest = Person(
    first_name="Hillary",
    last_name="Everest",
    affiliation="Centre for Cell Metabolism",
    address="CCM, Edinborough, United Kingdom",
    comments=[Comment(name="Study Person REF", value="")],
    roles=[OntologyAnnotation(term="principal investigator role")],
)
study.contacts = [contact_min, contact_everest]
def _protocol_parameters(*terms):
    """Return one ProtocolParameter per term, each named by an OntologyAnnotation."""
    return [ProtocolParameter(parameter_name=OntologyAnnotation(term=term)) for term in terms]


# NOTE: later cells select protocols by position (study.protocols[n]), so the
# order of this list is significant; the index is recorded above each entry.
study.protocols = [
    # Protocol #0
    Protocol(
        name="cell culture and isotopic labeling",
        description="SOP for growing MCF7 cells and incubating them with the tracer molecule",
        protocol_type=OntologyAnnotation(term="sample collection"),
        parameters=_protocol_parameters("tracer molecule"),
    ),
    # Protocol #1
    Protocol(
        name="intracellular metabolite extraction",
        description="SOP for extracting metabolites from harvested cells",
        protocol_type=OntologyAnnotation(term="extraction"),
    ),
    # Protocol #2
    Protocol(
        name="extracellular metabolite extraction",
        description="SOP for extracting metabolites from cell culture supernatant",
        protocol_type=OntologyAnnotation(term="extraction"),
    ),
    # Protocol #3
    Protocol(
        name="liquid chromatography mass spectrometry",
        description="SOP for LC-MS data acquisition",
        protocol_type=OntologyAnnotation(term="mass spectrometry"),
        parameters=_protocol_parameters(
            "chromatography column",
            "mass spectrometry instrument",
            "mass analyzer",
        ),
    ),
    # Protocol #4
    Protocol(
        name="1D 13C NMR spectroscopy for isotopomer analysis",
        description="SOP for 1D 13C NMR data acquisition for isotopomer analysis",
        protocol_type=OntologyAnnotation(term="NMR spectroscopy"),
        parameters=_protocol_parameters("magnetic field strength", "nmr tube", "pulse sequence"),
    ),
    # Protocol #5
    Protocol(
        name="1D 13C NMR spectroscopy for metabolite profiling",
        description="SOP for 1D 13C NMR data acquisition for metabolite profiling",
        protocol_type=OntologyAnnotation(term="NMR spectroscopy"),
        parameters=_protocol_parameters("magnetic field strength", "nmr tube", "pulse sequence"),
    ),
    # Protocol #6
    Protocol(
        name="MS metabolite identification",
        description="SOP for MS signal processing and metabolite and isotopologue identification",
        protocol_type=OntologyAnnotation(term="metabolite identification"),
        parameters=_protocol_parameters("ms software"),
    ),
    # Protocol #7
    Protocol(
        name="NMR metabolite identification",
        description="SOP for NMR signal processing and metabolite and isotopomer identification",
        uri="https://doi.org/10.1021/acs.analchem.1c01064",
        protocol_type=OntologyAnnotation(term="data transformation"),
        parameters=_protocol_parameters("nmr software"),
    ),
    # Protocol #8
    Protocol(
        name="mRNA extraction",
        description="procedure for isolating messenger RNA for transcriptomics analysis",
        uri="",
        protocol_type=OntologyAnnotation(term="material separation"),
    ),
    # Protocol #9
    Protocol(
        name="gDNA extraction",
        description="procedure for isolating genomic DNA for copy number variation analysis",
        uri="",
        protocol_type=OntologyAnnotation(term="material separation"),
    ),
    # Protocol #10
    # Description now describes library preparation; it used to repeat the
    # gDNA extraction text (with a "genoic" typo).
    Protocol(
        name="gDNA library preparation",
        description="procedure for preparing a genomic DNA sequencing library for copy number variation analysis",
        uri="",
        protocol_type=OntologyAnnotation(term="library construction"),
        parameters=_protocol_parameters(
            "library strategy",
            "library selection",
            "library source",
            "library orientation",
        ),
    ),
    # Protocol #11
    # Description now matches the mRNA library protocol; it used to talk about
    # isolating genomic DNA.
    Protocol(
        name="mRNA library preparation",
        description="procedure for preparing an mRNA sequencing library for gene expression analysis",
        uri="",
        protocol_type=OntologyAnnotation(term="library construction"),
        parameters=_protocol_parameters(
            "library strategy",
            "library selection",
            "library source",
            "library orientation",
        ),
    ),
    # Protocol #12
    Protocol(
        name="nucleic acid sequencing",
        description="SOP for nucleic acid sequencing",
        uri="",
        protocol_type=OntologyAnnotation(term="nucleic acid sequencing"),
        parameters=_protocol_parameters("sequencing instrument"),
    ),
    # Protocol #13
    Protocol(
        name="transcription analysis",
        description="SOP for transcriptomics analysis",
        uri="",
        protocol_type=OntologyAnnotation(term="data transformation"),
        parameters=_protocol_parameters("sequence analysis software"),
    ),
    # Protocol #14
    Protocol(
        name="CNV analysis",
        description="SOP for CNV ",
        uri="",
        protocol_type=OntologyAnnotation(term="data transformation"),
        parameters=_protocol_parameters("variant calling software"),
    ),
]
In this fictional study, we assume the following underlying experimental setup:
# Creating the ISA Source Materials
study.sources = [Source(name="culture-1"), Source(name="culture-2")]

# Characteristics shared by both sources.
src_characteristic_biosamplexref = Characteristic(
    category=OntologyAnnotation(term="namespace:biosample:src"),
    value=OntologyAnnotation(term="SRC:", term_source=obi, term_accession="https://purl.org/"),
)
characteristic_organism = Characteristic(
    category=OntologyAnnotation(term="Organism"),
    value=OntologyAnnotation(
        term="Homo sapiens",
        term_source=ncbitaxon,
        term_accession="http://purl.obolibrary.org/obo/NCBITaxon_9606",
    ),
)
characteristic_cell = Characteristic(
    category=OntologyAnnotation(term="cell line"),
    value=OntologyAnnotation(term="MCF-7", term_source=obi, term_accession="https://purl.org/"),
)

source_characteristics = [
    src_characteristic_biosamplexref,
    characteristic_organism,
    characteristic_cell,
]

# Register each category on the study, then attach the same characteristic
# objects to every source.
study.characteristic_categories.extend(
    characteristic.category for characteristic in source_characteristics
)
for culture in study.sources:
    culture.characteristics.extend(source_characteristics)
# Note how the treatment groups are defined as sets of factor values attached to the ISA.Sample object
treatment_1 = [fv1, fv2, fv4]
treatment_2 = [fv1, fv3, fv4]

# Ensuring the Tracer Molecule(s) used for the SIRM study is properly reported
tracer_mol_C = ParameterValue(
    category=ProtocolParameter(
        parameter_name=OntologyAnnotation(term="tracer molecule", term_source="", term_accession="")
    ),
    value=OntologyAnnotation(
        term="80% [1-13C1]-D-glucose + 20% [U-13C6]-D-glucose",
        term_source=chebi,
        term_accession="https://purl.org/chebi_1212",
    ),
)
tracers = [tracer_mol_C]

# the number of samples collected from each culture condition
replicates = 4

# Samples are appended in interleaved order: even positions in study.samples
# come from culture-1 (treatment 1), odd positions from culture-2 (treatment 2).
for k in range(replicates):
    smp_characteristics_biosamplexref = Characteristic(
        category=OntologyAnnotation(term="namespace:biosample:smp"),
        value=OntologyAnnotation(term=("SAME:" + str(k)), term_source=obi, term_accession="https://purl.org/"),
    )
    study.characteristic_categories.append(smp_characteristics_biosamplexref.category)
    study.samples.append(Sample(name=(study.sources[0].name + "-sample-" + str(k)),
                                characteristics=[smp_characteristics_biosamplexref],
                                factor_values=treatment_1))
    study.samples.append(Sample(name=(study.sources[1].name + "-sample-" + str(k)),
                                characteristics=[smp_characteristics_biosamplexref],
                                factor_values=treatment_2))

# Now creating a Process showing a `Protocol Application` using Source as input and producing Sample as output.
# The outputs are taken by slicing rather than by hard-coded indices, so the
# processes stay correct whatever value `replicates` has.
sample_collection_mbx = Process(name="sample-collection-process-mbx",
                                executes_protocol=study.protocols[0],  # a sample collection
                                inputs=[study.sources[0]],
                                outputs=study.samples[0::2],
                                parameter_values=[tracer_mol_C])
sample_collection_gtx = Process(name="sample-collection-process-gtx",
                                executes_protocol=study.protocols[0],  # a sample collection
                                inputs=[study.sources[1]],
                                outputs=study.samples[1::2],
                                parameter_values=[tracer_mol_C])
study.process_sequence.append(sample_collection_mbx)
study.process_sequence.append(sample_collection_gtx)
study.units = []

# Now appending the ISA Study object to the ISA Investigation object
investigation.studies = [study]
#Starting by declaring the 2 types of assays used in BII-S-3 as coded with ISAcreator tool
# assay = Assay(filename="a_"+ study.identifier + "-isotopologue-ms-assay.txt")
# assay.measurement_type = OntologyAnnotation(term="isotopologue distribution analysis",term_accession="http://purl.obolibrary.org/obo/msio.owl#mass_isotopologue_distribution_analysis", term_source=msio)
# assay.technology_type = OntologyAnnotation(term="mass spectrometry", term_accession="http://purl.obolibrary.org/obo/CHMO_0000470", term_source=msio)
# assay.comments.append(Comment(name="target repository", value="metabolights"))
# assay_nmr_topo = Assay(filename="a_"+ study.identifier + "-isotopomer-nmr-assay.txt")
# assay_nmr_topo.measurement_type = OntologyAnnotation(term="isotopomer analysis",term_accession="http://purl.obolibrary.org/obo/msio.owl#isotopomer_analysis", term_source=msio)
# assay_nmr_topo.technology_type = OntologyAnnotation(term="NMR spectroscopy",term_accession="http://purl.obolibrary.org/obo/CHMO_0000591", term_source=msio)
# assay_nmr_topo.comments.append(Comment(name="target repository", value="metabolights"))
#
# NMR metabolite-profiling assay, targeted at MetaboLights.
assay_nmr_metpro = Assay(filename=f"a_{study.identifier}-metabolite-profiling-nmr-assay.txt")
assay_nmr_metpro.measurement_type = OntologyAnnotation(
    term="metabolite profiling",
    term_accession="http://purl.obolibrary.org/obo/MSIO_0000101",
    term_source=msio,
)
assay_nmr_metpro.technology_type = OntologyAnnotation(
    term="NMR spectroscopy",
    term_accession="http://purl.obolibrary.org/obo/CHMO_0000591",
    term_source=msio,
)
assay_nmr_metpro.comments.append(Comment(name="target repository", value="metabolights"))

# Copy-number-variation sequencing assay, targeted at EGA.
assay_cnv_seq = Assay(filename=f"a_{study.identifier}-cnv_seq-assay.txt")
assay_cnv_seq.measurement_type = OntologyAnnotation(
    term="copy number variation profiling",
    term_accession="https://purl.org",
    term_source=msio,
)
assay_cnv_seq.technology_type = OntologyAnnotation(
    term="nucleotide sequencing",
    term_accession="https://purl.org",
    term_source=msio,
)
assay_cnv_seq.comments.append(Comment(name="target repository", value="ega"))

# RNA-seq transcription-profiling assay, targeted at ArrayExpress.
assay_rna_seq = Assay(filename=f"a_{study.identifier}-rna-seq-assay.txt")
assay_rna_seq.measurement_type = OntologyAnnotation(
    term="transcription profiling",
    term_accession="https://purl.org",
    term_source=msio,
)
assay_rna_seq.technology_type = OntologyAnnotation(
    term="nucleotide sequencing",
    term_accession="https://purl.org",
    term_source=msio,
)
assay_rna_seq.comments.append(Comment(name="target repository", value="arrayexpress"))
Warning: the `technology type` `OntologyAnnotation.term` is left empty.
main_path = "./output/ISA-BH2023-ALL/"
data_path = "./output/"
NOTE: make sure to use the ISA API `plink` function to connect the protocols in a chain.
# Software parameter value used by the NMR data-transformation process.
nmr_sw = ParameterValue(
    category=ProtocolParameter(parameter_name=OntologyAnnotation(term="nmr software")),
    value=OntologyAnnotation(term="Batman", term_source=obi, term_accession="https://purl.org/"),
)

# Derived data file shared by all NMR acquisitions, with a placeholder file on
# disk. The context manager closes the handle, which a bare `f.close`
# (attribute access without a call) does not.
nmr_derivedDF = DataFile(filename="metpro-analysis.txt", label="Derived Spectral Data File")
with open(os.path.join(main_path, "DERIVED_FILES/", "metpro-analysis.txt"), "w+") as f:
    f.write("metpro-analysis.txt")

# One data-transformation process (Protocol #7, "NMR metabolite identification");
# its inputs are filled in while the per-sample raw data files are created.
nmr_da_process = Process(
    name="NMR-metpro-DT-ident",
    executes_protocol=study.protocols[7],
    parameter_values=[nmr_sw],
    outputs=[nmr_derivedDF],
)
assay_nmr_metpro.data_files.append(nmr_derivedDF)
# Per-sample NMR metabolite-profiling chain: sample -> extract -> acquisition.
# Names are numbered from the enumerate index: extracts and extraction
# processes use i, acquisition processes and raw data files use i+1.
for i, sample in enumerate(study.samples):
# extraction process takes as input a sample, and produces an extract material as output
material_nmr_metpro = Material(name="extract-nmr-metpro-{}".format(i),
type_="Extract Name")
# executes Protocol #1 ("intracellular metabolite extraction")
extraction_process_nmr_metpro = Process(
name="extract-process-{}".format(i),
executes_protocol=study.protocols[1],
inputs=[sample],
outputs=[material_nmr_metpro]
)
# create a nmr acquisition process that executes the nmr protocol
# parameter values for the three parameters declared on Protocol #5
# (magnetic field strength, nmr tube, pulse sequence)
magnet = ParameterValue(category=ProtocolParameter(parameter_name=OntologyAnnotation(term="magnetic field strength")),
value=6.5,
unit=mag_field_unit)
tube = ParameterValue(category=ProtocolParameter(parameter_name=OntologyAnnotation(term="nmr tube")),
value=OntologyAnnotation(term="Brucker 14 mm Oscar", term_source=obi, term_accession="https://purl.org/"))
pulse_a = ParameterValue(category=ProtocolParameter(parameter_name=OntologyAnnotation(term="pulse sequence")),
value=OntologyAnnotation(term="CPMG", term_source=obi, term_accession="https://purl.org/"))
# pulses=[pulse_a]
# for j in range(len(pulses)):
# executes Protocol #5 ("1D 13C NMR spectroscopy for metabolite profiling")
metpro_process = Process(executes_protocol=study.protocols[5],parameter_values=[magnet,tube,pulse_a])
# the process name embeds the pulse sequence term and a 1-based index
metpro_process.name = "assay-name-nmr-metpro-"+ pulse_a.value.term +"-{}".format(i+1)
# the acquisition consumes the extract produced by the extraction process
metpro_process.inputs.append(extraction_process_nmr_metpro.outputs[0])
# a Data acquisition process usually has an output data file
datafile_nmr_metpro = DataFile(filename="nmr-data-metpro-"+pulse_a.value.term +"-{}.nmrml".format(i+1), label="Free Induction Decay Data File")
# Write a placeholder raw data file named after the DataFile; the context
# manager closes the handle (a bare `f.close` without parentheses never does).
with open(os.path.join(main_path, "RAW_FILES/", datafile_nmr_metpro.filename), "w+") as f:
    f.write(datafile_nmr_metpro.filename)
# The raw data file is both the output of the acquisition process and an
# input of the shared NMR data-transformation process.
metpro_process.outputs.append(datafile_nmr_metpro)
nmr_da_process.inputs.append(datafile_nmr_metpro)
# Ensure Processes are linked forward and backward. plink(from_process, to_process) is a function to set
# these links for you. It is found in the isatools.model package
# NOTE(review): the plink calls below are commented out, so no process links
# are set in this cell.
# make sure the sample, extract, data file, and the processes are attached to the assay
assay_nmr_metpro.samples.append(sample)
assay_nmr_metpro.other_material.append(material_nmr_metpro)
assay_nmr_metpro.data_files.append(datafile_nmr_metpro)
assay_nmr_metpro.process_sequence.append(extraction_process_nmr_metpro)
assay_nmr_metpro.process_sequence.append(metpro_process)
assay_nmr_metpro.process_sequence.append(nmr_da_process)
# plink(sample_collection_mbx, extraction_process_nmr_metpro)
# plink(extraction_process_nmr_metpro, metpro_process)
# # plink(metpro_process, nmr_da_process)
# register the unit used by the magnetic field strength parameter value on the assay
assay_nmr_metpro.units.append(mag_field_unit)
def _rna_seq_parameter_value(parameter_term, value_term):
    """Build a ParameterValue whose value is an OBI-sourced ontology annotation."""
    return ParameterValue(
        category=ProtocolParameter(parameter_name=OntologyAnnotation(term=parameter_term)),
        value=OntologyAnnotation(term=value_term, term_source=obi, term_accession="https://purl.org/"),
    )


# Static values shared by every RNA-seq sample.
char_ext_rna_seq = Characteristic(
    category=OntologyAnnotation(term="Stuff Type"),
    value=OntologyAnnotation(term="mRNA", term_source=obi, term_accession="https://purl.org/"),
)

# Library construction parameters.
rna_strat = _rna_seq_parameter_value("library strategy", "RNA-SEQ")
rna_sel = _rna_seq_parameter_value("library selection", "OTHER")
rna_src = _rna_seq_parameter_value("library source", "TRANSCRIPTOMICS")
rna_ori = _rna_seq_parameter_value("library orientation", "SINGLE")

rna_label = Characteristic(
    category=OntologyAnnotation(term="Label"),
    value=OntologyAnnotation(term="AAAAAAAAAA", term_source=obi, term_accession="https://purl.org/"),
)

# Sequencing and analysis parameters.
seq_instrument = _rna_seq_parameter_value("sequencing instrument", "Illumina MiSeq")
rna_sw = _rna_seq_parameter_value("sequence analysis software", "DESeq2")
# Per-sample RNA-seq chain: sample -> mRNA extract -> labeled library -> raw fastq file.
# All names are numbered with the 0-based enumerate index.
for i, sample in enumerate(study.samples):
# extraction process takes as input a sample, and produces an extract material as output
material_rna_seq = Material(name="extract-rna-seq-{}".format(i))
material_rna_seq.type = "Extract Name"
material_rna_seq.characteristics.append(char_ext_rna_seq)
# print(char_ext_rna_seq.to_dict())
# create an extraction process that executes the extraction protocol
# (Protocol #8, "mRNA extraction")
extraction_process_rna_seq = Process(
name="extract-process-rna-seq-{}".format(i),
executes_protocol=study.protocols[8],
inputs=[sample],
outputs=[material_rna_seq]
)
# create a library construction process that executes the mRNA library
# preparation protocol (Protocol #11)
rna_library = Material(name="rna-library-name-{}".format(i))
rna_library.type = "Labeled Extract Name"
rna_library.characteristics.append(rna_label)
rna_lib_process = Process(
name = "rna-library-name-{}".format(i),
executes_protocol=study.protocols[11],
parameter_values=[rna_strat,rna_sel, rna_src, rna_ori],
inputs=[extraction_process_rna_seq.outputs[0]],
outputs=[rna_library]
)
# rna seq acquisition process usually has an output fastq data file
rna_datafile = DataFile(filename="rna-seq-data-{}.fastq".format(i), label="Raw Data File")
# Write the placeholder fastq file and close it right away: its MD5 is computed
# from disk afterwards, so the content must be flushed before the file is read.
with open(os.path.join(main_path, "rna-seq-data-{}.fastq".format(i)), "w+") as f:
    f.write("rna-seq-data-{}.fastq".format(i))
# update_checksum reads the file from disk under main_path, appends the
# "checksum type" and "checksum" comments and returns the same DataFile object,
# so updated_rna_datafile and rna_datafile are one and the same object.
updated_rna_datafile = update_checksum(main_path, rna_datafile, "md5")
rna_data_comment = Comment(name="export",value="yes")
# rna_data_comment1 = Comment(name="checksum", value=md5)
# rna_data_comment2 = Comment(name="checksum type", value="MD5")
updated_rna_datafile.comments.append(rna_data_comment)
# rna_datafile.comments.append(rna_data_comment1)
# rna_datafile.comments.append(rna_data_comment2)
#
# create a sequencing process that executes the sequencing protocol
# (Protocol #12, "nucleic acid sequencing")
rna_seq_process = Process(
name = "assay-name-rna-seq-{}".format(i),
executes_protocol=study.protocols[12],
parameter_values=[seq_instrument],
inputs=[rna_lib_process.outputs[0]],
outputs=[updated_rna_datafile]
)
# Ensure Processes are linked forward and backward. plink(from_process, to_process) is a function to set
# these links for you. It is found in the isatools.model package
# attach the sample, both materials and the raw data file to the assay
assay_rna_seq.samples.append(sample)
assay_rna_seq.other_material.append(material_rna_seq)
assay_rna_seq.other_material.append(rna_library)
assay_rna_seq.data_files.append(updated_rna_datafile)
# derived data file; the filename does not depend on the sample index
rnaseq_drvdf = DataFile(filename="rna-seq-DEA.txt", label="Derived Data File")
# Write the placeholder derived file and close it right away: its MD5 is
# computed from disk afterwards, so the content must be flushed first.
with open(os.path.join(main_path, "rna-seq-DEA.txt"), "w+") as dvf:
    dvf.write("rna-seq-DEA.txt")
# Annotate the derived file with its checksum (read from disk under main_path)
# and an "export" comment; update_checksum returns the object it was given.
rna_drvdata_comment = Comment(name="export", value="yes")
updated_rnaseq_drvdf = update_checksum(main_path, rnaseq_drvdf, "md5")
updated_rnaseq_drvdf.comments.append(rna_drvdata_comment)
# data-transformation process executing Protocol #13 ("transcription analysis"):
# raw fastq file in, derived data file out
rna_da_process = Process(
name = "RNASEQ-DT",
executes_protocol=study.protocols[13],
parameter_values=[rna_sw],
inputs=[rna_datafile],
outputs=[updated_rnaseq_drvdf]
)
# register the four processes on the assay in workflow order
assay_rna_seq.process_sequence.append(extraction_process_rna_seq)
assay_rna_seq.process_sequence.append(rna_lib_process)
assay_rna_seq.process_sequence.append(rna_seq_process)
assay_rna_seq.process_sequence.append(rna_da_process)
# NOTE(review): the plink calls are commented out, so the processes are not
# linked to each other here.
# plink(sample_collection_gtx, extraction_process_rna_seq)
# plink(extraction_process_rna_seq, rna_lib_process)
# plink(rna_lib_process, rna_seq_process)
# plink(rna_seq_process, rna_da_process)
# declare the extract characteristic category ("Stuff Type") on the assay
assay_rna_seq.characteristic_categories.append(char_ext_rna_seq.category)
def _cnv_seq_parameter_value(parameter_term, value_term):
    """Build a ParameterValue whose value is an OBI-sourced ontology annotation."""
    return ParameterValue(
        category=ProtocolParameter(parameter_name=OntologyAnnotation(term=parameter_term)),
        value=OntologyAnnotation(term=value_term, term_source=obi, term_accession="https://purl.org/"),
    )


# Static values shared by every CNV-seq sample.
char_ext_cnv_seq = Characteristic(
    category=OntologyAnnotation(term="Stuff Type", term_source="", term_accession=""),
    value=OntologyAnnotation(
        term="gDNA",
        term_source=obi,
        term_accession="https://purl.org/OBOfoundry/obi:123414",
    ),
)

# Parameter values for the gDNA library construction protocol.
cnv_strat = _cnv_seq_parameter_value("library strategy", "WGS")
cnv_sel = _cnv_seq_parameter_value("library selection", "OTHER")
cnv_src = _cnv_seq_parameter_value("library source", "GENOMICS")
cnv_ori = _cnv_seq_parameter_value("library orientation", "SINGLE")

cnv_label = Characteristic(
    category=OntologyAnnotation(term="Label"),
    value=OntologyAnnotation(term="Not Applicable", term_source=obi, term_accession="https://purl.org/"),
)

# Rebinds seq_instrument: from here on the name refers to the CNV-seq instrument.
seq_instrument = _cnv_seq_parameter_value("sequencing instrument", "AB SOLiD 5500xl")
cnv_sw = _cnv_seq_parameter_value("variant calling software", "VCF caller")
# Per-sample CNV-seq chain: sample -> gDNA extract -> labeled library -> raw fastq file.
# All names are numbered with the 0-based enumerate index.
for i, sample in enumerate(study.samples):
# extraction process takes as input a sample, and produces an extract material as output
material_cnv_seq = Material(name="extract-cnv-seq-{}".format(i))
material_cnv_seq.type = "Extract Name"
material_cnv_seq.characteristics.append(char_ext_cnv_seq)
# debugging output: shows the characteristics attached to the extract
print(material_cnv_seq.characteristics)
# create an extraction process that executes the extraction protocol
# (Protocol #9, "gDNA extraction")
extraction_process_cnv_seq = Process(
name="extract-process-cnv-seq-{}".format(i),
executes_protocol=study.protocols[9],
inputs=[sample],
outputs=[material_cnv_seq]
)
# library construction executes Protocol #10 ("gDNA library preparation")
cnv_library = Material(name="cnv-library-name-{}".format(i))
cnv_library.type = "Labeled Extract Name"
cnv_library.characteristics.append(cnv_label)
cnv_lib_process = Process(
name = "cnv-library-name-{}".format(i),
executes_protocol=study.protocols[10],
parameter_values=[cnv_strat,cnv_sel, cnv_src, cnv_ori],
inputs=[extraction_process_cnv_seq.outputs[0]],
outputs=[cnv_library]
)
# cnv seq acquisition process usually has an output fastq data file
cnv_datafile = DataFile(filename="cnv-seq-data-{}.fastq".format(i), label="Raw Data File")
# Create the (empty) placeholder fastq file and close the handle immediately
# instead of leaving it open.
with open(os.path.join(main_path, "cnv-seq-data-{}.fastq".format(i)), "w+"):
    pass
# Annotate the raw file with its checksum (read from disk under main_path) and
# an "export" comment; update_checksum returns the object it was given, so
# updated_cnv_datafile and cnv_datafile are one and the same object.
cnv_data_comment = Comment(name="export", value="yes")
updated_cnv_datafile = update_checksum(main_path, cnv_datafile, "md5")
updated_cnv_datafile.comments.append(cnv_data_comment)
# create a sequencing process that executes the sequencing protocol
# (Protocol #12, "nucleic acid sequencing")
cnv_seq_process = Process(
name = "assay-name-cnv-seq-{}".format(i),
executes_protocol=study.protocols[12],
parameter_values=[seq_instrument],
inputs=[cnv_lib_process.outputs[0]],
outputs=[updated_cnv_datafile]
)
# Ensure Processes are linked forward and backward. plink(from_process, to_process) is a function to set
# these links for you. It is found in the isatools.model package
# attach the sample, both materials and the raw data file to the assay
assay_cnv_seq.samples.append(sample)
assay_cnv_seq.other_material.append(material_cnv_seq)
assay_cnv_seq.other_material.append(cnv_library)
assay_cnv_seq.data_files.append(updated_cnv_datafile)
# Placeholder derived file with a fixed name. The DataFile bound here is
# replaced by the per-sample one just below; only the file on disk remains.
# NOTE(review): the placeholder content ("...datav.vcf") differs from the
# filename; kept as-is.
cnvseq_drvdf = DataFile(filename="cnv-seq-derived-data.vcf", label="Derived Data File")
with open(os.path.join(main_path, "cnv-seq-derived-data.vcf"), "w+") as dvf:
    dvf.write("cnv-seq-derived-datav.vcf")

# Per-sample derived VCF file. It is written inside a context manager so the
# content is flushed and the handle closed before the MD5 is computed from disk.
cnvseq_drvdf = DataFile(filename="cnv-seq-data-{}.vcf".format(i), label="Derived Data File")
with open(os.path.join(main_path, "cnv-seq-data-{}.vcf".format(i)), "w+") as dvf:
    dvf.write("cnv-seq-data-{}.vcf".format(i))
# Annotate the derived file with its checksum (read from disk under main_path)
# and an "export" comment; update_checksum returns the object it was given, so
# updated_cnvseq_drvdf and cnvseq_drvdf are one and the same object.
cnv_drvdata_comment = Comment(name="export",value="yes")
updated_cnvseq_drvdf = update_checksum(main_path, cnvseq_drvdf, "md5")
updated_cnvseq_drvdf.comments.append(cnv_drvdata_comment)
# data-transformation process executing Protocol #14 ("CNV analysis"):
# raw fastq file in, derived VCF file out
cnv_da_process = Process(
name = "VCF-DT",
executes_protocol=study.protocols[14],
parameter_values=[cnv_sw],
inputs=[cnv_datafile],
outputs=[cnvseq_drvdf]
)
# register the four processes on the assay in workflow order
assay_cnv_seq.process_sequence.append(extraction_process_cnv_seq)
assay_cnv_seq.process_sequence.append(cnv_lib_process)
assay_cnv_seq.process_sequence.append(cnv_seq_process)
assay_cnv_seq.process_sequence.append(cnv_da_process)
# NOTE(review): the plink calls are commented out, so the processes are not
# linked to each other here.
# plink(sample_collection_gtx, extraction_process_cnv_seq)
# plink(extraction_process_cnv_seq, cnv_lib_process)
# plink(cnv_lib_process, cnv_seq_process)
# plink(cnv_seq_process, cnv_da_process)
#
# declare the extract characteristic category ("Stuff Type") on the assay
assay_cnv_seq.characteristic_categories.append(char_ext_cnv_seq.category)
# print(assay_cnv_seq.other_material[0].characteristics[0].value.term)
# Attach the assays to the study, in this order. Two assays are currently
# disabled:
# study.assays.append(assay)
# study.assays.append(assay_nmr_topo)
study.assays.extend([assay_nmr_metpro, assay_rna_seq, assay_cnv_seq])
# Protocol describing the integrative NMR/MS data-transformation workflow;
# it declares a single "software" parameter.
workflow_ref = Protocol(
    name="13C SIRM MS and NMR integrative analysis",
    protocol_type=OntologyAnnotation(term="data transformation"),
    description="a workflow for integrating data from NMR and MS acquisition into a consolidated result",
    uri="https://doi.org/10.1021/acs.analchem.1c01064",
    parameters=[
        ProtocolParameter(parameter_name=OntologyAnnotation(term="software")),
    ],
)
study.protocols.append(workflow_ref)

# Quick look at the first comment of the fifth ontology source reference and
# of the first assay.
print(investigation.ontology_source_references[4].comments[0])
print(study.assays[0].comments[0])
dump function
from isatools.isatab import dump
# Serialize the investigation as ISA-Tab under <main_path>/TAB. The flag asks
# for factor values to be written explicitly in the assay tables.
dump(
    investigation,
    os.path.join(main_path, 'TAB'),
    write_factor_values_in_assay_table=True,
)

from isatools.isatab import load

# Read the ISA-Tab investigation file back into an in-memory object.
with open(os.path.join(main_path, "TAB", "i_investigation.txt")) as isa_sirm_test:
    roundtrip = load(isa_sirm_test)

# Write the reloaded investigation to a separate folder; here the factor-value
# flag is switched off.
dump(
    roundtrip,
    os.path.join(main_path, 'TAB/BH23-ISATAB_FROM_TAB'),
    write_factor_values_in_assay_table=False,
)
from isatools.isajson.dump import ISAJSONEncoder
from json import dumps, loads

# Encode the in-memory investigation as an ISA-JSON string using the ISA
# encoder, then store it next to the other outputs in main_path.
inv_j = dumps(investigation, cls=ISAJSONEncoder)
print(main_path)
with open(os.path.join(main_path, 'isa-bh2023-all.json'), 'w') as out_fp:
    out_fp.write(inv_j)
validate function
from isatools import isatab
# Validate the ISA-Tab investigation file. The context manager guarantees the
# file handle is closed once validation has finished.
with open(os.path.join(main_path, "TAB", "i_investigation.txt")) as validation_fp:
    my_json_report_isa_flux = isatab.validate(validation_fp)
# Bare expression: displays the list of errors when run as a notebook cell.
my_json_report_isa_flux["errors"]
NOTE: The error report indicates the need to add new configuration files matching the assay definitions.
from isatools.isatab import load

# Reload the ISA-Tab investigation from disk.
with open(os.path.join(main_path, "TAB", "i_investigation.txt")) as isa_sirm_test:
    roundtrip = load(isa_sirm_test)

from isatools.convert import isatab2json
import json

# Convert the ISA-Tab folder to an ISA-JSON dictionary (validating first).
isa_json = isatab2json.convert(
    os.path.join(main_path, "TAB"),
    validate_first=True,
    use_new_parser=True,
)

# Inspect the first assay of the first study: its technology type, the name of
# the 11th process, and the names of all processes in the sequence.
print(isa_json["studies"][0]["assays"][0]["technologyType"]["annotationValue"])
print(isa_json["studies"][0]["assays"][0]["processSequence"][10]["name"])
print([
    process['name']
    for process in isa_json["studies"][0]["assays"][0]["processSequence"]
])
# Persist the converted ISA-JSON dictionary, then read it back and rebuild an
# Investigation object from it to check the JSON round trip.
output_path = os.path.join(main_path, 'JSON', 'isa-bh2023-t2j.json')
# Make sure the target folder exists; open(..., 'w') does not create it.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w') as out_fp:
    json.dump(isa_json, out_fp)

# Parse straight from the file object instead of reading it into a string first.
with open(output_path) as in_fp:
    new_investigation_dict = json.load(in_fp)

new_investigation = Investigation()
new_investigation.from_dict(new_investigation_dict)
print(new_investigation.studies[0].assays[0].process_sequence[0])
from isatools.convert import json2isatab

# Convert the ISA-JSON file written from the ISA-Tab conversion back to
# ISA-Tab, placing the result in JSON/BH23-ISATAB_FROM_JSON.
with open(os.path.join(main_path, 'JSON', 'isa-bh2023-t2j.json')) as in_fp:
    out_path = os.path.join(main_path, 'JSON', 'BH23-ISATAB_FROM_JSON')
    print(out_path)
    json2isatab.convert(in_fp, out_path)
from isatools.convert import json2isatab

# Convert <main_path>/isa-v2.json to ISA-Tab in JSON/BH23-ISATAB.
# NOTE(review): isa-v2.json is not written by the cells shown here --
# presumably it is provided beforehand; confirm.
with open(os.path.join(main_path, 'isa-v2.json')) as in_fp:
    out_path = os.path.join(main_path, 'JSON', 'BH23-ISATAB')
    json2isatab.convert(in_fp, out_path)