#!/usr/bin/env python
# coding: utf-8

# # Read related files
# 
# Non-standardized metadata files
# 
# 

# In[445]:


import os
# Directory that holds the non-standardized metadata input files.
input_dir = "./isa_container/input_files"
# List its entries (shown as cell output when run in the notebook).
os.listdir(input_dir)


# ## Sample identifiers from ACTION

# In[446]:


# Read the ACTION sample identifier table and keep only its first ten rows.
import pandas as pd
IDs_file_path = "./isa_container/input_files/IDs/ACTIONdemonstrator_XOmics_IDs_fake.csv"
IDs_df = pd.read_csv(IDs_file_path).iloc[:10]
# Quick sanity report: shape, column names and number of missing values per column.
print("Data frame dimensions\n", IDs_df.shape)
print("Column names\n", IDs_df.columns)
print("Missing value counts\n", IDs_df.isna().sum())
# `head` is a method: without the call parentheses only the bound-method repr
# is shown instead of the first rows of the table.
IDs_df.head()


# # Create ISA object

# In[447]:


# show the current working directory
# (bare expression: displayed as cell output in a notebook, nothing is printed
# when the file is run as a plain script)
import numpy as np
import os
os.getcwd()


# ## Investigation
# 
# Create a new ISA object

# In[450]:


# create a new, empty investigation (a study is appended to it further below)
from isatools.model import *
investigation = Investigation()


# In[452]:


# define title
investigation.title = (
    "X-omics data analysis, integration and stewardship demonstrator dataset: "
    "NTR ACTION omics data"
)
investigation.title


# In[401]:


# identifier is still a placeholder; the description states the project aim
investigation.identifier = "tbd"
investigation.description = (
    "Predict childhood aggression with multi-omics data and demonstrate "
    "the FAIRification process and data analysis of a multi-omics project"
)


# ## Study

# In[402]:


# add one study to investigation
# According to MetaboLights help site, the title should ideally be the same as
# for a corresponding manuscript.
investigation.studies.append(
    Study(
        filename="s_study.txt",
        identifier="tbd",  # TODO: add identifier; update title
        title=(
            "X-omics data analysis, integration and stewardship demonstrator dataset: "
            "NTR ACTION omics data"
        ),
    )
)
investigation.studies


# In[404]:


investigation.studies[0]


# ### Ontologies
# 
# Ontologies can be searched e.g. at http://www.ontobee.org/ or https://www.ebi.ac.uk/ols/index.
# 
# FAIR genomes lookups: https://github.com/fairgenomes/fairgenomes-semantic-model/tree/main/lookups

# In[405]:


# ontologies
# Ontology sources referenced by the annotations in this file, keyed by a
# lower-case short name. `name` holds the source abbreviation, `description`
# the full title of the ontology.
ontologies = {
    "afo": OntologySource(
        name = "AFO",
        description = "Allotrope Merged Ontology Suite"),
    "chebi": OntologySource(
        name = "CHEBI",
        description = "Chemical Entities of Biological Interest"),
    "chmo": OntologySource(
        name = "CHMO",
        description = "Chemical Methods Ontology"),
    "edam": OntologySource(
        name = "EDAM",
        description = "Bioinformatics operations, data types, formats, identifiers and topics"),
    "efo": OntologySource(
        name = "EFO",
        description = "Experimental Factor Ontology"),
    "ero": OntologySource(
        # abbreviation as source name, like every other entry; the full title
        # moved into the description
        name = "ERO",
        description = "eagle-i resource ontology: An ontology of research resources such as instruments, protocols, reagents, animal models and biospecimens"),
    "maxo": OntologySource(
        name = "MAXO",
        description = "Medical Action Ontology"),
    "msio": OntologySource(
        name = "MSIO",
        # MSI stands for Metabolomics (not Metabolite) Standards Initiative
        description = "Metabolomics Standards Initiative Ontology"),
    "ncbitaxon": OntologySource(
        name = "NCBITAXON",
        description = "NCBI organismal classification"),
    "ncit": OntologySource(
        name = "NCIT",
        description = "NCI Thesaurus OBO Edition"),
    "obi": OntologySource(
        name = "OBI",
        description = "Ontology for Biomedical Investigations"),
    "pato": OntologySource(
        name = "PATO",
        description = "PATO - the Phenotype And Trait Ontology"),
    "uberon": OntologySource(
        name = "UBERON",
        description = "Uber-anatomy ontology")
}
# add ontologies to investigation
for o in ontologies.values():
    investigation.ontology_source_references.append(o)


# ### Protocols

# Note that the protocol used in the process to derive `sample` from `source` MUST be of type 'sample collection' (see https://isa-specs.readthedocs.io/en/latest/isatab.html#study-table-file). 
# 
# - ISA model source: https://github.com/ISA-tools/isa-api/blob/master/isatools/model.py

# In[406]:


# Protocol parameters, keyed by parameter name, so that processes can refer to
# the very same ProtocolParameter object the protocol declares.
protocol_params = dict()
protocol_params["anatomical entity"] = ProtocolParameter(
    parameter_name=OntologyAnnotation(
        term="anatomical entity",
        term_source=ontologies["uberon"],
        term_accession="http://purl.obolibrary.org/obo/UBERON_0001062",
    )
)

# define sample collection protocol
# The protocol type MUST be 'sample collection'
# (see github.com/ISA-tools/isa-specs/blob/master/source/isatab.rst).
sample_collection_protocol = Protocol(
    name="sample collection",
    protocol_type=OntologyAnnotation(term="sample collection"),
    parameters=[protocol_params["anatomical entity"]],
)
investigation.studies[0].protocols.append(sample_collection_protocol)


# ### Metabolomics
# 
# See also https://ebi.ac.uk/metabolights/guides/Protocol/Protocol for protocols required for submission in MetaboLights, i.e. Sample collection, Extraction, Chromatography, Mass spectrometry, Data transformation, and Metabolite identification.

# ### Study factors

# In[407]:


# gender: study factor "genotypic sex" (term also used in FAIR genomes)
studyfactor_gender = StudyFactor(
    name="genotypic sex",
    factor_type=OntologyAnnotation(
        term="genotypic sex",
        term_source=ontologies["pato"],
        term_accession="http://purl.obolibrary.org/obo/PATO_0020000",
    ),
)

# One factor value per sex, female first, then male. Both values are NCIT
# terms (also used in FAIR genomes) and differ only in label and accession.
factorvalue_female, factorvalue_male = (
    FactorValue(
        factor_name=studyfactor_gender,
        value=OntologyAnnotation(
            term=genotype_label,
            term_source=ontologies["ncit"],
            term_accession=genotype_accession,
        ),
    )
    for genotype_label, genotype_accession in (
        ("XX Genotype", "http://purl.obolibrary.org/obo/NCIT_C45976"),
        ("XY Genotype", "http://purl.obolibrary.org/obo/NCIT_C45977"),
    )
)


# In[408]:


# aggressive behaviour assessment - T-scores
# Study factor for the aggression score; the attached comment points to the
# publication that describes how the T-score is derived.
studyfactor_aggression = StudyFactor(
    name = "aggression score",
    factor_type = OntologyAnnotation(
        term = "childhood aggressive behaviour measurement",
        term_source = ontologies["efo"],
        term_accession = "http://www.ebi.ac.uk/efo/EFO_0007663"),
    comments = [
        Comment(name = "T-score reference",
                # DOI prefix of Frontiers in Psychiatry is "fpsyt"; the former
                # "fpyst" spelling does not resolve
                value = "Age- and sex-specific Aggressive Behaviour T-score as described in Hagenbeek et al. https://doi.org/10.3389/fpsyt.2020.00165")])
# T-score value
class FactorValueAggressionScore(FactorValue):
    """FactorValue preset for the aggression score factor, measured as a T-score.

    All arguments are forwarded unchanged to ``FactorValue``; only the
    defaults differ: ``factor_name`` defaults to ``studyfactor_aggression``
    and ``unit`` to the NCIT "T-score" annotation.

    NOTE: both defaults are evaluated once, when the class is defined, so every
    instance created without an explicit ``factor_name``/``unit`` shares the
    same objects.
    """

    def __init__(
        self,
        factor_name=studyfactor_aggression,
        value=None,
        unit=OntologyAnnotation(
            term="T-score",
            term_source=ontologies["ncit"],
            term_accession="http://purl.obolibrary.org/obo/NCIT_C120401",
        ),
        comments=None,
    ):
        super(FactorValueAggressionScore, self).__init__(
            factor_name=factor_name,
            value=value,
            unit=unit,
            comments=comments,
        )


# ## Assays

# ### Genotyping

# In[409]:


# Genotyping assay. Measurement type and technology platform are not known yet,
# so they are filled with empty annotations; no ontology source or accession
# is given for any of the three annotations.
# NOTE(review): technology_platform may be expected to be a plain string by
# isatools - confirm against the installed version.
unspecified_source = {"term_source": "", "term_accession": ""}
assay_genotype = Assay(
    filename="a_assay_genotype.txt",
    measurement_type=OntologyAnnotation(term="", **unspecified_source),
    technology_type=OntologyAnnotation(
        term="nucleotide sequencing", **unspecified_source),
    technology_platform=OntologyAnnotation(term="", **unspecified_source),
)
# TODO: What sequencing platform has been used?
    

# In[410]:


# define extraction and measurement protocols
# DNA extraction (OBI term); registered with the study right away.
# TODO: check type; compare to FAIR genomes
dna_extraction_protocol = Protocol(
    name="DNA extraction",
    protocol_type=OntologyAnnotation(
        term="DNA extraction",
        term_source=ontologies["obi"],
        term_accession="http://purl.obolibrary.org/obo/OBI_0000257",
    ),
)
investigation.studies[0].protocols.append(dna_extraction_protocol)


# In[411]:


# Genotype profiling (EFO term "genotyping"); registered with the study as well.
# TODO: check type; compare to FAIR genomes
genotype_profiling_protocol = Protocol(
    name="genotype profiling",
    protocol_type=OntologyAnnotation(
        term="genotyping",
        term_source=ontologies["efo"],
        term_accession="http://www.ebi.ac.uk/efo/EFO_0000750",
    ),
)
investigation.studies[0].protocols.append(genotype_profiling_protocol)


# ### DNA methylation

# In[412]:


# DNA methylation assay: beta values measured by array-based methylation
# profiling on the Illumina Infinium MethylationEPIC BeadChip.
# NOTE(review): technology_platform may be expected to be a plain string by
# isatools - confirm against the installed version.
assay_methylation = Assay(
    filename="a_assay_methylation.txt",
    technology_platform=OntologyAnnotation(
        term="Illumina Infinium MethylationEPIC BeadChip",
        term_source=ontologies["obi"],
        term_accession="http://purl.obolibrary.org/obo/OBI_0002131",
    ),
    technology_type=OntologyAnnotation(
        term="DNA methylation profiling by array assay",
        term_source=ontologies["obi"],
        term_accession="http://purl.obolibrary.org/obo/OBI_0001332",
    ),
    measurement_type=OntologyAnnotation(
        term="Methylation Beta Value",
        term_source=ontologies["ncit"],
        term_accession="http://purl.obolibrary.org/obo/NCIT_C164051",
    ),
)


# In[413]:


# define extraction and measurement protocols
# A "DNA extraction" protocol is already registered with the study in the
# genotyping section. Reuse it instead of appending a second, identical
# protocol, so the protocol name stays unique within the study.
dna_extraction_protocol = next(
    (prtcl for prtcl in investigation.studies[0].protocols
     if prtcl.name == "DNA extraction"),
    None)
if dna_extraction_protocol is None:
    # nothing registered yet (e.g. this cell was run on its own): create it
    dna_extraction_protocol = Protocol(
        name = "DNA extraction",
        protocol_type = OntologyAnnotation(
            term = "DNA extraction",
            term_source = ontologies["obi"],
            term_accession = "http://purl.obolibrary.org/obo/OBI_0000257"))
    # TODO: check type; compare to FAIR genomes
    investigation.studies[0].protocols.append(dna_extraction_protocol)


# In[414]:


# Methylation profiling (EFO term); registered with the study.
# TODO: optionally list the Illumina Infinium MethylationEPIC BeadChip
# (OBI_0002131) as a protocol component.
methylation_profiling_protocol = Protocol(
    name="methylation profiling",
    protocol_type=OntologyAnnotation(
        term="methylation profiling",
        term_source=ontologies["efo"],
        term_accession="http://www.ebi.ac.uk/efo/EFO_0000751",
    ),
)
investigation.studies[0].protocols.append(methylation_profiling_protocol)


# In[415]:


# Processing of the methylation array data; description and URI cite the
# DNAmArray workflow on Zenodo.
methylation_data_processing_protocol = Protocol(
    name="methylation data processing protocol",
    protocol_type=OntologyAnnotation(
        term="Protocol",
        term_source=ontologies["edam"],
        term_accession="http://edamontology.org/data_2531",
    ),
    uri="http://doi.org/10.5281/zenodo.3355292",
    description=(
        "Sinke, Lucy, van Iterson, Maarten, Cats, Davy, Slieker, Roderick, & Heijmans, Bas. "
        "(2019, July 11). DNAmArray: Streamlined workflow for the quality control, "
        "normalization, and analysis of Illumina methylation array data (Version 2.1). "
        "Zenodo. http://doi.org/10.5281/zenodo.3355292"
    ),
)
investigation.studies[0].protocols.append(methylation_data_processing_protocol)


# ### Metabolomics

# In[416]:


# Amines assay: targeted metabolite profiling by LC-MS.
assay_metabolomics_amines = Assay(
    filename="a_assay_metabolomics_amines.txt",
    technology_type=OntologyAnnotation(
        term="liquid chromatography-mass spectrometry",
        term_source=ontologies["chmo"],
        term_accession="http://purl.obolibrary.org/obo/CHMO_0000524",
    ),
    measurement_type=OntologyAnnotation(
        term="targeted metabolite profiling",
        term_source=ontologies["msio"],
        term_accession="http://purl.obolibrary.org/obo/MSIO_0000100",
    ),
)
# Not recorded yet:
# - sample type ("urine specimen", OBI_0000651)
# - technology platform
# TODO: What exact platform/instrument has been used?


# In[417]:


# OA assay (presumably organic acids - confirm): targeted metabolite profiling
# by GC-MS.
assay_metabolomics_OA = Assay(
    filename = "a_assay_metabolomics_OA.txt",

    measurement_type = OntologyAnnotation(
        term = "targeted metabolite profiling",
        term_source = ontologies["msio"],
        term_accession = "http://purl.obolibrary.org/obo/MSIO_0000100"),

    technology_type = OntologyAnnotation(
        term = "gas chromatography-mass spectrometry",
        term_source = ontologies["chmo"],
        # accession was missing here, unlike in the amines and steroids assays
        term_accession = "http://purl.obolibrary.org/obo/CHMO_0000497"))

# Not recorded yet:
# - sample type ("urine specimen", OBI_0000651)
# - technology platform
# TODO: What exact platform/instrument has been used?


# In[418]:


# Steroids assay: targeted metabolite profiling by HPLC-MS.
assay_metabolomics_steroids = Assay(
    filename="a_assay_metabolomics_steroids.txt",
    technology_type=OntologyAnnotation(
        term="high-performance liquid chromatography-mass spectrometry",
        term_source=ontologies["chmo"],
        term_accession="http://purl.obolibrary.org/obo/CHMO_0000796",
    ),
    measurement_type=OntologyAnnotation(
        term="targeted metabolite profiling",
        term_source=ontologies["msio"],
        term_accession="http://purl.obolibrary.org/obo/MSIO_0000100",
    ),
)
# Not recorded yet:
# - sample type ("urine specimen", OBI_0000651)
# - technology platform
# TODO: What exact platform/instrument has been used?


# In[419]:


# Urine sampling protocol, typed with the SNOMED CT procedure
# "urine specimen collection" and registered with the study.
# No term source is set: SNOMED is not among the ontology sources defined
# in this file.
urine_sampling_protocol = Protocol(
    name = "urine sampling",
    protocol_type = OntologyAnnotation(
        term = 'urine specimen collection',  # was misspelled "speciment"
#         term_source = [''],
        term_accession = 'http://snomed.info/id/57617002')
)
# TODO: is this a useful ontology?


investigation.studies[0].protocols.append(urine_sampling_protocol)


# In[420]:


# Protocols of the metabolomics workflow. Each protocol only declares its
# parameter names; the values are given by the processes executing it.
# NOTE(review): none of these protocols is appended to
# investigation.studies[0].protocols in the part of the file reviewed here -
# confirm that this happens further down.
extraction_metabolomics = Protocol(
    name="Extraction",
    protocol_type=OntologyAnnotation(
        term="Extraction",
        term_source=ontologies["ncit"],
        term_accession="http://purl.obolibrary.org/obo/NCIT_C61575",
    ),
    parameters=[
        ProtocolParameter(parameter_name=OntologyAnnotation(term=param_term))
        for param_term in ("Post Extraction", "Derivatization")
    ],
)


# In[421]:


chromatography = Protocol(
    name="Chromatography",
    protocol_type=OntologyAnnotation(
        term="Chromatography",
        term_source=ontologies["ncit"],
        term_accession="http://purl.obolibrary.org/obo/NCIT_C16431",
    ),
    parameters=[
        ProtocolParameter(parameter_name=OntologyAnnotation(term=param_term))
        for param_term in (
            "Chromatography Instrument",
            "Column model",
            "Column type",
        )
    ],
)


# In[422]:


labelling_metabolites = Protocol(
    name="Labelling metabolites",
    protocol_type=OntologyAnnotation(
        term="Labelling",
        term_source=ontologies["chmo"],
        term_accession="http://purl.obolibrary.org/obo/CHMO_0001675",
    ),
)


# In[423]:


mass_spectrometry = Protocol(
    name="Mass spectrometry",
    protocol_type=OntologyAnnotation(
        term="Mass spectrometry",
        term_source=ontologies["ncit"],
        term_accession="http://purl.obolibrary.org/obo/NCIT_C17156",
    ),
    parameters=[
        ProtocolParameter(parameter_name=OntologyAnnotation(term=param_term))
        for param_term in (
            "Scan polarity",
            "Scan m/z range",
            "Instrument",
            "Ion source",
            "Mass analyzer",
        )
    ],
)


# In[424]:


data_transformation = Protocol(
    name="Data transformation",
    protocol_type=OntologyAnnotation(
        term="Data Transformation",
        term_source=ontologies["ncit"],
        term_accession="http://purl.obolibrary.org/obo/NCIT_C43582",
    ),
)


# In[425]:


# Metabolite identification protocol, typed with the Allotrope (AFO) process
# term "peak identification".
metabolite_identification = Protocol(
    name = "Metabolite identification",
    protocol_type = OntologyAnnotation(
        term = 'peak identification',
        term_source = ontologies["afo"],
        # host is purl.allotrope.org; the former ".erg" spelling was a typo
        term_accession = 'http://purl.allotrope.org/ontologies/process#AFP_0003618')
    )

# TODO: Is this the correct ontology (source)?


# ## ACTION samples
# 
# Add samples to study and link to previously defined protocols and assays.
# 
# For MetaboLights, sample information should include unique sample name, organism, organism part, sample type (control, QC, experimental sample), other descriptors as factors (age, gender).

# In[426]:


def _first_named(items, name):
    """Return the first element of *items* whose ``name`` equals *name*, else None."""
    return next((item for item in items if item.name == name), None)


def _register_collection(study, process_name, source, sample, body_site):
    """Record *sample* as an output of the collection process *process_name*.

    If *study* already has a process with that name, *sample* is appended to
    its outputs. Otherwise a new process executing the sample collection
    protocol (input *source*, output *sample*, "anatomical entity" parameter
    set to *body_site*) is created and appended to the study's process
    sequence. The process is returned in both cases.
    """
    process = _first_named(study.process_sequence, process_name)
    if process:
        process.outputs.append(sample)
        return process
    process = Process(
        name=process_name,
        executes_protocol=sample_collection_protocol,
        parameter_values=[
            ParameterValue(
                category=protocol_params["anatomical entity"],  # ProtocolParameter
                value=body_site)],
        inputs=[source],
        outputs=[sample])
    study.process_sequence.append(process)
    return process


# Rows can contain duplicate entries, so sources, samples and collection
# processes are looked up by name and only created when missing.
# TODO: issue - source should represent a source material such as urine,
#   and sample a respective extract or similar
action_study = investigation.studies[0]

for idx, row in IDs_df.iterrows():
    # --- subject (source) ---------------------------------------------------
    source_name = row["XOmicsPhenoID"]
    source = _first_named(action_study.sources, source_name)
    if not source:
        source = Source(name=source_name)
        source.characteristics.extend([
            # Organism - should be included for MetaboLights; defined per
            # source, i.e. per individual.
            # TODO: add term source and accession for category? would such
            # information be lost in ISA-Tab?
            Characteristic(
                category=OntologyAnnotation(term="organism"),
                value=OntologyAnnotation(
                    term="Homo sapiens",
                    term_source=ontologies["ncbitaxon"],
                    term_accession="http://purl.bioontology.org/ontology/NCBITAXON/9606")),
            # TODO: check if family ID should/should not be added / is
            # required for analysis
            Characteristic(
                category="family ID",
                value=row["XOmicsFamID"]),
            # Aggression T-score; the value is left empty for now.
            # TODO: alternatively attach it as a factor value
            # (FactorValueAggressionScore).
            Characteristic(
                category=OntologyAnnotation(
                    term="childhood aggressive behaviour measurement",
                    term_source=ontologies["efo"],
                    term_accession="http://www.ebi.ac.uk/efo/EFO_0007663"),
                value="",
                unit=OntologyAnnotation(
                    term="T-score",
                    term_source=ontologies["ncit"],
                    term_accession="http://purl.obolibrary.org/obo/NCIT_C120401")),
        ])
        # add subject to study
        action_study.sources.append(source)

    # --- urine sample for metabolomics (sample names need to be unique) ------
    if not pd.isna(row["XOmicsmetaboID"]):
        urine_sample_name = "urine_{}".format(source_name)
        urine_sample = _first_named(action_study.samples, urine_sample_name)
        if not urine_sample:
            urine_sample = Sample(
                name=urine_sample_name,
                derives_from=[source])  # the individual
            urine_sample.characteristics.extend([
                # Organism part - should be included for MetaboLights;
                # defined per sample.
                Characteristic(
                    category=OntologyAnnotation(term="organism part"),
                    value=OntologyAnnotation(
                        term="urine",
                        term_source=ontologies["uberon"],
                        term_accession="http://purl.obolibrary.org/obo/UBERON_0001088")),
                # Sample type (control, QC, experimental sample) - should be
                # included for MetaboLights.
                # TODO: could not find a term yet; sample type is not an
                # ontological term, but required by MetaboLights
                Characteristic(
                    category="sample type",
                    value=OntologyAnnotation(
                        term="experimental sample",
                        term_source=ontologies["chmo"],
                        term_accession="http://purl.obolibrary.org/obo/CHMO_0002746")),
            ])
            action_study.samples.append(urine_sample)

            # link source and sample through the urine collection process
            urine_p_name = "urine_specimen_collection_process_{}".format(source.name)
            urine_collection_process = _register_collection(
                action_study, urine_p_name, source, urine_sample, "urine")

    # --- buccal swab sample for genotyping and DNA methylation arrays --------
    if not (pd.isna(row["XOmicsGenoID"]) and pd.isna(row["XOmicsMethylID"])):
        buccal_sample_name = "buccal_mucosa_{}".format(source_name)
        buccal_sample = _first_named(action_study.samples, buccal_sample_name)
        if not buccal_sample:
            buccal_sample = Sample(
                name=buccal_sample_name,
                derives_from=[source])  # same source as urine sample
            # TODO: add more characteristics - compare to urine sample
            buccal_sample.characteristics.append(
                Characteristic(
                    category=OntologyAnnotation(term="organism part"),
                    value=""))
            action_study.samples.append(buccal_sample)

            # Looked up by name because multiple samples can be derived from
            # one source.
            buccal_p_name = "buccal_specimen_collection_process_{}".format(source.name)
            buccal_collection_process = _register_collection(
                action_study, buccal_p_name, source, buccal_sample, "buccal mucosa")

            # NOTE: adding the extraction process at study level doesn't seem
            # to work; it causes the sample to disappear from the study file.
            # DNA extraction and profiling are therefore added per assay.
            

# Genotype assay

# In[427]:


# add samples to genotype assay
# For every row with a genotype ID, the subject's buccal sample is added to the
# assay once, followed by a DNA extraction and a genotype profiling process.
for idx, row in IDs_df.iterrows():
    source_name = row["XOmicsPhenoID"]
    geno_id = row["XOmicsGenoID"]
    if pd.isna(geno_id):
        continue  # no genotype data for this row

    buccal_sample_name = "buccal_mucosa_{}".format(source_name)
    buccal_sample = next(
        (smpl for smpl in investigation.studies[0].samples
         if smpl.name == buccal_sample_name),
        None)
    if any(smpl.name == buccal_sample_name for smpl in assay_genotype.samples):
        continue  # sample (and its processes) already added to the assay

    assay_genotype.samples.append(buccal_sample)

    # DNA is defined as material extracted from the buccal mucosa sample.
    # The extraction is modelled on assay level here.
    # TODO: the same DNA is used for genotyping AND DNA methylation profiling,
    # so study level would be the natural place; Study has a process_sequence,
    # i.e. this is technically possible - check if it works.
    buccal_dna = Material(
        name="buccal_DNA_{}".format(geno_id),
        type_="Extract Name")
    dna_extraction_process = Process(
        name="DNA_extraction_{}".format(geno_id),
        executes_protocol=dna_extraction_protocol,
        inputs=[buccal_sample],
        outputs=[buccal_dna])
    genotype_profiling_process = Process(
        name="genotype_profiling_{}".format(geno_id),
        executes_protocol=genotype_profiling_protocol,
        inputs=[buccal_dna])

    # chain extraction -> profiling and register both with the assay
    plink(dna_extraction_process, genotype_profiling_process)
    assay_genotype.process_sequence.append(dna_extraction_process)
    assay_genotype.process_sequence.append(genotype_profiling_process)


# In[428]:


# add samples to methylation assay
# For every row with a methylation ID, the subject's buccal sample is added to
# the assay once, followed by a DNA extraction and a methylation profiling
# process.
for idx, row in IDs_df.iterrows():
    source_name = row["XOmicsPhenoID"]
    methyl_id = row["XOmicsMethylID"]
    if pd.isna(methyl_id):
        continue  # no methylation data for this row

    buccal_sample_name = "buccal_mucosa_{}".format(source_name)
    buccal_sample = next(
        (smpl for smpl in investigation.studies[0].samples
         if smpl.name == buccal_sample_name),
        None)
    if any(smpl.name == buccal_sample_name for smpl in assay_methylation.samples):
        continue  # sample (and its processes) already added to the assay

    assay_methylation.samples.append(buccal_sample)

    # DNA is defined as material extracted from the buccal mucosa sample.
    # The extraction is modelled on assay level here.
    # TODO: the same DNA is used for genotyping AND DNA methylation profiling,
    # so study level would be the natural place; Study has a process_sequence,
    # i.e. this is technically possible - check if it works.
    buccal_dna = Material(
        name="buccal_DNA_{}".format(methyl_id),
        type_="Extract Name")
    dna_extraction_process = Process(
        name="DNA_extraction_{}".format(methyl_id),
        executes_protocol=dna_extraction_protocol,
        inputs=[buccal_sample],
        outputs=[buccal_dna])
    methylation_profiling_process = Process(
        name="methylation_profiling_{}".format(methyl_id),
        executes_protocol=methylation_profiling_protocol,
        inputs=[buccal_dna])

    # chain extraction -> profiling and register both with the assay
    plink(dna_extraction_process, methylation_profiling_process)
    assay_methylation.process_sequence.append(dna_extraction_process)
    assay_methylation.process_sequence.append(methylation_profiling_process)


# Metabolomics assays

# In[429]:


# add samples, processes and datafiles to metabolomics Amines assay

Assay = assay_metabolomics_amines

# Define datafiles (not all may be relevant)

raw_datafile = DataFile(filename="link/to/raw/data", label="Raw Spectral Data File")

normalized_datafile = DataFile(filename="link/to/normalized_data", label="Normalization Name")

derived_spectral_data_file = DataFile(filename="link/to/spectral_file", label="Derived Spectral Data File")

Data_Transformation_Name = DataFile(filename="link/to/data_transformation_name", label="Data Transformation Name")

MAF = DataFile(filename="link/to/MAF", label="Metabolite Assignment File")


# Build the amines workflow (extraction -> labelling -> chromatography ->
# mass spectrometry -> data transformation -> metabolite identification)
# for every row of IDs_df that carries a metabolomics ID.
#
# NOTE: `raw_datafile` and `normalized_datafile` are not defined in this part
# of the cell; they must already exist when the loop runs. All samples share
# the same five DataFile objects, so each file is registered on the assay only
# once instead of once per sample.
for idx, row in IDs_df.iterrows():
    source_name = row["XOmicsPhenoID"]
    metabo_id = row["XOmicsmetaboID"]
    if pd.isna(metabo_id):
        # No metabolomics ID for this row: nothing to add.
        continue

    urine_sample_name = "urine_{0}".format(source_name)
    urine_sample = next(
        (smpl for smpl in investigation.studies[0].samples
         if smpl.name == urine_sample_name), None)
    if urine_sample is None:
        # Without this guard, None would be stored in Assay.samples and used
        # as a process input, and the name lookup below would fail with an
        # AttributeError on the next iteration.
        print("WARNING: study sample '{0}' not found; skipping amines assay entry".format(urine_sample_name))
        continue

    metabolomics_sample = next(
        (smpl for smpl in Assay.samples
         if smpl.name == urine_sample_name), None)
    if metabolomics_sample:
        # Sample was already added to this assay (e.g. the cell was re-run).
        continue

    ## Extraction
    Post_extraction = ParameterValue(
        category=ProtocolParameter(parameter_name=OntologyAnnotation(term="Post Extraction")),
        value=OntologyAnnotation(term="1 uL borate buffer (pH 8.8) with AQC reagent"))
    Derivatization = ParameterValue(
        category=ProtocolParameter(parameter_name=OntologyAnnotation(term="Derivatization")),
        value="AQC")

    material_extract = Material(
        name="extract_{0}".format(metabo_id),
        type_="Extract Name")

    extraction_process = Process(
        executes_protocol=extraction_metabolomics,
        parameter_values=[Post_extraction, Derivatization],
        inputs=[urine_sample],
        outputs=[material_extract])

    ## Labelling
    material_label = Material(
        name="labeled_{0}".format(metabo_id),
        type_="Labeled Extract Name")

    labelling_process = Process(
        executes_protocol=labelling_metabolites,
        inputs=[material_extract],
        outputs=[material_label])

    ## Chromatography
    # NOTE: chromatography has no declared output and mass spectrometry no
    # declared input; the two steps are connected only through plink() below.
    # (An intermediate "separated molecules" material was disabled earlier.)
    chromatography_instrument = ParameterValue(
        category=ProtocolParameter(parameter_name=OntologyAnnotation(term="Chromatography Instrument")),
        value="Agilent 1290 Infinity II")
    column_model = ParameterValue(
        category=ProtocolParameter(parameter_name=OntologyAnnotation(term="Column model")),
        value="Accq-Tag Ultra column (waters + FURHTER SPECS?)")
    column_type = ParameterValue(
        category=ProtocolParameter(parameter_name=OntologyAnnotation(term="Column type")),
        value="reverse phase")

    chromatography_process = Process(
        name="chromatography_{0}".format(metabo_id),
        executes_protocol=chromatography,
        parameter_values=[chromatography_instrument, column_model, column_type],
        inputs=[material_label],
        outputs=[])

    ## Mass spectrometry
    scan_polarity = ParameterValue(
        category=ProtocolParameter(parameter_name=OntologyAnnotation(term="Scan polarity")),
        value="positive")
    scan_range = ParameterValue(
        category=ProtocolParameter(parameter_name=OntologyAnnotation(term="Scan m/z range")),
        value="5-2000?")
    ms_instrument = ParameterValue(
        category=ProtocolParameter(parameter_name=OntologyAnnotation(term="Instrument")),
        value="AB SCIEX Qtrap 6500")
    ion_source = ParameterValue(
        category=ProtocolParameter(parameter_name=OntologyAnnotation(term="Ion source")),
        value="ESI")
    mass_analyzer = ParameterValue(
        category=ProtocolParameter(parameter_name=OntologyAnnotation(term="Mass Analyzer")),
        value="triple quadrupole linear ion trap")

    mass_spectrometry_process = Process(
        name="mass_spectrometry_{0}".format(metabo_id),
        executes_protocol=mass_spectrometry,
        parameter_values=[scan_polarity, scan_range, ms_instrument, ion_source, mass_analyzer],
        inputs=[],
        outputs=[raw_datafile])

    ## Data transformation
    data_transformation_process = Process(
        name="data_transformation_{0}".format(metabo_id),
        executes_protocol=data_transformation,
        inputs=[raw_datafile],
        outputs=[normalized_datafile, derived_spectral_data_file])

    ## Metabolite identification
    metabolite_identification_process = Process(
        name="metabolite_identification_{0}".format(metabo_id),
        executes_protocol=metabolite_identification,
        inputs=[normalized_datafile],
        outputs=[Data_Transformation_Name, MAF])

    # Link processes in workflow order
    plink(extraction_process, labelling_process)
    plink(labelling_process, chromatography_process)
    plink(chromatography_process, mass_spectrometry_process)
    plink(mass_spectrometry_process, data_transformation_process)
    plink(data_transformation_process, metabolite_identification_process)

    # Add sample and materials to the amines assay
    Assay.samples.append(urine_sample)
    Assay.other_material.append(material_extract)
    Assay.other_material.append(material_label)

    # Register the shared data files once, not once per sample
    for shared_file in (raw_datafile, normalized_datafile,
                        derived_spectral_data_file,
                        Data_Transformation_Name, MAF):
        if shared_file not in Assay.data_files:
            Assay.data_files.append(shared_file)

    # Add processes to the amines assay
    Assay.process_sequence.append(extraction_process)
    Assay.process_sequence.append(labelling_process)
    Assay.process_sequence.append(chromatography_process)
    Assay.process_sequence.append(mass_spectrometry_process)
    Assay.process_sequence.append(data_transformation_process)
    Assay.process_sequence.append(metabolite_identification_process)


# In[430]:


# add samples, processes and datafiles to metabolomics OA assay

Assay = assay_metabolomics_OA

# Define datafiles (not all may be relevant)

raw_datafile = DataFile(filename="link/to/raw/data", label="Raw Spectral Data File")

normalized_datafile = DataFile(filename="link/to/normalized_data", label="Normalization Name")

derived_spectral_data_file = DataFile(filename="link/to/spectral_file", label="Derived Spectral Data File")

Data_Transformation_Name = DataFile(filename="link/to/data_transformation_name", label="Data Transformation Name")

MAF = DataFile(filename="link/to/MAF", label="Metabolite Assignment File")


# Build the OA workflow (extraction -> labelling -> chromatography ->
# mass spectrometry -> data transformation -> metabolite identification)
# for every row of IDs_df that carries a metabolomics ID.
#
# All samples share the five DataFile objects defined above, so each file is
# registered on the assay only once instead of once per sample.
for idx, row in IDs_df.iterrows():
    source_name = row["XOmicsPhenoID"]
    metabo_id = row["XOmicsmetaboID"]
    if pd.isna(metabo_id):
        # No metabolomics ID for this row: nothing to add.
        continue

    urine_sample_name = "urine_{0}".format(source_name)
    urine_sample = next(
        (smpl for smpl in investigation.studies[0].samples
         if smpl.name == urine_sample_name), None)
    if urine_sample is None:
        # Without this guard, None would be stored in Assay.samples and used
        # as a process input, and the name lookup below would fail with an
        # AttributeError on the next iteration.
        print("WARNING: study sample '{0}' not found; skipping OA assay entry".format(urine_sample_name))
        continue

    metabolomics_sample = next(
        (smpl for smpl in Assay.samples
         if smpl.name == urine_sample_name), None)
    if metabolomics_sample:
        # Sample was already added to this assay (e.g. the cell was re-run).
        continue

    Assay.samples.append(urine_sample)

    ## Extraction
    Post_extraction = ParameterValue(
        category=ProtocolParameter(parameter_name=OntologyAnnotation(term="Post Extraction")),
        value="1 uL pyridine")
    Derivatization = ParameterValue(
        category=ProtocolParameter(parameter_name=OntologyAnnotation(term="Derivatization")),
        value="oximation followed by silylation")

    material_extract = Material(
        name="extract_{0}".format(metabo_id),
        type_="Extract Name")

    extraction_process = Process(
        executes_protocol=extraction_metabolomics,
        parameter_values=[Post_extraction, Derivatization],
        inputs=[urine_sample],
        outputs=[material_extract])

    ## Labelling
    material_label = Material(
        name="labeled_{0}".format(metabo_id),
        type_="Labeled Extract Name")

    labelling_process = Process(
        executes_protocol=labelling_metabolites,
        inputs=[material_extract],
        outputs=[material_label])

    ## Chromatography
    # NOTE: chromatography has no declared output and mass spectrometry no
    # declared input; the two steps are connected only through plink() below.
    # (An intermediate "separated molecules" material was disabled earlier.)
    chromatography_instrument = ParameterValue(
        category=ProtocolParameter(parameter_name=OntologyAnnotation(term="Chromatography Instrument")),
        value="Agilent Technologies 7890A")
    column_model = ParameterValue(
        category=ProtocolParameter(parameter_name=OntologyAnnotation(term="Column model")),
        value="HP-5MS UI (5% Phenyl Methyl Silox), 30 m x 0.25 m ID column with a film thickness of 25 um")
    column_type = ParameterValue(
        category=ProtocolParameter(parameter_name=OntologyAnnotation(term="Column type")),
        value="low polarity")

    chromatography_process = Process(
        name="chromatography_{0}".format(metabo_id),
        executes_protocol=chromatography,
        parameter_values=[chromatography_instrument, column_model, column_type],
        inputs=[material_label],
        outputs=[])

    ## Mass spectrometry
    scan_polarity = ParameterValue(
        category=ProtocolParameter(parameter_name=OntologyAnnotation(term="Scan polarity")),
        value="positive")
    scan_range = ParameterValue(
        category=ProtocolParameter(parameter_name=OntologyAnnotation(term="Scan m/z range")),
        value="50-500")
    ms_instrument = ParameterValue(
        category=ProtocolParameter(parameter_name=OntologyAnnotation(term="Instrument")),
        value="Agilent Technologies mass selective detector (MSD 5975C) and MultiPurpose Sampler (MPS, MXY016-02A, GERSTEL)")
    ion_source = ParameterValue(
        category=ProtocolParameter(parameter_name=OntologyAnnotation(term="Ion source")),
        value="EI (70 eV)")
    mass_analyzer = ParameterValue(
        category=ProtocolParameter(parameter_name=OntologyAnnotation(term="Mass Analyzer")),
        value="single-quadrupole")

    mass_spectrometry_process = Process(
        name="mass_spectrometry_{0}".format(metabo_id),
        executes_protocol=mass_spectrometry,
        parameter_values=[scan_polarity, scan_range, ms_instrument, ion_source, mass_analyzer],
        inputs=[],
        outputs=[raw_datafile])

    ## Data transformation
    data_transformation_process = Process(
        name="data_transformation_{0}".format(metabo_id),
        executes_protocol=data_transformation,
        inputs=[raw_datafile],
        outputs=[normalized_datafile, derived_spectral_data_file])

    ## Metabolite identification
    metabolite_identification_process = Process(
        name="metabolite_identification_{0}".format(metabo_id),
        executes_protocol=metabolite_identification,
        inputs=[normalized_datafile],
        outputs=[Data_Transformation_Name, MAF])

    ## Link processes in workflow order
    plink(extraction_process, labelling_process)
    plink(labelling_process, chromatography_process)
    plink(chromatography_process, mass_spectrometry_process)
    plink(mass_spectrometry_process, data_transformation_process)
    plink(data_transformation_process, metabolite_identification_process)

    ## Add materials to the OA assay
    Assay.other_material.append(material_extract)
    Assay.other_material.append(material_label)

    ## Register the shared data files once, not once per sample
    for shared_file in (raw_datafile, normalized_datafile,
                        derived_spectral_data_file,
                        Data_Transformation_Name, MAF):
        if shared_file not in Assay.data_files:
            Assay.data_files.append(shared_file)

    ## Add processes to the OA assay
    Assay.process_sequence.append(extraction_process)
    Assay.process_sequence.append(labelling_process)
    Assay.process_sequence.append(chromatography_process)
    Assay.process_sequence.append(mass_spectrometry_process)
    Assay.process_sequence.append(data_transformation_process)
    Assay.process_sequence.append(metabolite_identification_process)


# In[432]:


# add samples, processes and datafiles to metabolomics steroids assay

Assay = assay_metabolomics_steroids

# Define datafiles (not all may be relevant)

raw_datafile = DataFile(filename="link/to/raw/data", label="Raw Spectral Data File")

normalized_datafile = DataFile(filename="link/to/normalized_data", label="Normalization Name")

derived_spectral_data_file = DataFile(filename="link/to/spectral_file", label="Derived Spectral Data File")

Data_Transformation_Name = DataFile(filename="link/to/data_transformation_name", label="Data Transformation Name")

MAF = DataFile(filename="link/to/MAF", label="Metabolite Assignment File")


# Build the steroids workflow (extraction -> labelling -> chromatography ->
# mass spectrometry -> data transformation -> metabolite identification)
# for every row of IDs_df that carries a metabolomics ID.
#
# All samples share the five DataFile objects defined above, so each file is
# registered on the assay only once instead of once per sample.
for idx, row in IDs_df.iterrows():
    source_name = row["XOmicsPhenoID"]
    metabo_id = row["XOmicsmetaboID"]
    if pd.isna(metabo_id):
        # No metabolomics ID for this row: nothing to add.
        continue

    urine_sample_name = "urine_{0}".format(source_name)
    urine_sample = next(
        (smpl for smpl in investigation.studies[0].samples
         if smpl.name == urine_sample_name), None)
    if urine_sample is None:
        # Without this guard, None would be stored in Assay.samples and used
        # as a process input, and the name lookup below would fail with an
        # AttributeError on the next iteration.
        print("WARNING: study sample '{0}' not found; skipping steroids assay entry".format(urine_sample_name))
        continue

    metabolomics_sample = next(
        (smpl for smpl in Assay.samples
         if smpl.name == urine_sample_name), None)
    if metabolomics_sample:
        # Sample was already added to this assay (e.g. the cell was re-run).
        continue

    Assay.samples.append(urine_sample)

    ## Extraction
    Post_extraction = ParameterValue(
        category=ProtocolParameter(parameter_name=OntologyAnnotation(term="Post Extraction")),
        value="1 uL filtered urine")
    Derivatization = ParameterValue(
        category=ProtocolParameter(parameter_name=OntologyAnnotation(term="Derivatization")),
        value="NA")

    material_extract = Material(
        name="extract_{0}".format(metabo_id),
        type_="Extract Name")

    extraction_process = Process(
        executes_protocol=extraction_metabolomics,
        parameter_values=[Post_extraction, Derivatization],
        inputs=[urine_sample],
        outputs=[material_extract])

    ## Labelling
    material_label = Material(
        name="labeled_{0}".format(metabo_id),
        type_="Labeled Extract Name")

    labelling_process = Process(
        executes_protocol=labelling_metabolites,
        inputs=[material_extract],
        outputs=[material_label])

    ## Chromatography
    # NOTE: chromatography has no declared output and mass spectrometry no
    # declared input; the two steps are connected only through plink() below.
    # (An intermediate "separated molecules" material was disabled earlier.)
    chromatography_instrument = ParameterValue(
        category=ProtocolParameter(parameter_name=OntologyAnnotation(term="Chromatography Instrument")),
        value="Agilent 1290")
    column_model = ParameterValue(
        category=ProtocolParameter(parameter_name=OntologyAnnotation(term="Column model")),
        value="Acquity UPLC CSH C18 column (Waters)")
    column_type = ParameterValue(
        category=ProtocolParameter(parameter_name=OntologyAnnotation(term="Column type")),
        value="reverse phase")

    chromatography_process = Process(
        name="chromatography_{0}".format(metabo_id),
        executes_protocol=chromatography,
        parameter_values=[chromatography_instrument, column_model, column_type],
        inputs=[material_label],
        outputs=[])

    ## Mass spectrometry
    scan_polarity = ParameterValue(
        category=ProtocolParameter(parameter_name=OntologyAnnotation(term="Scan polarity")),
        value="switching positive and negative ion mode !! MAYBE SERPARATE INTO NEGATIVE AND POSITIVE ASSAY?")
    scan_range = ParameterValue(
        category=ProtocolParameter(parameter_name=OntologyAnnotation(term="Scan m/z range")),
        value="5-3000?")
    ms_instrument = ParameterValue(
        category=ProtocolParameter(parameter_name=OntologyAnnotation(term="Instrument")),
        value="Agilent 6460")
    ion_source = ParameterValue(
        category=ProtocolParameter(parameter_name=OntologyAnnotation(term="Ion source")),
        value="ESI")
    mass_analyzer = ParameterValue(
        category=ProtocolParameter(parameter_name=OntologyAnnotation(term="Mass Analyzer")),
        value="triple quadrupole")

    mass_spectrometry_process = Process(
        name="mass_spectrometry_{0}".format(metabo_id),
        executes_protocol=mass_spectrometry,
        parameter_values=[scan_polarity, scan_range, ms_instrument, ion_source, mass_analyzer],
        inputs=[],
        outputs=[raw_datafile])

    ## Data transformation
    data_transformation_process = Process(
        name="data_transformation_{0}".format(metabo_id),
        executes_protocol=data_transformation,
        inputs=[raw_datafile],
        outputs=[normalized_datafile, derived_spectral_data_file])

    ## Metabolite identification
    metabolite_identification_process = Process(
        name="metabolite_identification_{0}".format(metabo_id),
        executes_protocol=metabolite_identification,
        inputs=[normalized_datafile],
        outputs=[Data_Transformation_Name, MAF])

    ## Link processes in workflow order
    plink(extraction_process, labelling_process)
    plink(labelling_process, chromatography_process)
    plink(chromatography_process, mass_spectrometry_process)
    plink(mass_spectrometry_process, data_transformation_process)
    plink(data_transformation_process, metabolite_identification_process)

    ## Add materials to the steroids assay
    Assay.other_material.append(material_extract)
    Assay.other_material.append(material_label)

    ## Register the shared data files once, not once per sample
    for shared_file in (raw_datafile, normalized_datafile,
                        derived_spectral_data_file,
                        Data_Transformation_Name, MAF):
        if shared_file not in Assay.data_files:
            Assay.data_files.append(shared_file)

    ## Add processes to the steroids assay
    Assay.process_sequence.append(extraction_process)
    Assay.process_sequence.append(labelling_process)
    Assay.process_sequence.append(chromatography_process)
    Assay.process_sequence.append(mass_spectrometry_process)
    Assay.process_sequence.append(data_transformation_process)
    Assay.process_sequence.append(metabolite_identification_process)


# In[433]:


# Attach all five assays to the single study, in this order:
# genotype, methylation, then the three metabolomics assays.
investigation.studies[0].assays.extend([
    assay_genotype,
    assay_methylation,
    assay_metabolomics_amines,
    assay_metabolomics_OA,
    assay_metabolomics_steroids,
])


# # Write ISA-Tab files

# In[434]:


# Create the ISA output directory; exist_ok makes this a no-op when the
# directory is already there, without a separate check-then-create step.
out_dir = "isa_template"
os.makedirs(out_dir, exist_ok=True)

# Write the investigation to ISA-Tab files inside out_dir.
from isatools import isatab
isatab.dump(investigation, out_dir)
# Emit an empty line after the dump.
# NOTE(review): presumably kept so the notebook cell does not end on the
# dump() call and echo its return value; confirm before removing.
print()


# In[435]:


import json
from isatools.isajson import ISAJSONEncoder

# Serialize the whole investigation as pretty-printed ISA-JSON
# (sorted keys, 4-space indentation) and print it.
investigation_json = json.dumps(
    investigation,
    cls=ISAJSONEncoder,
    sort_keys=True,
    indent=4,
    separators=(',', ': '),
)
print(investigation_json)


# In[ ]: