#!/usr/bin/env python
# coding: utf-8

# # ToxicoDB - A Toxicogenomics Brick
# [The toxicodb brick](https://github.com/biobricks-ai/toxicodb) provides data from [toxicodb.ca](https://www.toxicodb.ca/). In this post, we'll use BioBricks.ai to look up gene and compound interaction data. To start, install [biobricks.ai](https://docs.biobricks.ai) and then `toxicodb`:

# In[ ]:


import biobricks as bb, pyspark, subprocess, pandas as pd
import pyspark, pyspark.sql, pyspark.sql.functions as F, pyspark.sql.types as T

# Install the toxicodb brick by invoking the BioBricks CLI through the shell.
subprocess.run("biobricks install toxicodb", shell=True)

# Get (or reuse) a Spark session with the driver memory option set to 4g.
spark = (
    pyspark.sql.SparkSession.builder
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

# Handle to the brick's assets; its attributes are read below as parquet paths.
toxicodb = bb.assets('toxicodb')


# In[2]:


# Count the rows of every parquet table exposed as an attribute of the
# toxicodb assets object, then list the tables from largest to smallest.
tbls = [
    {"table": name, "count": spark.read.parquet(location).count()}
    for name, location in vars(toxicodb).items()
]
pd.DataFrame(tbls).sort_values(by="count", ascending=False)


# In[3]:


# Load the TGGATEsHuman parquet table and average `expression` for every
# (compound_name, gene_symbol, dose) combination, dropping rows whose gene
# symbol is the empty string.
rawhmn = spark.read.parquet(toxicodb.TGGATEsHuman_TGGATEsHuman_parquet)
exphmn = (
    rawhmn
    .where(F.col("gene_symbol") != '')
    .groupBy("compound_name", "gene_symbol", "dose")
    .agg(F.mean('expression').alias('exp'))
    .orderBy("gene_symbol", "dose")
)
# Preview the first five aggregated rows as a pandas DataFrame.
exphmn.limit(5).toPandas()


# In[4]:


# Window over each (compound, gene) pair, ordered by dose.
# NOTE(review): this assumes sorting on `dose` runs from the lowest to the
# highest dose level — confirm against the column's type/encoding.
window_spec = pyspark.sql.Window.partitionBy("compound_name", "gene_symbol").orderBy("dose")


def _dose_span(arr):
    """Return the last element of `arr` minus the first, or None when empty."""
    # collect_list skips nulls, so a pair whose `exp` values are all null
    # produces an empty list; indexing it would raise IndexError inside the
    # UDF and abort the whole Spark job. Emit a null for that pair instead.
    if not arr:
        return None
    return arr[-1] - arr[0]


udf_change = F.udf(_dose_span, T.FloatType())

# Collect the dose-ordered `exp` values per (compound, gene) pair and reduce
# them to a single expression-change number, then bring the result to pandas.
# NOTE(review): with an ordered window, collect_list produces a running list
# on each row; taking the max of those arrays is presumably meant to select
# the complete list (the shorter ones being prefixes of it) — confirm.
hmn = (
    exphmn
    .withColumn("expressions", F.collect_list("exp").over(window_spec))
    .groupBy("compound_name", "gene_symbol")
    .agg(F.max("expressions").alias("expressions"))
    .withColumn("expr_change", udf_change(F.col("expressions")))
    .select("compound_name", "gene_symbol", "expr_change")
    .toPandas()
)
hmn.head()


# In[5]:


import plotly.express as px
from scipy.cluster.hierarchy import linkage, leaves_list
from scipy.spatial.distance import pdist

# Keep only the 200 genes whose expression change has the largest variance.
highvar_genes = hmn.groupby('gene_symbol')['expr_change'].var().nlargest(200)
hvdf = hmn.loc[hmn['gene_symbol'].isin(highvar_genes.index)]

# Reshape to a compound-by-gene matrix; missing entries become 0.
pivot_df = hvdf.pivot(index="compound_name", columns="gene_symbol", values="expr_change")
pivot_df = pivot_df.fillna(0)

# Average-linkage hierarchical clustering on Euclidean distances, once over
# the rows (compounds) and once over the columns (genes).
row_clusters = linkage(pdist(pivot_df, metric='euclidean'), method='average')
col_clusters = linkage(pdist(pivot_df.T, metric='euclidean'), method='average')

# Leaf order of each dendrogram gives the display order for rows and columns.
row_order = leaves_list(row_clusters)
col_order = leaves_list(col_clusters)

# Apply both orderings so that similar rows/columns sit next to each other.
clustered_df = pivot_df.iloc[row_order].iloc[:, col_order]

# Draw the clustered matrix as a Plotly Express heatmap with blank axis
# titles and a labelled colour scale.
title = "TGGATES Gene Expression Change Over Dose in Human Cells"
lbls = {"x": "", "y": "", "color": "Expression Change"}
fig = px.imshow(
    clustered_df,
    labels=lbls,
    x=clustered_df.columns,
    y=clustered_df.index,
    aspect="auto",
    title=title,
)

# Transparent plot and paper backgrounds; white axis, title and body text;
# title centred horizontally.
fig.update_layout(
    xaxis=dict(side='bottom', title_standoff=10, color='white'),
    yaxis=dict(title_standoff=10, color='white'),
    title=dict(text=title, x=0.5, xanchor='center', font=dict(color='white')),
    plot_bgcolor='rgba(0,0,0,0)',
    paper_bgcolor='rgba(0,0,0,0)',
    margin={'l': 20, 'r': 20, 't': 50, 'b': 20},
    font={'color': 'white', 'size': 12},
)

# Display the figure.
fig.show()