#!/usr/bin/env python # coding: utf-8 # # ToxicoDB - A Toxicogenomics Brick # [The toxicodb brick](https://github.com/biobricks-ai/toxicodb) provides data from [toxicodb.ca](https://www.toxicodb.ca/). In this post, we'll use BioBricks.ai to look up gene and compound interaction data. To start, install [biobricks.ai](https://docs.biobricks.ai) and then `toxicodb`: # In[ ]: import biobricks as bb, pyspark, subprocess, pandas as pd import pyspark, pyspark.sql, pyspark.sql.functions as F, pyspark.sql.types as T subprocess.run("biobricks install toxicodb", shell=True) spark = pyspark.sql.SparkSession.builder.config("spark.driver.memory","4g").getOrCreate() toxicodb = bb.assets('toxicodb') # In[2]: tbls = [{"table": table, "count": spark.read.parquet(path).count()} for table, path in toxicodb.__dict__.items()] pd.DataFrame(tbls).sort_values("count", ascending=False) # In[3]: rawhmn = spark.read.parquet(toxicodb.TGGATEsHuman_TGGATEsHuman_parquet) exphmn = rawhmn\ .filter(rawhmn.gene_symbol != '')\ .groupBy("compound_name","gene_symbol","dose").agg(F.mean('expression').alias('exp'))\ .orderBy("gene_symbol","dose") exphmn.limit(5).toPandas() # In[4]: window_spec = pyspark.sql.Window.partitionBy("compound_name", "gene_symbol").orderBy("dose") udf_change = F.udf(lambda arr: arr[-1] - arr[0], T.FloatType()) hmn = exphmn\ .withColumn("expressions", F.collect_list("exp").over(window_spec))\ .groupBy("compound_name", "gene_symbol").agg(F.max("expressions").alias("expressions"))\ .withColumn("expr_change", udf_change(F.col("expressions")))\ .select("compound_name", "gene_symbol", "expr_change").toPandas() hmn.head() # In[5]: import plotly.express as px from scipy.cluster.hierarchy import linkage, leaves_list from scipy.spatial.distance import pdist # Calculate high variance genes and filter the dataframe highvar_genes = hmn.groupby('gene_symbol')['expr_change'].var().nlargest(200) hvdf = hmn[hmn['gene_symbol'].isin(highvar_genes.index)] # Pivot the DataFrame pivot_df = hvdf.pivot(index="compound_name", columns="gene_symbol", values="expr_change").fillna(0) # Compute clusters row_clusters = linkage(pdist(pivot_df, 'euclidean'), method='average') col_clusters = linkage(pdist(pivot_df.T, 'euclidean'), method='average') # Determine the order of rows and columns based on the clusters row_order = leaves_list(row_clusters) col_order = leaves_list(col_clusters) # Reorder the DataFrame according to the clustering clustered_df = pivot_df.iloc[row_order, col_order] # Create the heatmap using Plotly Express lbls = dict(x="", y="", color="Expression Change") title = "TGGATES Gene Expression Change Over Dose in Human Cells" fig = px.imshow(clustered_df, labels=lbls, x=clustered_df.columns, y=clustered_df.index, aspect="auto", title=title) # Update the layout for transparent background and white text fig.update_layout( xaxis={'side': 'bottom', 'title_standoff': 10, 'color': 'white'}, yaxis={'title_standoff': 10, 'color': 'white'}, title={'text': title, 'x': 0.5, 'xanchor': 'center', 'font': {'color': 'white'}}, plot_bgcolor='rgba(0,0,0,0)', # Transparent background paper_bgcolor='rgba(0,0,0,0)', # Transparent surrounding margin=dict(l=20, r=20, t=50, b=20), font=dict(color='white', size=12) # Set font color to white ) # Show the figure fig.show()