The toxicodb brick provides data from toxicodb.ca. In this post, we'll use BioBricks.ai to look up gene and compound interaction data. To start, install biobricks.ai and then toxicodb
:
import biobricks as bb, pyspark, subprocess, pandas as pd
import pyspark, pyspark.sql, pyspark.sql.functions as F, pyspark.sql.types as T
subprocess.run("biobricks install toxicodb", shell=True)
spark = pyspark.sql.SparkSession.builder.config("spark.driver.memory","4g").getOrCreate()
toxicodb = bb.assets('toxicodb')
tbls = [{"table": table, "count": spark.read.parquet(path).count()} for table, path in toxicodb.__dict__.items()]
pd.DataFrame(tbls).sort_values("count", ascending=False)
table | count | |
---|---|---|
2 | TGGATEsHuman_TGGATEsHuman_parquet | 47494698 |
4 | TGGATEsRat_TGGATEsRat_parquet | 39813228 |
0 | DrugMatrixRat_DrugMatrixRat_parquet | 11338749 |
5 | EMEXP2458_EMEXP2458_parquet | 1395730 |
1 | genes_parquet | 32092 |
3 | compounds_parquet | 234 |
rawhmn = spark.read.parquet(toxicodb.TGGATEsHuman_TGGATEsHuman_parquet)
exphmn = rawhmn\
.filter(rawhmn.gene_symbol != '')\
.groupBy("compound_name","gene_symbol","dose").agg(F.mean('expression').alias('exp'))\
.orderBy("gene_symbol","dose")
exphmn.limit(5).toPandas()
compound_name | gene_symbol | dose | exp | |
---|---|---|---|---|
0 | Hexachlorobenzene | A1BG-AS1 | 0 | 5.070635 |
1 | Flutamide | A1BG-AS1 | 0 | 5.128496 |
2 | Isoniazid | A1BG-AS1 | 0 | 5.227362 |
3 | Tacrine | A1BG-AS1 | 0 | 5.197305 |
4 | Buthionine sulfoximine | A1BG-AS1 | 0 | 5.025494 |
window_spec = pyspark.sql.Window.partitionBy("compound_name", "gene_symbol").orderBy("dose")
udf_change = F.udf(lambda arr: arr[-1] - arr[0], T.FloatType())
hmn = exphmn\
.withColumn("expressions", F.collect_list("exp").over(window_spec))\
.groupBy("compound_name", "gene_symbol").agg(F.max("expressions").alias("expressions"))\
.withColumn("expr_change", udf_change(F.col("expressions")))\
.select("compound_name", "gene_symbol", "expr_change").toPandas()
hmn.head()
compound_name | gene_symbol | expr_change | |
---|---|---|---|
0 | 1-Naphthyl isothiocyanate | A1CF | -0.007111 |
1 | 1-Naphthyl isothiocyanate | A4GALT | 0.182894 |
2 | 1-Naphthyl isothiocyanate | A4GNT | -0.148850 |
3 | 1-Naphthyl isothiocyanate | AAR2 | 0.174779 |
4 | 1-Naphthyl isothiocyanate | ABCA11P | 0.263340 |
import plotly.express as px
from scipy.cluster.hierarchy import linkage, leaves_list
from scipy.spatial.distance import pdist
# Calculate high variance genes and filter the dataframe
highvar_genes = hmn.groupby('gene_symbol')['expr_change'].var().nlargest(200)
hvdf = hmn[hmn['gene_symbol'].isin(highvar_genes.index)]
# Pivot the DataFrame
pivot_df = hvdf.pivot(index="compound_name", columns="gene_symbol", values="expr_change").fillna(0)
# Compute clusters
row_clusters = linkage(pdist(pivot_df, 'euclidean'), method='average')
col_clusters = linkage(pdist(pivot_df.T, 'euclidean'), method='average')
# Determine the order of rows and columns based on the clusters
row_order = leaves_list(row_clusters)
col_order = leaves_list(col_clusters)
# Reorder the DataFrame according to the clustering
clustered_df = pivot_df.iloc[row_order, col_order]
# Create the heatmap using Plotly Express
lbls = dict(x="", y="", color="Expression Change")
title = "TGGATES Gene Expression Change Over Dose in Human Cells"
fig = px.imshow(clustered_df, labels=lbls, x=clustered_df.columns, y=clustered_df.index, aspect="auto", title=title)
# Update the layout for transparent background and white text
fig.update_layout(
xaxis={'side': 'bottom', 'title_standoff': 10, 'color': 'white'},
yaxis={'title_standoff': 10, 'color': 'white'},
title={'text': title, 'x': 0.5, 'xanchor': 'center', 'font': {'color': 'white'}},
plot_bgcolor='rgba(0,0,0,0)', # Transparent background
paper_bgcolor='rgba(0,0,0,0)', # Transparent surrounding
margin=dict(l=20, r=20, t=50, b=20),
font=dict(color='white', size=12) # Set font color to white
)
# Show the figure
fig.show()