If you know TF concentrations, binding site affinities, and cooperativities, can you predict probabilities for each configuration?
[With extensive help from Jeremy A. Owen]
#collapse
# imports
import os, sys
import string
import json
from collections import defaultdict
import itertools
from random import choice, random
import pickle
import requests
from xml.etree import ElementTree
import numpy as np
import pandas as pd
from scipy.stats import bernoulli, norm, lognorm, expon as exponential
from scipy.stats import rv_discrete, rv_continuous
from scipy.stats import hypergeom
from scipy import interpolate
import matplotlib
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib.text import TextPath
from matplotlib.font_manager import FontProperties
from matplotlib.collections import PolyCollection
from matplotlib.patches import Rectangle, Polygon, PathPatch
from matplotlib.colors import colorConverter, TABLEAU_COLORS, LinearSegmentedColormap
import matplotlib.cm as cm
from mpl_toolkits.mplot3d import axes3d
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.ticker import MaxNLocator
# colorConverter.to_rgba('mediumseagreen', alpha=.5)
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')
%config InlineBackend.figure_format = 'retina'
%matplotlib inline
plt.rcParams['figure.figsize'] = [12, 5]
plt.rcParams['figure.dpi'] = 140
plt.rcParams['agg.path.chunksize'] = 10000
plt.rcParams['animation.html'] = 'jshtml'
plt.rcParams['hatch.linewidth'] = 0.3
from IPython.display import display, HTML
from matplotlib_venn import venn2, venn3
import plotly.graph_objects as go
import seaborn as sns
# import seaborn.apionly as sns
sns.set_style("white")
import MOODS.tools
from dna_features_viewer import GraphicFeature, GraphicRecord
import pyBigWig
ln = np.log
exp = np.exp
#collapse
# distribution plotting utilities
def is_discrete(dist):
if hasattr(dist, 'dist'):
return isinstance(dist.dist, rv_discrete)
else: return isinstance(dist, rv_discrete)
def is_continuous(dist):
if hasattr(dist, 'dist'):
return isinstance(dist.dist, rv_continuous)
else: return isinstance(dist, rv_continuous)
def plot_distrib(distrib, title=None):
fig, ax = plt.subplots(1, 1)
if is_continuous(distrib):
x = np.linspace(distrib.ppf(0.001),
distrib.ppf(0.999), 1000)
ax.plot(x, distrib.pdf(x), 'k-', lw=0.4)
elif is_discrete(distrib):
x = np.arange(distrib.ppf(0.01),
distrib.ppf(0.99))
ax.plot(x, distrib.pmf(x), 'bo', ms=2, lw=0.4)
r = distrib.rvs(size=1000)
ax.hist(r, density=True, histtype='stepfilled', alpha=0.2, bins=200)
if title: ax.set_title(title)
fig_style_2(ax)
return ax
def fig_style_2(ax):
for side in ["right","top","left"]: ax.spines[side].set_visible(False)
ax.get_yaxis().set_visible(False)
#collapse
# functions for plotting DNA
base_colors = {'A': 'Lime', 'G': 'Gold', 'C': 'Blue', 'T':'Crimson'}
def print_bases(dna): return HTML(''.join([f'<span style="color:{base_colors[base]};font-size:1.5rem;font-weight:bold;font-family:monospace">{base}</span>' for base in dna]))
def print_dna(dna): return HTML(''.join([f'<span style="color:{base_colors[base]};font-size:1rem;font-family:monospace">{base}</span>' for base in dna]))
complement = {'A':'T', 'T':'A', 'C':'G', 'G':'C', 'a':'t', 't':'a', 'c':'g', 'g':'c'}
def reverse_complement(dna):
list_repr = [complement[base] for base in dna[::-1]]
if type(dna) == list: return list_repr
else: return ''.join(list_repr)
def reverse_complement_pwm(pwm, base_index='ACGT'):
return pwm[::-1, [base_index.index(complement[base]) for base in base_index]]
#collapse
# gene name namespace_mapping utility
import mygene
def namespace_mapping(names, map_from=['symbol', 'alias'], map_to='symbol', species='human'):
names = pd.Series(names)
print(f"passed {len(names)} symbols")
names_stripped = names.str.strip()
if any(names_stripped != names):
print(f"{sum(names.str.strip() != names)} names contained whitespace. Stripping...")
names_stripped_unique = names_stripped.unique()
if len(names_stripped_unique) != len(names_stripped):
print(f"{len(names_stripped) - len(names_stripped_unique)} duplicates. {len(names_stripped_unique)} uniques.")
print()
mg = mygene.MyGeneInfo()
out, dup, missing = mg.querymany(names_stripped_unique.tolist(), scopes=map_from, fields=[map_to], species=species, as_dataframe=True, returnall=True).values()
out = out.reset_index().rename(columns={'query':'input'}).sort_values(['input', '_score'], ascending=[True, False]).drop_duplicates(subset='input', keep='first')
same = out[out.input == out.symbol]
updates = out[(out.input != out.symbol) & (out.notfound.isna() if 'notfound' in out else True)].set_index('input')['symbol']
print(f"\nunchanged: {len(same)}; updates: {len(updates)}; missing: {len(missing)}")
names_updated = updates.reindex(names_stripped.values)
names_updated = names_updated.fillna(names_updated.index.to_series()).values
return updates, missing, names_updated
#collapse
# define read_shell() to read dataframe from shell output
import os, sys
import subprocess
import shlex
import io
def read_shell(command, shell=False, **kwargs):
proc = subprocess.Popen(command, shell=shell,stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = proc.communicate()
if proc.returncode == 0:
with io.StringIO(stdout.decode()) as buffer:
return pd.read_csv(buffer, **kwargs)
else:
message = ("Shell command returned non-zero exit status: {0}\n\n"
"Command was:\n{1}\n\n"
"Standard error was:\n{2}")
raise IOError(message.format(proc.returncode, command, stderr.decode()))
There exist two mathematical formalisms to describe the probabilities of molecular configurations at equilibrium: thermodynamics and kinetics. In the kinetics formalism, the system transits between configurational states according to rate parameters contained in differential equations, which can describe not just the system's equilibrium but also its trajectory towards it, or the kinetics of non-equilibrium systems. In the thermodynamics/statistical mechanics formalism, we posit that a system will occupy configurations with lower energy, and use the Boltzmann distribution to estimate the proportion of time the system spends in each state, at equilibrium. The thermodynamics formalism is limited to describing equilibrium state probabilities, but it does so with fewer parameters.
We'll derive an expression for the probability of a single TFBS' occupancy with both formalisms, but proceed with the thermodynamic description for more elaborate configurations. It will become clear why that is preferable.
Most derivations of the probability of single TFBS occupancy at equilibrium employ a kinetics formalism, so we'll walk through that first, and then explore the analog in the thermodynamics description. In the kinetics description, the parameters are rates.
$$ \mathrm{TF} + \mathrm{TFBS} \underset{\koff}{\overset{\kon}{\rightleftarrows}} \mathrm{TF\colon TFBS} $$The natural rates are the rate of TF binding $\kon$ and unbinding $\koff$. Equilibrium is reached when binding and unbinding are balanced:
$$\frac{d[\mathrm{TF\colon TFBS}]}{dt} = k_{\mathrm{on}}[\mathrm{TF}][\mathrm{TFBS}] - k_{\mathrm{off}}[\mathrm{TF\colon TFBS}] = 0 \text{ at equilibrium}$$$$k_{\mathrm{on}}[\mathrm{TF}]_{\mathrm{eq}}[\mathrm{TFBS}]_{\mathrm{eq}} = k_{\mathrm{off}}[\mathrm{TF\colon TFBS}]_{\mathrm{eq}}$$$$\text{(dropping eq subscript) }[\mathrm{TF\colon TFBS}] = \frac{k_{\mathrm{on}}[\mathrm{TF}][\mathrm{TFBS}]}{k_{\mathrm{off}}} = \frac{[\mathrm{TF}][\mathrm{TFBS}]}{k_{d}}$$where $k_{d} = \frac{\koff}{\kon}$ is called the dissociation constant (or equilibrium constant). We'd like to determine the probability of finding the TFBS occupied, i.e. the fraction of time it spends in the bound state. That fraction is $\frac{[\mathrm{bound}]}{([\mathrm{unbound}] + [\mathrm{bound}])} = \frac{[\mathrm{TF\colon TFBS}]}{([\mathrm{TFBS}] + [\mathrm{TF\colon TFBS}])}$. Define the denominator as $[\mathrm{TFBS}]_{0} = [\mathrm{TFBS}] + [\mathrm{TF\colon TFBS}]$ so that $[\mathrm{TFBS}] = [\mathrm{TFBS}]_{0} - [\mathrm{TF\colon TFBS}]$ and substitute:
$$[\mathrm{TF\colon TFBS}] = \frac{[\mathrm{TF}]([\mathrm{TFBS}]_{0} - [\mathrm{TF\colon TFBS}])}{k_{d}}$$$$[\mathrm{TF\colon TFBS}](k_d + [\mathrm{TF}]) = [\mathrm{TF}][\mathrm{TFBS}]_{0}$$$$\frac{[\mathrm{TF\colon TFBS}]}{[\mathrm{TFBS}]_{0}} = \frac{[\mathrm{TF}]}{k_d + [\mathrm{TF}]}$$Note: We could also ask for this expression in terms of $[\mathrm{TF}]_0 = [\mathrm{TF}] + [\mathrm{TF\colon TFBS}]$ however, since we're considering a single TFBS, $[\mathrm{TF\colon TFBS}]$ is at most 1, and so $[\mathrm{TF}]_0 \approx [\mathrm{TF}]$. In instances of ligand-receptor binding in which that approximation cannot be made, the fraction bound is a messy quadratic. Derivation here.
In the thermodynamics description, the parameters are Gibbs free energies $\Delta G$. Let's follow the derivation from Physical Biology of the Cell (pp. 242) and consider the number microstates underlying each of the of bound and unbound macrostates, and their energies.
In order to count microstates, we imagine distributing $L$ TF molecules across a space-filling lattice with $\Omega$ sites. The energy of a TF in solution is $\varepsilon_{\mathrm{solution}}$ and the energy of a bound TF is $\varepsilon_{\mathrm{bound}}$. $\beta$ is the constant $1/k_b T$ where $k_b$ is Boltzmann's constant and $T$ is the temperature.
State | Energy | Multiplicity | Weight |
![]() |
$A \cdot A_s$ | $\frac{\Omega!}{(\Omega - A)!A!} \approx \frac{\Omega^{A}}{A!}$ | $\frac{\Omega^{A}}{A!} \cdot e^{-\beta \left[ A \cdot A_s \right]}$ |
![]() |
$(A - 1) A_s + A_b$ | $\frac{\Omega!}{(\Omega - (A - 1))!(A-1)!B!} \approx \frac{\Omega^{A-1}}{(A-1)!}$ | $\frac{\Omega^{A-1}}{(A-1)!} \cdot e^{-\beta \left[ (A - 1) A_s + A_b \right]}$ |
In our case, the number of microstates in the unbound macrostate is $\frac{\Omega !}{L!(\Omega -L)!}\approx \frac{\Omega^L}{L!}$ and they each have energy $L \cdot \varepsilon_s$. The number of microstates in the bound macrostate is $\frac{\Omega !}{(L-1)!(\Omega -(L+1))}\approx \frac{\Omega^{(L-1)}}{(L-1)!}$ and they each have energy $(L-1) \varepsilon_s + \varepsilon_b$.
The Boltzmann distribution describes the probability of a microstate as a function of its energy: $p(E_i) = e^{-\beta E_i}/Z$ where $Z$ is the "partition function" or simply $\sum_i e^{-\beta E_i}$ the sum of the weights of the microstates, which normalizes the distribution. In our case:
$$Z(L,\Omega)=\left(\colorbox{LightCyan}{$ \frac{\Omega^L}{L!} e^{-\beta L \varepsilon_s}$}\right) + \left(\colorbox{Seashell}{$\frac{\Omega^{(L-1)}}{(L-1)!} e^{-\beta [(L-1) \varepsilon_s + \varepsilon_b]}$}\right)$$With that expression in hand, we can express the probability of the bound macrostate, $p_b$:
$$p_b=\frac{ \colorbox{Seashell}{$\frac{\Omega^{(L-1)}}{(L-1)!} e^{-\beta [(L-1) \varepsilon_s + \varepsilon_b]}$}}{\colorbox{LightCyan}{$\frac{\Omega^L}{L!} e^{-\beta L \varepsilon_s}$} + \colorbox{Seashell}{$\frac{\Omega^{(L-1)}}{(L-1)!} e^{-\beta [(L-1) \varepsilon_s + \varepsilon_b]}$}} \cdot \color{DarkRed}\frac{\frac{\Omega^L}{L!}e^{\beta L \varepsilon_s}}{\frac{\Omega^L}{L!}e^{\beta L \varepsilon_s}} \color{black} = \frac{(L/\Omega)e^{- \beta \Delta \varepsilon}}{1+(L/\Omega)e^{- \beta \Delta \varepsilon}} $$Where we have defined $\Delta \varepsilon = \varepsilon_b - \varepsilon_s$. $L/\Omega$ is really just a dimensionless TF concentration, which we'll hand-wave as being equivalent to $[\mathrm{TF}]$, which leaves us with an expression suspiciously similar to the one we derived from the kinetics formalism:
$$p_b = \frac{[\mathrm{TF}]e^{-\beta \Delta \varepsilon}}{1+[\mathrm{TF}]e^{-\beta \Delta \varepsilon}} \cdot \color{DarkRed}\frac{e^{\beta \Delta \varepsilon}}{e^{\beta \Delta \varepsilon}} \color{black} = \frac{[\mathrm{TF}]}{e^{\beta \Delta \varepsilon}+[\mathrm{TF}]}$$From which we recapitulate an important correspondence between kinetics and thermodynamics at equilibrium: $ k_d = e^{\beta \Delta \varepsilon} = e^{\Delta \varepsilon / k_bT} $ more commonly written for different units as $k = e^{-\Delta G / RT}$.
The takeaway is that both the kinetics and thermodynamics formalisms produce an equivalent expression for the probabilities of each of the bound and unbound configurations, parameterized respectively by $k_d$ and $\Delta G$.
In order to compute probabilities like $p_b$, we need concrete TF concentrations $[\mathrm{TF}]$ and binding affinities (either $k_d$ or $\Delta G$). What are typical intranuclear TF concentrations and binding affinities?
A typical human cell line, K562s, have a cellular diameter of 17 microns. (BioNumbers)
def sphere_volume(d):
return 4/3*np.pi*(d/2)**3
K562_diameter_microns = 17
K562_volume_micron_cubed = sphere_volume(K562_diameter_microns)
print(f'K562 volume: {round(K562_volume_micron_cubed)} μm^3')
K562 volume: 2572 μm^3
A typical expressed TF has a per-cell copy number range from $10^3$ - $10^6$. (BioNumbers)
copy_number_range = [1e3, 1e6]
N_A = 6.02214076e23
def copy_number_and_cubic_micron_volume_to_molar_concentration(copy_number, volume=K562_volume_micron_cubed):
return (copy_number / N_A) / (volume * (1e3 / 1e18)) # 1000 Liters / m^3; 1e18 μm^3 / m^3
lower_end_molar = copy_number_and_cubic_micron_volume_to_molar_concentration(copy_number_range[0], K562_volume_micron_cubed)
upper_end_molar = copy_number_and_cubic_micron_volume_to_molar_concentration(copy_number_range[1], K562_volume_micron_cubed)
lower_end_nanomolar = lower_end_molar / 1e-9
upper_end_nanomolar = upper_end_molar / 1e-9
print('If TF copy numbers range from 1,000-1,000,000, then TF concentrations range from', str(round(lower_end_nanomolar))+'nM', 'to', str(round(upper_end_nanomolar))+'nM')
If TF copy numbers range from 1,000-1,000,000, then TF concentrations range from 1nM to 646nM
We might also like a distribution over this range. Let's posit a lognormal, where $10^3$ and $10^6$ are the 3σ from the mean, which is $10^{4.5}$. Then $\sigma = 10^{0.5}$
#collapse
# define a distribution over TF copy numbers
# Note: the lognormal is defined with base e, so we need to take some natural logs on our base 10 expression.
TF_copy_number_distrib = lognorm(scale=10**4.5, s=np.log(10**0.5))
ax = plot_distrib(TF_copy_number_distrib, title='Hypothetical expressed TF copy number distribution')
ax.set_xlim(left=0, right=5e5)
ax.set_xlabel('TF protein copy number / cell')
ax.get_xaxis().set_major_formatter(matplotlib.ticker.FuncFormatter(lambda x, p: format(int(x), ',')))
def TF_nanomolar_concentrations_sample(TFs):
return dict(zip(TFs, (copy_number_and_cubic_micron_volume_to_molar_concentration(TF_copy_number_distrib.rvs(len(TFs)))*1e9).astype(int)))
What are typical TF ΔG's of binding? How about the $\koff$ rates and half lives?
T = 310
k_b = 3.297623483e-24 * 1e-3 ## cal/K * kcal/cal
kbT = k_b*T*N_A
kbT ## in 1/Mol -- an unusual format
k_on = 1e5
def nanomolar_kd_from_kcal_ΔG(ΔG): return exp(-ΔG/kbT) * 1e9
def kcal_ΔG_from_nanomolar_kd(K_d): return -kbT*ln(K_d*1e-9)
def k_off_from_nanomolar_kd(k_d): return (k_d*1e-9) * k_on
def half_life_from_kd(k_d): return ln(2) / ((k_d*1e-9) * k_on)
#collapse
# compute statistics from kds
nanomolar_kds = pd.Series([1, 10, 100, 1000])
affinity_grid = pd.DataFrame()
affinity_grid['$k_d$'] = nanomolar_kds
affinity_grid['$\Delta G$'] = nanomolar_kds.apply(kcal_ΔG_from_nanomolar_kd)
affinity_grid['$\kon$'] = '1e5 / (M * s)'
affinity_grid['$\koff$'] = nanomolar_kds.apply(k_off_from_nanomolar_kd)
affinity_grid['$t_{1/2}$'] = pd.to_timedelta(nanomolar_kds.apply(half_life_from_kd), unit='s')
affinity_grid = affinity_grid.set_index('$k_d$')
affinity_grid
$\Delta G$ | $\kon$ | $\koff$ | $t_{1/2}$ | |
---|---|---|---|---|
$k_d$ | ||||
1 | 12.757685 | 1e5 / (M * s) | 0.0001 | 0 days 01:55:31.471805599 |
10 | 11.340164 | 1e5 / (M * s) | 0.0010 | 0 days 00:11:33.147180560 |
100 | 9.922644 | 1e5 / (M * s) | 0.0100 | 0 days 00:01:09.314718056 |
1000 | 8.505123 | 1e5 / (M * s) | 0.1000 | 0 days 00:00:06.931471806 |
We learn that an order of magnitude residence time difference results from just 1.4 extra kcal/Mol, and that TF half lives range from about 5s to about 2h. Let's once again posit a distribution of affinities to sample from (defined on $k_d$):
#collapse
# define a distribution over TF Kd's / ΔG's
TF_affinity_min = 6 ## define exponential distribution in log10 space
TF_affinity_spread = 0.5
TF_affinity_distrib = exponential(loc=TF_affinity_min, scale=TF_affinity_spread)
ax = plot_distrib(TF_affinity_distrib, title="Hypothetical TF $K_d$ distribution")
ax.set_xlim(left=5.9)
ax.set_xlabel('$k_d$')
plt.xticks([6,7,8,9], ['1000μm', '100nm', '10nm', '1nm'])
def TF_Kd_sample(n=1): return 10**(-TF_affinity_distrib.rvs(n))
def TF_ΔG_sample(n=1): return kcal_ΔG_from_nanomolar_kd(10**(-TF_affinity_distrib.rvs(n)+9))
With those concrete TF concentrations and dissociation constants, we can finally plot our function $p_b = \frac{[\mathrm{TF}]}{e^{\beta \Delta \varepsilon}+[\mathrm{TF}]}$.
@np.vectorize
def fraction_bound(TF, ΔG):
'''TF in nanomolar'''
return TF / (TF + nanomolar_kd_from_kcal_ΔG(ΔG))
#collapse
# plot fraction bound as a function of concentration and binding energy
TF_concentration_array = np.logspace(1, 5)
ΔG_array = np.logspace(*np.log10([8, 13]))
TF_concs_matrix, ΔG_matrix = np.meshgrid(TF_concentration_array, ΔG_array)
z_data = pd.DataFrame(fraction_bound(TF_concs_matrix, ΔG_matrix), index=ΔG_array, columns=TF_concentration_array).rename_axis('ΔG').T.rename_axis('[TF]')
fig = go.Figure(data=[go.Surface(x=TF_concentration_array.astype(int).astype(str), y=ΔG_array.round(1).astype(str), z=z_data.values)])
fig.update_layout(
title='',
autosize=False,
width=700,
margin=dict(r=20, l=10, b=10, t=10),
scene = dict(
xaxis_title='[TF]',
yaxis_title='ΔG',
zaxis_title='Pb'),
scene_camera = dict(eye=dict(x=-1, y=-1.8, z=1.25)))
fig.update_traces(showscale=False)
# config = dict({'scrollZoom': False})
# fig.show(config = config)
# display(fig)
HTML(fig.to_html(include_plotlyjs='cdn', include_mathjax=False))
(Note that both $[\mathrm{TF}]$ and $k_d$ are plotted in log space, but $p_b$ is linear.)
Suppose now that two TFs bind adjacent segments of DNA in such a way that the binding of either facilitates (or hinders) the binding of the other. We call this direct cooperativity (and competition). We'll focus first on cooperativity.
As before, the statistical mechanics formalism entails enumerating the configurations, their multiplicities, and their energies. We'll call the TFs A and B. We'll denote their counts as $A$ and $B$. The energy of a TF in solution will once more be $A_s$ and bound to its cognate TFBS $A_b$. The energy of cooperativity will be $C_{AB}$.
State | Energy | Multiplicity | Weight |
![]() |
$A \cdot A_s + B \cdot B_s$ | $\frac{\Omega!}{(\Omega - A - B)!A!B!} \approx \frac{\Omega^{A+B}}{A!B!}$ | $\frac{\Omega^{A+B}}{A!B!} \cdot e^{-\beta \left[ A \cdot A_s + B \cdot B_s \right]}$ |
![]() |
$(A - 1) A_s + A_b + B \cdot B_s$ | $\frac{\Omega!}{(\Omega - (A - 1) - B)!(A-1)!B!} \approx \frac{\Omega^{A+B-1}}{(A-1)!B!}$ | $\frac{\Omega^{A+B-1}}{(A-1)!B!} \cdot e^{-\beta \left[ (A - 1) A_s + A_b + B \cdot B_s \right]}$ |
![]() |
$A \cdot A_s + (B - 1) B_s + B_b$ | $\frac{\Omega!}{(\Omega - A - (B - 1))!A!(B-1)!} \approx \frac{\Omega^{A+B-1}}{A!(B-1)!}$ | $\frac{\Omega^{A+B-1}}{A!(B-1)!} \cdot e^{-\beta \left[ A \cdot A_s + (B - 1) B_s + B_b \right]}$ |
![]() |
$(A - 1) A_s + A_b + (B - 1) B_s + B_b + C_{AB}$ | $\frac{\Omega!}{(\Omega - (A - 1) - (B-1))!(A-1)!(B-1)!} \approx \frac{\Omega^{A+B-2}}{(A-1)!(B-1)!}$ | $\frac{\Omega^{A+B-2}}{(A-1)!(B-1)!} \cdot e^{-\beta \left[ (A - 1) A_s + A_b + (B - 1) B_s + B_b + C_{AB} \right]}$ |
The partition function is the sum of the weights:
$$ Z = \frac{\Omega^{A+B}}{A!B!} \cdot e^{-\beta \left[ A \cdot A_s + B \cdot B_s \right]} + \frac{\Omega^{A+B-1}}{(A-1)!B!} \cdot e^{-\beta \left[ (A - 1) A_s + A_b + B \cdot B_s \right]} + \frac{\Omega^{A+B-1}}{A!(B-1)!} \cdot e^{-\beta \left[ A \cdot A_s + (B - 1) B_s + B_b \right]} + \frac{\Omega^{A+B-2}}{(A-1)!(B-1)!} \cdot e^{-\beta \left[ (A - 1) A_s + A_b + (B - 1) B_s + B_b + C_{AB} \right]}$$Which we can greatly simplify by multiplying the entire expression by the reciprocal of the "base state" weight, $\color{DarkRed}\frac{A!B!}{\Omega^{A+B}} \cdot e^{\beta \left[ A \cdot A_s + B \cdot B_s \right]}$, normalizing that weight to 1:
$$ Z = 1 + \frac{A}{\Omega} \cdot e^{-\beta \left[ A_b-A_s \right]} + \frac{B}{\Omega} \cdot e^{-\beta \left[ B_b-B_s \right]} + \frac{A \cdot B}{\Omega^2} \cdot e^{-\beta \left[ A_b-A_s+B_b-B_s+C_{AB} \right]}$$Taking the definition $[A] = A/\Omega$ and $\Delta G_A = A_b-A_s$ produces:
$$ Z = 1 + [A] e^{-\beta \left[ \Delta G_A \right]} + [B] e^{-\beta \left[ \Delta G_B \right]} + [A][B] e^{-\beta \left[ \Delta G_A+\Delta G_B+C_{AB} \right]}$$Then the probability of any state is just the weight of that state (scaled by the weight of the base state) divided by the partition function expression $Z$.
From the above, we notice the form of the expression for the weight of a configuration of N TFBSs:
$$ p_{\mathrm{config}} = \prod_{i \in \, \mathrm{bound \,TBFS}} \left( [\mathrm{TF}_{\mathrm{cognate}(i)}] \cdot e^{-\beta \left[ \Delta G_i + \sum_j c_{ij} \right]} \right) / Z$$For numerical stability, we take the log of the unnormalized probability (that is, the weight) of configurations:
$$ \log(\tilde{p}_{\mathrm{config}}) = \sum_{i \in \, \mathrm{bound \,TBFS}} \left( \log([\mathrm{TF}_{\mathrm{cognate}(i)}]) - \beta \left[ \Delta G_i + \sum_j c_{ij} \right] \right) $$#collapse
# define log_P_config()
β = 1/kbT ## kbT was in in 1/Mol
def log_P_config(config, TF_conc, TFBSs, cooperativities):
logP = 0
for i, tfbs in TFBSs[config.astype(bool)].iterrows():
cooperativity_sum = 0
if i in cooperativities:
cooperativity_sum = sum([C_AB for tfbs_j, C_AB in cooperativities[i].items() if config[tfbs_j] == 1])
logP = sum([np.log(TF_conc[tfbs.TF]*1e-9) + β*(tfbs.dG + cooperativity_sum)]) ## the sign is flipped here, because our ΔG's of binding are positive above.
return logP
Incorporating competition into our thermodynamic model is slightly more subtle than cooperativity, because we can imagine two types of competition
In the former case, the expression for $p_\mathrm{config}$ we had written before suffices, with values of $C_{AB}$ having both signs to represent cooperativity and competition. Nominally, the latter case also fits this formalism if we allow $C_{AB}$ to reach $-\infty$, but that would cause us headaches in the implementation. Instead, the weights of all those configurations which are not physical attainable, due to "strict" competitive binding between TFs vying for overlapping binding sites, are merely omitted from the denominator $Z$.
In order to compute concrete probabilities of configurations, accounting for cooperativity and competition, we will need concrete energies. We'll take $C_{AB}$ to be distributed exponentially with a mean at 2.2kcal/Mol. (Forsén & Linse).
#collapse
# define a distribution of cooperativities
cooperativity_mean_ΔG = 2.2
cooperativity_distrib = exponential(scale=cooperativity_mean_ΔG)
ax = plot_distrib(cooperativity_distrib, title="Hypothetical $C_{AB}$ distribution")
ax.set_xlim(left=-0.5,right=15)
ax.set_xlabel('$C_{AB}$ (kcal/mol)')
def C_AB_sample(n=1): return cooperativity_distrib.rvs(n)
#collapse
# sample that distribution for cooperativities between binding sites
def sample_cooperativities(TFBSs):
cooperativities = defaultdict(lambda: dict())
for i, tfbs_i in TFBSs.iterrows():
for j, tfbs_j in TFBSs.iterrows():
if i < j:
if 7 <= abs(tfbs_i.start - tfbs_j.start) <= 10:
cooperativities[i][j] = cooperativities[j][i] = C_AB_sample()[0]
elif abs(tfbs_i.start - tfbs_j.start) < 7:
cooperativities[i][j] = cooperativities[j][i] = -C_AB_sample()[0]
return dict(cooperativities)
Let's check our derivation (and our implementation) of $p_\mathrm{config}$ by comparing it to our direct computation of $p_b$ from §1. Single TF.
#collapse
# define create_environment()
len_TFBS=10
def create_environment(len_DNA=1000, num_TFs=10, num_TFBS=20, len_TFBS=10):
# TFBSs is a dataframe with columns 'TF_name', 'start', 'dG'
TFs = list(string.ascii_uppercase[:num_TFs]) ## TF names are just letters from the alphabet
TF_conc = TF_nanomolar_concentrations_sample(TFs)
TFBSs = pd.DataFrame([{'TF': choice(TFs), 'start': int(random()*(len_DNA-len_TFBS)), 'dG': TF_ΔG_sample()[0]} for _ in range(num_TFBS)]).sort_values(by='start').reset_index(drop=True)
cooperativities = sample_cooperativities(TFBSs)
return TFs, TF_conc, TFBSs, cooperativities
#collapse
# define draw_config() for plotting
def draw_config(TFBSs, TF_conc, cooperativities, config=None, len_DNA=1000):
if config is None: config = [0]*len(TFBSs)
TF_colors = dict(zip(list(TF_conc.keys()), list(TABLEAU_COLORS.values())))
plt.rcParams['figure.figsize'] = [12, 0.5+np.sqrt(len(TFs))]
fig, axs = plt.subplots(ncols=2, sharey=True, gridspec_kw={'width_ratios': [4, 1]})
genome_track_ax = draw_genome_track(axs[0], config, TFBSs, cooperativities, TF_colors, len_DNA=len_DNA)
conc_plot_ax = draw_concentration_plot(axs[1],