#!/usr/bin/env python # coding: utf-8 # # Survival Analysis # In[1]: import numpy as np import pandas as pd import matplotlib.pyplot as plt from lifelines.plotting import qq_plot, plot_interval_censored_lifetimes import lifelines as lfl # ## SEER dataset # > https://seer.cancer.gov/data-software/documentation/seerstat/nov2016/TextData.FileDescription.pdf # # > https://arxiv.org/abs/2204.07276 # # - `id`: parient's ID # - `tte`: time to event # - `label`: cancer or not # - `AGE_DX`: patient's age # - `EOD10_SZ`: size of the tumor # - `MALIGCOUNT`: number of In Situ/malignant tumors for patient # - `BENBORDCOUNT`: number of benign/borderline tumors # ## Data preparation # In[2]: df = pd.read_csv('SEER_sample_imputed.csv') # keeping just a few columns df = df[['id', 'tte', 'label', 'AGE_DX', 'EOD10_SZ', 'MALIGCOUNT', 'BENBORDCOUNT']] df = df.dropna() # there are some 'id's duplicated df.drop_duplicates('id', inplace=True) df.set_index('id', inplace=True) df = df.astype({'tte': int, 'label': int}) # In[3]: df.head() # In[4]: df.info() # In[5]: df.groupby(['label']).count() # ### Censorhip # # Happens when the **time to event** is partially known. # # - **Not censored**: the event occured and the time is known. # - **Right-censored**: the survival duration is > the the observed. # - **Left-censored**: the survival duration is < then the observed duration. # - **Interval-censored**: the survival duration is within the range but not exactly known. # # > The data is **right-censored** because there are over 6,000 observations where the event didn't happen. It the cabe confirmed in the plot below. # In[6]: plt.figure(figsize=(12,4)) plt.subplot(1,2,1) plt.title('Counting of Time to Event (tte)') df['tte'].hist() plt.subplot(1,2,2) plt.title('Counting based on LABEL') df.groupby(['label']).count()['tte'].plot(kind='pie', label=''); # ## Survival Function # # - Shows the percentage of the population that **did not** experienced the event until that time $t$. # - Shows the **survival probability** (probability of the event of interest happens) after time $t$: # # $$\Large S(t) = Pr(T > t)$$ # # - $T$: when the event occurs. # - $t$: any point in time during an observation. # # ## Hazard Function # # - Shows the probability of that event happens at some time give the survival up to that time. # # $$\Large h(t) = -\frac{d}{dt} \log(S(t))$$ # # - **Hazard rate**: instantaneous rate of the event occuring # ## Survival Function/Curve Estimation # # $$\Large S(t) = \prod_{i:t_i \leq t} \left(1 - \frac{d_i}{n_i} \right)$$ # # - $t_i$: duration time # - $d_i$: number of events that happened at time $t_i$ # - $n_i$: number of individuals known to have survived up to time $t_i$ # ### Non-parametric # # - Does not assume underlying distribution. # - Describes the data better. # - It is not smooth/differentiable # #### Kaplan-Meier Model # most used # In[7]: kmf = lfl.KaplanMeierFitter() kmf.fit(durations=df['tte'], event_observed=df['label']) # In[8]: kmf.plot_survival_function(at_risk_counts=True) plt.ylabel('Survival Probability'); # In[9]: # this measure might be wrong because more than 50% of the data is censored print('Median survival time:', kmf.median_survival_time_) # ### Parametric # # - Usually gives more information because there is an underlying distribution # - Is necessary to assess the goodness-of-fit to avoid biases and misinterpretations # # # #### Weibull Model # # $$\Large f(x; \lambda,k) = \frac{k}{\lambda} \left( \frac{x}{\lambda} \right)^{k-1} e^{-(x/\lambda)^k} \qquad \longrightarrow \quad S(t) = e^{-(t/\lambda)^\rho}$$ # # $$x \geq 0, k > 0, \lambda > 0$$ # # - $k$ or $\rho$: shape of the distribution # - $k < 1$: event rate decreases over time # - $k = 1$: event rate is constant # - $k > 1$: event rate increases over time # # - $\lambda$: scale of the distribution (indicates when 63.2% of the population has experienced the event) # In[10]: wb = lfl.WeibullFitter() # Weibull needs non-zero tte df['tte'] = df['tte'] + 1e-10 wb.fit(durations=df['tte'], event_observed=df['label']) # In[11]: print('Lambda:', wb.lambda_) print('Rho:', wb.rho_) ax = wb.survival_function_.plot() ax.set_title("Survival function of Weibull model / estimated parameters") # $\rho$ is smaller than 1: the event rate (cancer incidence) decreases over time. # ### Comparing Several Models # In[12]: wb = lfl.WeibullFitter().fit(durations=df['tte'], event_observed=df['label']) exp = lfl.ExponentialFitter().fit(durations=df['tte'], event_observed=df['label']) log = lfl.LogNormalFitter().fit(durations=df['tte'], event_observed=df['label']) # #### Akaike Information Criterion (AIC) # # - Estimates the prediction error and relative quality of statistical models # - The **less information the model loses, the higher its quality**. From `lifelines` documentation: *The model with the lowest AIC is desirable, since it’s a trade off between maximizing the log-likelihood with as few parameters as possible.* # In[13]: print('Weilbull model:', wb.AIC_) print('Exponential model:', exp.AIC_) print('LogNormal model:', log.AIC_) # In[14]: models = {'Weibull':['r', wb], 'Exponential':['g', exp], 'LogNormal':['b', log]} fig, axes = plt.subplots(2, 3, figsize=(18, 8)) i = 0 for model_name,model_lst in models.items(): m_color = model_lst[0] model = model_lst[1] model.plot_survival_function(ax=axes[0,i], c=m_color) ax = qq_plot(model, ax=axes[1,i], scatter_color=m_color) ax.title.set_text(model_name) i += 1 # #### Finding the best model based on AIC # In[15]: best_model, best_aic_ = lfl.utils.find_best_parametric_model(event_times=df['tte'], event_observed=df['label'], scoring_method="AIC") print('Best model:', best_model) print('Best model AIC:', best_aic_) # In[16]: fig, axes = plt.subplots(1, 2, figsize=(18, 4)) best_model.plot_survival_function(ax=axes[0]) # line below raises the error: NotImplementedError: Distribution not implemented in SciPy #qq_plot(best_model, ax=axes[1]) # ### Weibul Model and Covariates # In[17]: aft = lfl.WeibullAFTFitter() aft.fit(df=df, duration_col='tte', event_col='label') # In[18]: aft.print_summary() # - $exp(coef)$ shows how much the average survival changes with one unit change in the covariate # - $p$ shows the statistic significance. $p < 0.005$ is significant # # #### How the covariates affect the survival function? # In[19]: plt.figure(figsize=(12,6)) aft.plot() # #### Evaluating the `MALIGCOUNT` covariate # In[20]: plt.figure(figsize=(16,6)) values = [[x,y] for x in np.arange(0, .51, 0.2) for y in np.arange(0, .51, 0.2)] aft.plot_partial_effects_on_outcome(covariates=['MALIGCOUNT', 'BENBORDCOUNT'] , cmap='coolwarm', values=values) # In[21]: _, axes = plt.subplots(2,1, figsize=(16,8)) aft.plot_partial_effects_on_outcome(covariates='MALIGCOUNT', values=np.arange(0, 1.01, 0.2), ax=axes[0]) aft.plot_partial_effects_on_outcome(covariates='BENBORDCOUNT', values=np.arange(0, 1.01, 0.2), ax=axes[1]) # The `baseline` curve is the same one as estimated. The increase of `MALIGCOUNT` (counting of malignant tumors) affects the survival function in such a way that people has cancer sooner. # ### Cox Proportional Hazards Model (Cox-PH) # # #### Proportional hazards assumption # # All the individuals' hazards are proportional: # # $$\Large h_A(t) = c \cdot h_B(t) $$ # # - There is a **baseline hazard** function where other hazards are **scaled** # - The **relative survival impact** of a varaible does not change with time (time-invariant) # - **baseline hazard** the risk for individuals at the baseline levels of covariates, i.e., the averages of covariates # # #### Cox-PH model: Regresses covariates on time-to-event/duration. # # $$\Large h(t|x) = b_0 \cdot exp\left( \sum^{n}_{i=1} b_i(x_i - \bar{x_i}) \right) $$ # # - $b_0(t)$: population-level **baseline hazard** function # - $exp(...)$: linear relationship between covariates and the log of hazard. Note: Doesn't depend on time, that's why there is no $t$ in the equation # # # In[22]: coxph = lfl.CoxPHFitter() coxph.fit(df=df, duration_col='tte', event_col='label') # In[23]: coxph.print_summary() # $exp(coef)$ is the **hazard ratio**: how the hazard changes with one unit variation of the covariate. # # Example: If `MALIGCOUNT` increases by 1, the hazard function increases by $2.04$, which is 204%!!! # In[24]: _, axes = plt.subplots(1, 2, figsize=(16,4)) coxph.baseline_hazard_.plot(ax=axes[0]) coxph.baseline_survival_.plot(ax=axes[1]) # In[25]: _, axes = plt.subplots(2,1, figsize=(16,8)) coxph.plot_partial_effects_on_outcome(covariates='MALIGCOUNT', values=np.arange(0, 1.01, 0.2), ax=axes[0]) coxph.plot_partial_effects_on_outcome(covariates='BENBORDCOUNT', values=np.arange(0, 1.01, 0.2), ax=axes[1]) # #### Checking the Proportional Hazards Assumption # In[26]: coxph.check_assumptions(df, p_value_threshold=0.05)