#!/usr/bin/env python
# coding: utf-8

# # Survival Analysis

# In[1]:


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from lifelines.plotting import qq_plot, plot_interval_censored_lifetimes
import lifelines as lfl


# ## SEER dataset
# > https://seer.cancer.gov/data-software/documentation/seerstat/nov2016/TextData.FileDescription.pdf
# 
# > https://arxiv.org/abs/2204.07276
# 
# - `id`: parient's ID
# - `tte`: time to event
# - `label`: cancer or not
# - `AGE_DX`: patient's age
# - `EOD10_SZ`: size of the tumor
# - `MALIGCOUNT`: number of In Situ/malignant tumors for patient
# - `BENBORDCOUNT`: number of benign/borderline tumors

# ## Data preparation

# In[2]:


df = pd.read_csv('SEER_sample_imputed.csv')

# keeping just a few columns
df = df[['id', 'tte', 'label', 'AGE_DX', 'EOD10_SZ', 'MALIGCOUNT', 'BENBORDCOUNT']]

df = df.dropna()

# there are some 'id's duplicated
df.drop_duplicates('id', inplace=True)
df.set_index('id', inplace=True)

df = df.astype({'tte': int, 'label': int})


# In[3]:


df.head()


# In[4]:


df.info()


# In[5]:


df.groupby(['label']).count()


# ### Censorhip
# 
# Happens when the **time to event** is partially known.
# 
# - **Not censored**: the event occured and the time is known.
# - **Right-censored**: the survival duration is > the the observed. 
# - **Left-censored**: the survival duration is < then the observed duration.
# - **Interval-censored**: the survival duration is within the range but not exactly known.
# 
# > The data is **right-censored** because there are over 6,000 observations where the event didn't happen. It the cabe confirmed in the plot below.

# In[6]:


plt.figure(figsize=(12,4))
plt.subplot(1,2,1)
plt.title('Counting of Time to Event (tte)')
df['tte'].hist()

plt.subplot(1,2,2)
plt.title('Counting based on LABEL')
df.groupby(['label']).count()['tte'].plot(kind='pie', label='');


# ## Survival Function
# 
# - Shows the percentage of the population that **did not** experienced the event until that time $t$.
# - Shows the **survival probability** (probability of the event of interest happens) after time $t$: 
# 
# $$\Large S(t) = Pr(T > t)$$
# 
# - $T$: when the event occurs.
# - $t$: any point in time during an observation.
# 

# ## Hazard Function
# 
# - Shows the probability of that event happens at some time give the survival up to that time.
# 
# $$\Large h(t) = -\frac{d}{dt} \log(S(t))$$
# 
# - **Hazard rate**: instantaneous rate of the event occuring

# ## Survival Function/Curve Estimation
# 
# $$\Large S(t) = \prod_{i:t_i \leq t} \left(1 - \frac{d_i}{n_i} \right)$$
# 
# - $t_i$: duration time
# - $d_i$: number of events that happened at time $t_i$
# - $n_i$: number of individuals known to have survived up to time $t_i$

# ### Non-parametric
# 
# - Does not assume underlying distribution.
# - Describes the data better.
# - It is not smooth/differentiable

# #### Kaplan-Meier Model
# most used

# In[7]:


kmf = lfl.KaplanMeierFitter()
kmf.fit(durations=df['tte'], event_observed=df['label'])


# In[8]:


kmf.plot_survival_function(at_risk_counts=True)
plt.ylabel('Survival Probability');


# In[9]:


# this measure might be wrong because more than 50% of the data is censored
print('Median survival time:', kmf.median_survival_time_)


# ### Parametric 
# 
# - Usually gives more information because there is an underlying distribution
# - Is necessary to assess the goodness-of-fit to avoid biases and misinterpretations
# 
# 
# #### Weibull Model
# 
# $$\Large f(x; \lambda,k) = \frac{k}{\lambda} \left( \frac{x}{\lambda} \right)^{k-1} e^{-(x/\lambda)^k} \qquad \longrightarrow \quad S(t) = e^{-(t/\lambda)^\rho}$$
# 
# $$x \geq 0, k > 0, \lambda > 0$$
# 
# - $k$ or $\rho$: shape of the distribution
#     - $k < 1$: event rate decreases over time
#     - $k = 1$: event rate is constant
#     - $k > 1$: event rate increases over time
#     
# - $\lambda$: scale of the distribution (indicates when 63.2% of the population has experienced the event)

# In[10]:


wb = lfl.WeibullFitter()

# Weibull needs non-zero tte
df['tte'] = df['tte'] + 1e-10

wb.fit(durations=df['tte'], event_observed=df['label'])


# In[11]:


print('Lambda:', wb.lambda_)
print('Rho:', wb.rho_)

ax = wb.survival_function_.plot()
ax.set_title("Survival function of Weibull model / estimated parameters")


# $\rho$ is smaller than 1: the event rate (cancer incidence) decreases over time.

# ### Comparing Several Models

# In[12]:


wb = lfl.WeibullFitter().fit(durations=df['tte'], event_observed=df['label'])
exp = lfl.ExponentialFitter().fit(durations=df['tte'], event_observed=df['label'])
log = lfl.LogNormalFitter().fit(durations=df['tte'], event_observed=df['label'])


# #### Akaike Information Criterion (AIC)
# 
# - Estimates the prediction error and relative quality of statistical models
# - The **less information the model loses, the higher its quality**. From `lifelines` documentation: *The model with the lowest AIC is desirable, since it’s a trade off between maximizing the log-likelihood with as few parameters as possible.*

# In[13]:


print('Weilbull model:', wb.AIC_)
print('Exponential model:', exp.AIC_)
print('LogNormal model:', log.AIC_)


# In[14]:


models = {'Weibull':['r', wb], 
          'Exponential':['g', exp], 
          'LogNormal':['b', log]}

fig, axes = plt.subplots(2, 3, figsize=(18, 8))

i = 0
for model_name,model_lst in models.items():
    m_color = model_lst[0]
    model = model_lst[1]
    
    model.plot_survival_function(ax=axes[0,i], c=m_color)
    ax = qq_plot(model, ax=axes[1,i], scatter_color=m_color)
    ax.title.set_text(model_name)
    i += 1


# #### Finding the best model based on AIC

# In[15]:


best_model, best_aic_ = lfl.utils.find_best_parametric_model(event_times=df['tte'], 
                                                       event_observed=df['label'],
                                                       scoring_method="AIC")
print('Best model:', best_model)
print('Best model AIC:', best_aic_)


# In[16]:


fig, axes = plt.subplots(1, 2, figsize=(18, 4))

best_model.plot_survival_function(ax=axes[0])

# line below raises the error: NotImplementedError: Distribution not implemented in SciPy
#qq_plot(best_model, ax=axes[1])


# ### Weibul Model and Covariates

# In[17]:


aft = lfl.WeibullAFTFitter()
aft.fit(df=df, duration_col='tte', event_col='label')


# In[18]:


aft.print_summary()


# - $exp(coef)$ shows how much the average survival changes with one unit change in the covariate
# - $p$ shows the statistic significance. $p < 0.005$ is significant
# 

# #### How the covariates affect the survival function?

# In[19]:


plt.figure(figsize=(12,6))
aft.plot()


# #### Evaluating the `MALIGCOUNT` covariate

# In[20]:


plt.figure(figsize=(16,6))

values = [[x,y] for x in np.arange(0, .51, 0.2) for y in np.arange(0, .51, 0.2)]
aft.plot_partial_effects_on_outcome(covariates=['MALIGCOUNT', 'BENBORDCOUNT'] , cmap='coolwarm', values=values)


# In[21]:


_, axes = plt.subplots(2,1, figsize=(16,8))
aft.plot_partial_effects_on_outcome(covariates='MALIGCOUNT', values=np.arange(0, 1.01, 0.2), ax=axes[0])
aft.plot_partial_effects_on_outcome(covariates='BENBORDCOUNT', values=np.arange(0, 1.01, 0.2), ax=axes[1])


# The `baseline` curve is the same one as estimated. The increase of `MALIGCOUNT` (counting of malignant tumors) affects the survival function in such a way that people has cancer sooner.

# ### Cox Proportional Hazards Model (Cox-PH)
# 
# #### Proportional hazards assumption
# 
# All the individuals' hazards are proportional:
# 
# $$\Large h_A(t) = c \cdot h_B(t) $$
# 
# - There is a **baseline hazard** function where other hazards are **scaled**
# - The **relative survival impact** of a varaible does not change with time (time-invariant)
# - **baseline hazard** the risk for individuals at the baseline levels of covariates, i.e., the averages of covariates
# 
# #### Cox-PH model: Regresses covariates on time-to-event/duration.
# 
# $$\Large h(t|x) = b_0 \cdot exp\left( \sum^{n}_{i=1} b_i(x_i - \bar{x_i}) \right) $$
# 
# - $b_0(t)$: population-level **baseline hazard** function
# - $exp(...)$: linear relationship between covariates and the log of hazard. Note: Doesn't depend on time, that's why there is no $t$ in the equation
# 
# 

# In[22]:


coxph = lfl.CoxPHFitter()
coxph.fit(df=df, duration_col='tte', event_col='label')


# In[23]:


coxph.print_summary()


# $exp(coef)$ is the **hazard ratio**: how the hazard changes with one unit variation of the covariate.
# 
# Example: If `MALIGCOUNT` increases by 1, the hazard function increases by $2.04$, which is 204%!!!

# In[24]:


_, axes = plt.subplots(1, 2, figsize=(16,4))

coxph.baseline_hazard_.plot(ax=axes[0])
coxph.baseline_survival_.plot(ax=axes[1])


# In[25]:


_, axes = plt.subplots(2,1, figsize=(16,8))
coxph.plot_partial_effects_on_outcome(covariates='MALIGCOUNT', values=np.arange(0, 1.01, 0.2), ax=axes[0])
coxph.plot_partial_effects_on_outcome(covariates='BENBORDCOUNT', values=np.arange(0, 1.01, 0.2), ax=axes[1])


# #### Checking the Proportional Hazards Assumption

# In[26]:


coxph.check_assumptions(df, p_value_threshold=0.05)