import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from lifelines.plotting import qq_plot, plot_interval_censored_lifetimes
import lifelines as lfl
https://seer.cancer.gov/data-software/documentation/seerstat/nov2016/TextData.FileDescription.pdf
id
: parient's IDtte
: time to eventlabel
: cancer or notAGE_DX
: patient's ageEOD10_SZ
: size of the tumorMALIGCOUNT
: number of In Situ/malignant tumors for patientBENBORDCOUNT
: number of benign/borderline tumorsdf = pd.read_csv('SEER_sample_imputed.csv')
# keeping just a few columns
df = df[['id', 'tte', 'label', 'AGE_DX', 'EOD10_SZ', 'MALIGCOUNT', 'BENBORDCOUNT']]
df = df.dropna()
# there are some 'id's duplicated
df.drop_duplicates('id', inplace=True)
df.set_index('id', inplace=True)
df = df.astype({'tte': int, 'label': int})
df.head()
tte | label | AGE_DX | EOD10_SZ | MALIGCOUNT | BENBORDCOUNT | |
---|---|---|---|---|---|---|
id | ||||||
40003996 | 40 | 0 | 0.336449 | 0.04536 | 0.090909 | 0.0 |
38375019 | 2 | 1 | 0.598131 | 0.03006 | 0.000000 | 0.0 |
36142366 | 119 | 1 | 0.710280 | 0.04008 | 0.000000 | 0.0 |
35021855 | 229 | 1 | 0.560748 | 0.02004 | 0.000000 | 0.0 |
38377976 | 122 | 0 | 0.448598 | 0.06012 | 0.000000 | 0.0 |
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 9966 entries, 40003996 to 38818610 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 tte 9966 non-null int32 1 label 9966 non-null int32 2 AGE_DX 9966 non-null float64 3 EOD10_SZ 9966 non-null float64 4 MALIGCOUNT 9966 non-null float64 5 BENBORDCOUNT 9966 non-null float64 dtypes: float64(4), int32(2) memory usage: 467.2 KB
df.groupby(['label']).count()
tte | AGE_DX | EOD10_SZ | MALIGCOUNT | BENBORDCOUNT | |
---|---|---|---|---|---|
label | |||||
0 | 6633 | 6633 | 6633 | 6633 | 6633 |
1 | 3333 | 3333 | 3333 | 3333 | 3333 |
Happens when the time to event is partially known.
The data is right-censored because there are over 6,000 observations where the event didn't happen. It the cabe confirmed in the plot below.
plt.figure(figsize=(12,4))
plt.subplot(1,2,1)
plt.title('Counting of Time to Event (tte)')
df['tte'].hist()
plt.subplot(1,2,2)
plt.title('Counting based on LABEL')
df.groupby(['label']).count()['tte'].plot(kind='pie', label='');
most used
kmf = lfl.KaplanMeierFitter()
kmf.fit(durations=df['tte'], event_observed=df['label'])
<lifelines.KaplanMeierFitter:"KM_estimate", fitted with 9966 total observations, 6633 right-censored observations>
kmf.plot_survival_function(at_risk_counts=True)
plt.ylabel('Survival Probability');
# this measure might be wrong because more than 50% of the data is censored
print('Median survival time:', kmf.median_survival_time_)
Median survival time: 208.0
$k$ or $\rho$: shape of the distribution
$\lambda$: scale of the distribution (indicates when 63.2% of the population has experienced the event)
wb = lfl.WeibullFitter()
# Weibull needs non-zero tte
df['tte'] = df['tte'] + 1e-10
wb.fit(durations=df['tte'], event_observed=df['label'])
<lifelines.WeibullFitter:"Weibull_estimate", fitted with 9966 total observations, 6633 right-censored observations>
print('Lambda:', wb.lambda_)
print('Rho:', wb.rho_)
ax = wb.survival_function_.plot()
ax.set_title("Survival function of Weibull model / estimated parameters")
Lambda: 358.7938759636887 Rho: 0.7952285765619703
Text(0.5, 1.0, 'Survival function of Weibull model / estimated parameters')
$\rho$ is smaller than 1: the event rate (cancer incidence) decreases over time.
wb = lfl.WeibullFitter().fit(durations=df['tte'], event_observed=df['label'])
exp = lfl.ExponentialFitter().fit(durations=df['tte'], event_observed=df['label'])
log = lfl.LogNormalFitter().fit(durations=df['tte'], event_observed=df['label'])
lifelines
documentation: The model with the lowest AIC is desirable, since it’s a trade off between maximizing the log-likelihood with as few parameters as possible.print('Weilbull model:', wb.AIC_)
print('Exponential model:', exp.AIC_)
print('LogNormal model:', log.AIC_)
Weilbull model: 44222.28063808024 Exponential model: 44474.95418984919 LogNormal model: 47666.55189603807
models = {'Weibull':['r', wb],
'Exponential':['g', exp],
'LogNormal':['b', log]}
fig, axes = plt.subplots(2, 3, figsize=(18, 8))
i = 0
for model_name,model_lst in models.items():
m_color = model_lst[0]
model = model_lst[1]
model.plot_survival_function(ax=axes[0,i], c=m_color)
ax = qq_plot(model, ax=axes[1,i], scatter_color=m_color)
ax.title.set_text(model_name)
i += 1
best_model, best_aic_ = lfl.utils.find_best_parametric_model(event_times=df['tte'],
event_observed=df['label'],
scoring_method="AIC")
print('Best model:', best_model)
print('Best model AIC:', best_aic_)
Best model: <lifelines.SplineFitter:"SplineFitter: 1 internal knot", fitted with 9966 total observations, 6633 right-censored observations> Best model AIC: 43659.17156592098
fig, axes = plt.subplots(1, 2, figsize=(18, 4))
best_model.plot_survival_function(ax=axes[0])
# line below raises the error: NotImplementedError: Distribution not implemented in SciPy
#qq_plot(best_model, ax=axes[1])
<AxesSubplot:>
aft = lfl.WeibullAFTFitter()
aft.fit(df=df, duration_col='tte', event_col='label')
<lifelines.WeibullAFTFitter: fitted with 9966 total observations, 6633 right-censored observations>
aft.print_summary()
C:\Users\helde\miniconda3\envs\py39\lib\site-packages\lifelines\utils\printer.py:62: FutureWarning: In future versions `DataFrame.to_latex` is expected to utilise the base implementation of `Styler.to_latex` for formatting and rendering. The arguments signature may therefore change. It is recommended instead to use `DataFrame.style.to_latex` which also contains additional functionality. return summary_df[columns].to_latex(float_format="%." + str(self.decimals) + "f")
model | lifelines.WeibullAFTFitter |
---|---|
duration col | 'tte' |
event col | 'label' |
number of observations | 9966 |
number of events observed | 3333 |
log-likelihood | -21462.69 |
time fit was run | 2022-06-10 20:10:14 UTC |
coef | exp(coef) | se(coef) | coef lower 95% | coef upper 95% | exp(coef) lower 95% | exp(coef) upper 95% | cmp to | z | p | -log2(p) | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|
lambda_ | AGE_DX | -5.74 | 0.00 | 0.18 | -6.10 | -5.39 | 0.00 | 0.00 | 0.00 | -31.93 | <0.005 | 740.77 |
BENBORDCOUNT | 1.43 | 4.18 | 1.14 | -0.81 | 3.67 | 0.45 | 39.21 | 0.00 | 1.25 | 0.21 | 2.25 | |
EOD10_SZ | -1.20 | 0.30 | 0.11 | -1.42 | -0.99 | 0.24 | 0.37 | 0.00 | -11.04 | <0.005 | 91.66 | |
MALIGCOUNT | -0.96 | 0.38 | 0.30 | -1.54 | -0.37 | 0.21 | 0.69 | 0.00 | -3.22 | <0.005 | 9.60 | |
Intercept | 8.75 | 6320.25 | 0.10 | 8.55 | 8.95 | 5157.63 | 7744.94 | 0.00 | 84.38 | <0.005 | inf | |
rho_ | Intercept | -0.16 | 0.85 | 0.01 | -0.19 | -0.13 | 0.83 | 0.88 | 0.00 | -10.83 | <0.005 | 88.44 |
Concordance | 0.66 |
---|---|
AIC | 42937.37 |
log-likelihood ratio test | 1292.91 on 4 df |
-log2(p) of ll-ratio test | 923.30 |
plt.figure(figsize=(12,6))
aft.plot()
<AxesSubplot:xlabel='log(accelerated failure rate) (95% CI)'>
MALIGCOUNT
covariate¶plt.figure(figsize=(16,6))
values = [[x,y] for x in np.arange(0, .51, 0.2) for y in np.arange(0, .51, 0.2)]
aft.plot_partial_effects_on_outcome(covariates=['MALIGCOUNT', 'BENBORDCOUNT'] , cmap='coolwarm', values=values)
<AxesSubplot:>
_, axes = plt.subplots(2,1, figsize=(16,8))
aft.plot_partial_effects_on_outcome(covariates='MALIGCOUNT', values=np.arange(0, 1.01, 0.2), ax=axes[0])
aft.plot_partial_effects_on_outcome(covariates='BENBORDCOUNT', values=np.arange(0, 1.01, 0.2), ax=axes[1])
<AxesSubplot:>