# Notebook shell commands (IPython `!` syntax), not Python statements.
# Clone the repository that provides cbms2020/heg_sample_data.csv, which is read further down.
!git clone https://github.com/laura-health/cbms2020/
# Install third-party packages that the import section below pulls in.
!pip install catboost
!pip install lightgbm
!pip install missingpy
Cloning into 'cbms2020'... remote: Enumerating objects: 9, done. remote: Counting objects: 100% (9/9), done. remote: Compressing objects: 100% (8/8), done. remote: Total 9 (delta 2), reused 4 (delta 0), pack-reused 0 Unpacking objects: 100% (9/9), done. Collecting catboost Downloading https://files.pythonhosted.org/packages/94/ec/12b9a42b2ea7dfe5b602f235692ab2b61ee1334ff34334a15902272869e8/catboost-0.22-cp36-none-manylinux1_x86_64.whl (64.4MB) |████████████████████████████████| 64.4MB 41kB/s Requirement already satisfied: plotly in /usr/local/lib/python3.6/dist-packages (from catboost) (4.4.1) Requirement already satisfied: matplotlib in /usr/local/lib/python3.6/dist-packages (from catboost) (3.1.3) Requirement already satisfied: pandas>=0.24.0 in /usr/local/lib/python3.6/dist-packages (from catboost) (0.25.3) Requirement already satisfied: numpy>=1.16.0 in /usr/local/lib/python3.6/dist-packages (from catboost) (1.17.5) Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from catboost) (1.12.0) Requirement already satisfied: graphviz in /usr/local/lib/python3.6/dist-packages (from catboost) (0.10.1) Requirement already satisfied: scipy in /usr/local/lib/python3.6/dist-packages (from catboost) (1.4.1) Requirement already satisfied: retrying>=1.3.3 in /usr/local/lib/python3.6/dist-packages (from plotly->catboost) (1.3.3) Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->catboost) (1.1.0) Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->catboost) (2.4.6) Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.6/dist-packages (from matplotlib->catboost) (0.10.0) Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->catboost) (2.6.1) Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from 
pandas>=0.24.0->catboost) (2018.9) Requirement already satisfied: setuptools in /usr/local/lib/python3.6/dist-packages (from kiwisolver>=1.0.1->matplotlib->catboost) (45.1.0) Installing collected packages: catboost Successfully installed catboost-0.22 Requirement already satisfied: lightgbm in /usr/local/lib/python3.6/dist-packages (2.2.3) Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from lightgbm) (1.17.5) Requirement already satisfied: scipy in /usr/local/lib/python3.6/dist-packages (from lightgbm) (1.4.1) Requirement already satisfied: scikit-learn in /usr/local/lib/python3.6/dist-packages (from lightgbm) (0.22.1) Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.6/dist-packages (from scikit-learn->lightgbm) (0.14.1) Collecting missingpy Downloading https://files.pythonhosted.org/packages/b5/be/998d04d27054b58f0974b5f09f8457778a0a72d4355e0b7ae877b6cfb850/missingpy-0.2.0-py3-none-any.whl (49kB) |████████████████████████████████| 51kB 4.0MB/s Installing collected packages: missingpy Successfully installed missingpy-0.2.0
import time
import warnings

import lightgbm as lgb
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from missingpy import MissForest
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report, f1_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Load the sample data (described by the original author as already normalized)
# and discard the 'Unnamed: 0' column -- presumably a saved row index; verify
# against the CSV if that matters.
dataset = pd.read_csv("cbms2020/heg_sample_data.csv").drop(columns=['Unnamed: 0'])

# Trailing expression: in a notebook cell this displays the frame's shape and column index.
dataset.shape, dataset.columns
((13652, 72), Index(['days_from_entrance', 'age', 'document.sexo', 'UTI', 'delta_collect_timestamp_t-t1', 'delta_collect_timestamp_t1-t2', 'delta_collect_timestamp_t2-t3', 'delta_collect_timestamp_t3-t4', 'document.freq_cardiaca(t)', 'document.freq_cardiaca(t-1)', 'document.freq_cardiaca(t-2)', 'document.freq_cardiaca(t-3)', 'document.freq_cardiaca(t-4)', 'document.freq_respiratoria(t)', 'document.freq_respiratoria(t-1)', 'document.freq_respiratoria(t-2)', 'document.freq_respiratoria(t-3)', 'document.freq_respiratoria(t-4)', 'document.glicemia_capilar(t)', 'document.glicemia_capilar(t-1)', 'document.glicemia_capilar(t-2)', 'document.glicemia_capilar(t-3)', 'document.glicemia_capilar(t-4)', 'document.pa_diastolica(t)', 'document.pa_diastolica(t-1)', 'document.pa_diastolica(t-2)', 'document.pa_diastolica(t-3)', 'document.pa_diastolica(t-4)', 'document.pa_sistolica(t)', 'document.pa_sistolica(t-1)', 'document.pa_sistolica(t-2)', 'document.pa_sistolica(t-3)', 'document.pa_sistolica(t-4)', 'document.sat_o2(t)', 'document.sat_o2(t-1)', 'document.sat_o2(t-2)', 'document.sat_o2(t-3)', 'document.sat_o2(t-4)', 'document.temperatura(t)', 'document.temperatura(t-1)', 'document.temperatura(t-2)', 'document.temperatura(t-3)', 'document.temperatura(t-4)', 'delta_document.freq_cardiaca_t-t1', 'delta_document.freq_cardiaca_t1-t2', 'delta_document.freq_cardiaca_t2-t3', 'delta_document.freq_cardiaca_t3-t4', 'delta_document.freq_respiratoria_t-t1', 'delta_document.freq_respiratoria_t1-t2', 'delta_document.freq_respiratoria_t2-t3', 'delta_document.freq_respiratoria_t3-t4', 'delta_document.glicemia_capilar_t-t1', 'delta_document.glicemia_capilar_t1-t2', 'delta_document.glicemia_capilar_t2-t3', 'delta_document.glicemia_capilar_t3-t4', 'delta_document.pa_diastolica_t-t1', 'delta_document.pa_diastolica_t1-t2', 'delta_document.pa_diastolica_t2-t3', 'delta_document.pa_diastolica_t3-t4', 'delta_document.pa_sistolica_t-t1', 'delta_document.pa_sistolica_t1-t2', 
'delta_document.pa_sistolica_t2-t3', 'delta_document.pa_sistolica_t3-t4', 'delta_document.sat_o2_t-t1', 'delta_document.sat_o2_t1-t2', 'delta_document.sat_o2_t2-t3', 'delta_document.sat_o2_t3-t4', 'delta_document.temperatura_t-t1', 'delta_document.temperatura_t1-t2', 'delta_document.temperatura_t2-t3', 'delta_document.temperatura_t3-t4', 'outcome'], dtype='object'))
# Target vector: the 'outcome' column; feature matrix: every other column.
Y = dataset.loc[:, "outcome"]
X = dataset.drop(columns=["outcome"])

# Shuffled 10-fold splitter with a fixed seed, shared by every experiment below
# so all models are evaluated on the same folds.
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
# Candidate models, keyed by the short label printed in the result tables.
# NOTE(review): 'XGBoost' (tree_method='gpu_hist') and 'CatBoos'
# (task_type='GPU') request GPU training, so this presumably needs a GPU
# runtime -- confirm before running elsewhere.
classifiers = {
    'XGBoost': XGBClassifier(learning_rate=0.1, n_estimators=100, random_state=7, tree_method='gpu_hist'),
    'LogReg': LogisticRegression(solver='liblinear', multi_class='ovr'),
    # The two scikit-learn tree models are randomized when random_state is left
    # unset; pin them to the same seed used by the boosted models and the KFold
    # splitter so repeated runs report the same scores.
    'D.Tree': DecisionTreeClassifier(random_state=7),
    'RForest': RandomForestClassifier(n_estimators=50, random_state=7),
    'CatBoos': CatBoostClassifier(learning_rate=0.1, n_estimators=100, random_state=7, task_type='GPU', verbose=False),
    'Naive': GaussianNB(),
    'Light': lgb.LGBMClassifier(),
}
# Benchmark every classifier on the full feature matrix with the shared CV splitter.
# A single cross_validate call fits each fold once and scores that same fitted
# model with both metrics. The previous pair of cross_val_score calls trained
# every model twice and, for unseeded models, reported AUC and F1 from
# different fits.
for c, model in classifiers.items():
    start = time.time()
    cv_results = cross_validate(model, X, Y, cv=kfold, scoring=['roc_auc', 'f1'])
    scores = cv_results['test_roc_auc']   # per-fold ROC AUC
    scores_f1 = cv_results['test_f1']     # per-fold F1
    # Output columns: label, mean AUC, (mean F1), elapsed wall-clock seconds.
    print (c + '\t', round(scores.mean(),4), '(' + str(round(scores_f1.mean(),4)) + ')', round(time.time() - start,2), 's')
XGBoost 0.9076 (0.563) 6.44 s LogReg 0.8783 (0.5129) 11.1 s D.Tree 0.6973 (0.4705) 16.1 s RForest 0.8853 (0.5185) 53.33 s CatBoos 0.9058 (0.567) 84.57 s Naive 0.7854 (0.456) 0.58 s Light 0.9054 (0.5877) 27.89 s
# Incremental-window experiment: start from the static features plus the oldest
# exam (t-4), then add one more recent exam (and its deltas) per iteration,
# re-running the 10-fold ROC AUC benchmark for every classifier each time.

# Features that are present in every window.
cols = ['age', 'document.sexo', 'UTI', 'days_from_entrance']
# Measurements of the oldest exam: names ending in "4)". Names containing 'time'
# (the delta_collect_timestamp_* columns in this dataset) are always excluded.
t_cols = [c for c in dataset.columns if '4)' in c and (not 'time' in c)]
# The target does not depend on the window, so select it once.
Y_W = dataset["outcome"]

for i in [4,3,2,1,0]:
    if i == 0:
        # Most recent exam: "(t)" measurements and the "_t-t1" deltas.
        tN_cols = [c for c in dataset.columns if ('t)' in c or '_t-' in c) and (not 'time' in c)]
    else:
        # Exam t-i: "(t-i)" measurements and the "_t<i>-t<i+1>" deltas.
        tN_cols = [c for c in dataset.columns if ('t-'+str(i) in c or '_t'+str(i) in c) and (not 'time' in c)]

    # On the first pass the (t-4) columns are offered twice (t_cols and tN_cols),
    # exactly as before; duplicates are dropped below.
    candidates = t_cols + tN_cols if i == 4 else tN_cols

    # Order-preserving de-duplication. The former list(set(cols)) produced a
    # column order that depends on string hash randomization, so the printed
    # list and the feature order given to the models could change between
    # interpreter sessions.
    for name in candidates:
        if name not in cols:
            cols.append(name)

    print('Number of Columns:', len(cols), 'Exam(s):', 5-i)
    print(cols)

    X_W = dataset[cols]

    for c in classifiers:
        start = time.time()
        model = classifiers[c]
        scores = cross_val_score(model, X_W, Y_W, cv=kfold, scoring='roc_auc')
        # Output columns: label, mean AUC, (+- std across folds), elapsed seconds.
        print ('\t' + c + '\t', round(scores.mean(),4), '(+-' + str(round(scores.std(),4)) + ')', round(time.time() - start,2), 's')
Number of Columns: 11 Exam(s): 1 ['document.temperatura(t-4)', 'document.glicemia_capilar(t-4)', 'days_from_entrance', 'UTI', 'document.pa_diastolica(t-4)', 'document.sat_o2(t-4)', 'document.freq_respiratoria(t-4)', 'document.sexo', 'document.freq_cardiaca(t-4)', 'document.pa_sistolica(t-4)', 'age'] XGBoost 0.846 (+-0.0073) 2.25 s LogReg 0.8045 (+-0.015) 0.54 s D.Tree 0.6356 (+-0.0154) 0.76 s RForest 0.8079 (+-0.0139) 7.39 s CatBoos 0.8446 (+-0.01) 9.36 s Naive 0.7696 (+-0.0138) 0.12 s Light 0.8408 (+-0.0085) 2.75 s Number of Columns: 25 Exam(s): 2 ['document.temperatura(t-4)', 'delta_document.glicemia_capilar_t3-t4', 'UTI', 'document.freq_respiratoria(t-3)', 'delta_document.temperatura_t3-t4', 'age', 'document.sat_o2(t-4)', 'delta_document.freq_respiratoria_t3-t4', 'document.temperatura(t-3)', 'document.glicemia_capilar(t-3)', 'document.freq_cardiaca(t-4)', 'delta_document.sat_o2_t3-t4', 'delta_document.pa_diastolica_t3-t4', 'document.glicemia_capilar(t-4)', 'delta_document.pa_sistolica_t3-t4', 'document.freq_respiratoria(t-4)', 'document.pa_sistolica(t-3)', 'document.freq_cardiaca(t-3)', 'days_from_entrance', 'document.pa_diastolica(t-3)', 'document.pa_diastolica(t-4)', 'document.sat_o2(t-3)', 'document.sexo', 'delta_document.freq_cardiaca_t3-t4', 'document.pa_sistolica(t-4)'] XGBoost 0.8587 (+-0.007) 2.91 s LogReg 0.8227 (+-0.0149) 1.23 s D.Tree 0.6531 (+-0.0135) 1.79 s RForest 0.824 (+-0.0125) 12.59 s CatBoos 0.8575 (+-0.0079) 12.96 s Naive 0.7643 (+-0.0199) 0.18 s Light 0.8533 (+-0.0092) 5.05 s Number of Columns: 39 Exam(s): 3 ['document.temperatura(t-4)', 'delta_document.glicemia_capilar_t3-t4', 'delta_document.pa_diastolica_t2-t3', 'UTI', 'document.freq_respiratoria(t-3)', 'delta_document.temperatura_t3-t4', 'document.sat_o2(t-4)', 'age', 'delta_document.freq_respiratoria_t3-t4', 'document.temperatura(t-3)', 'document.glicemia_capilar(t-3)', 'delta_document.pa_sistolica_t2-t3', 'document.glicemia_capilar(t-2)', 'document.freq_cardiaca(t-4)', 
'delta_document.sat_o2_t3-t4', 'document.freq_cardiaca(t-2)', 'delta_document.pa_diastolica_t3-t4', 'document.pa_sistolica(t-2)', 'document.glicemia_capilar(t-4)', 'document.freq_respiratoria(t-2)', 'delta_document.pa_sistolica_t3-t4', 'delta_document.freq_respiratoria_t2-t3', 'delta_document.glicemia_capilar_t2-t3', 'document.sat_o2(t-2)', 'delta_document.sat_o2_t2-t3', 'document.freq_respiratoria(t-4)', 'document.pa_sistolica(t-3)', 'document.freq_cardiaca(t-3)', 'document.temperatura(t-2)', 'document.pa_diastolica(t-2)', 'days_from_entrance', 'document.pa_diastolica(t-3)', 'document.pa_diastolica(t-4)', 'document.sat_o2(t-3)', 'delta_document.freq_cardiaca_t2-t3', 'document.sexo', 'delta_document.freq_cardiaca_t3-t4', 'document.pa_sistolica(t-4)', 'delta_document.temperatura_t2-t3'] XGBoost 0.8751 (+-0.0077) 2.64 s LogReg 0.8376 (+-0.0125) 2.3 s D.Tree 0.6621 (+-0.0167) 3.08 s RForest 0.8424 (+-0.0149) 16.19 s CatBoos 0.8758 (+-0.0062) 13.13 s Naive 0.7555 (+-0.0192) 0.17 s Light 0.8763 (+-0.0076) 7.45 s Number of Columns: 53 Exam(s): 4 ['document.temperatura(t-4)', 'delta_document.pa_diastolica_t2-t3', 'delta_document.glicemia_capilar_t3-t4', 'UTI', 'delta_document.temperatura_t1-t2', 'document.freq_respiratoria(t-3)', 'delta_document.glicemia_capilar_t1-t2', 'delta_document.temperatura_t3-t4', 'document.freq_respiratoria(t-1)', 'age', 'document.sat_o2(t-4)', 'delta_document.freq_respiratoria_t3-t4', 'document.temperatura(t-3)', 'delta_document.pa_sistolica_t1-t2', 'document.glicemia_capilar(t-3)', 'delta_document.pa_sistolica_t2-t3', 'delta_document.freq_respiratoria_t1-t2', 'document.glicemia_capilar(t-2)', 'document.pa_sistolica(t-1)', 'document.freq_cardiaca(t-4)', 'delta_document.sat_o2_t3-t4', 'document.freq_cardiaca(t-2)', 'delta_document.pa_diastolica_t3-t4', 'document.freq_cardiaca(t-1)', 'document.temperatura(t-1)', 'document.pa_sistolica(t-2)', 'document.glicemia_capilar(t-1)', 'delta_document.sat_o2_t1-t2', 'document.glicemia_capilar(t-4)', 
'document.freq_respiratoria(t-2)', 'delta_document.pa_sistolica_t3-t4', 'delta_document.freq_respiratoria_t2-t3', 'delta_document.glicemia_capilar_t2-t3', 'document.sat_o2(t-2)', 'delta_document.sat_o2_t2-t3', 'delta_document.freq_cardiaca_t1-t2', 'document.freq_respiratoria(t-4)', 'document.pa_sistolica(t-3)', 'document.freq_cardiaca(t-3)', 'document.temperatura(t-2)', 'delta_document.pa_diastolica_t1-t2', 'document.pa_diastolica(t-2)', 'days_from_entrance', 'document.sat_o2(t-1)', 'document.pa_diastolica(t-3)', 'document.pa_diastolica(t-4)', 'document.sat_o2(t-3)', 'delta_document.freq_cardiaca_t2-t3', 'document.pa_diastolica(t-1)', 'document.sexo', 'delta_document.freq_cardiaca_t3-t4', 'document.pa_sistolica(t-4)', 'delta_document.temperatura_t2-t3'] XGBoost 0.887 (+-0.007) 3.26 s LogReg 0.8543 (+-0.0111) 3.49 s D.Tree 0.6798 (+-0.0143) 4.68 s RForest 0.8619 (+-0.0151) 20.08 s CatBoos 0.8886 (+-0.0067) 12.87 s Naive 0.7596 (+-0.0197) 0.21 s Light 0.8851 (+-0.0084) 10.27 s Number of Columns: 67 Exam(s): 5 ['document.temperatura(t-4)', 'document.pa_diastolica(t)', 'delta_document.pa_diastolica_t2-t3', 'delta_document.glicemia_capilar_t3-t4', 'document.sat_o2(t)', 'UTI', 'delta_document.temperatura_t1-t2', 'document.freq_cardiaca(t)', 'document.freq_respiratoria(t-3)', 'delta_document.glicemia_capilar_t1-t2', 'document.temperatura(t)', 'delta_document.temperatura_t3-t4', 'document.freq_respiratoria(t-1)', 'document.sat_o2(t-4)', 'age', 'delta_document.freq_respiratoria_t3-t4', 'delta_document.glicemia_capilar_t-t1', 'delta_document.freq_respiratoria_t-t1', 'delta_document.temperatura_t2-t3', 'delta_document.pa_sistolica_t-t1', 'document.temperatura(t-3)', 'delta_document.pa_sistolica_t1-t2', 'document.glicemia_capilar(t-3)', 'delta_document.pa_sistolica_t2-t3', 'delta_document.freq_respiratoria_t1-t2', 'document.glicemia_capilar(t-2)', 'document.pa_sistolica(t-1)', 'document.freq_cardiaca(t-4)', 'delta_document.sat_o2_t3-t4', 'document.freq_cardiaca(t-2)', 
'document.pa_sistolica(t)', 'delta_document.pa_diastolica_t3-t4', 'document.freq_cardiaca(t-1)', 'delta_document.freq_cardiaca_t-t1', 'document.temperatura(t-1)', 'document.glicemia_capilar(t)', 'document.freq_respiratoria(t)', 'document.pa_sistolica(t-2)', 'document.glicemia_capilar(t-1)', 'delta_document.sat_o2_t1-t2', 'document.glicemia_capilar(t-4)', 'document.freq_respiratoria(t-2)', 'delta_document.pa_sistolica_t3-t4', 'delta_document.freq_respiratoria_t2-t3', 'delta_document.glicemia_capilar_t2-t3', 'document.sat_o2(t-2)', 'delta_document.sat_o2_t2-t3', 'delta_document.freq_cardiaca_t1-t2', 'document.freq_respiratoria(t-4)', 'document.pa_sistolica(t-3)', 'delta_document.pa_diastolica_t-t1', 'document.freq_cardiaca(t-3)', 'document.temperatura(t-2)', 'delta_document.pa_diastolica_t1-t2', 'document.pa_diastolica(t-2)', 'days_from_entrance', 'document.sat_o2(t-1)', 'document.pa_diastolica(t-3)', 'delta_document.sat_o2_t-t1', 'document.pa_diastolica(t-4)', 'document.sat_o2(t-3)', 'delta_document.freq_cardiaca_t2-t3', 'document.sexo', 'delta_document.temperatura_t-t1', 'delta_document.freq_cardiaca_t3-t4', 'document.pa_sistolica(t-4)', 'document.pa_diastolica(t-1)'] XGBoost 0.9072 (+-0.0067) 2.57 s LogReg 0.8749 (+-0.0086) 4.42 s D.Tree 0.6957 (+-0.0206) 6.53 s RForest 0.882 (+-0.0111) 24.15 s CatBoos 0.9071 (+-0.0083) 12.77 s Naive 0.7735 (+-0.018) 0.29 s Light 0.9057 (+-0.0066) 12.6 s