!pip install catboost
!pip install lightgbm
!pip install missingpy
Collecting catboost Downloading https://files.pythonhosted.org/packages/94/ec/12b9a42b2ea7dfe5b602f235692ab2b61ee1334ff34334a15902272869e8/catboost-0.22-cp36-none-manylinux1_x86_64.whl (64.4MB) |████████████████████████████████| 64.4MB 47kB/s Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from catboost) (1.12.0) Requirement already satisfied: pandas>=0.24.0 in /usr/local/lib/python3.6/dist-packages (from catboost) (0.25.3) Requirement already satisfied: scipy in /usr/local/lib/python3.6/dist-packages (from catboost) (1.4.1) Requirement already satisfied: plotly in /usr/local/lib/python3.6/dist-packages (from catboost) (4.4.1) Requirement already satisfied: numpy>=1.16.0 in /usr/local/lib/python3.6/dist-packages (from catboost) (1.17.5) Requirement already satisfied: graphviz in /usr/local/lib/python3.6/dist-packages (from catboost) (0.10.1) Requirement already satisfied: matplotlib in /usr/local/lib/python3.6/dist-packages (from catboost) (3.1.3) Requirement already satisfied: python-dateutil>=2.6.1 in /usr/local/lib/python3.6/dist-packages (from pandas>=0.24.0->catboost) (2.6.1) Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas>=0.24.0->catboost) (2018.9) Requirement already satisfied: retrying>=1.3.3 in /usr/local/lib/python3.6/dist-packages (from plotly->catboost) (1.3.3) Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->catboost) (2.4.6) Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib->catboost) (1.1.0) Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.6/dist-packages (from matplotlib->catboost) (0.10.0) Requirement already satisfied: setuptools in /usr/local/lib/python3.6/dist-packages (from kiwisolver>=1.0.1->matplotlib->catboost) (45.1.0) Installing collected packages: catboost Successfully installed catboost-0.22 
Requirement already satisfied: lightgbm in /usr/local/lib/python3.6/dist-packages (2.2.3) Requirement already satisfied: scipy in /usr/local/lib/python3.6/dist-packages (from lightgbm) (1.4.1) Requirement already satisfied: scikit-learn in /usr/local/lib/python3.6/dist-packages (from lightgbm) (0.22.1) Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from lightgbm) (1.17.5) Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.6/dist-packages (from scikit-learn->lightgbm) (0.14.1) Collecting missingpy Downloading https://files.pythonhosted.org/packages/b5/be/998d04d27054b58f0974b5f09f8457778a0a72d4355e0b7ae877b6cfb850/missingpy-0.2.0-py3-none-any.whl (49kB) |████████████████████████████████| 51kB 1.7MB/s Installing collected packages: missingpy Successfully installed missingpy-0.2.0
import time
import warnings

import lightgbm as lgb
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from missingpy import MissForest
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report, f1_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
warnings.filterwarnings('ignore')
/usr/local/lib/python3.6/dist-packages/sklearn/utils/deprecation.py:144: FutureWarning: The sklearn.neighbors.base module is deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.neighbors. Anything that cannot be imported from sklearn.neighbors is now part of the private API. warnings.warn(message, FutureWarning)
!wget -c https://unkown/dataset_treinamento_modelo_generico_filled.csv.gz
# Load the training set from the gzip-compressed CSV fetched by the wget cell.
dataset = pd.read_csv("dataset_treinamento_modelo_generico_filled.csv.gz")
# 'Unnamed: 0' is presumably a leftover index column from an earlier export;
# it is removed in place so it never reaches the models.
dataset.drop(columns=['Unnamed: 0'], inplace=True)
# Notebook display: (rows, columns) plus the full column index.
(dataset.shape, dataset.columns)
((121522, 80), Index(['entity', 'atendimento_id', 'days_from_entrance', 'age', 'document.sexo', 'UTI', 'absolute_timestamp', 'collect_timestamp(t)', 'collect_timestamp(t-1)', 'collect_timestamp(t-2)', 'collect_timestamp(t-3)', 'collect_timestamp(t-4)', 'delta_collect_timestamp_t-t1', 'delta_collect_timestamp_t1-t2', 'delta_collect_timestamp_t2-t3', 'delta_collect_timestamp_t3-t4', 'document.freq_cardiaca(t)', 'document.freq_cardiaca(t-1)', 'document.freq_cardiaca(t-2)', 'document.freq_cardiaca(t-3)', 'document.freq_cardiaca(t-4)', 'document.freq_respiratoria(t)', 'document.freq_respiratoria(t-1)', 'document.freq_respiratoria(t-2)', 'document.freq_respiratoria(t-3)', 'document.freq_respiratoria(t-4)', 'document.glicemia_capilar(t)', 'document.glicemia_capilar(t-1)', 'document.glicemia_capilar(t-2)', 'document.glicemia_capilar(t-3)', 'document.glicemia_capilar(t-4)', 'document.pa_diastolica(t)', 'document.pa_diastolica(t-1)', 'document.pa_diastolica(t-2)', 'document.pa_diastolica(t-3)', 'document.pa_diastolica(t-4)', 'document.pa_sistolica(t)', 'document.pa_sistolica(t-1)', 'document.pa_sistolica(t-2)', 'document.pa_sistolica(t-3)', 'document.pa_sistolica(t-4)', 'document.sat_o2(t)', 'document.sat_o2(t-1)', 'document.sat_o2(t-2)', 'document.sat_o2(t-3)', 'document.sat_o2(t-4)', 'document.temperatura(t)', 'document.temperatura(t-1)', 'document.temperatura(t-2)', 'document.temperatura(t-3)', 'document.temperatura(t-4)', 'delta_document.freq_cardiaca_t-t1', 'delta_document.freq_cardiaca_t1-t2', 'delta_document.freq_cardiaca_t2-t3', 'delta_document.freq_cardiaca_t3-t4', 'delta_document.freq_respiratoria_t-t1', 'delta_document.freq_respiratoria_t1-t2', 'delta_document.freq_respiratoria_t2-t3', 'delta_document.freq_respiratoria_t3-t4', 'delta_document.glicemia_capilar_t-t1', 'delta_document.glicemia_capilar_t1-t2', 'delta_document.glicemia_capilar_t2-t3', 'delta_document.glicemia_capilar_t3-t4', 'delta_document.pa_diastolica_t-t1', 'delta_document.pa_diastolica_t1-t2', 
'delta_document.pa_diastolica_t2-t3', 'delta_document.pa_diastolica_t3-t4', 'delta_document.pa_sistolica_t-t1', 'delta_document.pa_sistolica_t1-t2', 'delta_document.pa_sistolica_t2-t3', 'delta_document.pa_sistolica_t3-t4', 'delta_document.sat_o2_t-t1', 'delta_document.sat_o2_t1-t2', 'delta_document.sat_o2_t2-t3', 'delta_document.sat_o2_t3-t4', 'delta_document.temperatura_t-t1', 'delta_document.temperatura_t1-t2', 'delta_document.temperatura_t2-t3', 'delta_document.temperatura_t3-t4', 'document.alta.motivo'], dtype='object'))
# Feature matrix (every column except the target) and the target itself.
X = dataset.drop(["document.alta.motivo"], axis = 1)
Y = dataset["document.alta.motivo"]

# LightGBM configuration with previously tuned hyper-parameters.
# NOTE(review): `gamma` is an XGBoost parameter name and is not a documented
# LGBMClassifier argument (the LightGBM counterpart is `min_split_gain`);
# confirm whether it has any effect before relying on it.
lightgbm_tunned = lgb.LGBMClassifier(
    boosting_type='gbdt', class_weight=None,
    colsample_bytree=0.9341899590668798, gamma=0.06731944764385,
    importance_type='split', learning_rate=0.11067874018709263,
    max_depth=14, min_child_samples=20,
    min_child_weight=14.46086218129473, min_split_gain=0.0,
    n_estimators=293, n_jobs=-1, num_leaves=31, objective=None,
    random_state=None, reg_alpha=11.767083608890678, reg_lambda=0.0,
    silent=True, subsample=0.8930069556095456,
    subsample_for_bin=200000, subsample_freq=0,
)

# Shared 10-fold splitter; the fixed seed gives every model the same folds.
kfold = KFold(n_splits=10, random_state=7, shuffle=True)

# Models under comparison, keyed by the short label used in the printed tables.
# The decision tree and the random forest are seeded with the same value (7)
# as the other seeded estimators so that repeated runs produce the same scores.
classifiers = {
    'XGBoost' : XGBClassifier(learning_rate=0.1, n_estimators=100,random_state=7, tree_method='gpu_hist'),
    'LogReg': LogisticRegression(solver='liblinear', multi_class='ovr'),
    'D.Tree': DecisionTreeClassifier(random_state=7),
    'RForest': RandomForestClassifier(n_estimators = 50, random_state=7),
    'CatBoos': CatBoostClassifier(learning_rate=0.1,n_estimators=100,random_state=7,task_type='GPU',verbose = False),
    'Naive': GaussianNB(),
    'Light': lightgbm_tunned
}
# Global benchmark setup: all entities pooled together.
print('Entity ID: all, Count', len(dataset))
# Explicit feature list used for the pooled benchmark. It leaves out the
# 'entity', 'atendimento_id' and timestamp columns that exist in the dataset.
# The order of this list is the column order handed to the models, so it must
# not be rearranged.
cols = ['delta_document.pa_diastolica_t2-t3', 'delta_document.pa_sistolica_t3-t4', 'document.freq_cardiaca(t-4)', 'document.glicemia_capilar(t-1)', 'document.freq_respiratoria(t-1)', 'delta_document.pa_sistolica_t-t1', 'document.sat_o2(t-4)', 'document.pa_sistolica(t)', 'document.pa_sistolica(t-3)', 'document.sat_o2(t-3)', 'delta_document.temperatura_t3-t4', 'delta_document.pa_diastolica_t1-t2', 'document.glicemia_capilar(t-4)', 'document.freq_respiratoria(t-3)', 'delta_document.sat_o2_t2-t3', 'document.freq_cardiaca(t)', 'document.pa_sistolica(t-1)', 'document.pa_diastolica(t)', 'UTI', 'age', 'document.sexo', 'document.freq_cardiaca(t-1)', 'delta_document.temperatura_t-t1', 'document.pa_diastolica(t-2)', 'delta_document.temperatura_t2-t3', 'document.sat_o2(t-2)', 'days_from_entrance', 'delta_document.glicemia_capilar_t-t1', 'delta_document.sat_o2_t3-t4', 'delta_document.pa_diastolica_t3-t4', 'document.freq_cardiaca(t-3)', 'document.freq_respiratoria(t)', 'delta_document.glicemia_capilar_t1-t2', 'delta_document.freq_respiratoria_t3-t4', 'document.freq_cardiaca(t-2)', 'delta_document.pa_sistolica_t2-t3', 'delta_document.freq_cardiaca_t-t1', 'document.temperatura(t-4)', 'document.temperatura(t)', 'delta_document.freq_cardiaca_t2-t3', 'document.temperatura(t-1)', 'document.pa_diastolica(t-3)', 'delta_document.freq_cardiaca_t1-t2', 'document.glicemia_capilar(t-3)', 'delta_document.pa_diastolica_t-t1', 'document.pa_sistolica(t-4)', 'document.pa_diastolica(t-1)', 'document.glicemia_capilar(t)', 'delta_document.sat_o2_t1-t2', 'document.pa_sistolica(t-2)', 'delta_document.freq_respiratoria_t-t1', 'delta_document.sat_o2_t-t1', 'document.pa_diastolica(t-4)', 'delta_document.freq_respiratoria_t2-t3', 'document.sat_o2(t)', 'delta_document.glicemia_capilar_t3-t4', 'delta_document.pa_sistolica_t1-t2', 'delta_document.temperatura_t1-t2', 'document.glicemia_capilar(t-2)', 'document.freq_respiratoria(t-2)', 'document.freq_respiratoria(t-4)', 'document.temperatura(t-3)', 
'delta_document.freq_cardiaca_t3-t4', 'document.temperatura(t-2)', 'document.sat_o2(t-1)', 'delta_document.glicemia_capilar_t2-t3', 'delta_document.freq_respiratoria_t1-t2']
# Rebind X / Y to the curated feature subset and the target column.
X = dataset[cols]
Y = dataset["document.alta.motivo"]
# Pooled benchmark: 10-fold cross-validation of every classifier on X / Y.
# Both metrics are collected from a single cross_validate run, so each model is
# fitted once per fold instead of once per fold *per metric*.
# Printed columns: label, mean ROC AUC, (mean F1), elapsed seconds.
for name, model in classifiers.items():
    start = time.time()
    cv_results = cross_validate(model, X, Y, cv=kfold, scoring=('roc_auc', 'f1'))
    scores = cv_results['test_roc_auc']
    scores_f1 = cv_results['test_f1']
    print (name + '\t', round(scores.mean(),4), '(' + str(round(scores_f1.mean(),4)) + ')', round(time.time() - start,2), 's')
Entity ID: all, Count 121522 XGBoost 0.9559 (0.632) 10.02 s LogReg 0.9326 (0.5565) 967.61 s D.Tree 0.7482 (0.4994) 156.02 s RForest 0.9406 (0.609) 546.14 s CatBoos 0.9555 (0.6426) 31.79 s Naive 0.8418 (0.3793) 3.45 s Light 0.9611 (0.6715) 312.0 s
# Per-entity benchmark: repeat the cross-validation separately for the rows of
# each distinct value in the 'entity' column.
# groupby(sort=False) visits the entities in order of first appearance (the
# same order `unique()` yields) and skips rows whose entity is missing, which
# would otherwise produce an empty subset.
# NOTE(review): the feature matrix here keeps every column except the target,
# including 'entity', 'atendimento_id' and the timestamp columns, unlike the
# pooled benchmark which uses a curated feature list -- confirm this is intended.
for entity_id, dataset_entity in dataset.groupby('entity', sort=False):
    print('Entity ID:', entity_id, ', Count:',len(dataset_entity))
    X_ID = dataset_entity.drop(["document.alta.motivo"], axis = 1)
    Y_ID = dataset_entity["document.alta.motivo"]
    for name, model in classifiers.items():
        start = time.time()
        scores = cross_val_score(model, X_ID, Y_ID, cv=kfold, scoring='roc_auc')
        # Printed columns: label, mean ROC AUC, (+- std across folds), seconds.
        print ('\t' + name + '\t', round(scores.mean(),4), '(+-' + str(round(scores.std(),4)) + ')', round(time.time() - start,2), 's')
Entity ID: 1.0 , Count: 16674 XGBoost 0.9776 (+-0.0107) 21.4 s LogReg 0.7975 (+-0.077) 5.53 s D.Tree 0.804 (+-0.0326) 10.39 s RForest 0.9532 (+-0.0172) 33.14 s CatBoos 0.9764 (+-0.013) 19.49 s Naive 0.8898 (+-0.0281) 0.25 s Light 0.9772 (+-0.0103) 12.59 s Entity ID: 3.0 , Count: 28279 XGBoost 0.9587 (+-0.0068) 30.42 s LogReg 0.7345 (+-0.0986) 12.36 s D.Tree 0.7195 (+-0.0123) 20.82 s RForest 0.9346 (+-0.011) 59.67 s CatBoos 0.9548 (+-0.0061) 20.61 s Naive 0.8356 (+-0.0271) 0.43 s Light 0.9616 (+-0.0055) 27.89 s Entity ID: 5.0 , Count: 17091 XGBoost 0.9156 (+-0.0061) 20.89 s LogReg 0.5858 (+-0.0222) 3.42 s D.Tree 0.7038 (+-0.0185) 12.17 s RForest 0.8955 (+-0.0074) 35.9 s CatBoos 0.9153 (+-0.0068) 21.0 s Naive 0.8151 (+-0.0097) 0.27 s Light 0.9179 (+-0.0078) 20.22 s Entity ID: 8.0 , Count: 19811 XGBoost 0.9598 (+-0.0158) 21.0 s LogReg 0.717 (+-0.0359) 4.78 s D.Tree 0.8087 (+-0.0238) 14.32 s RForest 0.9491 (+-0.0156) 43.85 s CatBoos 0.9567 (+-0.016) 20.64 s Naive 0.8503 (+-0.0231) 0.31 s Light 0.9613 (+-0.014) 15.71 s Entity ID: 10.0 , Count: 6524 XGBoost 0.9779 (+-0.0135) 10.7 s LogReg 0.7244 (+-0.0769) 2.81 s D.Tree 0.8429 (+-0.0395) 3.08 s RForest 0.9641 (+-0.0204) 10.12 s CatBoos 0.9794 (+-0.0132) 18.46 s Naive 0.8133 (+-0.0352) 0.13 s Light 0.9817 (+-0.0091) 5.07 s Entity ID: 38.0 , Count: 33143 XGBoost 0.9643 (+-0.0063) 30.77 s LogReg 0.5936 (+-0.0282) 4.75 s D.Tree 0.7528 (+-0.0139) 30.18 s RForest 0.9483 (+-0.0084) 83.26 s CatBoos 0.9642 (+-0.0053) 20.45 s Naive 0.7082 (+-0.0269) 0.5 s Light 0.9668 (+-0.0064) 32.9 s
# Incremental benchmark: how much does each additional exam help?
# Starting from four static columns, every pass adds the columns of one more
# exam, from the oldest (t-4) to the most recent (t), and re-runs the
# cross-validation with the accumulated feature set.
cols = ['age', 'document.sexo', 'UTI', 'days_from_entrance']
# Columns of the oldest exam ("...(t-4)"), timestamp columns excluded.
t_cols = [c for c in dataset.columns if '4)' in c and 'time' not in c]
for i in [4,3,2,1,0]:
    if i == 4:
        cols.extend(t_cols)
    if i == 0:
        # Most recent exam: "...(t)" values and "..._t-t1" deltas.
        tN_cols = [c for c in dataset.columns if ('t)' in c or '_t-' in c) and 'time' not in c]
    else:
        # Exam t-i: "...(t-i)" values and "..._t<i>-t<i+1>" deltas.
        tN_cols = [c for c in dataset.columns if ('t-'+str(i) in c or '_t'+str(i) in c) and 'time' not in c]
    cols.extend(tN_cols)
    # De-duplicate in the dataset's own column order. A plain set() would give
    # a different (hash-dependent) feature order on every session, which makes
    # the printed list and the model scores irreproducible.
    selected = set(cols)
    cols = [c for c in dataset.columns if c in selected]
    print('Número de Colunas:', len(cols), 'Exame(s):', 5-i)
    print(cols)
    X_N = dataset[cols]
    # NOTE(review): an imputation step (`imputer.fit_transform`) was disabled
    # here in the original; no `imputer` is defined in the visible code.
    Y_N = dataset["document.alta.motivo"]
    for name, model in classifiers.items():
        start = time.time()
        scores = cross_val_score(model, X_N, Y_N, cv=kfold, scoring='roc_auc')
        # Printed columns: label, mean ROC AUC, (+- std across folds), seconds.
        print ('\t' + name + '\t', round(scores.mean(),4), '(+-' + str(round(scores.std(),4)) + ')', round(time.time() - start,2), 's')
Número de Colunas: 11 Exame(s): 1 ['document.glicemia_capilar(t-4)', 'document.pa_diastolica(t-4)', 'UTI', 'age', 'document.sexo', 'document.temperatura(t-4)', 'document.freq_cardiaca(t-4)', 'document.freq_respiratoria(t-4)', 'document.pa_sistolica(t-4)', 'days_from_entrance', 'document.sat_o2(t-4)'] XGBoost 0.9285 (+-0.0047) 2.96 s LogReg 0.9047 (+-0.0054) 11.25 s D.Tree 0.7107 (+-0.0074) 7.48 s RForest 0.9064 (+-0.0046) 72.15 s CatBoos 0.9301 (+-0.0045) 9.83 s Naive 0.8578 (+-0.006) 0.44 s Light 0.9354 (+-0.0044) 44.18 s Número de Colunas: 25 Exame(s): 2 ['delta_document.pa_sistolica_t3-t4', 'document.freq_cardiaca(t-4)', 'document.sat_o2(t-4)', 'document.pa_sistolica(t-3)', 'document.sat_o2(t-3)', 'delta_document.temperatura_t3-t4', 'document.glicemia_capilar(t-4)', 'document.freq_respiratoria(t-3)', 'UTI', 'age', 'document.sexo', 'days_from_entrance', 'document.freq_cardiaca(t-3)', 'delta_document.pa_diastolica_t3-t4', 'delta_document.sat_o2_t3-t4', 'delta_document.freq_respiratoria_t3-t4', 'document.temperatura(t-4)', 'document.pa_diastolica(t-3)', 'document.glicemia_capilar(t-3)', 'document.pa_sistolica(t-4)', 'document.pa_diastolica(t-4)', 'delta_document.glicemia_capilar_t3-t4', 'document.freq_respiratoria(t-4)', 'document.temperatura(t-3)', 'delta_document.freq_cardiaca_t3-t4'] XGBoost 0.9374 (+-0.005) 4.18 s LogReg 0.9138 (+-0.0064) 43.98 s D.Tree 0.7192 (+-0.0116) 20.04 s RForest 0.9141 (+-0.0085) 130.24 s CatBoos 0.9379 (+-0.0049) 13.25 s Naive 0.8337 (+-0.0067) 0.67 s Light 0.943 (+-0.0042) 71.26 s Número de Colunas: 39 Exame(s): 3 ['delta_document.pa_sistolica_t3-t4', 'delta_document.pa_diastolica_t2-t3', 'document.freq_cardiaca(t-4)', 'document.sat_o2(t-4)', 'document.pa_sistolica(t-3)', 'document.sat_o2(t-3)', 'delta_document.temperatura_t3-t4', 'document.glicemia_capilar(t-4)', 'document.freq_respiratoria(t-3)', 'delta_document.sat_o2_t2-t3', 'UTI', 'age', 'document.sexo', 'document.pa_diastolica(t-2)', 'delta_document.temperatura_t2-t3', 
'document.sat_o2(t-2)', 'days_from_entrance', 'delta_document.sat_o2_t3-t4', 'delta_document.pa_diastolica_t3-t4', 'document.freq_cardiaca(t-3)', 'delta_document.freq_respiratoria_t3-t4', 'document.freq_cardiaca(t-2)', 'delta_document.pa_sistolica_t2-t3', 'document.temperatura(t-4)', 'delta_document.freq_cardiaca_t2-t3', 'document.pa_diastolica(t-3)', 'document.glicemia_capilar(t-3)', 'document.pa_sistolica(t-4)', 'document.pa_sistolica(t-2)', 'document.pa_diastolica(t-4)', 'delta_document.glicemia_capilar_t3-t4', 'document.glicemia_capilar(t-2)', 'document.freq_respiratoria(t-2)', 'document.freq_respiratoria(t-4)', 'document.temperatura(t-3)', 'delta_document.freq_cardiaca_t3-t4', 'document.temperatura(t-2)', 'delta_document.freq_respiratoria_t2-t3', 'delta_document.glicemia_capilar_t2-t3'] XGBoost 0.944 (+-0.0045) 4.35 s LogReg 0.9201 (+-0.0058) 122.47 s D.Tree 0.7272 (+-0.0084) 35.78 s RForest 0.9237 (+-0.0074) 174.16 s CatBoos 0.9445 (+-0.0046) 14.22 s Naive 0.8316 (+-0.0074) 1.08 s Light 0.9495 (+-0.0041) 100.16 s Número de Colunas: 53 Exame(s): 4 ['delta_document.pa_diastolica_t2-t3', 'delta_document.pa_sistolica_t3-t4', 'document.freq_cardiaca(t-4)', 'document.freq_respiratoria(t-1)', 'document.glicemia_capilar(t-1)', 'document.sat_o2(t-4)', 'document.pa_sistolica(t-3)', 'document.sat_o2(t-3)', 'delta_document.temperatura_t3-t4', 'delta_document.pa_diastolica_t1-t2', 'document.glicemia_capilar(t-4)', 'document.freq_respiratoria(t-3)', 'delta_document.sat_o2_t2-t3', 'document.pa_sistolica(t-1)', 'UTI', 'age', 'document.sexo', 'document.freq_cardiaca(t-1)', 'document.pa_diastolica(t-2)', 'delta_document.temperatura_t2-t3', 'document.sat_o2(t-2)', 'days_from_entrance', 'delta_document.sat_o2_t3-t4', 'delta_document.pa_diastolica_t3-t4', 'document.freq_cardiaca(t-3)', 'delta_document.glicemia_capilar_t1-t2', 'delta_document.freq_respiratoria_t3-t4', 'document.freq_cardiaca(t-2)', 'delta_document.pa_sistolica_t2-t3', 'document.sat_o2(t-1)', 
'document.temperatura(t-4)', 'delta_document.freq_cardiaca_t2-t3', 'document.temperatura(t-1)', 'document.pa_diastolica(t-3)', 'delta_document.freq_cardiaca_t1-t2', 'document.glicemia_capilar(t-3)', 'document.pa_sistolica(t-4)', 'document.pa_diastolica(t-1)', 'delta_document.sat_o2_t1-t2', 'document.pa_sistolica(t-2)', 'document.pa_diastolica(t-4)', 'delta_document.glicemia_capilar_t3-t4', 'delta_document.pa_sistolica_t1-t2', 'delta_document.temperatura_t1-t2', 'document.glicemia_capilar(t-2)', 'document.freq_respiratoria(t-2)', 'document.freq_respiratoria(t-4)', 'document.temperatura(t-3)', 'delta_document.freq_cardiaca_t3-t4', 'document.temperatura(t-2)', 'delta_document.freq_respiratoria_t2-t3', 'delta_document.glicemia_capilar_t2-t3', 'delta_document.freq_respiratoria_t1-t2'] XGBoost 0.9506 (+-0.0043) 5.3 s LogReg 0.9286 (+-0.005) 236.43 s D.Tree 0.7362 (+-0.0049) 58.34 s RForest 0.9327 (+-0.0055) 221.28 s CatBoos 0.9502 (+-0.0041) 15.24 s Naive 0.8366 (+-0.007) 1.51 s Light 0.9559 (+-0.0038) 128.46 s Número de Colunas: 67 Exame(s): 5 ['delta_document.pa_diastolica_t2-t3', 'delta_document.pa_sistolica_t3-t4', 'document.freq_cardiaca(t-4)', 'document.glicemia_capilar(t-1)', 'document.freq_respiratoria(t-1)', 'delta_document.pa_sistolica_t-t1', 'document.sat_o2(t-4)', 'document.pa_sistolica(t)', 'document.pa_sistolica(t-3)', 'document.sat_o2(t-3)', 'delta_document.temperatura_t3-t4', 'delta_document.pa_diastolica_t1-t2', 'document.glicemia_capilar(t-4)', 'document.freq_respiratoria(t-3)', 'delta_document.sat_o2_t2-t3', 'document.freq_cardiaca(t)', 'document.pa_sistolica(t-1)', 'document.pa_diastolica(t)', 'UTI', 'age', 'document.sexo', 'document.freq_cardiaca(t-1)', 'delta_document.temperatura_t-t1', 'document.pa_diastolica(t-2)', 'delta_document.temperatura_t2-t3', 'document.sat_o2(t-2)', 'days_from_entrance', 'delta_document.glicemia_capilar_t-t1', 'delta_document.sat_o2_t3-t4', 'delta_document.pa_diastolica_t3-t4', 'document.freq_cardiaca(t-3)', 
'document.freq_respiratoria(t)', 'delta_document.glicemia_capilar_t1-t2', 'delta_document.freq_respiratoria_t3-t4', 'document.freq_cardiaca(t-2)', 'delta_document.pa_sistolica_t2-t3', 'delta_document.freq_cardiaca_t-t1', 'document.temperatura(t-4)', 'document.temperatura(t)', 'delta_document.freq_cardiaca_t2-t3', 'document.temperatura(t-1)', 'document.pa_diastolica(t-3)', 'delta_document.freq_cardiaca_t1-t2', 'document.glicemia_capilar(t-3)', 'delta_document.pa_diastolica_t-t1', 'document.pa_sistolica(t-4)', 'document.pa_diastolica(t-1)', 'document.glicemia_capilar(t)', 'delta_document.sat_o2_t1-t2', 'document.pa_sistolica(t-2)', 'delta_document.freq_respiratoria_t-t1', 'delta_document.sat_o2_t-t1', 'document.pa_diastolica(t-4)', 'delta_document.freq_respiratoria_t2-t3', 'document.sat_o2(t)', 'delta_document.glicemia_capilar_t3-t4', 'delta_document.pa_sistolica_t1-t2', 'delta_document.temperatura_t1-t2', 'document.glicemia_capilar(t-2)', 'document.freq_respiratoria(t-2)', 'document.freq_respiratoria(t-4)', 'document.temperatura(t-3)', 'delta_document.freq_cardiaca_t3-t4', 'document.temperatura(t-2)', 'document.sat_o2(t-1)', 'delta_document.glicemia_capilar_t2-t3', 'delta_document.freq_respiratoria_t1-t2'] XGBoost 0.9559 (+-0.0039) 5.1 s LogReg 0.9326 (+-0.0055) 493.28 s D.Tree 0.7465 (+-0.0076) 78.28 s RForest 0.9402 (+-0.006) 271.72 s CatBoos 0.9555 (+-0.0041) 15.67 s Naive 0.8418 (+-0.0068) 1.76 s Light 0.9611 (+-0.0039) 156.87 s
## based on: https://www.mdcalc.com/modified-early-warning-score-mews-clinical-deterioration
## To be reviewed with a clinician
def mews_score(x, t):
    """Return a partial Modified Early Warning Score for one observation.

    Only four components are scored: systolic blood pressure, heart rate,
    respiratory rate and temperature. The AVPU (consciousness) component is
    not available in the data and therefore contributes nothing.

    Args:
        x: Mapping-like row (e.g. a pandas Series or a dict) indexed by the
            dataset column names.
        t: Exam offset inserted into the column name: '' selects the
            "...(t)" columns, '-1' the "...(t-1)" columns, and so on.

    Returns:
        The integer sum of the four component scores.

    Raises:
        KeyError: If one of the required columns is absent from ``x``.

    A missing (NaN) measurement contributes 0 points. Every branch below is an
    explicit comparison, and comparisons with NaN are always false, so a NaN
    can no longer fall through to the highest-risk band.
    """
    systolic_bp = x['document.pa_sistolica(t'+t+')']
    heart_rate = x['document.freq_cardiaca(t'+t+')']
    respiratory_rate = x['document.freq_respiratoria(t'+t+')']
    temperature = x['document.temperatura(t'+t+')']

    mews = 0

    # Systolic blood pressure; values above 100 and below 200 score 0.
    if systolic_bp <= 70:
        mews += 3
    elif systolic_bp <= 80:
        mews += 2
    elif systolic_bp <= 100:
        mews += 1
    elif systolic_bp >= 200:
        mews += 2

    # Heart rate; values above 50 up to 100 score 0.
    if heart_rate <= 40:
        mews += 2
    elif heart_rate <= 50:
        mews += 1
    elif heart_rate <= 100:
        pass
    elif heart_rate <= 110:
        mews += 1
    elif heart_rate < 130:
        mews += 2
    elif heart_rate >= 130:
        mews += 3

    # Respiratory rate; 9 to 14 scores 0.
    if respiratory_rate < 9:
        mews += 2
    elif respiratory_rate <= 14:
        pass
    elif respiratory_rate <= 20:
        mews += 1
    elif respiratory_rate < 30:
        mews += 2
    elif respiratory_rate >= 30:
        mews += 3

    # Temperature; 35 to 38.4 scores 0.
    if temperature < 35:
        mews += 2
    elif temperature > 38.4:
        mews += 2

    return mews
# Score every row with MEWS at each of the five exams, then check how well the
# thresholded score separates the target.
columns = dataset.columns.drop(["document.alta.motivo"])
# Build the reindexed input frame once instead of once per exam; columns that
# X does not contain come out as all-NaN and are not read by mews_score.
mews_input = pd.DataFrame(X, columns=columns)
pd_mews = pd.DataFrame()
for label, offset in [('t4', '-4'), ('t3', '-3'), ('t2', '-2'), ('t1', '-1'), ('t', '')]:
    pd_mews[label] = mews_input.apply(mews_score, axis=1, args=(offset,))
# For each exam and each cut-off i, flag rows with score > i and report
# ROC AUC and F1 of that binary flag against Y.
# NOTE(review): the AUC is computed on the thresholded 0/1 flag, not on the
# raw score.
for t in ['t4','t3','t2','t1','t']:
    print('----',t,'-----')
    for i in range(4):
        mews_binary = (pd_mews[t] > i) * 1
        print(i, round(roc_auc_score(Y, mews_binary),4), round(f1_score(Y, mews_binary),4))
---- t4 ----- 0 0.5042 0.0965 1 0.6584 0.1634 2 0.626 0.1814 3 0.581 0.1659 ---- t3 ----- 0 0.5045 0.0966 1 0.6738 0.1705 2 0.6405 0.1991 3 0.5904 0.1799 ---- t2 ----- 0 0.5029 0.0963 1 0.6768 0.17 2 0.6513 0.2053 3 0.5958 0.182 ---- t1 ----- 0 0.5024 0.0962 1 0.6891 0.1742 2 0.6718 0.2258 3 0.6104 0.2025 ---- t ----- 0 0.5017 0.0961 1 0.6976 0.1755 2 0.6829 0.2321 3 0.6201 0.2075
# Confusion matrix of Y against the last binarised MEWS flag left over from the
# loop above, i.e. exam 't' with cut-off score > 3.
confusion_matrix(Y, mews_binary)
## based on: https://www.rcplondon.ac.uk/projects/outputs/national-early-warning-score-news-2
## To be reviewed with a clinician
def news2_score(x, t):
    """Return a partial NEWS2 score for one observation.

    Only four components are scored: systolic blood pressure, pulse,
    respiratory rate and temperature. Oxygen saturation, supplemental oxygen
    and level of consciousness are not part of this computation.

    Args:
        x: Mapping-like row (e.g. a pandas Series or a dict) indexed by the
            dataset column names.
        t: Exam offset inserted into the column name: '' selects the
            "...(t)" columns, '-1' the "...(t-1)" columns, and so on.

    Returns:
        The integer sum of the four component scores.

    Raises:
        KeyError: If one of the required columns is absent from ``x``.

    A missing (NaN) measurement contributes 0 points. Every branch below is an
    explicit comparison, and comparisons with NaN are always false, so a NaN
    can no longer fall through to the highest-risk band.
    """
    systolic_bp = x['document.pa_sistolica(t'+t+')']
    heart_rate = x['document.freq_cardiaca(t'+t+')']
    respiratory_rate = x['document.freq_respiratoria(t'+t+')']
    temperature = x['document.temperatura(t'+t+')']

    news2 = 0

    # Systolic blood pressure; values above 110 and below 220 score 0.
    if systolic_bp <= 90:
        news2 += 3
    elif systolic_bp <= 100:
        news2 += 2
    elif systolic_bp <= 110:
        news2 += 1
    elif systolic_bp >= 220:
        news2 += 3

    # Pulse; the 111-130 band scores 2, so exactly 130 belongs to it and only
    # values above 130 score 3.
    if heart_rate <= 40:
        news2 += 3
    elif heart_rate <= 50:
        news2 += 1
    elif heart_rate <= 90:
        pass
    elif heart_rate <= 110:
        news2 += 1
    elif heart_rate <= 130:
        news2 += 2
    elif heart_rate > 130:
        news2 += 3

    # Respiratory rate; 12 to 20 scores 0.
    if respiratory_rate <= 8:
        news2 += 3
    elif respiratory_rate <= 11:
        news2 += 1
    elif respiratory_rate <= 20:
        pass
    elif respiratory_rate <= 24:
        news2 += 2
    elif respiratory_rate > 24:
        news2 += 3

    # Temperature; above 36 up to 38 scores 0.
    if temperature <= 35:
        news2 += 3
    elif temperature <= 36:
        news2 += 1
    elif temperature <= 38:
        pass
    elif temperature <= 39:
        news2 += 1
    elif temperature > 39:
        news2 += 2

    return news2
# Score every row with NEWS2 at each of the five exams, then check how well the
# thresholded score separates the target.
columns = dataset.columns.drop(["document.alta.motivo"])
# Build the reindexed input frame once instead of once per exam; columns that
# X does not contain come out as all-NaN and are not read by news2_score.
news2_input = pd.DataFrame(X, columns=columns)
pd_news2 = pd.DataFrame()
for label, offset in [('t4', '-4'), ('t3', '-3'), ('t2', '-2'), ('t1', '-1'), ('t', '')]:
    pd_news2[label] = news2_input.apply(news2_score, axis=1, args=(offset,))
# For each exam and each cut-off i, flag rows with score > i and report
# ROC AUC and F1 of that binary flag against Y.
# NOTE(review): the AUC is computed on the thresholded 0/1 flag, not on the
# raw score.
for t in ['t4','t3','t2','t1','t']:
    print('----',t,'-----')
    for i in range(6):
        news2_binary = (pd_news2[t] > i) * 1
        print(i, round(roc_auc_score(Y, news2_binary),4), round(f1_score(Y, news2_binary),4))
---- t4 ----- 0 0.5627 0.1086 1 0.6518 0.1467 2 0.6452 0.1667 3 0.6216 0.179 4 0.5836 0.1634 5 0.5506 0.1348 ---- t3 ----- 0 0.5716 0.1112 1 0.6429 0.1449 2 0.6512 0.1728 3 0.6242 0.1831 4 0.5804 0.1591 5 0.551 0.1339 ---- t2 ----- 0 0.5691 0.1106 1 0.6501 0.1475 2 0.6595 0.1752 3 0.6269 0.1823 4 0.5851 0.163 5 0.5521 0.1338 ---- t1 ----- 0 0.5785 0.1128 1 0.666 0.1523 2 0.6783 0.1853 3 0.6501 0.2049 4 0.604 0.1889 5 0.5657 0.1582 ---- t ----- 0 0.5846 0.1139 1 0.6822 0.1565 2 0.7048 0.1964 3 0.6773 0.2231 4 0.6232 0.2037 5 0.5754 0.1673
# Average gap between consecutive collections, one value per delta column.
# The division by 3600 presumably converts seconds to hours -- confirm the
# unit of the delta_collect_timestamp_* columns.
delta_t_cols = [name for name in dataset.columns if 'delta_collect' in name]
dataset[delta_t_cols].mean().div(3600)
delta_collect_timestamp_t-t1 4.235348 delta_collect_timestamp_t1-t2 3.413571 delta_collect_timestamp_t2-t3 3.116624 delta_collect_timestamp_t3-t4 3.233622 dtype: float64