"""Three worked examples, reconstructed from a whitespace-mangled notebook export:

1. Stepwise feature selection (forward / backward / bidirectional) on the
   Boston housing data using statsmodels OLS p-values.
2. Classification metrics (confusion matrix, precision/recall/F1) for a
   RandomForest on the breast-cancer dataset.
3. Regression metrics (MAE/MSE/RMSE/R^2) for a LinearRegression on the
   diabetes dataset.
"""

import numpy as np
import pandas as pd
import statsmodels.api as sm
import seaborn as sns
from matplotlib import pyplot as plt

from sklearn import metrics
from sklearn.datasets import load_boston, load_breast_cancer, load_diabetes
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# ---------------------------------------------------------------------------
# Part 1: p-value-based feature selection on the Boston housing data
# ---------------------------------------------------------------------------
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and REMOVED in
# 1.2 (ethical concerns about the dataset). This call only works on
# scikit-learn < 1.2; see the sklearn docs for the suggested replacement.
boston = load_boston()
print(boston.data.shape)      # dataset dimensions (506 rows x 13 features)
print(boston.feature_names)   # feature names
print(boston.target)          # target variable (median house price)
print(boston.DESCR)           # dataset description

bos = pd.DataFrame(boston.data, columns=boston.feature_names)
bos['Price'] = boston.target

# Positional `axis` was removed from DataFrame.drop in pandas 2.0;
# `columns=` is the explicit, version-safe spelling.
X = bos.drop(columns="Price")  # feature matrix
y = bos['Price']               # target vector

print(bos.head())
print(X)
print(y)


def forward_selection(data, target, significance_level=0.01):
    """Forward stepwise selection by OLS p-values.

    Starting from an empty model, repeatedly add the candidate feature with
    the smallest p-value, as long as that p-value is below
    ``significance_level``.

    Parameters
    ----------
    data : pd.DataFrame
        Candidate feature matrix.
    target : pd.Series
        Response variable.
    significance_level : float
        Entry threshold for a feature's p-value.

    Returns
    -------
    list of str
        Selected feature names, in order of inclusion.
    """
    initial_features = data.columns.tolist()
    best_features = []
    while len(initial_features) > 0:
        remaining_features = list(set(initial_features) - set(best_features))
        # dtype=float avoids the object-dtype FutureWarning on empty Series
        new_pval = pd.Series(index=remaining_features, dtype=float)
        for new_column in remaining_features:
            model = sm.OLS(
                target,
                sm.add_constant(data[best_features + [new_column]]),
            ).fit()
            new_pval[new_column] = model.pvalues[new_column]
        min_p_value = new_pval.min()
        if min_p_value < significance_level:
            best_features.append(new_pval.idxmin())
        else:
            # no remaining candidate is significant — stop
            break
    return best_features


print(forward_selection(X, y))


def backward_elimination(data, target, significance_level=0.05):
    """Backward stepwise elimination by OLS p-values.

    Starting from the full model, repeatedly drop the feature with the
    largest p-value while that p-value is >= ``significance_level``.

    Returns the list of surviving feature names.
    """
    features = data.columns.tolist()
    while len(features) > 0:
        features_with_constant = sm.add_constant(data[features])
        # pvalues[1:] skips the intercept added by add_constant
        p_values = sm.OLS(target, features_with_constant).fit().pvalues[1:]
        max_p_value = p_values.max()
        if max_p_value >= significance_level:
            excluded_feature = p_values.idxmax()
            features.remove(excluded_feature)
        else:
            # every remaining feature is significant — stop
            break
    return features


print(backward_elimination(X, y))


def stepwise_selection(data, target, SL_in=0.05, SL_out=0.05):
    """Bidirectional stepwise selection.

    Each iteration performs a forward step (add the most significant
    candidate if its p-value < ``SL_in``) followed by backward steps
    (drop any already-selected feature whose p-value >= ``SL_out``).

    Returns the list of selected feature names.
    """
    initial_features = data.columns.tolist()
    best_features = []
    while len(initial_features) > 0:
        # --- forward step: try to add the best remaining candidate ---
        remaining_features = list(set(initial_features) - set(best_features))
        new_pval = pd.Series(index=remaining_features, dtype=float)
        for new_column in remaining_features:
            model = sm.OLS(
                target,
                sm.add_constant(data[best_features + [new_column]]),
            ).fit()
            new_pval[new_column] = model.pvalues[new_column]
        min_p_value = new_pval.min()
        if min_p_value < SL_in:
            best_features.append(new_pval.idxmin())
            # --- backward step: prune features made redundant by the add ---
            while len(best_features) > 0:
                best_features_with_constant = sm.add_constant(data[best_features])
                p_values = sm.OLS(
                    target, best_features_with_constant
                ).fit().pvalues[1:]
                max_p_value = p_values.max()
                if max_p_value >= SL_out:
                    excluded_feature = p_values.idxmax()
                    best_features.remove(excluded_feature)
                else:
                    break
        else:
            break
    return best_features


print(stepwise_selection(X, y))

# ---------------------------------------------------------------------------
# Part 2: classification metrics on the breast-cancer dataset
# ---------------------------------------------------------------------------
sns.set_style('whitegrid')

# Load the breast-cancer dataset
data = load_breast_cancer()

# Design matrix X and response vector y.
# The raw target is 0 = malignant, 1 = benign; flipping it makes
# 1 = malignant the "positive" class for the metrics below.
X = pd.DataFrame(data['data'], columns=data['feature_names'])
y = abs(pd.Series(data['target']) - 1)

# 80/20 train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

# Random Forest with default hyperparameters
modelo = RandomForestClassifier(random_state=1)
modelo.fit(X_train, y_train)

# Predictions on the held-out test set
preds = modelo.predict(X_test)

plt.figure(figsize=(10, 6))
# plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator is the supported replacement.
metrics.ConfusionMatrixDisplay.from_estimator(
    modelo, X_test, y_test, display_labels=['Negative', 'Positive']
)

confusion = metrics.confusion_matrix(y_test, preds)
print(confusion.ravel())  # (tn, fp, fn, tp)

accuracy = metrics.accuracy_score(y_test, preds)
print(accuracy)

# Precision is evaluated per class
precision_positiva = metrics.precision_score(y_test, preds, pos_label=1)
precision_negativa = metrics.precision_score(y_test, preds, pos_label=0)
print(precision_positiva, precision_negativa)

# Recall on the positive class = sensitivity; on the negative = specificity
recall_sensibilidad = metrics.recall_score(y_test, preds, pos_label=1)
recall_especificidad = metrics.recall_score(y_test, preds, pos_label=0)
print(recall_sensibilidad, recall_especificidad)

f1_positivo = metrics.f1_score(y_test, preds, pos_label=1)
f1_negativo = metrics.f1_score(y_test, preds, pos_label=0)
print(f1_positivo, f1_negativo)

# All of the above metrics in one report
print(metrics.classification_report(y_test, preds))

# ---------------------------------------------------------------------------
# Part 3: regression metrics on the diabetes dataset
# ---------------------------------------------------------------------------
diabetes_X, diabetes_y = load_diabetes(return_X_y=True)
print(diabetes_X)
print(diabetes_y)

X_train, X_test, y_train, y_test = train_test_split(
    diabetes_X, diabetes_y, test_size=0.2, random_state=2
)

# Create and fit the linear model
lr = LinearRegression()
lr.fit(X_train, y_train)

# Predict on the test set
y_pred = lr.predict(X_test)

print("MAE", mean_absolute_error(y_test, y_pred))
print("MSE", mean_squared_error(y_test, y_pred))
print("RMSE", np.sqrt(mean_squared_error(y_test, y_pred)))
# BUGFIX: this value is log(RMSE), not RMSE — label it accordingly
print("log(RMSE)", np.log(np.sqrt(mean_squared_error(y_test, y_pred))))

r2 = r2_score(y_test, y_pred)
print(r2)