# librerias
from numpy import mean
from numpy import std
from pandas import read_csv
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
Descripción de los datos: https://raw.githubusercontent.com/jbrownlee/Datasets/master/sonar.names
Enlace a los datos: https://raw.githubusercontent.com/jbrownlee/Datasets/master/sonar.csv
El archivo "sonar.mines" contiene 111 patrones obtenidos al hacer rebotar señales de sonar en un cilindro de metal en varios ángulos y bajo diversas condiciones. El archivo "sonar.rocks" contiene 97 patrones obtenidos de rocas en condiciones similares. La señal del sonar transmitida es un chirrido de frecuencia modulada, aumentando en frecuencia. El conjunto de datos contiene señales obtenidas desde una variedad de ángulos de aspecto diferentes, que abarcan 90 grados para el cilindro y 180 grados para la roca.
Cada patrón es un conjunto de 60 números en el rango de 0,0 a 1,0. Cada número representa la energía dentro de una banda de frecuencia particular, integrada durante un cierto período de tiempo. La apertura de integración para frecuencias más altas ocurre más tarde, ya que estas frecuencias se transmiten más tarde durante el chirrido.
La etiqueta asociada a cada registro contiene la letra "R" si el objeto es una roca y "M" si es una mina (cilindro de metal). Los números en las etiquetas están en orden creciente de ángulo de aspecto, pero no codifican el ángulo directamente.
# Load the sonar dataset (208 rows: 60 numeric features + class label in the last column).
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/sonar.csv'
dataframe = read_csv(url, header=None)  # the raw CSV has no header row
data = dataframe.values  # NumPy array with all columns (features + label)
dataframe.head()  # notebook-style preview of the first 5 rows
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0200 | 0.0371 | 0.0428 | 0.0207 | 0.0954 | 0.0986 | 0.1539 | 0.1601 | 0.3109 | 0.2111 | ... | 0.0027 | 0.0065 | 0.0159 | 0.0072 | 0.0167 | 0.0180 | 0.0084 | 0.0090 | 0.0032 | R |
1 | 0.0453 | 0.0523 | 0.0843 | 0.0689 | 0.1183 | 0.2583 | 0.2156 | 0.3481 | 0.3337 | 0.2872 | ... | 0.0084 | 0.0089 | 0.0048 | 0.0094 | 0.0191 | 0.0140 | 0.0049 | 0.0052 | 0.0044 | R |
2 | 0.0262 | 0.0582 | 0.1099 | 0.1083 | 0.0974 | 0.2280 | 0.2431 | 0.3771 | 0.5598 | 0.6194 | ... | 0.0232 | 0.0166 | 0.0095 | 0.0180 | 0.0244 | 0.0316 | 0.0164 | 0.0095 | 0.0078 | R |
3 | 0.0100 | 0.0171 | 0.0623 | 0.0205 | 0.0205 | 0.0368 | 0.1098 | 0.1276 | 0.0598 | 0.1264 | ... | 0.0121 | 0.0036 | 0.0150 | 0.0085 | 0.0073 | 0.0050 | 0.0044 | 0.0040 | 0.0117 | R |
4 | 0.0762 | 0.0666 | 0.0481 | 0.0394 | 0.0590 | 0.0649 | 0.1209 | 0.2467 | 0.3564 | 0.4459 | ... | 0.0031 | 0.0054 | 0.0105 | 0.0110 | 0.0015 | 0.0072 | 0.0048 | 0.0107 | 0.0094 | R |
5 rows × 61 columns
dataframe.shape  # (rows, columns); recorded output below shows (208, 61)
(208, 61)
dataframe.isnull().sum()  # per-column count of missing values
0 0 1 0 2 0 3 0 4 0 .. 56 0 57 0 58 0 59 0 60 0 Length: 61, dtype: int64
sum(dataframe.isna().sum())  # total count of missing values across all columns
# no nulls present
0
# Separate features (all but the last column) from the target (last column).
X = data[:, :-1]
y = data[:, -1]
print(X.shape, y.shape)
(208, 60) (208,)
# Create the leave-one-out cross-validation procedure: one fold per sample,
# so the model is fitted 208 times on n-1 rows and tested on the held-out row.
cv = LeaveOneOut()
# LeaveOneOut?  # IPython help syntax -- not valid plain Python, kept as a comment
# Create the model: a small random forest (30 trees, max depth 4) with a fixed
# seed so the evaluation is reproducible.
model = RandomForestClassifier(random_state=1, n_estimators=30, criterion="gini", max_depth=4)
# cross_val_score?  # IPython help syntax -- not valid plain Python, kept as a comment
# Evaluate the model: one accuracy value (0 or 1) per held-out sample.
scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, verbose=1)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers. [Parallel(n_jobs=1)]: Done 208 out of 208 | elapsed: 14.7s finished
# Report the performance: mean LOOCV accuracy and its standard deviation.
acc_mean = mean(scores)
acc_std = std(scores)
print('Accuracy: %.3f (%.3f)' % (acc_mean, acc_std))
Accuracy: 0.808 (0.394)
El modelo se evalúa utilizando LOOCV y el rendimiento estimado al hacer predicciones sobre nuevos datos tiene una precisión de alrededor del 80%.
Descripción de los datos: https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.names
Enlace a los datos: https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.csv
# libraries
from pandas import read_csv
# Load the housing dataset (506 rows: 13 numeric features + target price in the last column).
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.csv'
dataframe = read_csv(url, header=None)  # the raw CSV has no header row
# shape
print(dataframe.shape)
(506, 14)
dataframe
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.00632 | 18.0 | 2.31 | 0 | 0.538 | 6.575 | 65.2 | 4.0900 | 1 | 296.0 | 15.3 | 396.90 | 4.98 | 24.0 |
1 | 0.02731 | 0.0 | 7.07 | 0 | 0.469 | 6.421 | 78.9 | 4.9671 | 2 | 242.0 | 17.8 | 396.90 | 9.14 | 21.6 |
2 | 0.02729 | 0.0 | 7.07 | 0 | 0.469 | 7.185 | 61.1 | 4.9671 | 2 | 242.0 | 17.8 | 392.83 | 4.03 | 34.7 |
3 | 0.03237 | 0.0 | 2.18 | 0 | 0.458 | 6.998 | 45.8 | 6.0622 | 3 | 222.0 | 18.7 | 394.63 | 2.94 | 33.4 |
4 | 0.06905 | 0.0 | 2.18 | 0 | 0.458 | 7.147 | 54.2 | 6.0622 | 3 | 222.0 | 18.7 | 396.90 | 5.33 | 36.2 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
501 | 0.06263 | 0.0 | 11.93 | 0 | 0.573 | 6.593 | 69.1 | 2.4786 | 1 | 273.0 | 21.0 | 391.99 | 9.67 | 22.4 |
502 | 0.04527 | 0.0 | 11.93 | 0 | 0.573 | 6.120 | 76.7 | 2.2875 | 1 | 273.0 | 21.0 | 396.90 | 9.08 | 20.6 |
503 | 0.06076 | 0.0 | 11.93 | 0 | 0.573 | 6.976 | 91.0 | 2.1675 | 1 | 273.0 | 21.0 | 396.90 | 5.64 | 23.9 |
504 | 0.10959 | 0.0 | 11.93 | 0 | 0.573 | 6.794 | 89.3 | 2.3889 | 1 | 273.0 | 21.0 | 393.45 | 6.48 | 22.0 |
505 | 0.04741 | 0.0 | 11.93 | 0 | 0.573 | 6.030 | 80.8 | 2.5050 | 1 | 273.0 | 21.0 | 396.90 | 7.88 | 11.9 |
506 rows × 14 columns
dataframe.dtypes  # confirm every column was parsed as a numeric type
0 float64 1 float64 2 float64 3 int64 4 float64 5 float64 6 float64 7 float64 8 int64 9 float64 10 float64 11 float64 12 float64 13 float64 dtype: object
dataframe.isnull().sum()  # per-column count of missing values (all zero per output below)
0 0 1 0 2 0 3 0 4 0 5 0 6 0 7 0 8 0 9 0 10 0 11 0 12 0 13 0 dtype: int64
# Separate features X (all but the last column) from target y (house price, last column).
data = dataframe.values
X = data[:, :-1]
y = data[:, -1]
print(X.shape, y.shape)
(506, 13) (506,)
# Create the LOOCV procedure: one fold per row, so 506 model fits.
cv = LeaveOneOut()
# Create the model: a small random forest regressor (10 trees, max depth 4)
# with a fixed seed for reproducibility.
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(random_state=42, n_estimators=10, max_depth=4)
# cross_val_score?  # IPython help syntax -- not valid plain Python, kept as a comment
Métodos de scoring disponibles:
https://scikit-learn.org/stable/modules/model_evaluation.html
# evaluate the model (comparison criterion: MAE)
from sklearn.metrics import mean_squared_error, make_scorer, mean_absolute_error
# NOTE: make_scorer(mean_absolute_error) already yields positive MAE values,
# unlike the built-in 'neg_mean_absolute_error' scoring string (which negates them).
MAE = make_scorer(mean_absolute_error)
scores = cross_val_score(model, X, y, scoring=MAE, cv=cv,error_score='raise',verbose=1)
# convert to positive values (defensive: scores from make_scorer above are already >= 0)
scores = abs(scores)
# report the performance: mean LOOCV absolute error and its standard deviation
print('MAE: %.3f (%.3f)' % (mean(scores), std(scores)))
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
MAE: 2.650 (2.727)
[Parallel(n_jobs=1)]: Done 506 out of 506 | elapsed: 11.9s finished
El modelo se evalúa usando LOOCV y el desempeño del modelo al hacer predicciones sobre nuevos datos tiene un MAE de ~2.650 (miles de dólares)
# data: reload the sonar dataset for a simple hold-out (train/test split) evaluation
from sklearn.model_selection import train_test_split
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/sonar.csv'
dataframe = read_csv(url, header=None)
dataframe.head()  # preview the first 5 rows
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0200 | 0.0371 | 0.0428 | 0.0207 | 0.0954 | 0.0986 | 0.1539 | 0.1601 | 0.3109 | 0.2111 | ... | 0.0027 | 0.0065 | 0.0159 | 0.0072 | 0.0167 | 0.0180 | 0.0084 | 0.0090 | 0.0032 | R |
1 | 0.0453 | 0.0523 | 0.0843 | 0.0689 | 0.1183 | 0.2583 | 0.2156 | 0.3481 | 0.3337 | 0.2872 | ... | 0.0084 | 0.0089 | 0.0048 | 0.0094 | 0.0191 | 0.0140 | 0.0049 | 0.0052 | 0.0044 | R |
2 | 0.0262 | 0.0582 | 0.1099 | 0.1083 | 0.0974 | 0.2280 | 0.2431 | 0.3771 | 0.5598 | 0.6194 | ... | 0.0232 | 0.0166 | 0.0095 | 0.0180 | 0.0244 | 0.0316 | 0.0164 | 0.0095 | 0.0078 | R |
3 | 0.0100 | 0.0171 | 0.0623 | 0.0205 | 0.0205 | 0.0368 | 0.1098 | 0.1276 | 0.0598 | 0.1264 | ... | 0.0121 | 0.0036 | 0.0150 | 0.0085 | 0.0073 | 0.0050 | 0.0044 | 0.0040 | 0.0117 | R |
4 | 0.0762 | 0.0666 | 0.0481 | 0.0394 | 0.0590 | 0.0649 | 0.1209 | 0.2467 | 0.3564 | 0.4459 | ... | 0.0031 | 0.0054 | 0.0105 | 0.0110 | 0.0015 | 0.0072 | 0.0048 | 0.0107 | 0.0094 | R |
5 rows × 61 columns
# train_test_split?  # IPython help syntax -- not valid plain Python, kept as a comment
data = dataframe.values
X, y = data[:, :-1], data[:, -1]
# Hold out 20% of the rows for testing; a fixed random_state makes the split
# reproducible across runs (the original call produced a different split each run).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=1)
print(X_train.shape, X_test.shape)
(166, 60) (42, 60)
# Model: random forest classifier, same configuration as the LOOCV experiment
# (30 trees, max depth 4, Gini impurity, fixed seed).
model = RandomForestClassifier(
    n_estimators=30,
    max_depth=4,
    criterion="gini",
    random_state=1,
)
# Fit on the training partition.
model.fit(X_train, y_train)
RandomForestClassifier(max_depth=4, n_estimators=30, random_state=1)
predicciones= model.predict(X_test)  # class predictions ('M' or 'R') for the held-out rows
predicciones[0:5]  # peek at the first five predictions
array(['M', 'M', 'M', 'M', 'M'], dtype=object)
# Simple (hold-out) validation: per-class precision/recall/F1 on the test split.
from sklearn.metrics import classification_report
print(classification_report(y_true= y_test, y_pred= predicciones))
precision recall f1-score support M 0.74 0.92 0.82 25 R 0.82 0.53 0.64 17 accuracy 0.76 42 macro avg 0.78 0.72 0.73 42 weighted avg 0.77 0.76 0.75 42
# libraries
from pandas import read_csv
# Reload the housing dataset for a hold-out-split evaluation of the regressor.
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.csv'
dataframe = read_csv(url, header=None)
# shape
print(dataframe.shape)
(506, 14)
data = dataframe.values
X, y = data[:, :-1], data[:, -1]
# Hold out 20% of the rows for testing; fixed random_state keeps the split
# reproducible (matches the regressor's seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)
# BUG FIX: the original line ended with a stray "3" after the closing
# parenthesis, which is a SyntaxError in Python.
print(X_train.shape, X_test.shape)
(404, 13) (102, 13)
# Model: small random forest regressor (10 trees, max depth 4), fixed seed.
model = RandomForestRegressor(
    n_estimators=10,
    max_depth=4,
    random_state=42,
)
# Fit on the training partition.
model.fit(X_train, y_train)
RandomForestRegressor(max_depth=4, n_estimators=10, random_state=42)
predicciones= model.predict(X_test)  # predicted house prices for the held-out rows
# Simple (hold-out) validation: report MSE, MAE and R^2 on the test split.
from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score
mse = mean_squared_error(y_true=y_test, y_pred=predicciones)
mae = mean_absolute_error(y_true=y_test, y_pred=predicciones)
r2 = r2_score(y_true=y_test, y_pred=predicciones)
print('MSE: ', mse)
print('MAE: ', mae)
print('R2: ', r2)
MSE: 13.256900736696371 MAE: 2.74655558241846 R2: 0.810238703397846