# Data handling
# ==============================================================================
import pandas as pd
import numpy as np
# Preprocessing and modeling
# ==============================================================================
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Warnings configuration
# ==============================================================================
import warnings
warnings.filterwarnings('ignore')  # note: this also silences sklearn warnings such as DataConversionWarning
# Load the data
url = ('https://raw.githubusercontent.com/JoaquinAmatRodrigo/'
       'Estadistica-machine-learning-python/master/data/ESL.mixture.csv')
datos = pd.read_csv(url)
datos.head()
|   | X1 | X2 | y |
|---|---|---|---|
| 0 | 2.526093 | 0.321050 | 0 |
| 1 | 0.366954 | 0.031462 | 0 |
| 2 | 0.768219 | 0.717486 | 0 |
| 3 | 0.693436 | 0.777194 | 0 |
| 4 | -0.019837 | 0.867254 | 0 |
datos.y.unique()
array([0, 1])
datos.shape
(200, 3)
# Visualization
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(datos.X1, datos.X2, c=datos.y);
ax.set_title("Data");
# Split the data into train and test sets
X = datos.drop(columns='y')  # 200 × 2 feature matrix (independent variables)
y = datos['y']               # Vector of 200 rows (dependent variable)
X
|     | X1 | X2 |
|-----|----|----|
| 0 | 2.526093 | 0.321050 |
| 1 | 0.366954 | 0.031462 |
| 2 | 0.768219 | 0.717486 |
| 3 | 0.693436 | 0.777194 |
| 4 | -0.019837 | 0.867254 |
| ... | ... | ... |
| 195 | 0.256750 | 2.293605 |
| 196 | 1.925173 | 0.165053 |
| 197 | 1.301941 | 0.992200 |
| 198 | 0.008131 | 2.242264 |
| 199 | -0.196246 | 0.551404 |

200 rows × 2 columns
y
0      0
1      0
2      0
3      0
4      0
      ..
195    1
196    1
197    1
198    1
199    1
Name: y, Length: 200, dtype: int64
y.values.reshape(-1,1)
array([[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [1]])
# sklearn expects a 1-D y; the (200, 1) column vector used here still works,
# but triggers a DataConversionWarning on fit (silenced above)
X_train, X_test, y_train, y_test = train_test_split(
    X, y.values.reshape(-1, 1), train_size=0.75, random_state=42, shuffle=True
)
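Note that train_test_split shuffles but does not preserve the class proportions in each split by default; for classification it is often worth passing stratify. A minimal variant (illustrative only; the tutorial keeps the unstratified split above):
# Stratified split: both subsets keep the original 0/1 class ratio
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, train_size=0.75, random_state=42, stratify=y
)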
gamma is a kernel parameter for non-linear decision boundaries. The larger its value, the more closely the model tries to fit the training set exactly, at the risk of overfitting.
C is the penalty parameter of the error term. It controls the trade-off between a smooth decision boundary and classifying every training point correctly.
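Because the two interact, C and gamma are usually tuned jointly rather than one at a time. A minimal sketch with scikit-learn's GridSearchCV (the grid values are illustrative assumptions, not tuned for this dataset):
from sklearn.model_selection import GridSearchCV
# Illustrative grid; widen or refine the ranges as needed
param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [0.01, 0.1, 1, 10]}
grid = GridSearchCV(SVC(kernel='rbf', random_state=42), param_grid, cv=5, scoring='accuracy')
grid.fit(X_train, y_train.ravel())  # ravel() flattens the (n, 1) column vector
print(grid.best_params_, grid.best_score_)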
# Inspect the SVC signature and docstring (IPython help)
SVC?
# Create the SVM model
modelo = SVC(C=0.1, kernel='rbf', random_state=42)
modelo.fit(X_train, y_train)
SVC(C=0.1, random_state=42)
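To see what the fitted RBF boundary looks like, one option is to evaluate the model's decision_function on a grid of points and draw its zero level set. A minimal sketch (grid resolution and margins are assumptions):
# Build a grid covering the training data with some margin
xx, yy = np.meshgrid(
    np.linspace(X_train['X1'].min() - 1, X_train['X1'].max() + 1, 200),
    np.linspace(X_train['X2'].min() - 1, X_train['X2'].max() + 1, 200),
)
# decision_function gives the signed distance to the boundary; 0 is the boundary itself
Z = modelo.decision_function(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
fig, ax = plt.subplots(figsize=(6, 4))
ax.contourf(xx, yy, Z, levels=20, alpha=0.3)
ax.contour(xx, yy, Z, levels=[0], colors='black')
ax.scatter(X_train['X1'], X_train['X2'], c=y_train.ravel());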
# Predictions
#y_train_pred = modelo.predict(X_train)
y_test_pred = modelo.predict(X_test)
X_test
|     | X1 | X2 |
|-----|----|----|
| 95 | 0.434501 | -0.064545 |
| 15 | 1.279560 | -0.271033 |
| 30 | 2.988126 | 0.672351 |
| 158 | 0.508118 | 1.757232 |
| 128 | -1.136539 | 2.052686 |
| 115 | -2.017571 | 2.720377 |
| 69 | 2.482942 | -0.737176 |
| 170 | 0.571872 | 1.577942 |
| 174 | 1.244438 | 0.852323 |
| 45 | 1.749188 | -0.357916 |
| 66 | 2.290042 | 0.137390 |
| 182 | 1.518294 | 0.361734 |
| 165 | 3.503126 | 1.223982 |
| 78 | 0.087615 | -0.733347 |
| 186 | 1.486248 | 0.918180 |
| 177 | 1.287422 | 0.272585 |
| 56 | 0.134347 | 0.901676 |
| 152 | -1.960780 | 2.002274 |
| 82 | 2.576843 | -1.435682 |
| 68 | 0.516429 | 0.525152 |
| 124 | -1.464691 | 2.473922 |
| 16 | -0.313007 | 1.273747 |
| 148 | -2.477418 | 1.026506 |
| 93 | 1.419537 | -0.029746 |
| 65 | 0.310785 | 2.007982 |
| 60 | 0.325769 | 0.347996 |
| 84 | 2.892025 | 1.625783 |
| 67 | 0.659906 | -1.241637 |
| 125 | 1.623802 | 0.240937 |
| 132 | -0.071295 | 1.460543 |
| 9 | 1.897709 | 0.973755 |
| 18 | 0.319988 | 0.017202 |
| 55 | -0.727077 | 1.457361 |
| 75 | 0.951265 | -1.095890 |
| 150 | 1.028464 | 1.550986 |
| 104 | 1.076957 | -0.330735 |
| 135 | 0.433583 | 1.261365 |
| 137 | 1.432136 | 0.306475 |
| 164 | -0.570430 | 1.090314 |
| 76 | 0.425476 | 0.184899 |
| 79 | 2.285265 | 1.008993 |
| 197 | 1.301941 | 0.992200 |
| 38 | 1.532258 | -1.082164 |
| 24 | 1.467109 | 1.877591 |
| 122 | 1.334411 | -0.294441 |
| 195 | 0.256750 | 2.293605 |
| 29 | 2.126869 | 1.154180 |
| 19 | 0.437542 | 0.413648 |
| 143 | 1.047458 | 1.039996 |
| 86 | -0.074079 | 0.919702 |
X_test.shape
(50, 2)
y_test_pred
array([0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1])
y_test
array([[0], [0], [0], [1], [1], [1], [0], [1], [1], [0], [0], [1], [1], [0], [1], [1], [0], [1], [0], [0], [1], [0], [1], [0], [0], [0], [0], [0], [1], [1], [0], [0], [0], [0], [1], [1], [1], [1], [1], [0], [0], [1], [0], [0], [1], [1], [0], [0], [1], [0]])
# Compute accuracy on the test set (accuracy_score was already imported above)
test_accuracy = accuracy_score(y_test, y_test_pred)
print('Accuracy on the test set:', test_accuracy)
Accuracy on the test set: 0.68
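Accuracy alone hides which class drives the errors; a confusion matrix and a per-class report give a fuller picture, and the modest 0.68 is a hint to revisit C and gamma with the grid search sketched earlier. A minimal sketch, assuming the variables above are still in scope:
from sklearn.metrics import confusion_matrix, classification_report
# Rows are true classes, columns are predicted classes
print(confusion_matrix(y_test.ravel(), y_test_pred))
# Precision, recall and F1 for each class
print(classification_report(y_test.ravel(), y_test_pred))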