# Mount Google Drive inside the Colab runtime and make it the working directory.
import os

from google.colab import drive

drive.mount('/content/drive')

# Set the working path inside Drive; the directory is printed before and
# after the change so the switch is visible in the cell output.
print(os.getcwd())
os.chdir("/content/drive/My Drive")
print(os.getcwd())
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True). /content/drive/My Drive /content/drive/My Drive
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the dataset from the current working directory into a DataFrame and
# preview its first five rows.
df = pd.read_csv(filepath_or_buffer='Breast_cancer_data.csv')
df.head(n=5)
| | mean_radius | mean_texture | mean_perimeter | mean_area | mean_smoothness | diagnosis |
---|---|---|---|---|---|---|
0 | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0 |
1 | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0 |
2 | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0 |
3 | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0 |
4 | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0 |
# Summarize the DataFrame: index range, columns, non-null counts, dtypes
# and memory usage.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 569 entries, 0 to 568 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 mean_radius 569 non-null float64 1 mean_texture 569 non-null float64 2 mean_perimeter 569 non-null float64 3 mean_area 569 non-null float64 4 mean_smoothness 569 non-null float64 5 diagnosis 569 non-null int64 dtypes: float64(5), int64(1) memory usage: 26.8 KB
# Inspect the target variable ('diagnosis') and how often each class occurs.
df['diagnosis'].value_counts()
1 357 0 212 Name: diagnosis, dtype: int64
# Define the feature matrix X (the five measurement columns) and the target
# vector y (the 'diagnosis' column), then display X.
X = df.loc[
    :,
    [
        'mean_radius',
        'mean_texture',
        'mean_perimeter',
        'mean_area',
        'mean_smoothness',
    ],
]
y = df.loc[:, 'diagnosis']
X
| | mean_radius | mean_texture | mean_perimeter | mean_area | mean_smoothness |
---|---|---|---|---|---|
0 | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 |
1 | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 |
2 | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 |
3 | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 |
4 | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 |
... | ... | ... | ... | ... | ... |
564 | 21.56 | 22.39 | 142.00 | 1479.0 | 0.11100 |
565 | 20.13 | 28.25 | 131.20 | 1261.0 | 0.09780 |
566 | 16.60 | 28.08 | 108.30 | 858.1 | 0.08455 |
567 | 20.60 | 29.33 | 140.10 | 1265.0 | 0.11780 |
568 | 7.76 | 24.54 | 47.92 | 181.0 | 0.05263 |
569 rows × 5 columns
# Display the target vector.
y
0 0 1 0 2 0 3 0 4 0 .. 564 0 565 0 566 0 567 0 568 1 Name: diagnosis, Length: 569, dtype: int64
# Split the data into train and test sets: 30% of the rows are held out for
# testing, and the fixed random_state keeps the split reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
# Install LightGBM into the notebook runtime. The leading "!" is an IPython
# shell escape, so this line only works inside a notebook, not as plain Python.
!pip install lightgbm
Requirement already satisfied: lightgbm in /usr/local/lib/python3.7/dist-packages (2.2.3) Requirement already satisfied: scikit-learn in /usr/local/lib/python3.7/dist-packages (from lightgbm) (1.0.2) Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from lightgbm) (1.21.6) Requirement already satisfied: scipy in /usr/local/lib/python3.7/dist-packages (from lightgbm) (1.4.1) Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.7/dist-packages (from scikit-learn->lightgbm) (1.1.0) Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn->lightgbm) (3.1.0)
# Train a LightGBM classifier with its default hyperparameters on the training
# split and predict labels for the held-out test split.
import lightgbm as lgb  # pip install lightgbm
from sklearn.metrics import accuracy_score

clf = lgb.LGBMClassifier()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Accuracy: accuracy_score takes (y_true, y_pred) in that order. Compute it
# once and reuse the stored value instead of scoring a second time.
accuracy = accuracy_score(y_test, y_pred)
print('LightGBM Model accuracy score: {0:0.4f}'.format(accuracy))
LightGBM Model accuracy score: 0.9591