#!/usr/bin/env python
# coding: utf-8

# ## Introduction:
#
# In the development of a cancer diagnosis prediction model, I utilized a K-Nearest Neighbors (KNN) classifier, ultimately settling on 3 neighbors. My primary goal was to optimize the model to accurately identify cancer cases by reducing the number of false negatives, which represent undetected cancer cases. I therefore focused on improving recall, which measures the model's ability to correctly identify positive cases.

# In[269]:

import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings("ignore")


# In[270]:

df = pd.read_csv(r"C:\Users\Teni\Desktop\Git-Github\Datasets\KNN\breast-cancer.csv")


# ### Data Info Analysis

# In[271]:

df.head()


# In[272]:

df.info()


# In[273]:

df.diagnosis.value_counts()


# In[274]:

print(df.diagnosis.unique())


# In[275]:

print(df.dtypes)


# In[276]:

df.isnull().sum()


# There are no null values in the data.
#
# ### Exploratory Data Analysis

# In[277]:

import matplotlib.pyplot as plt
import seaborn as sns


# In[278]:

df.diagnosis.value_counts().plot(kind='bar')
plt.show()


# In[279]:

# Encode the target: malignant = 1, benign = 0
diagnosis_mapping = {'M': 1, 'B': 0}


# In[280]:

df.diagnosis = df.diagnosis.map(diagnosis_mapping)


# In[281]:

df.head()


# In[282]:

# Drop the last (unneeded) column
df.drop(df.columns[-1], axis=1, inplace=True)
df


# In[283]:

sns.pairplot(df, vars=['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean'], hue='diagnosis')
plt.show()


# ### Define X and y

# In[284]:

X = df.drop('diagnosis', axis=1)
y = df['diagnosis']


# ### Train_Test_Split

# In[285]:

from sklearn.model_selection import train_test_split


# In[286]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)


# #### Data Scaling
#
# - Scaling the data because the feature values are on widely different scales

# In[287]:

from sklearn.preprocessing import StandardScaler


# In[288]:

scaler = StandardScaler()


# In[289]:

scaled_X_train = scaler.fit_transform(X_train)


# In[290]:

scaled_X_test = scaler.transform(X_test)


# - Model selection
#
# - KNN classification will be used, since it relies on the distance/relationship between observations to classify new data entries
#
# - GridSearchCV will also be used for hyperparameter tuning

# ### Data Modelling

# In[291]:

from sklearn.neighbors import KNeighborsClassifier


# In[292]:

knn = KNeighborsClassifier(n_neighbors=2)


# In[293]:

knn.fit(scaled_X_train, y_train)


# In[294]:

y_pred = knn.predict(scaled_X_test)


# ### Data Metrics

# In[295]:

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, plot_confusion_matrix


# In[296]:

accuracy_score(y_test, y_pred)


# In[297]:

round(accuracy_score(y_test, y_pred), 3)


# In[298]:

confusion_matrix(y_test, y_pred)


# In[299]:

plot_confusion_matrix(knn, scaled_X_test, y_test)


# In[300]:

print(classification_report(y_test, y_pred))


# - The precision and recall scores differ because the classes are imbalanced
# - In this context, it is better to have false positives than false negatives.
# - Fine-tuning or optimizing the recall metric will come at the cost of a lower precision score (which is acceptable in our data context)
# - Given the context of the data, the model needs to be further optimized to significantly minimize false negatives, i.e., to improve recall, as illustrated in the sketch below.
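# To make the recall/precision trade-off above concrete, here is a minimal
# sketch (assuming the fitted `knn` model and scaled test data defined above;
# the 0.3 threshold is an arbitrary illustrative value, not part of the
# original analysis): lowering the probability threshold flags more borderline
# cases as malignant, which raises recall at the cost of some precision.

# In[ ]:

from sklearn.metrics import precision_score, recall_score

# Probability that each test sample belongs to the positive (malignant) class
malignant_proba = knn.predict_proba(scaled_X_test)[:, 1]

# Predict malignant whenever that probability exceeds a lowered threshold
y_pred_low_threshold = (malignant_proba >= 0.3).astype(int)

print("recall:   ", round(recall_score(y_test, y_pred_low_threshold), 3))
print("precision:", round(precision_score(y_test, y_pred_low_threshold), 3))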
# In[301]:

error_rate = 1 - accuracy_score(y_test, y_pred)
error_rate


# #### Hypertune the parameters to improve the Metrics Score

# In[302]:

error_rate = []

for k in range(1, 50):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(scaled_X_train, y_train)
    y_pred = knn.predict(scaled_X_test)
    error = 1 - accuracy_score(y_test, y_pred)
    error_rate.append(error)


# In[303]:

error_rate


# In[304]:

plt.plot(range(1, 50), error_rate)
plt.ylabel('error rate')
plt.xlabel('k (number of neighbors)')
plt.show()


# **Zoom in on the graph, since the minimum error rate falls within the k range of 9 to 11**

# In[305]:

plt.plot(range(1, 50), error_rate)
plt.ylabel('error rate')
plt.xlabel('k (number of neighbors)')
plt.xlim(8, 12)
plt.show()


# In[306]:

knn_2 = KNeighborsClassifier(n_neighbors=9)
knn_2.fit(scaled_X_train, y_train)
y_pred_2 = knn_2.predict(scaled_X_test)


# In[307]:

accuracy_score(y_test, y_pred_2)


# In[308]:

round(accuracy_score(y_test, y_pred_2), 3)


# Since the accuracy shows little difference, keep the KNN model as it was.

# #### Hypertune the parameters using GridSearchCV to improve the Recall score

# In[309]:

scaler = StandardScaler()


# In[310]:

knn = KNeighborsClassifier()


# In[311]:

operations = [('scaler', scaler), ('knn', knn)]


# In[312]:

from sklearn.pipeline import Pipeline


# In[313]:

pipe = Pipeline(operations)


# In[314]:

from sklearn.model_selection import GridSearchCV


# In[315]:

k_values = range(1, 50)


# In[316]:

param_grid = {'knn__n_neighbors': k_values}


# In[317]:

from sklearn.metrics import make_scorer, recall_score


# In[318]:

scorer = make_scorer(recall_score)


# In[319]:

full_cv_classifier = GridSearchCV(pipe, param_grid, cv=5, scoring=scorer)


# In[320]:

# Note: the pipeline already contains a StandardScaler, so the pre-scaled data
# is standardized a second time (effectively a no-op); passing the raw X_train
# would let the pipeline handle scaling inside each CV fold instead.
full_cv_classifier.fit(scaled_X_train, y_train)


# In[321]:

full_cv_classifier.predict(scaled_X_test)


# In[322]:

plot_confusion_matrix(full_cv_classifier, scaled_X_test, y_test)


# In[323]:

full_cv_classifier.best_params_


# In[324]:

full_cv_classifier.best_score_


# - The best k value from the search range, according to GridSearchCV, is 3
#
# ## Conclusion:
#
# Through careful adjustments, I improved the model's performance significantly. I reduced false negatives from 7 to 4, ensuring more cancer cases were correctly identified. The trade-off was a slight drop in precision, with false positives rising from 1 to 3, which is acceptable given that a missed cancer case is far more costly than a false alarm. Overall, my refined model is more reliable for early cancer detection. These improvements enhance not only the model's performance on this dataset but also its potential utility in clinical settings for better patient outcomes.
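# As a final check, a minimal sketch (assuming the variables defined above are
# still in scope): the best estimator found by GridSearchCV can be pulled out
# and summarized on the held-out test set to confirm the recall improvement
# described in the conclusion. The pipeline was fit on the scaled training
# data above, so the scaled test data is used here for consistency.

# In[ ]:

# Pipeline refit on the full training data with the best k found by the search
best_knn_pipeline = full_cv_classifier.best_estimator_

y_pred_best = best_knn_pipeline.predict(scaled_X_test)

print(confusion_matrix(y_test, y_pred_best))
print(classification_report(y_test, y_pred_best))
print("test recall:", round(recall_score(y_test, y_pred_best), 3))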