#!/usr/bin/env python
# coding: utf-8

# In[1]:


import os, sys

import matplotlib as mpl
import matplotlib.pyplot as plt
from IPython.display import display
get_ipython().run_line_magic('matplotlib', 'inline')

import pandas as pd
import numpy as np

# import Image from PIL
from PIL import Image

from skimage.feature import hog
from skimage.color import rgb2gray

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# import train_test_split from sklearn's model selection module
from sklearn.model_selection import train_test_split

# import SVC from sklearn's svm module
from sklearn.svm import SVC

# import accuracy_score from sklearn's metrics module
from sklearn.metrics import roc_curve, auc, accuracy_score

# ANSI escape codes used to highlight values in printed output
HIGHLIGHT_ON = '\x1b[1;33;40m'
HIGHLIGHT_OFF = '\x1b[m'


# ## BeeImage Dataset
# https://www.kaggle.com/datasets/jenny18/honey-bee-annotated-images
#
# Based on DataCamp's Naive Bees project: https://app.datacamp.com/learn/projects/412
#
# ### Data Preprocessing

# In[2]:


data = pd.read_csv('bee_data.csv', index_col=0)

# dropping rows with subspecies == '-1' (unlabeled images)
data.drop(data[data.subspecies == '-1'].index, inplace=True)
data.head()


# In[3]:


# keeping only the subspecies column
data = data[['subspecies']]
data.head()


# In[4]:


pd.Series(data['subspecies']).value_counts()


# In[5]:


# considering only two subspecies
data2 = data.query('subspecies == "Russian honey bee" or subspecies == "Carniolan honey bee"')
data2.head()


# In[6]:


pd.Series(data2['subspecies']).value_counts()


# In[7]:


# encode the two subspecies as 0/1 labels
col_encoding = {'subspecies': {'Russian honey bee': 0,
                               'Carniolan honey bee': 1}
                }
data2 = data2.replace(col_encoding)
data2.head()


# In[8]:


labels = data2['subspecies']

def get_image(img_name, root="bee_imgs/"):
    """
    Converts an image name into the file path where the image is located,
    opens the image, and returns it as a numpy array.
    """
    file_path = os.path.join(root, img_name)
    img = Image.open(file_path)
    img = img.resize((100, 100))
    # two images have 4 channels (RGBA), so keep only the first 3 (RGB)
    return np.array(img)[:, :, :3]
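# In[ ]:


# A minimal sketch to locate the RGBA files that motivate the [:, :, :3]
# slice above, assuming every image listed in data2.index exists under
# bee_imgs/ (an illustrative check, not part of the original analysis):
for name in data2.index:
    arr = np.array(Image.open(os.path.join('bee_imgs/', name)))
    if arr.ndim == 3 and arr.shape[2] == 4:
        print(f'{name}: {arr.shape[2]} channels')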
# subset the dataframe to just Russian honey bees (subspecies == 0)
# and get the value of the sixth item in the index
img_name = data2[data2.subspecies == 0].index[5]
print(img_name)
plt.subplot(1, 2, 1)
plt.imshow(get_image(img_name))
plt.title(img_name)

# subset the dataframe to just Carniolan honey bees (subspecies == 1)
# and get the value of the sixth item in the index
img_name = data2[data2.subspecies == 1].index[5]
print(img_name)

# show the corresponding Carniolan image
plt.subplot(1, 2, 2)
plt.imshow(get_image(img_name))
plt.title(img_name)
plt.show()


# In[9]:


# load a Carniolan image using our get_image function and the img_name from the previous cell
img = get_image(img_name)

# print the shape of the color image
print(f'--> Color image shape: {HIGHLIGHT_ON}{img.shape}{HIGHLIGHT_OFF}')

# convert the image to grayscale
gray_img = rgb2gray(img)

# show the grayscale image
plt.imshow(gray_img, cmap=mpl.cm.gray)

# the grayscale image has only one channel
print(f'--> Grayscale image shape: {HIGHLIGHT_ON}{gray_img.shape}{HIGHLIGHT_OFF}')


# ## Feature Extraction
#
# ### Histogram of Oriented Gradients (HOG)

# In[10]:


# run HOG on our grayscale image
hog_features, hog_image = hog(gray_img,
                              visualize=True,
                              block_norm='L2-Hys',
                              pixels_per_cell=(16, 16))

# show our hog_image with a gray colormap
plt.imshow(hog_image, cmap=mpl.cm.gray);


# ### Two features: HOG + Pixel intensities

# In[11]:


def create_features(img):
    # flatten the three-channel color image
    color_features = img.flatten()
    # convert the image to grayscale
    gray_image = rgb2gray(img)
    # get HOG features from the grayscale image
    hog_features = hog(gray_image, block_norm='L2-Hys', pixels_per_cell=(16, 16))
    # combine color and HOG features into a single array
    flat_features = np.hstack((color_features, hog_features))
    return flat_features

bombus_features = create_features(img)

# print the shape of bombus_features
print(bombus_features.shape, type(bombus_features), bombus_features)


# In[12]:


def create_feature_matrix(label_dataframe):
    features_list = []
    for img_id in label_dataframe.index:
        # load image
        imgi = get_image(img_id)
        # get features for image
        image_features = create_features(imgi)
        features_list.append(image_features)
    # convert list of arrays into a matrix
    feature_matrix = np.array(features_list)
    return feature_matrix

# run create_feature_matrix on our dataframe of images
feature_matrix = create_feature_matrix(data2)
print(f'--> Size of the features matrix: {HIGHLIGHT_ON}{feature_matrix.shape}{HIGHLIGHT_OFF}')


# In[13]:


# spot-check a single image: get_image always returns 3 channels
img = get_image('032_499.png')
print(img.shape)


# ### Splitting data: 70% training, 30% testing

# In[14]:


# split the data into training and test sets, stratified by label
X_train, X_test, y_train, y_test = train_test_split(feature_matrix,
                                                    data2.subspecies,
                                                    test_size=.3,
                                                    stratify=data2.subspecies,
                                                    random_state=1234123)

# look at the distribution of labels in the train and test sets
print(pd.Series(y_train).value_counts())
print(pd.Series(y_test).value_counts())


# ### Normalization

# In[15]:


# get the shape of our training features
print('Training features matrix shape is: ', X_train.shape)

# define standard scaler
ss = StandardScaler()

# fit the scaler on the training features and transform them
train_stand = ss.fit_transform(X_train)

# transform the test features with the same scaler
test_stand = ss.transform(X_test)

# look at the new shape of the standardized feature matrices
print('Standardized training features matrix shape is: ', train_stand.shape)
print('Standardized test features matrix shape is: ', test_stand.shape)


# ## Principal Component Analysis

# In[16]:


# PCA
n_components = 300
pca = PCA(n_components=n_components)
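# In[ ]:


# scikit-learn's PCA also accepts a float in (0, 1) for n_components, in
# which case it keeps the smallest number of components whose cumulative
# explained variance reaches that fraction. A minimal sketch (the 0.95
# threshold is an arbitrary choice for illustration):
pca95 = PCA(n_components=0.95)
pca95.fit(train_stand)
print(f'components needed for 95% of the variance: {pca95.n_components_}')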
# ### Analysis of the number of components

# In[17]:


data_std = ss.fit_transform(feature_matrix)
X_train_pca = pca.fit_transform(data_std)

# determine explained variance using the explained_variance_ratio_ attribute
exp_var_pca = pca.explained_variance_ratio_

# cumulative sum of eigenvalues; this will be used to create the step plot
# for visualizing the variance explained by each principal component
cum_sum_eigenvalues = np.cumsum(exp_var_pca)

# create the visualization plot
plt.bar(range(0, len(exp_var_pca)), exp_var_pca, alpha=0.5, align='center',
        label='Individual explained variance')
plt.step(range(0, len(cum_sum_eigenvalues)), cum_sum_eigenvalues, where='mid',
         label='Cumulative explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal component index')
plt.legend(loc='best')
plt.tight_layout()
plt.show()

print(f'--> Explained variance with [{n_components} components]: {HIGHLIGHT_ON}{np.max(cum_sum_eigenvalues):.4f}{HIGHLIGHT_OFF}')


# In[18]:


pca = PCA(n_components=n_components)

# use fit_transform on our standardized training features
X_train = pca.fit_transform(train_stand)

# use transform on our standardized test features
X_test = pca.transform(test_stand)

# look at the new shapes
print('Training features matrix is: ', X_train.shape)
print('Test features matrix is: ', X_test.shape)


# ## Classifier: Support Vector Machine (SVM)

# In[19]:


# define support vector classifier
svm = SVC(kernel='linear', probability=True, random_state=42)

# fit model
svm.fit(X_train, y_train)

# generate predictions
y_pred = svm.predict(X_test)

# calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'--> Model accuracy {HIGHLIGHT_ON}{accuracy:.4f}{HIGHLIGHT_OFF}')


# ### Prediction on Test set

# In[20]:


# predict probabilities for X_test using predict_proba
probabilities = svm.predict_proba(X_test)

# select the probabilities for label 1 (Carniolan honey bee)
y_proba = probabilities[:, 1]

# calculate false positive rate and true positive rate at different thresholds
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_proba, pos_label=1)

# calculate AUC
roc_auc = auc(false_positive_rate, true_positive_rate)

plt.title('Receiver Operating Characteristic')
# plot the false positive rate on the x axis and the true positive rate on the y axis
roc_plot = plt.plot(false_positive_rate, true_positive_rate,
                    label='AUC = {:0.2f}'.format(roc_auc))
plt.legend(loc=0)
plt.plot([0, 1], [0, 1], ls='--')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate');


# In[ ]:
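# A minimal end-to-end sketch of scoring one new image through the fitted
# objects (create_features -> StandardScaler -> PCA -> SVC). Caveat: the
# variance-analysis cell refit ss on the full feature matrix, so a strictly
# clean pipeline would refit the scaler on the original training split; this
# assumes ss still matches the scaler used to produce train_stand. The file
# name is just the sample image used earlier, not a held-out example:
features = create_features(get_image('032_499.png')).reshape(1, -1)
features_pca = pca.transform(ss.transform(features))
pred = svm.predict(features_pca)[0]
print('predicted subspecies:', 'Carniolan honey bee' if pred == 1 else 'Russian honey bee')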