#!/usr/bin/env python
# coding: utf-8

# In[1]:


import os, sys

import matplotlib as mpl
import matplotlib.pyplot as plt
from IPython.display import display
get_ipython().run_line_magic('matplotlib', 'inline')

import pandas as pd
import numpy as np

# import Image from PIL
from PIL import Image

from skimage.feature import hog
from skimage.color import rgb2gray

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# import train_test_split from sklearn's model selection module
from sklearn.model_selection import train_test_split

# import SVC from sklearn's svm module
from sklearn.svm import SVC

# import accuracy_score from sklearn's metrics module
from sklearn.metrics import roc_curve, auc, accuracy_score

# ANSI escape codes used to highlight values in printed output
HIGHLIGHT_ON = '\x1b[1;33;40m'
HIGHLIGHT_OFF = '\x1b[m'


# ## BeeImage Dataset
# https://www.kaggle.com/datasets/jenny18/honey-bee-annotated-images
#
# Based on DataCamp's Naive Bees project: https://app.datacamp.com/learn/projects/412
#
# ### Data Preprocessing

# In[2]:


data = pd.read_csv('bee_data.csv', index_col=0)

# dropping rows with subspecies == '-1' (unlabeled images)
data.drop(data[data.subspecies == '-1'].index, inplace=True)
data.head()


# In[3]:


# keeping only the subspecies column
data = data[['subspecies']]
data.head()


# In[4]:


pd.Series(data['subspecies']).value_counts()


# In[5]:


# considering only two subspecies
data2 = data.query('subspecies == "Russian honey bee" or subspecies == "Carniolan honey bee"')
data2.head()


# In[6]:


pd.Series(data2['subspecies']).value_counts()


# In[7]:


# encode the two subspecies as 0/1 labels
col_encoding = {'subspecies': {'Russian honey bee': 0,
                               'Carniolan honey bee': 1}
                }
data2 = data2.replace(col_encoding)
data2.head()


# In[8]:


labels = data2['subspecies']

def get_image(img_name, root="bee_imgs/"):
    """
    Converts an image name into the file path where the image is located,
    opens the image, and returns it as a numpy array.
    """
    file_path = os.path.join(root, img_name)
    img = Image.open(file_path)
    img = img.resize((100, 100))
    # two images have 4 channels (RGBA), so keep only the first 3 (RGB)
    return np.array(img)[:, :, :3]
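# In[ ]:


# A minimal sketch to locate the RGBA files that motivate the [:, :, :3]
# slice above, assuming every image listed in data2.index exists under
# bee_imgs/ (an illustrative check, not part of the original analysis):
for name in data2.index:
    arr = np.array(Image.open(os.path.join('bee_imgs/', name)))
    if arr.ndim == 3 and arr.shape[2] == 4:
        print(f'{name}: {arr.shape[2]} channels')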
# subset the dataframe to just Russian honey bees (subspecies == 0)
# and get the value of the sixth item in the index
img_name = data2[data2.subspecies == 0].index[5]
print(img_name)
plt.subplot(1, 2, 1)
plt.imshow(get_image(img_name))
plt.title(img_name)

# subset the dataframe to just Carniolan honey bees (subspecies == 1)
# and get the value of the sixth item in the index
img_name = data2[data2.subspecies == 1].index[5]
print(img_name)

# show the corresponding Carniolan image
plt.subplot(1, 2, 2)
plt.imshow(get_image(img_name))
plt.title(img_name)
plt.show()


# In[9]:


# load a Carniolan image using our get_image function and the img_name from the previous cell
img = get_image(img_name)

# print the shape of the color image
print(f'--> Color image shape: {HIGHLIGHT_ON}{img.shape}{HIGHLIGHT_OFF}')

# convert the image to grayscale
gray_img = rgb2gray(img)

# show the grayscale image
plt.imshow(gray_img, cmap=mpl.cm.gray)

# the grayscale image has only one channel
print(f'--> Grayscale image shape: {HIGHLIGHT_ON}{gray_img.shape}{HIGHLIGHT_OFF}')


# ## Feature Extraction
#
# ### Histogram of Oriented Gradients (HOG)

# In[10]:


# run HOG on our grayscale image
hog_features, hog_image = hog(gray_img,
                              visualize=True,
                              block_norm='L2-Hys',
                              pixels_per_cell=(16, 16))

# show our hog_image with a gray colormap
plt.imshow(hog_image, cmap=mpl.cm.gray);


# ### Two features: HOG + Pixel intensities

# In[11]:


def create_features(img):
    # flatten the three-channel color image
    color_features = img.flatten()
    # convert the image to grayscale
    gray_image = rgb2gray(img)
    # get HOG features from the grayscale image
    hog_features = hog(gray_image, block_norm='L2-Hys', pixels_per_cell=(16, 16))
    # combine color and HOG features into a single array
    flat_features = np.hstack((color_features, hog_features))
    return flat_features

bombus_features = create_features(img)

# print the shape of bombus_features
print(bombus_features.shape, type(bombus_features), bombus_features)


# In[12]:


def create_feature_matrix(label_dataframe):
    features_list = []
    for img_id in label_dataframe.index:
        # load image
        imgi = get_image(img_id)
        # get features for image
        image_features = create_features(imgi)
        features_list.append(image_features)
    # convert list of arrays into a matrix
    feature_matrix = np.array(features_list)
    return feature_matrix

# run create_feature_matrix on our dataframe of images
feature_matrix = create_feature_matrix(data2)
print(f'--> Size of the features matrix: {HIGHLIGHT_ON}{feature_matrix.shape}{HIGHLIGHT_OFF}')


# In[13]:


# spot-check a single image: get_image always returns 3 channels
img = get_image('032_499.png')
print(img.shape)


# ### Splitting data: 70% training, 30% testing

# In[14]:


# split the data into training and test sets, stratified by label
X_train, X_test, y_train, y_test = train_test_split(feature_matrix,
                                                    data2.subspecies,
                                                    test_size=.3,
                                                    stratify=data2.subspecies,
                                                    random_state=1234123)

# look at the distribution of labels in the train and test sets
print(pd.Series(y_train).value_counts())
print(pd.Series(y_test).value_counts())


# ### Normalization

# In[15]:


# get the shape of our training features
print('Training features matrix shape is: ', X_train.shape)

# define standard scaler
ss = StandardScaler()

# fit the scaler on the training features and transform them
train_stand = ss.fit_transform(X_train)

# transform the test features with the same scaler
test_stand = ss.transform(X_test)

# look at the new shape of the standardized feature matrices
print('Standardized training features matrix shape is: ', train_stand.shape)
print('Standardized test features matrix shape is: ', test_stand.shape)


# ## Principal Component Analysis

# In[16]:


# PCA
n_components = 300
pca = PCA(n_components=n_components)
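# In[ ]:


# scikit-learn's PCA also accepts a float in (0, 1) for n_components, in
# which case it keeps the smallest number of components whose cumulative
# explained variance reaches that fraction. A minimal sketch (the 0.95
# threshold is an arbitrary choice for illustration):
pca95 = PCA(n_components=0.95)
pca95.fit(train_stand)
print(f'components needed for 95% of the variance: {pca95.n_components_}')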
# ### Analysis of the number of components

# In[17]:


data_std = ss.fit_transform(feature_matrix)
X_train_pca = pca.fit_transform(data_std)

# determine explained variance using the explained_variance_ratio_ attribute
exp_var_pca = pca.explained_variance_ratio_

# cumulative sum of eigenvalues; this will be used to create the step plot
# for visualizing the variance explained by each principal component
cum_sum_eigenvalues = np.cumsum(exp_var_pca)

# create the visualization plot
plt.bar(range(0, len(exp_var_pca)), exp_var_pca, alpha=0.5, align='center',
        label='Individual explained variance')
plt.step(range(0, len(cum_sum_eigenvalues)), cum_sum_eigenvalues, where='mid',
         label='Cumulative explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal component index')
plt.legend(loc='best')
plt.tight_layout()
plt.show()

print(f'--> Explained variance with [{n_components} components]: {HIGHLIGHT_ON}{np.max(cum_sum_eigenvalues):.4f}{HIGHLIGHT_OFF}')


# In[18]:


pca = PCA(n_components=n_components)

# use fit_transform on our standardized training features
X_train = pca.fit_transform(train_stand)

# use transform on our standardized test features
X_test = pca.transform(test_stand)

# look at the new shapes
print('Training features matrix is: ', X_train.shape)
print('Test features matrix is: ', X_test.shape)


# ## Classifier: Support Vector Machine (SVM)

# In[19]:


# define support vector classifier
svm = SVC(kernel='linear', probability=True, random_state=42)

# fit model
svm.fit(X_train, y_train)

# generate predictions
y_pred = svm.predict(X_test)

# calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'--> Model accuracy {HIGHLIGHT_ON}{accuracy:.4f}{HIGHLIGHT_OFF}')


# ### Prediction on Test set

# In[20]:


# predict probabilities for X_test using predict_proba
probabilities = svm.predict_proba(X_test)

# select the probabilities for label 1 (Carniolan honey bee)
y_proba = probabilities[:, 1]

# calculate false positive rate and true positive rate at different thresholds
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_proba, pos_label=1)

# calculate AUC
roc_auc = auc(false_positive_rate, true_positive_rate)

plt.title('Receiver Operating Characteristic')
# plot the false positive rate on the x axis and the true positive rate on the y axis
roc_plot = plt.plot(false_positive_rate, true_positive_rate,
                    label='AUC = {:0.2f}'.format(roc_auc))
plt.legend(loc=0)
plt.plot([0, 1], [0, 1], ls='--')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate');


# In[ ]:
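# A minimal end-to-end sketch of scoring one new image through the fitted
# objects (create_features -> StandardScaler -> PCA -> SVC). Caveat: the
# variance-analysis cell refit ss on the full feature matrix, so a strictly
# clean pipeline would refit the scaler on the original training split; this
# assumes ss still matches the scaler used to produce train_stand. The file
# name is just the sample image used earlier, not a held-out example:
features = create_features(get_image('032_499.png')).reshape(1, -1)
features_pca = pca.transform(ss.transform(features))
pred = svm.predict(features_pca)[0]
print('predicted subspecies:', 'Carniolan honey bee' if pred == 1 else 'Russian honey bee')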