# import the OpenCV library
try:
    import cv2
except ImportError:
    print "You must have OpenCV installed"
    exit(1)

# check the OpenCV version
try:
    v = cv2.__version__
    assert tuple(map(int, v.split("."))) > (2, 4, 2)
except (AssertionError, ValueError):
    print "Install a version of OpenCV newer than 2.4.2, i.e. 2.4.3 onwards"
    exit(1)

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from modshogun import *

# get the list of all jpg/png images from the path provided
import os
def get_imlist(path):
    return [[os.path.join(path, f) for f in os.listdir(path) if (f.endswith('.jpg') or f.endswith('.png'))]]

# use the following function when reading an image through OpenCV and displaying it through plt
def showfig(image, ucmap):
    # There is a difference in pixel ordering between OpenCV and Matplotlib:
    # OpenCV follows BGR order, while Matplotlib follows RGB order.
    if len(image.shape) == 3:
        b, g, r = cv2.split(image)      # get the b, g, r channels
        image = cv2.merge([r, g, b])    # switch to rgb
    imgplot = plt.imshow(image, ucmap)
    imgplot.axes.get_xaxis().set_visible(False)
    imgplot.axes.get_yaxis().set_visible(False)

plt.rcParams['figure.figsize'] = 17, 4
filenames = get_imlist('../../../data/SIFT/template/')
filenames = np.array(filenames)

# for keeping all the descriptors from the template images
descriptor_mat = []

# initialise OpenCV's SIFT
sift = cv2.SIFT()
fig = plt.figure()
plt.title('SIFT detected Keypoints')
for image_no in xrange(3):
    img = cv2.imread(filenames[0][image_no])
    img = cv2.resize(img, (500, 300), interpolation=cv2.INTER_AREA)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    gray = cv2.equalizeHist(gray)

    # detect the SIFT keypoints and compute the descriptors
    kp, des = sift.detectAndCompute(gray, None)

    # store the descriptors
    descriptor_mat.append(des)

    # draw the keypoints
    img = cv2.drawKeypoints(img, kp, flags=cv2.DRAW_MATCHES_FLAGS_DRAW_RICH_KEYPOINTS)
    fig.add_subplot(1, 3, image_no+1)
    showfig(img, None)
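# Note: cv2.SIFT() above is the OpenCV 2.4.x API, matching the version check
# at the top of this notebook. If you are on a newer OpenCV build, SIFT was
# moved into the opencv_contrib xfeatures2d module in 3.x and returned to the
# main module as cv2.SIFT_create() from 4.4. A compatibility sketch (not part
# of the original 2.4.x-targeted flow):
try:
    sift = cv2.SIFT()                         # OpenCV 2.4.x
except AttributeError:
    try:
        sift = cv2.SIFT_create()              # OpenCV >= 4.4
    except AttributeError:
        sift = cv2.xfeatures2d.SIFT_create()  # OpenCV 3.x with opencv_contrib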
def get_similar_descriptors(k, descriptor_mat):
    descriptor_mat = np.double(np.vstack(descriptor_mat))
    descriptor_mat = descriptor_mat.T

    # initialize KMeans in Shogun
    sg_descriptor_mat_features = RealFeatures(descriptor_mat)

    # EuclideanDistance is used for the distance measurement
    distance = EuclideanDistance(sg_descriptor_mat_features, sg_descriptor_mat_features)

    # group the descriptors into k clusters
    kmeans = KMeans(k, distance)
    kmeans.train()

    # get the cluster centers
    cluster_centers = kmeans.get_cluster_centers()
    return cluster_centers

cluster_centers = get_similar_descriptors(100, descriptor_mat)

# names of all the class folders together
folders = ['cars', 'planes', 'trains']

training_sample = []
for folder in folders:
    # get all the training images from a particular class
    filenames = get_imlist('../../../data/SIFT/%s' % folder)
    for i in xrange(10):
        temp = cv2.imread(filenames[0][i])
        training_sample.append(temp)

plt.rcParams['figure.figsize'] = 21, 16
fig = plt.figure()
plt.title('10 training images for each class')
for image_no in xrange(30):
    fig.add_subplot(6, 5, image_no+1)
    showfig(training_sample[image_no], None)

def get_sift_training():
    # names of all the class folders together
    folders = ['cars', 'planes', 'trains']
    folder_number = -1

    des_training = []
    for folder in folders:
        folder_number += 1

        # get all the training images from a particular class
        filenames = get_imlist('../../../data/SIFT/%s' % folder)
        filenames = np.array(filenames)

        des_per_folder = []
        for image_name in filenames[0]:
            img = cv2.imread(image_name)

            # carry out the normal preprocessing routines
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            gray = cv2.resize(gray, (500, 300), interpolation=cv2.INTER_AREA)
            gray = cv2.equalizeHist(gray)

            # get all the SIFT descriptors for an image
            _, des = sift.detectAndCompute(gray, None)
            des_per_folder.append(des)
        des_training.append(des_per_folder)
    return des_training

descriptor_training = get_sift_training()
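# A numpy-only toy sketch of the bag-of-words histogram idea used in the next
# step (hypothetical data, for illustration only): each descriptor is assigned
# to its nearest vocabulary word, and the image is then summarised by the
# count of descriptors per word.
toy_vocab = np.array([[0.0, 0.0], [10.0, 10.0]])           # 2 "visual words"
toy_des = np.array([[0.5, 1.0], [9.0, 9.5], [0.2, 0.1]])   # 3 descriptors
dists = ((toy_des[:, None, :] - toy_vocab[None, :, :])**2).sum(axis=2)
words = dists.argmin(axis=1)                               # nearest word per descriptor
histogram = np.bincount(words, minlength=len(toy_vocab))
print histogram                                            # -> [2 1]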
def compute_training_data(k, cluster_centers, descriptors):
    # a list to hold the histograms of all the training images
    all_histograms = []
    # labels of all the training images
    final_labels = []

    # initialize a KNN in Shogun
    dist = EuclideanDistance()
    labels = MulticlassLabels(np.double(range(k)))
    knn = KNN(1, dist, labels)

    # Target descriptors are the cluster_centers that we got earlier.
    # All the descriptors of an image are matched against these for
    # calculating the histogram.
    sg_cluster_centers = RealFeatures(cluster_centers)
    knn.train(sg_cluster_centers)

    # names of all the class folders together
    folders = ['cars', 'planes', 'trains']
    folder_number = -1

    for folder in folders:
        folder_number += 1

        # get all the training images from a particular class
        filenames = get_imlist('../../../data/SIFT/%s' % folder)
        for image_name in xrange(len(filenames[0])):
            des = descriptors[folder_number][image_name]

            # Shogun works in a way in which columns are samples and rows are features.
            # Hence we need to transpose the observation matrix.
            des = (np.double(des)).T
            sg_des = RealFeatures(np.array(des))

            # find the labels of the cluster_centers that are nearest to the
            # descriptors present in the current image
            cluster_labels = (knn.apply_multiclass(sg_des)).get_labels()

            histogram_per_image = []
            for i in xrange(k):
                # build the histogram for the current image
                histogram_per_image.append(sum(cluster_labels == i))

            all_histograms.append(np.array(histogram_per_image))
            final_labels.append(folder_number)

    # we now have the training features (all_histograms) and labels (final_labels)
    all_histograms = np.array(all_histograms)
    final_labels = np.array(final_labels)
    return all_histograms, final_labels, knn

all_histograms, final_labels, knn = compute_training_data(100, cluster_centers, descriptor_training)

def train_svm(all_histograms, final_labels):
    # we use Shogun's GMNPSVM class for one-vs-rest multiclass classification
    obs_matrix = np.double(all_histograms.T)
    sg_features = RealFeatures(obs_matrix)
    sg_labels = MulticlassLabels(np.double(final_labels))
    kernel = LinearKernel(sg_features, sg_features)
    C = 1
    gsvm = GMNPSVM(C, kernel, sg_labels)
    _ = gsvm.train(sg_features)
    return gsvm

gsvm = train_svm(all_histograms, final_labels)

# let's see the testing images
testing_sample = []

# get all the testing images
filenames = get_imlist('../../../data/SIFT/test_image/')
for i in xrange(len(filenames[0])):
    temp = cv2.imread(filenames[0][i])
    testing_sample.append(temp)

plt.rcParams['figure.figsize'] = 20, 8
fig = plt.figure()
plt.title('Test Images')
for image_no in xrange(len(filenames[0])):
    fig.add_subplot(3, 8, image_no+1)
    showfig(testing_sample[image_no], None)

def get_sift_testing():
    filenames = get_imlist('../../../data/SIFT/test_image/')
    filenames = np.array(filenames)
    des_testing = []
    for image_name in filenames[0]:
        # read the test image
        img = cv2.imread(image_name)

        # follow the normal preprocessing routines
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        gray = cv2.resize(gray, (500, 300), interpolation=cv2.INTER_AREA)
        gray = cv2.equalizeHist(gray)

        # compute all the descriptors of the test image
        _, des = sift.detectAndCompute(gray, None)
        des_testing.append(des)
    return des_testing

descriptor_testing = get_sift_testing()
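# A quick sanity check of the Shogun layout convention referenced in the
# comments above (toy data; RealFeatures expects one column per sample,
# one row per feature):
toy = np.random.rand(5, 128)              # 5 samples of 128-dim descriptors
sg_toy = RealFeatures(np.double(toy.T))   # transpose before handing to Shogun
print sg_toy.get_num_vectors()            # -> 5
print sg_toy.get_num_features()           # -> 128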
def classify_svm(k, knn, des_testing):
    # a list to hold the histograms of all the test images
    all_histograms = []
    filenames = get_imlist('../../../data/SIFT/test_image/')
    for image_name in xrange(len(filenames[0])):
        des = des_testing[image_name]

        # Shogun works in a way in which columns are samples and rows are features.
        # Hence we need to transpose the observation matrix.
        des = (np.double(des)).T
        sg_des = RealFeatures(np.array(des))

        # cluster all the above found descriptors into the vocabulary
        cluster_labels = (knn.apply_multiclass(sg_des)).get_labels()

        # get the histogram for the current test image
        histogram = []
        for i in xrange(k):
            histogram.append(sum(cluster_labels == i))
        all_histograms.append(np.array(histogram))

    all_histograms = np.double(np.array(all_histograms))
    all_histograms = all_histograms.T
    sg_testfeatures = RealFeatures(all_histograms)

    # gsvm is the trained GMNPSVM from the previous step
    return gsvm.apply(sg_testfeatures).get_labels()

predicted = classify_svm(100, knn, descriptor_testing)
print "the predicted labels for k=100 are as follows:"
print predicted

def create_conf_matrix(expected, predicted, n_classes):
    m = [[0] * n_classes for i in range(n_classes)]
    for pred, exp in zip(predicted, expected):
        m[exp][int(pred)] += 1
    return np.array(m)

filenames = get_imlist('../../../data/SIFT/test_image/')

# get the numbering of the test files, later to be used for
# calculating the confusion matrix
formation = [int(''.join(x for x in filename if x.isdigit())) for filename in filenames[0]]

# associate them with the correct labels by making a dictionary
keys = range(len(filenames[0]))
values = [0, 1, 0, 2, 1, 0, 1, 0, 0, 0, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1]
label_dict = dict(zip(keys, values))

# the following list holds the actual labels
expected = []
for i in formation:
    expected.append(label_dict[i-1])

best_k = 1
max_accuracy = 0
for k in [100, 200, 300, 400]:
    # step 2: build the vocabulary
    cluster_centers = get_similar_descriptors(k, descriptor_mat)

    # step 3: compute the training histograms
    all_histograms, final_labels, knn = compute_training_data(k, cluster_centers, descriptor_training)

    # step 4: train the multiclass SVM
    gsvm = train_svm(all_histograms, final_labels)

    # step 5: classify the test images
    predicted = classify_svm(k, knn, descriptor_testing)

    accuracy = sum(predicted == expected)*100/float(len(expected))
    print "for k=%d, accuracy is %d%%" % (k, accuracy)

    # step 6: compute the confusion matrix
    m = create_conf_matrix(expected, predicted, 3)
    if accuracy > max_accuracy:
        best_k = k
        max_accuracy = accuracy
        best_prediction = predicted

    print "confusion matrix for k=%d" % k
    print m

plt.rcParams['figure.figsize'] = 20, 8
fig = plt.figure()
for image_no in xrange(len(filenames[0])):
    fig.add_subplot(3, 8, image_no+1)
    plt.title('pred. class: ' + folders[int(best_prediction[image_no])])
    showfig(testing_sample[image_no], None)
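# An alternative way to score the best run, using Shogun's built-in evaluator
# instead of the manual accuracy computation above (a sketch reusing the
# 'best_prediction' and 'expected' variables from this notebook):
ground_truth = MulticlassLabels(np.double(expected))
pred_labels = MulticlassLabels(np.double(best_prediction))
evaluator = MulticlassAccuracy()
print "accuracy for best k=%d: %.2f%%" % (best_k, 100*evaluator.evaluate(pred_labels, ground_truth))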