#!/usr/bin/env python
# coding: utf-8

# # Recognizing hand-written digits
#
# ## Introduction
#
# This notebook adapts the existing scikit-learn example of applying support vector classification ([https://scikit-learn.org/stable/auto_examples/classification/plot_digits_classification.html#sphx-glr-auto-examples-classification-plot-digits-classification-py](https://scikit-learn.org/stable/auto_examples/classification/plot_digits_classification.html#sphx-glr-auto-examples-classification-plot-digits-classification-py)) to PyRCN to demonstrate how PyRCN can be used to classify hand-written digits.
#
# The tutorial is based on NumPy, scikit-learn and PyRCN.

# In[1]:


import numpy as np
import time

from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.model_selection import (
    ParameterGrid, RandomizedSearchCV, cross_validate)
from scipy.stats import uniform, loguniform
from sklearn.metrics import make_scorer

from pyrcn.model_selection import SequentialSearchCV
from pyrcn.echo_state_network import ESNClassifier
from pyrcn.metrics import accuracy_score
from pyrcn.datasets import load_digits


# ## Load the dataset
#
# The dataset is already part of scikit-learn and consists of 1797 8x8 images.
#
# We use our data loader, which is derived from scikit-learn's data loader and returns arrays of 8x8 sequences together with the corresponding labels.

# In[2]:


X, y = load_digits(return_X_y=True, as_sequence=True)
print("Number of digits: {0}".format(len(X)))
print("Shape of digits: {0}".format(X[0].shape))


# ## Split the dataset into training and test sets
#
# Afterwards, we split the dataset into a training and a test set. We train the ESN using 80% of the digits and test it using the remaining images.

# In[3]:


# Each sequence carries exactly one digit label; flatten these labels so that
# they can be used for a stratified split.
stratify = np.asarray([np.unique(yt) for yt in y]).flatten()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=stratify, random_state=42)
X_tr = np.copy(X_train)
y_tr = np.copy(y_train)
X_te = np.copy(X_test)
y_te = np.copy(y_test)
# Repeat each label eight times so that every time step of an 8x8 sequence
# carries the label of its digit.
for k, _ in enumerate(y_tr):
    y_tr[k] = np.repeat(y_tr[k], 8, 0)
for k, _ in enumerate(y_te):
    y_te[k] = np.repeat(y_te[k], 8, 0)

print("Number of digits in training set: {0}".format(len(X_train)))
print("Shape of digits in training set: {0}".format(X_train[0].shape))
print("Number of digits in test set: {0}".format(len(X_test)))
print("Shape of digits in test set: {0}".format(X_test[0].shape))


# ## Set up an ESN
#
# To develop an ESN model for digit recognition, we need to tune several hyper-parameters, e.g., input_scaling, spectral_radius, bias_scaling and leakage (leaky integration).
#
# We follow the procedure proposed in the introductory paper of PyRCN and optimize the hyper-parameters sequentially.
#
# We define the search space for each step together with the type of search (a randomized search in this context).
#
# Finally, we initialize an ESNClassifier with the desired decision strategy and the initially fixed parameters.
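# Before defining the search spaces, the following minimal sketch (it only uses
# the scipy.stats distributions already imported above and is not part of the
# original example) illustrates what the distributions used in the randomized
# searches look like: uniform(loc, scale) samples uniformly from
# [loc, loc + scale], while loguniform samples log-uniformly between its bounds.

# In[ ]:


# Draw a few example values from the distributions used in the search spaces below.
print(uniform(loc=1e-2, scale=1).rvs(size=3, random_state=42))
print(loguniform(1e-5, 1e0).rvs(size=3, random_state=42))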
# In[4]:


initially_fixed_params = {
    'hidden_layer_size': 50,
    'input_activation': 'identity',
    'k_in': 5,
    'bias_scaling': 0.0,
    'reservoir_activation': 'tanh',
    'leakage': 1.0,
    'bidirectional': False,
    'k_rec': 10,
    'continuation': False,
    'alpha': 1e-5,
    'random_state': 42,
    'decision_strategy': "winner_takes_all"}

# Search spaces for the sequential hyper-parameter optimization.
step1_esn_params = {'input_scaling': uniform(loc=1e-2, scale=1),
                    'spectral_radius': uniform(loc=0, scale=2)}
step2_esn_params = {'leakage': loguniform(1e-5, 1e0)}
step3_esn_params = {'bias_scaling': uniform(loc=0, scale=2)}
step4_esn_params = {'alpha': loguniform(1e-5, 1e0)}

kwargs_step1 = {'n_iter': 200, 'random_state': 42, 'verbose': 10, 'n_jobs': -1,
                'scoring': make_scorer(accuracy_score)}
kwargs_step2 = {'n_iter': 50, 'random_state': 42, 'verbose': 1, 'n_jobs': -1,
                'scoring': make_scorer(accuracy_score)}
kwargs_step3 = {'verbose': 1, 'n_jobs': -1,
                'scoring': make_scorer(accuracy_score)}
kwargs_step4 = {'n_iter': 50, 'random_state': 42, 'verbose': 1, 'n_jobs': -1,
                'scoring': make_scorer(accuracy_score)}

# The searches are defined similarly to the steps of a sklearn.pipeline.Pipeline:
searches = [('step1', RandomizedSearchCV, step1_esn_params, kwargs_step1),
            ('step2', RandomizedSearchCV, step2_esn_params, kwargs_step2),
            ('step3', RandomizedSearchCV, step3_esn_params, kwargs_step3),
            ('step4', RandomizedSearchCV, step4_esn_params, kwargs_step4)]

base_esn = ESNClassifier(**initially_fixed_params)


# ## Optimization
#
# We provide a SequentialSearchCV that iterates through the list of searches defined above. It can be combined with any model selection tool from scikit-learn.

# In[5]:


sequential_search = SequentialSearchCV(base_esn, searches=searches).fit(
    X_tr, y_tr)


# ## Use the ESN with final hyper-parameters
#
# After the optimization, we extract the ESN with the final hyper-parameters from the result of the optimization.

# In[6]:


base_esn = sequential_search.best_estimator_


# In[7]:


base_esn.get_params()


# In[8]:


sequential_search.all_cv_results_["step4"]


# ## Test the ESN
#
# Finally, we increase the reservoir size and compare the impact of uni- and bidirectional ESNs. Notice that the ESN strongly benefits both from increasing the reservoir size and from the bidirectional working mode.

# In[9]:


param_grid = {'hidden_layer_size': [50, 100, 200, 400, 500],
              'bidirectional': [False, True]}

print("CV results\tFit time\tInference time\tAccuracy score\tSize[Bytes]")
for params in ParameterGrid(param_grid):
    # Cross-validate the model with the current reservoir size and direction.
    esn_cv = cross_validate(clone(base_esn).set_params(**params),
                            X=X_train, y=y_train,
                            scoring=make_scorer(accuracy_score), n_jobs=-1)
    # Measure the sequential fitting time ...
    t1 = time.time()
    esn = clone(base_esn).set_params(**params).fit(X_train, y_train)
    t_fit = time.time() - t1
    # ... and the parallel fitting time (not included in the printed summary).
    t1 = time.time()
    esn_par = clone(base_esn).set_params(**params).fit(X_train, y_train,
                                                       n_jobs=-1)
    t_fit_par = time.time() - t1
    mem_size = esn.__sizeof__()
    # Measure the inference time and the test accuracy.
    t1 = time.time()
    acc_score = accuracy_score(y_test, esn.predict(X_test))
    t_inference = time.time() - t1
    print(f"{esn_cv}\t{t_fit}\t{t_inference}\t{acc_score}\t{mem_size}")


# Alternatively, we can also use a PyTorch implementation of the ESN model.

# In[13]:


from pyrcn.nn import ESN
import torch


# In[11]:


base_esn = ESN(input_size=8, hidden_size=50, num_layers=1, nonlinearity='tanh',
               bias=True, input_scaling=0.016952130531190705,
               spectral_radius=1.0214946051551315,
               bias_scaling=1.500499867548985, bias_shift=0.,
               input_sparsity=0.1, recurrent_sparsity=0.1, bidirectional=False)


# In[16]:


# Pass one training sequence through the PyTorch ESN and inspect the shape of
# the resulting reservoir states.
for x_seq, y_seq in zip(X_train, y_train):
    x_seq = torch.Tensor(x_seq).float()
    print(base_esn(x_seq)[0].shape)
    break


# In[ ]:
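# The PyTorch module above only computes reservoir states and does not include
# a trained readout. The following minimal sketch (an illustration, not part of
# the original example) shows one way to add one with plain PyTorch, assuming
# that base_esn(x)[0] returns one 50-dimensional state vector per time step, as
# suggested by the shape printed above: average the states over time and train
# a linear classification layer on top of them.

# In[ ]:


import torch.nn as nn

# Hypothetical linear readout for the 10 digit classes on top of the 50 ESN states.
readout = nn.Linear(in_features=50, out_features=10)
optimizer = torch.optim.Adam(readout.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

for x_seq, y_seq in zip(X_train[:16], y_train[:16]):
    x_seq = torch.Tensor(x_seq).float()
    # The reservoir itself is not trained, so its states need no gradients.
    with torch.no_grad():
        states = base_esn(x_seq)[0]
    # Pool the states over time and classify the sequence with the readout.
    pooled = states.reshape(-1, 50).mean(dim=0, keepdim=True)
    logits = readout(pooled)
    target = torch.tensor([int(np.atleast_1d(y_seq)[0])])
    loss = loss_fn(logits, target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()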