#!/usr/bin/env python
# coding: utf-8

# # Interfacing *torch* to *heyoka.py*
#
# ```{note}
# For an introduction to feed-forward neural networks in *heyoka.py*, check out the example: [Feed-Forward Neural Networks](<./ffnn.ipynb>).
# ```
#
# ```{warning}
# This tutorial assumes [torch](https://pytorch.org/) is installed.
# ```
#
# *heyoka.py* is not a library meant for machine learning, nor does it aspire to be one. However, given its support for feed-forward neural networks and their potential use in numerical integration, it is useful to connect the *heyoka.py* `ffnn()` factory to a *torch* model. This tutorial shows how to do exactly that.

# In[1]:


import heyoka as hk
import numpy as np

# We will need torch for this notebook. The CPU version is enough.
import torch
from torch import nn


# We start by defining an `ffnn` model in *torch*. As a test case, we use a feed-forward neural network with two hidden layers of 32 neurons each, `tanh` nonlinearities and a `sigmoid` for the output layer. We define the model so that it maps easily to the *heyoka* `ffnn` factory, but other styles are also possible.
#
# This will typically look something like:

# In[2]:


# Let us use float64 (this is not strictly necessary, as heyoka.py also supports single precision, but we go for high precision here!)
torch.set_default_dtype(torch.float64)


class torch_net(nn.Module):
    def __init__(
        self,
        n_inputs=4,
        nn_hidden=[32, 32],
        n_out=1,
        activations=[nn.Tanh(), nn.Tanh(), nn.Sigmoid()],
    ):
        super(torch_net, self).__init__()

        # We treat all layers equally.
        dims = [n_inputs] + nn_hidden + [n_out]

        # Linear functions.
        self.fcs = nn.ModuleList(
            [nn.Linear(dims[i], dims[i + 1]) for i in range(len(dims) - 1)]
        )

        # Non-linearities.
        self.acts = nn.ModuleList(activations)

    def forward(self, x):
        for fc, act in zip(self.fcs, self.acts):
            x = act(fc(x))
        return x


# Weights and biases are stored by *torch* in the model as per-layer arrays, while *heyoka* flattens everything into a one-dimensional sequence containing all weights first, then all biases.
#
# The following function takes care of converting the *torch* representation to *heyoka*'s:

# In[3]:


def weights_and_biases_heyoka(model):
    # Collect the weight matrices and bias vectors in layer order.
    weights = []
    biases = []
    for name, param in model.named_parameters():
        if "weight" in name:
            weights.append(param.data.clone().numpy())
        elif "bias" in name:
            biases.append(param.data.clone().numpy())

    # Flatten all weights first, then all biases, as expected by the heyoka ffnn factory.
    w_flat = np.concatenate([w.flatten() for w in weights])
    b_flat = np.concatenate([b.flatten() for b in biases])
    return np.concatenate((w_flat, b_flat))


# We are now ready to instantiate the model and extract its weights and biases in the format expected by the `heyoka` `ffnn` factory:

# In[4]:


model = torch_net(
    n_inputs=4,
    nn_hidden=[32, 32],
    n_out=1,
    activations=[nn.Tanh(), nn.Tanh(), nn.Sigmoid()],
)
# Here one would likely perform some training ... at the end, we extract the model parameters:
flattened_weights = weights_and_biases_heyoka(model)


# Let us instantiate the feed-forward neural network in *heyoka.py* using those parameters:

# In[5]:


inp_1, inp_2, inp_3, inp_4 = hk.make_vars("inp_1", "inp_2", "inp_3", "inp_4")
model_heyoka = hk.model.ffnn(
    inputs=[inp_1, inp_2, inp_3, inp_4],
    nn_hidden=[32, 32],
    n_out=1,
    activations=[hk.tanh, hk.tanh, hk.sigmoid],
    nn_wb=flattened_weights,
)


# Good!
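# As a quick sanity check (a small sketch, not part of the original workflow), we can verify that the flattened vector has the size expected for the 4 → 32 → 32 → 1 architecture, with all weights coming first and all biases last:

# In[ ]:


# Expected parameter counts for layer sizes 4 -> 32 -> 32 -> 1.
layer_dims = [4, 32, 32, 1]
n_weights = sum(layer_dims[i] * layer_dims[i + 1] for i in range(len(layer_dims) - 1))
n_biases = sum(layer_dims[1:])
assert flattened_weights.shape == (n_weights + n_biases,)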
# To be sure, let us now verify that the two models produce the same output at inference. We first compile the network so that we can run inference:

# In[6]:


model_heyoka_compiled = hk.cfunc(model_heyoka, [inp_1, inp_2, inp_3, inp_4])


# ... and create some random inputs:

# In[7]:


random_input = torch.rand((4, 1000000))
random_input_torch = random_input.t()
random_input_numpy = random_input.numpy()
out_array = np.zeros((1, 1000000))


# Now, let's compare the output of `heyoka.model.ffnn` and `torch` to check that they coincide:

# In[8]:


t_hey = model_heyoka_compiled(random_input_numpy, outputs=out_array)
t_torch = model(random_input_torch).detach().numpy().reshape(1, -1)


# In[9]:


print("Maximum difference in the inference is: ", np.abs(t_hey - t_torch).max())


# In this way we have managed to port the *torch* model to *heyoka.py*, reproducing the same results.
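# One nice consequence is that the ported network is now an ordinary *heyoka.py* symbolic expression, so it can be manipulated like any other expression. As a small sketch (not part of the original workflow, and assuming the `hk.diff()` expression-differentiation API), we can for instance differentiate the output with respect to one of the inputs and compile that too:

# In[ ]:


# Differentiate the (single) network output with respect to the first input ...
dout_dinp1 = hk.diff(model_heyoka[0], inp_1)
# ... and compile the derivative for fast evaluation, just like the network itself.
grad_compiled = hk.cfunc([dout_dinp1], [inp_1, inp_2, inp_3, inp_4])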
# # Using *torch* optimizers
#
# In some cases the parameters of the neural network may need to be updated with *heyoka.py* in the loop; check out the example: [Neural ODEs](<./NeuralODEs_II.ipynb>).
#
# This part shows how to interface with optimizers (in this case Adam) provided by *torch*.
#
# In the following we use a simplified version of [Neural ODEs](<./NeuralODEs_II.ipynb>), where a feed-forward neural network learns the underlying dynamics of a linear system from observations.

# In[10]:


import matplotlib.pyplot as plt


# Dynamics:
#
# $$
# \dot{\pmb x} = A\pmb x + \pmb b
# $$

# In[11]:


# We now assemble the dynamics
A = np.array(
    [
        [0.0, 0.0, 0.0, 1.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.0, 1.0, 0.0],
        [0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
        [-2.5, 2.0, 0.0, 0.0, 0.0, 0.0],
        [2.0, -5.4, 3.4, 0.0, 0.0, 0.0],
        [0.0, 3.4, -3.4, 0.0, 0.0, 0.0],
    ]
)
b = [0, 0, 0, -3.5, 0.26, 3.74]
n = len(b)
assert A.shape == (n, n)
x_vars = hk.make_vars(*[f"x_{i}" for i in range(1, n + 1)])
Ax_b = A @ np.array(x_vars) + b
dynamics = list(zip(x_vars, Ax_b))


# Setup

# In[12]:


# This is the number of observations (i.e. the ground truth).
data_size = 1000
# This is the number of initial conditions to use in one batch (M).
batch_size_ic = 20
# This is the number of observations to predict from each sampled initial condition (T).
batch_size_time = 10
# This is the initial condition used to generate the observations (of the real dynamics).
ic = [2.0, 0.0, 0.0, 1.0, 0.0, 0.0]
# This is the time grid used to generate the observations.
t_grid = np.linspace(0.0, 20, data_size)


# The Taylor integrator we use to generate the ground truth:

# In[13]:


ta = hk.taylor_adaptive(
    # The ODEs.
    dynamics,
    # The initial conditions.
    ic,
    # Compact mode is not needed for this small system.
    compact_mode=False,
    # Define the tolerance.
    tol=1e-18,
)


# We generate the ground truth (observations):

# In[14]:


ta.time = 0
ta.state[:] = ic
gt = ta.propagate_grid(t_grid)[5]


# In[15]:


# Visualize
def plot_t_evol(
    out,
    tgrid,
    labels,
    title="",
    color="#1f77b4",
    batchy=None,
    tgrid_batch=None,
    color_batch=None,
    batch_ic=None,
    t_grids=None,
    ta_var=None,
    ic_var=None,
    nn_wb=None,
    batch_t_grid=None,
):
    fig = plt.figure(figsize=(12, 6))
    if len(title) > 0:
        fig.suptitle(title)
    ncoord = out.shape[1]
    ncols = 3
    nrows = ncoord // ncols + (ncoord % ncols > 0)
    for i in range(0, ncoord):
        ax = fig.add_subplot(nrows, ncols, i + 1)
        ax.plot(tgrid, out[:, i], color=color)
        if batchy is not None:
            for j, t_grid_batch in enumerate(tgrid_batch):
                ax.plot(t_grid_batch, batchy[:, j, i], linewidth=3, color=color_batch)
        if batch_ic is not None:
            for one_ic, t_grid_batch in zip(batch_ic, t_grids):
                ta_var.time = 0
                ta_var.state[:] = list(one_ic) + ic_var
                ta_var.pars[:] = nn_wb
                sol = ta_var.propagate_grid(batch_t_grid)[5]
                ax.plot(
                    t_grid_batch, sol[:, i], linewidth=3, marker=".", color="#ff7f0e"
                )
        ax.set_xlabel("Time [-]")
        ax.set_ylabel(labels[i], rotation=0, labelpad=10)
    plt.tight_layout()


plot_t_evol(
    gt,
    t_grid,
    ["$x_1$", "$x_2$", "$x_3$", "$x_4$", "$x_5$", "$x_6$"],
    title="States over time (observations)",
)


# This creates a training batch.

# In[16]:


def get_batch(t_data, x_data, batch_size_ic, batch_size_time):
    # We select the initial conditions from which to generate the predictions.
    s = np.random.choice(
        np.arange(x_data.shape[0] - batch_size_time, dtype=np.int64),
        batch_size_ic,
        replace=False,
    )
    ic_batch = x_data[s, :]  # (M, n)
    # Assuming a uniform grid and an autonomous system, all predictions can be made on the same time grid.
    t_batch = t_data[:batch_size_time]  # (T)
    t_grids = np.array(
        [t_data[s_ : s_ + batch_size_time] for s_ in s]
    )  # Used for plotting
    x_batch = np.stack([x_data[s + i] for i in range(batch_size_time)])  # (T, M, n)
    return ic_batch, t_batch, x_batch, t_grids


# This instantiates the ffnn as a heyoka expression and defines the neural ODE.

# In[17]:


# We use the identity as the (linear) output nonlinearity.
linear = lambda inp: inp

# We call the factory to construct an FFNN taking the state variables as inputs:
ffnn = hk.model.ffnn(
    inputs=x_vars,
    nn_hidden=[30, 30],
    n_out=len(x_vars),
    activations=[hk.tanh, hk.tanh, linear],
)

# We thus define the neural dynamics.
dyn_n = list(zip(x_vars, ffnn))


# We now need the gradient of the loss over the batch, where $\pmb y_{ij}$ is the prediction for the $i$-th sampled initial condition at the $j$-th grid point and $\hat{\pmb y}_{ij}$ the corresponding observation:
#
# $$
# \mathcal L_{\theta} = \sum_{i=1}^{M}\sum_{j=1}^{T} (\pmb y_{ij} - \hat {\pmb y}_{ij})\cdot (\pmb y_{ij} - \hat {\pmb y}_{ij}),
# $$
#
# $$
# \frac{\partial \mathcal L_\theta}{\partial \theta} = 2 \sum_{i=1}^{M}\sum_{j=1}^{T} (\pmb y_{ij} - \hat {\pmb y}_{ij})\cdot \frac{\partial \pmb y_{ij}}{\partial \theta},
# $$
#
# hence we need a variational integrator.
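# The quantities $\frac{\partial \pmb y_{ij}}{\partial \theta}$ are the sensitivities of the ODE solution with respect to the network parameters. They are obtained by integrating, alongside the state, the standard first-order variational equations
#
# $$
# \frac{d}{dt}\frac{\partial \pmb x}{\partial \theta} = \frac{\partial \pmb f_\theta}{\partial \pmb x}\frac{\partial \pmb x}{\partial \theta} + \frac{\partial \pmb f_\theta}{\partial \theta}, \qquad \frac{\partial \pmb x}{\partial \theta}(0) = 0,
# $$
#
# where $\pmb f_\theta$ denotes the ffnn right-hand side. `hk.var_ode_sys` with `args=hk.var_args.params` assembles these equations for us automatically.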
# In[18]:


var_dyn_n = hk.var_ode_sys(dyn_n, args=hk.var_args.params)
ta_var = hk.taylor_adaptive(
    # The ODEs.
    var_dyn_n,
    # The initial conditions.
    ic,
    # Operate in compact mode.
    compact_mode=True,
    # Define the tolerance.
    tol=1e-8,
)

# We store for convenience the initial conditions of the variational state.
ic_var = list(ta_var.state[n:])


# In[19]:


batch_ic, batch_t_grid, batch_y, t_grids = get_batch(
    t_grid, gt, batch_size_ic, batch_size_time
)
plot_t_evol(
    gt,
    t_grid,
    ["$x_1$", "$x_2$", "$x_3$", "$x_4$", "$x_5$", "$x_6$"],
    title="States over time (observations) with one random training batch shown in red",
    batchy=batch_y,
    tgrid_batch=t_grids,
    color_batch="red",
)


# In[20]:


# We draw random initialization values for the weights / biases.
n_pars = len(ta_var.pars)
nn_wb = np.random.normal(loc=0, scale=0.1, size=(n_pars,))

plot_t_evol(
    gt,
    t_grid,
    ["$x_1$", "$x_2$", "$x_3$", "$x_4$", "$x_5$", "$x_6$"],
    title="States over time (observations) with Neural ODE (orange)",
    batch_ic=batch_ic,
    t_grids=t_grids,
    ta_var=ta_var,
    ic_var=ic_var,
    nn_wb=nn_wb,
    batch_t_grid=batch_t_grid,
)


# This returns the value of the loss and the gradient of the loss with respect to the network parameters.

# In[21]:


def loss_and_gradient(nn_wb, batch_ic, batch_y):
    # We loop over the selected initial conditions and accumulate the loss
    # and its gradient over the corresponding predictions.
    # We reset the batch loss value ...
    batch_loss = 0
    # ... and the gradient of the loss.
    grad = np.zeros(len(ta_var.pars))
    for i, ic_ in enumerate(batch_ic):
        ta_var.time = 0
        ta_var.state[:] = list(ic_) + ic_var
        ta_var.pars[:] = nn_wb
        sol = ta_var.propagate_grid(batch_t_grid)[5]
        # Here is the term (y - y_hat) ...
        diff = sol[:, :n] - batch_y[:, i, :]
        # ... which enters the loss as sum_j (y - y_hat).(y - y_hat) ...
        batch_loss += np.sum(diff**2)
        # ... and the gradient as 2 sum_j (y - y_hat).dy/dtheta, pairing each
        # residual with the sensitivities at the same grid point.
        for diff_j, dy_j in zip(diff, sol[:, n:].reshape(batch_y.shape[0], n, -1)):
            grad += 2 * (diff_j @ dy_j)
    # We then take the mean over the points used.
    batch_loss /= batch_size_ic * batch_size_time
    grad /= batch_size_ic * batch_size_time
    return batch_loss, grad


# A simple stochastic gradient descent with fixed learning rate:

# In[22]:


# At each iteration we create a random batch and take one gradient step.
batch_losses = []
learning_rate = 1e-2
for i in range(300):
    batch_ic, batch_t_grid, batch_y, _ = get_batch(
        t_grid, gt, batch_size_ic=20, batch_size_time=10
    )
    batch_loss, grad = loss_and_gradient(nn_wb, batch_ic, batch_y)
    print(f"iter: {i}, loss : {batch_loss:.3e}", end="\r")
    nn_wb = nn_wb - learning_rate * grad
    batch_losses.append(batch_loss)


# In[23]:


plt.semilogy(batch_losses, color="red")
plt.title("Training batch loss over gradient descent steps")


# In[24]:


ta_var.time = 0
ta_var.state[:] = ic + ic_var
ta_var.pars[:] = nn_wb
sol = ta_var.propagate_grid(t_grid)[5]

plot_t_evol(
    gt,
    t_grid,
    ["$x_1$", "$x_2$", "$x_3$", "$x_4$", "$x_5$", "$x_6$"],
    title="States over time (observations) with Neural ODE (orange)",
    batch_ic=[ic],
    t_grids=[t_grid],
    ta_var=ta_var,
    ic_var=ic_var,
    nn_wb=nn_wb,
    batch_t_grid=t_grid,
)


# Now we repeat the training with the Adam optimizer.
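# The mechanics are the same as in any setup where the gradient is computed outside of *torch*: zero the gradients, write the externally computed gradient into the parameter's `.grad` attribute and let the optimizer step. Before applying this to the Neural ODE, here is a minimal, self-contained sketch of the pattern (a hypothetical example with a dummy quadratic loss, not part of the original workflow):

# In[ ]:


# A dummy quadratic loss f(theta) = ||theta - 1||^2 stands in for the Neural ODE loss.
theta = torch.zeros(3, dtype=torch.float64, requires_grad=True)
opt = torch.optim.Adam([theta], lr=1e-1)

for _ in range(100):
    # Externally computed gradient of the dummy loss (this is where heyoka's
    # variational gradient goes in the real training loop below).
    grad_np = 2.0 * (theta.detach().numpy() - 1.0)
    # Clear the previous gradient ...
    opt.zero_grad()
    # ... inject the externally computed one ...
    theta.grad = torch.tensor(grad_np)
    # ... and let Adam perform the update.
    opt.step()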
# In[25]:


# Reinitialize the values of the weights / biases.
n_pars = len(ta_var.pars)
nn_wb = np.random.normal(loc=0, scale=0.1, size=(n_pars,))


# In[26]:


# Set the desired precision.
precision = np.single  # np.double
learning_rate = 1e-2

# Determine the PyTorch dtype based on the precision.
dtype = torch.float32 if precision == np.single else torch.float64

# Gradient descent using the Adam optimizer.
weights_biases_tensor = torch.tensor(nn_wb, dtype=dtype, requires_grad=True)
optimizer = torch.optim.Adam(
    [weights_biases_tensor],
    lr=precision(learning_rate),
    betas=(0.9, 0.999),
    eps=precision(1e-8),
    amsgrad=True,
)


# In[27]:


# At each iteration we create a random batch and take one Adam step.
batch_losses = []
for i in range(300):
    batch_ic, batch_t_grid, batch_y, _ = get_batch(
        t_grid, gt, batch_size_ic=20, batch_size_time=10
    )
    batch_loss, grad = loss_and_gradient(nn_wb, batch_ic, batch_y)
    print(f"iter: {i}, loss : {batch_loss:.3e}", end="\r")
    batch_losses.append(batch_loss)

    # Clear the gradients from the previous iteration.
    optimizer.zero_grad()
    # Sync the torch parameter with the current value of nn_wb.
    weights_biases_tensor.data = torch.tensor(nn_wb, dtype=dtype)
    # Convert the heyoka gradient to a tensor ...
    dLdw_tensor = torch.tensor(np.squeeze(grad), dtype=dtype)
    # ... and manually set it as the gradient of the parameter.
    weights_biases_tensor.grad = dLdw_tensor

    # Use the Adam optimizer to perform the weight update.
    optimizer.step()

    # Convert back to a numpy array.
    nn_wb = weights_biases_tensor.detach().numpy()


# In[28]:


plt.semilogy(batch_losses, color="red")
plt.title("Training batch loss over Adam steps")


# In[29]:


ta_var.time = 0
ta_var.state[:] = ic + ic_var
ta_var.pars[:] = nn_wb
sol = ta_var.propagate_grid(t_grid)[5]

plot_t_evol(
    gt,
    t_grid,
    ["$x_1$", "$x_2$", "$x_3$", "$x_4$", "$x_5$", "$x_6$"],
    title="States over time (observations) with Neural ODE (orange)",
    batch_ic=[ic],
    t_grids=[t_grid],
    ta_var=ta_var,
    ic_var=ic_var,
    nn_wb=nn_wb,
    batch_t_grid=t_grid,
)
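# Finally, the trained parameters can also be mapped back into a *torch* module. The helper below is a sketch that simply inverts the `weights_and_biases_heyoka()` convention used throughout this notebook (all weights first, then all biases, each weight matrix flattened row-major); the architecture and the use of `nn.Identity()` for the linear output layer mirror the `ffnn` defined above.

# In[ ]:


def weights_and_biases_torch(model, nn_wb):
    # Sketch of the inverse of weights_and_biases_heyoka(): copy a flattened heyoka
    # parameter vector (all weights first, then all biases) into a torch module
    # with the same architecture.
    weights = [p for name, p in model.named_parameters() if "weight" in name]
    biases = [p for name, p in model.named_parameters() if "bias" in name]
    n_w = sum(w.numel() for w in weights)
    w_flat = np.asarray(nn_wb[:n_w])
    b_flat = np.asarray(nn_wb[n_w:])
    with torch.no_grad():
        idx = 0
        for w in weights:
            w.copy_(torch.as_tensor(w_flat[idx : idx + w.numel()]).reshape(w.shape))
            idx += w.numel()
        idx = 0
        for b in biases:
            b.copy_(torch.as_tensor(b_flat[idx : idx + b.numel()]).reshape(b.shape))
            idx += b.numel()


# Map the trained Neural ODE parameters back into an equivalent torch model.
model_back = torch_net(
    n_inputs=n,
    nn_hidden=[30, 30],
    n_out=n,
    activations=[nn.Tanh(), nn.Tanh(), nn.Identity()],
)
weights_and_biases_torch(model_back, nn_wb)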