#!/usr/bin/env python
# coding: utf-8

# # Custom mean functions: Meta-learning with GPs
# One of the advantages of Gaussian process is their flexibility as a modelling tool. For instance, if the modeller knows there is an underlying trend in the data, they can specify a mean function that captures this trend.
# 
# In this notebook, we illustrate how to use GPflow to construct a custom neural network mean function for GPs that can capture complex trends. We look at this functionality in the context of meta-learning, where a number of metatasks are available at train time and the user wants to adapt a flexible model to new tasks at test time.
# 
# For an in-depth discussion on this topic, see *(Fortuin and Rätsch, 2019)*. This notebook reproduces section 4.2 of this paper.

# In[1]:


import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

import gpflow
from gpflow.decors import params_as_tensors
from gpflow.kernels import RBF
from gpflow.likelihoods import Gaussian
from gpflow.mean_functions import MeanFunction
from gpflow.models import GPR
from gpflow.params import Parameter, ParamList
from gpflow.training import GradientDescentOptimizer

from gpflow.test_util import notebook_niter

get_ipython().run_line_magic('matplotlib', 'inline')


# ## Generate the tasks
# To generate the meta and test tasks, we sample from a Gaussian process with an Squared Exponential covariance function and a sinusoidal mean function. Each task is a realisation of this process.

# In[2]:


def generate_data(num_functions=10, N=1000):
    jitter = 1e-6
    Xs = np.linspace(-5.0, 5.0, N)[:, None]
    kernel = RBF(input_dim=1, lengthscales=1.)
    cov = kernel.compute_K_symm(Xs)
    L = np.linalg.cholesky(cov + np.eye(N) * jitter)
    epsilon = np.random.randn(N, num_functions)
    F = np.sin(Xs) + np.matmul(L, epsilon)
    return Xs, F


# We generate 10 tasks for illustration.

# In[3]:


Xs, F = generate_data(10)


# In[4]:


plt.plot(Xs, F);


# We generate the meta and test tasks.

# In[5]:


def generate_meta_and_test_tasks(num_datapoints, num_meta, num_test):
    N = 1000
    Xs, F = generate_data(num_functions=num_meta + num_test, N=N)
    meta_indices = [np.random.permutation(N)[:num_datapoints] for _ in range(num_meta)] 
    test_indices = [np.random.permutation(N)[:num_datapoints] for _ in range(num_test)]
    meta = []
    for i, mi in enumerate(meta_indices):
        Y = F[mi, i][:, None] + 1e-1 * np.random.randn(num_datapoints, 1)
        meta.append((Xs[mi], Y))
    test = []
    for i, ti in enumerate(test_indices):
        Y = F[ti, num_meta + i][:, None] + 1e-1 * np.random.randn(num_datapoints, 1)
        test.append(((Xs[ti], Y), (Xs, F[:, num_meta + i][:, None])))
    return meta, test


# In[6]:


num_meta_tasks = 20
num_test_tasks = 5
num_datapoints = 5
meta, test = generate_meta_and_test_tasks(num_datapoints, num_meta_tasks, num_test_tasks)


# ## Create the mean function
# To create a custom mean function in GPflow, we inherit from the `MeanFunction` class and implement a `__call__` method. We store the neural network parameter inside the object and implement the forward pass in `__call__`.

# In[7]:


# Function for weight initialisation
def xavier_weights(input_dim, output_dim):
    """
    Xavier initialisation for the weights of a NN layer
    See:
       Xavier Glorot and Yoshua Bengio (2010):
       Understanding the difficulty of training deep feedforward neural networks.
       International Conference on Artificial Intelligence and Statistics.
    """

    xavier_std = (2./(input_dim + output_dim)) ** 0.5
    return np.random.randn(input_dim, output_dim) * xavier_std


# Neural network implementation
class DeepNeuralNetworkMeanFunction(MeanFunction):
    """
    Neural network mean function
    """
    def __init__(self, input_dim, output_dim, inner_dims=[100], inner_activation=tf.nn.tanh, seed=None, Ws=None, bs=None):
        """
        :param input_dim: input dimension
        :param output_dim: output dimension
        :param inner_dims: dimension of inner layers
        :param inner_activation: activation for inner layers
        :param seed: random seed
        :param mean_function: initialise the network with parameters from mean_function
        """

        super().__init__()

        self.seed = seed
        self.rng = np.random.RandomState(self.seed)

        self.input_dim = input_dim
        self.output_dim = output_dim
        self.inner_dims = inner_dims

        self.inner_activation = inner_activation

        if (Ws is None) and (bs is None):
            self._construct_network()
        else:
            self.Ws = ParamList(Ws) 
            self.bs = ParamList(bs)

    def _construct_network(self):
        Ws, bs = [], []
        dims = [self.input_dim, *self.inner_dims, self.output_dim]
        for dim_in, dim_out in zip(dims[:-1], dims[1:]):
            Ws.append(Parameter(xavier_weights(dim_in, dim_out), trainable=True))
            bs.append(Parameter(np.zeros((1, dim_out)), trainable=True))

        self.Ws, self.bs = ParamList(Ws), ParamList(bs)

    @params_as_tensors
    def __call__(self, X):
        """
        Feedforward pass

        :param X: NxD tensor
        """

        forward = X

        for i, (W, b) in enumerate(zip(self.Ws, self.bs)):
            forward = tf.matmul(forward, W) + b
            if i < len(self.bs) - 1:
                forward = self.inner_activation(forward)

        return forward


# ## Build the GP metamodel
# Meta-learning boils down to learning a good prior that can generalise to new tasks with a small number of datapoints. This framework is prevalent in GP modelling, where we usually maximise the marginal likelihood to learn a good set of hyperparameters that specify the GP prior.
# 
# We perform the same optimisation here, while sharing the hyperparameters across all the metatasks. For simplicity, we fix the kernel and likelihood parameters and only learn those for the mean function. Hence, our "meta-learning" procedure is to cycle through the metatasks continuously, optimising their marginal likelihood until a convergence criteria is reached (here, we just implement a fixed number of iterations over the tasks).
# 
# To begin this process, first we create a utility function that takes in a task (X, Y) and a mean function and outputs a GP model.

# In[8]:


def build_model(X, Y, Ws, bs):
    mean_function = DeepNeuralNetworkMeanFunction(1, 1, [64, 64], Ws=Ws, bs=bs)
    kernel = RBF(input_dim=1)
    model = GPR(X, Y, kern=kernel, mean_function=mean_function)
    model.kern.trainable = False
    model.likelihood.variance = 1e-2
    model.likelihood.trainable = False
    return model


# Next, we define the training loop for meta-learning.

# In[9]:


import time

def train_loop(meta_tasks, num_iter=5):
    """
    Meta-learning training loop
    
    :param meta_tasks: list of metatasks.
    :param num_iter: number of iterations of tasks set
    :returns: a mean funciton object
    """
    # Initialise mean function
    Ws = None
    bs = None
    # Iterate for several passes over the tasks set
    for iteration in range(num_iter):
        ts = time.time()
        print("Currently in meta-iteration {}".format(iteration))
        # Iterate over tasks
        for task in meta_tasks:
            X, Y = task
            # we create each model in its own session and graph
            with tf.Graph().as_default() as graph, tf.Session(graph=graph).as_default() as sess:                    
                model = build_model(X, Y, Ws, bs)
                GradientDescentOptimizer(1e-3).minimize(model, maxiter=notebook_niter(100, test_n=1))
                # Extract optimal mean function params to be fed to the next model
                Ws = list(model.mean_function.Ws.read_values().values())
                bs = list(model.mean_function.bs.read_values().values())
        print(">>>> iteration took {} ms".format(time.time() - ts))
    return Ws, bs


# In[10]:


Ws_optimal, bs_optimal = train_loop(meta)


# Finally, we use the optimised mean function for all of the test tasks. Note, we do not do any further optimisation for the hyperparameters in this step.

# In[11]:


test_models = [build_model(X, Y, Ws_optimal, bs_optimal) for ((X, Y), (_, _)) in test]


# ## Assess the model
# We assess the performance of this procedure on the test tasks. For this, we use the mean squared error as a performance metric.

# In[12]:


def mean_squared_error(y, y_pred):
    return np.mean((y - y_pred) ** 2)


# In[13]:


mean_squared_errors = []
for i, test_task in enumerate(test):
    (_, _), (Xs, F) = test_task
    pred = test_models[i].predict_f(Xs)
    plt.figure()
    plt.plot(Xs, pred[0], label='Predictions')
    plt.plot(Xs, F, label='Ground Truth')
    mse = mean_squared_error(F, pred[0])
    mean_squared_errors.append(mse)
    plt.title(f"Test Task {i + 1} | MSE = {mse}")
    plt.legend()


# In[14]:


mean_mse = np.mean(mean_squared_errors)
std_mse = np.std(mean_squared_errors) / np.sqrt(num_test_tasks)
print(f"The mean MSE over all {num_test_tasks} test tasks is {mean_mse} +/- {std_mse}")


# We achieve comparable results to those reported in the paper.
# 
# Note: We only use 20 metatasks and 5 test tasks for scalability, whereas the paper uses 1000 and 200 respectively. Hence, there might be some discrepancies in the results.

# ## References
# Fortuin, Vincent, and Gunnar Rätsch. "Deep Mean Functions for Meta-Learning in Gaussian Processes." arXiv preprint arXiv:1901.08098 (2019).