#!/usr/bin/env python
# coding: utf-8

# # Building and training a multi-layer network with Keras

# In[ ]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import SGD

get_ipython().run_line_magic('matplotlib', 'inline')

# ## Classifying Iris versicolor
#
# Let us now try a slightly different problem: identifying the species *versicolor* instead of *setosa*.
#
# This is a more challenging problem, because the species *versicolor* is very close to the related species *virginica*, as shown in the data below.

# In[ ]:

# Load data
df = pd.read_csv('./data/versicolor/train.csv')
X = df[['petal length (cm)', 'petal width (cm)']].values
y = df['versicolor'].values

# In[ ]:

def plot_keras_model( model=None ):
    "Plot the predictions of a Keras model, along with the data"
    plt.clf()
    # Calculate the predicted probability on a mesh
    if model is not None:
        petal_width_mesh, petal_length_mesh = \
            np.meshgrid( np.linspace(0, 3, 100), np.linspace(0, 8, 100) )
        petal_width_mesh = petal_width_mesh.flatten()
        petal_length_mesh = petal_length_mesh.flatten()
        p = model.predict( np.stack( (petal_length_mesh, petal_width_mesh), axis=1 ) )
        p = p.reshape((100, 100))
        # Plot the probability on the mesh
        plt.imshow( p.T, extent=[0, 8, 0, 3], origin='lower',
                    vmin=0, vmax=1, cmap='RdBu', aspect='auto', alpha=0.7 )
    # Plot the data points
    plt.scatter( df['petal length (cm)'], df['petal width (cm)'],
                 c=df['versicolor'], cmap='RdBu' )
    plt.xlabel('petal length (cm)')
    plt.ylabel('petal width (cm)')
    cb = plt.colorbar()
    cb.set_label('versicolor')

plot_keras_model()

# ## Single-layer network
#
# Let us see how a single-layer neural network performs in this case. Here we build exactly the same kind of network as in the previous notebook.

# In[ ]:

# Build the model
single_layer_model = Sequential()
single_layer_model.add( Dense( units=1, input_dim=2 ) )
single_layer_model.add( Activation( 'sigmoid' ) )

# In[ ]:

# Prepare the model for training
single_layer_model.compile(loss='binary_crossentropy',
                           optimizer=SGD(lr=0.1),
                           metrics=['accuracy'])

# In[ ]:

# Train the network
single_layer_model.fit( X, y, batch_size=16, epochs=1000, verbose=0 )

# In[ ]:

plot_keras_model( model=single_layer_model )

# The network is unable to make the correct prediction, even after 1000 epochs of training.
#
# This is because, as we saw when tuning the weights by hand, a single-layer network can only produce a single linear boundary between two areas of the plane. For a more complicated model, we need several layers.

# ## Two-layer network
#
# A two-layer network looks like this:
#
# ![Two-layer network](./images/Two_layer.png)
#
# where the number of units in the intermediate layer (4 here) is a parameter that the user needs to choose.
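# Before building it, let us put a number on how poorly the single-layer network does. This is a minimal check using Keras's `evaluate` method, which returns the loss followed by the metrics passed to `compile` (here the accuracy); it is computed on the training data, since this notebook loads no separate test set.

# In[ ]:

# Evaluate the single-layer model on the training data, as a baseline
loss, accuracy = single_layer_model.evaluate( X, y, verbose=0 )
print( 'Single-layer model: loss = %.3f, accuracy = %.3f' % (loss, accuracy) )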
# In[ ]:

# Build the model: pick 8 units in the intermediate layer
two_layer_model = Sequential()
two_layer_model.add( Dense( input_dim=2, units=8 ) )
two_layer_model.add( Activation( 'sigmoid' ) )
two_layer_model.add( Dense( units=1 ) )
two_layer_model.add( Activation( 'sigmoid' ) )

# In[ ]:

# Compile the model
two_layer_model.compile(loss='binary_crossentropy',
                        optimizer=SGD(lr=0.1),
                        metrics=['accuracy'])

# In[ ]:

# Train it
two_layer_model.fit( X, y, batch_size=16, epochs=1000, verbose=0 )

# In[ ]:

plot_keras_model( model=two_layer_model )

# ## A few more remarks on Keras and neural networks
#
# Keras makes it possible to build and train a number of neural network architectures:
# - fully-connected networks
# - convolutional networks
# - recurrent networks
#
# The corresponding code is much less verbose with the Keras interface than directly with the TensorFlow interface (but also less flexible).
#
# Keras still requires the user to make many educated guesses:
# - Structure of the network (architecture, number of layers, number of nodes in each layer, activation functions)
# - Training parameters (loss function, optimizer and learning rate, batch size, number of epochs)
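# As an illustration of these choices, here is a sketch of a small helper that rebuilds and retrains the two-layer model with a configurable intermediate layer and training settings. The function name and default values are our own, not part of the original notebook; it simply repackages the build/compile/fit steps used above.

# In[ ]:

# Illustrative helper (hypothetical name and defaults): exposes the
# hyperparameters listed above as arguments, to make experimentation easy.
def build_and_train( hidden_units=8, activation='sigmoid',
                     learning_rate=0.1, batch_size=16, epochs=1000 ):
    "Build, compile and train a two-layer model with the given settings"
    model = Sequential()
    model.add( Dense( input_dim=2, units=hidden_units ) )
    model.add( Activation( activation ) )
    model.add( Dense( units=1 ) )
    model.add( Activation( 'sigmoid' ) )
    model.compile( loss='binary_crossentropy',
                   optimizer=SGD(lr=learning_rate),
                   metrics=['accuracy'] )
    model.fit( X, y, batch_size=batch_size, epochs=epochs, verbose=0 )
    return model

# For instance, try a wider intermediate layer with a tanh activation:
plot_keras_model( model=build_and_train( hidden_units=16, activation='tanh' ) )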