#!/usr/bin/env python
# coding: utf-8

# In[1]:

# %load /Users/facai/Study/book_notes/preconfig.py
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)
sns.set(font='SimHei', font_scale=2.5)
plt.rcParams['axes.grid'] = False

#import numpy as np
#import pandas as pd
#pd.options.display.max_rows = 20
#import sklearn
#import itertools

#import logging
#logger = logging.getLogger()

#from IPython.display import SVG

def show_image(filename, figsize=None, res_dir=True):
    if figsize:
        plt.figure(figsize=figsize)

    if res_dir:
        filename = './res/{}'.format(filename)

    plt.imshow(plt.imread(filename))


# 10 Sequence Modeling: Recurrent and Recursive Nets
# ==============
#
# RNN: to process sequential data.
#
# traditional fully connected feedforward network: separate parameters at each position.
# => recurrent neural network: shares the same parameters across several time steps.

# ### 10.1 Unfolding Computational Graphs
#
# recurrent neural networks: $h^{(t)} = f(h^{(t-1)}, x^{(t)}; \theta)$

# In[2]:

show_image('fig10_2.png', figsize=(12, 5))


# two major advantages:
# + same input size even for a variable-length sequence.
# + use the **same** transition function $f$ with the same parameters at every time step.
#
# unfolded graph: computing gradients

# ### 10.2 Recurrent Neural Networks
#
# Different recurrent networks:

# In[6]:

# A.
show_image('fig10_3.png', figsize=(10, 8))


# \begin{align}
# a^{(t)} &= b + W h^{(t-1)} + U x^{(t)} \\
# h^{(t)} &= \tanh(a^{(t)}) \\
# o^{(t)} &= c + V h^{(t)} \\
# \hat{y}^{(t)} &= \operatorname{softmax}(o^{(t)}) \\
# \end{align}
#
# The total loss for a given sequence of $x$ values paired with a sequence of $y$ values is just the sum of the losses over all the time steps:
#
# \begin{align}
# &L \left ( \{x^{(1)}, \cdots, x^{(\tau)}\}, \{y^{(1)}, \cdots, y^{(\tau)}\} \right ) \\
# &= \sum_t L^{(t)} \\
# &= - \sum_t \log p_{\text{model}} \left ( y^{(t)} \, | \, \{x^{(1)}, \cdots, x^{(t)}\} \right ) \\
# \end{align}
#
# So the back-propagation algorithm needs $O(\tau)$ running time, moving right to left through the unfolded graph, and $O(\tau)$ memory to store the intermediate states. => back-propagation through time (BPTT): **powerful but also expensive to train**

# In[2]:

# B.
show_image('fig10_4.png', figsize=(10, 8))


# **advantage** of eliminating hidden-to-hidden recurrence: decouples all the time steps during training (the prediction from step $t-1$ is replaced by its ground truth from the training sample)
# => teacher forcing

# In[4]:

show_image('fig10_6.png', figsize=(10, 8))


# **disadvantage**: *open-loop* network (outputs fed back as input) => inputs are quite different between training and testing.
#
# + mitigate the problem:
#     1. train with both teacher-forced inputs and free-running inputs.
#     2. randomly choose to use generated values or actual data values as input.

# In[7]:

# C.
show_image('fig10_5.png', figsize=(10, 8))


# #### 10.2.2 Computing the Gradient in a Recurrent Neural Network
#
# 1. calculate the gradients on the internal nodes.
# 2. calculate the gradients on the parameter nodes.
#
# Because the parameters are shared across many time steps, we introduce dummy variables $W^{(t)}$ that are defined to be copies of $W$ but with each $W^{(t)}$ used only at time step $t$.

# In[8]:

show_image("formula_gradient.png", figsize=(12, 8))
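# A minimal NumPy sketch of BPTT for the vanilla RNN above (my own illustration, not
# code from the book; the function name `rnn_forward_backward` and the shapes are
# assumptions): a forward pass through the recurrence, then gradients accumulated from
# $t = \tau$ back to $t = 1$ in $O(\tau)$ time and memory.

# In[ ]:

import numpy as np

def rnn_forward_backward(x, y, U, W, V, b, c, h0):
    """Vanilla RNN (softmax output, cross-entropy loss) and its BPTT gradients.

    x: list of tau input vectors, y: list of tau target class indices.
    """
    tau = len(x)
    h, yhat, loss = {0: h0}, {}, 0.0
    # forward pass: a^(t) = b + W h^(t-1) + U x^(t), h^(t) = tanh(a^(t)), o^(t) = c + V h^(t)
    for t in range(1, tau + 1):
        h[t] = np.tanh(b + W @ h[t - 1] + U @ x[t - 1])
        o = c + V @ h[t]
        e = np.exp(o - o.max())
        yhat[t] = e / e.sum()                 # softmax(o^(t))
        loss -= np.log(yhat[t][y[t - 1]])     # -log p_model(y^(t) | x^(1..t))

    # backward pass: back-propagation through time
    dU, dW, dV = np.zeros_like(U), np.zeros_like(W), np.zeros_like(V)
    db, dc = np.zeros_like(b), np.zeros_like(c)
    dh_next = np.zeros_like(h0)               # gradient flowing back from step t+1
    for t in range(tau, 0, -1):
        do = yhat[t].copy()
        do[y[t - 1]] -= 1.0                   # dL/do^(t) for softmax + cross-entropy
        dV += np.outer(do, h[t])
        dc += do
        dh = V.T @ do + dh_next               # output path + recurrent path
        da = (1.0 - h[t] ** 2) * dh           # back through tanh
        db += da
        dW += np.outer(da, h[t - 1])          # gradients of the "dummy copies" W^(t) summed
        dU += np.outer(da, x[t - 1])
        dh_next = W.T @ da
    return loss, (dU, dW, dV, db, dc)

# toy usage
rng = np.random.default_rng(0)
n_in, n_h, n_out = 4, 8, 3
U, W, V, b, c = [rng.normal(scale=0.1, size=s)
                 for s in [(n_h, n_in), (n_h, n_h), (n_out, n_h), (n_h,), (n_out,)]]
xs = [rng.normal(size=n_in) for _ in range(5)]
ys = [int(rng.integers(n_out)) for _ in range(5)]
loss, grads = rnn_forward_backward(xs, ys, U, W, V, b, c, h0=np.zeros(n_h))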
# #### 10.2.3 Recurrent Networks as Directed Graphical Models
#
# Mean squared error: the cross-entropy loss associated with an output distribution that is a unit Gaussian.

# In[9]:

show_image('fig10_7.png', figsize=(10, 8))


# RNNs obtain the same full connectivity as above but with an efficient parametrization, as illustrated below:
#
# + Incorporating the $h^{(t)}$ nodes in the graphical model decouples the past and the future.
# + A variable $y^{(i)}$ in the distant past may still influence $y^{(t)}$ via its effect on $h$.
# + Assume the same conditional probability distributions at each time step. $\gets$ parameter sharing

# In[10]:

show_image('fig10_8.png', figsize=(10, 8))


# determining the length of the sequence:
# + add a special end-of-sequence symbol to the training samples.
# + introduce an extra Bernoulli output: the decision to either continue generation or halt at each time step.
# + add an extra output that predicts the length $\tau$.

# #### 10.2.4 Modeling Sequences Conditioned on Context with RNNs

# In[13]:

# fixed-length vector x, shared weights R
show_image('fig10_9.png', figsize=(10, 8))


# In[14]:

# variable-length sequence
show_image('fig10_10.png', figsize=(10, 8))


# restriction: the length of both sequences $x$ and $y$ must be the same.

# ### 10.3 Bidirectional RNNs
#
# For speech recognition or handwriting recognition, the prediction depends on the **whole** input sequence => bidirectional

# In[8]:

show_image("fig10_11.png", figsize=(10, 12))


# ### 10.4 Encoder-Decoder Sequence-to-Sequence Architectures

# In[7]:

show_image("fig10_12.png", figsize=(10, 8))


# $C$: context, a vector or sequence of vectors that summarizes the input sequence $X$.
#
# limitation: if $C$ is too small, it cannot properly summarize a long input sequence.

# ### 10.5 Deep Recurrent Networks

# In[11]:

show_image("fig10_13.png", figsize=(12, 15))


# ### 10.6 Recursive Neural Networks
#
# | | structure | depth |
# |----|----|----|
# | RNN | chain-like | $\tau$ |
# | recursive neural network | tree | $\log \tau$ |
#
# Open question: how to best structure the tree?

# In[13]:

show_image("fig10_14.png", figsize=(5, 12))


# ### 10.7 The Challenge of Long-Term Dependencies
#
# basic problem: gradients vanish / explode over many stages.
#
# \begin{align}
# h^{(t)} &= (W^t)^T h^{(0)} \\
# W &= Q \Lambda Q^T \\
# h^{(t)} &= Q^T \Lambda^t Q h^{(0)}
# \end{align}
#
# In $\Lambda^t$, eigenvalues with magnitude less than $1$ decay to $0$, while those greater than $1$ explode; the largest eigenvalue dominates.
#
# => Any component of $h^{(0)}$ that is not aligned with the largest eigenvector is eventually discarded.
#
# The problem is particular to RNNs <= they share the same weights at each time step.
#
# The remaining sections describe approaches to overcoming the problem:

# ### 10.8 Echo State Networks
#
# Set the input and recurrent weights such that the dynamical system is near the edge of stability, and ONLY learn the output weights.

# ### 10.9 Leaky Units and Other Strategies for Multiple Time Scales
#
# design a model that operates at multiple time scales => handle both the near past and the distant past => mitigate long-term dependencies
#
# + Adding Skip Connections through Time
# + Leaky Units and a Spectrum of Different Time Scales: $u^{(t)} = \alpha u^{(t-1)} + (1 - \alpha) v^{(t)}$ (see the sketch after this list)
# + Removing Connections
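# A tiny NumPy sketch of the leaky-unit update above (my own illustration, not code
# from the book; the function name `leaky_unit` is made up): a linear self-connection
# with weight $\alpha$ near one remembers the distant past, while $\alpha$ near zero
# tracks only the recent input.

# In[ ]:

import numpy as np

def leaky_unit(v_seq, alpha):
    """Return the states u^(t) = alpha * u^(t-1) + (1 - alpha) * v^(t)."""
    u = np.zeros_like(v_seq[0], dtype=float)
    states = []
    for v in v_seq:
        u = alpha * u + (1.0 - alpha) * v
        states.append(u)
    return states

# an impulse followed by zeros: the slow unit keeps the information much longer
impulse = [np.array([1.0])] + [np.array([0.0])] * 9
slow = leaky_unit(impulse, alpha=0.95)   # long time constant
fast = leaky_unit(impulse, alpha=0.5)    # short time constant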
# ### 10.10 The Long Short-Term Memory and Other Gated RNNs
#
# creating paths through time:
# + accumulate information over a long duration.
# + forget the old state when it is no longer needed.

# # LSTM
#
# [Understanding LSTM Networks](http://colah.github.io/posts/2015-08-Understanding-LSTMs/)

# In[3]:

show_image("fig10_16.png", figsize=(10, 15))


# GRU: gated recurrent units
#
# a single gating unit simultaneously controls the forgetting factor and the decision to update the state unit:
#
# \begin{equation}
# h_i^{(t)} = u_i^{(t-1)} h_i^{(t-1)} + (1 - u_i^{(t-1)}) \sigma \left ( b_i + \sum_j U_{i, j} x_j^{(t-1)} + \sum_j W_{i, j} r_j^{(t-1)} h_j^{(t-1)} \right )
# \end{equation}

# ### 10.11 Optimization for Long-Term Dependencies
#
# + second-order optimization: second derivatives may vanish at the same time that first derivatives vanish.
# + clipping gradients before updating <= exploding gradients:
#     1. clip the parameter gradient element-wise.
#     2. clip the norm $\|g\|$ of the gradient $g$.
# + regularizing to encourage information flow <= vanishing gradients.

# ### 10.12 Explicit Memory

# In[6]:

show_image("fig10_18.png", figsize=(8, 12))
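# A small NumPy sketch of the content-based soft read used by memory networks and
# neural Turing machines (cf. the explicit-memory figure above): a softmax over the
# similarity between a query key and each memory cell, followed by a weighted average
# of the cells. My own illustration; the function name `soft_read` and the shapes are
# assumptions.

# In[ ]:

import numpy as np

def soft_read(memory, key, beta=1.0):
    """Read from an explicit memory by content-based addressing.

    memory: (n_cells, cell_dim) matrix of stored vectors.
    key:    (cell_dim,) query emitted by the controller network.
    beta:   sharpness of the attention; larger => closer to a hard lookup.
    """
    # cosine similarity between the key and every memory cell
    sim = memory @ key / (np.linalg.norm(memory, axis=1) * np.linalg.norm(key) + 1e-8)
    w = np.exp(beta * (sim - sim.max()))
    w /= w.sum()                  # soft attention weights over the cells
    return w @ memory             # differentiable weighted average of the cells

memory = np.random.default_rng(0).normal(size=(5, 4))
value = soft_read(memory, key=memory[2], beta=10.0)   # with a sharp beta, value ~ memory[2]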