#!/usr/bin/env python
# coding: utf-8

# In this exercise you will:
# 1. Write your function $f(x, y)$
# 2. Use gradient descent with and without momentum
# 3. Using your implementation, optimize $f(x, y)$ w.r.t. the vector $[x, y]^T$.
# 4. Redo steps 1 and 3 to optimize a different function
# 5. Write a short description of the sensitivity of the two algorithms to their hyperparameter values
#
# After that, you're done.
# If you want more, you can try one of the following stretch goals:
# 1. In practice our gradient updates are often _noisy_ (see the [stochastic GD algorithm](https://en.wikipedia.org/wiki/Stochastic_gradient_descent)). Add a small amount of noise to the gradient update. If `g=autograd.grad(f)(xy)`, this would look like `g+=.2*np.random.randn(*g.shape)` within your `get_gd_delta` and `get_gd_mom_delta` functions. How does the performance of GD and GD-with-momentum respond when gradients are noisy?
# 2. Read the documentation of [`scipy.optimize.check_grad`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.check_grad.html). Use this method to write a unit test of `autograd.grad(f)` and test it out for some non-trivial functions.
# 3. Read section 2 of [the Adam paper](https://arxiv.org/pdf/1412.6980.pdf). Implement Adam from scratch by following algorithm 1 or translate an existing online implementation or tutorial into `autograd`.
# # In[1]:

import autograd.numpy as np
import autograd
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
np.set_printoptions(precision=3)
import time


# helper function for plotting isocontours
def plot_isocontours(x_points, y_points, g):
    """
    first makes a 2d grid from the 1d x/y points,
    then plots isocontours of the scalar function g over that grid

    g takes a length-2 array [x, y] and returns a scalar
    returns the (fig, ax) pair so callers can draw on top of the contours
    """
    X, Y = np.meshgrid(x_points, y_points)  # build 2d grid
    Z = np.zeros_like(X)
    # evaluate g at every grid point; np.ndindex walks the same (row, col)
    # pairs as nested range(len(X)) / range(len(X.T)) loops would
    for i, j in np.ndindex(Z.shape):
        Z[i, j] = g(np.array((X[i, j], Y[i, j])))  # compute function values
    fig, ax = plt.subplots()
    im = ax.contour(X, Y, Z, 100)
    plt.colorbar(im, ax=ax)
    return fig, ax


# # 1a. Write your own function f(x, y) using the template below.
# x and y are represented by the 2d vector `xy`

# In[2]:

# write forward function
def f(xy):
    x, y = xy[0], xy[1]
    # write your own function by changing the expression below
    output = x**2 + 2.*y**2  # an easy one
    # output = x**2 + 2.*y**2 + 2.*x*y  # a weird one
    # output = np.log(x**2 + 0.25*y**2) + 0.01*x**2*y**2  # ???
    # write your own function by changing the expression above
    # remember that only convex functions have well-defined and unique global minima
    return output

# differentiate f(x) w.r.t. x
df = autograd.grad(f)


# # 1b. plot isocontours of $f(x, y)$ and $||\nabla_{x, y} f(x, y)||_2$

# In[3]:

x_points = np.linspace(-5, 5, 10)  # optionally change the plot limits here
y_points = np.linspace(-5, 5, 10)
df_norm = lambda xy: np.sqrt(np.sum(np.square(df(xy))))  # Euclidean norm of the gradient
fig, ax = plot_isocontours(x_points, y_points, f)
ax.set_title('f(x, y)')
fig, ax = plot_isocontours(x_points, y_points, df_norm)
ax.set_title('norm(df/dxy f(x, y))')
plt.show()


# # 2. 
# Implement the updates for gradient descent with and without momentum
# using the code template provided

# In[4]:

def get_gd_delta(xy, f, learning_rate):
    """
    gradient descent update
    xy <-- xy + delta
    delta depends on the current position xy, the function f, and the learning rate
    """
    # plain gradient descent: step against the gradient of f at the current xy
    grad = autograd.grad(f)(xy)
    delta = -learning_rate*grad
    return delta

def get_gd_mom_delta(xy, f, learning_rate, prev_delta, alpha):
    """
    gradient descent with momentum update
    xy <-- xy + delta
    delta depends on the current position xy, the function f, the learning rate,
    the previous delta, and the momentum coefficient alpha
    """
    # momentum: keep an alpha-decayed fraction of the previous step
    # and add a fresh (negative) gradient step
    grad = autograd.grad(f)(xy)
    delta = alpha*prev_delta - learning_rate*grad
    return delta


# # 2 use your implementation to optimize f

# In[5]:

fig, ax = plot_isocontours(x_points, y_points, f)  # plot function isocontours

# hyperparameters
#INITIAL_VAL = np.random.randn(2)
INITIAL_VAL = np.array([4., 1.])
USE_MOMENTUM = True
LEARNING_RATE = 0.3
ALPHA = 0.8
N_ITER = 10  # see if you can reach the function minimum in as few iterations as possible

# initialize
xy = np.copy(INITIAL_VAL)
delta = np.zeros(2)
ax.plot(*xy, color='r', marker='.', ms=25)  # plot initial values

from IPython import display
for i in range(N_ITER):
    # input("Press Enter to continue...")  # optional; Enter key triggers next update
    # time.sleep(0.5)  # optional; slow down animation if the flickering is distracting
    xy_old = np.copy(xy)
    g = df(xy)  # compute standard gradient (kept for inspection / noisy-gradient stretch goal)
    if USE_MOMENTUM:
        delta = get_gd_mom_delta(xy, f, LEARNING_RATE, delta, ALPHA)
    else:
        delta = get_gd_delta(xy, f, LEARNING_RATE)
    xy += delta  # update position
    # plot
    ax.plot([xy_old[0], xy[0]], [xy_old[1], xy[1]], '-k', lw=2)  # plot a line connecting old and new param values
    fig.canvas.draw()
    ax.set_title('i={}, x,y={}, f(x,y)={:.3f}'.format(i, xy, f(xy)))
    display.display(fig)
    display.clear_output(wait=True)

print('final value f(x, y)={:.3f} at (x, y)=({:.3f}, {:.3f})'.format(f(xy), xy[0], xy[1]))


# # 4. repeat steps 1 and 3 with a new function!

# # 5. Briefly describe what you observed.
# ## How did the optimization change as the values of the hyperparameters (`INITIAL_VAL`, `USE_MOMENTUM`, `LEARNING_RATE`, `ALPHA`, `N_ITER`) were adjusted?
#
# Double click to write your response here...

# In[ ]: