#!/usr/bin/env python
# coding: utf-8

# ***
# ***

# # Introduction to Gradient Descent

# # The Idea Behind Gradient Descent

# ***
# ***

# **How do you find the fastest way down a mountain?**
# - Suppose the mountain is covered in fog so thick that the path down cannot be seen;
# - suppose, too, that a fall cannot kill you!
#
# - You can only use information from your immediate surroundings to find a way down.
# - Taking your current position as the reference, find the steepest direction at that position and step downhill in that direction.

# **Gradient is the vector of partial derivatives**
#
# One approach to maximizing a function is to
# - pick a random starting point,
# - compute the gradient,
# - take a small step in the direction of the gradient, and
# - repeat with a new starting point.

# Finding the best model is an optimization problem: one that
# - “minimizes the error of the model”, or
# - “maximizes the likelihood of the data.”
#
# We’ll frequently need to maximize (or minimize) such functions, i.e.
# - to find the input vector v that produces the largest (or smallest) possible value.

# # Mathematics behind Gradient Descent
#
# A simple mathematical intuition behind one of the commonly used optimisation algorithms in Machine Learning.
#
# https://www.douban.com/note/713353797/

# The cost or loss function:
#
# $$Cost = \frac{1}{N} \sum_{i = 1}^N (Y' - Y)^2$$

# Parameters with small changes:
#
# $$ m_1 = m_0 - \delta m, \quad b_1 = b_0 - \delta b$$

# The cost function J is a function of m and b:
#
# $$J_{m, b} = \frac{1}{N} \sum_{i = 1}^N (Y' - Y)^2 = \frac{1}{N} \sum_{i = 1}^N Error_i^2$$
#
# Differentiating a single squared error term (dropping the sum and the $\frac{1}{N}$ for brevity):
#
# $$\frac{\partial J}{\partial m} = 2 \, Error \, \frac{\partial}{\partial m} Error$$
#
# $$\frac{\partial J}{\partial b} = 2 \, Error \, \frac{\partial}{\partial b} Error$$

# Let's fit the data with linear regression, so $Y' = mX + b$:
#
# $$\frac{\partial}{\partial m} Error = \frac{\partial}{\partial m}(Y' - Y) = \frac{\partial}{\partial m}(mX + b - Y)$$
#
# Since $X, b, Y$ are constant with respect to $m$:
#
# $$\frac{\partial}{\partial m} Error = X$$

# $$\frac{\partial}{\partial b} Error = \frac{\partial}{\partial b}(Y' - Y) = \frac{\partial}{\partial b}(mX + b - Y)$$
#
# Since $X, m, Y$ are constant with respect to $b$:
#
# $$\frac{\partial}{\partial b} Error = 1$$

# Thus:
#
# $$\frac{\partial J}{\partial m} = 2 \cdot Error \cdot X$$
#
# $$\frac{\partial J}{\partial b} = 2 \cdot Error$$

# Let's absorb the constant 2 and multiply by the learning rate $\alpha$, which determines how large a step to take:
#
# $$\delta m = Error \cdot X \cdot \alpha$$
#
# $$\delta b = Error \cdot \alpha$$
#
# Since $ m_1 = m_0 - \delta m, \; b_1 = b_0 - \delta b$:
#
# $$ m_1 = m_0 - Error \cdot X \cdot \alpha$$
#
# $$ b_1 = b_0 - Error \cdot \alpha$$
#
# **Notice** that the intercept b can be viewed as the coefficient for a constant feature X = 1. Thus, the above two equations are in essence the same.

# Let's represent the parameters as $\Theta$, the learning rate as $\alpha$, and the gradient as $\nabla J(\Theta)$; then we have:
#
# $$\Theta_1 = \Theta_0 - \alpha \nabla J(\Theta)$$

# Hence, to solve for the gradient, we iterate through our data points using our new $m$ and $b$ values and compute the partial derivatives.
#
# This new gradient tells us
# - the slope of our cost function at our current position, and
# - the direction we should move to update our parameters.
# - The size of our update is controlled by the learning rate; a single update step is sketched below.
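# A minimal sketch (added for illustration, not from the original notebook) of the update rule just
# derived, applied to a single data point; the helper name `update_step` is hypothetical:

# In[ ]:

def update_step(m0, b0, x, y, alpha):
    '''One gradient-descent update of y' = m*x + b on a single point (x, y).'''
    error = (m0 * x + b0) - y      # Error = Y' - Y
    m1 = m0 - alpha * error * x    # m_1 = m_0 - Error * X * alpha
    b1 = b0 - alpha * error       # b_1 = b_0 - Error * alpha
    return m1, b1

# e.g. one step from (m, b) = (0, 0) towards the point (2, 5)
print(update_step(0.0, 0.0, 2.0, 5.0, alpha=0.01))  # -> (0.1, 0.05)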
# In[17]:

import numpy as np

# Size of the points dataset.
m = 20

# Points x-coordinates with a dummy column of ones (x0, x1).
X0 = np.ones((m, 1))
X1 = np.arange(1, m + 1).reshape(m, 1)
X = np.hstack((X0, X1))

# Points y-coordinates.
y = np.array([3, 4, 5, 5, 2, 4, 7, 8, 11, 8,
              12, 11, 13, 13, 16, 17, 18, 17, 19, 21]).reshape(m, 1)

# The learning rate alpha.
alpha = 0.01


# In[18]:

def error_function(theta, X, y):
    '''Error function J definition.'''
    diff = np.dot(X, theta) - y
    return (1. / (2 * m)) * np.dot(np.transpose(diff), diff)

def gradient_function(theta, X, y):
    '''Gradient of the function J definition.'''
    diff = np.dot(X, theta) - y
    return (1. / m) * np.dot(np.transpose(X), diff)

def gradient_descent(X, y, alpha):
    '''Perform gradient descent until every gradient component is tiny.'''
    theta = np.array([1, 1]).reshape(2, 1)
    gradient = gradient_function(theta, X, y)
    while not np.all(np.absolute(gradient) <= 1e-5):
        theta = theta - alpha * gradient
        gradient = gradient_function(theta, X, y)
    return theta

# source: https://www.jianshu.com/p/c7e642877b0e


# In[23]:

optimal = gradient_descent(X, y, alpha)
print('Optimal parameters Theta:', optimal[0][0], optimal[1][0])
print('Error function:', error_function(optimal, X, y)[0, 0])

# # Estimating the Gradient

# If f is a function of one variable, its derivative at a point x measures how f(x) changes when we make a very small change to x.
#
# > It is defined as the limit of the difference quotients:
#
# $$f'(x) = \lim_{h \to 0} \frac{f(x + h) - f(x)}{h}$$
#
# The difference quotient is the change in the dependent variable divided by the change in the independent variable.

# In[2]:

def difference_quotient(f, x, h):
    return (f(x + h) - f(x)) / h

# For many functions it’s easy to exactly calculate derivatives.
#
# For example, the square function:
#
#     def square(x):
#         return x * x
#
# has the derivative:
#
#     def derivative(x):
#         return 2 * x

# In[3]:

def square(x):
    return x * x

def derivative(x):
    return 2 * x

derivative_estimate = lambda x: difference_quotient(square, x, h=0.00001)


# In[9]:

def sum_of_squares(v):
    """computes the sum of squared elements in v"""
    return sum(v_i ** 2 for v_i in v)


# In[12]:

# plot to show they're basically the same
import matplotlib.pyplot as plt

x = range(-10, 10)
plt.plot(x, list(map(derivative, x)), 'rx')           # red x
plt.plot(x, list(map(derivative_estimate, x)), 'b+')  # blue +
plt.show()

# When f is a function of many variables, it has multiple partial derivatives.

# In[5]:

def partial_difference_quotient(f, v, i, h):
    # add h to just the i-th element of v
    w = [v_j + (h if j == i else 0)
         for j, v_j in enumerate(v)]
    return (f(w) - f(v)) / h

def estimate_gradient(f, v, h=0.00001):
    return [partial_difference_quotient(f, v, i, h)
            for i, _ in enumerate(v)]

# # Using the Gradient

# In[6]:

def step(v, direction, step_size):
    """move step_size in the direction from v"""
    return [v_i + step_size * direction_i
            for v_i, direction_i in zip(v, direction)]

def sum_of_squares_gradient(v):
    return [2 * v_i for v_i in v]


# In[38]:

from collections import Counter
from linear_algebra import distance, vector_subtract, scalar_multiply
from functools import reduce
import math, random


# In[42]:

print("using the gradient")

# generate 3 random starting numbers
v = [random.randint(-10, 10) for i in range(3)]
print(v)

tolerance = 0.0000001
n = 0
while True:
    gradient = sum_of_squares_gradient(v)  # compute the gradient at v
    if n % 50 == 0:
        print(v, sum_of_squares(v))
    next_v = step(v, gradient, -0.01)      # take a negative gradient step
    if distance(next_v, v) < tolerance:    # stop if we're converging
        break
    v = next_v                             # continue if we're not
    n += 1

print("minimum v", v)
print("minimum value", sum_of_squares(v))
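# As a quick check (a sketch added here, not part of the original notebook), the numerically estimated
# gradient from `estimate_gradient` should closely match the analytic `sum_of_squares_gradient`;
# the name `v_check` is illustrative only:

# In[ ]:

v_check = [1.0, -2.0, 3.0]
print(sum_of_squares_gradient(v_check))            # exact:     [2.0, -4.0, 6.0]
print(estimate_gradient(sum_of_squares, v_check))  # estimated: approximately the same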
# # Choosing the Right Step Size

# Although the rationale for moving against the gradient is clear,
# - how far to move is not.
# - Indeed, choosing the right step size is more of an art than a science.
#
# Methods:
# 1. Using a fixed step size
# 2. Gradually shrinking the step size over time
# 3. At each step, choosing the step size that minimizes the value of the objective function

# In[13]:

step_sizes = [100, 10, 1, 0.1, 0.01, 0.001, 0.0001, 0.00001]

# It is possible that certain step sizes will result in invalid inputs for our function.
#
# So we’ll need to create a “safe apply” function that
# - returns infinity for invalid inputs
# - (infinity should never be the minimum of anything).

# In[14]:

def safe(f):
    """define a new function that wraps f and return it"""
    def safe_f(*args, **kwargs):
        try:
            return f(*args, **kwargs)
        except:
            return float('inf')  # this means "infinity" in Python
    return safe_f

# # Putting It All Together

# We need two ingredients:
# - a **target_fn** that we want to minimize, and
# - its **gradient_fn**.
#
# For example, the target_fn could represent the errors in a model as a function of its parameters,
# and we choose a starting value `theta_0` for the parameters.

# In[15]:

def minimize_batch(target_fn, gradient_fn, theta_0, tolerance=0.000001):
    """use gradient descent to find theta that minimizes target function"""
    step_sizes = [100, 10, 1, 0.1, 0.01, 0.001, 0.0001, 0.00001]

    theta = theta_0               # set theta to initial value
    target_fn = safe(target_fn)   # safe version of target_fn
    value = target_fn(theta)      # value we're minimizing

    while True:
        gradient = gradient_fn(theta)
        next_thetas = [step(theta, gradient, -step_size)
                       for step_size in step_sizes]
        # choose the one that minimizes the error function
        next_theta = min(next_thetas, key=target_fn)
        next_value = target_fn(next_theta)

        # stop if we're "converging"
        if abs(value - next_value) < tolerance:
            return theta
        else:
            theta, value = next_theta, next_value


# In[16]:

# minimize_batch
v = [random.randint(-10, 10) for i in range(3)]
v = minimize_batch(sum_of_squares, sum_of_squares_gradient, v)
print("minimum v", v)
print("minimum value", sum_of_squares(v))

# Sometimes we’ll instead want to maximize a function, which we can do by minimizing its negative.

# In[19]:

def negate(f):
    """return a function that for any input x returns -f(x)"""
    return lambda *args, **kwargs: -f(*args, **kwargs)

def negate_all(f):
    """the same when f returns a list of numbers"""
    return lambda *args, **kwargs: [-y for y in f(*args, **kwargs)]

def maximize_batch(target_fn, gradient_fn, theta_0, tolerance=0.000001):
    return minimize_batch(negate(target_fn),
                          negate_all(gradient_fn),
                          theta_0,
                          tolerance)
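# For symmetry with the minimization example above, here is a small usage sketch (an addition, not from
# the original notebook): maximizing $-\|v\|^2$, whose maximum is at the origin. The helper names
# `neg_sum_of_squares` and `neg_sum_of_squares_gradient` are hypothetical:

# In[ ]:

def neg_sum_of_squares(v):
    return -sum_of_squares(v)

def neg_sum_of_squares_gradient(v):
    return [-2 * v_i for v_i in v]

w = maximize_batch(neg_sum_of_squares, neg_sum_of_squares_gradient,
                   [random.randint(-10, 10) for i in range(3)])
print("maximum w", w)                           # should be close to [0, 0, 0]
print("maximum value", neg_sum_of_squares(w))   # should be close to 0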
# Using the batch approach, each gradient step requires us to make a prediction and compute the gradient for the whole data set, which makes each step take a long time.

# Usually, though, error functions are additive:
# - the predictive error on the whole data set is simply the sum of the predictive errors for each data point.
#
# When this is the case, we can instead apply a technique called **stochastic gradient descent**,
# - which computes the gradient (and takes a step) for only one point at a time.
# - It cycles over our data repeatedly until it reaches a stopping point.

# # Stochastic Gradient Descent

# During each cycle, we’ll want to iterate through our data in a random order:

# In[20]:

def in_random_order(data):
    """generator that returns the elements of data in random order"""
    indexes = [i for i, _ in enumerate(data)]  # create a list of indexes
    random.shuffle(indexes)                    # shuffle them
    for i in indexes:                          # return the data in that order
        yield data[i]

# This approach avoids circling around near a minimum forever:
# - whenever we stop getting improvements we’ll decrease the step size and eventually quit.

# In[25]:

def minimize_stochastic(target_fn, gradient_fn, x, y, theta_0, alpha_0=0.01):

    data = list(zip(x, y))
    theta = theta_0                            # initial guess
    alpha = alpha_0                            # initial step size
    min_theta, min_value = None, float("inf")  # the minimum so far
    iterations_with_no_improvement = 0

    # if we ever go 100 iterations with no improvement, stop
    while iterations_with_no_improvement < 100:
        value = sum(target_fn(x_i, y_i, theta) for x_i, y_i in data)

        if value < min_value:
            # if we've found a new minimum, remember it
            # and go back to the original step size
            min_theta, min_value = theta, value
            iterations_with_no_improvement = 0
            alpha = alpha_0
        else:
            # otherwise we're not improving, so try shrinking the step size
            iterations_with_no_improvement += 1
            alpha *= 0.9

        # and take a gradient step for each of the data points
        for x_i, y_i in in_random_order(data):
            gradient_i = gradient_fn(x_i, y_i, theta)
            theta = vector_subtract(theta, scalar_multiply(alpha, gradient_i))

    return min_theta


# In[26]:

def maximize_stochastic(target_fn, gradient_fn, x, y, theta_0, alpha_0=0.01):
    return minimize_stochastic(negate(target_fn),
                               negate_all(gradient_fn),
                               x, y, theta_0, alpha_0)


# In[ ]:

print("using minimize_stochastic")

# note: sum_of_squares takes a single vector, so it doesn't fit minimize_stochastic's
# per-point (x_i, y_i, theta) signature; use a per-point squared error instead,
# here for the one-parameter model y' = theta[0] * x
def squared_error(x_i, y_i, theta):
    return (y_i - theta[0] * x_i) ** 2

def squared_error_gradient(x_i, y_i, theta):
    return [-2 * x_i * (y_i - theta[0] * x_i)]

x = list(range(101))
y = [3 * x_i + random.randint(-10, 20) for x_i in x]
theta_0 = [random.randint(-10, 10)]

# small step size, because x ranges up to 100
theta = minimize_stochastic(squared_error, squared_error_gradient,
                            x, y, theta_0, alpha_0=0.00001)
print("theta", theta)  # the fitted slope should be close to 3

# Scikit-learn has a Stochastic Gradient Descent module: http://scikit-learn.org/stable/modules/sgd.html
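# A minimal sketch (added here for illustration, assuming scikit-learn is installed) of fitting the
# same noisy line with scikit-learn's `SGDRegressor`. Its docs recommend standardizing features first,
# so we wrap it in a pipeline with `StandardScaler`:

# In[ ]:

import numpy as np
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

X_sk = np.array(x).reshape(-1, 1)  # sklearn expects a 2-D feature matrix
model = make_pipeline(StandardScaler(), SGDRegressor(max_iter=1000, tol=1e-3))
model.fit(X_sk, y)
print("R^2 on the training data:", model.score(X_sk, y))  # should be close to 1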