!pip install d2l==0.17.6 %matplotlib inline import torch from d2l import torch as d2l eta = 0.4 def f_2d(x1, x2): return 0.1 * x1 ** 2 + 2 * x2 ** 2 def gd_2d(x1, x2, s1, s2): return (x1 - eta * 0.2 * x1, x2 - eta * 4 * x2, 0, 0) d2l.show_trace_2d(f_2d, d2l.train_2d(gd_2d)) eta = 0.6 d2l.show_trace_2d(f_2d, d2l.train_2d(gd_2d)) def momentum_2d(x1, x2, v1, v2): v1 = beta * v1 + 0.2 * x1 v2 = beta * v2 + 4 * x2 return x1 - eta * v1, x2 - eta * v2, v1, v2 eta, beta = 0.6, 0.5 d2l.show_trace_2d(f_2d, d2l.train_2d(momentum_2d)) eta, beta = 0.6, 0.25 d2l.show_trace_2d(f_2d, d2l.train_2d(momentum_2d)) d2l.set_figsize() betas = [0.95, 0.9, 0.6, 0] for beta in betas: x = torch.arange(40).detach().numpy() d2l.plt.plot(x, beta ** x, label=f'beta = {beta:.2f}') d2l.plt.xlabel('time') d2l.plt.legend(); def init_momentum_states(feature_dim): v_w = torch.zeros((feature_dim, 1)) v_b = torch.zeros(1) return (v_w, v_b) def sgd_momentum(params, states, hyperparams): for p, v in zip(params, states): with torch.no_grad(): v[:] = hyperparams['momentum'] * v + p.grad p[:] -= hyperparams['lr'] * v p.grad.data.zero_() def train_momentum(lr, momentum, num_epochs=2): d2l.train_ch11(sgd_momentum, init_momentum_states(feature_dim), {'lr': lr, 'momentum': momentum}, data_iter, feature_dim, num_epochs) data_iter, feature_dim = d2l.get_data_ch11(batch_size=10) train_momentum(0.02, 0.5) train_momentum(0.01, 0.9) train_momentum(0.005, 0.9) trainer = torch.optim.SGD d2l.train_concise_ch11(trainer, {'lr': 0.005, 'momentum': 0.9}, data_iter) lambdas = [0.1, 1, 10, 19] eta = 0.1 d2l.set_figsize((6, 4)) for lam in lambdas: t = torch.arange(20).detach().numpy() d2l.plt.plot(t, (1 - eta * lam) ** t, label=f'lambda = {lam:.2f}') d2l.plt.xlabel('time') d2l.plt.legend();