!pip install d2l==1.0.3
!pip install "setuptools==66"
!pip install "wheel==0.38.4"
!pip install "gym==0.21.0"

%matplotlib inline
import random
import numpy as np
from d2l import torch as d2l

seed = 0        # Random number generator seed
gamma = 0.95    # Discount factor
num_iters = 10  # Number of iterations
random.seed(seed)  # Set the random seed to ensure results can be reproduced
np.random.seed(seed)

# Now set up the environment
env_info = d2l.make_env('FrozenLake-v1', seed=seed)

def value_iteration(env_info, gamma, num_iters):
    env_desc = env_info['desc']  # 2D array specifying what each grid cell means
    prob_idx = env_info['trans_prob_idx']      # Index of p(s' | s, a) in each transition tuple
    nextstate_idx = env_info['nextstate_idx']  # Index of the next state
    reward_idx = env_info['reward_idx']        # Index of the reward
    num_states = env_info['num_states']
    num_actions = env_info['num_actions']
    mdp = env_info['mdp']

    # V[k, s] and Q[k, s, a] are the value estimates after k iterations;
    # pi[k, s] is the greedy action with respect to Q[k, s, :]
    V = np.zeros((num_iters + 1, num_states))
    Q = np.zeros((num_iters + 1, num_states, num_actions))
    pi = np.zeros((num_iters + 1, num_states))

    for k in range(1, num_iters + 1):
        for s in range(num_states):
            for a in range(num_actions):
                # Calculate \sum_{s'} p(s' \mid s, a) [r + \gamma V_{k-1}(s')]
                for pxrds in mdp[(s, a)]:
                    # mdp[(s, a)]: [(p1, next1, r1, d1), (p2, next2, r2, d2), ...]
                    pr = pxrds[prob_idx]              # p(s' \mid s, a)
                    nextstate = pxrds[nextstate_idx]  # Next state
                    reward = pxrds[reward_idx]        # Reward
                    Q[k, s, a] += pr * (reward + gamma * V[k - 1, nextstate])
            # Record the max value and the greedy (argmax) action
            V[k, s] = np.max(Q[k, s, :])
            pi[k, s] = np.argmax(Q[k, s, :])
    d2l.show_value_function_progress(env_desc, V[:-1], pi[:-1])

value_iteration(env_info=env_info, gamma=gamma, num_iters=num_iters)
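
# To see the Bellman optimality backup in isolation, here is a minimal,
# self-contained sketch on a hypothetical two-state MDP (its states,
# transition probabilities, and rewards are made up for illustration).
# It reuses the same (probability, next state, reward, done) tuple format
# as env_info['mdp'] above, and prints the largest change between successive
# value functions, which shrinks geometrically because the backup is a
# gamma-contraction.

toy_mdp = {
    (0, 0): [(1.0, 0, 0.0, False)],                       # stay in state 0
    (0, 1): [(0.8, 1, 1.0, True), (0.2, 0, 0.0, False)],  # reach the goal w.p. 0.8
    (1, 0): [(1.0, 1, 0.0, True)],                        # the goal state is absorbing
    (1, 1): [(1.0, 1, 0.0, True)],
}
toy_V = np.zeros(2)
for k in range(1, num_iters + 1):
    toy_Q = np.zeros((2, 2))
    for (s, a), transitions in toy_mdp.items():
        for p, s_next, r, _ in transitions:
            toy_Q[s, a] += p * (r + gamma * toy_V[s_next])
    toy_V_new = toy_Q.max(axis=1)
    print(f'Iteration {k}: max |V_k - V_(k-1)| = {np.abs(toy_V_new - toy_V).max():.5f}')
    toy_V = toy_V_new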