import gym
import torch

env = gym.make('CartPole-v0')
n_state = env.observation_space.shape[0]
n_action = env.action_space.n


def run_episode(env, weight):
    """Simulates an episode given the input weight and returns the total reward.

    Here, we convert the state array to a tensor of the float type because we need
    to compute the multiplication of the state and weight tensor,
    torch.matmul(state, weight), for linear mapping. The action with the higher
    value is selected using the torch.argmax() operation. And don't forget to take
    the value of the resulting action tensor using .item() because it is a
    one-element tensor.
    """
    state = env.reset()
    total_reward = 0
    is_done = False
    while not is_done:
        state = torch.from_numpy(state).float()
        action = torch.argmax(torch.matmul(state, weight))
        state, reward, is_done, _ = env.step(action.item())
        total_reward += reward
    return total_reward


n_episode = 1000
best_total_reward = 0
best_weight = None
total_rewards = []

# Now, we can run n_episode. For each episode, we do the following:
# - Randomly pick the weight
# - Let the agent take actions according to the linear mapping
# - An episode terminates and returns the total reward
# - Update the best total reward and the best weight if necessary
# - Also, keep a record of the total reward
for episode in range(n_episode):
    weight = torch.rand(n_state, n_action)
    total_reward = run_episode(env, weight)
    print('Episode {}: {}'.format(episode + 1, total_reward))
    if total_reward > best_total_reward:
        best_weight = weight
        best_total_reward = total_reward
    total_rewards.append(total_reward)

print('Average total reward over {} episodes: {}'.format(
    n_episode, sum(total_rewards) / n_episode))

import matplotlib.pyplot as plt
plt.plot(total_rewards)
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.show()

n_episode_eval = 100
total_rewards_eval = []
for episode in range(n_episode_eval):
    total_reward = run_episode(env, best_weight)
    print('Episode {}: {}'.format(episode + 1, total_reward))
    total_rewards_eval.append(total_reward)

print('Average total reward over {} episodes: {}'.format(
    n_episode_eval, sum(total_rewards_eval) / n_episode_eval))

n_episode = 1000
best_total_reward = 0
best_weight = None
total_rewards = []
for episode in range(n_episode):
    weight = torch.rand(n_state, n_action)
    total_reward = run_episode(env, weight)
    print('Episode {}: {}'.format(episode + 1, total_reward))
    if total_reward > best_total_reward:
        best_weight = weight
        best_total_reward = total_reward
    total_rewards.append(total_reward)
    if best_total_reward == 200:
        break

n_training = 1000
n_episode_training = []
for _ in range(n_training):
    for episode in range(n_episode):
        weight = torch.rand(n_state, n_action)
        total_reward = run_episode(env, weight)
        if total_reward == 200:
            n_episode_training.append(episode + 1)
            break

print('Expectation of training episodes needed: ',
      sum(n_episode_training) / n_training)
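# -----------------------------------------------------------------------------
# Side note (not part of the recipe above): a minimal sketch of what the linear
# mapping does for a single observation. The state values below are made up
# purely for illustration; in the recipe, the state comes from env.reset() and
# env.step() instead.
import torch

state = torch.tensor([0.03, -0.20, 0.05, 0.31])  # [cart pos, cart vel, pole angle, pole tip vel]
weight = torch.rand(4, 2)                         # maps 4 state values to 2 action scores

scores = torch.matmul(state, weight)              # one score per action
action = torch.argmax(scores).item()              # 0 = push left, 1 = push right
print('action scores:', scores, '-> action:', action)
# -----------------------------------------------------------------------------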
import torch
import gym

env = gym.make('CartPole-v0')
n_state = env.observation_space.shape[0]
n_action = env.action_space.n


def run_episode(env, weight):
    state = env.reset()
    total_reward = 0
    is_done = False
    while not is_done:
        state = torch.from_numpy(state).float()
        action = torch.argmax(torch.matmul(state, weight))
        state, reward, is_done, _ = env.step(action.item())
        total_reward += reward
    return total_reward


n_episode = 1000
best_weight = torch.rand(n_state, n_action)
best_total_reward = 0
total_rewards = []
noise_scale = 0.01

# After we randomly pick an initial weight, for each episode, we do the following:
# - Add random noise to the weight
# - Let the agent take actions according to the linear mapping
# - An episode terminates and returns the total reward
# - If the current reward is greater than the best one obtained so far,
#   update the best reward and the weight
# - Otherwise, the best reward and the weight remain unchanged
# - Also, keep a record of the total reward
for episode in range(n_episode):
    weight = best_weight + noise_scale * torch.rand(n_state, n_action)
    total_reward = run_episode(env, weight)
    if total_reward >= best_total_reward:
        best_total_reward = total_reward
        best_weight = weight
    total_rewards.append(total_reward)
    print('Episode {}: {}'.format(episode + 1, total_reward))

print('Average total reward over {} episodes: {}'.format(
    n_episode, sum(total_rewards) / n_episode))

best_weight = torch.rand(n_state, n_action)
noise_scale = 0.01
best_total_reward = 0
total_rewards = []
for episode in range(n_episode):
    weight = best_weight + noise_scale * torch.rand(n_state, n_action)
    total_reward = run_episode(env, weight)
    if total_reward >= best_total_reward:
        best_total_reward = total_reward
        best_weight = weight
        noise_scale = max(noise_scale / 2, 1e-4)
    else:
        noise_scale = min(noise_scale * 2, 2)
    print('Episode {}: {}'.format(episode + 1, total_reward))
    total_rewards.append(total_reward)

print('Average total reward over {} episodes: {}'.format(
    n_episode, sum(total_rewards) / n_episode))

import matplotlib.pyplot as plt
plt.plot(total_rewards)
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.show()

n_episode_eval = 100
total_rewards_eval = []
for episode in range(n_episode_eval):
    total_reward = run_episode(env, best_weight)
    print('Episode {}: {}'.format(episode + 1, total_reward))
    total_rewards_eval.append(total_reward)

print('Average total reward over {} episodes: {}'.format(
    n_episode_eval, sum(total_rewards_eval) / n_episode_eval))

best_weight = torch.rand(n_state, n_action)
noise_scale = 0.01
best_total_reward = 0
total_rewards = []
for episode in range(n_episode):
    weight = best_weight + noise_scale * torch.rand(n_state, n_action)
    total_reward = run_episode(env, weight)
    if total_reward >= best_total_reward:
        best_total_reward = total_reward
        best_weight = weight
        noise_scale = max(noise_scale / 2, 1e-4)
    else:
        noise_scale = min(noise_scale * 2, 2)
    print('Episode {}: {}'.format(episode + 1, total_reward))
    total_rewards.append(total_reward)
    if episode >= 99 and sum(total_rewards[-100:]) >= 19500:
        break
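# -----------------------------------------------------------------------------
# Side note (an illustrative helper, not from the recipe above): the early-stop
# condition sum(total_rewards[-100:]) >= 19500 is simply the commonly cited
# CartPole-v0 "solved" criterion -- an average reward of at least 195 over the
# last 100 episodes. The helper name is_solved is our own.
def is_solved(rewards, window=100, threshold=195.0):
    """Return True once the mean reward over the last `window` episodes
    reaches `threshold`."""
    if len(rewards) < window:
        return False
    return sum(rewards[-window:]) / window >= threshold

# The check inside the loop above could equivalently be written as:
#     if is_solved(total_rewards):
#         break
print(is_solved([200.0] * 100))   # True
print(is_solved([10.0] * 100))    # False
# -----------------------------------------------------------------------------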
import torch
import gym

env = gym.make('CartPole-v0')
n_state = env.observation_space.shape[0]
n_action = env.action_space.n


def run_episode(env, weight):
    """Simulates an episode given the input weight and returns the total reward
    and the gradients computed.

    More specifically, it does the following tasks in each step:
    - Calculates the probabilities, probs, for both actions based on the current
      state and input weight
    - Samples an action, action, based on the resulting probabilities
    - Computes the derivatives, d_softmax, of the softmax function with the
      probabilities as input
    - Divides the resulting derivatives, d_softmax, by the probabilities, probs,
      to get the derivatives, d_log, of the log term with respect to the policy
    - Applies the chain rule to compute the gradient, grad, of the weights
    - Records the resulting gradient, grad
    - Performs the action, accumulates the reward, and updates the state
    """
    state = env.reset()
    grads = []
    total_reward = 0
    is_done = False
    while not is_done:
        state = torch.from_numpy(state).float()
        z = torch.matmul(state, weight)
        probs = torch.nn.Softmax(dim=0)(z)
        action = int(torch.bernoulli(probs[1]).item())
        d_softmax = torch.diag(probs) - probs.view(-1, 1) * probs
        d_log = d_softmax[action] / probs[action]
        grad = state.view(-1, 1) * d_log
        grads.append(grad)
        state, reward, is_done, _ = env.step(action)
        total_reward += reward
        if is_done:
            break
    return total_reward, grads


n_episode = 1000
learning_rate = 0.001
total_rewards = []
weight = torch.rand(n_state, n_action)

for episode in range(n_episode):
    total_reward, gradients = run_episode(env, weight)
    print('Episode {}: {}'.format(episode + 1, total_reward))
    for i, gradient in enumerate(gradients):
        weight += learning_rate * gradient * (total_reward - i)
    total_rewards.append(total_reward)

print('Average total reward over {} episodes: {}'.format(
    n_episode, sum(total_rewards) / n_episode))

import matplotlib.pyplot as plt
plt.plot(total_rewards)
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.show()

n_episode_eval = 100
total_rewards_eval = []
for episode in range(n_episode_eval):
    total_reward, _ = run_episode(env, weight)
    print('Episode {}: {}'.format(episode + 1, total_reward))
    total_rewards_eval.append(total_reward)

print('Average total reward over {} episodes: {}'.format(
    n_episode_eval, sum(total_rewards_eval) / n_episode_eval))
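# -----------------------------------------------------------------------------
# Side note (an alternative sketch, not part of the recipe above): the manual
# softmax and log derivatives can also be obtained with PyTorch autograd via
# torch.distributions.Categorical. The function run_episode_autograd is our own
# name; it assumes the same env defined above and returns the same kind of
# (total_reward, gradients) pair as run_episode, so the training loop above can
# be reused with it unchanged.
import torch
from torch.distributions import Categorical

def run_episode_autograd(env, weight):
    state = env.reset()
    grads = []
    total_reward = 0
    is_done = False
    while not is_done:
        state = torch.from_numpy(state).float()
        w = weight.clone().requires_grad_()             # track gradients w.r.t. the weight
        probs = torch.nn.Softmax(dim=0)(torch.matmul(state, w))
        dist = Categorical(probs)
        action = dist.sample()
        dist.log_prob(action).backward()                # d log pi(action|state) / d weight
        grads.append(w.grad)
        state, reward, is_done, _ = env.step(action.item())
        total_reward += reward
    return total_reward, grads
# -----------------------------------------------------------------------------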