#!/usr/bin/env python
# coding: utf-8

# # More Tic-Tac-Toe and a Simple Robot Arm

# For this assignment, you will use the reinforcement learning algorithm Q-learning with a neural network to approximate the Q function. You will apply this to the game Tic-Tac-Toe and to the control of a simple robot arm.
#
# Most of the code is provided. You are asked to make specific modifications and find parameter values that result in good performance on these tasks. The two tasks will probably require different parameter values.

# Download necessary code from [ttt_arm.zip](https://www.cs.colostate.edu/~anderson/cs545/notebooks/ttt_arm.zip).

# In[1]:


get_ipython().run_line_magic('load_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, clear_output
import pandas as pd
import pickle


# ## Tic Tac Toe

# In[2]:


import tictactoe


# In[32]:


class Game:

    def __init__(self, environment, agents):
        self.env = environment
        self.agents = agents

    def train(self, parms, verbose=True):

        n_batches = parms['n_batches']
        n_games_per_batch = parms['n_games_per_batch']
        n_epochs = parms['n_epochs']
        method = parms['method']
        learning_rate = parms['learning_rate']
        epsilon = parms['initial_epsilon']
        final_epsilon = parms['final_epsilon']

        ttt = self.env
        agents = self.agents

        # Decay factor chosen so that epsilon reaches final_epsilon after n_batches batches.
        epsilon_decay = np.exp((np.log(final_epsilon) - np.log(epsilon)) / n_batches)
        epsilon_trace = []
        outcomes = []

        for batch in range(n_batches):

            agents['X'].clear_samples()
            agents['O'].clear_samples()

            for gamei in range(n_games_per_batch):

                ttt.initialize()
                done = False

                while not done:

                    agent = agents[ttt.player]
                    obs = ttt.observe()

                    if len(self.env.valid_actions()) == 9:
                        # First move of the game: always pick a random action.
                        action = np.random.choice(self.env.valid_actions())
                    else:
                        action = agent.epsilon_greedy(epsilon)

                    ttt.act(action)
                    r = ttt.reinforcement()
                    done = ttt.terminal_state()

                    agent.add_sample(obs, action, r, done)

                outcomes.append(r)

            # end of games in this batch

            agents['X'].train(n_epochs, method, learning_rate)
            agents['O'].train(n_epochs, method, learning_rate)

            epsilon_trace.append(epsilon)
            epsilon *= epsilon_decay

            if verbose and (len(outcomes) % ((n_batches * n_games_per_batch) // 20) == 0):
                print(f'{len(outcomes)}/{n_batches * n_games_per_batch} games, {np.mean(outcomes):.2f} outcome mean')

        if verbose:

            plt.subplot(3, 1, 1)
            n_per = 10
            n_bins = len(outcomes) // n_per
            outcomes_binned = np.array(outcomes).reshape(-1, n_per)
            avgs = outcomes_binned.mean(1)
            xs = np.linspace(n_per, n_per * n_bins, len(avgs))
            plt.plot(xs, avgs)
            plt.axhline(y=0, color='orange', ls='--')
            plt.ylabel('R')

            plt.subplot(3, 1, 2)
            plt.plot(xs, np.sum(outcomes_binned == -1, axis=1), 'r-', label='O Wins')
            plt.plot(xs, np.sum(outcomes_binned == 0, axis=1), 'b-', label='Draws')
            plt.plot(xs, np.sum(outcomes_binned == 1, axis=1), 'g-', label='X Wins')
            plt.legend(loc='center')
            plt.ylabel(f'Number of Games\nin Bins of {n_per:d}')

            plt.subplot(3, 1, 3)
            plt.plot(epsilon_trace)
            plt.ylabel(r'$\epsilon$')

        return outcomes, epsilon_trace

    def play_game(self, epsilon=0.0, verbose=True):

        ttt = self.env
        agents = self.agents

        ttt.initialize()

        while True:

            agent = agents[ttt.player]
            obs = ttt.observe()

            if len(ttt.valid_actions()) == 9:
                action = agent.epsilon_greedy(epsilon=1.0)
            else:
                action = agent.epsilon_greedy(epsilon)

            ttt.act(action)

            if verbose:
                print(ttt)

            if ttt.terminal_state():
                return ttt.reinforcement()

    def play_game_show_Q(self, epsilon=0.0):

        ttt = self.env
        agents = self.agents
        step = 0

        ttt.initialize()

        while True:

            agent = agents[ttt.player]
            obs = ttt.observe()
            actions = ttt.valid_actions()

            if len(ttt.valid_actions()) == 9:
                action = agent.epsilon_greedy(epsilon=1.0)
            else:
                action = agent.epsilon_greedy(epsilon)

            ttt.act(action)

            # Plot the Q value of each valid action as a heat map over the board.
            step += 1
            plt.subplot(5, 2, step)
            Qs = np.array([agent.use(np.hstack((obs, a))) for a in actions])
            board_image = np.array([np.nan] * 9)
            for Q, a in zip(Qs, actions):
                board_image[a] = Q[0, 0]
            board_image = board_image.reshape(3, 3)
            maxmag = np.nanmax(np.abs(board_image))
            plt.imshow(board_image, cmap='coolwarm', vmin=-maxmag, vmax=maxmag)
            plt.colorbar()

            obs = ttt.observe()
            i = -1
            for row in range(3):
                for col in range(3):
                    i += 1
                    if obs[i] == 1:
                        plt.text(col, row, 'X', ha='center', fontweight='bold', fontsize='large', color='black')
                    elif obs[i] == -1:
                        plt.text(col, row, 'O', ha='center', fontweight='bold', fontsize='large', color='black')
            plt.axis('off')

            if ttt.terminal_state():
                break

        plt.tight_layout()
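# The `epsilon_decay` factor in `Game.train` is chosen so that multiplying the initial epsilon by it once per batch ends at `final_epsilon` after `n_batches` batches. Below is a minimal sketch that checks this schedule with standalone numbers; the values are illustrative only, not the assignment's required parameters.

# In[ ]:


# Sketch: verify the exponential epsilon decay schedule used in Game.train.
initial_epsilon = 1.0
final_epsilon = 0.01
n_batches = 100

epsilon_decay = np.exp((np.log(final_epsilon) - np.log(initial_epsilon)) / n_batches)

eps = initial_epsilon
for _ in range(n_batches):
    eps *= epsilon_decay

print(f'decay factor {epsilon_decay:.5f}, epsilon after {n_batches} batches: {eps:.5f}')
# eps should come out very close to final_epsilon (0.01).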
# In[33]:


ttt = tictactoe.TicTacToe()

nh = [10]
agents = {'X': tictactoe.QnetAgent(ttt, nh, 'max'),
          'O': tictactoe.QnetAgent(ttt, nh, 'min')}

game = Game(ttt, agents)

game.play_game(0)


# In[52]:


previous_best = -np.inf
results = []

for nb in [100, 500]:
    for ng in [5, 10, 20]:
        for ne in [2, 5, 10, 20]:
            for nh in [[], [50], [50, 50, 50]]:

                parms = {
                    'n_batches': nb,
                    'n_games_per_batch': ng,
                    'n_epochs': ne,
                    'method': 'scg',
                    'learning_rate': 0.01,
                    'initial_epsilon': 1.0,
                    'final_epsilon': 0.01,
                    'gamma': 1.0
                }

                agents = {'X': tictactoe.QnetAgent(ttt, nh, 'max'),
                          'O': tictactoe.QnetAgent(ttt, nh, 'min')}
                game = Game(ttt, agents)
                outcomes, _ = game.train(parms, verbose=False)
                mean_outcomes = np.mean(outcomes)

                results.append([nh, nb, ng, ne, mean_outcomes])

                clear_output()
                df = pd.DataFrame(results, columns=('hiddens', 'batches', 'games', 'epochs', 'mean r'))
                print(df.sort_values(by='mean r', ascending=False))

                if mean_outcomes > previous_best:
                    previous_best = mean_outcomes
                    with open('best_ttt_agents.pkl', 'wb') as f:
                        pickle.dump(agents, f)


# In[55]:


with open('best_ttt_agents.pkl', 'rb') as f:
    agents = pickle.load(f)

ttt = agents['X'].env
game = Game(ttt, agents)

rs = []
for n_games in range(100):
    rs.append(game.play_game(epsilon=0.05, verbose=False))
print(f'mean of final outcomes {np.mean(rs)}')


# In[56]:


game.play_game_show_Q()


# ## Robot

# In[1]:


import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, clear_output
import pandas as pd
import pickle

import robot
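# Before defining the training loop, it can help to step the robot environment by hand. The sketch below is not part of the assignment; it only uses calls that appear later in this notebook (`Robot`, `set_goal`, `QnetAgent`, `initialize`, `observe`, `epsilon_greedy`, `act`, `reinforcement`) and assumes, as in the Tic-Tac-Toe section, that an untrained agent's `epsilon_greedy` can be called right away. The hidden-layer size and number of steps are arbitrary.

# In[ ]:


# Sketch: a short, purely exploratory interaction with the robot arm.
env_check = robot.Robot()
env_check.set_goal([5., 5.])
agent_check = robot.QnetAgent(env_check, [10])   # small, untrained Q network

env_check.initialize()
for step in range(5):
    obs = env_check.observe()
    action = agent_check.epsilon_greedy(epsilon=1.0)   # epsilon=1.0: ignore the untrained Q net
    env_check.act(action)
    print(f'step {step}: reinforcement {env_check.reinforcement()}')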
# In[2]:


class Experiment:

    def __init__(self, environment, agent):
        self.env = environment
        self.agent = agent

    def train(self, parms, verbose=True):

        n_batches = parms['n_batches']
        n_steps_per_batch = parms['n_steps_per_batch']
        n_epochs = parms['n_epochs']
        method = parms['method']
        learning_rate = parms['learning_rate']
        epsilon = parms['initial_epsilon']
        final_epsilon = parms['final_epsilon']
        gamma = parms['gamma']

        env = self.env
        agent = self.agent

        # Decay factor chosen so that epsilon reaches final_epsilon after n_batches batches.
        epsilon_decay = np.exp((np.log(final_epsilon) - np.log(epsilon)) / n_batches)
        epsilon_trace = []
        outcomes = []

        for batch in range(n_batches):

            agent.clear_samples()
            env.initialize()
            sum_rs = 0

            for step in range(n_steps_per_batch):

                obs = env.observe()
                action = agent.epsilon_greedy(epsilon)
                env.act(action)
                r = env.reinforcement()
                sum_rs += r
                done = step == n_steps_per_batch - 1
                agent.add_sample(obs, action, r, done)

            outcomes.append(sum_rs / n_steps_per_batch)

            agent.train(n_epochs, method, learning_rate, gamma)

            epsilon_trace.append(epsilon)
            epsilon *= epsilon_decay

            if verbose and (len(outcomes) % (n_batches // 20) == 0):
                print(f'{len(outcomes)}/{n_batches} batches, {np.mean(outcomes):.4f} outcome mean')

        if verbose:

            plt.figure(1)
            plt.clf()

            plt.subplot(2, 1, 1)
            n_per = 10
            n_bins = len(outcomes) // n_per
            outcomes_binned = np.array(outcomes).reshape(-1, n_per)
            avgs = outcomes_binned.mean(1)
            xs = np.linspace(n_per, n_per * n_bins, len(avgs))
            plt.plot(xs, avgs)
            plt.axhline(y=0, color='orange', ls='--')
            plt.ylabel('R')

            plt.subplot(2, 1, 2)
            plt.plot(epsilon_trace)
            plt.ylabel(r'$\epsilon$')
            # plt.pause(0.1)

        return outcomes  # , epsilon_trace

    def test(self, n_trials, n_steps, epsilon=0.0, graphics=True):

        if graphics:
            fig = plt.figure(figsize=(10, 10))

        robot = self.env
        agent = self.agent

        sum_rs = 0

        for trial in range(n_trials):

            robot.initialize()

            points = np.zeros((n_steps, robot.n_links + 1, 2))
            actions = np.zeros((n_steps, robot.n_links))
            Q_values = np.zeros(n_steps)

            for i in range(n_steps):
                action = agent.epsilon_greedy(epsilon)
                Q = agent.use(np.hstack((robot.observe(), action)))
                robot.act(action)
                sum_rs += robot.reinforcement()
                points[i] = robot.points
                actions[i] = action
                Q_values[i] = Q[0, 0]

            if graphics:

                Q_min, Q_max = np.min(Q_values), np.max(Q_values)
                print(Q_min, Q_max)

                for i in range(n_steps):
                    fig.clf()
                    plt.scatter(robot.goal[0], robot.goal[1], s=40, c='blue')
                    action = actions[i]
                    robot.set_points(points[i])
                    robot.draw()  # alpha=(Q_values[i] - Q_min) / (Q_max - Q_min))
                    clear_output(wait=True)
                    display(fig)

                clear_output(wait=True)

        return sum_rs / (n_trials * n_steps)


# In[7]:


robbie = robot.Robot()
robbie.set_goal([5., 5.])

agent = robot.QnetAgent(robbie, [100, 100, 100])

experiment = Experiment(robbie, agent)


# In[11]:


previous_best = -np.inf
results = []

for nb in [100, 200, 5000]:
    for ns in [50, 100]:
        for ne in [5, 10]:
            for nh in [[], [50], [50, 50]]:

                parms = {
                    'n_batches': nb,
                    'n_steps_per_batch': ns,
                    'n_epochs': ne,
                    'method': 'scg',
                    'learning_rate': 0.01,
                    'initial_epsilon': 1.0,
                    'final_epsilon': 0.001,
                    'gamma': 1.0
                }

                agent = robot.QnetAgent(robbie, nh)
                experiment = Experiment(robbie, agent)
                outcomes = experiment.train(parms, verbose=False)

                # outcomes[-1] is the mean reinforcement over the final batch.
                results.append([nh, nb, ns, ne, outcomes[-1]])

                clear_output()
                df = pd.DataFrame(results, columns=('hiddens', 'batches', 'steps', 'epochs', 'mean r'))
                print(df.sort_values(by='mean r', ascending=False))

                if outcomes[-1] > previous_best:
                    previous_best = outcomes[-1]
                    with open('best_robot_agent.pkl', 'wb') as f:
                        pickle.dump(agent, f)

print()


# In[12]:


with open('best_robot_agent.pkl', 'rb') as f:
    agent = pickle.load(f)

robbie = agent.env
experiment = Experiment(robbie, agent)

mean_r = experiment.test(n_trials=10, n_steps=100, epsilon=0.0, graphics=False)
print(f'mean of reinforcements {mean_r:.3f}')


# In[15]:


parms = {
    'n_batches': 500,
    'n_steps_per_batch': 50,
    'n_epochs': 5,
    'method': 'scg',
    'learning_rate': 0.01,
    'initial_epsilon': 1.0,
    'final_epsilon': 0.0001,
    'gamma': 1.0
}

agent = robot.QnetAgent(robbie, [50])
experiment = Experiment(robbie, agent)

outcomes = experiment.train(parms)


# In[16]:


experiment.test(10, 100, epsilon=0.0, graphics=True)
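# As a final check, you can compare greedy and slightly exploratory behavior at test time. The sketch below simply re-runs `experiment.test` on the agent trained in the previous cells with a few epsilon values and graphics turned off; the epsilon values are arbitrary choices, not part of the assignment.

# In[ ]:


# Sketch: mean reinforcement of the trained agent under different test-time epsilons.
for test_epsilon in [0.0, 0.05, 0.2]:
    mean_r = experiment.test(n_trials=5, n_steps=100, epsilon=test_epsilon, graphics=False)
    print(f'epsilon = {test_epsilon:.2f}: mean reinforcement {mean_r:.3f}')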