#!/usr/bin/env python
# coding: utf-8

# In[1]:

import gym
import numpy as np
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')


# In[2]:

from gym.envs.registration import register

# Register a deterministic variant of FrozenLake: with is_slippery=False the
# agent always moves in the direction it chooses.
register(
    id='FrozenLakeNotSlippery-v0',
    entry_point='gym.envs.toy_text:FrozenLakeEnv',
    kwargs={'map_name': '4x4', 'is_slippery': False},
    max_episode_steps=100,
    reward_threshold=0.78,  # optimum = .8196
)

#env = gym.make('FrozenLake-v0')
env = gym.make('FrozenLakeNotSlippery-v0')


# In[3]:

# Initialize the Q-table with zeros: one row per state, one column per action.
Q = np.zeros([env.observation_space.n, env.action_space.n])
#Q = np.random.rand(env.observation_space.n, env.action_space.n)

lr = 0.8            # learning rate
y = 0.95            # discount factor
num_episodes = 2000

# Create a list to contain the total reward per episode.
rList = []
for i in range(num_episodes):
    # Reset the environment and get the first new observation.
    s = env.reset()
    rAll = 0
    d = False
    j = 0
    # The Q-table learning algorithm.
    while j < 99:
        j += 1
        # Choose an action greedily (with noise) from the Q-table. The noise
        # scale decays quadratically with the episode index, so exploration
        # fades quickly in favor of exploitation.
        a = np.argmax(Q[s, :] + np.random.randn(1, env.action_space.n) * (1. / (i**2 + 1)))
        # Get the new state and reward from the environment.
        s1, r, d, _ = env.step(a)
        # Update the Q-table with the temporal-difference error.
        Q[s, a] = Q[s, a] + lr * (r + y * np.max(Q[s1, :]) - Q[s, a])
        rAll += r
        s = s1
        if d:
            break
    rList.append(rAll)

print('Score over time: {}'.format(sum(rList) / num_episodes))
print('Final Q-Table Values')
print(Q)


# In[4]:

# Run one episode greedily with the learned Q-table (no exploration noise).
s = env.reset()
d = False
j = 0
while j < 99:
    #env.render()
    j += 1
    a = np.argmax(Q[s, :])
    # Get the new state and reward from the environment.
    s1, r, d, _ = env.step(a)
    s = s1
    if d:
        break


# In[5]:

# Plot the per-episode reward (0 or 1) across training.
plt.plot(rList)
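

# In[6]:

# Not part of the original notebook: a minimal sketch that smooths the raw
# 0/1 per-episode rewards with a moving average, which makes the learning
# curve easier to read than the raw plot above. The window size of 50 is an
# arbitrary assumption.
window = 50
smoothed = np.convolve(rList, np.ones(window) / window, mode='valid')
plt.figure()
plt.plot(smoothed)
plt.xlabel('Episode')
plt.ylabel('Mean reward over a {}-episode window'.format(window))


# In[7]:

# Not part of the original notebook: a minimal sketch of epsilon-greedy
# exploration as an alternative to the noisy-argmax action selection in
# cell 3. The epsilon schedule (start, floor, and decay rate) is an
# assumption, not taken from the source.
Q2 = np.zeros([env.observation_space.n, env.action_space.n])
eps = 1.0             # initial exploration probability (assumed)
eps_min = 0.01        # exploration floor (assumed)
eps_decay = 0.995     # per-episode multiplicative decay (assumed)
for i in range(num_episodes):
    s = env.reset()
    d = False
    for j in range(99):
        # With probability eps take a random action, otherwise act greedily.
        if np.random.rand() < eps:
            a = env.action_space.sample()
        else:
            a = np.argmax(Q2[s, :])
        s1, r, d, _ = env.step(a)
        # Same temporal-difference update as in cell 3.
        Q2[s, a] += lr * (r + y * np.max(Q2[s1, :]) - Q2[s, a])
        s = s1
        if d:
            break
    eps = max(eps_min, eps * eps_decay)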