import numpy as np
import tensorflow as tf
import gym
import tensorflow_probability as tfp
class ActorCritic(tf.keras.Model):
    """Two-headed network: a shared trunk feeding an actor head (policy logits) and a critic head (state value)."""

    def __init__(self, action_dim):
        super().__init__()
        self.fc1 = tf.keras.layers.Dense(512, activation="relu")
        self.fc2 = tf.keras.layers.Dense(128, activation="relu")
        self.critic = tf.keras.layers.Dense(1, activation=None)  # state-value head V(s)
        self.actor = tf.keras.layers.Dense(action_dim, activation=None)  # policy-logits head

    def call(self, input_data):
        x = self.fc1(input_data)
        x1 = self.fc2(x)
        actor = self.actor(x1)  # unnormalized action logits
        critic = self.critic(x1)  # scalar value estimate
        return critic, actor
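A quick shape sanity check for the two-headed model (a minimal sketch; the batch size of one and the 4-dimensional CartPole observation are illustrative assumptions):

model = ActorCritic(action_dim=2)
dummy_state = np.zeros((1, 4), dtype=np.float32)  # one fake CartPole observation
value, logits = model(dummy_state)
print(value.shape)   # (1, 1): scalar state-value estimate V(s)
print(logits.shape)  # (1, 2): unnormalized action preferences (logits)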
class Agent:
    def __init__(self, action_dim=4, gamma=0.99):
        """Actor-critic agent with a neural-network-based policy

        Args:
            action_dim (int): Action dimension
            gamma (float): Discount factor. Default=0.99
        """
        self.gamma = gamma
        self.opt = tf.keras.optimizers.Adam(learning_rate=1e-4)
        self.actor_critic = ActorCritic(action_dim)

    def get_action(self, state):
        # The actor head returns logits; convert them to probabilities before sampling
        _, action_probabilities = self.actor_critic(np.array([state]))
        action_probabilities = tf.nn.softmax(action_probabilities)
        action_probabilities = action_probabilities.numpy()
        dist = tfp.distributions.Categorical(
            probs=action_probabilities, dtype=tf.float32
        )
        action = dist.sample()  # sample an action from the current policy
        return int(action.numpy()[0])
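Note that tfp.distributions.Categorical also accepts raw logits via its logits argument, which skips the explicit softmax and is numerically more stable; a minimal alternative sketch (assuming an agent and a state as in the training loop below):

_, logits = agent.actor_critic(np.array([state]))
dist = tfp.distributions.Categorical(logits=logits)
action = int(dist.sample().numpy()[0])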
    def actor_loss(self, prob, action, td):
        # Policy-gradient loss: -log pi(a|s) weighted by the TD error (advantage estimate)
        prob = tf.nn.softmax(prob)
        dist = tfp.distributions.Categorical(probs=prob, dtype=tf.float32)
        log_prob = dist.log_prob(action)
        loss = -log_prob * td
        return loss
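The actor loss is the policy-gradient term -log π(a|s) scaled by the TD error, so actions followed by a positive TD error become more likely. A tiny numeric check (the logits and TD value below are made up for illustration):

agent = Agent(action_dim=2)
fake_logits = tf.constant([[2.0, 0.5]])  # pretend actor output for one state
fake_td = tf.constant([[1.5]])           # pretend (positive) TD error
loss = agent.actor_loss(fake_logits, action=0, td=fake_td)
print(float(loss))  # ≈ 0.30: raising the probability of action 0 lowers this loss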
    def learn(self, state, action, reward, next_state, done):
        state = np.array([state])
        next_state = np.array([next_state])
        with tf.GradientTape() as tape:
            value, action_probabilities = self.actor_critic(state, training=True)
            value_next_st, _ = self.actor_critic(next_state, training=True)
            # One-step TD error; the bootstrap term is masked out on terminal transitions
            td = reward + self.gamma * value_next_st * (1 - int(done)) - value
            actor_loss = self.actor_loss(action_probabilities, action, td)
            critic_loss = td ** 2  # squared TD error trains the value head
            total_loss = actor_loss + critic_loss
        grads = tape.gradient(total_loss, self.actor_critic.trainable_variables)
        self.opt.apply_gradients(zip(grads, self.actor_critic.trainable_variables))
        return total_loss
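A single learning step can be exercised in isolation before running full episodes (the transition below is fabricated for illustration; a real one comes from env.step):

agent = Agent(action_dim=2)
state = np.random.rand(4).astype(np.float32)       # fake CartPole observation
next_state = np.random.rand(4).astype(np.float32)
loss = agent.learn(state, action=0, reward=1.0, next_state=next_state, done=False)
print(float(loss))  # combined actor + critic loss for this one transition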
def train(agent, env, episodes, render=True):
    """Train `agent` in `env` for `episodes`

    Args:
        agent (Agent): Agent to train
        env (gym.Env): Environment to train the agent in
        episodes (int): Number of episodes to train for
        render (bool): True=Enable/False=Disable rendering; Default=True
    """
    for episode in range(episodes):
        done = False
        state = env.reset()
        total_reward = 0
        all_loss = []
        while not done:
            action = agent.get_action(state)
            next_state, reward, done, _ = env.step(action)
            # Update the networks online on every transition (no replay buffer)
            loss = agent.learn(state, action, reward, next_state, done)
            all_loss.append(loss)
            state = next_state
            total_reward += reward
            if render:
                env.render()
            if done:
                print("\n")
                print(f"Episode#:{episode} ep_reward:{total_reward}", end="\r")
if __name__ == "__main__":
    env = gym.make("CartPole-v0")
    agent = Agent(env.action_space.n)
    num_episodes = 10  # Increase the number of episodes to train longer
    # Set render=True to visualize the Agent's actions in the env
    train(agent, env, num_episodes, render=False)
Episode#:0 ep_reward:10.0
Episode#:1 ep_reward:28.0
Episode#:2 ep_reward:10.0
Episode#:3 ep_reward:17.0
Episode#:4 ep_reward:32.0
Episode#:5 ep_reward:32.0
Episode#:6 ep_reward:15.0
Episode#:7 ep_reward:37.0
Episode#:8 ep_reward:10.0
Episode#:9 ep_reward:21.0
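After training, the policy can be rolled out without updates for a quick qualitative check (a minimal sketch, not part of the original recipe; it assumes the trained agent and env from the main block are still in scope):

def evaluate(agent, env, episodes=3):
    """Roll out the current policy with no learning updates."""
    for ep in range(episodes):
        state, done, ep_reward = env.reset(), False, 0.0
        while not done:
            action = agent.get_action(state)  # sample from the learned policy
            state, reward, done, _ = env.step(action)
            ep_reward += reward
        print(f"Eval episode#:{ep} ep_reward:{ep_reward}")

evaluate(agent, env)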
!pip install -q watermark
%reload_ext watermark
%watermark -a "Sparsh A." -m -iv -u -t -d
Author: Sparsh A.

Last updated: 2021-12-02 11:55:28

Compiler    : GCC 7.5.0
OS          : Linux
Release     : 5.4.104+
Machine     : x86_64
Processor   : x86_64
CPU cores   : 2
Architecture: 64bit

gym                   : 0.17.3
IPython               : 5.5.0
tensorflow_probability: 0.15.0
tensorflow            : 2.7.0
numpy                 : 1.19.5