!wget -q --show-progress https://github.com/RecoHut-Projects/drl-recsys/raw/S990517/tools/webgym.zip
!unzip webgym.zip
!pip install selenium
!apt-get update # refresh Ubuntu's package index so that the apt install below works correctly
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
import sys
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')
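# Optional sanity check (not in the original notebook): confirm that headless Chrome and
# chromedriver are wired up before creating the webgym environments, which drive the
# browser through selenium. Assumes the apt/chromedriver steps above succeeded.
from selenium import webdriver

_chrome_options = webdriver.ChromeOptions()
_chrome_options.add_argument("--headless")
_chrome_options.add_argument("--no-sandbox")
_chrome_options.add_argument("--disable-dev-shm-usage")
_driver = webdriver.Chrome(options=_chrome_options)
_driver.get("about:blank")
print("chromedriver is working; page title:", repr(_driver.title))
_driver.quit()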
import webgym
import argparse
import os
import copy
import random
from collections import deque
from datetime import datetime
import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import (
Conv2D,
Dense,
Dropout,
Flatten,
Input,
Lambda,
MaxPool2D,
concatenate,
)
%load_ext tensorboard
tf.keras.backend.set_floatx("float64")
parser = argparse.ArgumentParser(prog="TFRL-SocialMedia-Like-Reply-Agent")
parser.add_argument("--env", default="MiniWoBSocialMediaReplyVisualEnv-v0")
parser.add_argument("--update-freq", type=int, default=16)
parser.add_argument("--epochs", type=int, default=3)
parser.add_argument("--actor-lr", type=float, default=1e-4)
parser.add_argument("--critic-lr", type=float, default=1e-4)
parser.add_argument("--clip-ratio", type=float, default=0.1)
parser.add_argument("--gae-lambda", type=float, default=0.95)
parser.add_argument("--gamma", type=float, default=0.99)
parser.add_argument("--logdir", default="logs")
args = parser.parse_args([])
logdir = os.path.join(
args.logdir, parser.prog, args.env, datetime.now().strftime("%Y%m%d-%H%M%S")
)
print(f"Saving training logs to:{logdir}")
writer = tf.summary.create_file_writer(logdir)
Saving training logs to:logs/TFRL-SocialMedia-Like-Reply-Agent/MiniWoBSocialMediaReplyVisualEnv-v0/20211203-061930
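# Note (illustrative): parse_args([]) above keeps every default; in a notebook you can
# override any hyperparameter by passing flags explicitly, e.g. (hypothetical values):
#   args = parser.parse_args(["--epochs", "5", "--update-freq", "32"])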
class Actor:
def __init__(self, state_dim, action_dim, action_bound, std_bound):
self.state_dim = state_dim
self.action_dim = action_dim
self.action_bound = np.array(action_bound)
self.std_bound = std_bound
self.weight_initializer = tf.keras.initializers.he_normal()
self.eps = 1e-5
self.model = self.nn_model()
self.model.summary() # Print a summary of the Actor model
self.opt = tf.keras.optimizers.Nadam(args.actor_lr)
def nn_model(self):
obs_input = Input(self.state_dim, name="im_obs")
conv1 = Conv2D(
filters=64,
kernel_size=(3, 3),
strides=(1, 1),
padding="same",
input_shape=self.state_dim,
data_format="channels_last",
activation="relu",
)(obs_input)
pool1 = MaxPool2D(pool_size=(3, 3), strides=1)(conv1)
conv2 = Conv2D(
filters=32,
kernel_size=(3, 3),
strides=(1, 1),
padding="valid",
activation="relu",
)(pool1)
pool2 = MaxPool2D(pool_size=(3, 3), strides=1)(conv2)
conv3 = Conv2D(
filters=16,
kernel_size=(3, 3),
strides=(1, 1),
padding="valid",
activation="relu",
)(pool2)
pool3 = MaxPool2D(pool_size=(3, 3), strides=1)(conv3)
conv4 = Conv2D(
filters=8,
kernel_size=(3, 3),
strides=(1, 1),
padding="valid",
activation="relu",
)(pool3)
pool4 = MaxPool2D(pool_size=(3, 3), strides=1)(conv4)
flat = Flatten()(pool4)
dense1 = Dense(
16, activation="relu", kernel_initializer=self.weight_initializer
)(flat)
dropout1 = Dropout(0.3)(dense1)
dense2 = Dense(
8, activation="relu", kernel_initializer=self.weight_initializer
)(dropout1)
dropout2 = Dropout(0.3)(dense2)
# action_dim[0] = 2
output_val = Dense(
self.action_dim[0],
activation="relu",
kernel_initializer=self.weight_initializer,
)(dropout2)
# Scale & clip x[i] to be in range [0, action_bound[i]]
action_bound = copy.deepcopy(self.action_bound)
mu_output = Lambda(
lambda x: tf.clip_by_value(x * action_bound, 1e-9, action_bound),
name="mu_output",
)(output_val)
std_output_1 = Dense(
self.action_dim[0],
activation="softplus",
kernel_initializer=self.weight_initializer,
)(dropout2)
std_output = Lambda(
lambda x: tf.clip_by_value(
x * action_bound, 1e-9, action_bound / 2, name="std_output"
)
)(std_output_1)
return tf.keras.models.Model(
inputs=obs_input, outputs=[mu_output, std_output], name="Actor"
)
def get_action(self, state):
        # Convert a list of PIL Images to a batched np.ndarray
state_np = np.array([np.array(s) for s in state])
if len(state_np.shape) == 3:
# Convert (w, h, c) to (1, w, h, c)
state_np = np.expand_dims(state_np, 0)
mu, std = self.model.predict(state_np)
action = np.random.normal(mu[0], std[0] + self.eps, size=self.action_dim).astype(
"int"
)
# Clip action to be between 0 and max obs screen size
action = np.clip(action, 0, self.action_bound)
# 1 Action per instance of env; Env expects: (num_instances, actions)
action = (action,)
log_policy = self.log_pdf(mu, std, action)
return log_policy, action
def log_pdf(self, mu, std, action):
std = tf.clip_by_value(std, self.std_bound[0], self.std_bound[1])
var = std ** 2
log_policy_pdf = -0.5 * (action - mu) ** 2 / var - 0.5 * tf.math.log(
var * 2 * np.pi
)
return tf.reduce_sum(log_policy_pdf, 1, keepdims=True)
def compute_loss(self, log_old_policy, log_new_policy, actions, gaes):
        # Cap the log-ratio at 80 to avoid overflow, since
        # tf.exp(x) overflows for x > 88 in float32
ratio = tf.exp(
tf.minimum(log_new_policy - tf.stop_gradient(log_old_policy), 80)
)
gaes = tf.stop_gradient(gaes)
clipped_ratio = tf.clip_by_value(
ratio, 1.0 - args.clip_ratio, 1.0 + args.clip_ratio
)
surrogate = -tf.minimum(ratio * gaes, clipped_ratio * gaes)
return tf.reduce_mean(surrogate)
def train(self, log_old_policy, states, actions, gaes):
with tf.GradientTape() as tape:
mu, std = self.model(states, training=True)
log_new_policy = self.log_pdf(mu, std, actions)
loss = self.compute_loss(log_old_policy, log_new_policy, actions, gaes)
grads = tape.gradient(loss, self.model.trainable_variables)
self.opt.apply_gradients(zip(grads, self.model.trainable_variables))
return loss
def save(self, model_dir: str, version: int = 1):
actor_model_save_dir = os.path.join(
model_dir, "actor", str(version), "model.savedmodel"
)
self.model.save(actor_model_save_dir, save_format="tf")
print(f"Actor model saved at:{actor_model_save_dir}")
class Critic:
def __init__(self, state_dim):
self.state_dim = state_dim
self.weight_initializer = tf.keras.initializers.he_normal()
self.model = self.nn_model()
self.model.summary() # Print a summary of the Critic model
self.opt = tf.keras.optimizers.Nadam(args.critic_lr)
def nn_model(self):
obs_input = Input(self.state_dim)
conv1 = Conv2D(
filters=64,
kernel_size=(3, 3),
strides=(1, 1),
padding="same",
input_shape=self.state_dim,
data_format="channels_last",
activation="relu",
)(obs_input)
pool1 = MaxPool2D(pool_size=(3, 3), strides=2)(conv1)
conv2 = Conv2D(
filters=32,
kernel_size=(3, 3),
strides=(1, 1),
padding="valid",
activation="relu",
)(pool1)
pool2 = MaxPool2D(pool_size=(3, 3), strides=2)(conv2)
conv3 = Conv2D(
filters=16,
kernel_size=(3, 3),
strides=(1, 1),
padding="valid",
activation="relu",
)(pool2)
pool3 = MaxPool2D(pool_size=(3, 3), strides=1)(conv3)
conv4 = Conv2D(
filters=8,
kernel_size=(3, 3),
strides=(1, 1),
padding="valid",
activation="relu",
)(pool3)
pool4 = MaxPool2D(pool_size=(3, 3), strides=1)(conv4)
flat = Flatten()(pool4)
dense1 = Dense(
16, activation="relu", kernel_initializer=self.weight_initializer
)(flat)
dropout1 = Dropout(0.3)(dense1)
dense2 = Dense(
8, activation="relu", kernel_initializer=self.weight_initializer
)(dropout1)
dropout2 = Dropout(0.3)(dense2)
value = Dense(
1, activation="linear", kernel_initializer=self.weight_initializer
)(dropout2)
return tf.keras.models.Model(inputs=obs_input, outputs=value, name="Critic")
def compute_loss(self, v_pred, td_targets):
mse = tf.keras.losses.MeanSquaredError()
return mse(td_targets, v_pred)
def train(self, states, td_targets):
with tf.GradientTape() as tape:
v_pred = self.model(states, training=True)
# assert v_pred.shape == td_targets.shape
loss = self.compute_loss(v_pred, tf.stop_gradient(td_targets))
grads = tape.gradient(loss, self.model.trainable_variables)
self.opt.apply_gradients(zip(grads, self.model.trainable_variables))
return loss
def save(self, model_dir: str, version: int = 1):
critic_model_save_dir = os.path.join(
model_dir, "critic", str(version), "model.savedmodel"
)
self.model.save(critic_model_save_dir, save_format="tf")
print(f"Critic model saved at:{critic_model_save_dir}")
class PPOAgent:
def __init__(self, env):
self.env = env
self.state_dim = self.env.observation_space.shape
self.action_dim = self.env.action_space.shape
# Set action_bounds to be within the actual task-window/browser-view of the agent
self.action_bound = [self.env.task_width, self.env.task_height]
self.std_bound = [1e-2, 1.0]
self.actor = Actor(
self.state_dim, self.action_dim, self.action_bound, self.std_bound
)
self.critic = Critic(self.state_dim)
def gae_target(self, rewards, v_values, next_v_value, done):
n_step_targets = np.zeros_like(rewards)
gae = np.zeros_like(rewards)
gae_cumulative = 0
forward_val = 0
if not done:
forward_val = next_v_value
for k in reversed(range(0, len(rewards))):
delta = rewards[k] + args.gamma * forward_val - v_values[k]
gae_cumulative = args.gamma * args.gae_lambda * gae_cumulative + delta
gae[k] = gae_cumulative
forward_val = v_values[k]
n_step_targets[k] = gae[k] + v_values[k]
return gae, n_step_targets
def train(self, max_episodes=1000):
with writer.as_default():
for ep in range(max_episodes):
state_batch = []
action_batch = []
reward_batch = []
old_policy_batch = []
episode_reward, done = 0, False
state = self.env.reset()
prev_state = state
step_num = 0
while not done:
# self.env.render()
log_old_policy, action = self.actor.get_action(state)
next_state, reward, dones, _ = self.env.step(action)
step_num += 1
print(
f"ep#:{ep} step#:{step_num} step_rew:{reward} action:{action} dones:{dones}"
)
done = np.all(dones)
if done:
next_state = prev_state
else:
prev_state = next_state
state = np.array([np.array(s) for s in state])
next_state = np.array([np.array(s) for s in next_state])
reward = np.reshape(reward, [1, 1])
log_old_policy = np.reshape(log_old_policy, [1, 1])
state_batch.append(state)
action_batch.append(action)
reward_batch.append((reward + 8) / 8)
old_policy_batch.append(log_old_policy)
if len(state_batch) >= args.update_freq or done:
states = np.array([state.squeeze() for state in state_batch])
# Convert ([x, y],) to [x, y]
actions = np.array([action[0] for action in action_batch])
rewards = np.array(
[reward.squeeze() for reward in reward_batch]
)
old_policies = np.array(
[old_pi.squeeze() for old_pi in old_policy_batch]
)
v_values = self.critic.model.predict(states)
next_v_value = self.critic.model.predict(next_state)
gaes, td_targets = self.gae_target(
rewards, v_values, next_v_value, done
)
actor_losses, critic_losses = [], []
for epoch in range(args.epochs):
actor_loss = self.actor.train(
old_policies, states, actions, gaes
)
actor_losses.append(actor_loss)
critic_loss = self.critic.train(states, td_targets)
critic_losses.append(critic_loss)
# Plot mean actor & critic losses on every update
tf.summary.scalar("actor_loss", np.mean(actor_losses), step=ep)
tf.summary.scalar(
"critic_loss", np.mean(critic_losses), step=ep
)
state_batch = []
action_batch = []
reward_batch = []
old_policy_batch = []
episode_reward += reward[0][0]
state = next_state[0]
print(f"Episode#{ep} Reward:{episode_reward} Actions:{action_batch}")
tf.summary.scalar("episode_reward", episode_reward, step=ep)
def save(self, model_dir: str, version: int = 1):
self.actor.save(model_dir, version)
self.critic.save(model_dir, version)
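# Illustrative GAE computation (not part of the original run). The recursion in
# PPOAgent.gae_target above implements, for each step k (moving backwards):
#   delta_k  = r_k + gamma * V(s_{k+1}) - V(s_k)
#   A_k      = delta_k + gamma * lambda * A_{k+1}
#   target_k = A_k + V(s_k)
# gae_target does not touch `self`, so it can be exercised on toy numbers directly;
# the rewards/values below are made up purely to show the shapes and the recursion.
_toy_rewards = np.array([[0.0], [0.0], [1.0]])   # 3 steps, 1 env instance
_toy_values = np.array([[0.1], [0.2], [0.3]])    # critic estimates V(s_k)
_toy_next_v = np.array([[0.0]])                  # V(s_T); ignored because done=True
_gaes, _td_targets = PPOAgent.gae_target(None, _toy_rewards, _toy_values, _toy_next_v, True)
print("advantages:\n", _gaes)
print("TD(lambda) targets:\n", _td_targets)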
if __name__ == "__main__":
env_name = args.env
env = gym.make(env_name)
cta_agent = PPOAgent(env)
cta_agent.train(max_episodes=2)
# Model saving
model_dir = "trained_models"
agent_name = f"PPO_{env_name}-v0"
agent_version = 1
agent_model_path = os.path.join(model_dir, agent_name)
cta_agent.save(agent_model_path, agent_version)
Model: "Actor" __________________________________________________________________________________________________ Layer (type) Output Shape Param # Connected to ================================================================================================== im_obs (InputLayer) [(None, 64, 64, 3)] 0 [] conv2d_4 (Conv2D) (None, 64, 64, 64) 1792 ['im_obs[0][0]'] max_pooling2d_4 (MaxPooling2D) (None, 62, 62, 64) 0 ['conv2d_4[0][0]'] conv2d_5 (Conv2D) (None, 60, 60, 32) 18464 ['max_pooling2d_4[0][0]'] max_pooling2d_5 (MaxPooling2D) (None, 58, 58, 32) 0 ['conv2d_5[0][0]'] conv2d_6 (Conv2D) (None, 56, 56, 16) 4624 ['max_pooling2d_5[0][0]'] max_pooling2d_6 (MaxPooling2D) (None, 54, 54, 16) 0 ['conv2d_6[0][0]'] conv2d_7 (Conv2D) (None, 52, 52, 8) 1160 ['max_pooling2d_6[0][0]'] max_pooling2d_7 (MaxPooling2D) (None, 50, 50, 8) 0 ['conv2d_7[0][0]'] flatten_1 (Flatten) (None, 20000) 0 ['max_pooling2d_7[0][0]'] dense_3 (Dense) (None, 16) 320016 ['flatten_1[0][0]'] dropout_2 (Dropout) (None, 16) 0 ['dense_3[0][0]'] dense_4 (Dense) (None, 8) 136 ['dropout_2[0][0]'] dropout_3 (Dropout) (None, 8) 0 ['dense_4[0][0]'] dense_5 (Dense) (None, 2) 18 ['dropout_3[0][0]'] dense_6 (Dense) (None, 2) 18 ['dropout_3[0][0]'] mu_output (Lambda) (None, 2) 0 ['dense_5[0][0]'] lambda (Lambda) (None, 2) 0 ['dense_6[0][0]'] ================================================================================================== Total params: 346,228 Trainable params: 346,228 Non-trainable params: 0 __________________________________________________________________________________________________ Model: "Critic" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= input_1 (InputLayer) [(None, 64, 64, 3)] 0 conv2d_8 (Conv2D) (None, 64, 64, 64) 1792 max_pooling2d_8 (MaxPooling (None, 31, 31, 64) 0 2D) conv2d_9 (Conv2D) (None, 29, 29, 32) 18464 max_pooling2d_9 (MaxPooling (None, 14, 14, 32) 0 2D) conv2d_10 (Conv2D) (None, 12, 12, 16) 4624 max_pooling2d_10 (MaxPoolin (None, 10, 10, 16) 0 g2D) conv2d_11 (Conv2D) (None, 8, 8, 8) 1160 max_pooling2d_11 (MaxPoolin (None, 6, 6, 8) 0 g2D) flatten_2 (Flatten) (None, 288) 0 dense_7 (Dense) (None, 16) 4624 dropout_4 (Dropout) (None, 16) 0 dense_8 (Dense) (None, 8) 136 dropout_5 (Dropout) (None, 8) 0 dense_9 (Dense) (None, 1) 9 ================================================================= Total params: 30,809 Trainable params: 30,809 Non-trainable params: 0 _________________________________________________________________ ep#:0 step#:1 step_rew:[0.0] action:(array([ 70, 210]),) dones:[False] ep#:0 step#:2 step_rew:[0.0] action:(array([160, 196]),) dones:[False] ep#:0 step#:3 step_rew:[0.0] action:(array([ 43, 193]),) dones:[False] ep#:0 step#:4 step_rew:[0.0] action:(array([160, 209]),) dones:[False] ep#:0 step#:5 step_rew:[0.0] action:(array([160, 205]),) dones:[False] ep#:0 step#:6 step_rew:[0.0] action:(array([160, 210]),) dones:[False] ep#:0 step#:7 step_rew:[0.0] action:(array([132, 202]),) dones:[False] ep#:0 step#:8 step_rew:[0.0] action:(array([160, 201]),) dones:[False] ep#:0 step#:9 step_rew:[0.0] action:(array([ 41, 210]),) dones:[False] ep#:0 step#:10 step_rew:[0.0] action:(array([160, 205]),) dones:[False] ep#:0 step#:11 step_rew:[0.0] action:(array([160, 205]),) dones:[False] ep#:0 step#:12 step_rew:[0.0] action:(array([ 75, 210]),) dones:[False] ep#:0 step#:13 step_rew:[0.0] action:(array([160, 205]),) dones:[False] ep#:0 step#:14 step_rew:[0.0] action:(array([160, 
190]),) dones:[False] ep#:0 step#:15 step_rew:[0.0] action:(array([160, 202]),) dones:[False] ep#:0 step#:16 step_rew:[0.0] action:(array([106, 200]),) dones:[False] ep#:0 step#:17 step_rew:[0.0] action:(array([ 51, 188]),) dones:[False] ep#:0 step#:18 step_rew:[0.0] action:(array([ 59, 210]),) dones:[False] ep#:0 step#:19 step_rew:[0.0] action:(array([ 94, 203]),) dones:[False] ep#:0 step#:20 step_rew:[0.0] action:(array([160, 208]),) dones:[False] ep#:0 step#:21 step_rew:[0.0] action:(array([160, 210]),) dones:[False] ep#:0 step#:22 step_rew:[0.0] action:(array([160, 210]),) dones:[False] ep#:0 step#:23 step_rew:[0.0] action:(array([ 50, 207]),) dones:[False] ep#:0 step#:24 step_rew:[0.0] action:(array([137, 210]),) dones:[False] ep#:0 step#:25 step_rew:[0.0] action:(array([160, 210]),) dones:[False] ep#:0 step#:26 step_rew:[0.0] action:(array([133, 204]),) dones:[False] ep#:0 step#:27 step_rew:[0.0] action:(array([ 66, 210]),) dones:[False] ep#:0 step#:28 step_rew:[0.0] action:(array([ 42, 207]),) dones:[False] ep#:0 step#:29 step_rew:[0.0] action:(array([115, 209]),) dones:[False] ep#:0 step#:30 step_rew:[0.0] action:(array([101, 210]),) dones:[False] ep#:0 step#:31 step_rew:[0.0] action:(array([160, 208]),) dones:[False] ep#:0 step#:32 step_rew:[0.0] action:(array([160, 210]),) dones:[False]
WARNING:root:Cannot call CoordClick(coords: (68, 205)) on instance 0, which is already done
ep#:0 step#:33 step_rew:[-1.0] action:(array([ 68, 205]),) dones:[True] Episode#0 Reward:-1.0 Actions:[] ep#:1 step#:1 step_rew:[0.0] action:(array([160, 197]),) dones:[False] ep#:1 step#:2 step_rew:[0.0] action:(array([160, 196]),) dones:[False] ep#:1 step#:3 step_rew:[0.0] action:(array([160, 201]),) dones:[False] ep#:1 step#:4 step_rew:[0.0] action:(array([160, 210]),) dones:[False] ep#:1 step#:5 step_rew:[0.0] action:(array([160, 210]),) dones:[False] ep#:1 step#:6 step_rew:[0.0] action:(array([160, 192]),) dones:[False] ep#:1 step#:7 step_rew:[0.0] action:(array([160, 203]),) dones:[False] ep#:1 step#:8 step_rew:[0.0] action:(array([160, 205]),) dones:[False] ep#:1 step#:9 step_rew:[0.0] action:(array([ 43, 210]),) dones:[False] ep#:1 step#:10 step_rew:[0.0] action:(array([160, 210]),) dones:[False] ep#:1 step#:11 step_rew:[0.0] action:(array([160, 210]),) dones:[False] ep#:1 step#:12 step_rew:[0.0] action:(array([160, 210]),) dones:[False] ep#:1 step#:13 step_rew:[0.0] action:(array([121, 205]),) dones:[False] ep#:1 step#:14 step_rew:[0.0] action:(array([133, 210]),) dones:[False] ep#:1 step#:15 step_rew:[0.0] action:(array([160, 206]),) dones:[False] ep#:1 step#:16 step_rew:[0.0] action:(array([ 30, 210]),) dones:[False] ep#:1 step#:17 step_rew:[0.0] action:(array([129, 210]),) dones:[False] ep#:1 step#:18 step_rew:[0.0] action:(array([160, 193]),) dones:[False] ep#:1 step#:19 step_rew:[0.0] action:(array([109, 204]),) dones:[False] ep#:1 step#:20 step_rew:[0.0] action:(array([ 0, 200]),) dones:[False] ep#:1 step#:21 step_rew:[0.0] action:(array([104, 210]),) dones:[False] ep#:1 step#:22 step_rew:[0.0] action:(array([150, 201]),) dones:[False] ep#:1 step#:23 step_rew:[0.0] action:(array([ 4, 210]),) dones:[False] ep#:1 step#:24 step_rew:[0.0] action:(array([150, 203]),) dones:[False] ep#:1 step#:25 step_rew:[0.0] action:(array([160, 210]),) dones:[False] ep#:1 step#:26 step_rew:[0.0] action:(array([124, 210]),) dones:[False] ep#:1 step#:27 step_rew:[0.0] action:(array([114, 208]),) dones:[False] ep#:1 step#:28 step_rew:[0.0] action:(array([110, 208]),) dones:[False] ep#:1 step#:29 step_rew:[0.0] action:(array([160, 210]),) dones:[False] ep#:1 step#:30 step_rew:[0.0] action:(array([142, 210]),) dones:[False] ep#:1 step#:31 step_rew:[0.0] action:(array([160, 203]),) dones:[False] ep#:1 step#:32 step_rew:[0.0] action:(array([112, 210]),) dones:[False]
WARNING:root:Cannot call CoordClick(coords: (160, 202)) on instance 0, which is already done
ep#:1 step#:33 step_rew:[-1.0] action:(array([160, 202]),) dones:[True] Episode#1 Reward:-1.0 Actions:[] WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.
WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.
INFO:tensorflow:Assets written to: trained_models/PPO_MiniWoBSocialMediaReplyVisualEnv-v0-v0/actor/1/model.savedmodel/assets
INFO:tensorflow:Assets written to: trained_models/PPO_MiniWoBSocialMediaReplyVisualEnv-v0-v0/actor/1/model.savedmodel/assets
Actor model saved at:trained_models/PPO_MiniWoBSocialMediaReplyVisualEnv-v0-v0/actor/1/model.savedmodel WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.
WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.
INFO:tensorflow:Assets written to: trained_models/PPO_MiniWoBSocialMediaReplyVisualEnv-v0-v0/critic/1/model.savedmodel/assets
INFO:tensorflow:Assets written to: trained_models/PPO_MiniWoBSocialMediaReplyVisualEnv-v0-v0/critic/1/model.savedmodel/assets
Critic model saved at:trained_models/PPO_MiniWoBSocialMediaReplyVisualEnv-v0-v0/critic/1/model.savedmodel
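# Illustrative (not in the original notebook): the SavedModel directories written above
# can be loaded back for inference, e.g. to serve the trained policy elsewhere. The path
# below mirrors the layout used by Actor.save()/Critic.save().
_loaded_actor = tf.keras.models.load_model(
    os.path.join(agent_model_path, "actor", str(agent_version), "model.savedmodel")
)
_mu, _std = _loaded_actor.predict(np.random.rand(1, 64, 64, 3))
print("loaded actor output shapes:", _mu.shape, _std.shape)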
%tensorboard --logdir /content/logs/TFRL-SocialMedia-Like-Reply-Agent/MiniWoBSocialMediaReplyVisualEnv-v0
parser = argparse.ArgumentParser(prog="TFRL-SocialMedia-Mute-User-Agent")
parser.add_argument("--env", default="MiniWoBSocialMediaMuteUserVisualEnv-v0")
parser.add_argument("--update-freq", type=int, default=16)
parser.add_argument("--epochs", type=int, default=3)
parser.add_argument("--actor-lr", type=float, default=1e-4)
parser.add_argument("--critic-lr", type=float, default=1e-4)
parser.add_argument("--clip-ratio", type=float, default=0.1)
parser.add_argument("--gae-lambda", type=float, default=0.95)
parser.add_argument("--gamma", type=float, default=0.99)
parser.add_argument("--logdir", default="logs")
args = parser.parse_args([])
logdir = os.path.join(
args.logdir, parser.prog, args.env, datetime.now().strftime("%Y%m%d-%H%M%S")
)
print(f"Saving training logs to:{logdir}")
writer = tf.summary.create_file_writer(logdir)
Saving training logs to:logs/TFRL-SocialMedia-Mute-User-Agent/MiniWoBSocialMediaMuteUserVisualEnv-v0/20211203-062327
class Actor:
def __init__(self, state_dim, action_dim, action_bound, std_bound):
self.state_dim = state_dim
self.action_dim = action_dim
self.action_bound = np.array(action_bound)
self.std_bound = std_bound
self.weight_initializer = tf.keras.initializers.he_normal()
self.eps = 1e-5
self.model = self.nn_model()
self.model.summary() # Print a summary of the Actor model
self.opt = tf.keras.optimizers.Nadam(args.actor_lr)
def nn_model(self):
obs_input = Input(self.state_dim, name="im_obs")
conv1 = Conv2D(
filters=64,
kernel_size=(3, 3),
strides=(1, 1),
padding="same",
input_shape=self.state_dim,
data_format="channels_last",
activation="relu",
)(obs_input)
pool1 = MaxPool2D(pool_size=(3, 3), strides=1)(conv1)
conv2 = Conv2D(
filters=32,
kernel_size=(3, 3),
strides=(1, 1),
padding="valid",
activation="relu",
)(pool1)
pool2 = MaxPool2D(pool_size=(3, 3), strides=1)(conv2)
conv3 = Conv2D(
filters=16,
kernel_size=(3, 3),
strides=(1, 1),
padding="valid",
activation="relu",
)(pool2)
pool3 = MaxPool2D(pool_size=(3, 3), strides=1)(conv3)
conv4 = Conv2D(
filters=8,
kernel_size=(3, 3),
strides=(1, 1),
padding="valid",
activation="relu",
)(pool3)
pool4 = MaxPool2D(pool_size=(3, 3), strides=1)(conv4)
flat = Flatten()(pool4)
dense1 = Dense(
16, activation="relu", kernel_initializer=self.weight_initializer
)(flat)
dropout1 = Dropout(0.3)(dense1)
dense2 = Dense(
8, activation="relu", kernel_initializer=self.weight_initializer
)(dropout1)
dropout2 = Dropout(0.3)(dense2)
# action_dim[0] = 2
output_val = Dense(
self.action_dim[0],
activation="relu",
kernel_initializer=self.weight_initializer,
)(dropout2)
# Scale & clip x[i] to be in range [0, action_bound[i]]
action_bound = copy.deepcopy(self.action_bound)
mu_output = Lambda(
lambda x: tf.clip_by_value(x * action_bound, 1e-9, action_bound),
name="mu_output",
)(output_val)
std_output_1 = Dense(
self.action_dim[0],
activation="softplus",
kernel_initializer=self.weight_initializer,
)(dropout2)
std_output = Lambda(
lambda x: tf.clip_by_value(
x * action_bound, 1e-9, action_bound / 2, name="std_output"
)
)(std_output_1)
return tf.keras.models.Model(
inputs=obs_input, outputs=[mu_output, std_output], name="Actor"
)
def get_action(self, state):
        # Convert a list of PIL Images to a batched np.ndarray
state_np = np.array([np.array(s) for s in state])
if len(state_np.shape) == 3:
# Convert (w, h, c) to (1, w, h, c)
state_np = np.expand_dims(state_np, 0)
mu, std = self.model.predict(state_np)
action = np.random.normal(mu[0], std[0] + self.eps, size=self.action_dim).astype(
"int"
)
# Clip action to be between 0 and max obs screen size
action = np.clip(action, 0, self.action_bound)
# 1 Action per instance of env; Env expects: (num_instances, actions)
action = (action,)
log_policy = self.log_pdf(mu, std, action)
return log_policy, action
def log_pdf(self, mu, std, action):
std = tf.clip_by_value(std, self.std_bound[0], self.std_bound[1])
var = std ** 2
log_policy_pdf = -0.5 * (action - mu) ** 2 / var - 0.5 * tf.math.log(
var * 2 * np.pi
)
return tf.reduce_sum(log_policy_pdf, 1, keepdims=True)
def compute_loss(self, log_old_policy, log_new_policy, actions, gaes):
        # Cap the log-ratio at 80 to avoid overflow, since
        # tf.exp(x) overflows for x > 88 in float32
ratio = tf.exp(
tf.minimum(log_new_policy - tf.stop_gradient(log_old_policy), 80)
)
gaes = tf.stop_gradient(gaes)
clipped_ratio = tf.clip_by_value(
ratio, 1.0 - args.clip_ratio, 1.0 + args.clip_ratio
)
surrogate = -tf.minimum(ratio * gaes, clipped_ratio * gaes)
return tf.reduce_mean(surrogate)
def train(self, log_old_policy, states, actions, gaes):
with tf.GradientTape() as tape:
mu, std = self.model(states, training=True)
log_new_policy = self.log_pdf(mu, std, actions)
loss = self.compute_loss(log_old_policy, log_new_policy, actions, gaes)
grads = tape.gradient(loss, self.model.trainable_variables)
self.opt.apply_gradients(zip(grads, self.model.trainable_variables))
return loss
def save(self, model_dir: str, version: int = 1):
actor_model_save_dir = os.path.join(
model_dir, "actor", str(version), "model.savedmodel"
)
self.model.save(actor_model_save_dir, save_format="tf")
print(f"Actor model saved at:{actor_model_save_dir}")
class Critic:
def __init__(self, state_dim):
self.state_dim = state_dim
self.weight_initializer = tf.keras.initializers.he_normal()
self.model = self.nn_model()
self.model.summary() # Print a summary of the Critic model
self.opt = tf.keras.optimizers.Nadam(args.critic_lr)
def nn_model(self):
obs_input = Input(self.state_dim)
conv1 = Conv2D(
filters=64,
kernel_size=(3, 3),
strides=(1, 1),
padding="same",
input_shape=self.state_dim,
data_format="channels_last",
activation="relu",
)(obs_input)
pool1 = MaxPool2D(pool_size=(3, 3), strides=2)(conv1)
conv2 = Conv2D(
filters=32,
kernel_size=(3, 3),
strides=(1, 1),
padding="valid",
activation="relu",
)(pool1)
pool2 = MaxPool2D(pool_size=(3, 3), strides=2)(conv2)
conv3 = Conv2D(
filters=16,
kernel_size=(3, 3),
strides=(1, 1),
padding="valid",
activation="relu",
)(pool2)
pool3 = MaxPool2D(pool_size=(3, 3), strides=1)(conv3)
conv4 = Conv2D(
filters=8,
kernel_size=(3, 3),
strides=(1, 1),
padding="valid",
activation="relu",
)(pool3)
pool4 = MaxPool2D(pool_size=(3, 3), strides=1)(conv4)
flat = Flatten()(pool4)
dense1 = Dense(
16, activation="relu", kernel_initializer=self.weight_initializer
)(flat)
dropout1 = Dropout(0.3)(dense1)
dense2 = Dense(
8, activation="relu", kernel_initializer=self.weight_initializer
)(dropout1)
dropout2 = Dropout(0.3)(dense2)
value = Dense(
1, activation="linear", kernel_initializer=self.weight_initializer
)(dropout2)
return tf.keras.models.Model(inputs=obs_input, outputs=value, name="Critic")
def compute_loss(self, v_pred, td_targets):
mse = tf.keras.losses.MeanSquaredError()
return mse(td_targets, v_pred)
def train(self, states, td_targets):
with tf.GradientTape() as tape:
v_pred = self.model(states, training=True)
# assert v_pred.shape == td_targets.shape
loss = self.compute_loss(v_pred, tf.stop_gradient(td_targets))
grads = tape.gradient(loss, self.model.trainable_variables)
self.opt.apply_gradients(zip(grads, self.model.trainable_variables))
return loss
def save(self, model_dir: str, version: int = 1):
critic_model_save_dir = os.path.join(
model_dir, "critic", str(version), "model.savedmodel"
)
self.model.save(critic_model_save_dir, save_format="tf")
print(f"Critic model saved at:{critic_model_save_dir}")
class PPOAgent:
def __init__(self, env):
self.env = env
self.state_dim = self.env.observation_space.shape
self.action_dim = self.env.action_space.shape
# Set action_bounds to be within the actual task-window/browser-view of the agent
self.action_bound = [self.env.task_width, self.env.task_height]
self.std_bound = [1e-2, 1.0]
self.actor = Actor(
self.state_dim, self.action_dim, self.action_bound, self.std_bound
)
self.critic = Critic(self.state_dim)
def gae_target(self, rewards, v_values, next_v_value, done):
n_step_targets = np.zeros_like(rewards)
gae = np.zeros_like(rewards)
gae_cumulative = 0
forward_val = 0
if not done:
forward_val = next_v_value
for k in reversed(range(0, len(rewards))):
delta = rewards[k] + args.gamma * forward_val - v_values[k]
gae_cumulative = args.gamma * args.gae_lambda * gae_cumulative + delta
gae[k] = gae_cumulative
forward_val = v_values[k]
n_step_targets[k] = gae[k] + v_values[k]
return gae, n_step_targets
def train(self, max_episodes=1000):
with writer.as_default():
for ep in range(max_episodes):
state_batch = []
action_batch = []
reward_batch = []
old_policy_batch = []
episode_reward, done = 0, False
state = self.env.reset()
prev_state = state
step_num = 0
while not done:
# self.env.render()
log_old_policy, action = self.actor.get_action(state)
next_state, reward, dones, _ = self.env.step(action)
step_num += 1
print(
f"ep#:{ep} step#:{step_num} step_rew:{reward} action:{action} dones:{dones}"
)
done = np.all(dones)
if done:
next_state = prev_state
else:
prev_state = next_state
state = np.array([np.array(s) for s in state])
next_state = np.array([np.array(s) for s in next_state])
reward = np.reshape(reward, [1, 1])
log_old_policy = np.reshape(log_old_policy, [1, 1])
state_batch.append(state)
action_batch.append(action)
reward_batch.append((reward + 8) / 8)
old_policy_batch.append(log_old_policy)
if len(state_batch) >= args.update_freq or done:
states = np.array([state.squeeze() for state in state_batch])
# Convert ([x, y],) to [x, y]
actions = np.array([action[0] for action in action_batch])
rewards = np.array(
[reward.squeeze() for reward in reward_batch]
)
old_policies = np.array(
[old_pi.squeeze() for old_pi in old_policy_batch]
)
v_values = self.critic.model.predict(states)
next_v_value = self.critic.model.predict(next_state)
gaes, td_targets = self.gae_target(
rewards, v_values, next_v_value, done
)
actor_losses, critic_losses = [], []
for epoch in range(args.epochs):
actor_loss = self.actor.train(
old_policies, states, actions, gaes
)
actor_losses.append(actor_loss)
critic_loss = self.critic.train(states, td_targets)
critic_losses.append(critic_loss)
# Plot mean actor & critic losses on every update
tf.summary.scalar("actor_loss", np.mean(actor_losses), step=ep)
tf.summary.scalar(
"critic_loss", np.mean(critic_losses), step=ep
)
state_batch = []
action_batch = []
reward_batch = []
old_policy_batch = []
episode_reward += reward[0][0]
state = next_state[0]
print(f"Episode#{ep} Reward:{episode_reward} Actions:{action_batch}")
tf.summary.scalar("episode_reward", episode_reward, step=ep)
def save(self, model_dir: str, version: int = 1):
self.actor.save(model_dir, version)
self.critic.save(model_dir, version)
if __name__ == "__main__":
env_name = args.env
env = gym.make(env_name)
cta_agent = PPOAgent(env)
cta_agent.train(max_episodes=2)
# Model saving
model_dir = "trained_models"
agent_name = f"PPO_{env_name}"
agent_version = 1
agent_model_path = os.path.join(model_dir, agent_name)
cta_agent.save(agent_model_path, agent_version)
Model: "Actor" __________________________________________________________________________________________________ Layer (type) Output Shape Param # Connected to ================================================================================================== im_obs (InputLayer) [(None, 64, 64, 3)] 0 [] conv2d_20 (Conv2D) (None, 64, 64, 64) 1792 ['im_obs[0][0]'] max_pooling2d_20 (MaxPooling2D (None, 62, 62, 64) 0 ['conv2d_20[0][0]'] ) conv2d_21 (Conv2D) (None, 60, 60, 32) 18464 ['max_pooling2d_20[0][0]'] max_pooling2d_21 (MaxPooling2D (None, 58, 58, 32) 0 ['conv2d_21[0][0]'] ) conv2d_22 (Conv2D) (None, 56, 56, 16) 4624 ['max_pooling2d_21[0][0]'] max_pooling2d_22 (MaxPooling2D (None, 54, 54, 16) 0 ['conv2d_22[0][0]'] ) conv2d_23 (Conv2D) (None, 52, 52, 8) 1160 ['max_pooling2d_22[0][0]'] max_pooling2d_23 (MaxPooling2D (None, 50, 50, 8) 0 ['conv2d_23[0][0]'] ) flatten_5 (Flatten) (None, 20000) 0 ['max_pooling2d_23[0][0]'] dense_17 (Dense) (None, 16) 320016 ['flatten_5[0][0]'] dropout_10 (Dropout) (None, 16) 0 ['dense_17[0][0]'] dense_18 (Dense) (None, 8) 136 ['dropout_10[0][0]'] dropout_11 (Dropout) (None, 8) 0 ['dense_18[0][0]'] dense_19 (Dense) (None, 2) 18 ['dropout_11[0][0]'] dense_20 (Dense) (None, 2) 18 ['dropout_11[0][0]'] mu_output (Lambda) (None, 2) 0 ['dense_19[0][0]'] lambda_2 (Lambda) (None, 2) 0 ['dense_20[0][0]'] ================================================================================================== Total params: 346,228 Trainable params: 346,228 Non-trainable params: 0 __________________________________________________________________________________________________ Model: "Critic" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= input_3 (InputLayer) [(None, 64, 64, 3)] 0 conv2d_24 (Conv2D) (None, 64, 64, 64) 1792 max_pooling2d_24 (MaxPoolin (None, 31, 31, 64) 0 g2D) conv2d_25 (Conv2D) (None, 29, 29, 32) 18464 max_pooling2d_25 (MaxPoolin (None, 14, 14, 32) 0 g2D) conv2d_26 (Conv2D) (None, 12, 12, 16) 4624 max_pooling2d_26 (MaxPoolin (None, 10, 10, 16) 0 g2D) conv2d_27 (Conv2D) (None, 8, 8, 8) 1160 max_pooling2d_27 (MaxPoolin (None, 6, 6, 8) 0 g2D) flatten_6 (Flatten) (None, 288) 0 dense_21 (Dense) (None, 16) 4624 dropout_12 (Dropout) (None, 16) 0 dense_22 (Dense) (None, 8) 136 dropout_13 (Dropout) (None, 8) 0 dense_23 (Dense) (None, 1) 9 ================================================================= Total params: 30,809 Trainable params: 30,809 Non-trainable params: 0 _________________________________________________________________ ep#:0 step#:1 step_rew:[0.0] action:(array([ 93, 111]),) dones:[False] ep#:0 step#:2 step_rew:[0.0] action:(array([ 0, 152]),) dones:[False] ep#:0 step#:3 step_rew:[0.0] action:(array([91, 35]),) dones:[False] ep#:0 step#:4 step_rew:[0.0] action:(array([20, 0]),) dones:[False] ep#:0 step#:5 step_rew:[0.0] action:(array([19, 0]),) dones:[False] ep#:0 step#:6 step_rew:[0.0] action:(array([60, 45]),) dones:[False] ep#:0 step#:7 step_rew:[0.0] action:(array([ 0, 80]),) dones:[False] ep#:0 step#:8 step_rew:[0.0] action:(array([0, 0]),) dones:[False] ep#:0 step#:9 step_rew:[0.0] action:(array([143, 0]),) dones:[False] ep#:0 step#:10 step_rew:[0.0] action:(array([0, 0]),) dones:[False] ep#:0 step#:11 step_rew:[0.0] action:(array([ 0, 142]),) dones:[False] ep#:0 step#:12 step_rew:[0.0] action:(array([ 93, 115]),) dones:[False] ep#:0 step#:13 step_rew:[0.0] action:(array([ 0, 105]),) dones:[False] ep#:0 step#:14 step_rew:[0.0] 
action:(array([0, 0]),) dones:[False] ep#:0 step#:15 step_rew:[0.0] action:(array([ 26, 113]),) dones:[False] ep#:0 step#:16 step_rew:[0.0] action:(array([97, 0]),) dones:[False] ep#:0 step#:17 step_rew:[0.0] action:(array([0, 0]),) dones:[False] ep#:0 step#:18 step_rew:[0.0] action:(array([160, 0]),) dones:[False] ep#:0 step#:19 step_rew:[0.0] action:(array([110, 51]),) dones:[False] ep#:0 step#:20 step_rew:[0.0] action:(array([84, 0]),) dones:[False] ep#:0 step#:21 step_rew:[0.0] action:(array([50, 0]),) dones:[False] ep#:0 step#:22 step_rew:[0.0] action:(array([ 0, 15]),) dones:[False] ep#:0 step#:23 step_rew:[0.0] action:(array([54, 0]),) dones:[False] ep#:0 step#:24 step_rew:[0.0] action:(array([ 0, 73]),) dones:[False] ep#:0 step#:25 step_rew:[0.0] action:(array([0, 0]),) dones:[False]
WARNING:root:Cannot call CoordClick(coords: (0, 14)) on instance 0, which is already done
ep#:0 step#:26 step_rew:[0.0] action:(array([80, 81]),) dones:[False] ep#:0 step#:27 step_rew:[-1.0] action:(array([ 0, 14]),) dones:[True] Episode#0 Reward:-1.0 Actions:[] ep#:1 step#:1 step_rew:[0.0] action:(array([ 0, 48]),) dones:[False] ep#:1 step#:2 step_rew:[0.0] action:(array([60, 83]),) dones:[False] ep#:1 step#:3 step_rew:[0.0] action:(array([95, 0]),) dones:[False] ep#:1 step#:4 step_rew:[0.0] action:(array([ 0, 184]),) dones:[False] ep#:1 step#:5 step_rew:[0.0] action:(array([0, 4]),) dones:[False] ep#:1 step#:6 step_rew:[0.0] action:(array([0, 0]),) dones:[False] ep#:1 step#:7 step_rew:[0.0] action:(array([160, 0]),) dones:[False] ep#:1 step#:8 step_rew:[0.0] action:(array([0, 0]),) dones:[False] ep#:1 step#:9 step_rew:[0.0] action:(array([29, 43]),) dones:[False] ep#:1 step#:10 step_rew:[0.0] action:(array([ 0, 81]),) dones:[False] ep#:1 step#:11 step_rew:[0.0] action:(array([65, 0]),) dones:[False] ep#:1 step#:12 step_rew:[0.0] action:(array([0, 0]),) dones:[False] ep#:1 step#:13 step_rew:[0.0] action:(array([ 0, 210]),) dones:[False] ep#:1 step#:14 step_rew:[0.0] action:(array([0, 0]),) dones:[False] ep#:1 step#:15 step_rew:[0.0] action:(array([0, 0]),) dones:[False] ep#:1 step#:16 step_rew:[0.0] action:(array([0, 0]),) dones:[False] ep#:1 step#:17 step_rew:[0.0] action:(array([0, 0]),) dones:[False] ep#:1 step#:18 step_rew:[0.0] action:(array([0, 0]),) dones:[False] ep#:1 step#:19 step_rew:[0.0] action:(array([0, 0]),) dones:[False] ep#:1 step#:20 step_rew:[0.0] action:(array([0, 0]),) dones:[False] ep#:1 step#:21 step_rew:[0.0] action:(array([ 0, 50]),) dones:[False] ep#:1 step#:22 step_rew:[0.0] action:(array([ 0, 80]),) dones:[False] ep#:1 step#:23 step_rew:[0.0] action:(array([31, 0]),) dones:[False] ep#:1 step#:24 step_rew:[0.0] action:(array([ 84, 161]),) dones:[False] ep#:1 step#:25 step_rew:[0.0] action:(array([ 0, 28]),) dones:[False] ep#:1 step#:26 step_rew:[0.0] action:(array([1, 0]),) dones:[False]
WARNING:root:Cannot call CoordClick(coords: (0, 140)) on instance 0, which is already done
ep#:1 step#:27 step_rew:[0.0] action:(array([ 0, 172]),) dones:[False] ep#:1 step#:28 step_rew:[-1.0] action:(array([ 0, 140]),) dones:[True] Episode#1 Reward:-1.0 Actions:[] WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.
WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.
INFO:tensorflow:Assets written to: trained_models/PPO_MiniWoBSocialMediaMuteUserVisualEnv-v0/actor/1/model.savedmodel/assets
INFO:tensorflow:Assets written to: trained_models/PPO_MiniWoBSocialMediaMuteUserVisualEnv-v0/actor/1/model.savedmodel/assets
Actor model saved at:trained_models/PPO_MiniWoBSocialMediaMuteUserVisualEnv-v0/actor/1/model.savedmodel WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.
WARNING:tensorflow:Compiled the loaded model, but the compiled metrics have yet to be built. `model.compile_metrics` will be empty until you train or evaluate the model.
INFO:tensorflow:Assets written to: trained_models/PPO_MiniWoBSocialMediaMuteUserVisualEnv-v0/critic/1/model.savedmodel/assets
INFO:tensorflow:Assets written to: trained_models/PPO_MiniWoBSocialMediaMuteUserVisualEnv-v0/critic/1/model.savedmodel/assets
Critic model saved at:trained_models/PPO_MiniWoBSocialMediaMuteUserVisualEnv-v0/critic/1/model.savedmodel
%tensorboard --logdir /content/logs/TFRL-SocialMedia-Mute-User-Agent/MiniWoBSocialMediaMuteUserVisualEnv-v0
parser = argparse.ArgumentParser(
prog="TFRL-SocialMedia-Mute-User-DDPGAgent"
)
parser.add_argument("--env", default="MiniWoBSocialMediaMuteUserVisualEnv-v0")
parser.add_argument("--actor_lr", type=float, default=0.0005)
parser.add_argument("--critic_lr", type=float, default=0.001)
parser.add_argument("--batch_size", type=int, default=64)
parser.add_argument("--tau", type=float, default=0.05)
parser.add_argument("--gamma", type=float, default=0.99)
parser.add_argument("--train_start", type=int, default=2000)
parser.add_argument("--logdir", default="logs")
args = parser.parse_args([])
logdir = os.path.join(
args.logdir, parser.prog, args.env, datetime.now().strftime("%Y%m%d-%H%M%S")
)
print(f"Saving training logs to:{logdir}")
writer = tf.summary.create_file_writer(logdir)
Saving training logs to:logs/TFRL-SocialMedia-Mute-User-DDPGAgent/MiniWoBSocialMediaMuteUserVisualEnv-v0/20211203-062554
class ReplayBuffer:
def __init__(self, capacity=10000):
self.buffer = deque(maxlen=capacity)
def store(self, state, action, reward, next_state, done):
self.buffer.append([state, action, reward, next_state, done])
def sample(self):
sample = random.sample(self.buffer, args.batch_size)
states, actions, rewards, next_states, done = map(np.asarray, zip(*sample))
        # Keep the (batch, height, width, channels) image layout expected by the conv nets
        states = np.stack([np.array(s) for s in states])
        next_states = np.stack([np.array(s) for s in next_states])
return states, actions, rewards, next_states, done
def size(self):
return len(self.buffer)
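# Illustrative ReplayBuffer usage (not part of the original run): store a batch worth of
# dummy transitions and draw one training sample. The observation/action shapes below are
# made up to match this chapter's 64x64x3 observations and (x, y) click actions.
_demo_buffer = ReplayBuffer(capacity=256)
for _ in range(args.batch_size):
    _dummy_obs = np.zeros((64, 64, 3), dtype=np.uint8)
    _dummy_action = np.array([80, 105])
    _demo_buffer.store(_dummy_obs, _dummy_action, 0.0, _dummy_obs, False)
_s_b, _a_b, _r_b, _ns_b, _d_b = _demo_buffer.sample()
print("sampled batch shapes:", _s_b.shape, _a_b.shape, _r_b.shape, _d_b.shape)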
class Actor:
def __init__(self, state_dim, action_dim, action_bound):
self.state_dim = state_dim
self.action_dim = action_dim
self.action_bound = action_bound
self.weight_initializer = tf.keras.initializers.he_normal()
self.eps = 1e-5
self.model = self.nn_model()
self.opt = tf.keras.optimizers.Adam(args.actor_lr)
def nn_model(self):
obs_input = Input(self.state_dim)
conv1 = Conv2D(
filters=64,
kernel_size=(3, 3),
strides=(1, 1),
padding="same",
input_shape=self.state_dim,
data_format="channels_last",
activation="relu",
)(obs_input)
pool1 = MaxPool2D(pool_size=(3, 3), strides=1)(conv1)
conv2 = Conv2D(
filters=32,
kernel_size=(3, 3),
strides=(1, 1),
padding="valid",
activation="relu",
)(pool1)
pool2 = MaxPool2D(pool_size=(3, 3), strides=1)(conv2)
conv3 = Conv2D(
filters=16,
kernel_size=(3, 3),
strides=(1, 1),
padding="valid",
activation="relu",
)(pool2)
pool3 = MaxPool2D(pool_size=(3, 3), strides=1)(conv3)
conv4 = Conv2D(
filters=8,
kernel_size=(3, 3),
strides=(1, 1),
padding="valid",
activation="relu",
)(pool3)
pool4 = MaxPool2D(pool_size=(3, 3), strides=1)(conv4)
flat = Flatten()(pool4)
dense1 = Dense(
16, activation="relu", kernel_initializer=self.weight_initializer
)(flat)
dropout1 = Dropout(0.3)(dense1)
dense2 = Dense(
8, activation="relu", kernel_initializer=self.weight_initializer
)(dropout1)
dropout2 = Dropout(0.3)(dense2)
# action_dim[0] = 2
output_val = Dense(
self.action_dim[0],
activation="relu",
kernel_initializer=self.weight_initializer,
)(dropout2)
# Scale & clip x[i] to be in range [0, action_bound[i]]
mu_output = Lambda(
lambda x: tf.clip_by_value(x * self.action_bound, 1e-9, self.action_bound)
)(output_val)
return tf.keras.models.Model(inputs=obs_input, outputs=mu_output, name="Actor")
def train(self, states, q_grads):
with tf.GradientTape() as tape:
grads = tape.gradient(
self.model(states), self.model.trainable_variables, -q_grads
)
self.opt.apply_gradients(zip(grads, self.model.trainable_variables))
def predict(self, state):
return self.model.predict(state)
def get_action(self, state):
        # Convert a list of PIL Images to a batched np.ndarray
state_np = np.array([np.array(s) for s in state])
if len(state_np.shape) == 3:
# Convert (w, h, c) to (1, w, h, c)
state_np = np.expand_dims(state_np, 0)
action = self.model.predict(state_np)
# Clip action to be between 0 and max obs screen size
action = np.clip(action, 0, self.action_bound)
# 1 Action per instance of env; Env expects: (num_instances, actions)
return action
class Critic:
def __init__(self, state_dim, action_dim):
self.state_dim = state_dim
self.action_dim = action_dim
self.weight_initializer = tf.keras.initializers.he_normal()
self.model = self.nn_model()
self.opt = tf.keras.optimizers.Adam(args.critic_lr)
    def nn_model(self):
        # Q(s, a) critic: takes both the image observation and the (x, y) action, so that
        # the [states, actions] calls in q_gradients()/train() below line up with the
        # model's inputs (this is where the imported `concatenate` layer is used).
        obs_input = Input(self.state_dim, name="im_obs")
        action_input = Input(self.action_dim, name="action")
        conv1 = Conv2D(
            filters=64,
            kernel_size=(3, 3),
            strides=(1, 1),
            padding="same",
            input_shape=self.state_dim,
            data_format="channels_last",
            activation="relu",
        )(obs_input)
        pool1 = MaxPool2D(pool_size=(3, 3), strides=2)(conv1)
        conv2 = Conv2D(
            filters=32,
            kernel_size=(3, 3),
            strides=(1, 1),
            padding="valid",
            activation="relu",
        )(pool1)
        pool2 = MaxPool2D(pool_size=(3, 3), strides=2)(conv2)
        conv3 = Conv2D(
            filters=16,
            kernel_size=(3, 3),
            strides=(1, 1),
            padding="valid",
            activation="relu",
        )(pool2)
        pool3 = MaxPool2D(pool_size=(3, 3), strides=1)(conv3)
        conv4 = Conv2D(
            filters=8,
            kernel_size=(3, 3),
            strides=(1, 1),
            padding="valid",
            activation="relu",
        )(pool3)
        pool4 = MaxPool2D(pool_size=(3, 3), strides=1)(conv4)
        flat = Flatten()(pool4)
        # Fuse the visual features with the action before estimating Q(s, a)
        obs_action = concatenate([flat, action_input])
        dense1 = Dense(
            16, activation="relu", kernel_initializer=self.weight_initializer
        )(obs_action)
        dropout1 = Dropout(0.3)(dense1)
        dense2 = Dense(
            8, activation="relu", kernel_initializer=self.weight_initializer
        )(dropout1)
        dropout2 = Dropout(0.3)(dense2)
        value = Dense(
            1, activation="linear", kernel_initializer=self.weight_initializer
        )(dropout2)
        return tf.keras.models.Model(
            inputs=[obs_input, action_input], outputs=value, name="Critic"
        )
def predict(self, inputs):
return self.model.predict(inputs)
def q_gradients(self, states, actions):
actions = tf.convert_to_tensor(actions)
with tf.GradientTape() as tape:
tape.watch(actions)
q_values = self.model([states, actions])
q_values = tf.squeeze(q_values)
return tape.gradient(q_values, actions)
def compute_loss(self, v_pred, td_targets):
mse = tf.keras.losses.MeanSquaredError()
return mse(td_targets, v_pred)
def train(self, states, actions, td_targets):
with tf.GradientTape() as tape:
v_pred = self.model([states, actions], training=True)
assert v_pred.shape == td_targets.shape
loss = self.compute_loss(v_pred, tf.stop_gradient(td_targets))
grads = tape.gradient(loss, self.model.trainable_variables)
self.opt.apply_gradients(zip(grads, self.model.trainable_variables))
return loss
class DDPGAgent:
def __init__(self, env):
self.env = env
self.state_dim = self.env.observation_space.shape
self.action_dim = self.env.action_space.shape
self.action_bound = self.env.action_space.high
self.buffer = ReplayBuffer()
self.actor = Actor(self.state_dim, self.action_dim, self.action_bound)
self.critic = Critic(self.state_dim, self.action_dim)
self.target_actor = Actor(self.state_dim, self.action_dim, self.action_bound)
self.target_critic = Critic(self.state_dim, self.action_dim)
actor_weights = self.actor.model.get_weights()
critic_weights = self.critic.model.get_weights()
self.target_actor.model.set_weights(actor_weights)
self.target_critic.model.set_weights(critic_weights)
def update_target(self):
actor_weights = self.actor.model.get_weights()
t_actor_weights = self.target_actor.model.get_weights()
critic_weights = self.critic.model.get_weights()
t_critic_weights = self.target_critic.model.get_weights()
for i in range(len(actor_weights)):
t_actor_weights[i] = (
args.tau * actor_weights[i] + (1 - args.tau) * t_actor_weights[i]
)
for i in range(len(critic_weights)):
t_critic_weights[i] = (
args.tau * critic_weights[i] + (1 - args.tau) * t_critic_weights[i]
)
self.target_actor.model.set_weights(t_actor_weights)
self.target_critic.model.set_weights(t_critic_weights)
def get_td_target(self, rewards, q_values, dones):
targets = np.asarray(q_values)
for i in range(q_values.shape[0]):
if dones[i]:
targets[i] = rewards[i]
            else:
                # Standard TD(0) target: r + gamma * Q'(s', a')
                targets[i] = rewards[i] + args.gamma * q_values[i]
return targets
def add_ou_noise(self, x, rho=0.15, mu=0, dt=1e-1, sigma=0.2, dim=1):
return (
x + rho * (mu - x) * dt + sigma * np.sqrt(dt) * np.random.normal(size=dim)
)
def replay_experience(self):
for _ in range(10):
states, actions, rewards, next_states, dones = self.buffer.sample()
target_q_values = self.target_critic.predict(
[next_states, self.target_actor.predict(next_states)]
)
td_targets = self.get_td_target(rewards, target_q_values, dones)
self.critic.train(states, actions, td_targets)
s_actions = self.actor.predict(states)
s_grads = self.critic.q_gradients(states, s_actions)
            # self.action_dim is a shape tuple like (2,), so build the target shape explicitly
            grads = np.array(s_grads).reshape((-1,) + tuple(self.action_dim))
self.actor.train(states, grads)
self.update_target()
def train(self, max_episodes=1000):
with writer.as_default():
for ep in range(max_episodes):
step_num, episode_reward, done = 0, 0, False
state = self.env.reset()
prev_state = state
bg_noise = np.random.randint(
self.env.action_space.low,
self.env.action_space.high,
self.env.action_space.shape,
)
while not done:
# self.env.render()
action = self.actor.get_action(state)
noise = self.add_ou_noise(bg_noise, dim=self.action_dim)
action = np.clip(action + noise, 0, self.action_bound).astype("int")
next_state, reward, dones, _ = self.env.step(action)
done = np.all(dones)
if done:
next_state = prev_state
else:
prev_state = next_state
                    # Store one transition per env instance: (s, a, r, s', done)
                    for (s, a, r, s_n, d) in zip(
                        state, action, reward, next_state, dones
                    ):
                        self.buffer.store(s, a, (r + 8) / 8, s_n, d)
episode_reward += r
step_num += 1 # 1 across num_instances
print(
f"ep#:{ep} step#:{step_num} step_rew:{reward} action:{action} dones:{dones}"
)
bg_noise = noise
state = next_state
if (
self.buffer.size() >= args.batch_size
and self.buffer.size() >= args.train_start
):
self.replay_experience()
print(f"Episode#{ep} Reward:{episode_reward}")
tf.summary.scalar("episode_reward", episode_reward, step=ep)
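# Illustrative Ornstein-Uhlenbeck noise rollout (not part of the original run): the
# exploration noise added in DDPGAgent.train is temporally correlated -- each new sample
# is pulled back toward the mean and perturbed, rather than drawn independently.
# add_ou_noise does not use `self`, so it can be called on its own here.
_ou_x = np.zeros(2)
_ou_trace = []
for _ in range(5):
    _ou_x = DDPGAgent.add_ou_noise(None, _ou_x, dim=2)
    _ou_trace.append(np.round(_ou_x, 3))
print("OU noise trace:", _ou_trace)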
if __name__ == "__main__":
env_name = "MiniWoBSocialMediaMuteUserVisualEnv-v0"
env = gym.make(env_name)
agent = DDPGAgent(env)
agent.train(max_episodes=2)
ep#:0 step#:1 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:0 step#:2 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:0 step#:3 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:0 step#:4 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:0 step#:5 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:0 step#:6 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:0 step#:7 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:0 step#:8 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:0 step#:9 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:0 step#:10 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:0 step#:11 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:0 step#:12 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:0 step#:13 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:0 step#:14 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:0 step#:15 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:0 step#:16 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:0 step#:17 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:0 step#:18 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:0 step#:19 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:0 step#:20 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:0 step#:21 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:0 step#:22 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:0 step#:23 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:0 step#:24 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:0 step#:25 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:0 step#:26 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:0 step#:27 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:0 step#:28 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:0 step#:29 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:0 step#:30 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:0 step#:31 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:0 step#:32 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:0 step#:33 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:0 step#:34 step_rew:[0.0] action:[[160 210]] dones:[False]
WARNING:root:Cannot call CoordClick(coords: (160, 210)) on instance 0, which is already done
ep#:0 step#:35 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:0 step#:36 step_rew:[-1.0] action:[[160 210]] dones:[True] Episode#0 Reward:-1.0 ep#:1 step#:1 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:1 step#:2 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:1 step#:3 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:1 step#:4 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:1 step#:5 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:1 step#:6 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:1 step#:7 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:1 step#:8 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:1 step#:9 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:1 step#:10 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:1 step#:11 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:1 step#:12 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:1 step#:13 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:1 step#:14 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:1 step#:15 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:1 step#:16 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:1 step#:17 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:1 step#:18 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:1 step#:19 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:1 step#:20 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:1 step#:21 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:1 step#:22 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:1 step#:23 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:1 step#:24 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:1 step#:25 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:1 step#:26 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:1 step#:27 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:1 step#:28 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:1 step#:29 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:1 step#:30 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:1 step#:31 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:1 step#:32 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:1 step#:33 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:1 step#:34 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:1 step#:35 step_rew:[0.0] action:[[160 210]] dones:[False]
WARNING:root:Cannot call CoordClick(coords: (160, 210)) on instance 0, which is already done
ep#:1 step#:36 step_rew:[0.0] action:[[160 210]] dones:[False] ep#:1 step#:37 step_rew:[-1.0] action:[[160 210]] dones:[True] Episode#1 Reward:-1.0
%tensorboard --logdir /content/logs/TFRL-SocialMedia-Mute-User-DDPGAgent/MiniWoBSocialMediaMuteUserVisualEnv-v0
!pip install -q watermark
%reload_ext watermark
%watermark -a "Sparsh A." -m -iv -u -t -d -p selenium
Author: Sparsh A. Last updated: 2021-12-03 06:32:29 selenium: 4.1.0 Compiler : GCC 7.5.0 OS : Linux Release : 5.4.104+ Machine : x86_64 Processor : x86_64 CPU cores : 2 Architecture: 64bit argparse : 1.1 sys : 3.7.12 (default, Sep 10 2021, 00:21:48) [GCC 7.5.0] gym : 0.17.3 tensorflow: 2.7.0 IPython : 5.5.0 numpy : 1.19.5