#!/usr/bin/env python
# coding: utf-8

# # Grid2Op integration with existing frameworks
# 
# Try me out interactively with: [![Binder](./img/badge_logo.svg)](https://mybinder.org/v2/gh/rte-france/Grid2Op/master)
# 
# 
# **Objectives** This notebook briefly explains how to use grid2op with commonly used RL frameworks. It also explains the main methods / classes of the `grid2op.gym_compat` module that ease the integration of grid2op with these frameworks.
# 
# It explains the ideas and shows a "self contained", somewhat minimal, example of how to use some RL frameworks with grid2op. For a more complete, easier and more concise integration, please use the "l2rpn_baselines" package.
# 
# 
# The structure is always very similar:
# 1. Create a grid2op environment
# 2. Convert it to a gym environment
# 3. (optional) Customize the action space and observation space
# 4. Use the framework to train an agent
# 5. Embed the trained agent into a grid2op Agent to take valid grid2op actions.
# 
# In this notebook, we will demonstrate its usage with 3 different frameworks. The code provided here is given as an example and we make no claim about its performance or fitness for a given use. More detailed examples will be provided in the l2rpn-baselines repository in due time (work in progress at the time of writing this notebook). The 3 frameworks we will demonstrate are:
# 
# - ray (rllib): see [ray on github](https://github.com/ray-project/ray) or [rllib on github](https://github.com/ray-project/ray/blob/master/doc/source/rllib.rst)
# - stable-baselines3: see [stable-baselines3 on github](https://github.com/DLR-RM/stable-baselines3)
# - tf_agents: see [tf_agents on github](https://github.com/tensorflow/agents)
# 
# Other RL frameworks are not covered here. If you already use them, let us know !
# - https://github.com/PaddlePaddle/PARL/blob/develop/README.md (used by the winning teams of the NeurIPS competitions !) Work in progress.
# - https://github.com/deepmind/acme
# 
# Note also that you can still use past codes from the l2rpn-baselines repository: https://github.com/rte-france/l2rpn-baselines . This repository contains code snippets that can be reused to make really nice agents for the l2rpn competitions. You can try it out :-)
# 
# 
# Execute the cell below by removing the `#` characters if you use google colab !
# 
# The cell will look like:
# ```python
# import sys
# !$sys.executable -m pip install grid2op[optional]  # for use with google colab (grid2op is not installed by default)
# !$sys.executable -m pip install tensorflow pytorch stable-baselines3 'ray[rllib]' tf_agents
# ```
# 
# It might take a while.

# In[ ]:

import sys
# !$sys.executable -m pip install grid2op[optional]  # for use with google colab (grid2op is not installed by default)
# !$sys.executable -m pip install stable-baselines3 'ray[rllib]' tf_agents


# In[ ]:

# because this notebook is part of some tests, we train the agent for only a small number of steps
nb_step_train = 0


# ## Organisation of this notebook
# 
# In this notebook, we decided to first detail the features closest to grid2op before moving on to "higher level" features that are closer to "standard" gym representations (eg `Box` and `Discrete` spaces).
# 
# Note that the closer you stay to grid2op, the more grid2op features you can use. For example, in a gym environment, it is not possible to use the "simulate" function (remember, this function gives access to a simulator whose behaviour is close to the one of the environment) at all. If you really need it, you can still reach the underlying grid2op environment, as sketched below.
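# 
# For instance, the snippet below is a minimal sketch (assuming `env_gym` is a `GymEnv` like the one created later in this notebook) showing that the wrapped grid2op environment remains reachable through its `init_env` attribute, so grid2op-only features such as `simulate` can still be used "on the side":
# 
# ```python
# g2op_env = env_gym.init_env              # the underlying grid2op environment wrapped by GymEnv
# g2op_obs = g2op_env.get_obs()            # the current *grid2op* observation (not the gym one)
# do_nothing = g2op_env.action_space({})   # a "do nothing" grid2op action
# # "simulate" the effect of this action one step ahead, without modifying the environment
# sim_obs, sim_reward, sim_done, sim_info = g2op_obs.simulate(do_nothing)
# ```
# 
# This is only a workaround: the gym observation itself does not carry these grid2op features.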
# Also, grid2op observations and actions come with a lot of different features (the capacity to add attributes, to retrieve the graph of the grid, etc.) which cannot be used directly in gym.
# 
# That being said, this notebook is organized as follows:
# 
# - [Convert it to a gym environment](#Convert-it-to-a-gym-environment): basic use of the `gym_compat` grid2op module, which converts a grid2op environment into a gym environment.
# - [Action space](#Action-space): basic usage of the action space, by removing redundant features (`gym_env.action_space.ignore_attr`) or transforming features from a continuous space to a discrete space (`ContinuousToDiscreteConverter`)
# - [Observation space](#Observation-space): basic usage of the observation space, by removing redundant features (`keep_only_attr`) or scaling the data to a certain range (`ScalerAttrConverter`)
# - [Making the grid2op agent](#Making-the-grid2op-agent) explains how to make a grid2op agent once trained. Note that a more "agent focused" view is provided in the notebook [04_TrainingAnAgent](04_TrainingAnAgent.ipynb) !
# 
# To dive deeper with a proper "hands on" approach, you can refer to one of the following notebooks that use real RL frameworks:
# 
# 1) RLLIB: see notebook [11_ray_integration](./11_ray_integration.ipynb) for more information about RLLIB
# 2) Stable baselines: see notebook [11_stable_baselines3_integration](./11_stable_baselines3_integration.ipynb) for more information about stable-baselines3
# 3) tf agents: coming soon
# 4) acme: coming soon

# ## 0) Recommended initial steps
# 
# 
# ### Split the environment into training, validation and test
# 
# As in other machine learning tasks, we highly recommend, before even trying to train an agent, to split the "chronics" (ie the episode data) into 3 datasets:
# - "train", used to train the agent
# - "val", used to validate the hyper parameters
# - "test", at which you would look only once, to report the agent performance in a scientific paper (for example)
# 
# Grid2op lets you do that with relative ease:
# 
# ```python
# import grid2op
# env_name = "l2rpn_case14_sandbox"  # or any other...
# env = grid2op.make(env_name)
# 
# # extract 1% of the "chronics" for the validation environment and 1% for the test
# # environment. The remaining 98% will be used for training.
# nm_env_train, nm_env_val, nm_env_test = env.train_val_split_random(pct_val=1., pct_test=1.)
# 
# # the names of the newly created environments are returned:
# print(f"The name of the training environment is \"{nm_env_train}\"")
# print(f"The name of the validation environment is \"{nm_env_val}\"")
# print(f"The name of the test environment is \"{nm_env_test}\"")
# ```
# 
# And now, you can use the training environment to train your agent:
# 
# ```python
# import grid2op
# env_name = "l2rpn_case14_sandbox"
# env = grid2op.make(env_name+"_train")
# ```
# 
# Be careful, on windows you might run into issues. Don't hesitate to have a look at the documentation of this function if this is the case (see https://grid2op.readthedocs.io/en/latest/environment.html#grid2op.Environment.Environment.train_val_split and https://grid2op.readthedocs.io/en/latest/environment.html#grid2op.Environment.Environment.train_val_split_random)
# 
# More information is provided here: https://grid2op.readthedocs.io/en/latest/environment.html#splitting-into-raining-validation-test-scenarios
# 
# ### Use the `experimental_read_from_local_dir` flag
# 
# This flag allows python to better "understand" the classes used by grid2op and avoids lots of issues with pickle / multiprocessing etc.
# 
# The complete documentation is available here https://grid2op.readthedocs.io/en/latest/environment.html#grid2op.Environment.BaseEnv.generate_classes
# 
# Basically, once, and only once, outside of this process, you can call:
# 
# ```python
# import grid2op
# env_name = "l2rpn_case14_sandbox"  # or any other name
# 
# env = grid2op.make(env_name, ...)  # again: redo this step each time you customize "..."
# # for example if you change the `action_class` or the `backend` etc.
# 
# env.generate_classes()
# ```
# 
# Then, each time you want to reload the same environment, you can do:
# 
# ```python
# import grid2op
# env_name = SAME NAME AS ABOVE
# env = grid2op.make(env_name,
#                    experimental_read_from_local_dir=True,
#                    ...  # SAME ENV CUSTOMIZATION AS ABOVE
#                    )
# ```
# 
# This is known to solve bugs related to multiprocessing, and also to reduce the amount of RAM used (in some cases) as well as the environment creation time (in some cases).
# 
# ### Other steps
# 
# The grid2op documentation is full of details on how to "optimize" the number of steps you can perform per second. This number can rise from a few dozen per second to around a thousand per second with proper care.
# 
# We strongly encourage you to leverage all these possibilities, which include (but are not limited to):
# - using "lightsim2grid" as a backend for a 10-15x speed up of the "env.step(...)" function
# - using "MultifolderWithCache" or "env.chronics_handler.set_chunk_size(...)" for faster "env.reset(...)", see https://grid2op.readthedocs.io/en/latest/environment.html#optimize-the-data-pipeline
# 
# 
# ### Create a grid2op environment
# 
# This is a rather standard step, with lots of inspiration drawn from the openAI gym framework, and there is nothing specific to grid2op here.

# In[ ]:

import grid2op

try:
    from lightsim2grid import LightSimBackend
    bk_cls = LightSimBackend
except ImportError as exc:
    print(f"Error: {exc} when importing faster LightSimBackend")
    from grid2op.Backend import PandaPowerBackend
    bk_cls = PandaPowerBackend

env_name = "l2rpn_case14_sandbox"
env_glop = grid2op.make(env_name, test=True, backend=bk_cls())
# NOTE: do not set the flag "test=True" for a real usage !
# NOTE: for a real usage, prefer grid2op.make(env_name+"_train") (see the paragraph above !)
# This flag is here for testing purposes !!!
obs_glop = env_glop.reset()
# obs_glop


# ### Convert it to a gym environment
# 
# To that end, we recommend using the "gym_compat" module. More information is given in the [official grid2op documentation](https://grid2op.readthedocs.io/en/latest/gym.html)

# In[ ]:

import gymnasium
import numpy as np
from grid2op.gym_compat import GymEnv

env_gym_init = GymEnv(env_glop)  # an extra, untouched gym environment (not customized below)
env_gym = GymEnv(env_glop)
print(f"The \"env_gym\" is a gym environment: {isinstance(env_gym, gymnasium.Env)}")
obs_gym, info = env_gym.reset()
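# Once converted, `env_gym` can be used like any other gymnasium environment. As a quick sanity check, here is a minimal sketch (sampling a random action, which is of course a poor policy on a power grid; the return signature follows the standard gymnasium API):
# 
# ```python
# gym_act = env_gym.action_space.sample()   # a random gym action
# gym_obs, reward, terminated, truncated, info = env_gym.step(gym_act)
# print(f"reward: {reward}, terminated: {terminated}, truncated: {truncated}")
# ```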
# ### Customize the action space and observation space
# 
# This step is optional, but highly recommended.
# 
# By default, grid2op actions and observations are huge. Even for this very simplistic example, the sizes are quite large:

# In[ ]:

dim_act_space = np.sum([np.sum(env_gym.action_space[el].shape) for el in env_gym.action_space.spaces])
print(f"The size of the action space is : "
      f"{dim_act_space}")

dim_obs_space = np.sum([np.sum(env_gym.observation_space[el].shape).astype(int)
                        for el in env_gym.observation_space.spaces])
print(f"The size of the observation space is : "
      f"{dim_obs_space}")


# #### Action space
# 
# This is partly because, in grid2op, the same concept (*eg* reconnecting a powerline) can be represented in different manners (in this case, either you "toggle a switch": if the said powerline was connected it will be disconnected, otherwise it will be reconnected; or you can say "I want this line connected regardless of its original state"). This behaviour is detailed in the [official grid2op documentation](https://grid2op.readthedocs.io/en/latest/action.html#usage-examples).
# 
# To reduce the action space (in general roughly by a factor of 2), you can for example represent these actions using only the "change" method. You can do that with:

# In[ ]:

# example: ignore the "set_bus" and "set_line_status" types of actions, which are covered by
# "change_bus" and "change_line_status"
#print(env_gym.action_space)
#print()
env_gym.action_space = env_gym.action_space.ignore_attr("set_bus").ignore_attr("set_line_status")
#print(env_gym.action_space)
#print()
new_dim_act_space = np.sum([np.sum(env_gym.action_space[el].shape) for el in env_gym.action_space.spaces])
print(f"The new size of the action space is : {new_dim_act_space}")


# Grid2op environments allow for both continuous and discrete actions. For the sake of the example, let's "convert" the continuous actions into discrete ones (this is done by "binning" the values, as explained in more detail [in the documentation](https://grid2op.readthedocs.io/en/latest/gym.html#grid2op.gym_compat.ContinuousToDiscreteConverter))

# In[ ]:

# example: convert the continuous action type "redispatch" to a discrete action type
from grid2op.gym_compat import ContinuousToDiscreteConverter
env_gym.action_space = env_gym.action_space.reencode_space("redispatch",
                                                           ContinuousToDiscreteConverter(nb_bins=11)
                                                           )


# In[ ]:

# And now our action space looks like:
env_gym.action_space


# You also have the possibility to use other, more common, types of action spaces.
# 
# #### More customization for the action space
# 
# For example, just like in most Atari games, you can encode each unary action by an integer (for example "0" might be "turn left", "1" "turn right" etc.) and have your agent predict the ID of the action instead of its complex form.
# 
# This action space will "automatically" transform continuous actions into discrete ones by "binning" (more information is available in the official documentation of the `gym_compat` module).
# 
# This can be achieved with:

# In[ ]:

from grid2op.gym_compat import DiscreteActSpace
env_gym.action_space = DiscreteActSpace(env_gym.init_env.action_space)
print(f"There are {env_gym.action_space.n} independent actions")
env_gym.action_space


# You can customize it even further, for example if you have at your disposal a list of grid2op actions you want to use (and exclude the others); this is explained in the documentation.
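# To see what a given action ID actually does on the grid, you can decode it back into a grid2op action. A minimal sketch (here with a randomly sampled ID; printing a grid2op action gives a human readable description):
# 
# ```python
# act_id = env_gym.action_space.sample()        # an integer in [0, env_gym.action_space.n)
# print(env_gym.action_space.from_gym(act_id))  # the corresponding grid2op action, described in plain text
# ```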
# #### Observation space
# 
# For the observation space, we will remove lots of useless attributes (remember, it is for the sake of the example here) and rescale some others so that their values lie roughly between 0. and 1., which stabilizes the learning process.

# In[ ]:

# first let's see which attributes are in the observation space:
# More information on
# https://beta-grid2op.readthedocs.io/en/latest/observation.html#main-observation-attributes
# and
# https://grid2op.readthedocs.io/en/latest/gym.html#observation-space-and-action-space-customization
env_gym.observation_space


# Let's keep only the flows on the powerlines `rho`, the generation `gen_p`, the loads `load_p`, the representation of the topology `topo_vect` and the current dispatch `actual_dispatch` (for the sake of the example, once again)

# In[ ]:

env_gym.observation_space = env_gym.observation_space.keep_only_attr(["rho", "gen_p", "load_p",
                                                                      "topo_vect", "actual_dispatch"])
new_dim_obs_space = np.sum([np.sum(env_gym.observation_space[el].shape).astype(int)
                            for el in env_gym.observation_space.spaces])
print(f"The new size of the observation space is : "
      f"{new_dim_obs_space} (it was {dim_obs_space} before!)")


# One other detail here: the generations and loads are not scaled (they are given in MW). We recommend scaling them to have values roughly between 0 and 1 for stability during learning.
# 
# This can be done pretty easily with the code below:

# In[ ]:

from grid2op.gym_compat import ScalerAttrConverter
from gymnasium.spaces import Box

ob_space = env_gym.observation_space
ob_space = ob_space.reencode_space("actual_dispatch",
                                   ScalerAttrConverter(substract=0.,
                                                       divide=env_glop.gen_pmax
                                                       )
                                   )
ob_space = ob_space.reencode_space("gen_p",
                                   ScalerAttrConverter(substract=0.,
                                                       divide=env_glop.gen_pmax
                                                       )
                                   )
ob_space = ob_space.reencode_space("load_p",
                                   ScalerAttrConverter(substract=obs_gym["load_p"],
                                                       divide=0.5 * obs_gym["load_p"]
                                                       )
                                   )
env_gym.observation_space = ob_space

# for even more customization, you can use any function you want !
shape_ = (env_glop.dim_topo, env_glop.dim_topo)
env_gym.observation_space.add_key("connectivity_matrix",
                                  # can be any function taking a grid2op observation and returning a numpy array
                                  lambda obs: obs.connectivity_matrix(),
                                  # this "Box" should represent the return type of the above function
                                  Box(shape=shape_,
                                      low=np.zeros(shape_),
                                      high=np.ones(shape_),
                                      )
                                  )
env_gym.observation_space


# For the next notebooks, we use the following environment wrapper:
# 
# ```python
# from gymnasium import Env
# from gymnasium.spaces import Discrete, MultiDiscrete, Box
# import json
# 
# import ray
# from ray.rllib.algorithms.ppo import PPOConfig
# from ray.rllib.algorithms import ppo
# 
# from typing import Dict, Literal, Any
# import copy
# 
# import grid2op
# from grid2op.gym_compat import GymEnv, BoxGymObsSpace, DiscreteActSpace, BoxGymActSpace, MultiDiscreteActSpace
# from lightsim2grid import LightSimBackend
# 
# 
# class Grid2opEnvWrapper(Env):
#     def __init__(self,
#                  env_config: Dict[Literal["backend_cls",
#                                           "backend_options",
#                                           "env_name",
#                                           "env_is_test",
#                                           "obs_attr_to_keep",
#                                           "act_type",
#                                           "act_attr_to_keep"],
#                                   Any] = None):
#         super().__init__()
#         if env_config is None:
#             env_config = {}
# 
#         # handle the backend
#         backend_cls = LightSimBackend
#         if "backend_cls" in env_config:
#             backend_cls = env_config["backend_cls"]
#         backend_options = {}
#         if "backend_options" in env_config:
#             backend_options = env_config["backend_options"]
#         backend = backend_cls(**backend_options)
# 
#         # create the grid2op environment
#         env_name = "l2rpn_case14_sandbox"
#         if "env_name" in env_config:
#             env_name = env_config["env_name"]
#         if "env_is_test" in env_config:
#             is_test = bool(env_config["env_is_test"])
#         else:
#             is_test = False
#         self._g2op_env = grid2op.make(env_name, backend=backend, test=is_test)
#         # NB: by default this might be really slow (when the environment is reset)
#         # see https://grid2op.readthedocs.io/en/latest/data_pipeline.html for maybe 10x speed ups !
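#         # For example (untested sketch; the exact usage may vary with your grid2op version, see the link above):
#         #   from grid2op.Chronics import MultifolderWithCache   # then pass chronics_class=MultifolderWithCache to grid2op.make
#         #   self._g2op_env.chronics_handler.set_chunk_size(128)  # or read the time series in smaller chunks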
#         # TODO customize reward or action_class for example !
# 
#         # create the gym env (from grid2op)
#         self._gym_env = GymEnv(self._g2op_env)
# 
#         # customize observation space
#         obs_attr_to_keep = ["rho", "p_or", "gen_p", "load_p"]
#         if "obs_attr_to_keep" in env_config:
#             obs_attr_to_keep = copy.deepcopy(env_config["obs_attr_to_keep"])
#         self._gym_env.observation_space.close()
#         self._gym_env.observation_space = BoxGymObsSpace(self._g2op_env.observation_space,
#                                                          attr_to_keep=obs_attr_to_keep
#                                                          )
#         # export observation space for the Grid2opEnv
#         self.observation_space = Box(shape=self._gym_env.observation_space.shape,
#                                      low=self._gym_env.observation_space.low,
#                                      high=self._gym_env.observation_space.high)
# 
#         # customize the action space
#         act_type = "discrete"
#         if "act_type" in env_config:
#             act_type = env_config["act_type"]
# 
#         self._gym_env.action_space.close()
#         if act_type == "discrete":
#             # user wants a discrete action space
#             act_attr_to_keep = ["set_line_status_simple", "set_bus"]
#             if "act_attr_to_keep" in env_config:
#                 act_attr_to_keep = copy.deepcopy(env_config["act_attr_to_keep"])
#             self._gym_env.action_space = DiscreteActSpace(self._g2op_env.action_space,
#                                                           attr_to_keep=act_attr_to_keep)
#             self.action_space = Discrete(self._gym_env.action_space.n)
#         elif act_type == "box":
#             # user wants continuous action space
#             act_attr_to_keep = ["redispatch", "set_storage", "curtail"]
#             if "act_attr_to_keep" in env_config:
#                 act_attr_to_keep = copy.deepcopy(env_config["act_attr_to_keep"])
#             self._gym_env.action_space = BoxGymActSpace(self._g2op_env.action_space,
#                                                         attr_to_keep=act_attr_to_keep)
#             self.action_space = Box(shape=self._gym_env.action_space.shape,
#                                     low=self._gym_env.action_space.low,
#                                     high=self._gym_env.action_space.high)
#         elif act_type == "multi_discrete":
#             # user wants a multi-discrete action space
#             act_attr_to_keep = ["one_line_set", "one_sub_set"]
#             if "act_attr_to_keep" in env_config:
#                 act_attr_to_keep = copy.deepcopy(env_config["act_attr_to_keep"])
#             self._gym_env.action_space = MultiDiscreteActSpace(self._g2op_env.action_space,
#                                                                attr_to_keep=act_attr_to_keep)
#             self.action_space = MultiDiscrete(self._gym_env.action_space.nvec)
#         else:
#             raise NotImplementedError(f"action type '{act_type}' is not currently supported.")
# 
#     def reset(self, seed=None, options=None):
#         # use default _gym_env (from grid2op.gym_compat module)
#         # NB: here you can also specify "default options" when you reset, for example:
#         # - limiting the duration of the episode "max step"
#         # - starting at different steps "init ts"
#         # - study difficult scenario "time serie id"
#         # - specify an initial state of your grid "init state"
#         return self._gym_env.reset(seed=seed, options=options)
# 
#     def step(self, action):
#         # use default _gym_env (from grid2op.gym_compat module)
#         return self._gym_env.step(action)
# ```
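# As a quick illustration, the wrapper above could be used like this (a minimal sketch; the configuration keys are the ones handled in `__init__` above, and `env_is_test=True` is only there to keep the example small, remove it for a real usage):
# 
# ```python
# env_config = {
#     "env_name": "l2rpn_case14_sandbox",
#     "env_is_test": True,                            # remove this for a real usage
#     "obs_attr_to_keep": ["rho", "gen_p", "load_p"],
#     "act_type": "discrete",
# }
# wrapped_env = Grid2opEnvWrapper(env_config)
# obs, info = wrapped_env.reset()
# obs, reward, terminated, truncated, info = wrapped_env.step(wrapped_env.action_space.sample())
# ```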
# ## Making the grid2op agent
# 
# In this subsection we briefly explain how to wrap the trained agent (see below for the training methods, depending on the framework you want to use). The goal is to make this "tutorial" complete, in the sense that you will be able to use the trained agent in the regular grid2op framework, for example with the `Runner`.
# 
# This subsection is compatible with all the code explained in this notebook, even though we demonstrate it with the env created above.
# 
# The basic idea is really simple: you create a grid2op agent, initialize it with the gym environment (the one you got from the `gym_compat` module) and use the `gym_env.observation_space.to_gym` and `gym_env.action_space.from_gym` functions to convert the observations and the actions.

# In[ ]:

from grid2op.Agent import BaseAgent


class AgentFromGym(BaseAgent):
    def __init__(self, gym_env, trained_agent):
        self.gym_env = gym_env
        BaseAgent.__init__(self, gym_env.init_env.action_space)
        self.trained_agent = trained_agent

    def act(self, obs, reward, done):
        gym_obs = self.gym_env.observation_space.to_gym(obs)
        gym_act = self.trained_agent.act(gym_obs, reward, done)
        grid2op_act = self.gym_env.action_space.from_gym(gym_act)
        return grid2op_act


# And this is it. You are done ;-)
# 
# We recommend you to read the notebook [04_TrainingAnAgent](./04_TrainingAnAgent.ipynb) for more information about this "template" agent. And most importantly, some examples of such agents (and "better" grid2op environments) are provided in the "l2rpn_baselines" package.
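# Once wrapped this way, the agent can for instance be evaluated with the standard grid2op `Runner`. A minimal sketch (here `my_trained_agent` is assumed to be whatever object your RL framework produced, exposing an `act(gym_obs, reward, done)` method as used above):
# 
# ```python
# from grid2op.Runner import Runner
# 
# my_grid2op_agent = AgentFromGym(env_gym, my_trained_agent)
# runner = Runner(**env_glop.get_params_for_runner(),
#                 agentClass=None,
#                 agentInstance=my_grid2op_agent)
# res = runner.run(nb_episode=2)  # evaluate the agent on 2 episodes
# ```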
# ## 1) RLLIB
# 
# To make it easier to get started, we moved this part into the notebook [11_ray_integration](./11_ray_integration.ipynb)
# 
# Please have a look at this notebook for more information.

# ## 2) Stable baselines
# 
# To make it easier to get started, we moved this part into the notebook [11_stable_baselines3_integration](./11_stable_baselines3_integration.ipynb)
# 
# Please have a look at this notebook for more information.

# ## 3) Tf Agents

# Lastly, the RL framework we will use is tf_agents.
# 
# Compared to the previous ones, this framework is more verbose. In this notebook we will mimic what is done in https://github.com/tensorflow/agents/blob/master/docs/tutorials/1_dqn_tutorial.ipynb
# 
# To that end, we will introduce the last "gym transformer" available in grid2op at the time of writing. This converter transforms the action space into a `Discrete` one. With this modeling, the agent can take an action on a substation, or act on a powerline, or perform some redispatching. But, as opposed to what was done previously, it cannot act on, say, a substation and a powerline at the same time.
# 
# This limitation does not come from tf agents, but it is necessary to run the DQN tutorial provided with tf agents.
# 
# 
# First we will build the observation space in the same fashion as for stable-baselines3. See section [2) Stable baselines](#2\)-Stable-baselines) (and the notebook it points to) for more information.
# 
# ### Observation space

# In[ ]:

# create the gym environment
env_tfa = GymEnv(env_glop)  # tfa for "tf agents"
glop_obs = env_glop.reset()


# In[ ]:

# customize the observation space
from grid2op.gym_compat import BoxGymObsSpace

env_tfa.observation_space = BoxGymObsSpace(env_tfa.init_env.observation_space,
                                           attr_to_keep=["gen_p", "load_p", "topo_vect",
                                                         "rho", "actual_dispatch", "connectivity_matrix"],
                                           divide={"gen_p": env_glop.gen_pmax,
                                                   "load_p": glop_obs.load_p,
                                                   "actual_dispatch": env_glop.gen_pmax},
                                           functs={"connectivity_matrix": (
                                               lambda grid2obs: grid2obs.connectivity_matrix().flatten(),
                                               0., 1., None, None,
                                           )
                                           }
                                           )
obs_gym, info = env_tfa.reset()


# Again, the observation space might need to be customized further. We do not claim that everything kept here is relevant, nor that all the information an agent would need is included.
# 
# This example is only here to demonstrate how to use grid2op with the openai gym framework.
# 
# ### Action space
# 
# As opposed to the previous action space, to follow the tf agents tutorial we need to customize the action space so that it outputs a single number (the id of the action you want to take).
# 
# This can be done with the `DiscreteActSpace` gym converter, which behaves approximately the same way as `MultiDiscreteActSpace` does.

# In[ ]:

from grid2op.gym_compat import DiscreteActSpace

reencoded_act_space = DiscreteActSpace(env_tfa.init_env.action_space,
                                       attr_to_keep=["set_line_status", "set_bus", "redispatch"])
env_tfa.action_space = reencoded_act_space
obs_gym, info = env_tfa.reset()
print(env_tfa.action_space.from_gym(env_tfa.action_space.sample()))


# In[ ]:

print(env_tfa.action_space.from_gym(env_tfa.action_space.sample()))


# ### Wrapping up and start training
# 
# And that is it. All the rest is done thanks to tf agents.
# 
# tf agents is a lot more verbose than ray and stable baselines, but it allows for more control over what you want to do. For the sake of the example, we will only show the steps without detailing them.
# 
# For more information, you can visit their github:
# https://github.com/tensorflow/agents
# 
# website:
# https://www.tensorflow.org/agents/api_docs/python/tf_agents
# 
# and the notebook that inspired this one:
# https://colab.research.google.com/github/tensorflow/agents/blob/master/docs/tutorials/1_dqn_tutorial.ipynb

# Note: the code below, once again, only aims at showing how to integrate grid2op with tf agents. Its aim is not to showcase the best use of tensorflow, tf agents or grid2op.
# 
# It is only an example for demonstration purposes and does not aim at providing an interesting agent at all. For that you might want to use something different than DQN, tune the hyper parameters (including the size of each neural network, the number of steps for which you train, the learning rate, etc.), define the action space and the observation space in a better fashion, etc.

# In[ ]:

import copy

import tensorflow as tf

from tf_agents.agents.dqn import dqn_agent
from tf_agents.environments import tf_py_environment
from tf_agents.networks import sequential
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import trajectory
from tf_agents.specs import tensor_spec
from tf_agents.utils import common

# initialize the environment
from tf_agents.environments.gym_wrapper import GymWrapper
tf_env_train = tf_py_environment.TFPyEnvironment(GymWrapper(env_tfa))
eval_env = tf_py_environment.TFPyEnvironment(GymWrapper(copy.deepcopy(env_tfa)))

# meta parameters
num_iterations = nb_step_train
initial_collect_steps = 100
collect_steps_per_iteration = 1
replay_buffer_max_length = 100000

batch_size = 64
learning_rate = 1e-3
log_interval = 200

num_eval_episodes = 10
eval_interval = 1000

# neural nets (for the agents)
fc_layer_params = (100, 50)
action_tensor_spec = tensor_spec.from_spec(tf_env_train.action_spec())
num_actions = action_tensor_spec.maximum - action_tensor_spec.minimum + 1


# Define a helper function to create Dense layers configured with the right
# activation and kernel initializer.
def dense_layer(num_units):
    return tf.keras.layers.Dense(
        num_units,
        activation=tf.keras.activations.relu,
        kernel_initializer=tf.keras.initializers.VarianceScaling(
            scale=2.0, mode='fan_in', distribution='truncated_normal'))


# QNetwork consists of a sequence of Dense layers followed by a dense layer
# with `num_actions` units to generate one q_value per available action as
# its output.
dense_layers = [dense_layer(num_units) for num_units in fc_layer_params]
q_values_layer = tf.keras.layers.Dense(
    num_actions,
    activation=None,
    kernel_initializer=tf.keras.initializers.RandomUniform(
        minval=-0.03, maxval=0.03),
    bias_initializer=tf.keras.initializers.Constant(-0.2))
q_net = sequential.Sequential(dense_layers + [q_values_layer])

# optimizer (for training)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

# just a variable to count the number of training steps performed
train_step_counter = tf.Variable(0)

# create the agent
agent = dqn_agent.DqnAgent(
    tf_env_train.time_step_spec(),
    tf_env_train.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=train_step_counter)
agent.initialize()

# for exploration
random_policy = random_tf_policy.RandomTFPolicy(tf_env_train.time_step_spec(),
                                                tf_env_train.action_spec())

# replay buffer (to store the past actions / states / rewards)
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=tf_env_train.batch_size,
    max_length=replay_buffer_max_length)


def collect_step(environment, policy, buffer):
    time_step = environment.current_time_step()
    action_step = policy.action(time_step)
    next_time_step = environment.step(action_step.action)
    traj = trajectory.from_transition(time_step, action_step, next_time_step)

    # Add trajectory to the replay buffer
    buffer.add_batch(traj)


def collect_data(env, policy, buffer, steps):
    for _ in range(steps):
        collect_step(env, policy, buffer)


collect_data(tf_env_train, random_policy, replay_buffer, initial_collect_steps)

# generate the datasets
# Dataset generates trajectories with shape [Bx2x...]
dataset = replay_buffer.as_dataset(
    num_parallel_calls=3,
    sample_batch_size=batch_size,
    num_steps=2).prefetch(3)

iterator = iter(dataset)

# train it
# (Optional) Optimize by wrapping some of the code in a graph using TF function.
agent.train = common.function(agent.train)

# Reset the train step
agent.train_step_counter.assign(0)


# Evaluate the agent's policy once before training.
def compute_avg_return(environment, policy, num_episodes=10):
    total_return = 0.0
    for _ in range(num_episodes):

        time_step = environment.reset()
        episode_return = 0.0

        while not time_step.is_last():
            action_step = policy.action(time_step)
            time_step = environment.step(action_step.action)
            episode_return += time_step.reward
        total_return += episode_return

    avg_return = total_return / num_episodes
    return avg_return.numpy()[0]


# See also the metrics module for standard implementations of different metrics.
# https://github.com/tensorflow/agents/tree/master/tf_agents/metrics
avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
returns = [avg_return]

for _ in range(num_iterations):

    # Collect a few steps using collect_policy and save to the replay buffer.
    collect_data(tf_env_train, agent.collect_policy, replay_buffer, collect_steps_per_iteration)

    # Sample a batch of data from the buffer and update the agent's network.
    experience, unused_info = next(iterator)
    loss_info = agent.train(experience)
    train_loss = loss_info.loss

    step = agent.train_step_counter.numpy()

    if step % log_interval == 0:
        print('step = {0}: loss = {1}'.format(step, train_loss))

    if step % eval_interval == 0:
        avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
        print('step = {0}: Average Return = {1}'.format(step, avg_return))
        returns.append(avg_return)

avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
if num_iterations:
    print('Final Average Return after training for {} steps: {}'.format(step, avg_return))
    returns.append(avg_return)


# If you want to use another RL framework, let us know by filling in a github issue using the template here: https://github.com/rte-france/Grid2Op/issues/new?assignees=&labels=enhancement&template=feature_request.md&title=
# 
# Even better, if you have already used another RL framework, let us know and we will find a way to integrate your development into this notebook ! You can open an issue at https://github.com/rte-france/Grid2Op/issues/new?assignees=&labels=documentation&template=documentation.md&title= explaining which framework you used and providing a minimal code example we could use.