#!/usr/bin/env python
# coding: utf-8

# # DiscreteDP
#
# ***Getting Started with a Simple Example***
#
# **Daisuke Oyama**
#
# *Faculty of Economics, University of Tokyo*

# This notebook demonstrates via a simple example how to use the `DiscreteDP` module.

# In[1]:


from __future__ import division, print_function
import numpy as np
from scipy import sparse
from quantecon.markov import DiscreteDP


# ## A two-state example

# Let us consider the following two-state dynamic program,
# taken from Puterman (2005), Section 3.1, pp. 33-35;
# see also Example 6.2.1, pp. 155-156.
#
# * There are two possible states, $0$ and $1$.
#
# * At state $0$, you may choose either "stay", say action $0$, or "move", action $1$.
#
# * At state $1$, there is no way to move, so you can only stay, i.e.,
#   $0$ is the only available action.
#   (You may alternatively distinguish between the action "stay" at state $0$
#   and that at state $1$, and call the latter action $2$;
#   but here we choose to refer to both actions as action $0$.)
#
# * At state $0$,
#   if you choose action $0$ (stay),
#   then you receive a reward $5$, and
#   in the next period the state will remain at $0$ with probability $1/2$,
#   but it moves to $1$ with probability $1/2$.
#
# * If you choose action $1$ (move),
#   then you receive a reward $10$, and
#   the state in the next period will be $1$ with probability $1$.
#
# * At state $1$, where the only action you can take is $0$ (stay),
#   you receive a reward $-1$, and
#   the state will remain at $1$ with probability $1$.
#
# * You want to maximize the sum of discounted expected reward flows
#   with discount factor $\beta \in [0, 1)$.

# The optimization problem consists of:
#
# * the state space: $S = \{0, 1\}$;
#
# * the action space: $A = \{0, 1\}$;
#
# * the set of feasible state-action pairs
#   $\mathit{SA} = \{(0, 0), (0, 1), (1, 0)\} \subset S \times A$;
#
# * the reward function $r\colon \mathit{SA} \to \mathbb{R}$, where
#   $$
#   r(0, 0) = 5,\ r(0, 1) = 10,\ r(1, 0) = -1;
#   $$
#
# * the transition probability function $q \colon \mathit{SA} \to \Delta(S)$, where
#   $$
#   \begin{aligned}
#   &(q(0 | 0, 0), q(1 | 0, 0)) = (1/2, 1/2), \\
#   &(q(0 | 0, 1), q(1 | 0, 1)) = (0, 1), \\
#   &(q(0 | 1, 0), q(1 | 1, 0)) = (0, 1);
#   \end{aligned}
#   $$
#
# * the discount factor $\beta \in [0, 1)$.

# The Bellman equation for this problem is:
# $$
# \begin{aligned}
# v(0) &= \max \left\{5 + \beta \left(\frac{1}{2} v(0) + \frac{1}{2} v(1)\right),
#                     10 + \beta v(1)\right\}, \\
# v(1) &= (-1) + \beta v(1).
# \end{aligned}
# $$

# This problem is simple enough to solve by hand:
# the optimal value function $v^*$ is given by
# $$
# \begin{aligned}
# &v^*(0) =
# \begin{cases}
# \dfrac{5 - 5.5 \beta}{(1 - 0.5 \beta) (1 - \beta)} & \text{if $\beta > \frac{10}{11}$} \\
# \dfrac{10 - 11 \beta}{1 - \beta} & \text{otherwise},
# \end{cases}\\
# &v^*(1) = -\frac{1}{1 - \beta},
# \end{aligned}
# $$
# and the optimal policy function $\sigma^*$ is given by
# $$
# \begin{aligned}
# &\sigma^*(0) =
# \begin{cases}
# 0 & \text{if $\beta > \frac{10}{11}$} \\
# 1 & \text{otherwise},
# \end{cases}\\
# &\sigma^*(1) = 0.
# \end{aligned}
# $$

# In[2]:


def v_star(beta):
    # Analytical optimal value function
    v = np.empty(2)
    v[1] = -1 / (1 - beta)
    if beta > 10/11:
        v[0] = (5 - 5.5*beta) / ((1 - 0.5*beta) * (1 - beta))
    else:
        v[0] = (10 - 11*beta) / (1 - beta)
    return v


# We want to solve this problem numerically by using the `DiscreteDP` class.
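# Before doing so, as a quick sanity check (not part of the original notebook),
# we can verify numerically that `v_star` is a fixed point of the Bellman equation
# displayed above. The helper `bellman_rhs` below is purely illustrative.

# In[ ]:


def bellman_rhs(v, beta):
    # Right-hand side of the Bellman equation for this two-state example
    out = np.empty(2)
    out[0] = max(5 + beta * (0.5*v[0] + 0.5*v[1]), 10 + beta * v[1])
    out[1] = -1 + beta * v[1]
    return out


beta_check = 0.95
np.allclose(bellman_rhs(v_star(beta_check), beta_check), v_star(beta_check))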
# We will set $\beta = 0.95$ ($> 10/11$), for which the analytical solution is
# $\sigma^* = (0, 0)$ and the following $v^*$:

# In[3]:


v_star(beta=0.95)


# ### Formulating the model

# There are two ways to represent the data for instantiating a `DiscreteDP` object.
# Let $n$, $m$, and $L$ denote the numbers of states, actions,
# and feasible state-action pairs, respectively;
# in the above example, $n = 2$, $m = 2$, and $L = 3$.
#
# 1. `DiscreteDP(R, Q, beta)`
#
#    with parameters:
#
#    * $n \times m$ reward array `R`,
#    * $n \times m \times n$ transition probability array `Q`, and
#    * discount factor `beta`,
#
#    where `R[s, a]` is the reward for action `a` when the state is `s` and
#    `Q[s, a, s']` is the probability that the state in the next period is `s'`
#    when the current state is `s` and the action chosen is `a`.
#
# 2. `DiscreteDP(R, Q, beta, s_indices, a_indices)`
#
#    with parameters:
#
#    * length $L$ reward vector `R`,
#    * $L \times n$ transition probability array `Q`,
#    * discount factor `beta`,
#    * length $L$ array `s_indices`, and
#    * length $L$ array `a_indices`,
#
#    where the pairs `(s_indices[0], a_indices[0])`, ..., `(s_indices[L-1], a_indices[L-1])`
#    enumerate the feasible state-action pairs, and
#    `R[i]` is the reward for action `a_indices[i]` when the state is `s_indices[i]` and
#    `Q[i, s']` is the probability that the state in the next period is `s'`
#    when the current state is `s_indices[i]` and the action chosen is `a_indices[i]`.

# ### Creating a `DiscreteDP` instance

# Let us illustrate the two formulations with the simple example from the outset.

# #### Product formulation

# This formulation is straightforward
# when the number of feasible actions is constant across states,
# so that the set of feasible state-action pairs is naturally represented
# by the product $S \times A$.
# In fact, any problem can be represented in this way
# by defining the reward `R[s, a]` to be $-\infty$ when action `a` is infeasible at state `s`.

# To apply this approach to the current example,
# we consider the effectively equivalent problem
# in which at both states $0$ and $1$,
# both actions $0$ (stay) and $1$ (move) are available,
# but action $1$ yields a reward $-\infty$ at state $1$.

# The reward array `R` is an $n \times m$ 2-dimensional array:

# In[4]:


R = [[5, 10],
     [-1, -float('inf')]]


# The transition probability array `Q` is an $n \times m \times n$ 3-dimensional array:

# In[5]:


Q = [[(0.5, 0.5), (0, 1)],
     [(0, 1), (0.5, 0.5)]]  # Probabilities in Q[1, 1] are arbitrary


# Note that the transition probabilities for the state-action pair $(s, a) = (1, 1)$ are arbitrary,
# since $a = 1$ is infeasible at $s = 1$ in the original problem.

# Let us set the discount factor $\beta$ to be $0.95$:

# In[6]:


beta = 0.95


# We are ready to create a `DiscreteDP` instance:

# In[7]:


ddp = DiscreteDP(R, Q, beta)
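# As a brief aside (not part of the original notebook), the instance bundles the
# Bellman operator of the problem. Assuming the `bellman_operator` and
# `compute_greedy` methods available in recent versions of `quantecon`, we can
# apply one step of the operator to a trial value function and read off the
# corresponding greedy policy; note how the $-\infty$ reward rules out the
# infeasible action $1$ at state $1$:

# In[ ]:


v_trial = np.zeros(2)                       # an arbitrary trial value function
Tv = ddp.bellman_operator(v_trial)          # one application of the Bellman operator
sigma_greedy = ddp.compute_greedy(v_trial)  # greedy policy with respect to v_trial
Tv, sigma_greedy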
# #### State-action pairs formulation

# When the number of feasible actions varies across states,
# it can be inefficient in terms of memory usage
# to extend the domain by treating infeasible actions
# as "feasible but yielding reward $-\infty$".
# This formulation takes the set of feasible state-action pairs as is,
# defining `R` to be a 1-dimensional array of length `L`
# and `Q` to be a 2-dimensional array of shape `(L, n)`,
# where `L` is the number of feasible state-action pairs.

# First, we have to list all the feasible state-action pairs.
# For our example, they are: $(s, a) = (0, 0), (0, 1), (1, 0)$.

# The arrays `s_indices` and `a_indices`, both of length $3$,
# contain the corresponding indices of states and actions, respectively.

# In[8]:


s_indices = [0, 0, 1]  # State indices
a_indices = [0, 1, 0]  # Action indices


# The reward vector `R` is a length $L$ 1-dimensional array:

# In[9]:


# Rewards for (s, a) = (0, 0), (0, 1), (1, 0), respectively
R = [5, 10, -1]


# The transition probability array `Q` is an $L \times n$ 2-dimensional array:

# In[10]:


# Probability vectors for (s, a) = (0, 0), (0, 1), (1, 0), respectively
Q = [(0.5, 0.5), (0, 1), (0, 1)]


# For the discount factor, set $\beta = 0.95$ as before:

# In[11]:


beta = 0.95


# Now create a `DiscreteDP` instance:

# In[12]:


ddp_sa = DiscreteDP(R, Q, beta, s_indices, a_indices)


# ##### Notes

# Importantly, this formulation allows us to represent the transition probability array `Q`
# as a [`scipy.sparse`](http://docs.scipy.org/doc/scipy/reference/sparse.html) matrix
# (of any format),
# which is useful for large and sparse problems.

# For example, let us convert the above ndarray `Q` to the Coordinate (coo) format:

# In[13]:


import scipy.sparse
Q = scipy.sparse.coo_matrix(Q)


# Pass it to `DiscreteDP` with the other parameters:

# In[14]:


ddp_sparse = DiscreteDP(R, Q, beta, s_indices, a_indices)


# Internally, the matrix `Q` is converted to the Compressed Sparse Row (csr) format:

# In[15]:


ddp_sparse.Q


# In[16]:


ddp_sparse.Q.toarray()


# ### Solving the model

# Now let us solve our model.
# Currently, `DiscreteDP` supports the following solution algorithms:
#
# * policy iteration;
# * value iteration;
# * modified policy iteration.
#
# (The methods are the same across the formulations.)

# #### Policy iteration

# We solve the model first by policy iteration,
# which gives the exact solution:

# In[17]:


v_init = [0, 0]  # Initial value function, optional (default=max_a r(s, a))
res = ddp.solve(method='policy_iteration', v_init=v_init)


# `res` contains the information about the solution result:

# In[18]:


res


# The optimal policy function:

# In[19]:


res.sigma


# The optimal value function:

# In[20]:


res.v


# This coincides with the analytical solution:

# In[21]:


v_star(beta)


# In[22]:


np.allclose(res.v, v_star(beta))


# The number of iterations:

# In[23]:


res.num_iter


# Verify that the value of the policy `[0, 0]` is actually equal to the optimal value function `res.v`:

# In[24]:


ddp.evaluate_policy(res.sigma)


# In[25]:


ddp.evaluate_policy(res.sigma) == res.v


# `res.mc` is the controlled Markov chain given by the optimal policy `[0, 0]`:

# In[26]:


res.mc


# #### Value iteration

# Next, solve the model by value iteration,
# which returns an $\varepsilon$-optimal solution for a specified value of $\varepsilon$:

# In[27]:


epsilon = 1e-2   # Convergence tolerance, optional (default=1e-3)
v_init = [0, 0]  # Initial value function, optional (default=max_a r(s, a))
res_vi = ddp.solve(method='value_iteration', v_init=v_init,
                   epsilon=epsilon)


# In[28]:


res_vi


# The computed policy function `res_vi.sigma` is an $\varepsilon$-optimal policy,
# and the value function `res_vi.v` is an $\varepsilon/2$-approximation
# of the true optimal value function:

# In[29]:


np.abs(v_star(beta) - res_vi.v).max()
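# As a small addition (not part of the original notebook), note that although value
# iteration only guarantees an $\varepsilon$-optimal policy in general, here the greedy
# policy with respect to `res_vi.v` coincides with the exact optimum found by policy
# iteration (this again assumes the `compute_greedy` method used above). Value
# iteration also tends to require far more iterations than policy iteration:

# In[ ]:


ddp.compute_greedy(res_vi.v), res.sigma  # Both should equal the optimal policy (0, 0)


# In[ ]:


res.num_iter, res_vi.num_iter  # Iteration counts: policy iteration vs value iteration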
# #### Modified policy iteration

# Finally, solve the model by modified policy iteration:

# In[30]:


epsilon = 1e-2   # Convergence tolerance, optional (default=1e-3)
v_init = [0, 0]  # Initial value function, optional (default=max_a r(s, a))
res_mpi = ddp.solve(method='modified_policy_iteration', v_init=v_init,
                    epsilon=epsilon)


# In[31]:


res_mpi


# Modified policy iteration also returns an $\varepsilon$-optimal policy
# and an $\varepsilon/2$-approximation of the true optimal value function:

# In[32]:


np.abs(v_star(beta) - res_mpi.v).max()


# ## References

# * M.L. Puterman,
#   [*Markov Decision Processes: Discrete Stochastic Dynamic Programming*](http://onlinelibrary.wiley.com/book/10.1002/9780470316887),
#   Wiley-Interscience, 2005.

# In[ ]: