#!/usr/bin/env python
# coding: utf-8

# # Sequential Extensions of Causal and Evidential Decision Theory
#
# Written by [Jan Leike](http://jan.leike.name/) and [Tom Everitt](http://www.tomeveritt.se/).
#
# ## About this notebook
#
# The code is written in Python 3, but should be compatible with Python 2.
#
# The code can be used to calculate the value estimates of the decision theories discussed in our paper
# [Sequential Extensions of Causal and Evidential Decision Theory](http://users.cecs.anu.edu.au/~leike/publications/Sequential Extensions of Causal and Evidential Decision Theory - Everitt, Leike, Hutter 2015.pdf).
# Generally, the implemented value functions take as input a distribution defining the environmental probabilities,
# an action/policy, lists of possible percepts and hidden states, and a utility function.
# The implementation aims to follow the formulas in the paper as closely as possible.
# See the code documentation for details.
#
# The results of the calculations are displayed below the code boxes.
# If you wish to re-run some code (with or without changes), please first run all earlier code boxes in order,
# as functions in later code boxes depend on earlier code boxes having been executed at least once.
#
# To run a code box, press Shift+Enter.
#
# ## One-Shot Decision Making
#
# In a _decision problem_,
# we take one action $a \in \mathcal{A}$, receive a percept $e \in \mathcal{E}$
# (typically called _outcome_ in the decision theory literature)
# and get a payoff according to the utility function
# $u: \mathcal{E} \to [0, 1]$.
# We assume that the set of actions $\mathcal{A}$ and the set of percepts $\mathcal{E}$ are finite.
# Additionally,
# the environment contains a _hidden state_ $s \in \mathcal{S}$.
# The hidden state holds information that is inaccessible to the agent
# at the time of the decision,
# but may influence the decision and the percept.
# Formally, the environment is given by
# a probability distribution $P$ over the hidden state, the action and the percept
# that factors according to a _causal graph_ (Pearl, 2009).
#
# ### Causal Graphs
#
# A causal graph over the random variables $x_1,\dots,x_n$ is
# a directed acyclic graph with nodes $x_1,\dots,x_n$ and
# probability distributions $P(x_i \mid pa_i)$ for each node $x_i$,
# where $pa_i$ is the set of parents of $x_i$ in the graph.
# It is natural to identify the causal graph with
# the factored distribution $P(x_1,\dots, x_n) = \prod_{i=1}^n P(x_i \mid pa_i)$.
# Given such a causal graph,
# the $\texttt{do}$-operator is defined as
# $$
# P(x_1, \ldots, x_j, \ldots, x_n \mid \texttt{do}(x_j := b))
# = \frac{\prod_{i=1}^n P(x_i \mid pa_i)}{P(x_j \mid pa_j)}
# $$
# if $x_j = b$ on the left-hand side, and $0$ otherwise.
# The result is a new probability distribution
# that can be marginalized and conditioned in the standard way.
# Intuitively, intervening on node $x_j$ means
# ignoring all incoming arrows to $x_j$, as the effects they represent
# are no longer relevant when we intervene;
# the division removes the $P(x_j \mid pa_j)$ factor from $P(x_1,\dots,x_n)$.
# Note that the $\texttt{do}$-operator is only defined for distributions
# for which a causal graph has been specified.
# See (Pearl, 2009, Ch. 3.4) for details.
#
# In the first two (one-shot) examples, the environment is given as
# the following causal graph over the hidden state $s$, action $a$, and percept $e$.
#
# ![im1](one-shot.png)
#
# According to this causal graph,
# the probability distribution $P$ factors causally into
# $P(s, a, e) = P(s) P(a \mid s) P(e \mid s, a)$.
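#
# As a worked example of the $\texttt{do}$-operator (spelled out here for illustration;
# it follows directly from the definition above), intervening on the action node $a$
# removes the factor $P(a \mid s)$:
# $$
# P(s, e \mid \texttt{do}(a))
# = \frac{P(s) P(a \mid s) P(e \mid s, a)}{P(a \mid s)}
# = P(s) P(e \mid s, a),
# $$
# and marginalizing over the hidden state yields
# $P(e \mid \texttt{do}(a)) = \sum_{s \in \mathcal{S}} P(e \mid s, a) P(s)$,
# which is exactly the expression that appears in the causal value function below.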
#
# The environment has a prior belief over the agent's actions,
# but that prior does not have to assign high probability to the
# actions that the agent actually ends up taking.
# We think of this prior as _partial information_ about the agent,
# or, in multi-agent systems, as beliefs held by other agents.
# Since we assume the agent's policy to be deterministic and inside the environment,
# the environment could actually know all the agent's actions in advance.
# However, for self-consistency reasons, the agent needs to be
# uncertain about what the environment knows about it:
# if the agent knew which action she was going to take,
# she could decide to take a different action instead, which leads to a paradox.

# In[1]:

# Imports and Helper functions

from fractions import Fraction  # exact arithmetic with fractions


def distribution(d):
    """
    Return a probability distribution corresponding to the (conditional) probability mu(a | b).
    Input: a dictionary specifying the distribution
    Output: a function that queries the dictionary and returns 0 if nothing is found
    """
    def mu(a, b=''):
        if b == '':
            return d[a] if a in d else 0
        else:
            return d[(a, b)] if (a, b) in d else 0
    return mu


# ### Value Functions
#
# One-shot evidential value function:
# $$
# V^{\mathrm{evi},a}_{\mu,1}
# := \sum_{e \in \mathcal{E}} \mu(e \mid a) u(e)
# = \sum_{e \in \mathcal{E}} \sum_{s \in \mathcal{S}} \mu(e \mid s, a) \mu(s \mid a) u(e)
# $$
# One-shot causal value function:
# $$
# V^{\mathrm{cau},a}_{\mu,1}
# := \sum_{e \in \mathcal{E}} \mu(e \mid \texttt{do}(a)) u(e)
# = \sum_{e \in \mathcal{E}} \sum_{s \in \mathcal{S}} \mu(e \mid s, a) \mu(s) u(e)
# $$

# In[2]:

def evidential_value(mu, a, E, S, u):
    """
    Evidential value of action a in environment mu with possible percepts E,
    hidden states S, and utility function u.
    """
    mu_a = sum([mu(a, s)*mu(s) for s in S])  # mu(a)
    return sum([u[e]*mu(e, s + a)*mu(s)*mu(a, s)/mu_a for e in E for s in S])


def causal_value(mu, a, E, S, u):
    """
    Causal value of action a in environment mu with possible percepts E,
    hidden states S, and utility function u.
    """
    return sum([u[e]*mu(e, s + a)*mu(s) for e in E for s in S])
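# As a quick sanity check of the two value functions just defined (this toy example and
# the names `S_toy`, `E_toy`, `u_toy`, `mu_toy`, and the action `'a'` are illustrative only,
# not part of the examples from the paper): in an environment where the hidden state does
# not influence the action, i.e. $\mu(a \mid s) = \mu(a)$, conditioning on the action and
# intervening on it coincide, so the evidential and causal values should be equal.
# This cell assumes the two code boxes above have been run.

# In[ ]:

S_toy = {'X', 'Y'}        # two hidden states
E_toy = {'g', 'b'}        # g = good percept, b = bad percept
u_toy = {'g': 1, 'b': 0}  # Utilities
mu_toy = distribution({
    'X': Fraction('0.5'), 'Y': Fraction('0.5'),                # uniform hidden state
    ('a', 'X'): Fraction('0.5'), ('a', 'Y'): Fraction('0.5'),  # action independent of the hidden state
    ('g', 'Xa'): 1, ('b', 'Ya'): 1,                            # percept determined by state and action
})  # Unspecified conditionals default to 0

assert evidential_value(mu_toy, 'a', E_toy, S_toy, u_toy) == causal_value(mu_toy, 'a', E_toy, S_toy, u_toy)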
# ### Newcomb's Problem
#
# In [Newcomb's Problem](http://en.wikipedia.org/wiki/Newcomb%27s_paradox) there are two boxes:
# an opaque box that is either empty or contains one million dollars,
# and a transparent box that contains one thousand dollars.
# The agent can choose between taking only the opaque box ("one-boxing")
# and taking both boxes ("two-boxing").
# The content of the opaque box is determined by a very reliable predictor:
# if it predicts the agent will one-box, the box contains the million, and
# if it predicts the agent will two-box, the box is empty.
#
# In Newcomb's problem,
# EDT prescribes one-boxing because one-boxing is evidence that
# the box contains a million dollars.
# In contrast, CDT prescribes two-boxing because two-boxing dominates one-boxing:
# in either case we are a thousand dollars richer,
# and our decision cannot causally affect the prediction.
# Newcomb's problem has been raised as a critique of CDT,
# but many philosophers insist that two-boxing is in fact
# the rational choice, even if it makes you end up poor.

# In[3]:

# Newcomb's Problem
S = {'E', 'F'}            # E = empty, F = full
A = {'1', '2'}            # 1 = one-box, 2 = two-box
E = {'0', 'T', 'M', 'A'}  # 0 = no money, T = $1000, M = $1000000, A = $1001000 (all of the money)
u = {'0': 0, 'T': 1000, 'M': 1000000, 'A': 1001000}  # Utilities
eps = Fraction('0.01')    # Prediction inaccuracy

mu = distribution({
    'E': Fraction('0.5'),  # Prob. box empty
    'F': Fraction('0.5'),  # Prob. box full
    ('1', 'F'): 1 - eps,   # Prob. agent one-boxes given box full
    ('1', 'E'): eps,       # Prob. agent one-boxes given box empty
    ('2', 'F'): eps,       # Prob. agent two-boxes given box full
    ('2', 'E'): 1 - eps,   # Prob. agent two-boxes given box empty
    ('0', 'E1'): 1,        # Prob. receiving $0 given empty and one-boxing
    ('T', 'E2'): 1,        # Prob. receiving $T given empty and two-boxing
    ('M', 'F1'): 1,        # Prob. receiving $M given full and one-boxing
    ('A', 'F2'): 1,        # Prob. receiving $M+$T given full and two-boxing
})  # Unspecified conditionals default to 0

print('Evidential value of one-boxing: ${:}'.format(evidential_value(mu, '1', E, S, u)))
print('Evidential value of two-boxing: ${:}'.format(evidential_value(mu, '2', E, S, u)))
print('Causal value of one-boxing: ${:}'.format(causal_value(mu, '1', E, S, u)))
print('Causal value of two-boxing: ${:}'.format(causal_value(mu, '2', E, S, u)))

assert evidential_value(mu, '1', E, S, u) > evidential_value(mu, '2', E, S, u)  # EDT prefers 1
assert causal_value(mu, '1', E, S, u) < causal_value(mu, '2', E, S, u)          # CDT prefers 2


# ### Toxoplasmosis Problem
#
# This problem takes place in a world in which there is a certain parasite that
# causes its hosts to be attracted to cats,
# in addition to other uncomfortable side effects.
# The agent is handed an adorable little kitten and
# is faced with the decision of whether or not to pet it.
# Petting the kitten feels nice and
# therefore yields more utility than not petting it.
# However, people suffering from the parasite are more likely to pet the kitten.
#
# Petting the kitten constitutes evidence
# of the presence of the parasite, and thus EDT recommends against it.
# CDT correctly observes that there is no causal connection between
# petting the kitten and having the parasite,
# and is therefore in favor of petting.
# In[4]:

# Toxoplasmosis Problem
S = {'T', 'H'}  # T = toxoplasmosis, H = healthy
A = {'P', 'N'}  # P = petting, N = not petting
E = {'0', '1', '8', '9'}  # 0 = sick, not petting; 1 = sick, petting; 8 = healthy, not petting; 9 = healthy, petting
u = {'0': -10, '1': -9, '8': 0, '9': 1}  # Utilities

mu = distribution({
    'T': Fraction('0.5'),
    'H': Fraction('0.5'),
    ('P', 'T'): Fraction('0.8'),
    ('P', 'H'): Fraction('0.2'),
    ('N', 'T'): Fraction('0.2'),
    ('N', 'H'): Fraction('0.8'),
    ('0', 'TN'): 1,
    ('1', 'TP'): 1,
    ('8', 'HN'): 1,
    ('9', 'HP'): 1,
})  # Unspecified conditionals default to 0

print('Evidential value of petting: {:}'.format(evidential_value(mu, 'P', E, S, u)))
print('Evidential value of not petting: {:}'.format(evidential_value(mu, 'N', E, S, u)))
print('Causal value of petting: {:}'.format(causal_value(mu, 'P', E, S, u)))
print('Causal value of not petting: {:}'.format(causal_value(mu, 'N', E, S, u)))

assert evidential_value(mu, 'P', E, S, u) < evidential_value(mu, 'N', E, S, u)  # EDT prefers N
assert causal_value(mu, 'P', E, S, u) > causal_value(mu, 'N', E, S, u)          # CDT prefers P


# ## Sequential Decisions
#
# For the remainder of this notebook,
# we consider an environment $\mu$ that
# the agent interacts with sequentially:
# at time step $t$ the agent chooses an _action_ $a_t \in \mathcal{A}$ and
# receives a _percept_ $e_t \in \mathcal{E}$,
# which yields a _utility_ of $u(e_t) \in \mathbb{R}$;
# the cycle then repeats for $t + 1$.
# A _history_ is an element of $(\mathcal{A} \times \mathcal{E})^*$.
# We use $æ \in \mathcal{A} \times \mathcal{E}$ to denote one interaction cycle,
# and $æ_{