#!/usr/bin/env python
# coding: utf-8

# > **Preface:** I previously discussed (one-step) temporal-difference methods ([ipynb link](https://github.com/PiperLiu/Reinforcement-Learning-practice-zh/blob/master/practice/05-01-Temporal-Difference-Prediction.ipynb)) and Monte Carlo methods ([ipynb link](https://github.com/PiperLiu/Reinforcement-Learning-practice-zh/blob/master/practice/04-Monte-Carlo-Methods.ipynb)). I have just finished Chapter 7 of Sutton's [*Reinforcement Learning: An Introduction* (2nd edition)](http://rl.qiwihui.com/zh_CN/latest/chapter1/introduction.html): n-step bootstrapping. It is a compromise between temporal-difference and Monte Carlo methods and, in general, performs better than either of them.
#
# *These notes do not reproduce the formulas or the algorithm boxes; they only introduce the ideas. For the details, see the Chinese e-book:*
#
# [Chapter 7: n-step Bootstrapping](http://rl.qiwihui.com/zh_CN/latest/partI/chapter7/n_step_bootstrapping.html)
#
# ****
#
# ### n-step bootstrapping vs. temporal-difference and Monte Carlo methods
#
# ![](images/06-01.png)
#
# As the figure above shows:
# - in temporal-difference methods, the value of the next state is "estimated";
# - in Monte Carlo methods, the value is computed only after the whole episode has terminated, from the discounted rewards of all subsequent states, so it is "known";
# - n-step bootstrapping is "partly estimated" and "partly known". The price is that every n-step method has to delay each update by n time steps. (A small sketch of the n-step target is given right below.)
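# To make the "partly known, partly estimated" target concrete, here is a minimal sketch written for these notes (it is not code from the book or from the repository used later): the first n rewards of the target are sampled, and only the tail is replaced by the current estimate of $V(S_{t+n})$. The function name `n_step_return` and its arguments are my own choices.

# In[ ]:


def n_step_return(rewards, bootstrap_value, gamma=1.0):
    """G_{t:t+n} = R_{t+1} + ... + gamma^{n-1} R_{t+n} + gamma^n V(S_{t+n}).

    rewards         -- the n sampled rewards [R_{t+1}, ..., R_{t+n}] (the "known" part)
    bootstrap_value -- the current estimate V(S_{t+n})               (the "estimated" part)
    """
    sampled = sum(gamma ** i * r for i, r in enumerate(rewards))
    return sampled + gamma ** len(rewards) * bootstrap_value


# n = 1 recovers the TD(0) target; with the episode's full reward list and a
# zero bootstrap value it degenerates to the Monte Carlo return
print(n_step_return([0, 0, 1], bootstrap_value=0.5))  # 3-step target
print(n_step_return([0], bootstrap_value=0.5))        # 1-step (TD(0)) target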
# ### On-policy control: why n-step methods converge faster than one-step methods
#
# ![](images/06-02.png)
#
# The figure above, taken from Sutton's book, shows a gridworld in which a reward is received only at G. After a trajectory reaches G:
# - with one-step control (the temporal-difference learning discussed earlier), only the state-action value of the single step right before G is strengthened;
# - with 10-step control, the strengthening already reaches back to the tenth step before G.
#
# (A minimal n-step Sarsa sketch in this spirit is appended at the end of this notebook.)
#
# ### Off-policy n-step learning: four variants
#
# ![](images/06-03.png)
#
# In the notes on [Monte Carlo methods](https://blog.csdn.net/weixin_42815609/article/details/104025586) I discussed the "importance sampling ratio", which is used for off-policy learning (both prediction and control); in the "Expected Sarsa" part of the notes on [temporal-difference control](https://blog.csdn.net/weixin_42815609/article/details/104069627), a node is evaluated with the expectation over the actions of the successor state. **These two ideas are mixed and applied in off-policy n-step learning.**
#
# The figure above is taken from Sutton's book; its diagrams, from left to right, are explained in the table below:
#
# |Name|Description|
# |---|---|
# |n-step Sarsa, generalizable to a per-decision method with control variates|learns from the rewards of the next n steps, corrected by the importance sampling ratio|
# |Without importance sampling: the n-step tree-backup algorithm|learns from the expected state-action values over the next n steps|
# |n-step Expected Sarsa|takes the expectation over actions at the n-th subsequent state and uses importance sampling for the remaining steps*|
# |n-step Q(σ)|alternates between the importance sampling ratio and the expectation over the next n steps|
#
# [*] n-step Expected Sarsa differs slightly from n-step Sarsa:
# - in Expected Sarsa, the importance sampling ratio is $\rho_{t+1:t+n-1}$ rather than Sarsa's $\rho_{t+1:t+n}$;
# - because the last state in Expected Sarsa takes all possible actions into account, the action actually taken there does not matter and needs no correction.

# ### Code: n-step TD prediction
#
# The task is the random walk of Example 6.2, shown below (the code enlarges it to 19 states).
#
# ![](images/06-04.png)
#
# The algorithm box for n-step TD prediction:
#
# ![](images/06-05.png)

# In[1]:


#######################################################################
# Copyright (C)                                                       #
# 2016-2018 Shangtong Zhang(zhangshangtong.cpp@gmail.com)             #
# 2016 Kenta Shimada(hyperkentakun@gmail.com)                         #
# Permission given to modify the code as long as you keep this        #
# declaration at the top                                              #
#######################################################################

import numpy as np
import matplotlib
get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
from tqdm import tqdm

# all states
N_STATES = 19

# discount
GAMMA = 1

# all states but terminal states
STATES = np.arange(1, N_STATES + 1)

# start from the middle state
START_STATE = 10

# two terminal states
# an action leading to the left terminal state has reward -1
# an action leading to the right terminal state has reward 1
END_STATES = [0, N_STATES + 1]

# true state values from the Bellman equation
TRUE_VALUE = np.arange(-20, 22, 2) / 20.0
TRUE_VALUE[0] = TRUE_VALUE[-1] = 0

# n-step TD method
# @value: values for each state, will be updated
# @n: # of steps
# @alpha: step size
def temporal_difference(value, n, alpha):
    # initial starting state
    state = START_STATE

    # arrays to store states and rewards for an episode
    # space isn't a major consideration, so I didn't use the mod trick
    states = [state]
    rewards = [0]

    # track the time
    time = 0

    # the length of this episode
    T = float('inf')
    while True:
        # go to next time step
        time += 1

        if time < T:
            # choose an action randomly
            if np.random.binomial(1, 0.5) == 1:
                next_state = state + 1
            else:
                next_state = state - 1

            if next_state == 0:
                reward = -1
            elif next_state == N_STATES + 1:
                reward = 1
            else:
                reward = 0

            # store new state and new reward
            states.append(next_state)
            rewards.append(reward)

            if next_state in END_STATES:
                T = time

        # get the time of the state to update
        update_time = time - n
        if update_time >= 0:
            returns = 0.0
            # calculate corresponding rewards
            for t in range(update_time + 1, min(T, update_time + n) + 1):
                returns += pow(GAMMA, t - update_time - 1) * rewards[t]
            # add state value to the return
            """
            My note: the `if update_time + n <= T` below means that if the
            episode has not terminated within the n steps that follow, the
            states beyond the n-th step (n+1, ..., termination) are covered
            by the estimate value[states[update_time + n]];
            otherwise no bootstrap estimate is needed.
            """
            if update_time + n <= T:
                returns += pow(GAMMA, n) * value[states[update_time + n]]
            state_to_update = states[update_time]
            # update the state value
            if state_to_update not in END_STATES:
                value[state_to_update] += alpha * (returns - value[state_to_update])
        if update_time == T - 1:
            break
        state = next_state

# Figure 7.2, it will take quite a while
def figure7_2():
    # all possible steps
    steps = np.power(2, np.arange(0, 10))

    # all possible alphas
    alphas = np.arange(0, 1.1, 0.1)

    # each run has 10 episodes
    episodes = 10

    # perform 100 independent runs
    runs = 100

    # track the errors for each (step, alpha) combination
    errors = np.zeros((len(steps), len(alphas)))
    for run in tqdm(range(0, runs)):
        for step_ind, step in enumerate(steps):
            for alpha_ind, alpha in enumerate(alphas):
                # print('run:', run, 'step:', step, 'alpha:', alpha)
                value = np.zeros(N_STATES + 2)
                for ep in range(0, episodes):
                    temporal_difference(value, step, alpha)
                    # calculate the RMS error
                    errors[step_ind, alpha_ind] += np.sqrt(np.sum(np.power(value - TRUE_VALUE, 2)) / N_STATES)
    # take average
    errors /= episodes * runs

    for i in range(0, len(steps)):
        plt.plot(alphas, errors[i, :], label='n = %d' % (steps[i]))
    plt.xlabel('alpha')
    plt.ylabel('RMS error')
    plt.ylim([0.25, 0.55])
    plt.legend()

    plt.show()

figure7_2()


# From: [Zhang's GitHub](https://github.com/ShangtongZhang/reinforcement-learning-an-introduction/tree/master/chapter07)
#
# The result of the run is the plot produced by `figure7_2()` above.
#
# I added one annotation to the code (the triple-quoted note inside `temporal_difference`).

# ### Other notes: mathematical properties
#
# This chapter contains many formulas. It is worth noting that n-step temporal-difference methods rest on a solid mathematical foundation. Quoting the book: in the worst case, the expectation of the n-step return is guaranteed to be a better estimate of $v_{\pi}$ than $V_{t+n-1}$ is; that is, the worst error of the expected n-step return is guaranteed to be no larger than $\gamma^{n}$ times the worst error of $V_{t+n-1}$ (with $\gamma \le 1$):
#
# $$\max _{s}\left|\mathbb{E}_{\pi}\left[G_{t: t+n} | S_{t}=s\right]-v_{\pi}(s)\right| \leq \gamma^{n} \max _{s}\left|V_{t+n-1}(s)-v_{\pi}(s)\right|$$
#
# However, a larger n is not always better, as the example produced by the code above shows.
#
# Beyond that, the recursive transformations between the update formulas are not examined closely here.
#
# One more remark: in the first edition Sutton considered n-step algorithms impractical, but after the studies of Cichosz (1995) and van Seijen (2016) he regards them as very practical algorithms.

# In[ ]:
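# ### Appendix: a minimal n-step Sarsa sketch
#
# The chapter's control experiment (the gridworld figure in the on-policy control section) is not reproduced in this notebook. The cell below is only a rough illustration of the same idea, written for these notes and not taken from the book's pseudocode or from Zhang's repository: a tiny one-dimensional corridor in which only the goal state G gives a reward, plus a tabular on-policy n-step Sarsa update. The environment (`CORRIDOR_LEN`, `step`, `MOVES`, ...) and all names are my own assumptions for this sketch.

# In[ ]:


import numpy as np

# a tiny corridor, loosely echoing the gridworld figure: non-terminal states
# 1..CORRIDOR_LEN, a reflecting wall on the left, and a goal G on the right
# that gives the only reward (+1) and ends the episode
CORRIDOR_LEN = 8
GOAL = CORRIDOR_LEN + 1
START = CORRIDOR_LEN // 2
MOVES = [-1, +1]                   # action 0: step left, action 1: step right

def step(state, action):
    # deterministic move; stepping left from state 1 bounces off the wall
    next_state = max(1, state + MOVES[action])
    reward = 1.0 if next_state == GOAL else 0.0
    return next_state, reward, next_state == GOAL

def epsilon_greedy(q, state, epsilon, rng):
    # behave epsilon-greedily with respect to the current action values
    if rng.random() < epsilon:
        return int(rng.integers(len(MOVES)))
    return int(np.argmax(q[state]))

def n_step_sarsa_episode(q, n, alpha=0.5, gamma=1.0, epsilon=0.1, rng=None):
    """Run one episode of on-policy n-step Sarsa and update q in place.

    The loop mirrors the structure of the algorithm box in the book: tau is
    the time whose state-action value gets updated n steps later.
    """
    rng = np.random.default_rng() if rng is None else rng
    states, actions, rewards = [START], [], [0.0]     # rewards[0] is a placeholder
    actions.append(epsilon_greedy(q, START, epsilon, rng))
    T = float('inf')
    t = 0
    while True:
        if t < T:
            next_state, reward, done = step(states[t], actions[t])
            states.append(next_state)
            rewards.append(reward)
            if done:
                T = t + 1
            else:
                actions.append(epsilon_greedy(q, next_state, epsilon, rng))
        tau = t - n + 1                               # time of the pair to update
        if tau >= 0:
            # n-step return: sampled rewards, plus a bootstrap tail only if the
            # episode has not terminated within the next n steps
            G = sum(gamma ** (i - tau - 1) * rewards[i]
                    for i in range(tau + 1, min(tau + n, T) + 1))
            if tau + n < T:
                G += gamma ** n * q[states[tau + n], actions[tau + n]]
            q[states[tau], actions[tau]] += alpha * (G - q[states[tau], actions[tau]])
        if tau == T - 1:
            break
        t += 1

# same random-walk episode (epsilon = 1.0, same seed) for both n, starting from a
# zero table: the 1-step backup moves only the final state-action pair toward the
# goal reward, the 8-step backup moves every distinct pair visited in the last 8 steps
for n in (1, 8):
    q = np.zeros((GOAL + 1, len(MOVES)))
    n_step_sarsa_episode(q, n=n, epsilon=1.0, rng=np.random.default_rng(7))
    print('n = %d: %d state-action values changed after one episode' % (n, np.count_nonzero(q)))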