#!/usr/bin/env python
# coding: utf-8

# # Table of Contents
# 1  greedy
# 2  epsilon-greedy
# 3  optimistic exploration greedy
# 4  upper-confidence bound
# 5  gradient
# 6  All Together
# In[667]:


import tensorflow as tf
import numpy as np
import random
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')


# In[253]:


# Assume each bandit's reward is Gaussian, with a standard deviation
# equal to half the absolute value of its mean.
BANDITS = [0.10, -0.20, 1.0, -5.0, 5.0, -20.0, 10.0, 3.0, -2.0, -1.0]

def bandit_reward(bandit):
    # BANDITS holds the expected reward of every bandit
    mu = BANDITS[bandit]
    sigma = np.abs(mu) / 2.0
    return np.random.normal(mu, sigma, 10)


# In[254]:


# Each pull draws one sample at random from the bandit's reward samples.
def pull_bandit(bandit):
    banditreward = bandit_reward(bandit)
    choose = np.random.randint(0, len(banditreward))
    return banditreward[choose]


# In[255]:


# Sanity check: the sample averages should be close to the expected values.
for i, rew in enumerate(BANDITS):
    result = []
    # repeat 1000 times
    for j in range(1000):
        result.append(pull_bandit(i))
    print("Actual reward: %s; Estimated reward: %s" % (rew, np.average(result)))


# # greedy

# In[287]:


# Greedy algorithm
total_reward = 0
try_reward = []
trial = 3
ite = 0
total_iter = 1000
while ite < total_iter:
    # Pull every bandit `trial` times and average the rewards,
    # then greedily keep pulling the bandit with the largest estimated Q.
    if ite < trial * len(BANDITS):
        for i in range(len(BANDITS)):
            ite += 1
            reward = round(pull_bandit(i), 1)
            try_reward.append(reward)
            total_reward += reward
    else:
        Q_table = np.average(np.array(try_reward).reshape(trial, len(BANDITS)), axis=0)
        # keep one decimal place
        Q_table = list(np.around(Q_table, decimals=1))
        Q = max(Q_table)
        index = Q_table.index(Q)
        ite += 1
        total_reward += Q

assert sum(try_reward) + Q * (total_iter - trial * len(BANDITS)) - total_reward < 1.0
print("Total reward: %d; expected reward of the chosen bandit: %s...\n" % (total_reward, BANDITS[index]))
print("Estimated Q_table: %s, largest Q you picked: %s, bandit No. %s." % (Q_table, Q, index + 1))
print("True Q_table: %s\n" % BANDITS)
if index == BANDITS.index(max(BANDITS)):
    print("You have chosen the best bandit!")
else:
    print("You haven't chosen the best bandit....")


# Greedy summary:
#
# Note that the result depends on both the means and the standard deviations of the bandits.
#
# - With the means fixed: if the standard deviations are small, the choice almost always lands on the
#   highest-mean bandits; if the standard deviations are very large, other bandits may be picked.
# - With the standard deviations fixed: the larger the gaps between the means, the more likely the
#   highest-mean bandits are picked; the smaller the gaps, the more likely another bandit is picked.
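# The greedy cell above keeps every reward in `try_reward` and re-averages the whole list. Below is a
# minimal sketch (not part of the original notebook) of the equivalent incremental sample-average update
# Q_{n+1} = Q_n + (R_n - Q_n) / n, which needs only the current estimate and a pull counter; the helper
# name `incremental_average` is made up for illustration.

# In[ ]:


def incremental_average(rewards):
    """Return the running sample average of `rewards` using the incremental update."""
    q, n = 0.0, 0
    for r in rewards:
        n += 1
        q += (r - q) / n  # Q_{n+1} = Q_n + (R_n - Q_n) / n
    return q


# Should match np.average on the same data, up to floating-point error.
samples = [pull_bandit(2) for _ in range(100)]
print(incremental_average(samples), np.average(samples))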
# # epsilon-greedy

# In[674]:


# epsilon-greedy, variant 1: start exploring right away
def epson_greedy(epson):
    total_reward = 0
    is_best = 0
    epson_num = 0
    ite = 0
    total_iter = 1000
    Q_table = [0] * len(BANDITS)
    while ite < total_iter:
        ite += 1
        if random.random() < epson:
            # explore: pull a random bandit and overwrite its Q with the observed reward
            epson_num += 1
            choose = np.random.randint(0, len(BANDITS))
            rew = pull_bandit(choose)
            total_reward += rew
            # round the stored value to two decimals to make the printout easier to read
            Q_table[choose] = round(rew, 2)
            Q = max(Q_table)
            index = Q_table.index(Q)
        else:
            # exploit: pull the bandit with the current highest Q
            Q = max(Q_table)
            index = Q_table.index(Q)
            total_reward += Q
    print("Explored %s times." % epson_num)
    print("Total reward: %d; expected reward of the chosen bandit: %s...\n" % (total_reward, BANDITS[index]))
    print("Estimated Q_table: %s, largest Q you picked: %s, bandit No. %s." % (Q_table, Q, index + 1))
    print("True Q_table: %s\n" % BANDITS)
    if index == BANDITS.index(max(BANDITS)):
        print("You have chosen the best bandit!")
        is_best = 1
    else:
        print("You haven't chosen the best bandit....")
        is_best = 0
    return total_reward, is_best


# In[675]:


# epson = 0.1: explore a new bandit 10% of the time
epson_greedy(0.1)


# In[735]:


# epsilon-greedy, variant 2: build a Q table from a few trials first, then explore
# (combines the greedy algorithm with epsilon-greedy)
def epson_greedy2(epson):
    total_reward = 0
    is_best = 0
    epson_num = 0
    try_reward = []
    trial = 3
    ite = 0
    total_iter = 1000
    Q_table = [0] * len(BANDITS)
    while ite < total_iter:
        # First pull every bandit `trial` times to initialise Q_table, then switch to epsilon-greedy.
        if ite < trial * len(BANDITS):
            for i in range(len(BANDITS)):
                ite += 1
                reward = round(pull_bandit(i), 1)
                try_reward.append(reward)
                total_reward += reward
            if ite == trial * len(BANDITS):
                Q_table = np.average(np.array(try_reward).reshape(trial, len(BANDITS)), axis=0)
                Q_table = list(np.around(Q_table, decimals=1))
        else:
            # With probability epsilon pull a random bandit, otherwise pull the current best one.
            # For a non-stationary bandit you could instead keep a list of rewards per bandit and use
            # Q_{n+1} = \sum_{i=1}^n \alpha (1-\alpha)^{n-i} R_i + (1-\alpha)^{n} Q_1,
            # where alpha is a hyperparameter, R_i are the stored rewards and Q_1 is the first reward
            # (see the sketch after the epsilon-greedy summary below).
            ite += 1
            if random.random() < epson:
                epson_num += 1
                choose = np.random.randint(0, len(BANDITS))
                rew = pull_bandit(choose)
                total_reward += rew
                # round the stored value to two decimals to make the printout easier to read
                Q_table[choose] = round(rew, 2)
            else:
                Q = max(Q_table)
                index = Q_table.index(Q)
                total_reward += Q
    Q = max(Q_table)
    index = Q_table.index(Q)
    print("Explored %s times." % epson_num)
    print("Total reward: %d; expected reward of the chosen bandit: %s...\n" % (total_reward, BANDITS[index]))
    # If the largest Q has two decimals, it came from an exploration step rather than the initial trial averages.
    print("Estimated Q_table: %s, largest Q you picked: %s, bandit No. %s." % (Q_table, Q, index + 1))
    print("True Q_table: %s\n" % BANDITS)
    if index == BANDITS.index(max(BANDITS)):
        print("You have chosen the best bandit!")
        is_best = 1
    else:
        print("You haven't chosen the best bandit....")
        is_best = 0
    return total_reward, is_best


# In[737]:


# epson = 0.1: explore a new bandit 10% of the time
epson_greedy2(0.1)


# epsilon-greedy summary:
#
# - The highest total reward does not always come from picking the bandit with the highest expected
#   reward, and vice versa; in most runs, though, the highest total reward does correspond to picking
#   the highest-mean bandit.
# - The result is much less sensitive to the particular BANDITS values than the greedy algorithm.
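# The comment inside `epson_greedy2` mentions handling non-stationary bandits with the exponential
# recency-weighted average Q_{n+1} = \sum_{i=1}^n \alpha (1-\alpha)^{n-i} R_i + (1-\alpha)^n Q_1.
# Below is a minimal sketch (not part of the original notebook) showing that the constant-step-size update
# Q_{n+1} = Q_n + \alpha (R_n - Q_n) gives the same value without storing the reward history; the helper
# names `recency_weighted_q` and `recency_weighted_q_closed_form` are made up for illustration.

# In[ ]:


def recency_weighted_q(rewards, alpha, q1):
    """Constant-step-size estimate: recent rewards weigh more than old ones."""
    q = q1
    for r in rewards:
        q += alpha * (r - q)  # Q_{n+1} = Q_n + alpha * (R_n - Q_n)
    return q


def recency_weighted_q_closed_form(rewards, alpha, q1):
    """The closed form quoted in the comment above, computed from the full history."""
    n = len(rewards)
    weighted = sum(alpha * (1 - alpha) ** (n - i) * r for i, r in enumerate(rewards, start=1))
    return weighted + (1 - alpha) ** n * q1


rewards = [pull_bandit(4) for _ in range(50)]
print(recency_weighted_q(rewards, 0.1, rewards[0]))
print(recency_weighted_q_closed_form(rewards, 0.1, rewards[0]))  # should agree up to float error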
# # optimistic exploration greedy

# In[596]:


# Optimistic exploration: keep exploring until "disappointed", then act greedily.
def opt_greedy(init_q):
    total_reward = 0
    is_best = 0
    trial_num = 0
    ite = 0
    total_iter = 1000
    Q_table = [init_q] * len(BANDITS)
    indexes = range(0, len(BANDITS))
    while ite < total_iter:
        # As long as the largest value is still the optimistic initial value, keep exploring
        # (i.e. refuse to give up on the untried bandits).
        if max(Q_table) == init_q:
            trial_num += 1
            ite += 1
            choose = indexes[np.random.randint(0, len(indexes))]
            reward = round(pull_bandit(choose), 1)
            total_reward += reward
            # update the Q table
            Q_table[choose] = round((Q_table[choose] + reward) / 2, 1)
            # restrict further exploration to the bandits still at the optimistic value
            indexes = [i for i in range(len(Q_table)) if Q_table[i] == init_q]
        else:
            ite += 1
            choose = Q_table.index(max(Q_table))
            reward = round(pull_bandit(choose), 1)
            total_reward += reward
    print("Number of exploratory pulls: %s" % trial_num)
    print("Total reward: %d; expected reward of the chosen bandit: %s...\n" % (total_reward, BANDITS[choose]))
    print("Estimated Q_table: %s, largest Q you picked: %s, bandit No. %s." % (Q_table, max(Q_table), choose + 1))
    print("True Q_table: %s\n" % BANDITS)
    if choose == BANDITS.index(max(BANDITS)):
        print("You have chosen the best bandit!")
        is_best = 1
    else:
        print("You haven't chosen the best bandit....")
        is_best = 0
    return total_reward, is_best


# In[597]:


opt_greedy(10)


# Optimistic exploration summary:
#
# - Very important: the result depends heavily on the **distribution** of BANDITS. When the variance is
#   large, the algorithm easily converges to a bandit whose expected reward has a small absolute value.
# - This is easy to understand: with a large variance (here half the mean), the highest-mean bandit will
#   sooner or later return a large negative reward, the averaging drags its Q down, and that bandit never
#   gets another chance to be selected.
# - In comparison, epsilon-greedy and plain greedy handle this situation much better.

# # upper-confidence bound

# In[633]:


# Upper-confidence bound
def ucb(c):
    total_reward = 0
    is_best = 0
    ite = 0
    total_iter = 1000
    Q_table_choose = [0] * len(BANDITS)  # reward estimate plus exploration bonus
    Q_table_true = [0] * len(BANDITS)    # plain reward estimate
    N_table = [0] * len(BANDITS)         # how many times each bandit has been pulled
    while ite < total_iter:
        ite += 1
        # This cell caps exploration at the first 100 pulls and then keeps exploiting the winner;
        # exploration could also be triggered by an epsilon condition instead.
        # (Standard UCB1 would pick argmax_a [Q(a) + c * sqrt(ln t / N(a))] at every pull.)
        if ite <= 100:
            choose = np.random.randint(0, len(BANDITS))
            rew = pull_bandit(choose)
            total_reward += rew
            N_table[choose] += 1
            Q_table_choose[choose] = round(rew, 2) + c * np.sqrt(np.log(ite) / N_table[choose])
            Q_table_true[choose] = round(rew, 2)
            index = Q_table_choose.index(max(Q_table_choose))
            Q = Q_table_true[index]
        else:
            total_reward += Q
    print("Total reward: %d; expected reward of the chosen bandit: %s...\n" % (total_reward, BANDITS[index]))
    print("Estimated Q_table: %s, largest Q you picked: %s, bandit No. %s." % (Q_table_true, Q, index + 1))
    print("True Q_table: %s\n" % BANDITS)
    if index == BANDITS.index(max(BANDITS)):
        print("You have chosen the best bandit!")
        is_best = 1
    else:
        print("You haven't chosen the best bandit....")
        is_best = 0
    return total_reward, is_best


# In[626]:


ucb(0.1)


# # gradient

# In[480]:


def softmax(xlist):
    ex = np.exp(xlist - np.max(xlist))
    return ex / np.sum(ex, axis=0)


# In[728]:


# Gradient bandit
# (Not entirely sure this interpretation is correct; see the sketch of the textbook update after this cell.)
def gradient(alpha):
    is_best = 0
    H_table = np.random.rand(len(BANDITS))  # preference per bandit
    pi = softmax(H_table)
    ite = 0
    total_iter = 1000
    total_reward = 0
    while ite < total_iter:
        ite += 1
        # Sample from a multinomial under pi and pick whichever bandit got the most draws.
        throw = np.random.multinomial(100, pi)
        choose = list(throw).index(max(throw))
        rew = pull_bandit(choose)
        total_reward += rew
        # Update only the chosen bandit's preference (the textbook version also uses a reward
        # baseline and decreases the preferences of the bandits that were not chosen).
        if rew > 0:
            H_table[choose] = H_table[choose] + alpha * rew * (1 - pi[choose])
        else:
            H_table[choose] = H_table[choose] - alpha * rew * pi[choose]
        pi = softmax(H_table)
    print("Total reward: %d; expected reward of the chosen bandit: %s...\n" % (total_reward, BANDITS[choose]))
    if list(pi).index(max(pi)) == BANDITS.index(max(BANDITS)):
        print("You have chosen the best bandit!")
        is_best = 1
    else:
        print("You haven't chosen the best bandit....")
        is_best = 0
    return total_reward, is_best


# In[730]:


gradient(0.1)
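# The gradient cell above updates only the chosen bandit and branches on the sign of the reward. For
# comparison, here is a minimal sketch (not part of the original notebook) of the gradient-bandit update
# as given in Sutton & Barto: sample the action from pi, use the running average reward as a baseline,
# raise the preference of the chosen action and lower the preferences of all the others. The function
# name `gradient_textbook` is made up for illustration.

# In[ ]:


def gradient_textbook(alpha, total_iter=1000):
    H_table = np.zeros(len(BANDITS))   # action preferences
    baseline = 0.0                     # running average of all rewards seen so far
    total_reward = 0.0
    for t in range(1, total_iter + 1):
        pi = softmax(H_table)
        choose = np.random.choice(len(BANDITS), p=pi)   # sample an action from pi
        rew = pull_bandit(choose)
        total_reward += rew
        baseline += (rew - baseline) / t                # incremental average as the baseline
        # H(a) += alpha * (R - baseline) * (1{a == choose} - pi(a)) for every action a
        one_hot = np.zeros(len(BANDITS))
        one_hot[choose] = 1.0
        H_table += alpha * (rew - baseline) * (one_hot - pi)
    best = int(np.argmax(softmax(H_table)))
    print("Total reward: %d; most preferred bandit: No. %s (expected reward %s)"
          % (total_reward, best + 1, BANDITS[best]))
    return total_reward, int(best == BANDITS.index(max(BANDITS)))


gradient_textbook(0.1)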
# # All Together

# In[ ]:


best_list = []
aver_reward = []
# repeat each experiment 100 times
N = 100
x = np.logspace(-7, 2, num=10, base=2)
for radio in x:
    ed_best, oeg_best, ucb_best, gra_best = 0, 0, 0, 0
    ed_rew, oeg_rew, ucb_rew, gra_rew = 0, 0, 0, 0
    for i in range(N):
        edr, egb = epson_greedy(radio)
        ed_best += egb
        ed_rew += edr
        oegr, oegb = opt_greedy(radio * 100)
        oeg_best += oegb
        oeg_rew += oegr
        ucbr, ucbb = ucb(radio)
        ucb_best += ucbb
        ucb_rew += ucbr
        grar, grab = gradient(radio)
        gra_best += grab
        gra_rew += grar
    best_list.append([ed_best / N, oeg_best / N, ucb_best / N, gra_best / N])
    aver_reward.append([ed_rew / N, oeg_rew / N, ucb_rew / N, gra_rew / N])


# In[732]:


best_list = np.array(best_list)
aver_reward = np.array(aver_reward)
y_edb, y_oegb, y_ucbb, y_grab = best_list[:, 0], best_list[:, 1], best_list[:, 2], best_list[:, 3]
y_edr, y_oegr, y_ucbr, y_grar = aver_reward[:, 0], aver_reward[:, 1], aver_reward[:, 2], aver_reward[:, 3]
plt.figure(figsize=(12, 8))
plt.subplots_adjust(left=.125, right=.9, bottom=.1, top=.9, wspace=.1, hspace=.3)
plt.subplot(2, 1, 1)
plt.title('All Bandit Algorithm Results: Best-Bandit Rate')
plt.plot(x, y_edb, color='black', label='epsilon-greedy')
plt.plot(x, y_oegb, color='red', label='optimistic exploration greedy')
plt.plot(x, y_ucbb, color='skyblue', label='upper-confidence bound')
plt.plot(x, y_grab, color='blue', label='gradient')
plt.legend()
plt.xlabel('parameter value')
plt.ylabel('probability of getting the best bandit')
plt.subplot(2, 1, 2)
plt.title('All Bandit Algorithm Results: Average Reward')
plt.plot(x, y_edr, color='black', label='epsilon-greedy')
plt.plot(x, y_oegr, color='red', label='optimistic exploration greedy')
plt.plot(x, y_ucbr, color='skyblue', label='upper-confidence bound')
plt.plot(x, y_grar, color='blue', label='gradient')
plt.legend()
plt.xlabel('parameter value')
plt.ylabel('average reward')
plt.show()


# These results do not match the author's exactly, but the overall picture is similar. Many things could
# explain the difference, for example:
#
# - the initial expected rewards chosen for BANDITS
# - the initial values used inside each algorithm
# - other implementation details
#
# Any one of these can change the outcome considerably. The point here is to understand the ideas and the
# flow of the algorithms, so there is no need to worry too much about the details.
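# Since the closing note points out that initial settings and other details can change the outcome
# considerably, here is a minimal sketch (not part of the original notebook) of how the sources of
# randomness could be pinned down with fixed seeds so that two runs can be compared like-for-like;
# the helper name `seed_everything` and the seed value 42 are arbitrary choices for illustration.

# In[ ]:


# Both `random` and `np.random` are used above, so both need seeding for a repeatable run.
def seed_everything(seed=42):
    random.seed(seed)
    np.random.seed(seed)


seed_everything(42)
print(epson_greedy(0.1)[0])
seed_everything(42)
print(epson_greedy(0.1)[0])  # identical to the previous line because the seeds match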