#!/usr/bin/env python
# coding: utf-8

# # Table of Contents
# 1  greedy
# 2  epsilon-greedy
# 3  optimistic exploration greedy
# 4  upper-confidence bound
# 5  gradient
# 6  All Together
# In[667]:


import tensorflow as tf
import numpy as np
import random
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')


# In[253]:


# Assume each bandit's reward is Gaussian, with a standard deviation
# equal to half the absolute value of its mean.
BANDITS = [0.10, -0.20, 1.0, -5.0, 5.0, -20.0, 10.0, 3.0, -2.0, -1.0]

def bandit_reward(bandit):
    # BANDITS holds the expected reward of every bandit
    mu = BANDITS[bandit]
    sigma = np.abs(mu) / 2.0
    return np.random.normal(mu, sigma, 10)


# In[254]:


# Each pull draws one sample at random from the bandit's reward samples.
def pull_bandit(bandit):
    banditreward = bandit_reward(bandit)
    choose = np.random.randint(0, len(banditreward))
    return banditreward[choose]


# In[255]:


# Sanity check: the sample averages should be close to the expected values.
for i, rew in enumerate(BANDITS):
    result = []
    # repeat 1000 times
    for j in range(1000):
        result.append(pull_bandit(i))
    print("Actual reward: %s; Estimated reward: %s" % (rew, np.average(result)))


# # greedy

# In[287]:


# Greedy algorithm
total_reward = 0
try_reward = []
trial = 3
ite = 0
total_iter = 1000
while ite < total_iter:
    # Pull every bandit `trial` times and average the rewards,
    # then greedily keep pulling the bandit with the largest estimated Q.
    if ite < trial * len(BANDITS):
        for i in range(len(BANDITS)):
            ite += 1
            reward = round(pull_bandit(i), 1)
            try_reward.append(reward)
            total_reward += reward
    else:
        Q_table = np.average(np.array(try_reward).reshape(trial, len(BANDITS)), axis=0)
        # keep one decimal place
        Q_table = list(np.around(Q_table, decimals=1))
        Q = max(Q_table)
        index = Q_table.index(Q)
        ite += 1
        total_reward += Q

assert sum(try_reward) + Q * (total_iter - trial * len(BANDITS)) - total_reward < 1.0
print("Total reward: %d; expected reward of the chosen bandit: %s...\n" % (total_reward, BANDITS[index]))
print("Estimated Q_table: %s, largest Q you picked: %s, bandit No. %s." % (Q_table, Q, index + 1))
print("True Q_table: %s\n" % BANDITS)
if index == BANDITS.index(max(BANDITS)):
    print("You have chosen the best bandit!")
else:
    print("You haven't chosen the best bandit....")


# Greedy summary:
#
# Note that the result depends on both the means and the standard deviations of the bandits.
#
# - With the means fixed: if the standard deviations are small, the choice almost always lands on the
#   highest-mean bandits; if the standard deviations are very large, other bandits may be picked.
# - With the standard deviations fixed: the larger the gaps between the means, the more likely the
#   highest-mean bandits are picked; the smaller the gaps, the more likely another bandit is picked.
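# The greedy cell above keeps every reward in `try_reward` and re-averages the whole list. Below is a
# minimal sketch (not part of the original notebook) of the equivalent incremental sample-average update
# Q_{n+1} = Q_n + (R_n - Q_n) / n, which needs only the current estimate and a pull counter; the helper
# name `incremental_average` is made up for illustration.

# In[ ]:


def incremental_average(rewards):
    """Return the running sample average of `rewards` using the incremental update."""
    q, n = 0.0, 0
    for r in rewards:
        n += 1
        q += (r - q) / n  # Q_{n+1} = Q_n + (R_n - Q_n) / n
    return q


# Should match np.average on the same data, up to floating-point error.
samples = [pull_bandit(2) for _ in range(100)]
print(incremental_average(samples), np.average(samples))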
# # epsilon-greedy

# In[674]:


# epsilon-greedy, variant 1: start exploring right away
def epson_greedy(epson):
    total_reward = 0
    is_best = 0
    epson_num = 0
    ite = 0
    total_iter = 1000
    Q_table = [0] * len(BANDITS)
    while ite < total_iter:
        ite += 1
        if random.random() < epson:
            # explore: pull a random bandit and overwrite its Q with the observed reward
            epson_num += 1
            choose = np.random.randint(0, len(BANDITS))
            rew = pull_bandit(choose)
            total_reward += rew
            # round the stored value to two decimals to make the printout easier to read
            Q_table[choose] = round(rew, 2)
            Q = max(Q_table)
            index = Q_table.index(Q)
        else:
            # exploit: pull the bandit with the current highest Q
            Q = max(Q_table)
            index = Q_table.index(Q)
            total_reward += Q
    print("Explored %s times." % epson_num)
    print("Total reward: %d; expected reward of the chosen bandit: %s...\n" % (total_reward, BANDITS[index]))
    print("Estimated Q_table: %s, largest Q you picked: %s, bandit No. %s." % (Q_table, Q, index + 1))
    print("True Q_table: %s\n" % BANDITS)
    if index == BANDITS.index(max(BANDITS)):
        print("You have chosen the best bandit!")
        is_best = 1
    else:
        print("You haven't chosen the best bandit....")
        is_best = 0
    return total_reward, is_best


# In[675]:


# epson = 0.1: explore a new bandit 10% of the time
epson_greedy(0.1)


# In[735]:


# epsilon-greedy, variant 2: build a Q table from a few trials first, then explore
# (combines the greedy algorithm with epsilon-greedy)
def epson_greedy2(epson):
    total_reward = 0
    is_best = 0
    epson_num = 0
    try_reward = []
    trial = 3
    ite = 0
    total_iter = 1000
    Q_table = [0] * len(BANDITS)
    while ite < total_iter:
        # First pull every bandit `trial` times to initialise Q_table, then switch to epsilon-greedy.
        if ite < trial * len(BANDITS):
            for i in range(len(BANDITS)):
                ite += 1
                reward = round(pull_bandit(i), 1)
                try_reward.append(reward)
                total_reward += reward
            if ite == trial * len(BANDITS):
                Q_table = np.average(np.array(try_reward).reshape(trial, len(BANDITS)), axis=0)
                Q_table = list(np.around(Q_table, decimals=1))
        else:
            # With probability epsilon pull a random bandit, otherwise pull the current best one.
            # For a non-stationary bandit you could instead keep a list of rewards per bandit and use
            # Q_{n+1} = \sum_{i=1}^n \alpha (1-\alpha)^{n-i} R_i + (1-\alpha)^{n} Q_1,
            # where alpha is a hyperparameter, R_i are the stored rewards and Q_1 is the first reward
            # (see the sketch after the epsilon-greedy summary below).
            ite += 1
            if random.random() < epson:
                epson_num += 1
                choose = np.random.randint(0, len(BANDITS))
                rew = pull_bandit(choose)
                total_reward += rew
                # round the stored value to two decimals to make the printout easier to read
                Q_table[choose] = round(rew, 2)
            else:
                Q = max(Q_table)
                index = Q_table.index(Q)
                total_reward += Q
    Q = max(Q_table)
    index = Q_table.index(Q)
    print("Explored %s times." % epson_num)
    print("Total reward: %d; expected reward of the chosen bandit: %s...\n" % (total_reward, BANDITS[index]))
    # If the largest Q has two decimals, it came from an exploration step rather than the initial trial averages.
    print("Estimated Q_table: %s, largest Q you picked: %s, bandit No. %s." % (Q_table, Q, index + 1))
    print("True Q_table: %s\n" % BANDITS)
    if index == BANDITS.index(max(BANDITS)):
        print("You have chosen the best bandit!")
        is_best = 1
    else:
        print("You haven't chosen the best bandit....")
        is_best = 0
    return total_reward, is_best


# In[737]:


# epson = 0.1: explore a new bandit 10% of the time
epson_greedy2(0.1)


# epsilon-greedy summary:
#
# - The highest total reward does not always come from picking the bandit with the highest expected
#   reward, and vice versa; in most runs, though, the highest total reward does correspond to picking
#   the highest-mean bandit.
# - The result is much less sensitive to the particular BANDITS values than the greedy algorithm.
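# The comment inside `epson_greedy2` mentions handling non-stationary bandits with the exponential
# recency-weighted average Q_{n+1} = \sum_{i=1}^n \alpha (1-\alpha)^{n-i} R_i + (1-\alpha)^n Q_1.
# Below is a minimal sketch (not part of the original notebook) showing that the constant-step-size update
# Q_{n+1} = Q_n + \alpha (R_n - Q_n) gives the same value without storing the reward history; the helper
# names `recency_weighted_q` and `recency_weighted_q_closed_form` are made up for illustration.

# In[ ]:


def recency_weighted_q(rewards, alpha, q1):
    """Constant-step-size estimate: recent rewards weigh more than old ones."""
    q = q1
    for r in rewards:
        q += alpha * (r - q)  # Q_{n+1} = Q_n + alpha * (R_n - Q_n)
    return q


def recency_weighted_q_closed_form(rewards, alpha, q1):
    """The closed form quoted in the comment above, computed from the full history."""
    n = len(rewards)
    weighted = sum(alpha * (1 - alpha) ** (n - i) * r for i, r in enumerate(rewards, start=1))
    return weighted + (1 - alpha) ** n * q1


rewards = [pull_bandit(4) for _ in range(50)]
print(recency_weighted_q(rewards, 0.1, rewards[0]))
print(recency_weighted_q_closed_form(rewards, 0.1, rewards[0]))  # should agree up to float error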
# # optimistic exploration greedy

# In[596]:


# Optimistic exploration: keep exploring until "disappointed", then act greedily.
def opt_greedy(init_q):
    total_reward = 0
    is_best = 0
    trial_num = 0
    ite = 0
    total_iter = 1000
    Q_table = [init_q] * len(BANDITS)
    indexes = range(0, len(BANDITS))
    while ite < total_iter:
        # As long as the largest value is still the optimistic initial value, keep exploring
        # (i.e. refuse to give up on the untried bandits).
        if max(Q_table) == init_q:
            trial_num += 1
            ite += 1
            choose = indexes[np.random.randint(0, len(indexes))]
            reward = round(pull_bandit(choose), 1)
            total_reward += reward
            # update the Q table
            Q_table[choose] = round((Q_table[choose] + reward) / 2, 1)
            # restrict further exploration to the bandits still at the optimistic value
            indexes = [i for i in range(len(Q_table)) if Q_table[i] == init_q]
        else:
            ite += 1
            choose = Q_table.index(max(Q_table))
            reward = round(pull_bandit(choose), 1)
            total_reward += reward
    print("Number of exploratory pulls: %s" % trial_num)
    print("Total reward: %d; expected reward of the chosen bandit: %s...\n" % (total_reward, BANDITS[choose]))
    print("Estimated Q_table: %s, largest Q you picked: %s, bandit No. %s." % (Q_table, max(Q_table), choose + 1))
    print("True Q_table: %s\n" % BANDITS)
    if choose == BANDITS.index(max(BANDITS)):
        print("You have chosen the best bandit!")
        is_best = 1
    else:
        print("You haven't chosen the best bandit....")
        is_best = 0
    return total_reward, is_best


# In[597]:


opt_greedy(10)


# Optimistic exploration summary:
#
# - Very important: the result depends heavily on the **distribution** of BANDITS. When the variance is
#   large, the algorithm easily converges to a bandit whose expected reward has a small absolute value.
# - This is easy to understand: with a large variance (here half the mean), the highest-mean bandit will
#   sooner or later return a large negative reward, the averaging drags its Q down, and that bandit never
#   gets another chance to be selected.
# - In comparison, epsilon-greedy and plain greedy handle this situation much better.

# # upper-confidence bound

# In[633]:


# Upper-confidence bound
def ucb(c):
    total_reward = 0
    is_best = 0
    ite = 0
    total_iter = 1000
    Q_table_choose = [0] * len(BANDITS)  # reward estimate plus exploration bonus
    Q_table_true = [0] * len(BANDITS)    # plain reward estimate
    N_table = [0] * len(BANDITS)         # how many times each bandit has been pulled
    while ite < total_iter:
        ite += 1
        # This cell caps exploration at the first 100 pulls and then keeps exploiting the winner;
        # exploration could also be triggered by an epsilon condition instead.
        # (Standard UCB1 would pick argmax_a [Q(a) + c * sqrt(ln t / N(a))] at every pull.)
        if ite <= 100:
            choose = np.random.randint(0, len(BANDITS))
            rew = pull_bandit(choose)
            total_reward += rew
            N_table[choose] += 1
            Q_table_choose[choose] = round(rew, 2) + c * np.sqrt(np.log(ite) / N_table[choose])
            Q_table_true[choose] = round(rew, 2)
            index = Q_table_choose.index(max(Q_table_choose))
            Q = Q_table_true[index]
        else:
            total_reward += Q
    print("Total reward: %d; expected reward of the chosen bandit: %s...\n" % (total_reward, BANDITS[index]))
    print("Estimated Q_table: %s, largest Q you picked: %s, bandit No. %s." % (Q_table_true, Q, index + 1))
    print("True Q_table: %s\n" % BANDITS)
    if index == BANDITS.index(max(BANDITS)):
        print("You have chosen the best bandit!")
        is_best = 1
    else:
        print("You haven't chosen the best bandit....")
        is_best = 0
    return total_reward, is_best


# In[626]:


ucb(0.1)


# # gradient

# In[480]:


def softmax(xlist):
    ex = np.exp(xlist - np.max(xlist))
    return ex / np.sum(ex, axis=0)


# In[728]:


# Gradient bandit
# (Not entirely sure this interpretation is correct; see the sketch of the textbook update after this cell.)
def gradient(alpha):
    is_best = 0
    H_table = np.random.rand(len(BANDITS))  # preference per bandit
    pi = softmax(H_table)
    ite = 0
    total_iter = 1000
    total_reward = 0
    while ite < total_iter:
        ite += 1
        # Sample from a multinomial under pi and pick whichever bandit got the most draws.
        throw = np.random.multinomial(100, pi)
        choose = list(throw).index(max(throw))
        rew = pull_bandit(choose)
        total_reward += rew
        # Update only the chosen bandit's preference (the textbook version also uses a reward
        # baseline and decreases the preferences of the bandits that were not chosen).
        if rew > 0:
            H_table[choose] = H_table[choose] + alpha * rew * (1 - pi[choose])
        else:
            H_table[choose] = H_table[choose] - alpha * rew * pi[choose]
        pi = softmax(H_table)
    print("Total reward: %d; expected reward of the chosen bandit: %s...\n" % (total_reward, BANDITS[choose]))
    if list(pi).index(max(pi)) == BANDITS.index(max(BANDITS)):
        print("You have chosen the best bandit!")
        is_best = 1
    else:
        print("You haven't chosen the best bandit....")
        is_best = 0
    return total_reward, is_best


# In[730]:


gradient(0.1)
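# The gradient cell above updates only the chosen bandit and branches on the sign of the reward. For
# comparison, here is a minimal sketch (not part of the original notebook) of the gradient-bandit update
# as given in Sutton & Barto: sample the action from pi, use the running average reward as a baseline,
# raise the preference of the chosen action and lower the preferences of all the others. The function
# name `gradient_textbook` is made up for illustration.

# In[ ]:


def gradient_textbook(alpha, total_iter=1000):
    H_table = np.zeros(len(BANDITS))   # action preferences
    baseline = 0.0                     # running average of all rewards seen so far
    total_reward = 0.0
    for t in range(1, total_iter + 1):
        pi = softmax(H_table)
        choose = np.random.choice(len(BANDITS), p=pi)   # sample an action from pi
        rew = pull_bandit(choose)
        total_reward += rew
        baseline += (rew - baseline) / t                # incremental average as the baseline
        # H(a) += alpha * (R - baseline) * (1{a == choose} - pi(a)) for every action a
        one_hot = np.zeros(len(BANDITS))
        one_hot[choose] = 1.0
        H_table += alpha * (rew - baseline) * (one_hot - pi)
    best = int(np.argmax(softmax(H_table)))
    print("Total reward: %d; most preferred bandit: No. %s (expected reward %s)"
          % (total_reward, best + 1, BANDITS[best]))
    return total_reward, int(best == BANDITS.index(max(BANDITS)))


gradient_textbook(0.1)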
# # All Together

# In[ ]:


best_list = []
aver_reward = []
# repeat each experiment 100 times
N = 100
x = np.logspace(-7, 2, num=10, base=2)
for radio in x:
    ed_best, oeg_best, ucb_best, gra_best = 0, 0, 0, 0
    ed_rew, oeg_rew, ucb_rew, gra_rew = 0, 0, 0, 0
    for i in range(N):
        edr, egb = epson_greedy(radio)
        ed_best += egb
        ed_rew += edr
        oegr, oegb = opt_greedy(radio * 100)
        oeg_best += oegb
        oeg_rew += oegr
        ucbr, ucbb = ucb(radio)
        ucb_best += ucbb
        ucb_rew += ucbr
        grar, grab = gradient(radio)
        gra_best += grab
        gra_rew += grar
    best_list.append([ed_best / N, oeg_best / N, ucb_best / N, gra_best / N])
    aver_reward.append([ed_rew / N, oeg_rew / N, ucb_rew / N, gra_rew / N])


# In[732]:


best_list = np.array(best_list)
aver_reward = np.array(aver_reward)
y_edb, y_oegb, y_ucbb, y_grab = best_list[:, 0], best_list[:, 1], best_list[:, 2], best_list[:, 3]
y_edr, y_oegr, y_ucbr, y_grar = aver_reward[:, 0], aver_reward[:, 1], aver_reward[:, 2], aver_reward[:, 3]
plt.figure(figsize=(12, 8))
plt.subplots_adjust(left=.125, right=.9, bottom=.1, top=.9, wspace=.1, hspace=.3)
plt.subplot(2, 1, 1)
plt.title('All Bandit Algorithm Results: Best-Bandit Rate')
plt.plot(x, y_edb, color='black', label='epsilon-greedy')
plt.plot(x, y_oegb, color='red', label='optimistic exploration greedy')
plt.plot(x, y_ucbb, color='skyblue', label='upper-confidence bound')
plt.plot(x, y_grab, color='blue', label='gradient')
plt.legend()
plt.xlabel('parameter value')
plt.ylabel('probability of getting the best bandit')
plt.subplot(2, 1, 2)
plt.title('All Bandit Algorithm Results: Average Reward')
plt.plot(x, y_edr, color='black', label='epsilon-greedy')
plt.plot(x, y_oegr, color='red', label='optimistic exploration greedy')
plt.plot(x, y_ucbr, color='skyblue', label='upper-confidence bound')
plt.plot(x, y_grar, color='blue', label='gradient')
plt.legend()
plt.xlabel('parameter value')
plt.ylabel('average reward')
plt.show()


# These results do not match the author's exactly, but the overall picture is similar. Many things could
# explain the difference, for example:
#
# - the initial expected rewards chosen for BANDITS
# - the initial values used inside each algorithm
# - other implementation details
#
# Any one of these can change the outcome considerably. The point here is to understand the ideas and the
# flow of the algorithms, so there is no need to worry too much about the details.
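# Since the closing note points out that initial settings and other details can change the outcome
# considerably, here is a minimal sketch (not part of the original notebook) of how the sources of
# randomness could be pinned down with fixed seeds so that two runs can be compared like-for-like;
# the helper name `seed_everything` and the seed value 42 are arbitrary choices for illustration.

# In[ ]:


# Both `random` and `np.random` are used above, so both need seeding for a repeatable run.
def seed_everything(seed=42):
    random.seed(seed)
    np.random.seed(seed)


seed_everything(42)
print(epson_greedy(0.1)[0])
seed_everything(42)
print(epson_greedy(0.1)[0])  # identical to the previous line because the seeds match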