#!/usr/bin/env python
# coding: utf-8

# # Contextual bandit with changing context
# > Customizing the context and changing it midway through the simulation to see how quickly the agent adapts to the new context and starts recommending better products for it
#
# - toc: true
# - badges: true
# - comments: true
# - categories: [contextual bandit]
# - image:

# In[ ]:

get_ipython().system('pip install -q vowpalwabbit')


# In[ ]:

from vowpalwabbit import pyvw
import random
import matplotlib.pyplot as plt
import pandas as pd
from itertools import product


# ### Setting the context

# In[ ]:

# VW minimizes cost, so a liked article gets cost -1 and a disliked one cost 0
USER_LIKED_ARTICLE = -1.0
USER_DISLIKED_ARTICLE = 0.0


# In[ ]:

users = ['A', 'B', 'C']
items = ['Item1', 'Item2', 'Item3', 'Item4', 'Item5', 'Item6']
context1 = ['morning', 'evening']
context2 = ['summer', 'winter']

context = pd.DataFrame(list(product(users, context1, context2, items)),
                       columns=['users', 'context1', 'context2', 'items'])
context['reward'] = 0

# user A likes Item 1 in the morning, and Item 6 in summer
context.loc[(context.users=='A') & \
            (context.context1=='morning') & \
            (context['items']=='Item1'), \
            'reward'] = 1
context.loc[(context.users=='A') & \
            (context.context2=='summer') & \
            (context['items']=='Item6'), \
            'reward'] = 1

# user B likes Item 2 in winter, and Item 5 on summer mornings
context.loc[(context.users=='B') & \
            (context.context2=='winter') & \
            (context['items']=='Item2'), \
            'reward'] = 1
context.loc[(context.users=='B') & \
            (context.context1=='morning') & \
            (context.context2=='summer') & \
            (context['items']=='Item5'), \
            'reward'] = 1

# user C likes Item 2 in the morning, Item 3 in the evening, and Item 4 on winter mornings
context.loc[(context.users=='C') & \
            (context.context1=='morning') & \
            (context['items']=='Item2'), \
            'reward'] = 1
context.loc[(context.users=='C') & \
            (context.context1=='evening') & \
            (context['items']=='Item3'), \
            'reward'] = 1
context.loc[(context.users=='C') & \
            (context.context1=='morning') & \
            (context.context2=='winter') & \
            (context['items']=='Item4'), \
            'reward'] = 1

# VW works with costs, so a reward of 1 becomes a cost of -1
context['cost'] = context['reward']*-1
contextdf = context.copy()


# In[ ]:

contextdf.cost.value_counts()


# ### Cost function util

# In[ ]:

def get_cost(context, action):
    return contextdf.loc[(contextdf['users']==context['user']) & \
                         (contextdf.context1==context['context1']) & \
                         (contextdf.context2==context['context2']) & \
                         (contextdf['items']==action), \
                         'cost'].values[0]


# In[ ]:

get_cost({'user':'A', 'context1':'morning', 'context2':'summer'}, 'Item2')


# ### Vowpal Wabbit format util

# In[ ]:

# This function converts a context, the candidate actions, and an optional
# (chosen_action, cost, probability) label into the VW-friendly text format
def to_vw_example_format(context, actions, cb_label=None):
    if cb_label is not None:
        chosen_action, cost, prob = cb_label
    example_string = ""
    example_string += "shared |User users={} context1={} context2={}\n".format(context["user"], context["context1"], context["context2"])
    for action in actions:
        if cb_label is not None and action == chosen_action:
            example_string += "0:{}:{} ".format(cost, prob)
        example_string += "|Action items={} \n".format(action)
    # Strip the last newline
    return example_string[:-1]


# In[ ]:

context = {"user":"A", "context1":"morning", "context2":"summer"}
print(to_vw_example_format(context, items))


# In[ ]:

# Draw an action index from a (possibly unnormalized) PMF and return it with its probability
def sample_custom_pmf(pmf):
    total = sum(pmf)
    scale = 1 / total
    pmf = [x * scale for x in pmf]
    draw = random.random()
    sum_prob = 0.0
    for index, prob in enumerate(pmf):
        sum_prob += prob
        if(sum_prob > draw):
            return index, prob
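
# As a quick sanity check (an addition to the walkthrough, not part of the original notebook), we can draw from a small unnormalized PMF many times and confirm that the empirical frequencies land close to the normalized probabilities.

# In[ ]:

# Toy check of sample_custom_pmf: [2, 1, 1] normalizes to [0.5, 0.25, 0.25],
# so the empirical draw frequencies should come out close to those values.
toy_pmf = [2.0, 1.0, 1.0]
counts = [0] * len(toy_pmf)
for _ in range(10000):
    idx, _prob = sample_custom_pmf(toy_pmf)
    counts[idx] += 1
print([c / 10000 for c in counts])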

# In[ ]:

# Ask VW for a PMF over the candidate actions and sample one action according to it
def get_action(vw, context, actions):
    vw_text_example = to_vw_example_format(context, actions)
    pmf = vw.predict(vw_text_example)
    chosen_action_index, prob = sample_custom_pmf(pmf)
    return actions[chosen_action_index], prob


# In[ ]:

def choose_user(users):
    return random.choice(users)

def choose_context1(context1):
    return random.choice(context1)

def choose_context2(context2):
    return random.choice(context2)


# ### Simulation util

# In[ ]:

def run_simulation(vw, num_iterations, users, contexts1, contexts2, actions, cost_function, do_learn=True):
    cost_sum = 0.
    ctr = []

    for i in range(1, num_iterations+1):
        # Draw a random user and context for this round
        user = choose_user(users)
        context1 = choose_context1(contexts1)
        context2 = choose_context2(contexts2)
        context = {'user': user, 'context1': context1, 'context2': context2}
        # Let the policy pick an action and observe its cost
        action, prob = get_action(vw, context, actions)
        cost = cost_function(context, action)
        cost_sum += cost

        if do_learn:
            # Inform VW of what happened so it can learn from it
            vw_format = vw.parse(to_vw_example_format(context, actions, (action, cost, prob)), pyvw.vw.lContextualBandit)
            # Learn
            vw.learn(vw_format)
            # Let VW know you're done with these objects
            vw.finish_example(vw_format)

        # Negate the running average cost so the plot shows reward (higher is better)
        ctr.append(-1*cost_sum/i)

    return ctr


# In[ ]:

def plot_ctr(num_iterations, ctr):
    plt.plot(range(1, num_iterations+1), ctr)
    plt.xlabel('num_iterations', fontsize=14)
    plt.ylabel('ctr', fontsize=14)
    plt.ylim([0, 1])


# ### Running simulations

# In[ ]:

# Instantiate learner in VW with user-action interaction features (-q UA)
vw = pyvw.vw("--cb_explore_adf -q UA --quiet --epsilon 0.2")

num_iterations = 5000
ctr = run_simulation(vw, num_iterations, users, context1, context2, items, get_cost)

plot_ctr(num_iterations, ctr)


# In[ ]:

# Instantiate learner in VW, but without the -q UA interaction features
vw = pyvw.vw("--cb_explore_adf --quiet --epsilon 0.2")

num_iterations = 5000
ctr = run_simulation(vw, num_iterations, users, context1, context2, items, get_cost)

plot_ctr(num_iterations, ctr)


# In[ ]:

# Instantiate learner in VW, this time without learning from the feedback
vw = pyvw.vw("--cb_explore_adf -q UA --quiet --epsilon 0.2")

num_iterations = 5000
ctr = run_simulation(vw, num_iterations, users, context1, context2, items, get_cost, do_learn=False)

plot_ctr(num_iterations, ctr)


# ### Setting the new context

# In[ ]:

users = ['A', 'B', 'C']
items = ['Item1', 'Item2', 'Item3', 'Item4', 'Item5', 'Item6']
context1 = ['morning', 'evening']
context2 = ['summer', 'winter']

context = pd.DataFrame(list(product(users, context1, context2, items)),
                       columns=['users', 'context1', 'context2', 'items'])
context['reward'] = 0

# user A now likes Item 2 in the morning, and Item 5 in summer
context.loc[(context.users=='A') & \
            (context.context1=='morning') & \
            (context['items']=='Item2'), \
            'reward'] = 1
context.loc[(context.users=='A') & \
            (context.context2=='summer') & \
            (context['items']=='Item5'), \
            'reward'] = 1

# user B now likes Item 2 in summer, and Item 5 in the morning
context.loc[(context.users=='B') & \
            (context.context2=='summer') & \
            (context['items']=='Item2'), \
            'reward'] = 1
context.loc[(context.users=='B') & \
            (context.context1=='morning') & \
            (context['items']=='Item5'), \
            'reward'] = 1

# user C now likes Item 4 in the morning, Item 3 in the evening, and Item 4 on winter evenings
context.loc[(context.users=='C') & \
            (context.context1=='morning') & \
            (context['items']=='Item4'), \
            'reward'] = 1
context.loc[(context.users=='C') & \
            (context.context1=='evening') & \
            (context['items']=='Item3'), \
            'reward'] = 1
context.loc[(context.users=='C') & \
            (context.context1=='evening') & \
            (context.context2=='winter') & \
            (context['items']=='Item4'), \
            'reward'] = 1

context['cost'] = context['reward']*-1
contextdf_new = context.copy()
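
# Before wrapping the new table in a cost function, it can help to eyeball which (user, context, item) combinations now carry a reward. This inspection cell is an addition to the original walkthrough.

# In[ ]:

# Show only the rows of the new preference table that yield a reward,
# so the shift from the first reward function is easy to see.
contextdf_new[contextdf_new.reward == 1]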

# In[ ]:

def get_cost_new1(context, action):
    return contextdf_new.loc[(contextdf_new['users']==context['user']) & \
                             (contextdf_new.context1==context['context1']) & \
                             (contextdf_new.context2==context['context2']) & \
                             (contextdf_new['items']==action), \
                             'cost'].values[0]


# In[ ]:

def run_simulation_multiple_cost_functions(vw, num_iterations, users, contexts1, contexts2, actions, cost_functions, do_learn=True):
    cost_sum = 0.
    ctr = []

    start_counter = 1
    end_counter = start_counter + num_iterations
    # Run num_iterations rounds per cost function, then switch to the next one
    for cost_function in cost_functions:
        for i in range(start_counter, end_counter):
            user = choose_user(users)
            context1 = choose_context1(contexts1)
            context2 = choose_context2(contexts2)
            context = {'user': user, 'context1': context1, 'context2': context2}

            action, prob = get_action(vw, context, actions)
            cost = cost_function(context, action)
            cost_sum += cost

            if do_learn:
                # Inform VW of what happened so it can learn from it
                vw_format = vw.parse(to_vw_example_format(context, actions, (action, cost, prob)), pyvw.vw.lContextualBandit)
                vw.learn(vw_format)
                # Let VW know you're done with these objects
                vw.finish_example(vw_format)

            ctr.append(-1*cost_sum/i)

        start_counter = end_counter
        end_counter = start_counter + num_iterations

    return ctr


# In[ ]:

# Use the first reward function initially and then switch to the second reward function
# Instantiate learner in VW
vw = pyvw.vw("--cb_explore_adf -q UA --quiet --epsilon 0.2")

num_iterations_per_cost_func = 5000
cost_functions = [get_cost, get_cost_new1]
total_iterations = num_iterations_per_cost_func * len(cost_functions)

ctr = run_simulation_multiple_cost_functions(vw, num_iterations_per_cost_func, users, context1, context2, items, cost_functions)

plot_ctr(total_iterations, ctr)


# In[ ]:

# Do not learn
# Use the first reward function initially and then switch to the second reward function
# Instantiate learner in VW
vw = pyvw.vw("--cb_explore_adf -q UA --quiet --epsilon 0.2")

num_iterations_per_cost_func = 5000
cost_functions = [get_cost, get_cost_new1]
total_iterations = num_iterations_per_cost_func * len(cost_functions)

ctr = run_simulation_multiple_cost_functions(vw, num_iterations_per_cost_func, users, context1, context2, items, cost_functions, do_learn=False)

plot_ctr(total_iterations, ctr)
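
# To make the adaptation easier to see, the two switching-context runs can also be overlaid on a single figure. This comparison cell is an illustrative addition: the variable names (vw_learn, vw_frozen), the legend labels, and the dashed line marking the switch point are choices made here, not part of the original notebook.

# In[ ]:

# Re-run the switching experiment twice (once learning, once with learning disabled)
# and plot both moving-average reward curves together.
vw_learn = pyvw.vw("--cb_explore_adf -q UA --quiet --epsilon 0.2")
vw_frozen = pyvw.vw("--cb_explore_adf -q UA --quiet --epsilon 0.2")

ctr_learn = run_simulation_multiple_cost_functions(vw_learn, num_iterations_per_cost_func, users, context1, context2, items, cost_functions)
ctr_frozen = run_simulation_multiple_cost_functions(vw_frozen, num_iterations_per_cost_func, users, context1, context2, items, cost_functions, do_learn=False)

plt.plot(range(1, total_iterations+1), ctr_learn, label='learning')
plt.plot(range(1, total_iterations+1), ctr_frozen, label='no learning')
plt.axvline(num_iterations_per_cost_func, linestyle='--', color='grey')  # reward function switches here
plt.xlabel('num_iterations', fontsize=14)
plt.ylabel('ctr', fontsize=14)
plt.ylim([0, 1])
plt.legend()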