Imports¶

In [2]:

from ADMCode import visualize as vis
from ADMCode import qlearn, utils
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.simplefilter('ignore', np.RankWarning)
warnings.filterwarnings("ignore", module="matplotlib")
warnings.filterwarnings("ignore")
sns.set(style='white', font_scale=1.3)

%matplotlib inline
%config InlineBackend.figure_format = 'png'
%config InlineBackend.savefig.dpi = 150

Q-learning¶

Equations¶

Q-update: $$Q(a_i) \leftarrow Q(a_i) + \alpha (r_t - Q(a_i))$$
Softmax policy: $$P(a_i) = \frac{e^{\beta Q(a_i)}}{\sum_j^N e^{\beta Q(a_j)}}$$

Code¶

In [2]:

def update_Qi(Qval, reward, alpha):
    """ update q-value of selected action, given reward and alpha
    """
    return Qval + alpha*(reward - Qval)

def update_Pall(Qvector, beta):
    """ update vector of action selection probabilities given
    associated q-values
    """
    return np.array([np.exp(beta*Q_i) / np.sum(np.exp(beta * Qvector)) for Q_i in Qvector])

Multi-armed bandit task¶

In [57]:

from IPython.display import Image
from IPython.core.display import HTML 
Image(url='https://github.com/CoAxLab/AdaptiveDecisionMaking_2018/blob/master/notebooks/images/bandit.png?raw=true', width=250)

Out[57]:

Q-agent¶

In [3]:

# alpha (float):  learning rate parameter
# beta (float):   inverse temperature parameter 
# preward (list): 1xN vector of P(r) for each bandit
# rvalues (list): 1xN vector of rew. values for each bandit
agent = qlearn.Qagent(alpha=.1,
                      beta=3.5, 
                      preward=[.85, .75, .65], 
                      rvalues=[1, 1, 1])

Simulate a single agent¶

In [4]:

# play 800 trials of multi-armed bandit task
data1 = agent.play_bandits(ntrials=1000, get_output=True)

Check output dataframe¶

In [88]:

#  trial: trial number
#  q0: Q-value (i.e., expected value) of first bandit
#  p0: soft-max probability of selecting first bandit
#  choice: chosen bandit
#  feedback: feedback value returned by chosen bandit
#  optimal: did the agent choose the optimal action (1) or not (0)
data1.head() 

Out[88]:

	agent	trial	q0	q1	q2	p0	p1	p2	choice	feedback	optimal
0	1	1	0.000	0.1	0.0	0.2925	0.4150	0.2925	1	1	0
1	1	2	0.000	0.1	0.1	0.2605	0.3697	0.3697	2	1	0
2	1	3	0.100	0.1	0.1	0.3333	0.3333	0.3333	0	1	1
3	1	4	0.190	0.1	0.1	0.4066	0.2967	0.2967	0	1	1
4	1	5	0.271	0.1	0.1	0.4764	0.2618	0.2618	0	1	1

Plot $Q \, \& \, P$ values $(\alpha=.1)$¶

In [6]:

# plot Q-values and softmax selection prob. for each bandit
agent.set_params(alpha=.1)
data1 = agent.play_bandits(ntrials=1000, get_output=True)
vis.plot_qlearning(data1)

Optimal Choice: 48.80

Re-simulate agent with $\alpha=.025$¶

In [7]:

agent.set_params(alpha=.025)
data2 = agent.play_bandits(ntrials=1000, get_output=True)
vis.plot_qlearning(data2)

Optimal Choice: 48.60

Simulate multiple agents (compare $\alpha$)¶

$\alpha = .1 \,\, \beta=4.5$¶

In [8]:

agent.set_params(alpha=.1)
data3 = agent.simulate_multiple(nsims=10, ntrials=1000)
vis.plot_qlearning(data3)

Optimal Choice: 45.85

$\alpha = .025 \,\, \beta=4.5$¶

In [459]:

agent.set_params(alpha=.025)
data4 = agent.simulate_multiple(nsims=10, ntrials=1000)
vis.plot_qlearning(data4)

Make agent more exploitative: $\uparrow \beta$¶

$\alpha = .1 \,\, \beta=8$¶

In [11]:

agent.set_params(alpha=.12, beta=8)
data5 = agent.simulate_multiple(nsims=20, ntrials=1000)
vis.plot_qlearning(data5)

Optimal Choice: 60.30

$\alpha = .025 \,\, \beta=8$¶

In [403]:

agent.set_params(alpha=.025, beta=8)
data6 = agent.simulate_multiple(nsims=20, ntrials=1000)
vis.plot_qlearning(data6)

Q-Agent Competition!¶

In [14]:

# DEFINE AN AGENT AND N-ARMED BANDIT TASK
# N-item list of P(reward)'s (for each of N arms)

# FIXED
preward = [.8, .7, .6]        
nSims = 10
nTrials = 1000

# MAXIMUMS
alpha = .1
beta = 5

myAgent = qlearn.Qagent(alpha=alpha, beta=beta, preward=preward)
myData = myAgent.simulate_multiple(nsims=nSims, ntrials=nTrials)
vis.plot_qlearning(myData)

Optimal Choice: 39.54

Q-learning code¶

In [ ]:

def update_Qi(Qval, reward, alpha):
    """ update q-value of selected action, given reward and alpha
    """
    return Qval + alpha*(reward - Qval)

def update_Pall(Qvector, beta):
    """ update vector of action selection probabilities given
    associated q-values
    """
    return np.array([np.exp(beta*Q_i) / np.sum(np.exp(beta * Qvector)) for Q_i in Qvector])

In [40]:

def play_bandits(alpha=.1, beta=.15, preward=[.8, .5, .2], ntrials=1000):
    """ 
    ::Arguments::
        alpha (float): learning rate
        beta (float): inverse temperature parameter
        preward (list): 1xN vector of P(reward) for each of N bandits
        ntrials (int): number of trials to play bandits
    ::Returns::
        pandasDF (Ntrials x Nbandits), trialwise Q/P values per bandit
    """
    nact = len(preward)
    actions = np.arange(nact)  
    rvalues = np.ones(nact)
    
    bandits = qlearn.MultiArmedBandit(preward=preward, rvalues=rvalues)
    
    Qmatrix=np.zeros((ntrials, nact))
    Pmatrix=np.zeros_like(Qmatrix)
    Qvalues = Qmatrix[0, :]
    Pvalues = np.array([1/nact]*nact)

    for t in range(ntrials):
        # select bandit arm (action)
        act_i = np.random.choice(actions, p=Pvalues)
        
        # observe feedback
        r = bandits.get_feedback(act_i)
        # get expected value 
        Qexpected = Qvalues[act_i]
        # update expected value and store in Qvalues array
        # Qexpected + alpha * (r - Qexpected)
        Qvalues[act_i] = update_Qi(Qexpected, r, alpha)
        
        # update action selection probabilities 
        Pvalues = update_Pall(Qvalues, beta)
        
        # store new values in output matrices
        Qmatrix[t, :] = Qvalues
        Pmatrix[t, :] = Pvalues
        
    return make_output_df(Qmatrix, Pmatrix)  

In [42]:

def make_output_df(Qmatrix, Pmatrix):
    """ generate output dataframe with trialwise Q and P measures for each bandit,
    as well as choice selection, and feedback
    ::Arguments::
        Qmatrix (ndarray): q-value array with dims [Ntrials x Nbandits] 
        Pmatrix (ndarray): softmax prob array with dims [Ntrials x Nbandits] 
    ::Returns::
        df (DataFrame): pandas df containing Q and P values for each bandit [Ntrials x 2*Nbandits]
    """
    actions = np.arange(Qmatrix.shape[1])
    df = pd.concat([pd.DataFrame(dat) for dat in [Qmatrix, Pmatrix]], axis=1)
    cols = [['{}{}'.format(x,c) for c in actions] for x in ['q', 'p']]
    df.columns = np.hstack(cols)
    df.insert(0, 'trial', np.arange(1, df.shape[0]+1))
    return df

In [43]:

df = play_bandits(alpha=.1, beta=2, preward=[.85, .75, .65])

In [44]:

df.head()

Out[44]:

	trial	q0	q1	q2	p0	p1	p2
0	1	0.8013	0.6865	0.6183	0.3104	0.3792	0.3104
1	2	0.1000	0.1000	0.0000	0.3548	0.3548	0.2905
2	3	0.1000	0.1900	0.0000	0.3316	0.3970	0.2715
3	4	0.1900	0.1900	0.0000	0.3726	0.3726	0.2548
4	5	0.1710	0.1900	0.0000	0.3638	0.3778	0.2584