import pandas as pd import numpy as np from numpy.linalg import inv from sklearn.metrics import roc_auc_score from sklearn.model_selection import GridSearchCV, train_test_split, cross_validate from sklearn.tree import DecisionTreeClassifier from scipy.optimize import minimize from lightgbm import LGBMClassifier from scipy.stats import beta import pickle import os import shutil import tqdm from tqdm.notebook import tqdm from multiprocessing.dummy import Pool from IPython.display import clear_output import matplotlib.pyplot as plt from torch.utils.data import Dataset, DataLoader from __future__ import print_function from abc import ABCMeta, abstractmethod !mkdir -p data !cd data && wget -q --show-progress https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data !cd data && wget -q --show-progress https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.names mushroom_data = pd.read_csv("data/agaricus-lepiota.data", header=None) column_names = ["classes", "cap-shape", "cap-surface", "cap-color", "bruises?", "odor", "gill-attachment", "gill-spacing", "gill-size", "gill-color", "stalk-shape", "stalk-root", "stalk-surface-above-ring", "stalk-surface-below-ring", "stalk-color-above-ring", "stalk-color-below-ring", "veil-type", "veil-color", "ring-number", "ring-type", "spore-print-color", "population", "habitat"] mushroom_data.columns = column_names mushroom_data.head() mushroom_data.dtypes # label encoding for column in column_names: mushroom_data[column] = mushroom_data[column].astype('category') mushroom_data[column] = mushroom_data[column].cat.codes # split idx_trn, idx_tst = train_test_split(mushroom_data.index, test_size=0.2, random_state=42, stratify=mushroom_data[['classes']]) def gini(var): df = mushroom_data.copy() x_trn = df.loc[idx_trn, var] y_trn = df.loc[idx_trn, 'classes'] x_tst = df.loc[idx_tst, var] y_tst = df.loc[idx_tst, 'classes'] if x_trn.dtype in ['O','object']: cats = pd.DataFrame({'x': x_trn, 'y': y_trn}).fillna('#NAN#').groupby('x').agg('mean').sort_values('y').index.values X_trn = pd.Categorical(x_trn.fillna('#NAN#'), categories=cats, ordered=True).codes.reshape(-1, 1) X_tst = pd.Categorical(x_tst.fillna('#NAN#'), categories=cats, ordered=True).codes.reshape(-1, 1) else: repl = min(x_trn.min(), x_tst.min())-1 if np.isfinite(min(x_trn.min(), x_tst.min())-1) else -999999 #repl = x_trn.min()-1 if np.isfinite(x_trn.min())-1 else -999999 X_trn = x_trn.fillna(repl).replace(np.inf, repl).replace(-np.inf, repl).values.reshape(-1, 1) X_tst = x_tst.fillna(repl).replace(np.inf, repl).replace(-np.inf, repl).values.reshape(-1, 1) obvious_gini_trn = 2*roc_auc_score(y_trn, X_trn)-1 obvious_gini_tst = 2*roc_auc_score(y_tst, X_tst)-1 if obvious_gini_trn < 0: obvious_gini_trn = -obvious_gini_trn obvious_gini_tst = -obvious_gini_tst parameters = {'min_samples_leaf':[0.01, 0.025, 0.05, 0.1]} dt = DecisionTreeClassifier(random_state=1) clf = GridSearchCV(dt, parameters, cv=4, scoring='roc_auc', n_jobs=10) clf.fit(X_trn, y_trn) true_gini_trn = 2*clf.best_score_-1 true_gini_tst = 2*roc_auc_score(y_tst, clf.predict_proba(X_tst)[:, 1])-1 if true_gini_trn < 0: true_gini_trn = -true_gini_trn true_gini_tst = -true_gini_tst if obvious_gini_trn > true_gini_trn: return [var, obvious_gini_trn, obvious_gini_tst] else: return [var, true_gini_trn, true_gini_tst] with Pool(20) as p: vars_gini = list(tqdm(p.imap(gini, column_names), total=len(column_names))) vars_gini = pd.DataFrame(vars_gini) vars_gini.set_index(0, inplace=True) vars_gini.columns = ['gini_train', 'gini_test'] vars_gini.T vars_corrs = mushroom_data.loc[:, column_names].corr().abs().stack().reset_index().drop_duplicates() vars_corrs = vars_corrs[vars_corrs.level_0!=vars_corrs.level_1] vars_corrs.columns = ['var_1', 'var_2', 'correlation'] vars_corrs = vars_corrs.set_index(['var_1', 'var_2'], drop=True).sort_values(by='correlation', ascending=False) vars_drop = [] for v in vars_corrs[vars_corrs.correlation > 0.7].index.values: if v[0] not in vars_drop and v[1] not in vars_drop: vars_drop.append(v[1] if vars_gini.loc[v[0], 'gini_train'] > vars_gini.loc[v[1], 'gini_train'] else v[0]) del v # all variables vars0 = column_names[1:] # drop values with gini less than 3% vars1 = [v for v in vars0 if vars_gini.loc[v, 'gini_train'] >= 0.03] # drop correlated variables vars2 = [v for v in vars1 if v not in vars_drop] i = 0 for var_lst in [vars0, vars1, vars2]: i += 1 lgb = LGBMClassifier(max_depth=1, n_estimators=250, random_state=42, n_jobs=30) cv = cross_validate(lgb, mushroom_data.loc[:, var_lst], mushroom_data.loc[:, 'classes'], cv=5, scoring='roc_auc', n_jobs=20, return_train_score=True) lgb.fit(mushroom_data[var_lst], mushroom_data['classes']) print({'Variables': len(var_lst), 'Train CV': round(cv['train_score'].mean()*2-1, 4), 'Test CV': round(cv['test_score'].mean()*2-1, 4)}) var_lst_imp = pd.Series(dict(zip(var_lst, lgb.feature_importances_))) var_lst = [i for i in var_lst_imp.index if var_lst_imp.loc[i]>0] print({'exclude': [i for i in var_lst_imp.index if var_lst_imp.loc[i]<=0]}) print(len(var_lst)) forw_cols = [] current_ginis = pd.Series({'Train CV':0, 'Test CV':0}) def forw(x): lgb = LGBMClassifier(max_depth=1, n_estimators=250, random_state=42, n_jobs=1) cv = cross_validate(lgb, mushroom_data.loc[:, forw_cols+[x]], mushroom_data.loc[:, 'classes'], cv=5, scoring='roc_auc', n_jobs=1, return_train_score=True) lgb.fit(mushroom_data.loc[:, forw_cols+[x]], mushroom_data.loc[:, 'classes']) return x, pd.Series({ 'Train CV': cv['train_score'].mean()*2-1, 'Test CV': cv['test_score'].mean()*2-1 }) forwards_log = [] while len(forw_cols)<30: with Pool(20) as p: res = list(tqdm(p.imap(forw, [i for i in var_lst if i not in forw_cols]), total=len(var_lst)-len(forw_cols), leave=False)) res = pd.DataFrame({i[0]:i[1] for i in res}).T delta = res - current_ginis if delta['Test CV'].max()<0: break best_var = delta['Test CV'].idxmax() forw_cols = forw_cols + [best_var] current_ginis = res.loc[best_var] forwards_log.append(current_ginis) clear_output() print(pd.DataFrame(forwards_log)) clear_output() forwards_log = pd.DataFrame(forwards_log) forwards_log['Uplift Train CV'] = forwards_log['Train CV']-forwards_log['Train CV'].shift(1).fillna(0) forwards_log['Uplift Test CV'] = forwards_log['Test CV']-forwards_log['Test CV'].shift(1).fillna(0) print(forwards_log) ids_vars = forwards_log[forwards_log['Uplift Test CV']>0.001].index.values.tolist() vars_gini.loc[ids_vars,:] mushroom_data_features = mushroom_data[ids_vars + ["classes"]] mushroom_data = mushroom_data_features.loc[mushroom_data_features.index.repeat(4)].reset_index(drop=True) mushroom_data["a"] = np.random.choice([0, 1], mushroom_data.shape[0]) mushroom_data["probs"] = 1 mushroom_data["y"] = 0 eat_edible = (1-mushroom_data["classes"]) * mushroom_data["a"] * 1 eat_poisonous = mushroom_data["classes"] * mushroom_data["a"] * np.random.choice([1, -1], mushroom_data.shape[0]) mushroom_data["y"] = eat_edible + eat_poisonous new_names = ['X_' + str(i+1) for i in range(len(ids_vars))] mushroom_data = mushroom_data.rename(columns=dict(zip(ids_vars, new_names))) mushroom_data_final = mushroom_data[new_names + ['a', 'y', 'probs']] mushroom_data_final.head() with open('data/mushroom_data_final.pickle', 'wb') as handle: pickle.dump(mushroom_data_final, handle, protocol=pickle.HIGHEST_PROTOCOL) def softmax(action_values, tau=1.0): """ Args: action_values (Numpy array): A 2D array of shape (batch_size, num_actions). The action-values computed by an action-value network. tau (float): The temperature parameter scalar. Returns: A 2D array of shape (batch_size, num_actions). Where each column is a probability distribution over the actions representing the policy. """ # Compute the preferences by dividing the action-values by the temperature parameter tau preferences = action_values / tau # Compute the maximum preference across the actions max_preference = np.max(preferences, axis=1) # your code here # Reshape max_preference array which has shape [Batch,] to [Batch, 1]. This allows NumPy broadcasting # when subtracting the maximum preference from the preference of each action. reshaped_max_preference = max_preference.reshape((-1, 1)) # print(reshaped_max_preference) # Compute the numerator, i.e., the exponential of the preference - the max preference. exp_preferences = np.exp(preferences - reshaped_max_preference) # print(exp_preferences) # Compute the denominator, i.e., the sum over the numerator along the actions axis. sum_of_exp_preferences = np.sum(exp_preferences, axis=1) # print(sum_of_exp_preferences) # your code here # Reshape sum_of_exp_preferences array which has shape [Batch,] to [Batch, 1] to allow for NumPy broadcasting # when dividing the numerator by the denominator. reshaped_sum_of_exp_preferences = sum_of_exp_preferences.reshape((-1, 1)) # print(reshaped_sum_of_exp_preferences) # Compute the action probabilities according to the equation in the previous cell. action_probs = exp_preferences / reshaped_sum_of_exp_preferences # print(action_probs) # your code here # squeeze() removes any singleton dimensions. It is used here because this function is used in the # agent policy when selecting an action (for which the batch dimension is 1.) As np.random.choice is used in # the agent policy and it expects 1D arrays, we need to remove this singleton batch dimension. action_probs = action_probs.squeeze() return action_probs # if __name__ == '__main__': # rand_generator = np.random.RandomState(0) # action_values = rand_generator.normal(0, 1, (2, 4)) # tau = 0.5 # action_probs = softmax(action_values, tau) # print("action_probs", action_probs) # assert (np.allclose(action_probs, np.array([ # [0.25849645, 0.01689625, 0.05374514, 0.67086216], # [0.84699852, 0.00286345, 0.13520063, 0.01493741] # ]))) # action_values = np.array([[0.0327, 0.0127, 0.0688]]) # tau = 1. # action_probs = softmax(action_values, tau) # print("action_probs", action_probs) # assert np.allclose(action_probs, np.array([0.3315, 0.3249, 0.3436]), atol=1e-04) # print("Passed the asserts! (Note: These are however limited in scope, additional testing is encouraged.)") class ReplayBuffer: def __init__(self, size, seed): """ Args: size (integer): The size of the replay buffer. minibatch_size (integer): The sample size. seed (integer): The seed for the random number generator. """ self.buffer = [] self.rand_generator = np.random.RandomState(seed) self.max_size = size def append(self, state, action, reward): """ Args: state (Numpy array): The state. action (integer): The action. reward (float): The reward. terminal (integer): 1 if the next state is a terminal state and 0 otherwise. next_state (Numpy array): The next state. """ if len(self.buffer) == self.max_size: del self.buffer[0] self.buffer.append([state, action, reward]) def sample(self, last_action): """ Returns: A list of transition tuples including state, action, reward, terinal, and next_state """ state, action, reward = map(list, zip(*self.buffer)) idxs = [elem == last_action for elem in action] X = [b for a, b in zip(idxs, state) if a] y = [b for a, b in zip(idxs, reward) if a] return X, y def size(self): return len(self.buffer) # if __name__ == "__main__": # buffer = ReplayBuffer(size=100000, seed=1) # buffer.append([1, 2, 3], 0, 1) # buffer.append([4, 21, 3], 1, 1) # buffer.append([0, 1, 1], 0, 0) # print(buffer.sample(0)) def generate_samples(num_samples, num_features, num_arms, return_dataframe=False): np.random.seed(1) # generate pseudo features X and "true" arms' weights X = np.random.randint(0, 4, size=(num_samples, num_features)) actions_weights = np.random.normal(loc=-1., scale=1, size=(num_arms, num_features)) # apply data generating policy policy_weights = np.random.normal(size=(num_arms, num_features)) action_scores = np.dot(X, policy_weights.T) action_probs = softmax(action_scores, tau=10) A = np.zeros((num_samples, 1)) for i in range(num_samples): A[i, 0] = np.random.choice(range(num_arms), 1, p=action_probs[i, :]) # store probabilities of choosing a particular action _rows = np.zeros_like(A, dtype=np.intp) _columns = A.astype(int) probs = action_probs[_rows, _columns] # calculate "true" outcomes Y ## broadcasting chosen actions to action weights matrix_multiplicator = actions_weights[_columns].squeeze() # (num_samples x num_features) matrix rewards = np.sum(X * matrix_multiplicator, axis=1).reshape(-1, 1) Y = (np.sign(rewards) + 1) / 2 if return_dataframe: column_names = ['X_' + str(i+1) for i in range(num_features)] X = pd.DataFrame(X, columns=column_names) A = pd.DataFrame(A, columns=['a']) Y = pd.DataFrame(Y, columns=['y']) probs = pd.DataFrame(probs, columns=['probs']) return pd.concat([X, A, Y, probs], axis=1) else: return X, A, Y, probs # dataset = generate_samples(100000, 4, 3, True) # dataset.head() def data_randomizer(pickle_file, seed=None): if isinstance(pickle_file, str): with open(pickle_file, 'rb') as f: dataset = pickle.load(f) else: dataset = pickle_file actions = sorted(dataset.iloc[:, -3].unique().tolist()) tst_smpl = pd.DataFrame().reindex_like(dataset).dropna() ratio = 0.1 for action in actions: action_subsample = dataset[dataset.iloc[:, -3] == action] action_drop, action_use = train_test_split(action_subsample.index, test_size=ratio, random_state=seed, stratify=action_subsample.iloc[:, -2]) tst_smpl = pd.concat([tst_smpl, action_subsample.loc[action_use]]).sample(frac=1, random_state=seed) tst_smpl = tst_smpl.reset_index(drop=True) del action_drop, action_use X = tst_smpl.iloc[:, :-3].to_numpy() A = tst_smpl.iloc[:, -3].to_numpy() Y = tst_smpl.iloc[:, -2].to_numpy() probs = tst_smpl.iloc[:, -1].to_numpy() return X, A, Y/probs class BanditDataset(Dataset): def __init__(self, pickle_file, seed=None): # load dataset X, A, Y = data_randomizer(pickle_file, seed) self.features = X self.actions = A self.rewards = Y def __len__(self): return len(self.rewards) def __getitem__(self, idx): feature_vec = self.features[idx] action = self.actions[idx] reward = self.rewards[idx] return feature_vec, action, reward # if __name__ == '__main__': # dir = 'data/mushroom_data_final.pickle' # data = data_randomizer(dir) # dataset = BanditDataset(pickle_file=dir, seed=1) # print(len(dataset)) # print(dataset.__len__()) # print(dataset[420]) # print(dataset[421]) # print(dataset[0]) # print(dataset[1]) # dl = DataLoader(dataset, batch_size=2, shuffle=True) # print(next(iter(dl))) # dataset = generate_samples(100000, 4, 3, True) # dataset = BanditDataset(pickle_file=dataset, seed=1) # print(len(dataset)) # print(dataset.__len__()) # print(dataset[420]) # print(dataset[421]) # print(dataset[0]) # print(dataset[1]) def get_leveled_data(arr): """ Args: arr: list of lists os different length Returns: average result over arr, axis=0 """ b = np.zeros([len(arr), len(max(arr, key=lambda x: len(x)))]) b[:, :] = np.nan for i, j in enumerate(arr): b[i][0:len(j)] = j return b def smooth(data, k): num_episodes = data.shape[1] num_runs = data.shape[0] smoothed_data = np.zeros((num_runs, num_episodes)) for i in range(num_episodes): if i < k: smoothed_data[:, i] = np.mean(data[:, :i + 1], axis=1) else: smoothed_data[:, i] = np.mean(data[:, i - k:i + 1], axis=1) return smoothed_data def plot_result(result_batch, result_online, batch_size): plt_agent_sweeps = [] num_steps = np.inf fig, ax = plt.subplots(figsize=(8, 6)) for data, label in zip([result_batch, result_online], ['batch', 'online']): sum_reward_data = get_leveled_data(data) # smooth data smoothed_sum_reward = smooth(data=sum_reward_data, k=100) mean_smoothed_sum_reward = np.mean(smoothed_sum_reward, axis=0) if mean_smoothed_sum_reward.shape[0] < num_steps: num_steps = mean_smoothed_sum_reward.shape[0] plot_x_range = np.arange(0, mean_smoothed_sum_reward.shape[0]) graph_current_agent_sum_reward, = ax.plot(plot_x_range, mean_smoothed_sum_reward[:], label=label) plt_agent_sweeps.append(graph_current_agent_sum_reward) update_points = np.ceil(np.arange(num_steps) / batch_size).astype(int) ax.plot(plot_x_range, mean_smoothed_sum_reward[update_points], label='upper bound') ax.legend(handles=plt_agent_sweeps, fontsize=13) ax.set_title("Learning Curve", fontsize=15) ax.set_xlabel('Episodes', fontsize=14) ax.set_ylabel('reward', rotation=0, labelpad=40, fontsize=14) # ax.set_ylim([-300, 300]) plt.tight_layout() plt.show() class BaseAgent: """Implements the agent for an RL-Glue environment. Note: agent_init, agent_start, agent_step, agent_end, agent_cleanup, and agent_message are required methods. """ __metaclass__ = ABCMeta def __init__(self): pass @abstractmethod def agent_init(self, agent_info={}): """Setup for the agent called when the experiment first starts.""" @abstractmethod def agent_start(self, observation): """The first method called when the experiment starts, called after the environment starts. Args: observation (Numpy array): the state observation from the environment's evn_start function. Returns: The first action the agent takes. """ @abstractmethod def agent_step(self, reward, observation): """A step taken by the agent. Args: reward (float): the reward received for taking the last action taken observation (Numpy array): the state observation from the environment's step based, where the agent ended up after the last step Returns: The action the agent is taking. """ @abstractmethod def agent_end(self, reward): """Run when the agent terminates. Args: reward (float): the reward the agent received for entering the terminal state. """ @abstractmethod def agent_cleanup(self): """Cleanup done after the agent ends.""" @abstractmethod def agent_message(self, message): """A function used to pass information from the agent to the experiment. Args: message: The message passed to the agent. Returns: The response (or answer) to the message. """ class RandomAgent(BaseAgent): def __init__(self): super().__init__() self.num_actions = None def agent_init(self, agent_info=None): if agent_info is None: agent_info = {} self.num_actions = agent_info.get('num_actions', 2) def agent_start(self, observation): pass def agent_step(self, reward, observation): pass def agent_end(self, reward): pass def agent_cleanup(self): pass def agent_message(self, message): pass def agent_policy(self, observation): return np.random.choice(self.num_actions) # if __name__ == '__main__': # ag = RandomAgent() # print(ag.num_actions) # ag.agent_init() # print(ag.num_actions) class Agent(BaseAgent): """agent does *no* learning, selects random action always""" def __init__(self): super().__init__() self.arm_count = None self.last_action = None self.num_actions = None self.q_values = None self.step_size = None self.initial_value = 0.0 self.batch_size = None self.q_values_oracle = None # used for batch updates def agent_init(self, agent_info=None): """Setup for the agent called when the experiment first starts.""" if agent_info is None: agent_info = {} self.num_actions = agent_info.get("num_actions", 2) self.initial_value = agent_info.get("initial_value", 0.0) self.q_values = np.ones(agent_info.get("num_actions", 2)) * self.initial_value self.step_size = agent_info.get("step_size", 0.1) self.batch_size = agent_info.get('batch_size', 1) self.q_values_oracle = self.q_values.copy() self.arm_count = np.zeros(self.num_actions) # [0.0 for _ in range(self.num_actions)] # self.last_action = np.random.choice(self.num_actions) # set first action to random def agent_start(self, observation): """The first method called when the experiment starts, called after the environment starts. Args: observation (Numpy array): the state observation from the environment's evn_start function. Returns: The first action the agent takes. """ self.last_action = np.random.choice(self.num_actions) return self.last_action def agent_step(self, reward, observation): """A step taken by the agent. Args: reward (float): the reward received for taking the last action taken observation (Numpy array): the state observation from the environment's step based, where the agent ended up after the last step Returns: The action the agent is taking. """ # local_action = 0 # choose the action here self.last_action = np.random.choice(self.num_actions) return self.last_action def agent_end(self, reward): pass def agent_cleanup(self): pass def agent_message(self, message): pass def argmax(q_values): """ Takes in a list of q_values and returns the index of the item with the highest value. Breaks ties randomly. returns: int - the index of the highest value in q_values """ top_value = float("-inf") ties = [] for i in range(len(q_values)): if q_values[i] > top_value: ties = [i] top_value = q_values[i] elif q_values[i] == top_value: ties.append(i) return np.random.choice(ties) class GreedyAgent(Agent): def __init__(self): super().__init__() def agent_init(self, agent_info=None): if agent_info is None: agent_info = {} super().agent_init(agent_info) def agent_step(self, reward, observation): """ Takes one step for the agent. It takes in a reward and observation and returns the action the agent chooses at that time step. Arguments: reward -- float, the reward the agent received from the environment after taking the last action. observation -- float, the observed state the agent is in. Do not worry about this as you will not use it until future lessons Returns: current_action -- int, the action chosen by the agent at the current time step. """ a = self.last_action self.arm_count[a] += 1 self.q_values_oracle[a] = self.q_values_oracle[a] + 1 / self.arm_count[a] * (reward - self.q_values_oracle[a]) if sum(self.arm_count) % self.batch_size == 0: self.q_values = self.q_values_oracle.copy() current_action = argmax(self.q_values) self.last_action = current_action return current_action class EpsilonGreedyAgent(Agent): def __init__(self): super().__init__() self.epsilon = None def agent_init(self, agent_info=None): if agent_info is None: agent_info = {} super().agent_init(agent_info) self.epsilon = agent_info.get("epsilon", 0.1) def agent_step(self, reward, observation): """ Takes one step for the agent. It takes in a reward and observation and returns the action the agent chooses at that time step. Arguments: reward -- float, the reward the agent received from the environment after taking the last action. observation -- float, the observed state the agent is in. Do not worry about this as you will not use it until future lessons Returns: current_action -- int, the action chosen by the agent at the current time step. """ a = self.last_action self.arm_count[a] += 1 self.q_values_oracle[a] = self.q_values_oracle[a] + 1 / self.arm_count[a] * (reward - self.q_values_oracle[a]) if np.sum(self.arm_count) % self.batch_size == 0: self.q_values = self.q_values_oracle.copy() if np.random.random() < self.epsilon: current_action = np.random.choice(range(len(self.arm_count))) else: current_action = argmax(self.q_values) self.last_action = current_action return current_action class UCBAgent(Agent): def __init__(self): super().__init__() self.upper_bounds = None self.alpha = None # exploration parameter def agent_init(self, agent_info=None): if agent_info is None: agent_info = {} super().agent_init(agent_info) self.alpha = agent_info.get("alpha", 1.0) self.arm_count = np.ones(self.num_actions) self.upper_bounds = np.sqrt(np.log(np.sum(self.arm_count)) / self.arm_count) def agent_step(self, reward, observation): a = self.last_action self.arm_count[a] += 1 self.q_values_oracle[a] = self.q_values_oracle[a] + 1 / self.arm_count[a] * (reward - self.q_values_oracle[a]) # since we start with arms_count = np.ones(num_actions), # we should subtract num_actions to get number of the current round if (np.sum(self.arm_count) - self.num_actions) % self.batch_size == 0: self.q_values = self.q_values_oracle.copy() self.upper_bounds = np.sqrt(np.log(np.sum(self.arm_count)) / self.arm_count) # if min(self.q_values + self.alpha * self.upper_bounds) < max(self.q_values): # print(f'Distinguish suboptimal arm at step {sum(self.arm_count)}') current_action = argmax(self.q_values + self.alpha * self.upper_bounds) # current_action = np.argmax(self.q_values + self.alpha * self.upper_bounds) self.last_action = current_action return current_action class TSAgent(Agent): def agent_step(self, reward, observation): a = self.last_action self.arm_count[a] += 1 self.q_values_oracle[a] = self.q_values_oracle[a] + 1 / self.arm_count[a] * (reward - self.q_values_oracle[a]) if (np.sum(self.arm_count) - self.num_actions) % self.batch_size == 0: self.q_values = self.q_values_oracle.copy() # sample from posteriors theta = [beta.rvs(a + 1, b + 1, size=1) for a, b in zip(self.q_values * self.arm_count, self.arm_count - self.q_values * self.arm_count)] # choose the max realization current_action = argmax(theta) self.last_action = current_action return current_action class LinUCBAgent(BaseAgent): def __init__(self): super().__init__() self.name = "LinUCB" def agent_init(self, agent_info=None): if agent_info is None: agent_info = {} self.num_actions = agent_info.get('num_actions', 3) self.alpha = agent_info.get('alpha', 1) self.batch_size = agent_info.get('batch_size', 1) # Set random seed for policy for each run self.policy_rand_generator = np.random.RandomState(agent_info.get("seed", None)) self.last_action = None self.last_state = None self.num_round = None def agent_policy(self, observation): p_t = np.zeros(self.num_actions) for i in range(self.num_actions): # initialize theta hat self.theta = inv(self.A[i]).dot(self.b[i]) # get context of each arm from flattened vector of length 100 cntx = observation # get gain reward of each arm p_t[i] = self.theta.T.dot(cntx) + self.alpha * np.sqrt(np.maximum(cntx.dot(inv(self.A[i]).dot(cntx)), 0)) # action = np.random.choice(np.where(p_t == max(p_t))[0]) action = self.policy_rand_generator.choice(np.where(p_t == max(p_t))[0]) return action def agent_start(self, observation): # Specify feature dimension self.ndims = len(observation) self.A = np.zeros((self.num_actions, self.ndims, self.ndims)) # Instantiate b as a 0 vector of length ndims. self.b = np.zeros((self.num_actions, self.ndims, 1)) # set each A per arm as identity matrix of size ndims for arm in range(self.num_actions): self.A[arm] = np.eye(self.ndims) self.A_oracle = self.A.copy() self.b_oracle = self.b.copy() self.last_state = observation self.last_action = self.agent_policy(self.last_state) self.num_round = 0 return self.last_action def agent_update(self, reward): self.A_oracle[self.last_action] = self.A_oracle[self.last_action] + np.outer(self.last_state, self.last_state) self.b_oracle[self.last_action] = np.add(self.b_oracle[self.last_action].T, self.last_state * reward).reshape(self.ndims, 1) def agent_step(self, reward, observation): if reward is not None: self.agent_update(reward) # it is a good question whether I should increment num_round outside # condition or not (since theoretical result doesn't clarify this self.num_round += 1 if self.num_round % self.batch_size == 0: self.A = self.A_oracle.copy() self.b = self.b_oracle.copy() self.last_state = observation self.last_action = self.agent_policy(self.last_state) return self.last_action def agent_end(self, reward): if reward is not None: self.agent_update(reward) self.num_round += 1 if self.num_round % self.batch_size == 0: self.A = self.A_oracle.copy() self.b = self.b_oracle.copy() def agent_message(self, message): pass def agent_cleanup(self): pass # if __name__ == '__main__': # agent_info = {'alpha': 2, # 'num_actions': 4, # 'seed': 1} # # check initialization # linucb = LinUCBAgent() # linucb.agent_init(agent_info) # print(linucb.num_actions, linucb.alpha) # assert linucb.num_actions == 4 # assert linucb.alpha == 2 # # check policy # observation = np.array([1, 2, 5, 0]) # linucb.A = np.zeros((linucb.num_actions, len(observation), len(observation))) # # Instantiate b as a 0 vector of length ndims. # linucb.b = np.zeros((linucb.num_actions, len(observation), 1)) # # set each A per arm as identity matrix of size ndims # for arm in range(linucb.num_actions): # linucb.A[arm] = np.eye(len(observation)) # action = linucb.agent_policy(observation) # print(action) # assert action == 1 # # check start # observation = np.array([1, 2, 5, 0]) # linucb.agent_start(observation) # print(linucb.ndims) # print(linucb.last_state, linucb.last_action) # assert linucb.ndims == len(observation) # assert np.allclose(linucb.last_state, observation) # assert np.allclose(linucb.b, np.zeros((linucb.num_actions, len(observation), 1))) # assert np.allclose(linucb.A, np.array([np.eye(len(observation)), np.eye(len(observation)), # np.eye(len(observation)), np.eye(len(observation))])) # assert linucb.last_action == 3 # # check step # observation = np.array([5, 3, 1, 2]) # reward = 1 # action = linucb.agent_step(reward, observation) # print(linucb.A) # print(linucb.b) # print(action) # true_A = np.array([[2., 2., 5., 0.], # [2., 5., 10., 0.], # [5., 10., 26., 0.], # [0., 0., 0., 1.]]) # true_b = np.array([[1.], # [2.], # [5.], # [0.]]) # for i in range(3): # assert np.allclose(linucb.A[i], np.eye(4)) # assert np.allclose(linucb.b[i], np.zeros((linucb.num_actions, 4, 1))) # assert np.allclose(linucb.A[3], true_A) # assert np.allclose(linucb.b[3], true_b) # assert linucb.last_action == 0 # observation = np.array([3, 1, 3, 5]) # reward = None # action = linucb.agent_step(reward, observation) # print(linucb.A) # print(linucb.b) # print(action) # assert np.allclose(linucb.A[3], true_A) # assert np.allclose(linucb.b[3], true_b) # assert action == 0 # # check batch size # agent_info = {'alpha': 2, # 'num_actions': 4, # 'seed': 1, # 'batch_size': 2} # linucb = LinUCBAgent() # linucb.agent_init(agent_info) # observation = np.array([1, 2, 5, 0]) # linucb.agent_start(observation) # assert linucb.num_round == 0 # assert linucb.last_action == 1 # observation = np.array([5, 3, 1, 2]) # reward = 1 # action = linucb.agent_step(reward, observation) # assert linucb.num_round == 1 # assert np.allclose(linucb.b, np.zeros((linucb.num_actions, len(observation), 1))) # assert np.allclose(linucb.A, np.array([np.eye(len(observation)), np.eye(len(observation)), # np.eye(len(observation)), np.eye(len(observation))])) # for i in [0, 2, 3]: # assert np.allclose(linucb.A_oracle[i], np.eye(4)) # assert np.allclose(linucb.b_oracle[i], np.zeros((linucb.num_actions, 4, 1))) # assert np.allclose(linucb.A_oracle[1], true_A) # assert np.allclose(linucb.b_oracle[1], true_b) # observation = np.array([3, 1, 3, 5]) # reward = None # action = linucb.agent_step(reward, observation) # # sinse reward is None, nothing should happen # assert linucb.num_round == 1 # assert np.allclose(linucb.b, np.zeros((linucb.num_actions, len(observation), 1))) # assert np.allclose(linucb.A, np.array([np.eye(len(observation)), np.eye(len(observation)), # np.eye(len(observation)), np.eye(len(observation))])) # for i in [0, 2, 3]: # assert np.allclose(linucb.A_oracle[i], np.eye(4)) # assert np.allclose(linucb.b_oracle[i], np.zeros((linucb.num_actions, 4, 1))) # assert np.allclose(linucb.A_oracle[1], true_A) # assert np.allclose(linucb.b_oracle[1], true_b) # observation = np.array([3, 0, 2, 5]) # reward = 0 # action = linucb.agent_step(reward, observation) # assert linucb.num_round == 2 # assert np.allclose(linucb.b, linucb.b_oracle) # assert np.allclose(linucb.A, linucb.A_oracle) class LinTSAgent(BaseAgent): def __init__(self): super().__init__() self.name = "LinTS" def agent_init(self, agent_info=None): if agent_info is None: agent_info = {} self.num_actions = agent_info.get('num_actions', 3) self.alpha = agent_info.get('alpha', 1) self.lambda_ = agent_info.get('lambda', 1) self.batch_size = agent_info.get('batch_size', 1) # Set random seed for policy for each run self.policy_rand_generator = np.random.RandomState(agent_info.get("seed", None)) self.replay_buffer = ReplayBuffer(agent_info['replay_buffer_size'], agent_info.get("seed")) self.last_action = None self.last_state = None self.num_round = None def agent_policy(self, observation, mode='sample'): p_t = np.zeros(self.num_actions) cntx = observation for i in range(self.num_actions): # sampling weights after update self.w = self.get_weights(i) # using weight depending on mode if mode == 'sample': w = self.w # weights are samples of posteriors elif mode == 'expected': w = self.m[i] # weights are expected values of posteriors else: raise Exception('mode not recognized!') # calculating probabilities p_t[i] = 1 / (1 + np.exp(-1 * cntx.dot(w))) action = self.policy_rand_generator.choice(np.where(p_t == max(p_t))[0]) # probs = softmax(p_t.reshape(1, -1)) # action = self.policy_rand_generator.choice(a=range(self.num_actions), p=probs) return action def get_weights(self, arm): return np.random.normal(self.m[arm], self.alpha * self.q[arm] ** (-1.0), size=len(self.w)) # the loss function def loss(self, w, *args): X, y = args return 0.5 * (self.q[self.last_action] * (w - self.m[self.last_action])).dot(w - self.m[self.last_action]) + np.sum( [np.log(1 + np.exp(-y[j] * w.dot(X[j]))) for j in range(y.shape[0])]) # the gradient def grad(self, w, *args): X, y = args return self.q[self.last_action] * (w - self.m[self.last_action]) + (-1) * np.array( [y[j] * X[j] / (1. + np.exp(y[j] * w.dot(X[j]))) for j in range(y.shape[0])]).sum(axis=0) # fitting method def agent_update(self, X, y): # step 1, find w self.w = minimize(self.loss, self.w, args=(X, y), jac=self.grad, method="L-BFGS-B", options={'maxiter': 20, 'disp': False}).x # self.m_oracle[self.last_action] = self.w self.m[self.last_action] = self.w # step 2, update q P = (1 + np.exp(1 - X.dot(self.m[self.last_action]))) ** (-1) #self.q_oracle[self.last_action] = self.q[self.last_action] + (P * (1 - P)).dot(X ** 2) self.q[self.last_action] = self.q[self.last_action] + (P * (1 - P)).dot(X ** 2) def agent_start(self, observation): # Specify feature dimension self.ndims = len(observation) # initializing parameters of the model self.m = np.zeros((self.num_actions, self.ndims)) self.q = np.ones((self.num_actions, self.ndims)) * self.lambda_ # initializing weights using any arm (e.g. 0) because they all equal self.w = np.array([0.]*self.ndims, dtype=np.float64) # self.m_oracle = self.m.copy() # self.q_oracle = self.q.copy() self.last_state = observation self.last_action = self.agent_policy(self.last_state) self.num_round = 0 return self.last_action def agent_step(self, reward, observation): # Append new experience to replay buffer if reward is not None: self.replay_buffer.append(self.last_state, self.last_action, reward) # it is a good question whether I should increment num_round outside # condition or not (since theoretical result doesn't clarify this self.num_round += 1 if self.num_round % self.batch_size == 0: X, y = self.replay_buffer.sample(self.last_action) X = np.array(X) y = np.array(y) self.agent_update(X, y) # self.m = self.m_oracle.copy() # self.q = self.q_oracle.copy() self.last_state = observation self.last_action = self.agent_policy(self.last_state) return self.last_action def agent_end(self, reward): # Append new experience to replay buffer if reward is not None: self.replay_buffer.append(self.last_state, self.last_action, reward) # it is a good question whether I should increment num_round outside # condition or not (since theoretical result doesn't clarify this self.num_round += 1 if self.num_round % self.batch_size == 0: X, y = self.replay_buffer.sample(self.last_action) X = np.array(X) y = np.array(y) self.agent_update(X, y) # self.m = self.m_oracle.copy() # self.q = self.q_oracle.copy() def agent_message(self, message): pass def agent_cleanup(self): pass # if __name__ == '__main__': # agent_info = {'alpha': 2, # 'num_actions': 3, # 'seed': 1, # 'lambda': 2, # 'replay_buffer_size': 100000} # np.random.seed(1) # # check initialization # lints = LinTSAgent() # lints.agent_init(agent_info) # print(lints.num_actions, lints.alpha, lints.lambda_) # assert lints.num_actions == 3 # assert lints.alpha == 2 # assert lints.lambda_ == 2 # # check agent policy # observation = np.array([1, 2, 5, 0]) # lints.m = np.zeros((lints.num_actions, len(observation))) # lints.q = np.ones((lints.num_actions, len(observation))) * lints.lambda_ # lints.w = np.random.normal(lints.m[0], lints.alpha * lints.q[0] ** (-1.0), size=len(observation)) # print(lints.w) # action = lints.agent_policy(observation) # print(action) # # check agent start # observation = np.array([1, 2, 5, 0]) # lints.agent_start(observation) # # manually reassign w to np.random.normal, because I np.seed doesn't work inside the class # np.random.seed(1) # lints.w = np.random.normal(lints.m[0], lints.alpha * lints.q[0] ** (-1.0), size=len(observation)) # print(lints.ndims) # print(lints.last_state, lints.last_action) # print(lints.last_action) # assert lints.ndims == len(observation) # assert np.allclose(lints.last_state, observation) # assert np.allclose(lints.m, np.zeros((lints.num_actions, lints.ndims))) # assert np.allclose(lints.q, np.ones((lints.num_actions, lints.ndims)) * lints.lambda_) # assert np.allclose(lints.w, np.array([ 1.62434536, -0.61175641, -0.52817175, -1.07296862])) # # assert lints.last_action == 1 # # check step # observation = np.array([5, 3, 1, 2]) # reward = 1 # action = lints.agent_step(reward, observation) # print(action) # observation = np.array([1, 3, 2, 1]) # reward = 0 # action = lints.agent_step(reward, observation) # print(action) class BaseEnvironment: """Implements the environment for an RLGlue environment Note: env_init, env_start, env_step, env_cleanup, and env_message are required methods. """ __metaclass__ = ABCMeta def __init__(self): reward = None observation = None termination = None self.reward_state_term = (reward, observation, termination) @abstractmethod def env_init(self, env_info={}): """Setup for the environment called when the experiment first starts. Note: Initialize a tuple with the reward, first state observation, boolean indicating if it's terminal. """ @abstractmethod def env_start(self): """The first method called when the experiment starts, called before the agent starts. Returns: The first state observation from the environment. """ @abstractmethod def env_step(self, action): """A step taken by the environment. Args: action: The action taken by the agent Returns: (float, state, Boolean): a tuple of the reward, state observation, and boolean indicating if it's terminal. """ @abstractmethod def env_cleanup(self): """Cleanup done after the environment ends""" @abstractmethod def env_message(self, message): """A message asking the environment for information Args: message: the message passed to the environment Returns: the response (or answer) to the message """ class Environment(BaseEnvironment): """Implements the environment for an RLGlue environment Note: env_init, env_start, env_step, env_cleanup, and env_message are required methods. """ actions = [0] def __init__(self): super().__init__() reward = None observation = None termination = None self.seed = None self.k = None self.reward_type = None self.custom_arms = None self.reward_state_term = (reward, observation, termination) self.count = 0 self.arms = [] self.subopt_gaps = None def env_init(self, env_info=None): """Setup for the environment called when the experiment first starts. Note: Initialize a tuple with the reward, first state observation, boolean indicating if it's terminal. """ if env_info is None: env_info = {} self.k = env_info.get("num_actions", 2) self.reward_type = env_info.get("reward_type", "subgaussian") self.custom_arms = env_info.get("arms_values", None) if self.reward_type not in ['Bernoulli', 'subgaussian']: raise ValueError('Unknown reward_type: ' + str(self.reward_type)) if self.custom_arms is None: if self.reward_type == 'Bernoulli': self.arms = np.random.uniform(0, 1, self.k) else: self.arms = np.random.randn(self.k) else: self.arms = self.custom_arms self.subopt_gaps = np.max(self.arms) - self.arms local_observation = 0 # An empty NumPy array self.reward_state_term = (0.0, local_observation, False) def env_start(self): """The first method called when the experiment starts, called before the agent starts. Returns: The first state observation from the environment. """ return self.reward_state_term[1] def env_step(self, action): """A step taken by the environment. Args: action: The action taken by the agent Returns: (float, state, Boolean): a tuple of the reward, state observation, and boolean indicating if it's terminal. """ if self.reward_type == 'Bernoulli': reward = np.random.binomial(1, self.arms[action], 1) else: reward = self.arms[action] + np.random.randn() obs = self.reward_state_term[1] self.reward_state_term = (reward, obs, False) return self.reward_state_term def env_cleanup(self): """Cleanup done after the environment ends""" pass def env_message(self, message): """A message asking the environment for information Args: message (string): the message passed to the environment Returns: string: the response (or answer) to the message """ if message == "what is the current reward?": return "{}".format(self.reward_state_term[0]) # else return "I don't know how to respond to your message" class OfflineEvaluator: def __init__(self, eval_info=None): if eval_info is None: eval_info = {} self.dataset = eval_info['dataset'] self.agent = eval_info['agent'] if not isinstance(self.dataset, Dataset): raise TypeError('dataset ' + "must be a " + str(Dataset)) if not isinstance(self.agent, BaseAgent): raise TypeError('agent ' + "must be a " + str(BaseAgent)) self.total_reward = None self.average_reward = None self.num_matches = None self.idxs = range(self.dataset.__len__()) self.counter = None def eval_start(self): self.total_reward = 0 self.average_reward = [0] self.num_matches = 0 self.idxs = range(self.dataset.__len__()) self.counter = 0 def _get_observation(self): idx = self.idxs[self.counter] self.counter += 1 return self.dataset.__getitem__(idx) def eval_step(self): observation = self._get_observation() state = observation[0] true_action = observation[1] reward = observation[2] pred_action = self.agent.agent_policy(state) if true_action != pred_action: return self.num_matches += 1 aw_reward = self.average_reward[-1] + (reward - self.average_reward[-1]) / self.num_matches self.average_reward.append(aw_reward) self.total_reward += reward def eval_run(self): self.eval_start() while self.counter < self.dataset.__len__(): self.eval_step() return self.average_reward # if __name__ == '__main__': # dir1 = 'data/mushroom_data_final.pickle' # ra = RandomAgent() # agent_info = {'num_actions': 2} # ra.agent_init(agent_info) # result = [] # result1 = [] # for seed_ in [1, 5, 10]: # , 2, 3, 32, 123, 76, 987, 2134]: # dataset = BanditDataset(pickle_file=dir1, seed=seed_) # eval_info = {'dataset': dataset, 'agent': ra} # evaluator = OfflineEvaluator(eval_info) # reward = evaluator.eval_run() # result.append(reward) # result1.append(evaluator.total_reward) # for elem in result: # plt.plot(elem) # plt.legend() # plt.show() class ReplayEnvironment(BaseEnvironment): dataset: BanditDataset def __init__(self): super().__init__() self.counter = None self.last_observation = None def env_init(self, env_info=None): """ Set parameters needed to setup the replay SavePilot environment. Assume env_info dict contains: { pickle_file: data directory [str] } Args: env_info (dict): """ if env_info is None: env_info = {} directory = env_info['pickle_file'] seed = env_info.get('seed', None) self.dataset = BanditDataset(directory, seed) self.idxs = range(self.dataset.__len__()) self.counter = 0 def _get_observation(self): idx = self.idxs[self.counter] return self.dataset.__getitem__(idx) def env_start(self): self.last_observation = self._get_observation() state = self.last_observation[0] reward = None is_terminal = False self.reward_state_term = (reward, state, is_terminal) self.counter += 1 # return first state from the environment return self.reward_state_term[1] def env_step(self, action): true_action = self.last_observation[1] reward = self.last_observation[2] if true_action != action: reward = None observation = self._get_observation() state = observation[0] if self.counter == self.dataset.__len__() - 1: is_terminal = True else: is_terminal = False self.reward_state_term = (reward, state, is_terminal) self.last_observation = observation self.counter += 1 return self.reward_state_term def env_cleanup(self): pass def env_message(self, message): pass class RLGlue: """RLGlue class args: env_name (string): the name of the module where the Environment class can be found agent_name (string): the name of the module where the Agent class can be found """ def __init__(self, env_class, agent_class): self.environment = env_class() self.agent = agent_class() self.total_reward = None self.average_reward = None self.last_action = None self.num_steps = None self.num_episodes = None self.num_matches = None def rl_init(self, agent_init_info={}, env_init_info={}): """Initial method called when RLGlue experiment is created""" self.environment.env_init(env_init_info) self.agent.agent_init(agent_init_info) self.total_reward = 0.0 self.average_reward = [0] self.num_steps = 0 self.num_episodes = 0 self.num_matches = 0 def rl_start(self): """Starts RLGlue experiment Returns: tuple: (state, action) """ last_state = self.environment.env_start() self.last_action = self.agent.agent_start(last_state) observation = (last_state, self.last_action) return observation def rl_agent_start(self, observation): """Starts the agent. Args: observation: The first observation from the environment Returns: The action taken by the agent. """ return self.agent.agent_start(observation) def rl_agent_step(self, reward, observation): """Step taken by the agent Args: reward (float): the last reward the agent received for taking the last action. observation : the state observation the agent receives from the environment. Returns: The action taken by the agent. """ return self.agent.agent_step(reward, observation) def rl_agent_end(self, reward): """Run when the agent terminates Args: reward (float): the reward the agent received when terminating """ self.agent.agent_end(reward) def rl_env_start(self): """Starts RL-Glue environment. Returns: (float, state, Boolean): reward, state observation, boolean indicating termination """ self.total_reward = 0.0 self.num_steps = 1 this_observation = self.environment.env_start() return this_observation def rl_env_step(self, action): """Step taken by the environment based on action from agent Args: action: Action taken by agent. Returns: (float, state, Boolean): reward, state observation, boolean indicating termination. """ ro = self.environment.env_step(action) (this_reward, _, terminal) = ro self.total_reward += this_reward if terminal: self.num_episodes += 1 else: self.num_steps += 1 return ro def rl_step(self): """Step taken by RLGlue, takes environment step and either step or end by agent. Returns: (float, state, action, Boolean): reward, last state observation, last action, boolean indicating termination """ (reward, last_state, term) = self.environment.env_step(self.last_action) if reward is not None: self.num_matches += 1 aw_reward = self.average_reward[-1] + (reward - self.average_reward[-1]) / self.num_matches self.average_reward.append(aw_reward) self.total_reward += reward if term: self.num_episodes += 1 self.agent.agent_end(reward) roat = (reward, last_state, None, term) else: self.num_steps += 1 self.last_action = self.agent.agent_step(reward, last_state) roat = (reward, last_state, self.last_action, term) return roat def rl_cleanup(self): """Cleanup done at end of experiment.""" self.environment.env_cleanup() self.agent.agent_cleanup() def rl_agent_message(self, message): """Message passed to communicate with agent during experiment Args: message: the message (or question) to send to the agent Returns: The message back (or answer) from the agent """ return self.agent.agent_message(message) def rl_env_message(self, message): """Message passed to communicate with environment during experiment Args: message: the message (or question) to send to the environment Returns: The message back (or answer) from the environment """ return self.environment.env_message(message) def rl_episode(self, max_steps_this_episode): """Runs an RLGlue episode Args: max_steps_this_episode (Int): the maximum steps for the experiment to run in an episode Returns: Boolean: if the episode should terminate """ is_terminal = False self.rl_start() while (not is_terminal) and ((max_steps_this_episode == 0) or (self.num_steps < max_steps_this_episode)): rl_step_result = self.rl_step() is_terminal = rl_step_result[3] return is_terminal def rl_return(self): """The total reward Returns: float: the total reward """ return self.total_reward def rl_num_steps(self): """The total number of steps taken Returns: Int: the total number of steps taken """ return self.num_steps def rl_num_episodes(self): """The number of episodes Returns Int: the total number of episodes """ return self.num_episodes class Policy: def __init__(self, env, agent): self.env = env self.agent = agent self.rl_glue = None @abstractmethod def get_average_performance(self, agent_info=None, env_info=None, exper_info=None): raise NotImplementedError class BanditWrapper(Policy): def get_average_performance(self, agent_info=None, env_info=None, exper_info=None): if exper_info is None: exper_info = {} if env_info is None: env_info = {} if agent_info is None: agent_info = {} num_runs = exper_info.get("num_runs", 100) num_steps = exper_info.get("num_steps", 1000) return_type = exper_info.get("return_type", None) seed = exper_info.get("seed", None) np.random.seed(seed) seeds = np.random.randint(0, num_runs * 100, num_runs) all_averages = [] subopt_arm_average = [] best_arm = [] worst_arm = [] all_chosen_arm = [] average_regret = [] for run in tqdm(range(num_runs)): np.random.seed(seeds[run]) self.rl_glue = RLGlue(self.env, self.agent) self.rl_glue.rl_init(agent_info, env_info) (first_state, first_action) = self.rl_glue.rl_start() worst_position = np.argmin(self.rl_glue.environment.arms) best_value = np.max(self.rl_glue.environment.arms) worst_value = np.min(self.rl_glue.environment.arms) best_arm.append(best_value) worst_arm.append(worst_value) scores = [0] averages = [] subopt_arm = [] chosen_arm_log = [] cum_regret = [0] delta = self.rl_glue.environment.subopt_gaps[first_action] cum_regret.append(cum_regret[-1] + delta) # first action was made in rl_start, that's why run over num_steps-1 for i in range(num_steps-1): reward, _, action, _ = self.rl_glue.rl_step() chosen_arm_log.append(action) scores.append(scores[-1] + reward) averages.append(scores[-1] / (i + 1)) subopt_arm.append(self.rl_glue.agent.arm_count[worst_position]) delta = self.rl_glue.environment.subopt_gaps[action] cum_regret.append(cum_regret[-1] + delta) all_averages.append(averages) subopt_arm_average.append(subopt_arm) all_chosen_arm.append(chosen_arm_log) average_regret.append(cum_regret) if return_type is None: returns = (np.mean(all_averages, axis=0), np.mean(best_arm)) elif return_type == 'regret': returns = np.mean(average_regret, axis=0) elif return_type == 'regret_reward': returns = (np.mean(average_regret, axis=0), np.mean(all_averages, axis=0)) elif return_type == 'arm_choice_analysis': returns = (np.mean(all_averages, axis=0), np.mean(best_arm), np.mean(all_chosen_arm, axis=0)) elif return_type == 'complex': returns = (np.mean(all_averages, axis=0), np.mean(subopt_arm_average, axis=0), np.array(best_arm), np.array(worst_arm), np.mean(average_regret, axis=0)) return returns def run_experiment(environment, agent, environment_parameters, agent_parameters, experiment_parameters, save_data=True, dir=''): rl_glue = RLGlue(environment, agent) # save sum of reward at the end of each episode agent_sum_reward = [] env_info = environment_parameters agent_info = agent_parameters # one agent setting for run in tqdm(range(1, experiment_parameters["num_runs"] + 1)): env_info["seed"] = run rl_glue.rl_init(agent_info, env_info) rl_glue.rl_episode(0) agent_sum_reward.append(rl_glue.average_reward) leveled_result = get_leveled_data(agent_sum_reward) if save_data: save_name = "{}-{}".format(rl_glue.agent.name, rl_glue.agent.batch_size) file_dir = "results/{}".format(dir) if not os.path.exists(file_dir): os.makedirs(file_dir) np.save("{}/sum_reward_{}".format(file_dir, save_name), leveled_result) return leveled_result # if __name__ == '__main__': # num_experements = 10 # batch_size = 100 # data_dir = 'data/mushroom_data_final.pickle' # experiment_parameters = {"num_runs": num_experements} # env_info = {'pickle_file': data_dir} # agent_info = {'alpha': 2, # 'num_actions': 3, # 'seed': 1, # 'batch_size': 1} # agent = LinUCBAgent # environment = ReplayEnvironment # result = run_experiment(environment, agent, env_info, agent_info, experiment_parameters, save_data=False) # smoothed_leveled_result = smooth(result, 100) # mean_smoothed_leveled_result = np.mean(smoothed_leveled_result, axis=0) # plt.plot(mean_smoothed_leveled_result, lw=3, ls='-.', label='online policy') # plt.show() env = Environment agent = UCBAgent alpha = 1 num_runs = 1000 num_steps = 10000 seed = None if_save = False exper_info = {"num_runs": num_runs, "num_steps": num_steps, "seed": seed, "return_type": "regret"} k = 2 arms_values = [0.7, 0.65] reward_type = 'Bernoulli' env_info = {"num_actions": k, "reward_type": reward_type, "arms_values": arms_values} # batch-online experiment batch_res = [] online_res = [] batch = 10 agent_info_batch = {"num_actions": k, "batch_size": batch, "alpha": alpha} agent_info_online = {"num_actions": k, "batch_size": 1, "alpha": alpha} exp1 = BanditWrapper(env, agent) batch_res.append(exp1.get_average_performance(agent_info_batch, env_info, exper_info)) online_res.append(exp1.get_average_performance(agent_info_online, env_info, exper_info)) av_online_res = np.mean(online_res, axis=0) av_batch_res = np.mean(batch_res, axis=0) plt.plot(av_batch_res, label='batch') plt.plot(av_online_res, label='online') M = int(num_steps / batch) update_points = np.ceil(np.arange(num_steps) / batch).astype(int) plt.plot(av_online_res[update_points] * batch, ls='--', label='upper bound, batch size = 10') plt.title('Cumulative Regret averaged over ' + str(num_runs) + ' runs') plt.xlabel('time steps') plt.ylabel('regret') plt.grid(b=True, which='major', linestyle='--', alpha=0.5) plt.minorticks_on() plt.grid(b=True, which='minor', linestyle=':', alpha=0.2) plt.legend() if if_save: plt.savefig('results/UCB transform example.png', bbox_inches='tight') plt.show() if if_save: name = 'batch_result, runs=' + str(num_runs) + ', steps=' + str(num_steps) with open('results/' + '/' + name + '.pickle', 'wb') as handle: pickle.dump(batch_res, handle, protocol=pickle.HIGHEST_PROTOCOL) model_dir = 'results/UCB/dynamic_by_batches' if not os.path.exists(model_dir): print(f'Creating a new model directory: {model_dir}') os.makedirs(model_dir) num_runs = 10 # 500 num_steps = 10001 seed = None exper_info = {"num_runs": num_runs, "num_steps": num_steps, "seed": seed, "return_type": "regret"} environments = [[0.7, 0.5], [0.7, 0.4], [0.7, 0.1], [0.35, 0.18, 0.47, 0.61], [0.4, 0.75, 0.57, 0.49], [0.70, 0.50, 0.30, 0.10]] for arms_values in environments: k = len(arms_values) reward_type = 'Bernoulli' env_info = {"num_actions": k, "reward_type": reward_type, "arms_values": arms_values} env = Environment agent = UCBAgent alpha = 1 # run online agent agent_info_online = {"num_actions": k, "batch_size": 1, "alpha": alpha} experiment = BanditWrapper(env, agent) online_regret = experiment.get_average_performance(agent_info_online, env_info, exper_info) # run batch agent batches = np.logspace(1.0, 3.0, num=20).astype(int) actual_regret = [] upper_bound = [] for batch in batches: agent_info_batch = {"num_actions": k, "batch_size": batch, "alpha": alpha} experiment = BanditWrapper(env, agent) batch_regret = experiment.get_average_performance(agent_info_batch, env_info, exper_info) actual_regret.append(batch_regret[-1]) M = int(num_steps / batch) upper_bound.append(online_regret[M] * batch) # save data name = 'dyn_by_batch_' + str(arms_values) name1 = name + ' batch_regret' with open(model_dir + '/' + name1 + '.pickle', 'wb') as handle: pickle.dump(actual_regret, handle, protocol=pickle.HIGHEST_PROTOCOL) name2 = name + ' online_regret' with open(model_dir + '/' + name2 + '.pickle', 'wb') as handle: pickle.dump(online_regret, handle, protocol=pickle.HIGHEST_PROTOCOL) print("End!") env = Environment agent = TSAgent num_runs = 10 # 1000 num_steps = 10000 seed = None if_save = False exper_info = {"num_runs": num_runs, "num_steps": num_steps, "seed": seed, "return_type": "regret"} k = 2 arms_values = [0.7, 0.65] reward_type = 'Bernoulli' env_info = {"num_actions": k, "reward_type": reward_type, "arms_values": arms_values} # batch-online experiment batch_res = [] online_res = [] batch = 10 agent_info_batch = {"num_actions": k, "batch_size": batch} agent_info_online = {"num_actions": k, "batch_size": 1} exp1 = BanditWrapper(env, agent) batch_res.append(exp1.get_average_performance(agent_info_batch, env_info, exper_info)) online_res.append(exp1.get_average_performance(agent_info_online, env_info, exper_info)) av_online_res = np.mean(online_res, axis=0) av_batch_res = np.mean(batch_res, axis=0) plt.plot(av_batch_res, label='batch') plt.plot(av_online_res, label='online') M = int(num_steps / batch) update_points = np.ceil(np.arange(num_steps) / batch).astype(int) plt.plot(av_online_res[update_points] * batch, ls='--', label='upper bound, batch size = 10') plt.title('Cumulative Regret averaged over ' + str(num_runs) + ' runs') plt.xlabel('time steps') plt.ylabel('regret') plt.grid(b=True, which='major', linestyle='--', alpha=0.5) plt.minorticks_on() plt.grid(b=True, which='minor', linestyle=':', alpha=0.2) plt.legend() if if_save: plt.savefig('results/TS example.png', bbox_inches='tight') plt.show() if if_save: name = 'batch_result, runs=' + str(num_runs) + ', steps=' + str(num_steps) with open('results/' + '/' + name + '.pickle', 'wb') as handle: pickle.dump(batch_res, handle, protocol=pickle.HIGHEST_PROTOCOL) model_dir = 'results/TS/dynamic_by_batches' if not os.path.exists(model_dir): print(f'Creating a new model directory: {model_dir}') os.makedirs(model_dir) num_runs = 10 # 500 num_steps = 10001 seed = None exper_info = {"num_runs": num_runs, "num_steps": num_steps, "seed": seed, "return_type": "regret"} environments = [[0.7, 0.5], [0.7, 0.4], [0.7, 0.1], [0.35, 0.18, 0.47, 0.61], [0.4, 0.75, 0.57, 0.49], [0.70, 0.50, 0.30, 0.10]] for arms_values in environments: k = len(arms_values) reward_type = 'Bernoulli' env_info = {"num_actions": k, "reward_type": reward_type, "arms_values": arms_values} env = Environment agent = TSAgent # run online agent agent_info_online = {"num_actions": k, "batch_size": 1} experiment = BanditWrapper(env, agent) online_regret = experiment.get_average_performance(agent_info_online, env_info, exper_info) # run batch agent batches = np.logspace(1.0, 3.0, num=20).astype(int) actual_regret = [] upper_bound = [] for batch in batches: agent_info_batch = {"num_actions": k, "batch_size": batch} experiment = BanditWrapper(env, agent) batch_regret = experiment.get_average_performance(agent_info_batch, env_info, exper_info) actual_regret.append(batch_regret[-1]) M = int(num_steps / batch) upper_bound.append(online_regret[M] * batch) # save data name = 'dyn_by_batch_' + str(k) + str(arms_values) name1 = name + ' batch_regret' with open(model_dir + '/' + name1 + '.pickle', 'wb') as handle: pickle.dump(actual_regret, handle, protocol=pickle.HIGHEST_PROTOCOL) name2 = name + ' online_regret' with open(model_dir + '/' + name2 + '.pickle', 'wb') as handle: pickle.dump(online_regret, handle, protocol=pickle.HIGHEST_PROTOCOL) print("End!") num_experiments = 20 batch_size = 100 data_dir = 'data/mushroom_data_final.pickle' env_info = {'pickle_file': data_dir} output_dir = 'LinUCB/dynamic_by_timesteps' agent_info = {'alpha': 2, 'num_actions': 3, 'seed': 1, 'batch_size': 1} agent_info_batch = {'alpha': 2, 'num_actions': 3, 'seed': 1, 'batch_size': batch_size} experiment_parameters = {"num_runs": num_experiments} agent = LinUCBAgent environment = ReplayEnvironment online_result = run_experiment(environment, agent, env_info, agent_info, experiment_parameters, True, output_dir) batch_result = run_experiment(environment, agent, env_info, agent_info_batch, experiment_parameters, True, output_dir) smoothed_leveled_result = smooth(online_result, 100) smoothed_leveled_result1 = smooth(batch_result, 100) mean_smoothed_leveled_result = np.mean(smoothed_leveled_result, axis=0) mean_smoothed_leveled_result1 = np.mean(smoothed_leveled_result1, axis=0) num_steps = np.minimum(len(mean_smoothed_leveled_result), len(mean_smoothed_leveled_result1)) update_points = np.ceil(np.arange(num_steps) / batch_size).astype(int) pic_filename = "results/{}/UCB_transform_timesteps.png".format(output_dir) plt.plot(mean_smoothed_leveled_result1, lw=3, label='batch, batch size = ' + str(batch_size)) plt.plot(mean_smoothed_leveled_result, lw=3, ls='-.', label='online policy') plt.plot(mean_smoothed_leveled_result[update_points], lw=3, ls='-.', label='dumb policy') plt.legend() plt.xlabel('time steps') plt.title("Smooth Cumulative Reward averaged over {} runs".format(num_experiments)) plt.ylabel('smoothed reward') plt.grid(b=True, which='major', linestyle='--', alpha=0.5) plt.minorticks_on() plt.grid(b=True, which='minor', linestyle=':', alpha=0.2) plt.savefig(pic_filename, bbox_inches='tight') plt.show() num_experiments = 20 data_dir = 'data/mushroom_data_final.pickle' env_info = {'pickle_file': data_dir} output_dir = 'LinUCB/dynamic_by_batches' agent_info = {'alpha': 2, 'num_actions': 3, 'seed': 1, 'batch_size': 1} experiment_parameters = {"num_runs": num_experiments} agent = LinUCBAgent environment = ReplayEnvironment # run online agent online_result = run_experiment(environment, agent, env_info, agent_info, experiment_parameters, True, output_dir) # smooth and average the result smoothed_leveled_result = smooth(online_result, 100) mean_smoothed_leveled_result = np.mean(smoothed_leveled_result, axis=0) mean_smoothed_leveled_result = mean_smoothed_leveled_result[~np.isnan(mean_smoothed_leveled_result)] # run batch agent batch_sizes = np.logspace(1.0, 2.7, num=20).astype(int) actual_regret = [] upper_bound = [] for batch in batch_sizes: agent_info_batch = {'alpha': 2, 'num_actions': 3, 'seed': 1, 'batch_size': batch} batch_result = run_experiment(environment, agent, env_info, agent_info_batch, experiment_parameters, True, output_dir) # smooth and average the result smoothed_leveled_result1 = smooth(batch_result, 100) mean_smoothed_leveled_result1 = np.mean(smoothed_leveled_result1, axis=0) mean_smoothed_leveled_result1 = mean_smoothed_leveled_result1[~np.isnan(mean_smoothed_leveled_result1)] actual_regret.append(mean_smoothed_leveled_result1[-1]) # fetch dumb result M = int(len(mean_smoothed_leveled_result1) / batch) upper_bound.append(mean_smoothed_leveled_result[M]) pic_filename = "results/{}/UCB_transform_batchsize.png".format(output_dir) plt.plot(batch_sizes, actual_regret, label='actual regret') plt.plot(batch_sizes, [mean_smoothed_leveled_result[-1]]*len(batch_sizes), label='online policy') plt.plot(batch_sizes, upper_bound, label='dumb policy') plt.legend() plt.title("Reward as a f-n of batch size (each point is averaged over {} runs)".format(num_experiments)) plt.xlabel('batch size (log scale)') plt.ylabel('reward') plt.grid(b=True, which='major', linestyle='--', alpha=0.5) plt.minorticks_on() plt.grid(b=True, which='minor', linestyle=':', alpha=0.2) plt.savefig(pic_filename, bbox_inches='tight') plt.show() num_experiments = 10 batch_size = 100 data_dir = 'data/mushroom_data_final.pickle' env_info = {'pickle_file': data_dir} output_dir = 'LinTS/dynamic_by_timesteps' agent_info = {'alpha': 1, 'num_actions': 3, 'seed': 1, 'batch_size': 1, 'replay_buffer_size': 100000} agent_info_batch = {'alpha': 1, 'num_actions': 3, 'seed': 1, 'batch_size': batch_size, 'replay_buffer_size': 100000} experiment_parameters = {"num_runs": num_experiments} agent = LinTSAgent environment = ReplayEnvironment online_result = run_experiment(environment, agent, env_info, agent_info, experiment_parameters, True, output_dir) batch_result = run_experiment(environment, agent, env_info, agent_info_batch, experiment_parameters, True, output_dir) smoothed_leveled_result = smooth(online_result, 100) smoothed_leveled_result1 = smooth(batch_result, 100) mean_smoothed_leveled_result = np.mean(smoothed_leveled_result, axis=0) mean_smoothed_leveled_result1 = np.mean(smoothed_leveled_result1, axis=0) num_steps = np.minimum(len(mean_smoothed_leveled_result), len(mean_smoothed_leveled_result1)) update_points = np.ceil(np.arange(num_steps) / batch_size).astype(int) pic_filename = "results/{}/TS_transform_timesteps.png".format(output_dir) plt.plot(mean_smoothed_leveled_result1, lw=3, label='batch, batch size = ' + str(batch_size)) plt.plot(mean_smoothed_leveled_result, lw=3, ls='-.', label='online policy') plt.plot(mean_smoothed_leveled_result[update_points], lw=3, ls='-.', label='dumb policy') plt.legend() plt.xlabel('time steps') plt.title("Smooth Cumulative Reward averaged over {} runs".format(num_experiments)) plt.ylabel('smoothed reward') plt.grid(b=True, which='major', linestyle='--', alpha=0.5) plt.minorticks_on() plt.grid(b=True, which='minor', linestyle=':', alpha=0.2) plt.savefig(pic_filename, bbox_inches='tight') plt.show() num_experiments = 20 data_dir = 'data/mushroom_data_final.pickle' env_info = {'pickle_file': data_dir} output_dir = 'LinTS/dynamic_by_batches' agent_info = {'alpha': 1, 'num_actions': 3, 'seed': 1, 'batch_size': 1, 'replay_buffer_size': 100000} experiment_parameters = {"num_runs": num_experiments} agent = LinTSAgent environment = ReplayEnvironment # run online agent online_result = run_experiment(environment, agent, env_info, agent_info, experiment_parameters, True, output_dir) # smooth and average the result smoothed_leveled_result = smooth(online_result, 100) mean_smoothed_leveled_result = np.mean(smoothed_leveled_result, axis=0) mean_smoothed_leveled_result = mean_smoothed_leveled_result[~np.isnan(mean_smoothed_leveled_result)] # run batch agent batch_sizes = np.logspace(1.0, 2.7, num=20).astype(int) actual_regret = [] upper_bound = [] for batch in batch_sizes: agent_info_batch = {'alpha': 1, 'num_actions': 3, 'seed': 1, 'batch_size': batch, 'replay_buffer_size': 100000} batch_result = run_experiment(environment, agent, env_info, agent_info_batch, experiment_parameters, True, output_dir) # smooth and average the result smoothed_leveled_result1 = smooth(batch_result, 100) mean_smoothed_leveled_result1 = np.mean(smoothed_leveled_result1, axis=0) mean_smoothed_leveled_result1 = mean_smoothed_leveled_result1[~np.isnan(mean_smoothed_leveled_result1)] actual_regret.append(mean_smoothed_leveled_result1[-1]) # fetch dumb result M = int(len(mean_smoothed_leveled_result1) / batch) upper_bound.append(mean_smoothed_leveled_result[M]) pic_filename = "results/{}/TS_transform_batchsize.png".format(output_dir) plt.plot(batch_sizes, actual_regret, label='actual regret') plt.plot(batch_sizes, [mean_smoothed_leveled_result[-1]]*len(batch_sizes), label='online policy') plt.plot(batch_sizes, upper_bound, label='dumb policy') plt.legend() plt.title("Reward as a f-n of batch size (each point is averaged over {} runs)".format(num_experiments)) plt.xlabel('batch size (log scale)') plt.ylabel('reward') plt.grid(b=True, which='major', linestyle='--', alpha=0.5) plt.minorticks_on() plt.grid(b=True, which='minor', linestyle=':', alpha=0.2) plt.savefig(pic_filename, bbox_inches='tight') plt.show() data_dir = 'data/mushroom_data_final.pickle' env_info = {'pickle_file': data_dir, 'seed': 1} # init env environment = ReplayEnvironment # init random agent random_agent_info = {'num_actions': 2} ra = RandomAgent() ra.agent_init(random_agent_info) # learn LinUCB agent agent_info = {'alpha': 2, 'num_actions': 2, 'seed': 1, 'batch_size': 1} agent = LinUCBAgent rl_glue = RLGlue(environment, agent) for i in range(4): rl_glue.rl_init(agent_info, env_info) rl_glue.rl_episode(0) UCB_agent = rl_glue.agent # learn LinTS agent agent_info = {'num_actions': 2, 'replay_buffer_size': 200, 'seed': 1, 'batch_size': 1} agent = LinTSAgent rl_glue = RLGlue(environment, agent) for i in range(4): rl_glue.rl_init(agent_info, env_info) rl_glue.rl_episode(0) TS_agent = rl_glue.agent result = [] result1 = [] result2 = [] exper_seeds = [2, 5, 10, 12, 54, 32, 15, 76, 45, 56] for seed_ in exper_seeds: dataset = BanditDataset(pickle_file=data_dir, seed=seed_) eval_info = {'dataset': dataset, 'agent': UCB_agent} eval_info1 = {'dataset': dataset, 'agent': TS_agent} eval_info2 = {'dataset': dataset, 'agent': ra} evaluator = OfflineEvaluator(eval_info) evaluator1 = OfflineEvaluator(eval_info1) evaluator2 = OfflineEvaluator(eval_info2) reward = evaluator.eval_run() reward1 = evaluator1.eval_run() reward2 = evaluator2.eval_run() result.append(reward) result1.append(reward1) result2.append(reward2) labels = ['UCB agent', 'TS agent', 'Random agent'] for i, res in enumerate([result, result1, result2]): for elem in res: plt.plot(elem, linewidth=0.1) avg = [float(sum(col))/len(col) for col in zip(*res)] plt.plot(avg, label=labels[i]) labels = ['UCB agent', 'TS agent', 'Random agent'] for i, res in enumerate([result, result1, result2]): for elem in res: plt.plot(elem, linewidth=0.1) avg = [float(sum(col))/len(col) for col in zip(*res)] plt.plot(avg, label=labels[i]) plt.legend() plt.ylim([0.1, 0.7]) plt.grid(b=True, which='major', linestyle='--', alpha=0.5) plt.minorticks_on() plt.grid(b=True, which='minor', linestyle=':', alpha=0.2) plt.show() # generate 100 000 samples with 4 features and 3 actions dataset = generate_samples(100000, 4, 3, True) dataset.head() num_experiments = 10 batch_size1 = 30 batch_size2 = 100 env_info = {'pickle_file': dataset} agent1_info = {'alpha': 1, 'num_actions': 3, 'seed': 1, 'batch_size': batch_size1, 'replay_buffer_size': 100000} agent2_info = {'alpha': 1, 'num_actions': 3, 'seed': 1, 'batch_size': batch_size2, 'replay_buffer_size': 100000} experiment_parameters = {"num_runs": num_experiments} agent = LinTSAgent environment = ReplayEnvironment result1 = run_experiment(environment, agent, env_info, agent1_info, experiment_parameters, False) result2 = run_experiment(environment, agent, env_info, agent2_info, experiment_parameters, False) smoothed_leveled_result1 = smooth(result1, 100) smoothed_leveled_result2 = smooth(result2, 100) mean_smoothed_leveled_result1 = np.mean(smoothed_leveled_result1, axis=0) mean_smoothed_leveled_result2 = np.mean(smoothed_leveled_result2, axis=0) plt.plot(mean_smoothed_leveled_result1, label='batch size = ' + str(batch_size1)) plt.plot(mean_smoothed_leveled_result2, label='batch size = ' + str(batch_size2)) plt.legend() plt.xlabel('time steps') plt.title("Smooth Cumulative Reward averaged over {} runs".format(num_experiments)) plt.ylabel('smoothed conversion rate') plt.grid(b=True, which='major', linestyle='--', alpha=0.5) plt.minorticks_on() plt.grid(b=True, which='minor', linestyle=':', alpha=0.2) plt.show() num_experiments = 20 env_info = {'pickle_file': dataset} experiment_parameters = {"num_runs": num_experiments} agent = LinTSAgent environment = ReplayEnvironment # run batch agent batch_sizes = np.logspace(1.0, 2.7, num=20).astype(int) actual_regret = [] for batch in batch_sizes: agent_info_batch = {'alpha': 1, 'num_actions': 3, 'seed': 1, 'batch_size': batch, 'replay_buffer_size': 100000} batch_result = run_experiment(environment, agent, env_info, agent_info_batch, experiment_parameters, False) # smooth and average the result smoothed_leveled_result1 = smooth(batch_result, 100) mean_smoothed_leveled_result1 = np.mean(smoothed_leveled_result1, axis=0) mean_smoothed_leveled_result1 = mean_smoothed_leveled_result1[~np.isnan(mean_smoothed_leveled_result1)] actual_regret.append(mean_smoothed_leveled_result1[-1]) plt.plot(batch_sizes, actual_regret, label='actual regret') plt.legend() plt.title("Reward as a f-n of batch size (each point is averaged over {} runs)".format(num_experiments)) plt.xlabel('batch size (log scale)') plt.ylabel('reward') plt.grid(b=True, which='major', linestyle='--', alpha=0.5) plt.minorticks_on() plt.grid(b=True, which='minor', linestyle=':', alpha=0.2) plt.show()