## Further recommendation: test multiple frameworks and see which works better on average.
## Open source RL: https://docs.google.com/spreadsheets/d/1EeFPd-XIQ3mq_9snTlAZSsFY7Hbnmd7P5bbT8LPuMn0/edit#gid=0

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf   # the RL agents below use the TF 1.x API (tf.placeholder, tf.layers, ...)
import time               # used by the evolution-strategy trainer below
sns.set()

!pip install yfinance --upgrade --no-cache-dir

## Save future files to your drive
## In this notebook, control for multiple testing

from google.colab import drive
drive.mount('/content/drive', force_remount=True)
%cd "/content/drive/My Drive/FirmAI/FinML/Data/Agent Trading"

from pandas_datareader import data as pdr
import yfinance as yf   # fix_yahoo_finance has been renamed to yfinance
yf.pdr_override()

df_full = pdr.get_data_yahoo("JPM", start="2018-01-01").reset_index()
df_full.to_csv('output/JPM.csv', index=False)
df_full.head()

df_full = pd.read_csv('output/JPM.csv')
df = df_full.copy()
name = 'Turtle Trading Agent'

count = int(np.ceil(len(df) * 0.1))
signals = pd.DataFrame(index=df.index)
signals['signal'] = 0.0
signals['trend'] = df['Close']
signals['RollingMax'] = signals.trend.shift(1).rolling(count).max()
signals['RollingMin'] = signals.trend.shift(1).rolling(count).min()
signals.loc[signals['RollingMax'] < signals.trend, 'signal'] = -1
signals.loc[signals['RollingMin'] > signals.trend, 'signal'] = 1
signals

def buy_stock(
    real_movement,
    signal,
    initial_money = 10000,
    max_buy = 1,
    max_sell = 1,
):
    """
    real_movement = actual movement in the real world
    signal = 1 is buy, -1 is sell
    initial_money = 10000, ignore what kind of currency
    max_buy = max quantity for share to buy
    max_sell = max quantity for share to sell
    """
    starting_money = initial_money
    states_sell = []
    states_buy = []
    current_inventory = 0

    def buy(i, initial_money, current_inventory):
        shares = initial_money // real_movement[i]
        if shares < 1:
            print(
                'day %d: total balances %f, not enough money to buy a unit price %f'
                % (i, initial_money, real_movement[i])
            )
        else:
            if shares > max_buy:
                buy_units = max_buy
            else:
                buy_units = shares
            initial_money -= buy_units * real_movement[i]
            current_inventory += buy_units
            print(
                'day %d: buy %d units at price %f, total balance %f'
                % (i, buy_units, buy_units * real_movement[i], initial_money)
            )
            # the calling loop records the buy day in states_buy
        return initial_money, current_inventory

    for i in range(real_movement.shape[0] - int(0.025 * len(df))):
        state = signal[i]
        if state == 1:
            initial_money, current_inventory = buy(
                i, initial_money, current_inventory
            )
            states_buy.append(i)
        elif state == -1:
            if current_inventory == 0:
                print('day %d: cannot sell anything, inventory 0' % (i))
            else:
                if current_inventory > max_sell:
                    sell_units = max_sell
                else:
                    sell_units = current_inventory
                current_inventory -= sell_units
                total_sell = sell_units * real_movement[i]
                initial_money += total_sell
                try:
                    invest = (
                        (real_movement[i] - real_movement[states_buy[-1]])
                        / real_movement[states_buy[-1]]
                    ) * 100
                except:
                    invest = 0
                print(
                    'day %d, sell %d units at price %f, investment %f %%, total balance %f,'
                    % (i, sell_units, total_sell, invest, initial_money)
                )
            states_sell.append(i)

    invest = ((initial_money - starting_money) / starting_money) * 100
    total_gains = initial_money - starting_money
    return states_buy, states_sell, total_gains, invest

states_buy, states_sell, total_gains, invest = buy_stock(df.Close, signals['signal'])

close = df['Close']
fig = plt.figure(figsize = (15,5))
plt.plot(close, color='r', lw=2.)
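# Note (illustrative): as written, the turtle signal above buys (+1) when the close drops
# below the prior `count`-day rolling low and sells (-1) when it rises above the prior
# rolling high, i.e. a mean-reverting reading of the Donchian channel; classic turtle
# trading uses the opposite, breakout-following convention. `channel_signal` below is a
# hypothetical, self-contained restatement of the same rule and is not used elsewhere.
import numpy as np
import pandas as pd

def channel_signal(close, lookback):
    rolling_max = close.shift(1).rolling(lookback).max()
    rolling_min = close.shift(1).rolling(lookback).min()
    signal = pd.Series(0.0, index=close.index)
    signal[close > rolling_max] = -1.0   # above the prior high -> sell signal
    signal[close < rolling_min] = 1.0    # below the prior low  -> buy signal
    return signal

prices = pd.Series([10, 10, 10, 11, 12, 13, 12, 11, 10, 9], dtype=float)
print(channel_signal(prices, lookback=3).tolist())   # rising prices trigger sells, falling prices trigger buys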
plt.plot(close, '^', markersize=10, color='m', label = 'buying signal', markevery = states_buy) plt.plot(close, 'v', markersize=10, color='k', label = 'selling signal', markevery = states_sell) plt.title('total gains %f, total investment %f%%'%(total_gains, invest)) plt.legend() plt.savefig('output/'+name+'.png') plt.show() df= df_full.copy() name = 'Moving Average agent' short_window = int(0.025 * len(df)) long_window = int(0.05 * len(df)) signals = pd.DataFrame(index=df.index) signals['signal'] = 0.0 signals['short_ma'] = df['Close'].rolling(window=short_window, min_periods=1, center=False).mean() signals['long_ma'] = df['Close'].rolling(window=long_window, min_periods=1, center=False).mean() signals['signal'][short_window:] = np.where(signals['short_ma'][short_window:] > signals['long_ma'][short_window:], 1.0, 0.0) signals['positions'] = signals['signal'].diff() signals def buy_stock( real_movement, signal, initial_money = 10000, max_buy = 1, max_sell = 1, ): """ real_movement = actual movement in the real world delay = how much interval you want to delay to change our decision from buy to sell, vice versa initial_state = 1 is buy, 0 is sell initial_money = 1000, ignore what kind of currency max_buy = max quantity for share to buy max_sell = max quantity for share to sell """ starting_money = initial_money states_sell = [] states_buy = [] current_inventory = 0 def buy(i, initial_money, current_inventory): shares = initial_money // real_movement[i] if shares < 1: print( 'day %d: total balances %f, not enough money to buy a unit price %f' % (i, initial_money, real_movement[i]) ) else: if shares > max_buy: buy_units = max_buy else: buy_units = shares initial_money -= buy_units * real_movement[i] current_inventory += buy_units print( 'day %d: buy %d units at price %f, total balance %f' % (i, buy_units, buy_units * real_movement[i], initial_money) ) states_buy.append(0) return initial_money, current_inventory for i in range(real_movement.shape[0] - int(0.025 * len(df))): state = signal[i] if state == 1: initial_money, current_inventory = buy( i, initial_money, current_inventory ) states_buy.append(i) elif state == -1: if current_inventory == 0: print('day %d: cannot sell anything, inventory 0' % (i)) else: if current_inventory > max_sell: sell_units = max_sell else: sell_units = current_inventory current_inventory -= sell_units total_sell = sell_units * real_movement[i] initial_money += total_sell try: invest = ( (real_movement[i] - real_movement[states_buy[-1]]) / real_movement[states_buy[-1]] ) * 100 except: invest = 0 print( 'day %d, sell %d units at price %f, investment %f %%, total balance %f,' % (i, sell_units, total_sell, invest, initial_money) ) states_sell.append(i) invest = ((initial_money - starting_money) / starting_money) * 100 total_gains = initial_money - starting_money return states_buy, states_sell, total_gains, invest states_buy, states_sell, total_gains, invest = buy_stock(df.Close, signals['positions']) close = df['Close'] fig = plt.figure(figsize = (15,5)) plt.plot(close, color='r', lw=2.) 
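# Side note (illustrative): the chained assignment above, signals['signal'][short_window:] = ...,
# can raise pandas' SettingWithCopyWarning. Below is a self-contained sketch of the same
# crossover logic using positional .iloc instead; `crossover_signal` is a hypothetical helper
# and is not used elsewhere in this notebook.
import numpy as np
import pandas as pd

def crossover_signal(close, short_window, long_window):
    out = pd.DataFrame(index=close.index)
    out['short_ma'] = close.rolling(window=short_window, min_periods=1).mean()
    out['long_ma'] = close.rolling(window=long_window, min_periods=1).mean()
    out['signal'] = 0.0
    out.iloc[short_window:, out.columns.get_loc('signal')] = np.where(
        out['short_ma'].iloc[short_window:] > out['long_ma'].iloc[short_window:], 1.0, 0.0
    )
    out['positions'] = out['signal'].diff()   # +1 = golden cross (buy), -1 = death cross (sell)
    return out

# usage sketch: crossover_signal(df['Close'], short_window, long_window)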
plt.plot(close, '^', markersize=10, color='m', label = 'buying signal', markevery = states_buy) plt.plot(close, 'v', markersize=10, color='k', label = 'selling signal', markevery = states_sell) plt.title('total gains %f, total investment %f%%'%(total_gains, invest)) plt.legend() plt.savefig('output/'+name+'.png') plt.show() df= df_full.copy() name = 'Signal Rolling agent' def buy_stock( real_movement, delay = 5, initial_state = 1, initial_money = 10000, max_buy = 1, max_sell = 1, ): """ real_movement = actual movement in the real world delay = how much interval you want to delay to change our decision from buy to sell, vice versa initial_state = 1 is buy, 0 is sell initial_money = 1000, ignore what kind of currency max_buy = max quantity for share to buy max_sell = max quantity for share to sell """ starting_money = initial_money delay_change_decision = delay current_decision = 0 state = initial_state current_val = real_movement[0] states_sell = [] states_buy = [] current_inventory = 0 def buy(i, initial_money, current_inventory): shares = initial_money // real_movement[i] if shares < 1: print( 'day %d: total balances %f, not enough money to buy a unit price %f' % (i, initial_money, real_movement[i]) ) else: if shares > max_buy: buy_units = max_buy else: buy_units = shares initial_money -= buy_units * real_movement[i] current_inventory += buy_units print( 'day %d: buy %d units at price %f, total balance %f' % (i, buy_units, buy_units * real_movement[i], initial_money) ) states_buy.append(0) return initial_money, current_inventory if state == 1: initial_money, current_inventory = buy( 0, initial_money, current_inventory ) for i in range(1, real_movement.shape[0], 1): if real_movement[i] < current_val and state == 0: if current_decision < delay_change_decision: current_decision += 1 else: state = 1 initial_money, current_inventory = buy( i, initial_money, current_inventory ) current_decision = 0 states_buy.append(i) if real_movement[i] > current_val and state == 1: if current_decision < delay_change_decision: current_decision += 1 else: state = 0 if current_inventory == 0: print('day %d: cannot sell anything, inventory 0' % (i)) else: if current_inventory > max_sell: sell_units = max_sell else: sell_units = current_inventory current_inventory -= sell_units total_sell = sell_units * real_movement[i] initial_money += total_sell try: invest = ( (real_movement[i] - real_movement[states_buy[-1]]) / real_movement[states_buy[-1]] ) * 100 except: invest = 0 print( 'day %d, sell %d units at price %f, investment %f %%, total balance %f,' % (i, sell_units, total_sell, invest, initial_money) ) current_decision = 0 states_sell.append(i) current_val = real_movement[i] invest = ((initial_money - starting_money) / starting_money) * 100 total_gains = initial_money - starting_money return states_buy, states_sell, total_gains, invest states_buy, states_sell, total_gains, invest = buy_stock(df.Close, initial_state = 1, delay = 4, initial_money = 10000) close = df['Close'] fig = plt.figure(figsize = (15,5)) plt.plot(close, color='r', lw=2.) 
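# Note (illustrative): the rolling-signal rule above starts in the bought state, counts
# down-moves while waiting to buy and up-moves while holding, and only flips once `delay`
# of them have accumulated since the last trade (the counter is not reset by moves in the
# other direction). `rolling_decisions` is a hypothetical restatement of that core rule
# and is not used elsewhere in this notebook.
def rolling_decisions(prices, delay = 4):
    state, counter, decisions = 1, 0, []   # state 1 = bought, 0 = sold
    last = prices[0]
    for i in range(1, len(prices)):
        if prices[i] < last and state == 0:      # waiting to buy: count down-moves
            if counter < delay:
                counter += 1
            else:
                state, counter = 1, 0
                decisions.append(('buy', i))
        if prices[i] > last and state == 1:      # holding: count up-moves, then sell
            if counter < delay:
                counter += 1
            else:
                state, counter = 0, 0
                decisions.append(('sell', i))
        last = prices[i]
    return decisions

print(rolling_decisions([10, 11, 12, 13, 14, 15, 16, 17, 18, 19], delay = 4))   # [('sell', 5)]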
plt.plot(close, '^', markersize=10, color='m', label = 'buying signal', markevery = states_buy) plt.plot(close, 'v', markersize=10, color='k', label = 'selling signal', markevery = states_sell) plt.title('total gains %f, total investment %f%%'%(total_gains, invest)) plt.legend() plt.savefig('output/'+name+'.png') plt.show() df = df_full.copy() name = 'Policy Gradient agent' class Agent: LEARNING_RATE = 1e-4 LAYER_SIZE = 256 GAMMA = 0.9 OUTPUT_SIZE = 3 def __init__(self, state_size, window_size, trend, skip): self.state_size = state_size self.window_size = window_size self.half_window = window_size // 2 self.trend = trend self.skip = skip self.X = tf.placeholder(tf.float32, (None, self.state_size)) self.REWARDS = tf.placeholder(tf.float32, (None)) self.ACTIONS = tf.placeholder(tf.int32, (None)) feed_forward = tf.layers.dense(self.X, self.LAYER_SIZE, activation = tf.nn.relu) self.logits = tf.layers.dense(feed_forward, self.OUTPUT_SIZE, activation = tf.nn.softmax) input_y = tf.one_hot(self.ACTIONS, self.OUTPUT_SIZE) loglike = tf.log((input_y * (input_y - self.logits) + (1 - input_y) * (input_y + self.logits)) + 1) rewards = tf.tile(tf.reshape(self.REWARDS, (-1,1)), [1, self.OUTPUT_SIZE]) self.cost = -tf.reduce_mean(loglike * (rewards + 1)) self.optimizer = tf.train.AdamOptimizer(learning_rate = self.LEARNING_RATE).minimize(self.cost) self.sess = tf.InteractiveSession() self.sess.run(tf.global_variables_initializer()) def predict(self, inputs): return self.sess.run(self.logits, feed_dict={self.X:inputs}) def get_state(self, t): window_size = self.window_size + 1 d = t - window_size + 1 block = self.trend[d : t + 1] if d >= 0 else -d * [self.trend[0]] + self.trend[0 : t + 1] res = [] for i in range(window_size - 1): res.append(block[i + 1] - block[i]) return np.array([res]) def discount_rewards(self, r): discounted_r = np.zeros_like(r) running_add = 0 for t in reversed(range(0, r.size)): running_add = running_add * self.GAMMA + r[t] discounted_r[t] = running_add return discounted_r def get_predicted_action(self, sequence): prediction = self.predict(np.array(sequence))[0] return np.argmax(prediction) def buy(self, initial_money): starting_money = initial_money states_sell = [] states_buy = [] inventory = [] state = self.get_state(0) for t in range(0, len(self.trend) - 1, self.skip): action = self.get_predicted_action(state) next_state = self.get_state(t + 1) if action == 1 and initial_money >= self.trend[t] and t < (len(self.trend) - self.half_window): inventory.append(self.trend[t]) initial_money -= self.trend[t] states_buy.append(t) print('day %d: buy 1 unit at price %f, total balance %f'% (t, self.trend[t], initial_money)) elif action == 2 and len(inventory): bought_price = inventory.pop(0) initial_money += self.trend[t] states_sell.append(t) try: invest = ((close[t] - bought_price) / bought_price) * 100 except: invest = 0 print( 'day %d, sell 1 unit at price %f, investment %f %%, total balance %f,' % (t, close[t], invest, initial_money) ) state = next_state invest = ((initial_money - starting_money) / starting_money) * 100 total_gains = initial_money - starting_money return states_buy, states_sell, total_gains, invest def train(self, iterations, checkpoint, initial_money): for i in range(iterations): ep_history = [] total_profit = 0 inventory = [] state = self.get_state(0) starting_money = initial_money for t in range(0, len(self.trend) - 1, self.skip): action = self.get_predicted_action(state) next_state = self.get_state(t + 1) if action == 1 and starting_money >= self.trend[t] and t < 
(len(self.trend) - self.half_window): inventory.append(self.trend[t]) starting_money -= close[t] elif action == 2 and len(inventory): bought_price = inventory.pop(0) total_profit += self.trend[t] - bought_price starting_money += self.trend[t] ep_history.append([state,action,starting_money,next_state]) state = next_state ep_history = np.array(ep_history) ep_history[:,2] = self.discount_rewards(ep_history[:,2]) cost, _ = self.sess.run([self.cost, self.optimizer], feed_dict={self.X:np.vstack(ep_history[:,0]), self.REWARDS:ep_history[:,2], self.ACTIONS:ep_history[:,1]}) if (i+1) % checkpoint == 0: print('epoch: %d, total rewards: %f.3, cost: %f, total money: %f'%(i + 1, total_profit, cost, starting_money)) close = df.Close.values.tolist() initial_money = 10000 window_size = 30 skip = 1 agent = Agent(state_size = window_size, window_size = window_size, trend = close, skip = skip) agent.train(iterations = 200, checkpoint = 10, initial_money = initial_money) states_buy, states_sell, total_gains, invest = agent.buy(initial_money = initial_money) fig = plt.figure(figsize = (15,5)) plt.plot(close, color='r', lw=2.) plt.plot(close, '^', markersize=10, color='m', label = 'buying signal', markevery = states_buy) plt.plot(close, 'v', markersize=10, color='k', label = 'selling signal', markevery = states_sell) plt.title('total gains %f, total investment %f%%'%(total_gains, invest)) plt.legend() plt.savefig('output/'+name+'.png') plt.show() from collections import deque import random df= df_full.copy() name = 'Q-learning agent' class Agent: def __init__(self, state_size, window_size, trend, skip, batch_size): self.state_size = state_size self.window_size = window_size self.half_window = window_size // 2 self.trend = trend self.skip = skip self.action_size = 3 self.batch_size = batch_size self.memory = deque(maxlen = 1000) self.inventory = [] self.gamma = 0.95 self.epsilon = 0.5 self.epsilon_min = 0.01 self.epsilon_decay = 0.999 tf.reset_default_graph() self.sess = tf.InteractiveSession() self.X = tf.placeholder(tf.float32, [None, self.state_size]) self.Y = tf.placeholder(tf.float32, [None, self.action_size]) feed = tf.layers.dense(self.X, 256, activation = tf.nn.relu) self.logits = tf.layers.dense(feed, self.action_size) self.cost = tf.reduce_mean(tf.square(self.Y - self.logits)) self.optimizer = tf.train.GradientDescentOptimizer(1e-5).minimize( self.cost ) self.sess.run(tf.global_variables_initializer()) def act(self, state): if random.random() <= self.epsilon: return random.randrange(self.action_size) return np.argmax( self.sess.run(self.logits, feed_dict = {self.X: state})[0] ) def get_state(self, t): window_size = self.window_size + 1 d = t - window_size + 1 block = self.trend[d : t + 1] if d >= 0 else -d * [self.trend[0]] + self.trend[0 : t + 1] res = [] for i in range(window_size - 1): res.append(block[i + 1] - block[i]) return np.array([res]) def replay(self, batch_size): mini_batch = [] l = len(self.memory) for i in range(l - batch_size, l): mini_batch.append(self.memory[i]) replay_size = len(mini_batch) X = np.empty((replay_size, self.state_size)) Y = np.empty((replay_size, self.action_size)) states = np.array([a[0][0] for a in mini_batch]) new_states = np.array([a[3][0] for a in mini_batch]) Q = self.sess.run(self.logits, feed_dict = {self.X: states}) Q_new = self.sess.run(self.logits, feed_dict = {self.X: new_states}) for i in range(len(mini_batch)): state, action, reward, next_state, done = mini_batch[i] target = Q[i] target[action] = reward if not done: target[action] += self.gamma * 
np.amax(Q_new[i]) X[i] = state Y[i] = target cost, _ = self.sess.run( [self.cost, self.optimizer], feed_dict = {self.X: X, self.Y: Y} ) if self.epsilon > self.epsilon_min: self.epsilon *= self.epsilon_decay return cost def buy(self, initial_money): starting_money = initial_money states_sell = [] states_buy = [] inventory = [] state = self.get_state(0) for t in range(0, len(self.trend) - 1, self.skip): action = self.act(state) next_state = self.get_state(t + 1) if action == 1 and initial_money >= self.trend[t] and t < (len(self.trend) - self.half_window): inventory.append(self.trend[t]) initial_money -= self.trend[t] states_buy.append(t) print('day %d: buy 1 unit at price %f, total balance %f'% (t, self.trend[t], initial_money)) elif action == 2 and len(inventory): bought_price = inventory.pop(0) initial_money += self.trend[t] states_sell.append(t) try: invest = ((close[t] - bought_price) / bought_price) * 100 except: invest = 0 print( 'day %d, sell 1 unit at price %f, investment %f %%, total balance %f,' % (t, close[t], invest, initial_money) ) state = next_state invest = ((initial_money - starting_money) / starting_money) * 100 total_gains = initial_money - starting_money return states_buy, states_sell, total_gains, invest def train(self, iterations, checkpoint, initial_money): for i in range(iterations): total_profit = 0 inventory = [] state = self.get_state(0) starting_money = initial_money for t in range(0, len(self.trend) - 1, self.skip): action = self.act(state) next_state = self.get_state(t + 1) if action == 1 and starting_money >= self.trend[t] and t < (len(self.trend) - self.half_window): inventory.append(self.trend[t]) starting_money -= self.trend[t] elif action == 2 and len(inventory) > 0: bought_price = inventory.pop(0) total_profit += self.trend[t] - bought_price starting_money += self.trend[t] invest = ((starting_money - initial_money) / initial_money) self.memory.append((state, action, invest, next_state, starting_money < initial_money)) state = next_state batch_size = min(self.batch_size, len(self.memory)) cost = self.replay(batch_size) if (i+1) % checkpoint == 0: print('epoch: %d, total rewards: %f.3, cost: %f, total money: %f'%(i + 1, total_profit, cost, starting_money)) close = df.Close.values.tolist() initial_money = 10000 window_size = 30 skip = 1 batch_size = 32 agent = Agent(state_size = window_size, window_size = window_size, trend = close, skip = skip, batch_size = batch_size) agent.train(iterations = 200, checkpoint = 10, initial_money = initial_money) states_buy, states_sell, total_gains, invest = agent.buy(initial_money = initial_money) fig = plt.figure(figsize = (15,5)) plt.plot(close, color='r', lw=2.) 
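# Note (illustrative): the policy-gradient and Q-learning agents above, and every deep agent
# below, share the same state encoding: a window of consecutive close-to-close differences
# ending at day t, left-padded with the first price when t is near the start of the series.
# `window_state` is a hypothetical standalone restatement of Agent.get_state with a small
# worked example; it is not used elsewhere in this notebook.
import numpy as np

def window_state(trend, t, window_size):
    d = t - window_size
    block = trend[d : t + 1] if d >= 0 else [trend[0]] * (-d) + trend[0 : t + 1]
    return np.array([block[i + 1] - block[i] for i in range(window_size)])

prices = [10.0, 10.5, 10.2, 11.0, 11.3]
print(window_state(prices, t=1, window_size=3))   # padded early on -> [0.  0.  0.5]
print(window_state(prices, t=4, window_size=3))   # last three differences -> approx [-0.3  0.8  0.3]
# In the replay step above, each Q(s, a) target is then reward + gamma * max_a' Q(s', a')
# for non-terminal transitions.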
plt.plot(close, '^', markersize=10, color='m', label = 'buying signal', markevery = states_buy) plt.plot(close, 'v', markersize=10, color='k', label = 'selling signal', markevery = states_sell) plt.title('total gains %f, total investment %f%%'%(total_gains, invest)) plt.legend() plt.savefig('output/'+name+'.png') plt.show() import pkg_resources import types df= df_full.copy() name = 'Evolution Strategy agent' def get_imports(): for name, val in globals().items(): if isinstance(val, types.ModuleType): name = val.__name__.split('.')[0] elif isinstance(val, type): name = val.__module__.split('.')[0] poorly_named_packages = {'PIL': 'Pillow', 'sklearn': 'scikit-learn'} if name in poorly_named_packages.keys(): name = poorly_named_packages[name] yield name imports = list(set(get_imports())) requirements = [] for m in pkg_resources.working_set: if m.project_name in imports and m.project_name != 'pip': requirements.append((m.project_name, m.version)) for r in requirements: print('{}=={}'.format(*r)) class Deep_Evolution_Strategy: inputs = None def __init__( self, weights, reward_function, population_size, sigma, learning_rate ): self.weights = weights self.reward_function = reward_function self.population_size = population_size self.sigma = sigma self.learning_rate = learning_rate def _get_weight_from_population(self, weights, population): weights_population = [] for index, i in enumerate(population): jittered = self.sigma * i weights_population.append(weights[index] + jittered) return weights_population def get_weights(self): return self.weights def train(self, epoch = 100, print_every = 1): lasttime = time.time() for i in range(epoch): population = [] rewards = np.zeros(self.population_size) for k in range(self.population_size): x = [] for w in self.weights: x.append(np.random.randn(*w.shape)) population.append(x) for k in range(self.population_size): weights_population = self._get_weight_from_population( self.weights, population[k] ) rewards[k] = self.reward_function(weights_population) rewards = (rewards - np.mean(rewards)) / (np.std(rewards) + 1e-7) for index, w in enumerate(self.weights): A = np.array([p[index] for p in population]) self.weights[index] = ( w + self.learning_rate / (self.population_size * self.sigma) * np.dot(A.T, rewards).T ) if (i + 1) % print_every == 0: print( 'iter %d. 
reward: %f' % (i + 1, self.reward_function(self.weights)) ) print('time taken to train:', time.time() - lasttime, 'seconds') class Model: def __init__(self, input_size, layer_size, output_size): self.weights = [ np.random.randn(input_size, layer_size), np.random.randn(layer_size, output_size), np.random.randn(1, layer_size), ] def predict(self, inputs): feed = np.dot(inputs, self.weights[0]) + self.weights[-1] decision = np.dot(feed, self.weights[1]) return decision def get_weights(self): return self.weights def set_weights(self, weights): self.weights = weights class Agent: POPULATION_SIZE = 15 SIGMA = 0.1 LEARNING_RATE = 0.03 def __init__(self, model, window_size, trend, skip, initial_money): self.model = model self.window_size = window_size self.half_window = window_size // 2 self.trend = trend self.skip = skip self.initial_money = initial_money self.es = Deep_Evolution_Strategy( self.model.get_weights(), self.get_reward, self.POPULATION_SIZE, self.SIGMA, self.LEARNING_RATE, ) def act(self, sequence): decision = self.model.predict(np.array(sequence)) return np.argmax(decision[0]) def get_state(self, t): window_size = self.window_size + 1 d = t - window_size + 1 block = self.trend[d : t + 1] if d >= 0 else -d * [self.trend[0]] + self.trend[0 : t + 1] res = [] for i in range(window_size - 1): res.append(block[i + 1] - block[i]) return np.array([res]) def get_reward(self, weights): initial_money = self.initial_money starting_money = initial_money self.model.weights = weights state = self.get_state(0) inventory = [] quantity = 0 for t in range(0, len(self.trend) - 1, self.skip): action = self.act(state) next_state = self.get_state(t + 1) if action == 1 and starting_money >= self.trend[t]: inventory.append(self.trend[t]) starting_money -= close[t] elif action == 2 and len(inventory): bought_price = inventory.pop(0) starting_money += self.trend[t] state = next_state return ((starting_money - initial_money) / initial_money) * 100 def fit(self, iterations, checkpoint): self.es.train(iterations, print_every = checkpoint) def buy(self): initial_money = self.initial_money state = self.get_state(0) starting_money = initial_money states_sell = [] states_buy = [] inventory = [] for t in range(0, len(self.trend) - 1, self.skip): action = self.act(state) next_state = self.get_state(t + 1) if action == 1 and initial_money >= self.trend[t]: inventory.append(self.trend[t]) initial_money -= self.trend[t] states_buy.append(t) print('day %d: buy 1 unit at price %f, total balance %f'% (t, self.trend[t], initial_money)) elif action == 2 and len(inventory): bought_price = inventory.pop(0) initial_money += self.trend[t] states_sell.append(t) try: invest = ((close[t] - bought_price) / bought_price) * 100 except: invest = 0 print( 'day %d, sell 1 unit at price %f, investment %f %%, total balance %f,' % (t, close[t], invest, initial_money) ) state = next_state invest = ((initial_money - starting_money) / starting_money) * 100 total_gains = initial_money - starting_money return states_buy, states_sell, total_gains, invest close = df.Close.values.tolist() window_size = 30 skip = 1 initial_money = 10000 model = Model(input_size = window_size, layer_size = 500, output_size = 3) agent = Agent(model = model, window_size = window_size, trend = close, skip = skip, initial_money = initial_money) agent.fit(iterations = 500, checkpoint = 10) states_buy, states_sell, total_gains, invest = agent.buy() fig = plt.figure(figsize = (15,5)) plt.plot(close, color='r', lw=2.) 
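# Note (illustrative): each Deep_Evolution_Strategy step above perturbs the weights with
# Gaussian noise, scores every perturbation with the reward function, standardises the
# rewards, and moves the weights along the reward-weighted sum of the noise:
#     w <- w + lr / (population_size * sigma) * sum_k reward_k * noise_k
# `es_step` and the toy quadratic reward below are hypothetical, for illustration only.
# (Aside: get_reward above debits close[t] on a buy but credits self.trend[t] on a sell;
# in this notebook both refer to the same list of closing prices, so the accounting agrees.)
import numpy as np

def es_step(w, reward_fn, population_size = 15, sigma = 0.1, learning_rate = 0.03):
    noise = [np.random.randn(*w.shape) for _ in range(population_size)]
    rewards = np.array([reward_fn(w + sigma * n) for n in noise])
    rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-7)
    grad_estimate = sum(r * n for r, n in zip(rewards, noise))
    return w + learning_rate / (population_size * sigma) * grad_estimate

w = np.zeros((2, 2))
for _ in range(200):
    w = es_step(w, lambda m: -np.sum((m - 1.0) ** 2))   # reward peaks at the all-ones matrix
print(np.round(w, 2))   # close to [[1. 1.] [1. 1.]], up to sampling noise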
plt.plot(close, '^', markersize=10, color='m', label = 'buying signal', markevery = states_buy) plt.plot(close, 'v', markersize=10, color='k', label = 'selling signal', markevery = states_sell) plt.title('total gains %f, total investment %f%%'%(total_gains, invest)) plt.legend() plt.savefig('output/'+name+'.png') plt.show() from collections import deque import random df= df_full.copy() name = 'Double Q-learning agent' class Model: def __init__(self, input_size, output_size, layer_size, learning_rate): self.X = tf.placeholder(tf.float32, (None, input_size)) self.Y = tf.placeholder(tf.float32, (None, output_size)) feed_forward = tf.layers.dense(self.X, layer_size, activation = tf.nn.relu) self.logits = tf.layers.dense(feed_forward, output_size) self.cost = tf.reduce_sum(tf.square(self.Y - self.logits)) self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost) class Agent: LEARNING_RATE = 0.003 BATCH_SIZE = 32 LAYER_SIZE = 500 OUTPUT_SIZE = 3 EPSILON = 0.5 DECAY_RATE = 0.005 MIN_EPSILON = 0.1 GAMMA = 0.99 MEMORIES = deque() COPY = 1000 T_COPY = 0 MEMORY_SIZE = 300 def __init__(self, state_size, window_size, trend, skip): self.state_size = state_size self.window_size = window_size self.half_window = window_size // 2 self.trend = trend self.skip = skip tf.reset_default_graph() self.model = Model(self.state_size, self.OUTPUT_SIZE, self.LAYER_SIZE, self.LEARNING_RATE) self.model_negative = Model(self.state_size, self.OUTPUT_SIZE, self.LAYER_SIZE, self.LEARNING_RATE) self.sess = tf.InteractiveSession() self.sess.run(tf.global_variables_initializer()) self.trainable = tf.trainable_variables() def _assign(self): for i in range(len(self.trainable)//2): assign_op = self.trainable[i+len(self.trainable)//2].assign(self.trainable[i]) self.sess.run(assign_op) def _memorize(self, state, action, reward, new_state, done): self.MEMORIES.append((state, action, reward, new_state, done)) if len(self.MEMORIES) > self.MEMORY_SIZE: self.MEMORIES.popleft() def _select_action(self, state): if np.random.rand() < self.EPSILON: action = np.random.randint(self.OUTPUT_SIZE) else: action = self.get_predicted_action([state]) return action def _construct_memories(self, replay): states = np.array([a[0] for a in replay]) new_states = np.array([a[3] for a in replay]) Q = self.predict(states) Q_new = self.predict(new_states) Q_new_negative = self.sess.run(self.model_negative.logits, feed_dict={self.model_negative.X:new_states}) replay_size = len(replay) X = np.empty((replay_size, self.state_size)) Y = np.empty((replay_size, self.OUTPUT_SIZE)) for i in range(replay_size): state_r, action_r, reward_r, new_state_r, done_r = replay[i] target = Q[i] target[action_r] = reward_r if not done_r: target[action_r] += self.GAMMA * Q_new_negative[i, np.argmax(Q_new[i])] X[i] = state_r Y[i] = target return X, Y def predict(self, inputs): return self.sess.run(self.model.logits, feed_dict={self.model.X:inputs}) def get_predicted_action(self, sequence): prediction = self.predict(np.array(sequence))[0] return np.argmax(prediction) def get_state(self, t): window_size = self.window_size + 1 d = t - window_size + 1 block = self.trend[d : t + 1] if d >= 0 else -d * [self.trend[0]] + self.trend[0 : t + 1] res = [] for i in range(window_size - 1): res.append(block[i + 1] - block[i]) return np.array(res) def buy(self, initial_money): starting_money = initial_money states_sell = [] states_buy = [] inventory = [] state = self.get_state(0) for t in range(0, len(self.trend) - 1, self.skip): action = 
self._select_action(state) next_state = self.get_state(t + 1) if action == 1 and initial_money >= self.trend[t]: inventory.append(self.trend[t]) initial_money -= self.trend[t] states_buy.append(t) print('day %d: buy 1 unit at price %f, total balance %f'% (t, self.trend[t], initial_money)) elif action == 2 and len(inventory): bought_price = inventory.pop(0) initial_money += self.trend[t] states_sell.append(t) try: invest = ((close[t] - bought_price) / bought_price) * 100 except: invest = 0 print( 'day %d, sell 1 unit at price %f, investment %f %%, total balance %f,' % (t, close[t], invest, initial_money) ) state = next_state invest = ((initial_money - starting_money) / starting_money) * 100 total_gains = initial_money - starting_money return states_buy, states_sell, total_gains, invest def train(self, iterations, checkpoint, initial_money): for i in range(iterations): total_profit = 0 inventory = [] state = self.get_state(0) starting_money = initial_money for t in range(0, len(self.trend) - 1, self.skip): if (self.T_COPY + 1) % self.COPY == 0: self._assign() action = self._select_action(state) next_state = self.get_state(t + 1) if action == 1 and starting_money >= self.trend[t]: inventory.append(self.trend[t]) starting_money -= self.trend[t] elif action == 2 and len(inventory) > 0: bought_price = inventory.pop(0) total_profit += self.trend[t] - bought_price starting_money += self.trend[t] invest = ((starting_money - initial_money) / initial_money) self._memorize(state, action, invest, next_state, starting_money < initial_money) batch_size = min(len(self.MEMORIES), self.BATCH_SIZE) replay = random.sample(self.MEMORIES, batch_size) state = next_state X, Y = self._construct_memories(replay) cost, _ = self.sess.run([self.model.cost, self.model.optimizer], feed_dict={self.model.X: X, self.model.Y:Y}) self.T_COPY += 1 self.EPSILON = self.MIN_EPSILON + (1.0 - self.MIN_EPSILON) * np.exp(-self.DECAY_RATE * i) if (i+1) % checkpoint == 0: print('epoch: %d, total rewards: %f.3, cost: %f, total money: %f'%(i + 1, total_profit, cost, starting_money)) close = df.Close.values.tolist() initial_money = 10000 window_size = 30 skip = 1 batch_size = 32 agent = Agent(state_size = window_size, window_size = window_size, trend = close, skip = skip) agent.train(iterations = 200, checkpoint = 10, initial_money = initial_money) states_buy, states_sell, total_gains, invest = agent.buy(initial_money = initial_money) fig = plt.figure(figsize = (15,5)) plt.plot(close, color='r', lw=2.) 
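# Worked example (illustrative) of the double-Q target built in _construct_memories above:
# the online model picks the best next action, the target ("negative") model evaluates it,
# and the target is reward + GAMMA * Q_negative(s', argmax_a Q(s', a)).
import numpy as np

gamma = 0.99
reward = 0.02
q_online_next = np.array([0.10, 0.40, 0.25])    # online network's scores for the next state
q_target_next = np.array([0.12, 0.30, 0.50])    # target network's scores for the same state
best_action = int(np.argmax(q_online_next))     # online network selects action 1
target = reward + gamma * q_target_next[best_action]
print(round(target, 4))                         # 0.02 + 0.99 * 0.30 = 0.317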
plt.plot(close, '^', markersize=10, color='m', label = 'buying signal', markevery = states_buy) plt.plot(close, 'v', markersize=10, color='k', label = 'selling signal', markevery = states_sell) plt.title('total gains %f, total investment %f%%'%(total_gains, invest)) plt.legend() plt.savefig('output/'+name+'.png') plt.show() from collections import deque import random df= df_full.copy() name = 'Recurrent Q-learning agent' class Agent: LEARNING_RATE = 0.003 BATCH_SIZE = 32 LAYER_SIZE = 256 OUTPUT_SIZE = 3 EPSILON = 0.5 DECAY_RATE = 0.005 MIN_EPSILON = 0.1 GAMMA = 0.99 MEMORIES = deque() MEMORY_SIZE = 300 def __init__(self, state_size, window_size, trend, skip): self.state_size = state_size self.window_size = window_size self.half_window = window_size // 2 self.trend = trend self.skip = skip tf.reset_default_graph() self.INITIAL_FEATURES = np.zeros((4, self.state_size)) self.X = tf.placeholder(tf.float32, (None, None, self.state_size)) self.Y = tf.placeholder(tf.float32, (None, self.OUTPUT_SIZE)) cell = tf.nn.rnn_cell.LSTMCell(self.LAYER_SIZE, state_is_tuple = False) self.hidden_layer = tf.placeholder(tf.float32, (None, 2 * self.LAYER_SIZE)) self.rnn,self.last_state = tf.nn.dynamic_rnn(inputs=self.X,cell=cell, dtype=tf.float32, initial_state=self.hidden_layer) self.logits = tf.layers.dense(self.rnn[:,-1], self.OUTPUT_SIZE) self.cost = tf.reduce_sum(tf.square(self.Y - self.logits)) self.optimizer = tf.train.AdamOptimizer(learning_rate = self.LEARNING_RATE).minimize(self.cost) self.sess = tf.InteractiveSession() self.sess.run(tf.global_variables_initializer()) def _memorize(self, state, action, reward, new_state, dead, rnn_state): self.MEMORIES.append((state, action, reward, new_state, dead, rnn_state)) if len(self.MEMORIES) > self.MEMORY_SIZE: self.MEMORIES.popleft() def _construct_memories(self, replay): states = np.array([a[0] for a in replay]) new_states = np.array([a[3] for a in replay]) init_values = np.array([a[-1] for a in replay]) Q = self.sess.run(self.logits, feed_dict={self.X:states, self.hidden_layer:init_values}) Q_new = self.sess.run(self.logits, feed_dict={self.X:new_states, self.hidden_layer:init_values}) replay_size = len(replay) X = np.empty((replay_size, 4, self.state_size)) Y = np.empty((replay_size, self.OUTPUT_SIZE)) INIT_VAL = np.empty((replay_size, 2 * self.LAYER_SIZE)) for i in range(replay_size): state_r, action_r, reward_r, new_state_r, dead_r, rnn_memory = replay[i] target = Q[i] target[action_r] = reward_r if not dead_r: target[action_r] += self.GAMMA * np.amax(Q_new[i]) X[i] = state_r Y[i] = target INIT_VAL[i] = rnn_memory return X, Y, INIT_VAL def get_state(self, t): window_size = self.window_size + 1 d = t - window_size + 1 block = self.trend[d : t + 1] if d >= 0 else -d * [self.trend[0]] + self.trend[0 : t + 1] res = [] for i in range(window_size - 1): res.append(block[i + 1] - block[i]) return np.array(res) def buy(self, initial_money): starting_money = initial_money states_sell = [] states_buy = [] inventory = [] state = self.get_state(0) init_value = np.zeros((1, 2 * self.LAYER_SIZE)) for k in range(self.INITIAL_FEATURES.shape[0]): self.INITIAL_FEATURES[k,:] = state for t in range(0, len(self.trend) - 1, self.skip): action, last_state = self.sess.run([self.logits,self.last_state], feed_dict={self.X:[self.INITIAL_FEATURES], self.hidden_layer:init_value}) action, init_value = np.argmax(action[0]), last_state next_state = self.get_state(t + 1) if action == 1 and initial_money >= self.trend[t]: inventory.append(self.trend[t]) initial_money -= self.trend[t] 
states_buy.append(t) print('day %d: buy 1 unit at price %f, total balance %f'% (t, self.trend[t], initial_money)) elif action == 2 and len(inventory): bought_price = inventory.pop(0) initial_money += self.trend[t] states_sell.append(t) try: invest = ((close[t] - bought_price) / bought_price) * 100 except: invest = 0 print( 'day %d, sell 1 unit at price %f, investment %f %%, total balance %f,' % (t, close[t], invest, initial_money) ) new_state = np.append([self.get_state(t + 1)], self.INITIAL_FEATURES[:3, :], axis = 0) self.INITIAL_FEATURES = new_state invest = ((initial_money - starting_money) / starting_money) * 100 total_gains = initial_money - starting_money return states_buy, states_sell, total_gains, invest def train(self, iterations, checkpoint, initial_money): for i in range(iterations): total_profit = 0 inventory = [] state = self.get_state(0) starting_money = initial_money init_value = np.zeros((1, 2 * self.LAYER_SIZE)) for k in range(self.INITIAL_FEATURES.shape[0]): self.INITIAL_FEATURES[k,:] = state for t in range(0, len(self.trend) - 1, self.skip): if np.random.rand() < self.EPSILON: action = np.random.randint(self.OUTPUT_SIZE) else: action, last_state = self.sess.run([self.logits, self.last_state], feed_dict={self.X:[self.INITIAL_FEATURES], self.hidden_layer:init_value}) action, init_value = np.argmax(action[0]), last_state next_state = self.get_state(t + 1) if action == 1 and starting_money >= self.trend[t]: inventory.append(self.trend[t]) starting_money -= self.trend[t] elif action == 2 and len(inventory) > 0: bought_price = inventory.pop(0) total_profit += self.trend[t] - bought_price starting_money += self.trend[t] invest = ((starting_money - initial_money) / initial_money) new_state = np.append([self.get_state(t + 1)], self.INITIAL_FEATURES[:3, :], axis = 0) self._memorize(self.INITIAL_FEATURES, action, invest, new_state, starting_money < initial_money, init_value[0]) self.INITIAL_FEATURES = new_state batch_size = min(len(self.MEMORIES), self.BATCH_SIZE) replay = random.sample(self.MEMORIES, batch_size) X, Y, INIT_VAL = self._construct_memories(replay) cost, _ = self.sess.run([self.cost, self.optimizer], feed_dict={self.X: X, self.Y:Y, self.hidden_layer: INIT_VAL}) self.EPSILON = self.MIN_EPSILON + (1.0 - self.MIN_EPSILON) * np.exp(-self.DECAY_RATE * i) if (i+1) % checkpoint == 0: print('epoch: %d, total rewards: %f.3, cost: %f, total money: %f'%(i + 1, total_profit, cost, starting_money)) close = df.Close.values.tolist() initial_money = 10000 window_size = 30 skip = 1 batch_size = 32 agent = Agent(state_size = window_size, window_size = window_size, trend = close, skip = skip) agent.train(iterations = 200, checkpoint = 10, initial_money = initial_money) states_buy, states_sell, total_gains, invest = agent.buy(initial_money = initial_money) fig = plt.figure(figsize = (15,5)) plt.plot(close, color='r', lw=2.) 
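# Note (illustrative): the recurrent agent feeds the LSTM a rolling stack of the last four
# window states, and each replayed transition also stores the LSTM hidden/cell state
# (`rnn_state` / `init_value`) so replay targets are computed from the recurrent state that
# was active when the transition happened. `push_frame` is a hypothetical restatement of
# how INITIAL_FEATURES is updated above and is not used elsewhere in this notebook.
import numpy as np

def push_frame(frames, new_state):
    # frames: (4, state_size); new_state: (state_size,) -> newest frame moves to row 0
    return np.append([new_state], frames[:3, :], axis=0)

frames = np.zeros((4, 3))
frames = push_frame(frames, np.array([0.1, -0.2, 0.3]))
print(frames[0])   # [ 0.1 -0.2  0.3], matching the INITIAL_FEATURES handling above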
plt.plot(close, '^', markersize=10, color='m', label = 'buying signal', markevery = states_buy) plt.plot(close, 'v', markersize=10, color='k', label = 'selling signal', markevery = states_sell) plt.title('total gains %f, total investment %f%%'%(total_gains, invest)) plt.legend() plt.savefig('output/'+name+'.png') plt.show() from collections import deque import random df= df_full.copy() name = 'Double Recurrent Q-learning agent' class Model: def __init__(self, input_size, output_size, layer_size, learning_rate, name): with tf.variable_scope(name): self.X = tf.placeholder(tf.float32, (None, None, input_size)) self.Y = tf.placeholder(tf.float32, (None, output_size)) cell = tf.nn.rnn_cell.LSTMCell(layer_size, state_is_tuple = False) self.hidden_layer = tf.placeholder(tf.float32, (None, 2 * layer_size)) self.rnn,self.last_state = tf.nn.dynamic_rnn(inputs=self.X,cell=cell, dtype=tf.float32, initial_state=self.hidden_layer) self.logits = tf.layers.dense(self.rnn[:,-1], output_size) self.cost = tf.reduce_sum(tf.square(self.Y - self.logits)) self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost) class Agent: LEARNING_RATE = 0.003 BATCH_SIZE = 32 LAYER_SIZE = 256 OUTPUT_SIZE = 3 EPSILON = 0.5 DECAY_RATE = 0.005 MIN_EPSILON = 0.1 GAMMA = 0.99 MEMORIES = deque() COPY = 1000 T_COPY = 0 MEMORY_SIZE = 300 def __init__(self, state_size, window_size, trend, skip): self.state_size = state_size self.window_size = window_size self.half_window = window_size // 2 self.trend = trend self.skip = skip tf.reset_default_graph() self.INITIAL_FEATURES = np.zeros((4, self.state_size)) self.model = Model(self.state_size, self.OUTPUT_SIZE, self.LAYER_SIZE, self.LEARNING_RATE, 'real_model') self.model_negative = Model(self.state_size, self.OUTPUT_SIZE, self.LAYER_SIZE, self.LEARNING_RATE, 'negative_model') self.sess = tf.InteractiveSession() self.sess.run(tf.global_variables_initializer()) self.trainable = tf.trainable_variables() def _assign(self, from_name, to_name): from_w = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=from_name) to_w = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=to_name) for i in range(len(from_w)): assign_op = to_w[i].assign(from_w[i]) self.sess.run(assign_op) def _memorize(self, state, action, reward, new_state, dead, rnn_state): self.MEMORIES.append((state, action, reward, new_state, dead, rnn_state)) if len(self.MEMORIES) > self.MEMORY_SIZE: self.MEMORIES.popleft() def _select_action(self, state): if np.random.rand() < self.EPSILON: action = np.random.randint(self.OUTPUT_SIZE) else: action = self.get_predicted_action([state]) return action def _construct_memories(self, replay): states = np.array([a[0] for a in replay]) new_states = np.array([a[3] for a in replay]) init_values = np.array([a[-1] for a in replay]) Q = self.sess.run(self.model.logits, feed_dict={self.model.X:states, self.model.hidden_layer:init_values}) Q_new = self.sess.run(self.model.logits, feed_dict={self.model.X:new_states, self.model.hidden_layer:init_values}) Q_new_negative = self.sess.run(self.model_negative.logits, feed_dict={self.model_negative.X:new_states, self.model_negative.hidden_layer:init_values}) replay_size = len(replay) X = np.empty((replay_size, 4, self.state_size)) Y = np.empty((replay_size, self.OUTPUT_SIZE)) INIT_VAL = np.empty((replay_size, 2 * self.LAYER_SIZE)) for i in range(replay_size): state_r, action_r, reward_r, new_state_r, dead_r, rnn_memory = replay[i] target = Q[i] target[action_r] = reward_r if not dead_r: target[action_r] += self.GAMMA 
* Q_new_negative[i, np.argmax(Q_new[i])] X[i] = state_r Y[i] = target INIT_VAL[i] = rnn_memory return X, Y, INIT_VAL def get_state(self, t): window_size = self.window_size + 1 d = t - window_size + 1 block = self.trend[d : t + 1] if d >= 0 else -d * [self.trend[0]] + self.trend[0 : t + 1] res = [] for i in range(window_size - 1): res.append(block[i + 1] - block[i]) return np.array(res) def buy(self, initial_money): starting_money = initial_money states_sell = [] states_buy = [] inventory = [] state = self.get_state(0) init_value = np.zeros((1, 2 * self.LAYER_SIZE)) for k in range(self.INITIAL_FEATURES.shape[0]): self.INITIAL_FEATURES[k,:] = state for t in range(0, len(self.trend) - 1, self.skip): action, last_state = self.sess.run([self.model.logits,self.model.last_state], feed_dict={self.model.X:[self.INITIAL_FEATURES], self.model.hidden_layer:init_value}) action, init_value = np.argmax(action[0]), last_state next_state = self.get_state(t + 1) if action == 1 and initial_money >= self.trend[t]: inventory.append(self.trend[t]) initial_money -= self.trend[t] states_buy.append(t) print('day %d: buy 1 unit at price %f, total balance %f'% (t, self.trend[t], initial_money)) elif action == 2 and len(inventory): bought_price = inventory.pop(0) initial_money += self.trend[t] states_sell.append(t) try: invest = ((close[t] - bought_price) / bought_price) * 100 except: invest = 0 print( 'day %d, sell 1 unit at price %f, investment %f %%, total balance %f,' % (t, close[t], invest, initial_money) ) new_state = np.append([self.get_state(t + 1)], self.INITIAL_FEATURES[:3, :], axis = 0) self.INITIAL_FEATURES = new_state invest = ((initial_money - starting_money) / starting_money) * 100 total_gains = initial_money - starting_money return states_buy, states_sell, total_gains, invest def train(self, iterations, checkpoint, initial_money): for i in range(iterations): total_profit = 0 inventory = [] state = self.get_state(0) starting_money = initial_money init_value = np.zeros((1, 2 * self.LAYER_SIZE)) for k in range(self.INITIAL_FEATURES.shape[0]): self.INITIAL_FEATURES[k,:] = state for t in range(0, len(self.trend) - 1, self.skip): if (self.T_COPY + 1) % self.COPY == 0: self._assign('real_model', 'negative_model') if np.random.rand() < self.EPSILON: action = np.random.randint(self.OUTPUT_SIZE) else: action, last_state = self.sess.run([self.model.logits, self.model.last_state], feed_dict={self.model.X:[self.INITIAL_FEATURES], self.model.hidden_layer:init_value}) action, init_value = np.argmax(action[0]), last_state next_state = self.get_state(t + 1) if action == 1 and starting_money >= self.trend[t]: inventory.append(self.trend[t]) starting_money -= self.trend[t] elif action == 2 and len(inventory) > 0: bought_price = inventory.pop(0) total_profit += self.trend[t] - bought_price starting_money += self.trend[t] invest = ((starting_money - initial_money) / initial_money) new_state = np.append([self.get_state(t + 1)], self.INITIAL_FEATURES[:3, :], axis = 0) self._memorize(self.INITIAL_FEATURES, action, invest, new_state, starting_money < initial_money, init_value[0]) self.INITIAL_FEATURES = new_state batch_size = min(len(self.MEMORIES), self.BATCH_SIZE) replay = random.sample(self.MEMORIES, batch_size) X, Y, INIT_VAL = self._construct_memories(replay) cost, _ = self.sess.run([self.model.cost, self.model.optimizer], feed_dict={self.model.X: X, self.model.Y:Y, self.model.hidden_layer: INIT_VAL}) self.T_COPY += 1 self.EPSILON = self.MIN_EPSILON + (1.0 - self.MIN_EPSILON) * np.exp(-self.DECAY_RATE * i) if (i+1) % 
checkpoint == 0: print('epoch: %d, total rewards: %f.3, cost: %f, total money: %f'%(i + 1, total_profit, cost, starting_money)) close = df.Close.values.tolist() initial_money = 10000 window_size = 30 skip = 1 batch_size = 32 agent = Agent(state_size = window_size, window_size = window_size, trend = close, skip = skip) agent.train(iterations = 200, checkpoint = 10, initial_money = initial_money) states_buy, states_sell, total_gains, invest = agent.buy(initial_money = initial_money) fig = plt.figure(figsize = (15,5)) plt.plot(close, color='r', lw=2.) plt.plot(close, '^', markersize=10, color='m', label = 'buying signal', markevery = states_buy) plt.plot(close, 'v', markersize=10, color='k', label = 'selling signal', markevery = states_sell) plt.title('total gains %f, total investment %f%%'%(total_gains, invest)) plt.legend() plt.savefig('output/'+name+'.png') plt.show() from collections import deque import random df= df_full.copy() name = 'Duel Q-learning agent' class Agent: def __init__(self, state_size, window_size, trend, skip, batch_size): self.state_size = state_size self.window_size = window_size self.half_window = window_size // 2 self.trend = trend self.skip = skip self.action_size = 3 self.batch_size = batch_size self.memory = deque(maxlen = 1000) self.inventory = [] self.gamma = 0.95 self.epsilon = 0.5 self.epsilon_min = 0.01 self.epsilon_decay = 0.999 tf.reset_default_graph() self.sess = tf.InteractiveSession() self.X = tf.placeholder(tf.float32, [None, self.state_size]) self.Y = tf.placeholder(tf.float32, [None, self.action_size]) feed = tf.layers.dense(self.X, 512, activation = tf.nn.relu) tensor_action, tensor_validation = tf.split(feed,2,1) feed_action = tf.layers.dense(tensor_action, self.action_size) feed_validation = tf.layers.dense(tensor_validation, 1) self.logits = feed_validation + tf.subtract(feed_action,tf.reduce_mean(feed_action,axis=1,keep_dims=True)) self.cost = tf.reduce_mean(tf.square(self.Y - self.logits)) self.optimizer = tf.train.GradientDescentOptimizer(1e-5).minimize( self.cost ) self.sess.run(tf.global_variables_initializer()) def act(self, state): if random.random() <= self.epsilon: return random.randrange(self.action_size) return np.argmax( self.sess.run(self.logits, feed_dict = {self.X: state})[0] ) def get_state(self, t): window_size = self.window_size + 1 d = t - window_size + 1 block = self.trend[d : t + 1] if d >= 0 else -d * [self.trend[0]] + self.trend[0 : t + 1] res = [] for i in range(window_size - 1): res.append(block[i + 1] - block[i]) return np.array([res]) def replay(self, batch_size): mini_batch = [] l = len(self.memory) for i in range(l - batch_size, l): mini_batch.append(self.memory[i]) replay_size = len(mini_batch) X = np.empty((replay_size, self.state_size)) Y = np.empty((replay_size, self.action_size)) states = np.array([a[0][0] for a in mini_batch]) new_states = np.array([a[3][0] for a in mini_batch]) Q = self.sess.run(self.logits, feed_dict = {self.X: states}) Q_new = self.sess.run(self.logits, feed_dict = {self.X: new_states}) for i in range(len(mini_batch)): state, action, reward, next_state, done = mini_batch[i] target = Q[i] target[action] = reward if not done: target[action] += self.gamma * np.amax(Q_new[i]) X[i] = state Y[i] = target cost, _ = self.sess.run( [self.cost, self.optimizer], feed_dict = {self.X: X, self.Y: Y} ) if self.epsilon > self.epsilon_min: self.epsilon *= self.epsilon_decay return cost def buy(self, initial_money): starting_money = initial_money states_sell = [] states_buy = [] inventory = [] state = 
self.get_state(0) for t in range(0, len(self.trend) - 1, self.skip): action = self.act(state) next_state = self.get_state(t + 1) if action == 1 and initial_money >= self.trend[t] and t < (len(self.trend) - self.half_window): inventory.append(self.trend[t]) initial_money -= self.trend[t] states_buy.append(t) print('day %d: buy 1 unit at price %f, total balance %f'% (t, self.trend[t], initial_money)) elif action == 2 and len(inventory): bought_price = inventory.pop(0) initial_money += self.trend[t] states_sell.append(t) try: invest = ((close[t] - bought_price) / bought_price) * 100 except: invest = 0 print( 'day %d, sell 1 unit at price %f, investment %f %%, total balance %f,' % (t, close[t], invest, initial_money) ) state = next_state invest = ((initial_money - starting_money) / starting_money) * 100 total_gains = initial_money - starting_money return states_buy, states_sell, total_gains, invest def train(self, iterations, checkpoint, initial_money): for i in range(iterations): total_profit = 0 inventory = [] state = self.get_state(0) starting_money = initial_money for t in range(0, len(self.trend) - 1, self.skip): action = self.act(state) next_state = self.get_state(t + 1) if action == 1 and starting_money >= self.trend[t] and t < (len(self.trend) - self.half_window): inventory.append(self.trend[t]) starting_money -= self.trend[t] elif action == 2 and len(inventory) > 0: bought_price = inventory.pop(0) total_profit += self.trend[t] - bought_price starting_money += self.trend[t] invest = ((starting_money - initial_money) / initial_money) self.memory.append((state, action, invest, next_state, starting_money < initial_money)) state = next_state batch_size = min(self.batch_size, len(self.memory)) cost = self.replay(batch_size) if (i+1) % checkpoint == 0: print('epoch: %d, total rewards: %f.3, cost: %f, total money: %f'%(i + 1, total_profit, cost, starting_money)) close = df.Close.values.tolist() initial_money = 10000 window_size = 30 skip = 1 batch_size = 32 agent = Agent(state_size = window_size, window_size = window_size, trend = close, skip = skip, batch_size = batch_size) agent.train(iterations = 200, checkpoint = 10, initial_money = initial_money) states_buy, states_sell, total_gains, invest = agent.buy(initial_money = initial_money) fig = plt.figure(figsize = (15,5)) plt.plot(close, color='r', lw=2.) 
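# Worked example (illustrative) of the dueling head above: the hidden layer is split into
# an advantage stream and a value ("validation") stream, recombined as
#     Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a)),
# which keeps the value and advantage parts identifiable. (`keep_dims` is the TF 1.x
# spelling; later TensorFlow versions call it `keepdims`.)
import numpy as np

value = 0.5                              # scalar output of the validation stream
advantage = np.array([0.2, -0.1, 0.5])   # per-action output of the advantage stream
q_values = value + (advantage - advantage.mean())
print(q_values)                          # [0.5 0.2 0.8]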
plt.plot(close, '^', markersize=10, color='m', label = 'buying signal', markevery = states_buy) plt.plot(close, 'v', markersize=10, color='k', label = 'selling signal', markevery = states_sell) plt.title('total gains %f, total investment %f%%'%(total_gains, invest)) plt.legend() plt.savefig('output/'+name+'.png') plt.show() from collections import deque import random df= df_full.copy() name = 'Double Duel Q-learning agent' class Model: def __init__(self, input_size, output_size, layer_size, learning_rate): self.X = tf.placeholder(tf.float32, (None, input_size)) self.Y = tf.placeholder(tf.float32, (None, output_size)) feed = tf.layers.dense(self.X, layer_size, activation = tf.nn.relu) tensor_action, tensor_validation = tf.split(feed,2,1) feed_action = tf.layers.dense(tensor_action, output_size) feed_validation = tf.layers.dense(tensor_validation, 1) self.logits = feed_validation + tf.subtract(feed_action,tf.reduce_mean(feed_action,axis=1,keep_dims=True)) self.cost = tf.reduce_sum(tf.square(self.Y - self.logits)) self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost) class Agent: LEARNING_RATE = 0.003 BATCH_SIZE = 32 LAYER_SIZE = 500 OUTPUT_SIZE = 3 EPSILON = 0.5 DECAY_RATE = 0.005 MIN_EPSILON = 0.1 GAMMA = 0.99 MEMORIES = deque() COPY = 1000 T_COPY = 0 MEMORY_SIZE = 300 def __init__(self, state_size, window_size, trend, skip): self.state_size = state_size self.window_size = window_size self.half_window = window_size // 2 self.trend = trend self.skip = skip tf.reset_default_graph() self.model = Model(self.state_size, self.OUTPUT_SIZE, self.LAYER_SIZE, self.LEARNING_RATE) self.model_negative = Model(self.state_size, self.OUTPUT_SIZE, self.LAYER_SIZE, self.LEARNING_RATE) self.sess = tf.InteractiveSession() self.sess.run(tf.global_variables_initializer()) self.trainable = tf.trainable_variables() def _assign(self): for i in range(len(self.trainable)//2): assign_op = self.trainable[i+len(self.trainable)//2].assign(self.trainable[i]) self.sess.run(assign_op) def _memorize(self, state, action, reward, new_state, done): self.MEMORIES.append((state, action, reward, new_state, done)) if len(self.MEMORIES) > self.MEMORY_SIZE: self.MEMORIES.popleft() def _select_action(self, state): if np.random.rand() < self.EPSILON: action = np.random.randint(self.OUTPUT_SIZE) else: action = self.get_predicted_action([state]) return action def _construct_memories(self, replay): states = np.array([a[0] for a in replay]) new_states = np.array([a[3] for a in replay]) Q = self.predict(states) Q_new = self.predict(new_states) Q_new_negative = self.sess.run(self.model_negative.logits, feed_dict={self.model_negative.X:new_states}) replay_size = len(replay) X = np.empty((replay_size, self.state_size)) Y = np.empty((replay_size, self.OUTPUT_SIZE)) for i in range(replay_size): state_r, action_r, reward_r, new_state_r, done_r = replay[i] target = Q[i] target[action_r] = reward_r if not done_r: target[action_r] += self.GAMMA * Q_new_negative[i, np.argmax(Q_new[i])] X[i] = state_r Y[i] = target return X, Y def predict(self, inputs): return self.sess.run(self.model.logits, feed_dict={self.model.X:inputs}) def get_predicted_action(self, sequence): prediction = self.predict(np.array(sequence))[0] return np.argmax(prediction) def get_state(self, t): window_size = self.window_size + 1 d = t - window_size + 1 block = self.trend[d : t + 1] if d >= 0 else -d * [self.trend[0]] + self.trend[0 : t + 1] res = [] for i in range(window_size - 1): res.append(block[i + 1] - block[i]) return np.array(res) def 
buy(self, initial_money): starting_money = initial_money states_sell = [] states_buy = [] inventory = [] state = self.get_state(0) for t in range(0, len(self.trend) - 1, self.skip): action = self._select_action(state) next_state = self.get_state(t + 1) if action == 1 and initial_money >= self.trend[t]: inventory.append(self.trend[t]) initial_money -= self.trend[t] states_buy.append(t) print('day %d: buy 1 unit at price %f, total balance %f'% (t, self.trend[t], initial_money)) elif action == 2 and len(inventory): bought_price = inventory.pop(0) initial_money += self.trend[t] states_sell.append(t) try: invest = ((close[t] - bought_price) / bought_price) * 100 except: invest = 0 print( 'day %d, sell 1 unit at price %f, investment %f %%, total balance %f,' % (t, close[t], invest, initial_money) ) state = next_state invest = ((initial_money - starting_money) / starting_money) * 100 total_gains = initial_money - starting_money return states_buy, states_sell, total_gains, invest def train(self, iterations, checkpoint, initial_money): for i in range(iterations): total_profit = 0 inventory = [] state = self.get_state(0) starting_money = initial_money for t in range(0, len(self.trend) - 1, self.skip): if (self.T_COPY + 1) % self.COPY == 0: self._assign() action = self._select_action(state) next_state = self.get_state(t + 1) if action == 1 and starting_money >= self.trend[t]: inventory.append(self.trend[t]) starting_money -= self.trend[t] elif action == 2 and len(inventory) > 0: bought_price = inventory.pop(0) total_profit += self.trend[t] - bought_price starting_money += self.trend[t] invest = ((starting_money - initial_money) / initial_money) self._memorize(state, action, invest, next_state, starting_money < initial_money) batch_size = min(len(self.MEMORIES), self.BATCH_SIZE) state = next_state replay = random.sample(self.MEMORIES, batch_size) X, Y = self._construct_memories(replay) cost, _ = self.sess.run([self.model.cost, self.model.optimizer], feed_dict={self.model.X: X, self.model.Y:Y}) self.T_COPY += 1 self.EPSILON = self.MIN_EPSILON + (1.0 - self.MIN_EPSILON) * np.exp(-self.DECAY_RATE * i) if (i+1) % checkpoint == 0: print('epoch: %d, total rewards: %f.3, cost: %f, total money: %f'%(i + 1, total_profit, cost, starting_money)) close = df.Close.values.tolist() initial_money = 10000 window_size = 30 skip = 1 batch_size = 32 agent = Agent(state_size = window_size, window_size = window_size, trend = close, skip = skip) agent.train(iterations = 200, checkpoint = 10, initial_money = initial_money) states_buy, states_sell, total_gains, invest = agent.buy(initial_money = initial_money) fig = plt.figure(figsize = (15,5)) plt.plot(close, color='r', lw=2.) 
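# Note (illustrative): _assign above copies the first half of tf.trainable_variables()
# (the online model, created first) onto the second half (the target / "negative" model)
# every COPY training steps, which relies on variable-creation order. A hedged alternative
# is to copy by variable scope, as the Double Recurrent Q-learning agent earlier in this
# notebook does; `copy_weights` below assumes the two models were built under the named
# scopes and is not used elsewhere in this notebook.
import tensorflow as tf   # TF 1.x-style API, as used throughout this notebook

def copy_weights(sess, from_scope, to_scope):
    from_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope = from_scope)
    to_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope = to_scope)
    for src, dst in zip(from_vars, to_vars):
        sess.run(dst.assign(src))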
plt.plot(close, '^', markersize=10, color='m', label = 'buying signal', markevery = states_buy) plt.plot(close, 'v', markersize=10, color='k', label = 'selling signal', markevery = states_sell) plt.title('total gains %f, total investment %f%%'%(total_gains, invest)) plt.legend() plt.savefig('output/'+name+'.png') plt.show() from collections import deque import random df= df_full.copy() name = 'Duel Recurrent Q-learning agent' class Agent: LEARNING_RATE = 0.003 BATCH_SIZE = 32 LAYER_SIZE = 256 OUTPUT_SIZE = 3 EPSILON = 0.5 DECAY_RATE = 0.005 MIN_EPSILON = 0.1 GAMMA = 0.99 MEMORIES = deque() MEMORY_SIZE = 300 def __init__(self, state_size, window_size, trend, skip): self.state_size = state_size self.window_size = window_size self.half_window = window_size // 2 self.trend = trend self.skip = skip tf.reset_default_graph() self.INITIAL_FEATURES = np.zeros((4, self.state_size)) self.X = tf.placeholder(tf.float32, (None, None, self.state_size)) self.Y = tf.placeholder(tf.float32, (None, self.OUTPUT_SIZE)) cell = tf.nn.rnn_cell.LSTMCell(self.LAYER_SIZE, state_is_tuple = False) self.hidden_layer = tf.placeholder(tf.float32, (None, 2 * self.LAYER_SIZE)) self.rnn,self.last_state = tf.nn.dynamic_rnn(inputs=self.X,cell=cell, dtype=tf.float32, initial_state=self.hidden_layer) tensor_action, tensor_validation = tf.split(self.rnn[:,-1],2,1) feed_action = tf.layers.dense(tensor_action, self.OUTPUT_SIZE) feed_validation = tf.layers.dense(tensor_validation, 1) self.logits = feed_validation + tf.subtract(feed_action,tf.reduce_mean(feed_action,axis=1,keep_dims=True)) self.cost = tf.reduce_sum(tf.square(self.Y - self.logits)) self.optimizer = tf.train.AdamOptimizer(learning_rate = self.LEARNING_RATE).minimize(self.cost) self.sess = tf.InteractiveSession() self.sess.run(tf.global_variables_initializer()) def _memorize(self, state, action, reward, new_state, dead, rnn_state): self.MEMORIES.append((state, action, reward, new_state, dead, rnn_state)) if len(self.MEMORIES) > self.MEMORY_SIZE: self.MEMORIES.popleft() def _construct_memories(self, replay): states = np.array([a[0] for a in replay]) new_states = np.array([a[3] for a in replay]) init_values = np.array([a[-1] for a in replay]) Q = self.sess.run(self.logits, feed_dict={self.X:states, self.hidden_layer:init_values}) Q_new = self.sess.run(self.logits, feed_dict={self.X:new_states, self.hidden_layer:init_values}) replay_size = len(replay) X = np.empty((replay_size, 4, self.state_size)) Y = np.empty((replay_size, self.OUTPUT_SIZE)) INIT_VAL = np.empty((replay_size, 2 * self.LAYER_SIZE)) for i in range(replay_size): state_r, action_r, reward_r, new_state_r, dead_r, rnn_memory = replay[i] target = Q[i] target[action_r] = reward_r if not dead_r: target[action_r] += self.GAMMA * np.amax(Q_new[i]) X[i] = state_r Y[i] = target INIT_VAL[i] = rnn_memory return X, Y, INIT_VAL def get_state(self, t): window_size = self.window_size + 1 d = t - window_size + 1 block = self.trend[d : t + 1] if d >= 0 else -d * [self.trend[0]] + self.trend[0 : t + 1] res = [] for i in range(window_size - 1): res.append(block[i + 1] - block[i]) return np.array(res) def buy(self, initial_money): starting_money = initial_money states_sell = [] states_buy = [] inventory = [] state = self.get_state(0) init_value = np.zeros((1, 2 * self.LAYER_SIZE)) for k in range(self.INITIAL_FEATURES.shape[0]): self.INITIAL_FEATURES[k,:] = state for t in range(0, len(self.trend) - 1, self.skip): action, last_state = self.sess.run([self.logits,self.last_state], feed_dict={self.X:[self.INITIAL_FEATURES], 
self.hidden_layer:init_value}) action, init_value = np.argmax(action[0]), last_state next_state = self.get_state(t + 1) if action == 1 and initial_money >= self.trend[t]: inventory.append(self.trend[t]) initial_money -= self.trend[t] states_buy.append(t) print('day %d: buy 1 unit at price %f, total balance %f'% (t, self.trend[t], initial_money)) elif action == 2 and len(inventory): bought_price = inventory.pop(0) initial_money += self.trend[t] states_sell.append(t) try: invest = ((close[t] - bought_price) / bought_price) * 100 except: invest = 0 print( 'day %d, sell 1 unit at price %f, investment %f %%, total balance %f,' % (t, close[t], invest, initial_money) ) new_state = np.append([self.get_state(t + 1)], self.INITIAL_FEATURES[:3, :], axis = 0) self.INITIAL_FEATURES = new_state invest = ((initial_money - starting_money) / starting_money) * 100 total_gains = initial_money - starting_money return states_buy, states_sell, total_gains, invest def train(self, iterations, checkpoint, initial_money): for i in range(iterations): total_profit = 0 inventory = [] state = self.get_state(0) starting_money = initial_money init_value = np.zeros((1, 2 * self.LAYER_SIZE)) for k in range(self.INITIAL_FEATURES.shape[0]): self.INITIAL_FEATURES[k,:] = state for t in range(0, len(self.trend) - 1, self.skip): if np.random.rand() < self.EPSILON: action = np.random.randint(self.OUTPUT_SIZE) else: action, last_state = self.sess.run([self.logits, self.last_state], feed_dict={self.X:[self.INITIAL_FEATURES], self.hidden_layer:init_value}) action, init_value = np.argmax(action[0]), last_state next_state = self.get_state(t + 1) if action == 1 and starting_money >= self.trend[t]: inventory.append(self.trend[t]) starting_money -= self.trend[t] elif action == 2 and len(inventory) > 0: bought_price = inventory.pop(0) total_profit += self.trend[t] - bought_price starting_money += self.trend[t] invest = ((starting_money - initial_money) / initial_money) new_state = np.append([self.get_state(t + 1)], self.INITIAL_FEATURES[:3, :], axis = 0) self._memorize(self.INITIAL_FEATURES, action, invest, new_state, starting_money < initial_money, init_value[0]) self.INITIAL_FEATURES = new_state batch_size = min(len(self.MEMORIES), self.BATCH_SIZE) replay = random.sample(self.MEMORIES, batch_size) X, Y, INIT_VAL = self._construct_memories(replay) cost, _ = self.sess.run([self.cost, self.optimizer], feed_dict={self.X: X, self.Y:Y, self.hidden_layer: INIT_VAL}) self.EPSILON = self.MIN_EPSILON + (1.0 - self.MIN_EPSILON) * np.exp(-self.DECAY_RATE * i) if (i+1) % checkpoint == 0: print('epoch: %d, total rewards: %f.3, cost: %f, total money: %f'%(i + 1, total_profit, cost, starting_money)) close = df.Close.values.tolist() initial_money = 10000 window_size = 30 skip = 1 batch_size = 32 agent = Agent(state_size = window_size, window_size = window_size, trend = close, skip = skip) agent.train(iterations = 200, checkpoint = 10, initial_money = initial_money) states_buy, states_sell, total_gains, invest = agent.buy(initial_money = initial_money) fig = plt.figure(figsize = (15,5)) plt.plot(close, color='r', lw=2.) 
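# --- Illustrative sketch (not part of the agent above) ---
# get_state(t) in the agents above feeds the network the last window_size
# one-step price differences, padding with the first close when t is near the
# start; the recurrent agents then stack the 4 most recent states as
# INITIAL_FEATURES. A standalone NumPy rendering of that windowing:
import numpy as np

def get_state(trend, t, window_size):
    d = t - (window_size + 1) + 1
    block = trend[d:t + 1] if d >= 0 else [trend[0]] * -d + trend[:t + 1]
    return np.diff(np.array(block))        # window_size consecutive differences

prices = [10.0, 10.5, 10.2, 10.8, 11.0, 10.9]
state = get_state(prices, t=2, window_size=3)   # padded because t < window_size
stacked = np.tile(state, (4, 1))                # the 4-row recurrent feature stack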
plt.plot(close, '^', markersize=10, color='m', label = 'buying signal', markevery = states_buy) plt.plot(close, 'v', markersize=10, color='k', label = 'selling signal', markevery = states_sell) plt.title('total gains %f, total investment %f%%'%(total_gains, invest)) plt.legend() plt.savefig('output/'+name+'.png') plt.show() from collections import deque import random df= df_full.copy() name = 'Double Duel Recurrent Q-learning agent' class Model: def __init__(self, input_size, output_size, layer_size, learning_rate, name): with tf.variable_scope(name): self.X = tf.placeholder(tf.float32, (None, None, input_size)) self.Y = tf.placeholder(tf.float32, (None, output_size)) cell = tf.nn.rnn_cell.LSTMCell(layer_size, state_is_tuple = False) self.hidden_layer = tf.placeholder(tf.float32, (None, 2 * layer_size)) self.rnn,self.last_state = tf.nn.dynamic_rnn(inputs=self.X,cell=cell, dtype=tf.float32, initial_state=self.hidden_layer) tensor_action, tensor_validation = tf.split(self.rnn[:,-1],2,1) feed_action = tf.layers.dense(tensor_action, output_size) feed_validation = tf.layers.dense(tensor_validation, 1) self.logits = feed_validation + tf.subtract(feed_action,tf.reduce_mean(feed_action,axis=1,keep_dims=True)) self.cost = tf.reduce_sum(tf.square(self.Y - self.logits)) self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost) class Agent: LEARNING_RATE = 0.003 BATCH_SIZE = 32 LAYER_SIZE = 256 OUTPUT_SIZE = 3 EPSILON = 0.5 DECAY_RATE = 0.005 MIN_EPSILON = 0.1 GAMMA = 0.99 MEMORIES = deque() COPY = 1000 T_COPY = 0 MEMORY_SIZE = 300 def __init__(self, state_size, window_size, trend, skip): self.state_size = state_size self.window_size = window_size self.half_window = window_size // 2 self.trend = trend self.skip = skip tf.reset_default_graph() self.INITIAL_FEATURES = np.zeros((4, self.state_size)) self.model = Model(self.state_size, self.OUTPUT_SIZE, self.LAYER_SIZE, self.LEARNING_RATE, 'real_model') self.model_negative = Model(self.state_size, self.OUTPUT_SIZE, self.LAYER_SIZE, self.LEARNING_RATE, 'negative_model') self.sess = tf.InteractiveSession() self.sess.run(tf.global_variables_initializer()) self.trainable = tf.trainable_variables() def _assign(self, from_name, to_name): from_w = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=from_name) to_w = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=to_name) for i in range(len(from_w)): assign_op = to_w[i].assign(from_w[i]) self.sess.run(assign_op) def _memorize(self, state, action, reward, new_state, dead, rnn_state): self.MEMORIES.append((state, action, reward, new_state, dead, rnn_state)) if len(self.MEMORIES) > self.MEMORY_SIZE: self.MEMORIES.popleft() def _select_action(self, state): if np.random.rand() < self.EPSILON: action = np.random.randint(self.OUTPUT_SIZE) else: action = self.get_predicted_action([state]) return action def _construct_memories(self, replay): states = np.array([a[0] for a in replay]) new_states = np.array([a[3] for a in replay]) init_values = np.array([a[-1] for a in replay]) Q = self.sess.run(self.model.logits, feed_dict={self.model.X:states, self.model.hidden_layer:init_values}) Q_new = self.sess.run(self.model.logits, feed_dict={self.model.X:new_states, self.model.hidden_layer:init_values}) Q_new_negative = self.sess.run(self.model_negative.logits, feed_dict={self.model_negative.X:new_states, self.model_negative.hidden_layer:init_values}) replay_size = len(replay) X = np.empty((replay_size, 4, self.state_size)) Y = np.empty((replay_size, self.OUTPUT_SIZE)) INIT_VAL = 
np.empty((replay_size, 2 * self.LAYER_SIZE)) for i in range(replay_size): state_r, action_r, reward_r, new_state_r, dead_r, rnn_memory = replay[i] target = Q[i] target[action_r] = reward_r if not dead_r: target[action_r] += self.GAMMA * Q_new_negative[i, np.argmax(Q_new[i])] X[i] = state_r Y[i] = target INIT_VAL[i] = rnn_memory return X, Y, INIT_VAL def get_state(self, t): window_size = self.window_size + 1 d = t - window_size + 1 block = self.trend[d : t + 1] if d >= 0 else -d * [self.trend[0]] + self.trend[0 : t + 1] res = [] for i in range(window_size - 1): res.append(block[i + 1] - block[i]) return np.array(res) def buy(self, initial_money): starting_money = initial_money states_sell = [] states_buy = [] inventory = [] state = self.get_state(0) init_value = np.zeros((1, 2 * self.LAYER_SIZE)) for k in range(self.INITIAL_FEATURES.shape[0]): self.INITIAL_FEATURES[k,:] = state for t in range(0, len(self.trend) - 1, self.skip): action, last_state = self.sess.run([self.model.logits,self.model.last_state], feed_dict={self.model.X:[self.INITIAL_FEATURES], self.model.hidden_layer:init_value}) action, init_value = np.argmax(action[0]), last_state next_state = self.get_state(t + 1) if action == 1 and initial_money >= self.trend[t]: inventory.append(self.trend[t]) initial_money -= self.trend[t] states_buy.append(t) print('day %d: buy 1 unit at price %f, total balance %f'% (t, self.trend[t], initial_money)) elif action == 2 and len(inventory): bought_price = inventory.pop(0) initial_money += self.trend[t] states_sell.append(t) try: invest = ((close[t] - bought_price) / bought_price) * 100 except: invest = 0 print( 'day %d, sell 1 unit at price %f, investment %f %%, total balance %f,' % (t, close[t], invest, initial_money) ) new_state = np.append([self.get_state(t + 1)], self.INITIAL_FEATURES[:3, :], axis = 0) self.INITIAL_FEATURES = new_state invest = ((initial_money - starting_money) / starting_money) * 100 total_gains = initial_money - starting_money return states_buy, states_sell, total_gains, invest def train(self, iterations, checkpoint, initial_money): for i in range(iterations): total_profit = 0 inventory = [] state = self.get_state(0) starting_money = initial_money init_value = np.zeros((1, 2 * self.LAYER_SIZE)) for k in range(self.INITIAL_FEATURES.shape[0]): self.INITIAL_FEATURES[k,:] = state for t in range(0, len(self.trend) - 1, self.skip): if (self.T_COPY + 1) % self.COPY == 0: self._assign('real_model', 'negative_model') if np.random.rand() < self.EPSILON: action = np.random.randint(self.OUTPUT_SIZE) else: action, last_state = self.sess.run([self.model.logits, self.model.last_state], feed_dict={self.model.X:[self.INITIAL_FEATURES], self.model.hidden_layer:init_value}) action, init_value = np.argmax(action[0]), last_state next_state = self.get_state(t + 1) if action == 1 and starting_money >= self.trend[t]: inventory.append(self.trend[t]) starting_money -= self.trend[t] elif action == 2 and len(inventory) > 0: bought_price = inventory.pop(0) total_profit += self.trend[t] - bought_price starting_money += self.trend[t] invest = ((starting_money - initial_money) / initial_money) new_state = np.append([self.get_state(t + 1)], self.INITIAL_FEATURES[:3, :], axis = 0) self._memorize(self.INITIAL_FEATURES, action, invest, new_state, starting_money < initial_money, init_value[0]) self.INITIAL_FEATURES = new_state batch_size = min(len(self.MEMORIES), self.BATCH_SIZE) replay = random.sample(self.MEMORIES, batch_size) X, Y, INIT_VAL = self._construct_memories(replay) cost, _ = 
self.sess.run([self.model.cost, self.model.optimizer], feed_dict={self.model.X: X, self.model.Y:Y, self.model.hidden_layer: INIT_VAL}) self.T_COPY += 1 self.EPSILON = self.MIN_EPSILON + (1.0 - self.MIN_EPSILON) * np.exp(-self.DECAY_RATE * i) if (i+1) % checkpoint == 0: print('epoch: %d, total rewards: %f.3, cost: %f, total money: %f'%(i + 1, total_profit, cost, starting_money)) close = df.Close.values.tolist() initial_money = 10000 window_size = 30 skip = 1 batch_size = 32 agent = Agent(state_size = window_size, window_size = window_size, trend = close, skip = skip) agent.train(iterations = 200, checkpoint = 10, initial_money = initial_money) states_buy, states_sell, total_gains, invest = agent.buy(initial_money = initial_money) fig = plt.figure(figsize = (15,5)) plt.plot(close, color='r', lw=2.) plt.plot(close, '^', markersize=10, color='m', label = 'buying signal', markevery = states_buy) plt.plot(close, 'v', markersize=10, color='k', label = 'selling signal', markevery = states_sell) plt.title('total gains %f, total investment %f%%'%(total_gains, invest)) plt.legend() plt.savefig('output/'+name+'.png') plt.show() from collections import deque import random df= df_full.copy() name = 'Actor-critic agent' class Actor: def __init__(self, name, input_size, output_size, size_layer): with tf.variable_scope(name): self.X = tf.placeholder(tf.float32, (None, input_size)) feed_actor = tf.layers.dense(self.X, size_layer, activation = tf.nn.relu) self.logits = tf.layers.dense(feed_actor, output_size) class Critic: def __init__(self, name, input_size, output_size, size_layer, learning_rate): with tf.variable_scope(name): self.X = tf.placeholder(tf.float32, (None, input_size)) self.Y = tf.placeholder(tf.float32, (None, output_size)) self.REWARD = tf.placeholder(tf.float32, (None, 1)) feed_critic = tf.layers.dense(self.X, size_layer, activation = tf.nn.relu) feed_critic = tf.layers.dense(feed_critic, output_size, activation = tf.nn.relu) + self.Y feed_critic = tf.layers.dense(feed_critic, size_layer//2, activation = tf.nn.relu) self.logits = tf.layers.dense(feed_critic, 1) self.cost = tf.reduce_mean(tf.square(self.REWARD - self.logits)) self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.cost) class Agent: LEARNING_RATE = 0.001 BATCH_SIZE = 32 LAYER_SIZE = 256 OUTPUT_SIZE = 3 EPSILON = 0.5 DECAY_RATE = 0.005 MIN_EPSILON = 0.1 GAMMA = 0.99 MEMORIES = deque() MEMORY_SIZE = 300 COPY = 1000 T_COPY = 0 def __init__(self, state_size, window_size, trend, skip): self.state_size = state_size self.window_size = window_size self.half_window = window_size // 2 self.trend = trend self.skip = skip tf.reset_default_graph() self.actor = Actor('actor-original', self.state_size, self.OUTPUT_SIZE, self.LAYER_SIZE) self.actor_target = Actor('actor-target', self.state_size, self.OUTPUT_SIZE, self.LAYER_SIZE) self.critic = Critic('critic-original', self.state_size, self.OUTPUT_SIZE, self.LAYER_SIZE, self.LEARNING_RATE) self.critic_target = Critic('critic-target', self.state_size, self.OUTPUT_SIZE, self.LAYER_SIZE, self.LEARNING_RATE) self.grad_critic = tf.gradients(self.critic.logits, self.critic.Y) self.actor_critic_grad = tf.placeholder(tf.float32, [None, self.OUTPUT_SIZE]) weights_actor = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='actor') self.grad_actor = tf.gradients(self.actor.logits, weights_actor, -self.actor_critic_grad) grads = zip(self.grad_actor, weights_actor) self.optimizer = tf.train.AdamOptimizer(self.LEARNING_RATE).apply_gradients(grads) self.sess = tf.InteractiveSession() 
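# --- Illustrative sketch (not part of the agent above) ---
# _assign() in the double/actor-critic agents hard-copies every trainable
# variable from the online scope into the target scope once every COPY (=1000)
# training steps; the target network is frozen in between. The same idea with
# plain dicts standing in for TensorFlow variable scopes:
import numpy as np

def hard_update(online_params, target_params):
    # overwrite each target weight with the current online weight
    for name, value in online_params.items():
        target_params[name] = value.copy()

online = {'w': np.ones((2, 2)), 'b': np.zeros(2)}
target = {'w': np.zeros((2, 2)), 'b': np.zeros(2)}
COPY, T_COPY = 1000, 999
if (T_COPY + 1) % COPY == 0:      # same trigger used inside train() above
    hard_update(online, target)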
self.sess.run(tf.global_variables_initializer()) def _assign(self, from_name, to_name): from_w = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=from_name) to_w = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=to_name) for i in range(len(from_w)): assign_op = to_w[i].assign(from_w[i]) self.sess.run(assign_op) def _memorize(self, state, action, reward, new_state, dead): self.MEMORIES.append((state, action, reward, new_state, dead)) if len(self.MEMORIES) > self.MEMORY_SIZE: self.MEMORIES.popleft() def _select_action(self, state): if np.random.rand() < self.EPSILON: action = np.random.randint(self.OUTPUT_SIZE) else: prediction = self.sess.run(self.actor.logits, feed_dict={self.actor.X:[state]})[0] action = np.argmax(prediction) return action def _construct_memories_and_train(self, replay): states = np.array([a[0] for a in replay]) new_states = np.array([a[3] for a in replay]) Q = self.sess.run(self.actor.logits, feed_dict={self.actor.X: states}) Q_target = self.sess.run(self.actor_target.logits, feed_dict={self.actor_target.X: states}) grads = self.sess.run(self.grad_critic, feed_dict={self.critic.X:states, self.critic.Y:Q})[0] self.sess.run(self.optimizer, feed_dict={self.actor.X:states, self.actor_critic_grad:grads}) rewards = np.array([a[2] for a in replay]).reshape((-1, 1)) rewards_target = self.sess.run(self.critic_target.logits, feed_dict={self.critic_target.X:new_states,self.critic_target.Y:Q_target}) for i in range(len(replay)): if not replay[0][-1]: rewards[i] += self.GAMMA * rewards_target[i] cost, _ = self.sess.run([self.critic.cost, self.critic.optimizer], feed_dict={self.critic.X:states, self.critic.Y:Q, self.critic.REWARD:rewards}) return cost def get_state(self, t): window_size = self.window_size + 1 d = t - window_size + 1 block = self.trend[d : t + 1] if d >= 0 else -d * [self.trend[0]] + self.trend[0 : t + 1] res = [] for i in range(window_size - 1): res.append(block[i + 1] - block[i]) return np.array(res) def buy(self, initial_money): starting_money = initial_money states_sell = [] states_buy = [] inventory = [] state = self.get_state(0) for t in range(0, len(self.trend) - 1, self.skip): action = self._select_action(state) next_state = self.get_state(t + 1) if action == 1 and initial_money >= self.trend[t]: inventory.append(self.trend[t]) initial_money -= self.trend[t] states_buy.append(t) print('day %d: buy 1 unit at price %f, total balance %f'% (t, self.trend[t], initial_money)) elif action == 2 and len(inventory): bought_price = inventory.pop(0) initial_money += self.trend[t] states_sell.append(t) try: invest = ((close[t] - bought_price) / bought_price) * 100 except: invest = 0 print( 'day %d, sell 1 unit at price %f, investment %f %%, total balance %f,' % (t, close[t], invest, initial_money) ) state = next_state invest = ((initial_money - starting_money) / starting_money) * 100 total_gains = initial_money - starting_money return states_buy, states_sell, total_gains, invest def train(self, iterations, checkpoint, initial_money): for i in range(iterations): total_profit = 0 inventory = [] state = self.get_state(0) starting_money = initial_money for t in range(0, len(self.trend) - 1, self.skip): if (self.T_COPY + 1) % self.COPY == 0: self._assign('actor-original', 'actor-target') self._assign('critic-original', 'critic-target') action = self._select_action(state) next_state = self.get_state(t + 1) if action == 1 and starting_money >= self.trend[t]: inventory.append(self.trend[t]) starting_money -= self.trend[t] elif action == 2 and len(inventory) > 0: 
bought_price = inventory.pop(0) total_profit += self.trend[t] - bought_price starting_money += self.trend[t] invest = ((starting_money - initial_money) / initial_money) self._memorize(state, action, invest, next_state, starting_money < initial_money) state = next_state batch_size = min(len(self.MEMORIES), self.BATCH_SIZE) replay = random.sample(self.MEMORIES, batch_size) cost = self._construct_memories_and_train(replay) self.T_COPY += 1 self.EPSILON = self.MIN_EPSILON + (1.0 - self.MIN_EPSILON) * np.exp(-self.DECAY_RATE * i) if (i+1) % checkpoint == 0: print('epoch: %d, total rewards: %f.3, cost: %f, total money: %f'%(i + 1, total_profit, cost, starting_money)) close = df.Close.values.tolist() initial_money = 10000 window_size = 30 skip = 1 batch_size = 32 agent = Agent(state_size = window_size, window_size = window_size, trend = close, skip = skip) agent.train(iterations = 200, checkpoint = 10, initial_money = initial_money) states_buy, states_sell, total_gains, invest = agent.buy(initial_money = initial_money) fig = plt.figure(figsize = (15,5)) plt.plot(close, color='r', lw=2.) plt.plot(close, '^', markersize=10, color='m', label = 'buying signal', markevery = states_buy) plt.plot(close, 'v', markersize=10, color='k', label = 'selling signal', markevery = states_sell) plt.title('total gains %f, total investment %f%%'%(total_gains, invest)) plt.legend() plt.savefig('output/'+name+'.png') plt.show() from collections import deque import random df= df_full.copy() name = 'Actor-critic Duel agent' class Actor: def __init__(self, name, input_size, output_size, size_layer): with tf.variable_scope(name): self.X = tf.placeholder(tf.float32, (None, input_size)) feed_actor = tf.layers.dense(self.X, size_layer, activation = tf.nn.relu) tensor_action, tensor_validation = tf.split(feed_actor,2,1) feed_action = tf.layers.dense(tensor_action, output_size) feed_validation = tf.layers.dense(tensor_validation, 1) self.logits = feed_validation + tf.subtract(feed_action, tf.reduce_mean(feed_action,axis=1,keep_dims=True)) class Critic: def __init__(self, name, input_size, output_size, size_layer, learning_rate): with tf.variable_scope(name): self.X = tf.placeholder(tf.float32, (None, input_size)) self.Y = tf.placeholder(tf.float32, (None, output_size)) self.REWARD = tf.placeholder(tf.float32, (None, 1)) feed_critic = tf.layers.dense(self.X, size_layer, activation = tf.nn.relu) tensor_action, tensor_validation = tf.split(feed_critic,2,1) feed_action = tf.layers.dense(tensor_action, output_size) feed_validation = tf.layers.dense(tensor_validation, 1) feed_critic = feed_validation + tf.subtract(feed_action,tf.reduce_mean(feed_action,axis=1,keep_dims=True)) feed_critic = tf.nn.relu(feed_critic) + self.Y feed_critic = tf.layers.dense(feed_critic, size_layer//2, activation = tf.nn.relu) self.logits = tf.layers.dense(feed_critic, 1) self.cost = tf.reduce_mean(tf.square(self.REWARD - self.logits)) self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.cost) class Agent: LEARNING_RATE = 0.001 BATCH_SIZE = 32 LAYER_SIZE = 256 OUTPUT_SIZE = 3 EPSILON = 0.5 DECAY_RATE = 0.005 MIN_EPSILON = 0.1 GAMMA = 0.99 MEMORIES = deque() MEMORY_SIZE = 300 COPY = 1000 T_COPY = 0 def __init__(self, state_size, window_size, trend, skip): self.state_size = state_size self.window_size = window_size self.half_window = window_size // 2 self.trend = trend self.skip = skip tf.reset_default_graph() self.actor = Actor('actor-original', self.state_size, self.OUTPUT_SIZE, self.LAYER_SIZE) self.actor_target = Actor('actor-target', 
self.state_size, self.OUTPUT_SIZE, self.LAYER_SIZE) self.critic = Critic('critic-original', self.state_size, self.OUTPUT_SIZE, self.LAYER_SIZE, self.LEARNING_RATE) self.critic_target = Critic('critic-target', self.state_size, self.OUTPUT_SIZE, self.LAYER_SIZE, self.LEARNING_RATE) self.grad_critic = tf.gradients(self.critic.logits, self.critic.Y) self.actor_critic_grad = tf.placeholder(tf.float32, [None, self.OUTPUT_SIZE]) weights_actor = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='actor') self.grad_actor = tf.gradients(self.actor.logits, weights_actor, -self.actor_critic_grad) grads = zip(self.grad_actor, weights_actor) self.optimizer = tf.train.AdamOptimizer(self.LEARNING_RATE).apply_gradients(grads) self.sess = tf.InteractiveSession() self.sess.run(tf.global_variables_initializer()) def _assign(self, from_name, to_name): from_w = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=from_name) to_w = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=to_name) for i in range(len(from_w)): assign_op = to_w[i].assign(from_w[i]) self.sess.run(assign_op) def _memorize(self, state, action, reward, new_state, dead): self.MEMORIES.append((state, action, reward, new_state, dead)) if len(self.MEMORIES) > self.MEMORY_SIZE: self.MEMORIES.popleft() def _select_action(self, state): if np.random.rand() < self.EPSILON: action = np.random.randint(self.OUTPUT_SIZE) else: prediction = self.sess.run(self.actor.logits, feed_dict={self.actor.X:[state]})[0] action = np.argmax(prediction) return action def _construct_memories_and_train(self, replay): states = np.array([a[0] for a in replay]) new_states = np.array([a[3] for a in replay]) Q = self.sess.run(self.actor.logits, feed_dict={self.actor.X: states}) Q_target = self.sess.run(self.actor_target.logits, feed_dict={self.actor_target.X: states}) grads = self.sess.run(self.grad_critic, feed_dict={self.critic.X:states, self.critic.Y:Q})[0] self.sess.run(self.optimizer, feed_dict={self.actor.X:states, self.actor_critic_grad:grads}) rewards = np.array([a[2] for a in replay]).reshape((-1, 1)) rewards_target = self.sess.run(self.critic_target.logits, feed_dict={self.critic_target.X:new_states,self.critic_target.Y:Q_target}) for i in range(len(replay)): if not replay[0][-1]: rewards[i] += self.GAMMA * rewards_target[i] cost, _ = self.sess.run([self.critic.cost, self.critic.optimizer], feed_dict={self.critic.X:states, self.critic.Y:Q, self.critic.REWARD:rewards}) return cost def get_state(self, t): window_size = self.window_size + 1 d = t - window_size + 1 block = self.trend[d : t + 1] if d >= 0 else -d * [self.trend[0]] + self.trend[0 : t + 1] res = [] for i in range(window_size - 1): res.append(block[i + 1] - block[i]) return np.array(res) def buy(self, initial_money): starting_money = initial_money states_sell = [] states_buy = [] inventory = [] state = self.get_state(0) for t in range(0, len(self.trend) - 1, self.skip): action = self._select_action(state) next_state = self.get_state(t + 1) if action == 1 and initial_money >= self.trend[t]: inventory.append(self.trend[t]) initial_money -= self.trend[t] states_buy.append(t) print('day %d: buy 1 unit at price %f, total balance %f'% (t, self.trend[t], initial_money)) elif action == 2 and len(inventory): bought_price = inventory.pop(0) initial_money += self.trend[t] states_sell.append(t) try: invest = ((close[t] - bought_price) / bought_price) * 100 except: invest = 0 print( 'day %d, sell 1 unit at price %f, investment %f %%, total balance %f,' % (t, close[t], invest, initial_money) ) state = 
next_state invest = ((initial_money - starting_money) / starting_money) * 100 total_gains = initial_money - starting_money return states_buy, states_sell, total_gains, invest def train(self, iterations, checkpoint, initial_money): for i in range(iterations): total_profit = 0 inventory = [] state = self.get_state(0) starting_money = initial_money for t in range(0, len(self.trend) - 1, self.skip): if (self.T_COPY + 1) % self.COPY == 0: self._assign('actor-original', 'actor-target') self._assign('critic-original', 'critic-target') action = self._select_action(state) next_state = self.get_state(t + 1) if action == 1 and starting_money >= self.trend[t]: inventory.append(self.trend[t]) starting_money -= self.trend[t] elif action == 2 and len(inventory) > 0: bought_price = inventory.pop(0) total_profit += self.trend[t] - bought_price starting_money += self.trend[t] invest = ((starting_money - initial_money) / initial_money) self._memorize(state, action, invest, next_state, starting_money < initial_money) batch_size = min(len(self.MEMORIES), self.BATCH_SIZE) state = next_state replay = random.sample(self.MEMORIES, batch_size) cost = self._construct_memories_and_train(replay) self.T_COPY += 1 self.EPSILON = self.MIN_EPSILON + (1.0 - self.MIN_EPSILON) * np.exp(-self.DECAY_RATE * i) if (i+1) % checkpoint == 0: print('epoch: %d, total rewards: %f.3, cost: %f, total money: %f'%(i + 1, total_profit, cost, starting_money)) close = df.Close.values.tolist() initial_money = 10000 window_size = 30 skip = 1 batch_size = 32 agent = Agent(state_size = window_size, window_size = window_size, trend = close, skip = skip) agent.train(iterations = 200, checkpoint = 10, initial_money = initial_money) states_buy, states_sell, total_gains, invest = agent.buy(initial_money = initial_money) fig = plt.figure(figsize = (15,5)) plt.plot(close, color='r', lw=2.) 
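# --- Illustrative sketch (not part of the agent above) ---
# The "Duel" variants split the last hidden layer into two heads: one predicts a
# state value V(s), the other per-action advantages A(s, a), recombined as
#   Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a)),
# which is what feed_validation + (feed_action - reduce_mean(feed_action)) computes.
# A NumPy rendering of that recombination:
import numpy as np

def dueling_q(value, advantages):
    # value: (batch, 1), advantages: (batch, n_actions)
    return value + (advantages - advantages.mean(axis=1, keepdims=True))

V = np.array([[1.0], [0.5]])
A = np.array([[0.2, -0.1, 0.5], [0.0, 0.3, -0.3]])
Q = dueling_q(V, A)     # subtracting the mean advantage keeps V identifiable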
plt.plot(close, '^', markersize=10, color='m', label = 'buying signal', markevery = states_buy) plt.plot(close, 'v', markersize=10, color='k', label = 'selling signal', markevery = states_sell) plt.title('total gains %f, total investment %f%%'%(total_gains, invest)) plt.legend() plt.savefig('output/'+name+'.png') plt.show() from collections import deque import random df= df_full.copy() name = 'Actor-critic Recurrent agent' class Actor: def __init__(self, name, input_size, output_size, size_layer): with tf.variable_scope(name): self.X = tf.placeholder(tf.float32, (None, None, input_size)) self.hidden_layer = tf.placeholder(tf.float32, (None, 2 * size_layer)) cell = tf.nn.rnn_cell.LSTMCell(size_layer, state_is_tuple = False) self.rnn,self.last_state = tf.nn.dynamic_rnn(inputs=self.X, cell=cell, dtype=tf.float32, initial_state=self.hidden_layer) self.logits = tf.layers.dense(self.rnn[:,-1], output_size) class Critic: def __init__(self, name, input_size, output_size, size_layer, learning_rate): with tf.variable_scope(name): self.X = tf.placeholder(tf.float32, (None, None, input_size)) self.Y = tf.placeholder(tf.float32, (None, output_size)) self.hidden_layer = tf.placeholder(tf.float32, (None, 2 * size_layer)) self.REWARD = tf.placeholder(tf.float32, (None, 1)) feed_critic = tf.layers.dense(self.X, size_layer, activation = tf.nn.relu) cell = tf.nn.rnn_cell.LSTMCell(size_layer, state_is_tuple = False) self.rnn,self.last_state = tf.nn.dynamic_rnn(inputs=self.X, cell=cell, dtype=tf.float32, initial_state=self.hidden_layer) feed_critic = tf.layers.dense(self.rnn[:,-1], output_size, activation = tf.nn.relu) + self.Y feed_critic = tf.layers.dense(feed_critic, size_layer//2, activation = tf.nn.relu) self.logits = tf.layers.dense(feed_critic, 1) self.cost = tf.reduce_mean(tf.square(self.REWARD - self.logits)) self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.cost) class Agent: LEARNING_RATE = 0.001 BATCH_SIZE = 32 LAYER_SIZE = 256 OUTPUT_SIZE = 3 EPSILON = 0.5 DECAY_RATE = 0.005 MIN_EPSILON = 0.1 GAMMA = 0.99 MEMORIES = deque() MEMORY_SIZE = 300 COPY = 1000 T_COPY = 0 def __init__(self, state_size, window_size, trend, skip): self.state_size = state_size self.window_size = window_size self.half_window = window_size // 2 self.trend = trend self.INITIAL_FEATURES = np.zeros((4, self.state_size)) self.skip = skip tf.reset_default_graph() self.actor = Actor('actor-original', self.state_size, self.OUTPUT_SIZE, self.LAYER_SIZE) self.actor_target = Actor('actor-target', self.state_size, self.OUTPUT_SIZE, self.LAYER_SIZE) self.critic = Critic('critic-original', self.state_size, self.OUTPUT_SIZE, self.LAYER_SIZE, self.LEARNING_RATE) self.critic_target = Critic('critic-target', self.state_size, self.OUTPUT_SIZE, self.LAYER_SIZE, self.LEARNING_RATE) self.grad_critic = tf.gradients(self.critic.logits, self.critic.Y) self.actor_critic_grad = tf.placeholder(tf.float32, [None, self.OUTPUT_SIZE]) weights_actor = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='actor') self.grad_actor = tf.gradients(self.actor.logits, weights_actor, -self.actor_critic_grad) grads = zip(self.grad_actor, weights_actor) self.optimizer = tf.train.AdamOptimizer(self.LEARNING_RATE).apply_gradients(grads) self.sess = tf.InteractiveSession() self.sess.run(tf.global_variables_initializer()) def _assign(self, from_name, to_name): from_w = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=from_name) to_w = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=to_name) for i in range(len(from_w)): assign_op 
= to_w[i].assign(from_w[i]) self.sess.run(assign_op) def _memorize(self, state, action, reward, new_state, dead, rnn_state): self.MEMORIES.append((state, action, reward, new_state, dead, rnn_state)) if len(self.MEMORIES) > self.MEMORY_SIZE: self.MEMORIES.popleft() def _select_action(self, state): if np.random.rand() < self.EPSILON: action = np.random.randint(self.OUTPUT_SIZE) else: prediction = self.sess.run(self.actor.logits, feed_dict={self.actor.X:[state]})[0] action = np.argmax(prediction) return action def _construct_memories_and_train(self, replay): states = np.array([a[0] for a in replay]) new_states = np.array([a[3] for a in replay]) init_values = np.array([a[-1] for a in replay]) Q = self.sess.run(self.actor.logits, feed_dict={self.actor.X: states, self.actor.hidden_layer: init_values}) Q_target = self.sess.run(self.actor_target.logits, feed_dict={self.actor_target.X: states, self.actor_target.hidden_layer: init_values}) grads = self.sess.run(self.grad_critic, feed_dict={self.critic.X:states, self.critic.Y:Q, self.critic.hidden_layer: init_values})[0] self.sess.run(self.optimizer, feed_dict={self.actor.X:states, self.actor_critic_grad:grads, self.actor.hidden_layer: init_values}) rewards = np.array([a[2] for a in replay]).reshape((-1, 1)) rewards_target = self.sess.run(self.critic_target.logits, feed_dict={self.critic_target.X:new_states,self.critic_target.Y:Q_target, self.critic_target.hidden_layer: init_values}) for i in range(len(replay)): if not replay[0][-2]: rewards[i] += self.GAMMA * rewards_target[i] cost, _ = self.sess.run([self.critic.cost, self.critic.optimizer], feed_dict={self.critic.X:states, self.critic.Y:Q, self.critic.REWARD:rewards, self.critic.hidden_layer: init_values}) return cost def get_state(self, t): window_size = self.window_size + 1 d = t - window_size + 1 block = self.trend[d : t + 1] if d >= 0 else -d * [self.trend[0]] + self.trend[0 : t + 1] res = [] for i in range(window_size - 1): res.append(block[i + 1] - block[i]) return np.array(res) def buy(self, initial_money): starting_money = initial_money states_sell = [] states_buy = [] inventory = [] state = self.get_state(0) init_value = np.zeros((1, 2 * self.LAYER_SIZE)) for k in range(self.INITIAL_FEATURES.shape[0]): self.INITIAL_FEATURES[k,:] = state for t in range(0, len(self.trend) - 1, self.skip): if np.random.rand() < self.EPSILON: action = np.random.randint(self.OUTPUT_SIZE) else: action, last_state = self.sess.run([self.actor.logits, self.actor.last_state], feed_dict={self.actor.X:[self.INITIAL_FEATURES], self.actor.hidden_layer:init_value}) action, init_value = np.argmax(action[0]), last_state next_state = self.get_state(t + 1) if action == 1 and initial_money >= self.trend[t]: inventory.append(self.trend[t]) initial_money -= self.trend[t] states_buy.append(t) print('day %d: buy 1 unit at price %f, total balance %f'% (t, self.trend[t], initial_money)) elif action == 2 and len(inventory): bought_price = inventory.pop(0) initial_money += self.trend[t] states_sell.append(t) try: invest = ((close[t] - bought_price) / bought_price) * 100 except: invest = 0 print( 'day %d, sell 1 unit at price %f, investment %f %%, total balance %f,' % (t, close[t], invest, initial_money) ) new_state = np.append([self.get_state(t + 1)], self.INITIAL_FEATURES[:3, :], axis = 0) self.INITIAL_FEATURES = new_state invest = ((initial_money - starting_money) / starting_money) * 100 total_gains = initial_money - starting_money return states_buy, states_sell, total_gains, invest def train(self, iterations, checkpoint, 
initial_money): for i in range(iterations): total_profit = 0 inventory = [] state = self.get_state(0) starting_money = initial_money init_value = np.zeros((1, 2 * self.LAYER_SIZE)) for k in range(self.INITIAL_FEATURES.shape[0]): self.INITIAL_FEATURES[k,:] = state for t in range(0, len(self.trend) - 1, self.skip): if (self.T_COPY + 1) % self.COPY == 0: self._assign('actor-original', 'actor-target') self._assign('critic-original', 'critic-target') if np.random.rand() < self.EPSILON: action = np.random.randint(self.OUTPUT_SIZE) else: action, last_state = self.sess.run([self.actor.logits, self.actor.last_state], feed_dict={self.actor.X:[self.INITIAL_FEATURES], self.actor.hidden_layer:init_value}) action, init_value = np.argmax(action[0]), last_state next_state = self.get_state(t + 1) if action == 1 and starting_money >= self.trend[t]: inventory.append(self.trend[t]) starting_money -= self.trend[t] elif action == 2 and len(inventory) > 0: bought_price = inventory.pop(0) total_profit += self.trend[t] - bought_price starting_money += self.trend[t] invest = ((starting_money - initial_money) / initial_money) new_state = np.append([self.get_state(t + 1)], self.INITIAL_FEATURES[:3, :], axis = 0) self._memorize(self.INITIAL_FEATURES, action, invest, new_state, starting_money < initial_money, init_value[0]) batch_size = min(len(self.MEMORIES), self.BATCH_SIZE) self.INITIAL_FEATURES = new_state replay = random.sample(self.MEMORIES, batch_size) cost = self._construct_memories_and_train(replay) self.T_COPY += 1 self.EPSILON = self.MIN_EPSILON + (1.0 - self.MIN_EPSILON) * np.exp(-self.DECAY_RATE * i) if (i+1) % checkpoint == 0: print('epoch: %d, total rewards: %f.3, cost: %f, total money: %f'%(i + 1, total_profit, cost, starting_money)) close = df.Close.values.tolist() initial_money = 10000 window_size = 30 skip = 1 batch_size = 32 agent = Agent(state_size = window_size, window_size = window_size, trend = close, skip = skip) agent.train(iterations = 200, checkpoint = 10, initial_money = initial_money) states_buy, states_sell, total_gains, invest = agent.buy(initial_money = initial_money) fig = plt.figure(figsize = (15,5)) plt.plot(close, color='r', lw=2.) 
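# --- Illustrative sketch (not part of the agent above) ---
# Every agent in this notebook anneals exploration with
#   EPSILON = MIN_EPSILON + (1 - MIN_EPSILON) * exp(-DECAY_RATE * epoch),
# so epsilon jumps to 1.0 after the first epoch (the initial EPSILON = 0.5 class
# attribute is overwritten) and decays towards MIN_EPSILON over training:
import numpy as np

MIN_EPSILON, DECAY_RATE = 0.1, 0.005
epochs = np.arange(200)
epsilon = MIN_EPSILON + (1.0 - MIN_EPSILON) * np.exp(-DECAY_RATE * epochs)
# epsilon[0] = 1.0, epsilon[199] is about 0.43 with these constants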
plt.plot(close, '^', markersize=10, color='m', label = 'buying signal', markevery = states_buy) plt.plot(close, 'v', markersize=10, color='k', label = 'selling signal', markevery = states_sell) plt.title('total gains %f, total investment %f%%'%(total_gains, invest)) plt.legend() plt.savefig('output/'+name+'.png') plt.show() from collections import deque import random df= df_full.copy() name = 'Actor-critic Duel Recurrent agent' class Actor: def __init__(self, name, input_size, output_size, size_layer): with tf.variable_scope(name): self.X = tf.placeholder(tf.float32, (None, None, input_size)) self.hidden_layer = tf.placeholder(tf.float32, (None, 2 * size_layer)) cell = tf.nn.rnn_cell.LSTMCell(size_layer, state_is_tuple = False) self.rnn,self.last_state = tf.nn.dynamic_rnn(inputs=self.X, cell=cell, dtype=tf.float32, initial_state=self.hidden_layer) tensor_action, tensor_validation = tf.split(self.rnn[:,-1],2,1) feed_action = tf.layers.dense(tensor_action, output_size) feed_validation = tf.layers.dense(tensor_validation, 1) self.logits = feed_validation + tf.subtract(feed_action, tf.reduce_mean(feed_action,axis=1,keep_dims=True)) class Critic: def __init__(self, name, input_size, output_size, size_layer, learning_rate): with tf.variable_scope(name): self.X = tf.placeholder(tf.float32, (None, None, input_size)) self.Y = tf.placeholder(tf.float32, (None, output_size)) self.hidden_layer = tf.placeholder(tf.float32, (None, 2 * size_layer)) self.REWARD = tf.placeholder(tf.float32, (None, 1)) feed_critic = tf.layers.dense(self.X, size_layer, activation = tf.nn.relu) cell = tf.nn.rnn_cell.LSTMCell(size_layer, state_is_tuple = False) self.rnn,self.last_state = tf.nn.dynamic_rnn(inputs=self.X, cell=cell, dtype=tf.float32, initial_state=self.hidden_layer) tensor_action, tensor_validation = tf.split(self.rnn[:,-1],2,1) feed_action = tf.layers.dense(tensor_action, output_size) feed_validation = tf.layers.dense(tensor_validation, 1) feed_critic = feed_validation + tf.subtract(feed_action,tf.reduce_mean(feed_action,axis=1,keep_dims=True)) feed_critic = tf.nn.relu(feed_critic) + self.Y feed_critic = tf.layers.dense(feed_critic, size_layer//2, activation = tf.nn.relu) self.logits = tf.layers.dense(feed_critic, 1) self.cost = tf.reduce_mean(tf.square(self.REWARD - self.logits)) self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.cost) class Agent: LEARNING_RATE = 0.001 BATCH_SIZE = 32 LAYER_SIZE = 256 OUTPUT_SIZE = 3 EPSILON = 0.5 DECAY_RATE = 0.005 MIN_EPSILON = 0.1 GAMMA = 0.99 MEMORIES = deque() MEMORY_SIZE = 300 COPY = 1000 T_COPY = 0 def __init__(self, state_size, window_size, trend, skip): self.state_size = state_size self.window_size = window_size self.half_window = window_size // 2 self.trend = trend self.INITIAL_FEATURES = np.zeros((4, self.state_size)) self.skip = skip tf.reset_default_graph() self.actor = Actor('actor-original', self.state_size, self.OUTPUT_SIZE, self.LAYER_SIZE) self.actor_target = Actor('actor-target', self.state_size, self.OUTPUT_SIZE, self.LAYER_SIZE) self.critic = Critic('critic-original', self.state_size, self.OUTPUT_SIZE, self.LAYER_SIZE, self.LEARNING_RATE) self.critic_target = Critic('critic-target', self.state_size, self.OUTPUT_SIZE, self.LAYER_SIZE, self.LEARNING_RATE) self.grad_critic = tf.gradients(self.critic.logits, self.critic.Y) self.actor_critic_grad = tf.placeholder(tf.float32, [None, self.OUTPUT_SIZE]) weights_actor = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='actor') self.grad_actor = tf.gradients(self.actor.logits, 
weights_actor, -self.actor_critic_grad) grads = zip(self.grad_actor, weights_actor) self.optimizer = tf.train.AdamOptimizer(self.LEARNING_RATE).apply_gradients(grads) self.sess = tf.InteractiveSession() self.sess.run(tf.global_variables_initializer()) def _assign(self, from_name, to_name): from_w = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=from_name) to_w = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=to_name) for i in range(len(from_w)): assign_op = to_w[i].assign(from_w[i]) self.sess.run(assign_op) def _memorize(self, state, action, reward, new_state, dead, rnn_state): self.MEMORIES.append((state, action, reward, new_state, dead, rnn_state)) if len(self.MEMORIES) > self.MEMORY_SIZE: self.MEMORIES.popleft() def _select_action(self, state): if np.random.rand() < self.EPSILON: action = np.random.randint(self.OUTPUT_SIZE) else: prediction = self.sess.run(self.actor.logits, feed_dict={self.actor.X:[state]})[0] action = np.argmax(prediction) return action def _construct_memories_and_train(self, replay): states = np.array([a[0] for a in replay]) new_states = np.array([a[3] for a in replay]) init_values = np.array([a[-1] for a in replay]) Q = self.sess.run(self.actor.logits, feed_dict={self.actor.X: states, self.actor.hidden_layer: init_values}) Q_target = self.sess.run(self.actor_target.logits, feed_dict={self.actor_target.X: states, self.actor_target.hidden_layer: init_values}) grads = self.sess.run(self.grad_critic, feed_dict={self.critic.X:states, self.critic.Y:Q, self.critic.hidden_layer: init_values})[0] self.sess.run(self.optimizer, feed_dict={self.actor.X:states, self.actor_critic_grad:grads, self.actor.hidden_layer: init_values}) rewards = np.array([a[2] for a in replay]).reshape((-1, 1)) rewards_target = self.sess.run(self.critic_target.logits, feed_dict={self.critic_target.X:new_states,self.critic_target.Y:Q_target, self.critic_target.hidden_layer: init_values}) for i in range(len(replay)): if not replay[0][-2]: rewards[i] += self.GAMMA * rewards_target[i] cost, _ = self.sess.run([self.critic.cost, self.critic.optimizer], feed_dict={self.critic.X:states, self.critic.Y:Q, self.critic.REWARD:rewards, self.critic.hidden_layer: init_values}) return cost def get_state(self, t): window_size = self.window_size + 1 d = t - window_size + 1 block = self.trend[d : t + 1] if d >= 0 else -d * [self.trend[0]] + self.trend[0 : t + 1] res = [] for i in range(window_size - 1): res.append(block[i + 1] - block[i]) return np.array(res) def buy(self, initial_money): starting_money = initial_money states_sell = [] states_buy = [] inventory = [] state = self.get_state(0) init_value = np.zeros((1, 2 * self.LAYER_SIZE)) for k in range(self.INITIAL_FEATURES.shape[0]): self.INITIAL_FEATURES[k,:] = state for t in range(0, len(self.trend) - 1, self.skip): if np.random.rand() < self.EPSILON: action = np.random.randint(self.OUTPUT_SIZE) else: action, last_state = self.sess.run([self.actor.logits, self.actor.last_state], feed_dict={self.actor.X:[self.INITIAL_FEATURES], self.actor.hidden_layer:init_value}) action, init_value = np.argmax(action[0]), last_state next_state = self.get_state(t + 1) if action == 1 and initial_money >= self.trend[t]: inventory.append(self.trend[t]) initial_money -= self.trend[t] states_buy.append(t) print('day %d: buy 1 unit at price %f, total balance %f'% (t, self.trend[t], initial_money)) elif action == 2 and len(inventory): bought_price = inventory.pop(0) initial_money += self.trend[t] states_sell.append(t) try: invest = ((close[t] - bought_price) / 
bought_price) * 100 except: invest = 0 print( 'day %d, sell 1 unit at price %f, investment %f %%, total balance %f,' % (t, close[t], invest, initial_money) ) new_state = np.append([self.get_state(t + 1)], self.INITIAL_FEATURES[:3, :], axis = 0) self.INITIAL_FEATURES = new_state invest = ((initial_money - starting_money) / starting_money) * 100 total_gains = initial_money - starting_money return states_buy, states_sell, total_gains, invest def train(self, iterations, checkpoint, initial_money): for i in range(iterations): total_profit = 0 inventory = [] state = self.get_state(0) starting_money = initial_money init_value = np.zeros((1, 2 * self.LAYER_SIZE)) for k in range(self.INITIAL_FEATURES.shape[0]): self.INITIAL_FEATURES[k,:] = state for t in range(0, len(self.trend) - 1, self.skip): if (self.T_COPY + 1) % self.COPY == 0: self._assign('actor-original', 'actor-target') self._assign('critic-original', 'critic-target') if np.random.rand() < self.EPSILON: action = np.random.randint(self.OUTPUT_SIZE) else: action, last_state = self.sess.run([self.actor.logits, self.actor.last_state], feed_dict={self.actor.X:[self.INITIAL_FEATURES], self.actor.hidden_layer:init_value}) action, init_value = np.argmax(action[0]), last_state next_state = self.get_state(t + 1) if action == 1 and starting_money >= self.trend[t]: inventory.append(self.trend[t]) starting_money -= self.trend[t] elif action == 2 and len(inventory) > 0: bought_price = inventory.pop(0) total_profit += self.trend[t] - bought_price starting_money += self.trend[t] invest = ((starting_money - initial_money) / initial_money) new_state = np.append([self.get_state(t + 1)], self.INITIAL_FEATURES[:3, :], axis = 0) self._memorize(self.INITIAL_FEATURES, action, invest, new_state, starting_money < initial_money, init_value[0]) batch_size = min(len(self.MEMORIES), self.BATCH_SIZE) self.INITIAL_FEATURES = new_state replay = random.sample(self.MEMORIES, batch_size) cost = self._construct_memories_and_train(replay) self.T_COPY += 1 self.EPSILON = self.MIN_EPSILON + (1.0 - self.MIN_EPSILON) * np.exp(-self.DECAY_RATE * i) if (i+1) % checkpoint == 0: print('epoch: %d, total rewards: %f.3, cost: %f, total money: %f'%(i + 1, total_profit, cost, starting_money)) close = df.Close.values.tolist() initial_money = 10000 window_size = 30 skip = 1 batch_size = 32 agent = Agent(state_size = window_size, window_size = window_size, trend = close, skip = skip) agent.train(iterations = 200, checkpoint = 10, initial_money = initial_money) states_buy, states_sell, total_gains, invest = agent.buy(initial_money = initial_money) fig = plt.figure(figsize = (15,5)) plt.plot(close, color='r', lw=2.) 
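# --- Illustrative sketch (not part of the agent above) ---
# _memorize() in the agents above caps the replay buffer at MEMORY_SIZE by
# appending to a deque and calling popleft() on overflow; random.sample then
# draws a training batch. A deque with maxlen gives the same behaviour without
# the manual popleft:
import random
from collections import deque

MEMORY_SIZE, BATCH_SIZE = 300, 32
memories = deque(maxlen=MEMORY_SIZE)    # old transitions are dropped automatically

for t in range(1000):
    memories.append((t, 'state', 'action', 'reward'))   # placeholder transition

batch = random.sample(memories, min(len(memories), BATCH_SIZE))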
plt.plot(close, '^', markersize=10, color='m', label = 'buying signal', markevery = states_buy) plt.plot(close, 'v', markersize=10, color='k', label = 'selling signal', markevery = states_sell) plt.title('total gains %f, total investment %f%%'%(total_gains, invest)) plt.legend() plt.savefig('output/'+name+'.png') plt.show() from collections import deque import random df= df_full.copy() name = 'Curiosity Q-learning agent' class Agent: LEARNING_RATE = 0.003 BATCH_SIZE = 32 LAYER_SIZE = 500 OUTPUT_SIZE = 3 EPSILON = 0.5 DECAY_RATE = 0.005 MIN_EPSILON = 0.1 GAMMA = 0.99 MEMORIES = deque() COPY = 1000 T_COPY = 0 MEMORY_SIZE = 300 def __init__(self, state_size, window_size, trend, skip): self.state_size = state_size self.window_size = window_size self.half_window = window_size // 2 self.trend = trend self.skip = skip tf.reset_default_graph() self.X = tf.placeholder(tf.float32, (None, self.state_size)) self.Y = tf.placeholder(tf.float32, (None, self.state_size)) self.ACTION = tf.placeholder(tf.float32, (None)) self.REWARD = tf.placeholder(tf.float32, (None)) self.batch_size = tf.shape(self.ACTION)[0] with tf.variable_scope('curiosity_model'): action = tf.reshape(self.ACTION, (-1,1)) state_action = tf.concat([self.X, action], axis=1) save_state = tf.identity(self.Y) feed = tf.layers.dense(state_action, 32, activation=tf.nn.relu) self.curiosity_logits = tf.layers.dense(feed, self.state_size) self.curiosity_cost = tf.reduce_sum(tf.square(save_state - self.curiosity_logits), axis=1) self.curiosity_optimizer = tf.train.RMSPropOptimizer(self.LEARNING_RATE)\ .minimize(tf.reduce_mean(self.curiosity_cost)) total_reward = tf.add(self.curiosity_cost, self.REWARD) with tf.variable_scope("q_model"): with tf.variable_scope("eval_net"): x_action = tf.layers.dense(self.X, 128, tf.nn.relu) self.logits = tf.layers.dense(x_action, self.OUTPUT_SIZE) with tf.variable_scope("target_net"): y_action = tf.layers.dense(self.Y, 128, tf.nn.relu) y_q = tf.layers.dense(y_action, self.OUTPUT_SIZE) q_target = total_reward + self.GAMMA * tf.reduce_max(y_q, axis=1) action = tf.cast(self.ACTION, tf.int32) action_indices = tf.stack([tf.range(self.batch_size, dtype=tf.int32), action], axis=1) q = tf.gather_nd(params=self.logits, indices=action_indices) self.cost = tf.losses.mean_squared_error(labels=q_target, predictions=q) self.optimizer = tf.train.RMSPropOptimizer(self.LEARNING_RATE).minimize( self.cost, var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "q_model/eval_net")) t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_model/target_net') e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_model/eval_net') self.target_replace_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)] self.sess = tf.InteractiveSession() self.sess.run(tf.global_variables_initializer()) def _memorize(self, state, action, reward, new_state, done): self.MEMORIES.append((state, action, reward, new_state, done)) if len(self.MEMORIES) > self.MEMORY_SIZE: self.MEMORIES.popleft() def get_state(self, t): window_size = self.window_size + 1 d = t - window_size + 1 block = self.trend[d : t + 1] if d >= 0 else -d * [self.trend[0]] + self.trend[0 : t + 1] res = [] for i in range(window_size - 1): res.append(block[i + 1] - block[i]) return np.array(res) def predict(self, inputs): return self.sess.run(self.logits, feed_dict={self.X:inputs}) def get_predicted_action(self, sequence): prediction = self.predict(np.array(sequence))[0] return np.argmax(prediction) def _select_action(self, state): if np.random.rand() < 
self.EPSILON: action = np.random.randint(self.OUTPUT_SIZE) else: action = self.get_predicted_action([state]) return action def _construct_memories(self, replay): states = np.array([a[0] for a in replay]) actions = np.array([a[1] for a in replay]) rewards = np.array([a[2] for a in replay]) new_states = np.array([a[3] for a in replay]) if (self.T_COPY + 1) % self.COPY == 0: self.sess.run(self.target_replace_op) cost, _ = self.sess.run([self.cost, self.optimizer], feed_dict = { self.X: states, self.Y: new_states, self.ACTION: actions, self.REWARD: rewards }) if (self.T_COPY + 1) % self.COPY == 0: self.sess.run(self.curiosity_optimizer, feed_dict = { self.X: states, self.Y: new_states, self.ACTION: actions, self.REWARD: rewards }) return cost def buy(self, initial_money): starting_money = initial_money states_sell = [] states_buy = [] inventory = [] state = self.get_state(0) for t in range(0, len(self.trend) - 1, self.skip): action = self._select_action(state) next_state = self.get_state(t + 1) if action == 1 and initial_money >= self.trend[t]: inventory.append(self.trend[t]) initial_money -= self.trend[t] states_buy.append(t) print('day %d: buy 1 unit at price %f, total balance %f'% (t, self.trend[t], initial_money)) elif action == 2 and len(inventory): bought_price = inventory.pop(0) initial_money += self.trend[t] states_sell.append(t) try: invest = ((close[t] - bought_price) / bought_price) * 100 except: invest = 0 print( 'day %d, sell 1 unit at price %f, investment %f %%, total balance %f,' % (t, close[t], invest, initial_money) ) state = next_state invest = ((initial_money - starting_money) / starting_money) * 100 total_gains = initial_money - starting_money return states_buy, states_sell, total_gains, invest def train(self, iterations, checkpoint, initial_money): for i in range(iterations): total_profit = 0 inventory = [] state = self.get_state(0) starting_money = initial_money for t in range(0, len(self.trend) - 1, self.skip): action = self._select_action(state) next_state = self.get_state(t + 1) if action == 1 and starting_money >= self.trend[t]: inventory.append(self.trend[t]) starting_money -= self.trend[t] elif action == 2 and len(inventory) > 0: bought_price = inventory.pop(0) total_profit += self.trend[t] - bought_price starting_money += self.trend[t] invest = ((starting_money - initial_money) / initial_money) self._memorize(state, action, invest, next_state, starting_money < initial_money) batch_size = min(len(self.MEMORIES), self.BATCH_SIZE) state = next_state replay = random.sample(self.MEMORIES, batch_size) cost = self._construct_memories(replay) self.T_COPY += 1 self.EPSILON = self.MIN_EPSILON + (1.0 - self.MIN_EPSILON) * np.exp(-self.DECAY_RATE * i) if (i+1) % checkpoint == 0: print('epoch: %d, total rewards: %f.3, cost: %f, total money: %f'%(i + 1, total_profit, cost, starting_money)) close = df.Close.values.tolist() initial_money = 10000 window_size = 30 skip = 1 batch_size = 32 agent = Agent(state_size = window_size, window_size = window_size, trend = close, skip = skip) agent.train(iterations = 200, checkpoint = 10, initial_money = initial_money) states_buy, states_sell, total_gains, invest = agent.buy(initial_money = initial_money) fig = plt.figure(figsize = (15,5)) plt.plot(close, color='r', lw=2.) 
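# --- Illustrative sketch (not part of the agent above) ---
# The Curiosity Q-learning agent above trains a small forward model that predicts
# the next state from (state, action); its squared prediction error is added to
# the trading reward (total_reward = curiosity_cost + REWARD), so transitions the
# model predicts poorly earn an exploration bonus. That reward shaping in NumPy:
import numpy as np

def curiosity_reward(predicted_next_state, next_state, extrinsic_reward):
    intrinsic = np.sum((next_state - predicted_next_state) ** 2, axis=1)
    return extrinsic_reward + intrinsic   # bonus shrinks as the forward model improves

rng = np.random.default_rng(1)
next_s = rng.normal(size=(4, 30))                       # batch of 30-dim states
pred_s = next_s + rng.normal(scale=0.1, size=(4, 30))   # imperfect forward model
shaped = curiosity_reward(pred_s, next_s, extrinsic_reward=np.zeros(4))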
plt.plot(close, '^', markersize=10, color='m', label = 'buying signal', markevery = states_buy) plt.plot(close, 'v', markersize=10, color='k', label = 'selling signal', markevery = states_sell) plt.title('total gains %f, total investment %f%%'%(total_gains, invest)) plt.legend() plt.savefig('output/'+name+'.png') plt.show() from collections import deque import random df= df_full.copy() name = 'Recurrent Curiosity Q-learning agent' class Agent: LEARNING_RATE = 0.003 BATCH_SIZE = 32 LAYER_SIZE = 128 OUTPUT_SIZE = 3 EPSILON = 0.5 DECAY_RATE = 0.005 MIN_EPSILON = 0.1 GAMMA = 0.99 MEMORIES = deque() COPY = 1000 T_COPY = 0 MEMORY_SIZE = 300 def __init__(self, state_size, window_size, trend, skip): self.state_size = state_size self.window_size = window_size self.half_window = window_size // 2 self.trend = trend self.skip = skip tf.reset_default_graph() self.INITIAL_FEATURES = np.zeros((4, self.state_size)) self.X = tf.placeholder(tf.float32, (None, None, self.state_size)) self.Y = tf.placeholder(tf.float32, (None, None, self.state_size)) self.hidden_layer = tf.placeholder(tf.float32, (None, 2 * self.LAYER_SIZE)) self.ACTION = tf.placeholder(tf.float32, (None)) self.REWARD = tf.placeholder(tf.float32, (None)) self.batch_size = tf.shape(self.ACTION)[0] self.seq_len = tf.shape(self.X)[1] with tf.variable_scope('curiosity_model'): action = tf.reshape(self.ACTION, (-1,1,1)) repeat_action = tf.tile(action, [1,self.seq_len,1]) state_action = tf.concat([self.X, repeat_action], axis=-1) save_state = tf.identity(self.Y) cell = tf.nn.rnn_cell.LSTMCell(self.LAYER_SIZE, state_is_tuple = False) self.rnn,last_state = tf.nn.dynamic_rnn(inputs=state_action,cell=cell, dtype=tf.float32, initial_state=self.hidden_layer) self.curiosity_logits = tf.layers.dense(self.rnn[:,-1], self.state_size) self.curiosity_cost = tf.reduce_sum(tf.square(save_state[:,-1] - self.curiosity_logits), axis=1) self.curiosity_optimizer = tf.train.RMSPropOptimizer(self.LEARNING_RATE)\ .minimize(tf.reduce_mean(self.curiosity_cost)) total_reward = tf.add(self.curiosity_cost, self.REWARD) with tf.variable_scope("q_model"): with tf.variable_scope("eval_net"): cell = tf.nn.rnn_cell.LSTMCell(self.LAYER_SIZE, state_is_tuple = False) rnn,self.last_state = tf.nn.dynamic_rnn(inputs=self.X,cell=cell, dtype=tf.float32, initial_state=self.hidden_layer) self.logits = tf.layers.dense(rnn[:,-1], self.OUTPUT_SIZE) with tf.variable_scope("target_net"): cell = tf.nn.rnn_cell.LSTMCell(self.LAYER_SIZE, state_is_tuple = False) rnn,last_state = tf.nn.dynamic_rnn(inputs=self.Y,cell=cell, dtype=tf.float32, initial_state=self.hidden_layer) y_q = tf.layers.dense(rnn[:,-1], self.OUTPUT_SIZE) q_target = total_reward + self.GAMMA * tf.reduce_max(y_q, axis=1) action = tf.cast(self.ACTION, tf.int32) action_indices = tf.stack([tf.range(self.batch_size, dtype=tf.int32), action], axis=1) q = tf.gather_nd(params=self.logits, indices=action_indices) self.cost = tf.losses.mean_squared_error(labels=q_target, predictions=q) self.optimizer = tf.train.RMSPropOptimizer(self.LEARNING_RATE).minimize( self.cost, var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "q_model/eval_net")) t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_model/target_net') e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_model/eval_net') self.target_replace_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)] self.sess = tf.InteractiveSession() self.sess.run(tf.global_variables_initializer()) def _memorize(self, state, action, reward, new_state, done, 
rnn_state): self.MEMORIES.append((state, action, reward, new_state, done, rnn_state)) if len(self.MEMORIES) > self.MEMORY_SIZE: self.MEMORIES.popleft() def get_state(self, t): window_size = self.window_size + 1 d = t - window_size + 1 block = self.trend[d : t + 1] if d >= 0 else -d * [self.trend[0]] + self.trend[0 : t + 1] res = [] for i in range(window_size - 1): res.append(block[i + 1] - block[i]) return np.array(res) def _construct_memories(self, replay): states = np.array([a[0] for a in replay]) actions = np.array([a[1] for a in replay]) rewards = np.array([a[2] for a in replay]) new_states = np.array([a[3] for a in replay]) init_values = np.array([a[-1] for a in replay]) if (self.T_COPY + 1) % self.COPY == 0: self.sess.run(self.target_replace_op) cost, _ = self.sess.run([self.cost, self.optimizer], feed_dict = { self.X: states, self.Y: new_states, self.ACTION: actions, self.REWARD: rewards, self.hidden_layer: init_values }) if (self.T_COPY + 1) % self.COPY == 0: self.sess.run(self.curiosity_optimizer, feed_dict = { self.X: states, self.Y: new_states, self.ACTION: actions, self.REWARD: rewards, self.hidden_layer: init_values }) return cost def buy(self, initial_money): starting_money = initial_money states_sell = [] states_buy = [] inventory = [] state = self.get_state(0) init_value = np.zeros((1, 2 * self.LAYER_SIZE)) for k in range(self.INITIAL_FEATURES.shape[0]): self.INITIAL_FEATURES[k,:] = state for t in range(0, len(self.trend) - 1, self.skip): if np.random.rand() < self.EPSILON: action = np.random.randint(self.OUTPUT_SIZE) else: action, last_state = self.sess.run([self.logits, self.last_state], feed_dict={self.X:[self.INITIAL_FEATURES], self.hidden_layer:init_value}) action, init_value = np.argmax(action[0]), last_state next_state = self.get_state(t + 1) if action == 1 and initial_money >= self.trend[t]: inventory.append(self.trend[t]) initial_money -= self.trend[t] states_buy.append(t) print('day %d: buy 1 unit at price %f, total balance %f'% (t, self.trend[t], initial_money)) elif action == 2 and len(inventory): bought_price = inventory.pop(0) initial_money += self.trend[t] states_sell.append(t) try: invest = ((close[t] - bought_price) / bought_price) * 100 except: invest = 0 print( 'day %d, sell 1 unit at price %f, investment %f %%, total balance %f,' % (t, close[t], invest, initial_money) ) new_state = np.append([self.get_state(t + 1)], self.INITIAL_FEATURES[:3, :], axis = 0) self.INITIAL_FEATURES = new_state invest = ((initial_money - starting_money) / starting_money) * 100 total_gains = initial_money - starting_money return states_buy, states_sell, total_gains, invest def train(self, iterations, checkpoint, initial_money): for i in range(iterations): total_profit = 0 inventory = [] state = self.get_state(0) starting_money = initial_money init_value = np.zeros((1, 2 * self.LAYER_SIZE)) for k in range(self.INITIAL_FEATURES.shape[0]): self.INITIAL_FEATURES[k,:] = state for t in range(0, len(self.trend) - 1, self.skip): if np.random.rand() < self.EPSILON: action = np.random.randint(self.OUTPUT_SIZE) else: action, last_state = self.sess.run([self.logits, self.last_state], feed_dict={self.X:[self.INITIAL_FEATURES], self.hidden_layer:init_value}) action, init_value = np.argmax(action[0]), last_state next_state = self.get_state(t + 1) if action == 1 and starting_money >= self.trend[t]: inventory.append(self.trend[t]) starting_money -= self.trend[t] elif action == 2 and len(inventory) > 0: bought_price = inventory.pop(0) total_profit += self.trend[t] - bought_price starting_money 
+= self.trend[t] invest = ((starting_money - initial_money) / initial_money) new_state = np.append([self.get_state(t + 1)], self.INITIAL_FEATURES[:3, :], axis = 0) self._memorize(self.INITIAL_FEATURES, action, invest, new_state, starting_money < initial_money, init_value[0]) self.INITIAL_FEATURES = new_state batch_size = min(len(self.MEMORIES), self.BATCH_SIZE) replay = random.sample(self.MEMORIES, batch_size) cost = self._construct_memories(replay) self.T_COPY += 1 self.EPSILON = self.MIN_EPSILON + (1.0 - self.MIN_EPSILON) * np.exp(-self.DECAY_RATE * i) if (i+1) % checkpoint == 0: print('epoch: %d, total rewards: %f.3, cost: %f, total money: %f'%(i + 1, total_profit, cost, starting_money)) close = df.Close.values.tolist() initial_money = 10000 window_size = 30 skip = 1 batch_size = 32 agent = Agent(state_size = window_size, window_size = window_size, trend = close, skip = skip) agent.train(iterations = 200, checkpoint = 10, initial_money = initial_money) states_buy, states_sell, total_gains, invest = agent.buy(initial_money = initial_money) fig = plt.figure(figsize = (15,5)) plt.plot(close, color='r', lw=2.) plt.plot(close, '^', markersize=10, color='m', label = 'buying signal', markevery = states_buy) plt.plot(close, 'v', markersize=10, color='k', label = 'selling signal', markevery = states_sell) plt.title('total gains %f, total investment %f%%'%(total_gains, invest)) plt.legend() plt.savefig('output/'+name+'.png') plt.show() from collections import deque import random df= df_full.copy() name = 'Duel Curiosity Q-learning agent' class Agent: LEARNING_RATE = 0.003 BATCH_SIZE = 32 LAYER_SIZE = 500 OUTPUT_SIZE = 3 EPSILON = 0.5 DECAY_RATE = 0.005 MIN_EPSILON = 0.1 GAMMA = 0.99 MEMORIES = deque() COPY = 1000 T_COPY = 0 MEMORY_SIZE = 300 def __init__(self, state_size, window_size, trend, skip): self.state_size = state_size self.window_size = window_size self.half_window = window_size // 2 self.trend = trend self.skip = skip tf.reset_default_graph() self.X = tf.placeholder(tf.float32, (None, self.state_size)) self.Y = tf.placeholder(tf.float32, (None, self.state_size)) self.ACTION = tf.placeholder(tf.float32, (None)) self.REWARD = tf.placeholder(tf.float32, (None)) self.batch_size = tf.shape(self.ACTION)[0] with tf.variable_scope('curiosity_model'): action = tf.reshape(self.ACTION, (-1,1)) state_action = tf.concat([self.X, action], axis=1) save_state = tf.identity(self.Y) feed = tf.layers.dense(state_action, 32, activation=tf.nn.relu) self.curiosity_logits = tf.layers.dense(feed, self.state_size) self.curiosity_cost = tf.reduce_sum(tf.square(save_state - self.curiosity_logits), axis=1) self.curiosity_optimizer = tf.train.RMSPropOptimizer(self.LEARNING_RATE)\ .minimize(tf.reduce_mean(self.curiosity_cost)) total_reward = tf.add(self.curiosity_cost, self.REWARD) with tf.variable_scope("q_model"): with tf.variable_scope("eval_net"): x_action = tf.layers.dense(self.X, 128, tf.nn.relu) tensor_action, tensor_validation = tf.split(x_action,2,1) feed_action = tf.layers.dense(tensor_action, self.OUTPUT_SIZE) feed_validation = tf.layers.dense(tensor_validation, 1) self.logits = feed_validation + \ tf.subtract(feed_action,tf.reduce_mean(feed_action,axis=1,keep_dims=True)) with tf.variable_scope("target_net"): y_action = tf.layers.dense(self.Y, 128, tf.nn.relu) tensor_action, tensor_validation = tf.split(y_action,2,1) feed_action = tf.layers.dense(tensor_action, self.OUTPUT_SIZE) feed_validation = tf.layers.dense(tensor_validation, 1) y_q = feed_validation + \ 
tf.subtract(feed_action,tf.reduce_mean(feed_action,axis=1,keep_dims=True)) q_target = total_reward + self.GAMMA * tf.reduce_max(y_q, axis=1) action = tf.cast(self.ACTION, tf.int32) action_indices = tf.stack([tf.range(self.batch_size, dtype=tf.int32), action], axis=1) q = tf.gather_nd(params=self.logits, indices=action_indices) self.cost = tf.losses.mean_squared_error(labels=q_target, predictions=q) self.optimizer = tf.train.RMSPropOptimizer(self.LEARNING_RATE).minimize( self.cost, var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "q_model/eval_net")) t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_model/target_net') e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_model/eval_net') self.target_replace_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)] self.sess = tf.InteractiveSession() self.sess.run(tf.global_variables_initializer()) def _memorize(self, state, action, reward, new_state, done): self.MEMORIES.append((state, action, reward, new_state, done)) if len(self.MEMORIES) > self.MEMORY_SIZE: self.MEMORIES.popleft() def get_state(self, t): window_size = self.window_size + 1 d = t - window_size + 1 block = self.trend[d : t + 1] if d >= 0 else -d * [self.trend[0]] + self.trend[0 : t + 1] res = [] for i in range(window_size - 1): res.append(block[i + 1] - block[i]) return np.array(res) def predict(self, inputs): return self.sess.run(self.logits, feed_dict={self.X:inputs}) def get_predicted_action(self, sequence): prediction = self.predict(np.array(sequence))[0] return np.argmax(prediction) def _select_action(self, state): if np.random.rand() < self.EPSILON: action = np.random.randint(self.OUTPUT_SIZE) else: action = self.get_predicted_action([state]) return action def _construct_memories(self, replay): states = np.array([a[0] for a in replay]) actions = np.array([a[1] for a in replay]) rewards = np.array([a[2] for a in replay]) new_states = np.array([a[3] for a in replay]) if (self.T_COPY + 1) % self.COPY == 0: self.sess.run(self.target_replace_op) cost, _ = self.sess.run([self.cost, self.optimizer], feed_dict = { self.X: states, self.Y: new_states, self.ACTION: actions, self.REWARD: rewards }) if (self.T_COPY + 1) % self.COPY == 0: self.sess.run(self.curiosity_optimizer, feed_dict = { self.X: states, self.Y: new_states, self.ACTION: actions, self.REWARD: rewards }) return cost def buy(self, initial_money): starting_money = initial_money states_sell = [] states_buy = [] inventory = [] state = self.get_state(0) for t in range(0, len(self.trend) - 1, self.skip): action = self._select_action(state) next_state = self.get_state(t + 1) if action == 1 and initial_money >= self.trend[t]: inventory.append(self.trend[t]) initial_money -= self.trend[t] states_buy.append(t) print('day %d: buy 1 unit at price %f, total balance %f'% (t, self.trend[t], initial_money)) elif action == 2 and len(inventory): bought_price = inventory.pop(0) initial_money += self.trend[t] states_sell.append(t) try: invest = ((close[t] - bought_price) / bought_price) * 100 except: invest = 0 print( 'day %d, sell 1 unit at price %f, investment %f %%, total balance %f,' % (t, close[t], invest, initial_money) ) state = next_state invest = ((initial_money - starting_money) / starting_money) * 100 total_gains = initial_money - starting_money return states_buy, states_sell, total_gains, invest def train(self, iterations, checkpoint, initial_money): for i in range(iterations): total_profit = 0 inventory = [] state = self.get_state(0) starting_money = initial_money for t in 
range(0, len(self.trend) - 1, self.skip): action = self._select_action(state) next_state = self.get_state(t + 1) if action == 1 and starting_money >= self.trend[t]: inventory.append(self.trend[t]) starting_money -= self.trend[t] elif action == 2 and len(inventory) > 0: bought_price = inventory.pop(0) total_profit += self.trend[t] - bought_price starting_money += self.trend[t] invest = ((starting_money - initial_money) / initial_money) self._memorize(state, action, invest, next_state, starting_money < initial_money) batch_size = min(len(self.MEMORIES), self.BATCH_SIZE) state = next_state replay = random.sample(self.MEMORIES, batch_size) cost = self._construct_memories(replay) self.T_COPY += 1 self.EPSILON = self.MIN_EPSILON + (1.0 - self.MIN_EPSILON) * np.exp(-self.DECAY_RATE * i) if (i+1) % checkpoint == 0: print('epoch: %d, total rewards: %f.3, cost: %f, total money: %f'%(i + 1, total_profit, cost, starting_money)) close = df.Close.values.tolist() initial_money = 10000 window_size = 30 skip = 1 batch_size = 32 agent = Agent(state_size = window_size, window_size = window_size, trend = close, skip = skip) agent.train(iterations = 200, checkpoint = 10, initial_money = initial_money) states_buy, states_sell, total_gains, invest = agent.buy(initial_money = initial_money) fig = plt.figure(figsize = (15,5)) plt.plot(close, color='r', lw=2.) plt.plot(close, '^', markersize=10, color='m', label = 'buying signal', markevery = states_buy) plt.plot(close, 'v', markersize=10, color='k', label = 'selling signal', markevery = states_sell) plt.title('total gains %f, total investment %f%%'%(total_gains, invest)) plt.legend() plt.savefig('output/'+name+'.png') plt.show() df= df_full.copy() name = 'Neuro-evolution agent' close = df.Close.values.tolist() initial_money = 10000 window_size = 30 skip = 1 class neuralnetwork: def __init__(self, id_, hidden_size = 128): self.W1 = np.random.randn(window_size, hidden_size) / np.sqrt(window_size) self.W2 = np.random.randn(hidden_size, 3) / np.sqrt(hidden_size) self.fitness = 0 self.id = id_ def relu(X): return np.maximum(X, 0) def softmax(X): e_x = np.exp(X - np.max(X, axis=-1, keepdims=True)) return e_x / np.sum(e_x, axis=-1, keepdims=True) def feed_forward(X, nets): a1 = np.dot(X, nets.W1) z1 = relu(a1) a2 = np.dot(z1, nets.W2) return softmax(a2) class NeuroEvolution: def __init__(self, population_size, mutation_rate, model_generator, state_size, window_size, trend, skip, initial_money): self.population_size = population_size self.mutation_rate = mutation_rate self.model_generator = model_generator self.state_size = state_size self.window_size = window_size self.half_window = window_size // 2 self.trend = trend self.skip = skip self.initial_money = initial_money def _initialize_population(self): self.population = [] for i in range(self.population_size): self.population.append(self.model_generator(i)) def mutate(self, individual, scale=1.0): mutation_mask = np.random.binomial(1, p=self.mutation_rate, size=individual.W1.shape) individual.W1 += np.random.normal(loc=0, scale=scale, size=individual.W1.shape) * mutation_mask mutation_mask = np.random.binomial(1, p=self.mutation_rate, size=individual.W2.shape) individual.W2 += np.random.normal(loc=0, scale=scale, size=individual.W2.shape) * mutation_mask return individual def inherit_weights(self, parent, child): child.W1 = parent.W1.copy() child.W2 = parent.W2.copy() return child def crossover(self, parent1, parent2): child1 = self.model_generator((parent1.id+1)*10) child1 = self.inherit_weights(parent1, child1) child2 
= self.model_generator((parent2.id+1)*10) child2 = self.inherit_weights(parent2, child2) # first W n_neurons = child1.W1.shape[1] cutoff = np.random.randint(0, n_neurons) child1.W1[:, cutoff:] = parent2.W1[:, cutoff:].copy() child2.W1[:, cutoff:] = parent1.W1[:, cutoff:].copy() # second W n_neurons = child1.W2.shape[1] cutoff = np.random.randint(0, n_neurons) child1.W2[:, cutoff:] = parent2.W2[:, cutoff:].copy() child2.W2[:, cutoff:] = parent1.W2[:, cutoff:].copy() return child1, child2 def get_state(self, t): window_size = self.window_size + 1 d = t - window_size + 1 block = self.trend[d : t + 1] if d >= 0 else -d * [self.trend[0]] + self.trend[0 : t + 1] res = [] for i in range(window_size - 1): res.append(block[i + 1] - block[i]) return np.array([res]) def act(self, p, state): logits = feed_forward(state, p) return np.argmax(logits, 1)[0] def buy(self, individual): initial_money = self.initial_money starting_money = initial_money state = self.get_state(0) inventory = [] states_sell = [] states_buy = [] for t in range(0, len(self.trend) - 1, self.skip): action = self.act(individual, state) next_state = self.get_state(t + 1) if action == 1 and starting_money >= self.trend[t]: inventory.append(self.trend[t]) initial_money -= self.trend[t] states_buy.append(t) print('day %d: buy 1 unit at price %f, total balance %f'% (t, self.trend[t], initial_money)) elif action == 2 and len(inventory): bought_price = inventory.pop(0) initial_money += self.trend[t] states_sell.append(t) try: invest = ((self.trend[t] - bought_price) / bought_price) * 100 except: invest = 0 print( 'day %d, sell 1 unit at price %f, investment %f %%, total balance %f,' % (t, self.trend[t], invest, initial_money) ) state = next_state invest = ((initial_money - starting_money) / starting_money) * 100 total_gains = initial_money - starting_money return states_buy, states_sell, total_gains, invest def calculate_fitness(self): for i in range(self.population_size): initial_money = self.initial_money starting_money = initial_money state = self.get_state(0) inventory = [] for t in range(0, len(self.trend) - 1, self.skip): action = self.act(self.population[i], state) next_state = self.get_state(t + 1) if action == 1 and starting_money >= self.trend[t]: inventory.append(self.trend[t]) starting_money -= self.trend[t] elif action == 2 and len(inventory): bought_price = inventory.pop(0) starting_money += self.trend[t] state = next_state invest = ((starting_money - initial_money) / initial_money) * 100 self.population[i].fitness = invest def evolve(self, generations=20, checkpoint= 5): self._initialize_population() n_winners = int(self.population_size * 0.4) n_parents = self.population_size - n_winners for epoch in range(generations): self.calculate_fitness() fitnesses = [i.fitness for i in self.population] sort_fitness = np.argsort(fitnesses)[::-1] self.population = [self.population[i] for i in sort_fitness] fittest_individual = self.population[0] if (epoch+1) % checkpoint == 0: print('epoch %d, fittest individual %d with accuracy %f'%(epoch+1, sort_fitness[0], fittest_individual.fitness)) next_population = [self.population[i] for i in range(n_winners)] total_fitness = np.sum([np.abs(i.fitness) for i in self.population]) parent_probabilities = [np.abs(i.fitness / total_fitness) for i in self.population] parents = np.random.choice(self.population, size=n_parents, p=parent_probabilities, replace=False) for i in np.arange(0, len(parents), 2): child1, child2 = self.crossover(parents[i], parents[i+1]) next_population += [self.mutate(child1), 
self.mutate(child2)] self.population = next_population return fittest_individual population_size = 100 generations = 100 mutation_rate = 0.1 neural_evolve = NeuroEvolution(population_size, mutation_rate, neuralnetwork, window_size, window_size, close, skip, initial_money) fittest_nets = neural_evolve.evolve(50) states_buy, states_sell, total_gains, invest = neural_evolve.buy(fittest_nets) fig = plt.figure(figsize = (15,5)) plt.plot(close, color='r', lw=2.) plt.plot(close, '^', markersize=10, color='m', label = 'buying signal', markevery = states_buy) plt.plot(close, 'v', markersize=10, color='k', label = 'selling signal', markevery = states_sell) plt.title('total gains %f, total investment %f%%'%(total_gains, invest)) plt.legend() plt.savefig('output/'+name+'.png') plt.show() df= df_full.copy() name = 'Neuro-evolution with Novelty search agent' close = df.Close.values.tolist() initial_money = 10000 window_size = 30 skip = 1 novelty_search_threshold = 6 novelty_log_maxlen = 1000 backlog_maxsize = 500 novelty_log_add_amount = 3 class neuralnetwork: def __init__(self, id_, hidden_size = 128): self.W1 = np.random.randn(window_size, hidden_size) / np.sqrt(window_size) self.W2 = np.random.randn(hidden_size, 3) / np.sqrt(hidden_size) self.fitness = 0 self.last_features = None self.id = id_ def relu(X): return np.maximum(X, 0) def softmax(X): e_x = np.exp(X - np.max(X, axis=-1, keepdims=True)) return e_x / np.sum(e_x, axis=-1, keepdims=True) def feed_forward(X, nets): a1 = np.dot(X, nets.W1) z1 = relu(a1) a2 = np.dot(z1, nets.W2) return softmax(a2) class NeuroEvolution: def __init__(self, population_size, mutation_rate, model_generator, state_size, window_size, trend, skip, initial_money): self.population_size = population_size self.mutation_rate = mutation_rate self.model_generator = model_generator self.state_size = state_size self.window_size = window_size self.half_window = window_size // 2 self.trend = trend self.skip = skip self.initial_money = initial_money self.generation_backlog = [] self.novel_backlog = [] self.novel_pop = [] def _initialize_population(self): self.population = [] for i in range(self.population_size): self.population.append(self.model_generator(i)) def _memorize(self, q, i, limit): q.append(i) if len(q) > limit: q.pop() def mutate(self, individual, scale=1.0): mutation_mask = np.random.binomial(1, p=self.mutation_rate, size=individual.W1.shape) individual.W1 += np.random.normal(loc=0, scale=scale, size=individual.W1.shape) * mutation_mask mutation_mask = np.random.binomial(1, p=self.mutation_rate, size=individual.W2.shape) individual.W2 += np.random.normal(loc=0, scale=scale, size=individual.W2.shape) * mutation_mask return individual def inherit_weights(self, parent, child): child.W1 = parent.W1.copy() child.W2 = parent.W2.copy() return child def crossover(self, parent1, parent2): child1 = self.model_generator((parent1.id+1)*10) child1 = self.inherit_weights(parent1, child1) child2 = self.model_generator((parent2.id+1)*10) child2 = self.inherit_weights(parent2, child2) # first W n_neurons = child1.W1.shape[1] cutoff = np.random.randint(0, n_neurons) child1.W1[:, cutoff:] = parent2.W1[:, cutoff:].copy() child2.W1[:, cutoff:] = parent1.W1[:, cutoff:].copy() # second W n_neurons = child1.W2.shape[1] cutoff = np.random.randint(0, n_neurons) child1.W2[:, cutoff:] = parent2.W2[:, cutoff:].copy() child2.W2[:, cutoff:] = parent1.W2[:, cutoff:].copy() return child1, child2 def get_state(self, t): window_size = self.window_size + 1 d = t - window_size + 1 block = self.trend[d : t 
+ 1] if d >= 0 else -d * [self.trend[0]] + self.trend[0 : t + 1] res = [] for i in range(window_size - 1): res.append(block[i + 1] - block[i]) return np.array([res]) def act(self, p, state): logits = feed_forward(state, p) return np.argmax(logits, 1)[0] def buy(self, individual): initial_money = self.initial_money starting_money = initial_money state = self.get_state(0) inventory = [] states_sell = [] states_buy = [] for t in range(0, len(self.trend) - 1, self.skip): action = self.act(individual, state) next_state = self.get_state(t + 1) if action == 1 and starting_money >= self.trend[t]: inventory.append(self.trend[t]) initial_money -= self.trend[t] states_buy.append(t) print('day %d: buy 1 unit at price %f, total balance %f'% (t, self.trend[t], initial_money)) elif action == 2 and len(inventory): bought_price = inventory.pop(0) initial_money += self.trend[t] states_sell.append(t) try: invest = ((self.trend[t] - bought_price) / bought_price) * 100 except: invest = 0 print( 'day %d, sell 1 unit at price %f, investment %f %%, total balance %f,' % (t, self.trend[t], invest, initial_money) ) state = next_state invest = ((initial_money - starting_money) / starting_money) * 100 total_gains = initial_money - starting_money return states_buy, states_sell, total_gains, invest def calculate_fitness(self): for i in range(self.population_size): initial_money = self.initial_money starting_money = initial_money state = self.get_state(0) inventory = [] for t in range(0, len(self.trend) - 1, self.skip): action = self.act(self.population[i], state) next_state = self.get_state(t + 1) if action == 1 and starting_money >= self.trend[t]: inventory.append(self.trend[t]) starting_money -= self.trend[t] elif action == 2 and len(inventory): bought_price = inventory.pop(0) starting_money += self.trend[t] state = next_state invest = ((starting_money - initial_money) / initial_money) * 100 self.population[i].fitness = invest self.population[i].last_features = self.population[i].W2.flatten() def evaluate(self, individual, backlog, pop, k = 4): score = 0 if len(backlog): x = np.array(backlog) nn = NearestNeighbors(n_neighbors = k, metric = 'euclidean').fit(np.array(backlog)) d, _ = nn.kneighbors([individual]) score += np.mean(d) if len(pop): nn = NearestNeighbors(n_neighbors = k, metric = 'euclidean').fit(np.array(pop)) d, _ = nn.kneighbors([individual]) score += np.mean(d) return score def evolve(self, generations=20, checkpoint= 5): self._initialize_population() n_winners = int(self.population_size * 0.4) n_parents = self.population_size - n_winners for epoch in range(generations): self.calculate_fitness() scores = [self.evaluate(p.last_features, self.novel_backlog, self.novel_pop) for p in self.population] sort_fitness = np.argsort(scores)[::-1] self.population = [self.population[i] for i in sort_fitness] fittest_individual = self.population[0] if (epoch+1) % checkpoint == 0: print('epoch %d, fittest individual %d with accuracy %f'%(epoch+1, sort_fitness[0], fittest_individual.fitness)) next_population = [self.population[i] for i in range(n_winners)] total_fitness = np.sum([np.abs(i.fitness) for i in self.population]) parent_probabilities = [np.abs(i.fitness / total_fitness) for i in self.population] parents = np.random.choice(self.population, size=n_parents, p=parent_probabilities, replace=False) for p in next_population: if p.last_features is not None: self._memorize(self.novel_pop, p.last_features, backlog_maxsize) if np.random.randint(0,10) < novelty_search_threshold: self._memorize(self.novel_backlog, 
p.last_features, novelty_log_maxlen) for i in np.arange(0, len(parents), 2): child1, child2 = self.crossover(parents[i], parents[i+1]) next_population += [self.mutate(child1), self.mutate(child2)] self.population = next_population if np.random.randint(0,10) < novelty_search_threshold: pop_sorted = sorted(self.population, key=lambda p: p.fitness, reverse=True) self.generation_backlog.append(pop_sorted[0]) print('novel add fittest, score: %f, backlog size: %d'%(pop_sorted[0].fitness, len(self.generation_backlog))) generation_backlog_temp = self.generation_backlog if len(self.generation_backlog) > backlog_maxsize: generation_backlog_temp = random.sample(generation_backlog, backlog_maxsize) for p in generation_backlog_temp: if p.last_features is not None: self._memorize(self.novel_backlog, p.last_features, novelty_log_maxlen) return fittest_individual population_size = 100 generations = 100 mutation_rate = 0.1 neural_evolve = NeuroEvolution(population_size, mutation_rate, neuralnetwork, window_size, window_size, close, skip, initial_money) fittest_nets = neural_evolve.evolve(100) states_buy, states_sell, total_gains, invest = neural_evolve.buy(fittest_nets) fig = plt.figure(figsize = (15,5)) plt.plot(close, color='r', lw=2.) plt.plot(close, '^', markersize=10, color='m', label = 'buying signal', markevery = states_buy) plt.plot(close, 'v', markersize=10, color='k', label = 'selling signal', markevery = states_sell) plt.title('total gains %f, total investment %f%%'%(total_gains, invest)) plt.legend() plt.savefig('output/'+name+'.png') plt.show() df= df_full.copy() name = 'ABCD strategy agent' def abcd(trend, skip_loop = 4, ma = 7): ma = pd.Series(trend).rolling(ma).mean().values x = [] for a in range(ma.shape[0]): for b in range(a, ma.shape[0], skip_loop): for c in range(b, ma.shape[0], skip_loop): for d in range(c, ma.shape[0], skip_loop): if ma[b] > ma[a] and \ (ma[c] < ma[b] and ma[c] > ma[a]) \ and ma[d] > ma[b]: x.append([a,b,c,d]) x_np = np.array(x) ac = x_np[:,0].tolist() + x_np[:,2].tolist() bd = x_np[:,1].tolist() + x_np[:,3].tolist() ac_set = set(ac) bd_set = set(bd) signal = np.zeros(len(trend)) buy = list(ac_set - bd_set) sell = list(list(bd_set - ac_set)) signal[buy] = 1.0 signal[sell] = -1.0 return signal %%time signal = abcd(df['Close']) def buy_stock( real_movement, signal, initial_money = 10000, max_buy = 1, max_sell = 1, ): """ real_movement = actual movement in the real world delay = how much interval you want to delay to change our decision from buy to sell, vice versa initial_state = 1 is buy, 0 is sell initial_money = 10000, ignore what kind of currency max_buy = max quantity for share to buy max_sell = max quantity for share to sell """ starting_money = initial_money states_sell = [] states_buy = [] states_money = [] current_inventory = 0 def buy(i, initial_money, current_inventory): shares = initial_money // real_movement[i] if shares < 1: print( 'day %d: total balances %f, not enough money to buy a unit price %f' % (i, initial_money, real_movement[i]) ) else: if shares > max_buy: buy_units = max_buy else: buy_units = shares initial_money -= buy_units * real_movement[i] current_inventory += buy_units print( 'day %d: buy %d units at price %f, total balance %f' % (i, buy_units, buy_units * real_movement[i], initial_money) ) states_buy.append(0) return initial_money, current_inventory for i in range(real_movement.shape[0]): state = signal[i] if state == 1: initial_money, current_inventory = buy( i, initial_money, current_inventory ) states_buy.append(i) elif state == -1: if 
current_inventory == 0: print('day %d: cannot sell anything, inventory 0' % (i)) else: if current_inventory > max_sell: sell_units = max_sell else: sell_units = current_inventory current_inventory -= sell_units total_sell = sell_units * real_movement[i] initial_money += total_sell try: invest = ( (real_movement[i] - real_movement[states_buy[-1]]) / real_movement[states_buy[-1]] ) * 100 except: invest = 0 print( 'day %d, sell %d units at price %f, investment %f %%, total balance %f,' % (i, sell_units, total_sell, invest, initial_money) ) states_sell.append(i) states_money.append(initial_money) invest = ((initial_money - starting_money) / starting_money) * 100 total_gains = initial_money - starting_money return states_buy, states_sell, total_gains, invest, states_money states_buy, states_sell, total_gains, invest, states_money = buy_stock(df.Close, signal) close = df['Close'] fig = plt.figure(figsize = (15,5)) plt.plot(close, color='r', lw=2.) plt.plot(close, '^', markersize=10, color='m', label = 'buying signal', markevery = states_buy) plt.plot(close, 'v', markersize=10, color='k', label = 'selling signal', markevery = states_sell) plt.title('total gains %f, total investment %f%%'%(total_gains, invest)) plt.legend() plt.savefig('output/'+name+'.png') plt.show() fig = plt.figure(figsize = (15,5)) plt.plot(states_money, color='r', lw=2.) plt.plot(states_money, '^', markersize=10, color='m', label = 'buying signal', markevery = states_buy) plt.plot(states_money, 'v', markersize=10, color='k', label = 'selling signal', markevery = states_sell) plt.legend() plt.show() plt.figure(figsize = (10, 5)) bins = np.linspace(-10, 10, 100) solution = np.random.randn(100) w = np.random.randn(100) plt.hist(solution, bins, alpha = 0.5, label = 'solution', color = 'r') plt.hist(w, bins, alpha = 0.5, label = 'random', color = 'y') plt.legend() plt.show() def f(w): return -np.sum(np.square(solution - w)) npop = 50 sigma = 0.1 alpha = 0.001 for i in range(5000): if (i + 1) % 1000 == 0: print( 'iter %d. 
w: %s, solution: %s, reward: %f' % (i + 1, str(w[-1]), str(solution[-1]), f(w)) ) N = np.random.randn(npop, 100) R = np.zeros(npop) for j in range(npop): w_try = w + sigma * N[j] R[j] = f(w_try) A = (R - np.mean(R)) / np.std(R) w = w + alpha / (npop * sigma) * np.dot(N.T, A) ''' I want to compare my first two individuals with my real w ''' plt.figure(figsize=(10,5)) sigma = 0.1 N = np.random.randn(npop, 100) individuals = [] for j in range(2): individuals.append(w + sigma * N[j]) plt.hist(w, bins, alpha=0.5, label='w',color='r') plt.hist(individuals[0], bins, alpha=0.5, label='individual 1') plt.hist(individuals[1], bins, alpha=0.5, label='individual 2') plt.legend() plt.show() df= df_full.copy() name = 'Deep Evolution Strategy' def get_state(data, t, n): d = t - n + 1 block = data[d : t + 1] if d >= 0 else -d * [data[0]] + data[: t + 1] res = [] for i in range(n - 1): res.append(block[i + 1] - block[i]) return np.array([res]) close = df.Close.values.tolist() get_state(close, 0, 10) get_state(close, 1, 10) get_state(close, 2, 10) class Deep_Evolution_Strategy: def __init__( self, weights, reward_function, population_size, sigma, learning_rate ): self.weights = weights self.reward_function = reward_function self.population_size = population_size self.sigma = sigma self.learning_rate = learning_rate def _get_weight_from_population(self, weights, population): weights_population = [] for index, i in enumerate(population): jittered = self.sigma * i weights_population.append(weights[index] + jittered) return weights_population def get_weights(self): return self.weights def train(self, epoch = 100, print_every = 1): lasttime = time.time() for i in range(epoch): population = [] rewards = np.zeros(self.population_size) for k in range(self.population_size): x = [] for w in self.weights: x.append(np.random.randn(*w.shape)) population.append(x) for k in range(self.population_size): weights_population = self._get_weight_from_population( self.weights, population[k] ) rewards[k] = self.reward_function(weights_population) rewards = (rewards - np.mean(rewards)) / np.std(rewards) for index, w in enumerate(self.weights): A = np.array([p[index] for p in population]) self.weights[index] = ( w + self.learning_rate / (self.population_size * self.sigma) * np.dot(A.T, rewards).T ) if (i + 1) % print_every == 0: print( 'iter %d. 
reward: %f' % (i + 1, self.reward_function(self.weights)) ) print('time taken to train:', time.time() - lasttime, 'seconds') class Model: def __init__(self, input_size, layer_size, output_size): self.weights = [ np.random.randn(input_size, layer_size), np.random.randn(layer_size, output_size), np.random.randn(layer_size, 1), np.random.randn(1, layer_size), ] def predict(self, inputs): feed = np.dot(inputs, self.weights[0]) + self.weights[-1] decision = np.dot(feed, self.weights[1]) buy = np.dot(feed, self.weights[2]) return decision, buy def get_weights(self): return self.weights def set_weights(self, weights): self.weights = weights window_size = 30 model = Model(window_size, 500, 3) initial_money = 10000 starting_money = initial_money len_close = len(close) - 1 weight = model skip = 1 state = get_state(close, 0, window_size + 1) inventory = [] quantity = 0 max_buy = 5 max_sell = 5 def act(model, sequence): decision, buy = model.predict(np.array(sequence)) return np.argmax(decision[0]), int(buy[0]) for t in range(0, len_close, skip): action, buy = act(weight, state) next_state = get_state(close, t + 1, window_size + 1) if action == 1 and initial_money >= close[t]: if buy < 0: buy = 1 if buy > max_buy: buy_units = max_buy else: buy_units = buy total_buy = buy_units * close[t] initial_money -= total_buy inventory.append(total_buy) quantity += buy_units elif action == 2 and len(inventory) > 0: if quantity > max_sell: sell_units = max_sell else: sell_units = quantity quantity -= sell_units total_sell = sell_units * close[t] initial_money += total_sell state = next_state ((initial_money - starting_money) / starting_money) * 100 import time class Agent: POPULATION_SIZE = 15 SIGMA = 0.1 LEARNING_RATE = 0.03 def __init__( self, model, money, max_buy, max_sell, close, window_size, skip ): self.window_size = window_size self.skip = skip self.close = close self.model = model self.initial_money = money self.max_buy = max_buy self.max_sell = max_sell self.es = Deep_Evolution_Strategy( self.model.get_weights(), self.get_reward, self.POPULATION_SIZE, self.SIGMA, self.LEARNING_RATE, ) def act(self, sequence): decision, buy = self.model.predict(np.array(sequence)) return np.argmax(decision[0]), int(buy[0]) def get_reward(self, weights): initial_money = self.initial_money starting_money = initial_money len_close = len(self.close) - 1 self.model.weights = weights state = get_state(self.close, 0, self.window_size + 1) inventory = [] quantity = 0 for t in range(0, len_close, self.skip): action, buy = self.act(state) next_state = get_state(self.close, t + 1, self.window_size + 1) if action == 1 and initial_money >= self.close[t]: if buy < 0: buy = 1 if buy > self.max_buy: buy_units = self.max_buy else: buy_units = buy total_buy = buy_units * self.close[t] initial_money -= total_buy inventory.append(total_buy) quantity += buy_units elif action == 2 and len(inventory) > 0: if quantity > self.max_sell: sell_units = self.max_sell else: sell_units = quantity quantity -= sell_units total_sell = sell_units * self.close[t] initial_money += total_sell state = next_state return ((initial_money - starting_money) / starting_money) * 100 def fit(self, iterations, checkpoint): self.es.train(iterations, print_every = checkpoint) def buy(self): initial_money = self.initial_money len_close = len(self.close) - 1 state = get_state(self.close, 0, self.window_size + 1) starting_money = initial_money states_sell = [] states_buy = [] inventory = [] quantity = 0 for t in range(0, len_close, self.skip): action, buy = self.act(state) 
next_state = get_state(self.close, t + 1, self.window_size + 1) if action == 1 and initial_money >= self.close[t]: if buy < 0: buy = 1 if buy > self.max_buy: buy_units = self.max_buy else: buy_units = buy total_buy = buy_units * self.close[t] initial_money -= total_buy inventory.append(total_buy) quantity += buy_units states_buy.append(t) print( 'day %d: buy %d units at price %f, total balance %f' % (t, buy_units, total_buy, initial_money) ) elif action == 2 and len(inventory) > 0: bought_price = inventory.pop(0) if quantity > self.max_sell: sell_units = self.max_sell else: sell_units = quantity if sell_units < 1: continue quantity -= sell_units total_sell = sell_units * self.close[t] initial_money += total_sell states_sell.append(t) try: invest = ((total_sell - bought_price) / bought_price) * 100 except: invest = 0 print( 'day %d, sell %d units at price %f, investment %f %%, total balance %f,' % (t, sell_units, total_sell, invest, initial_money) ) state = next_state invest = ((initial_money - starting_money) / starting_money) * 100 print( '\ntotal gained %f, total investment %f %%' % (initial_money - starting_money, invest) ) plt.figure(figsize = (20, 10)) plt.plot(close, label = 'true close', c = 'g') plt.plot( close, 'X', label = 'predict buy', markevery = states_buy, c = 'b' ) plt.plot( close, 'o', label = 'predict sell', markevery = states_sell, c = 'r' ) plt.legend() plt.savefig('output/'+name+'.png') plt.show() model = Model(input_size = window_size, layer_size = 500, output_size = 3) agent = Agent( model = model, money = 10000, max_buy = 5, max_sell = 5, close = close, window_size = window_size, skip = 1, ) agent.fit(iterations = 500, checkpoint = 10) agent.buy()
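## The Deep_Evolution_Strategy.train step above is the standard evolution-strategy update: sample npop Gaussian perturbations of the weights, score each perturbed copy with the reward function, standardize the rewards, and move the weights along the reward-weighted average of the perturbations, w <- w + (learning_rate / (npop * sigma)) * sum_k A_k * eps_k.
## Below is a minimal standalone sketch of that single update on a toy quadratic reward. The names toy_reward and es_step are hypothetical helpers used only to illustrate the step the class performs; they are not part of the agent code above.
import numpy as np

def toy_reward(w, target):
    # higher is better: negative squared distance to a fixed target vector
    return -np.sum(np.square(w - target))

def es_step(w, target, npop=50, sigma=0.1, learning_rate=0.03):
    eps = np.random.randn(npop, w.shape[0])                    # one Gaussian perturbation per population member
    rewards = np.array([toy_reward(w + sigma * e, target) for e in eps])
    A = (rewards - rewards.mean()) / (rewards.std() + 1e-8)    # standardized rewards, as in the class
    return w + learning_rate / (npop * sigma) * eps.T.dot(A)   # reward-weighted recombination of perturbations

target = np.random.randn(30)
w = np.zeros(30)
for _ in range(200):
    w = es_step(w, target)
print('distance to target after 200 ES steps:', np.linalg.norm(w - target))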