""" | |
The DQN improvement: Prioritized Experience Replay (based on https://arxiv.org/abs/1511.05952) | |
View more on 莫烦Python: https://morvanzhou.github.io/tutorials/ | |
Using: | |
Tensorflow: 1.0 | |
""" | |
import numpy as np | |
import tensorflow as tf | |
np.random.seed(1) | |
tf.set_random_seed(1) | |
class SumTree(object):
    """
    This SumTree code is a modified version of the original from:
    https://github.com/jaara/AI-blog/blob/master/SumTree.py

    Stores the data with its priority in the tree and data frames.
    """
    data_pointer = 0

    def __init__(self, capacity):
        self.capacity = capacity    # number of leaf nodes (priority values)
        self.tree = np.zeros(2 * capacity - 1)
        # [--------------Parent nodes-------------][-------leaves to record priority-------]
        #             size: capacity - 1                       size: capacity
        self.data = np.zeros(capacity, dtype=object)    # for all transitions
        # [--------------data frame-------------]
        #             size: capacity
    def add_new_priority(self, p, data):
        leaf_idx = self.data_pointer + self.capacity - 1
        self.data[self.data_pointer] = data     # update data frame
        self.update(leaf_idx, p)                # update tree frame
        self.data_pointer += 1
        if self.data_pointer >= self.capacity:  # overwrite the oldest entry when full
            self.data_pointer = 0

    def update(self, tree_idx, p):
        change = p - self.tree[tree_idx]
        self.tree[tree_idx] = p
        self._propagate_change(tree_idx, change)

    def _propagate_change(self, tree_idx, change):
        """propagate the priority change up through all parent nodes"""
        parent_idx = (tree_idx - 1) // 2
        self.tree[parent_idx] += change
        if parent_idx != 0:
            self._propagate_change(parent_idx, change)

    def get_leaf(self, lower_bound):
        leaf_idx = self._retrieve(lower_bound)  # find the leaf whose cumulative priority covers lower_bound
        data_idx = leaf_idx - self.capacity + 1
        return [leaf_idx, self.tree[leaf_idx], self.data[data_idx]]
    def _retrieve(self, lower_bound, parent_idx=0):
        """
        Tree structure and array storage:

        Tree index:
             0         -> storing priority sum
            / \
          1     2
         / \   / \
        3   4 5   6    -> storing priority for transitions

        Array type for storing:
        [0, 1, 2, 3, 4, 5, 6]
        """
        left_child_idx = 2 * parent_idx + 1
        right_child_idx = left_child_idx + 1
        if left_child_idx >= len(self.tree):    # end search when no more children
            return parent_idx
        if self.tree[left_child_idx] == self.tree[right_child_idx]:
            return self._retrieve(lower_bound, np.random.choice([left_child_idx, right_child_idx]))
        if lower_bound <= self.tree[left_child_idx]:    # descend left if its subtree sum covers lower_bound
            return self._retrieve(lower_bound, left_child_idx)
        else:                                           # otherwise descend right with the remainder
            return self._retrieve(lower_bound - self.tree[left_child_idx], right_child_idx)

    @property
    def root_priority(self):
        return self.tree[0]     # the root stores the sum of all priorities
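
# A minimal usage sketch of the SumTree above (not part of the original gist): fill the
# leaves with priorities, then draw a uniform value in [0, root_priority) and let
# get_leaf walk down the tree. A leaf is returned with probability proportional to its
# priority. The capacity and priority values here are arbitrary illustration values.
def _sumtree_demo():
    tree = SumTree(capacity=4)
    for i, p in enumerate([1., 2., 3., 4.]):    # priorities sum to 10
        tree.add_new_priority(p, 'transition_%i' % i)
    lower_bound = np.random.uniform(0, tree.root_priority)
    leaf_idx, priority, data = tree.get_leaf(lower_bound)
    # e.g. 'transition_3' is returned ~40% of the time (4/10 of the total priority)
    print(leaf_idx, priority, data)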
class Memory(object):   # stored as ( s, a, r, s_ ) in SumTree
    """
    This Memory class is a modified version of the original code from:
    https://github.com/jaara/AI-blog/blob/master/Seaquest-DDQN-PER.py
    """
    epsilon = 0.001     # small amount to avoid zero priority
    alpha = 0.6         # [0~1] converts the importance of TD error to priority
    beta = 0.4          # importance-sampling exponent, annealed from its initial value to 1
    beta_increment_per_sampling = 1e-4  # annealing step for the bias correction
    abs_err_upper = 1   # clipped abs error, for stability (refer to the paper)

    def __init__(self, capacity):
        self.tree = SumTree(capacity)

    def store(self, error, transition):
        p = self._get_priority(error)
        self.tree.add_new_priority(p, transition)

    def sample(self, n):
        batch_idx, batch_memory, ISWeights = [], [], []
        segment = self.tree.root_priority / n   # split [0, total priority) into n segments
        self.beta = np.min([1, self.beta + self.beta_increment_per_sampling])   # max = 1
        min_prob = np.min(self.tree.tree[-self.tree.capacity:]) / self.tree.root_priority
        maxiwi = np.power(self.tree.capacity * min_prob, -self.beta)    # max IS weight, for later normalizing ISWeights
        for i in range(n):
            a = segment * i
            b = segment * (i + 1)
            lower_bound = np.random.uniform(a, b)   # one draw per segment (stratified sampling)
            idx, p, data = self.tree.get_leaf(lower_bound)
            prob = p / self.tree.root_priority
            ISWeights.append(self.tree.capacity * prob)
            batch_idx.append(idx)
            batch_memory.append(data)
        ISWeights = np.vstack(ISWeights)
        ISWeights = np.power(ISWeights, -self.beta) / maxiwi    # normalize: w_j = (N * P(j))^-beta / max_i w_i
        return batch_idx, np.vstack(batch_memory), ISWeights

    def update(self, idx, error):
        p = self._get_priority(error)
        self.tree.update(idx, p)

    def _get_priority(self, error):
        error += self.epsilon   # avoid 0
        clipped_error = np.clip(error, 0, self.abs_err_upper)
        return np.power(clipped_error, self.alpha)  # p = (|error| + epsilon)^alpha
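
# A minimal usage sketch of the prioritized Memory (not part of the original gist):
# store a few transitions with their TD errors, draw a stratified batch, then feed the
# new abs errors back so the priorities stay in sync with the network. The capacity,
# errors, and transition contents below are arbitrary illustration values.
def _memory_demo():
    memory = Memory(capacity=4)
    for td_error in [0.5, 0.1, 0.9, 0.3]:   # fill every leaf so min_prob > 0
        transition = np.random.rand(6)      # stands in for np.hstack((s, [a, r], s_))
        memory.store(td_error, transition)
    batch_idx, batch_memory, ISWeights = memory.sample(n=2)
    # after a learning step, refresh the sampled leaves with their new abs TD errors
    for idx, new_error in zip(batch_idx, [0.2, 0.7]):
        memory.update(idx, new_error)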
class DuelingDQNPrioritizedReplay:
    def __init__(
            self,
            n_actions,
            n_features,
            learning_rate=0.005,
            reward_decay=0.9,
            e_greedy=0.9,
            replace_target_iter=500,
            memory_size=10000,
            batch_size=32,
            e_greedy_increment=None,
            hidden=[100, 50],
            output_graph=False,
            sess=None,
    ):
        self.n_actions = n_actions
        self.n_features = n_features
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon_max = e_greedy
        self.replace_target_iter = replace_target_iter
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.hidden = hidden
        self.epsilon_increment = e_greedy_increment
        self.epsilon = 0.5 if e_greedy_increment is not None else self.epsilon_max
        self.learn_step_counter = 0
        self._build_net()
        self.memory = Memory(capacity=memory_size)
        if sess is None:
            self.sess = tf.Session()
            self.sess.run(tf.global_variables_initializer())
        else:
            self.sess = sess
        if output_graph:
            tf.summary.FileWriter("logs/", self.sess.graph)
        self.cost_his = []
    def _build_net(self):
        def build_layers(s, c_names, w_initializer, b_initializer):
            for i, h in enumerate(self.hidden):
                if i == 0:
                    in_units, out_units, inputs = self.n_features, self.hidden[i], s
                else:
                    in_units, out_units, inputs = self.hidden[i-1], self.hidden[i], l
                with tf.variable_scope('l%i' % i):
                    w = tf.get_variable('w', [in_units, out_units], initializer=w_initializer, collections=c_names)
                    b = tf.get_variable('b', [1, out_units], initializer=b_initializer, collections=c_names)
                    l = tf.nn.relu(tf.matmul(inputs, w) + b)
            with tf.variable_scope('Value'):
                w = tf.get_variable('w', [self.hidden[-1], 1], initializer=w_initializer, collections=c_names)
                b = tf.get_variable('b', [1, 1], initializer=b_initializer, collections=c_names)
                self.V = tf.matmul(l, w) + b
            with tf.variable_scope('Advantage'):
                w = tf.get_variable('w', [self.hidden[-1], self.n_actions], initializer=w_initializer, collections=c_names)
                b = tf.get_variable('b', [1, self.n_actions], initializer=b_initializer, collections=c_names)
                self.A = tf.matmul(l, w) + b
            with tf.variable_scope('Q'):
                # Q = V(s) + (A(s,a) - mean_a A(s,a)); subtracting the mean keeps V and A identifiable
                out = self.V + (self.A - tf.reduce_mean(self.A, axis=1, keep_dims=True))
            # non-dueling output head, kept for reference:
            # with tf.variable_scope('out'):
            #     w = tf.get_variable('w', [self.hidden[-1], self.n_actions], initializer=w_initializer, collections=c_names)
            #     b = tf.get_variable('b', [1, self.n_actions], initializer=b_initializer, collections=c_names)
            #     out = tf.matmul(l, w) + b
            return out
        # ------------------ build evaluate_net ------------------
        self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s')  # input
        self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target')  # for calculating loss
        self.ISWeights = tf.placeholder(tf.float32, [None, 1], name='IS_weights')
        with tf.variable_scope('eval_net'):
            c_names, w_initializer, b_initializer = \
                ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], \
                tf.random_normal_initializer(0., 0.01), tf.constant_initializer(0.01)  # config of layers
            self.q_eval = build_layers(self.s, c_names, w_initializer, b_initializer)
        with tf.variable_scope('loss'):
            # q_target differs from q_eval only at the chosen action, so the row sum is that action's TD error
            self.abs_errors = tf.abs(tf.reduce_sum(self.q_target - self.q_eval, axis=1))  # for updating the SumTree
            self.loss = tf.reduce_mean(self.ISWeights * tf.squared_difference(self.q_target, self.q_eval))
        with tf.variable_scope('train'):
            self._train_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss)
        # ------------------ build target_net ------------------
        self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_')  # input
        with tf.variable_scope('target_net'):
            c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
            self.q_next = build_layers(self.s_, c_names, w_initializer, b_initializer)
    def store_transition(self, s, a, r, s_):
        transition = np.hstack((s, [a, r], s_))
        # give a new transition the current max priority so it is replayed at least once
        max_p = np.max(self.memory.tree.tree[-self.memory.tree.capacity:])
        self.memory.store(max_p, transition)

    def choose_action(self, observation):
        observation = observation[np.newaxis, :]
        if np.random.uniform() < self.epsilon:  # epsilon-greedy: exploit with probability epsilon
            actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation})
            action = np.argmax(actions_value)
        else:
            action = np.random.randint(0, self.n_actions)
        return action

    def _replace_target_params(self):
        t_params = tf.get_collection('target_net_params')
        e_params = tf.get_collection('eval_net_params')
        self.sess.run([tf.assign(t, e) for t, e in zip(t_params, e_params)])
    def learn(self):
        if self.learn_step_counter % self.replace_target_iter == 0:
            self._replace_target_params()
        tree_idx, batch_memory, ISWeights = self.memory.sample(self.batch_size)

        # double DQN: the eval net picks the next action, the target net evaluates it
        q_next, q_eval4next = self.sess.run(
            [self.q_next, self.q_eval],
            feed_dict={self.s_: batch_memory[:, -self.n_features:],    # next observation
                       self.s: batch_memory[:, -self.n_features:]})    # next observation
        q_eval = self.sess.run(self.q_eval, {self.s: batch_memory[:, :self.n_features]})
        q_target = q_eval.copy()
        batch_index = np.arange(self.batch_size, dtype=np.int32)
        eval_act_index = batch_memory[:, self.n_features].astype(int)
        reward = batch_memory[:, self.n_features + 1]
        max_act4next = np.argmax(q_eval4next, axis=1)   # the action with the highest value according to q_eval
        selected_q_next = q_next[batch_index, max_act4next]     # Double DQN: q_next evaluated at those actions
        q_target[batch_index, eval_act_index] = reward + self.gamma * selected_q_next

        # natural DQN target, kept for reference:
        # q_next, q_eval = self.sess.run(
        #     [self.q_next, self.q_eval],
        #     feed_dict={self.s_: batch_memory[:, -self.n_features:],
        #                self.s: batch_memory[:, :self.n_features]})
        # q_target = q_eval.copy()
        # batch_index = np.arange(self.batch_size, dtype=np.int32)
        # eval_act_index = batch_memory[:, self.n_features].astype(int)
        # reward = batch_memory[:, self.n_features + 1]
        # q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(q_next, axis=1)

        _, abs_errors, self.cost = self.sess.run([self._train_op, self.abs_errors, self.loss],
                                                 feed_dict={self.s: batch_memory[:, :self.n_features],
                                                            self.q_target: q_target,
                                                            self.ISWeights: ISWeights})
        for i in range(len(tree_idx)):  # update priorities with the new abs TD errors
            idx = tree_idx[i]
            self.memory.update(idx, abs_errors[i])
        self.cost_his.append(self.cost)
        self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
        self.learn_step_counter += 1
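
# A small numpy sketch (not part of the original gist) of the two ideas combined above:
# the dueling aggregation Q = V + (A - mean(A)) and the double-DQN target, using made-up
# values for a batch of one state with three actions.
def _dueling_double_dqn_demo():
    V = np.array([[2.0]])                           # state value, shape (1, 1)
    A = np.array([[1.0, -1.0, 0.0]])                # advantages, shape (1, n_actions)
    Q = V + (A - A.mean(axis=1, keepdims=True))     # -> [[3., 1., 2.]], mean advantage removed
    # double DQN: eval net chooses the action, target net supplies its value
    q_eval4next = np.array([[0.5, 2.5, 1.0]])       # eval net at s_
    q_next = np.array([[1.2, 0.8, 2.0]])            # target net at s_
    a_star = np.argmax(q_eval4next, axis=1)         # -> [1]
    target = 1.0 + 0.9 * q_next[0, a_star[0]]       # reward + gamma * q_next[a*] = 1 + 0.9*0.8 = 1.72
    return Q, target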

# ------------------ second file in this gist: LunarLander-v2 training script ------------------
""" | |
Deep Q network, | |
LunarLander-v2 example | |
Using: | |
Tensorflow: 1.0 | |
gym: 0.8.0 | |
""" | |
import gym | |
from gym import wrappers | |
from DuelingDQNPrioritizedReplay import DuelingDQNPrioritizedReplay | |
env = gym.make('LunarLander-v2') | |
# env = env.unwrapped | |
env.seed(1) | |
N_A = env.action_space.n | |
N_S = env.observation_space.shape[0] | |
MEMORY_CAPACITY = 50000 | |
TARGET_REP_ITER = 2000 | |
MAX_EPISODES = 900 | |
E_GREEDY = 0.95 | |
E_INCREMENT = 0.00001 | |
GAMMA = 0.99 | |
LR = 0.0001 | |
BATCH_SIZE = 32 | |
HIDDEN = [400, 400] | |
RENDER = True | |
RL = DuelingDQNPrioritizedReplay( | |
n_actions=N_A, n_features=N_S, learning_rate=LR, e_greedy=E_GREEDY, reward_decay=GAMMA, | |
hidden=HIDDEN, batch_size=BATCH_SIZE, replace_target_iter=TARGET_REP_ITER, | |
memory_size=MEMORY_CAPACITY, e_greedy_increment=E_INCREMENT,) | |
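
# The `wrappers` import above is unused in the original; one plausible use (an assumption,
# not the author's setup) is recording episode videos/stats with gym 0.8's Monitor.
RECORD = False  # hypothetical flag, not in the original; the output directory is arbitrary
if RECORD:
    env = wrappers.Monitor(env, '/tmp/lunarlander_exp', force=True)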
total_steps = 0
running_r = 0
r_scale = 100
for i_episode in range(MAX_EPISODES):
    s = env.reset()  # (coord_x, coord_y, vel_x, vel_y, angle, angular_vel, l_leg_on_ground, r_leg_on_ground)
    ep_r = 0
    while True:
        if total_steps > MEMORY_CAPACITY: env.render()
        a = RL.choose_action(s)
        s_, r, done, _ = env.step(a)
        if r == -100: r = -30   # soften the crash penalty
        r /= r_scale            # rescale rewards to roughly [-1, 1]
        ep_r += r
        RL.store_transition(s, a, r, s_)
        if total_steps > MEMORY_CAPACITY:   # learn only once the memory is filled
            RL.learn()
        if done:
            land = '| Landed' if r == 100/r_scale else '| ------'   # a landing ends with the +100 terminal reward
            running_r = 0.99 * running_r + 0.01 * ep_r
            print('Epi: ', i_episode,
                  land,
                  '| Epi_R: ', round(ep_r, 2),
                  '| Running_R: ', round(running_r, 2),
                  '| Epsilon: ', round(RL.epsilon, 3))
            break
        s = s_
        total_steps += 1
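
# After training, the per-step TD loss collected in RL.cost_his can be plotted to inspect
# learning progress. A minimal sketch; matplotlib is an extra dependency not used by the
# original script.
import matplotlib.pyplot as plt
plt.plot(RL.cost_his)
plt.xlabel('learning step')
plt.ylabel('loss')
plt.show()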