Last active
December 17, 2018 16:45
-
-
Save tano297/913f5ab69f3638f7054706f173535682 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# Iterative policy evaluation on a small gridworld.
#
# The grid is size_h x size_w. The top-left and bottom-right cells are
# terminal states: they pay no reward and every action leaves the agent where
# it is. Every action from any other cell pays -1. Moves that would leave the
# grid keep the agent in place. The policy being evaluated picks each action
# with equal probability, and the backup is undiscounted.
import numpy as np

# define the grid size
size_h = 4
size_w = 4

# define the actions
actions = ["up", "down", "left", "right"]

# (dy, dx) displacement caused by each action; y grows downwards
moves = {"up": (-1, 0), "down": (1, 0), "left": (0, -1), "right": (0, 1)}

# define the reward for each action (-1 everywhere for all actions,
# except for the terminal states)
reward = np.full((size_h, size_w, len(actions)), -1.0)
reward[0, 0] = 0.0
reward[-1, -1] = 0.0

# define the pi (uniform random policy over the available actions)
pi = [1.0 / len(actions)] * len(actions)

# s'|s,a in this problem is deterministic, so it can be stored as a lookup
# table: transfer[y, x, a] holds the (y, x) coordinates reached from (y, x)
# when taking action a.
transfer = np.zeros((size_h, size_w, len(actions), 2), dtype=np.int32)
for y in range(size_h):
    for x in range(size_w):
        for a, action in enumerate(actions):
            dy, dx = moves[action]
            # clamp to the grid: walking into a wall leaves the state unchanged
            transfer[y, x, a, 0] = min(max(y + dy, 0), size_h - 1)
            transfer[y, x, a, 1] = min(max(x + dx, 0), size_w - 1)

# now fill up the transfer at the end nodes: terminal states map to themselves.
# The bottom-right state is stored as (-1, -1), which addresses the last
# row/column through negative indexing.
transfer[0, 0] = 0
transfer[-1, -1] = -1

# print transfer matrix
print("*" * 80)
print("s'|s,a : ")
for a, action in enumerate(actions):
    print("action: ", action)
    print("y: ", transfer[:, :, a, 0])
    print("x: ", transfer[:, :, a, 1])
print("*" * 80)

# initial value function
value_0 = np.zeros((size_h, size_w), dtype=np.float32)
print("initial value function")
print(value_0)

# iterate
iterations = 10000
epsilon = 0.0001
for it in range(iterations):
    value_t = np.zeros_like(value_0)
    # do one bellman step in each state
    for y in range(value_0.shape[0]):
        for x in range(value_0.shape[1]):
            for a in range(len(actions)):
                # get the coordinates where I go with this action
                newy, newx = transfer[y, x, a]
                # make one lookahead step for this action
                value_t[y, x] += pi[a] * (reward[y, x, a] + value_0[newy, newx])

    # show the first three sweeps and the tenth one
    if it < 3 or it == 9:
        print("-" * 40)
        print("iterations: ", it + 1)
        print(value_t)

    # if value converged, exit. The change is measured as the mean absolute
    # difference per state, i.e. the summed difference divided by the number
    # of states (height * width).
    norm = float(np.abs(value_0 - value_t).mean())
    # print(norm)
    if norm < epsilon:
        print("!" * 80)
        print("Exiting loop because I converged the value")
        print("!" * 80)
        break

    # if not converged, save current as old to iterate
    value_0 = np.copy(value_t)

print("-" * 40)
print("iterations: ", it + 1)
print("value:")
print(value_t)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
# Reference: RL Course by David Silver, Lecture 3, minute 19:20
# https://youtu.be/Nd1-UUMVfz4