Last active
August 29, 2015 13:56
-
-
Save 3Nigma/8964249 to your computer and use it in GitHub Desktop.
Octave/Matlab action-value method applied to the n-bandit problem and solved through a normal epsilon-greedy approach
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
function totalRewards = simulate_epsg_n_bandit(n, eps, gamecnt, rollcnt)
  % SIMULATE_EPSG_N_BANDIT  Epsilon-greedy, sample-average agent on the n-armed bandit.
  %
  %   totalRewards = simulate_epsg_n_bandit(n, eps, gamecnt, rollcnt)
  %
  %   n        - number of arms (actions) of the bandit
  %   eps      - exploration probability; with probability 1 - eps the agent
  %              pulls an arm whose current estimate is maximal
  %   gamecnt  - number of independent games (each with freshly drawn arms)
  %   rollcnt  - number of pulls per game
  %
  %   Returns a 1-by-rollcnt row vector in which element i is the SUM, over
  %   all games, of the reward obtained at pull i.  Divide by gamecnt to get
  %   the average reward per pull.
  %
  %   Only core random generators (rand, randi, randn) are used, so no
  %   statistics package/toolbox is needed.  The syntax ('%' comments, plain
  %   'end') is accepted by both GNU Octave and MATLAB.

  % per-roll reward accumulator, summed across games
  totalRewards = zeros(1, rollcnt);

  for k = 1:gamecnt
    % true action values q_*(a), drawn from a normal distribution with
    % mean 0 and variance 1
    q = randn(1, n);

    % per-game state: estimates Q_t(a), running reward sums and pull counts
    Qavg = zeros(1, n);
    Qsum = zeros(1, n);
    nPulls = zeros(1, n);

    for i = 1:rollcnt
      if (rand() <= 1 - eps)
        % exploitation step: choose uniformly at random among all arms that
        % share the maximal estimate, so ties are not biased towards the
        % lowest index (matters most early on, when all estimates are 0)
        idsQstep = find(Qavg == max(Qavg));
        iQstep = idsQstep(randi(numel(idsQstep)));
      else
        % exploration step: choose any arm uniformly at random
        iQstep = randi(n);
      end

      % reward = true value of the chosen arm plus unit-variance normal noise
      Rk = q(iQstep) + randn();
      totalRewards(i) = totalRewards(i) + Rk;

      % integrate the reward into the sample-average estimate of that arm
      Qsum(iQstep) = Qsum(iQstep) + Rk;
      nPulls(iQstep) = nPulls(iQstep) + 1;
      Qavg(iQstep) = Qsum(iQstep) / nPulls(iQstep);
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The full problem description can be found in Rich Sutton's wonderful reinforcement learning book.