#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Implementation of the small grid world example illustrated by David Silver
in his Reinforcement Learning Lecture 3 - Planning by Dynamic
Programming.
Author: Qiang Ye
Date: July 1, 2017
The value function converges to:
0.00 -14.00 -20.00 -22.00
-14.00 -18.00 -20.00 -20.00
-20.00 -20.00 -18.00 -14.00
-22.00 -20.00 -14.00 0.00
At Iterate No.153
'''
# id of the states, 0 and 15 are terminal states
states = [i for i in range(16)]
# 0* 1 2 3
# 4 5 6 7
# 8 9 10 11
# 12 13 14 15*
# initial values of states
values = [0 for _ in range(16)]
# Action
actions = ["n", "e", "s", "w"]
# State changed by action
# use a dictionary for convenient computation of next state id.
ds_actions = {"n": -4, "e": 1, "s": 4, "w": -1}
# discount factor
gamma = 1.00
# given the current state and an action, return the next state
def nextState(s, a):
next_state = s
if (s%4 == 0 and a == "w") or (s<4 and a == "n") or \
((s+1)%4 == 0 and a == "e") or (s > 11 and a == "s"):
pass
else:
ds = ds_actions[a]
next_state = s + ds
return next_state
# reward of a state
def rewardOf(s):
return 0 if s in [0,15] else -1
# check if a state is a terminal state
def isTerminateState(s):
return s in [0,15]
# get successor states of a given state s
def getSuccessors(s):
successors = []
if isTerminateState(s):
return successors
for a in actions:
next_state = nextState(s, a)
        # an action that bumps into a wall keeps the agent in s; that self-transition still counts as a successor
successors.append(next_state)
return successors
# update the value of state s
def updateValue(s):
    successors = getSuccessors(s)
    newValue = 0
    num = 4  # uniform random policy: each of the 4 actions has probability 1/4
    reward = rewardOf(s)
    for next_state in successors:
        newValue += 1.00 / num * (reward + gamma * values[next_state])
    return newValue
# perform one synchronous iteration of value updates over all states
def performOneIteration():
    global values
    newValues = [0 for _ in range(16)]
    for s in states:
        newValues[s] = updateValue(s)
    values = newValues
    printValue(values)
# print the state values as a 4x4 grid
def printValue(v):
    for i in range(16):
        print('{0:>6.2f}'.format(v[i]), end="")
        if (i + 1) % 4 == 0:
            print("")
    print()
# test function
def test():
printValue(states)
printValue(values)
for s in states:
reward = rewardOf(s)
for a in actions:
next_state = nextState(s, a)
print("({0}, {1}) -> {2}, with reward {3}".format(s, a,next_state, reward))
for i in range(200):
performOneIteration()
printValue(values)
def main():
max_iterate_times = 160
cur_iterate_times = 0
while cur_iterate_times <= max_iterate_times:
print("Iterate No.{0}".format(cur_iterate_times))
performOneIteration()
cur_iterate_times += 1
printValue(values)
if __name__ == '__main__':
main()
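To run the script (assuming it is saved as small_grid_world.py, a file name chosen here purely for illustration):

python3 small_grid_world.py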
When you execute it, you will get the value functions of the first and second iterations, shown in the tables below:
Value function of Iterate No.0
0.00 -1.00 -1.00 -1.00
-1.00 -1.00 -1.00 -1.00
-1.00 -1.00 -1.00 -1.00
-1.00 -1.00 -1.00 0.00
Value function of Iterate No.1
0.00 -1.75 -2.00 -2.00
-1.75 -2.00 -2.00 -2.00
-2.00 -2.00 -2.00 -1.75
-2.00 -2.00 -1.75 0.00
Since the algorithm here is plain dynamic programming, I only want to explain how the value function for state 1 is generated in the first and second iterations.
In Iterate No.0, the value of every state is 0.
Given state 1, we can calculate its new value using the Bellman equation for an MRP (the uniform random policy reduces this small MDP to an MRP).
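Written out (a sketch of the update in LaTeX; the successor-set notation S(s) is my own shorthand, not from the lecture):

v_{k+1}(s) = \sum_{s' \in S(s)} \frac{1}{4} \left( R_s + \gamma \, v_k(s') \right)

where S(s) contains the four states reached from s by the actions n, e, s, w (a move into a wall leaves s unchanged), R_s = -1 for non-terminal states, R_s = 0 for the terminal states, and \gamma = 1.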
In the source code, this formula is implemented as:
for next_state in successors:
newValue += 1.00/num * (reward + gamma * values[next_state])
As shown below, the 4 actions from state 1 map to 4 next states:
(1, n) -> 1, with reward -1  # next state: 1
(1, e) -> 2, with reward -1  # next state: 2
(1, s) -> 5, with reward -1  # next state: 5
(1, w) -> 0, with reward -1  # next state: 0
So, in Iterate No.0:
v(1) = 1/4 * (-1 + 1 * 0)  # next state 1's value = 0
     + 1/4 * (-1 + 1 * 0)  # next state 2's value = 0
     + 1/4 * (-1 + 1 * 0)  # next state 5's value = 0
     + 1/4 * (-1 + 1 * 0)  # next state 0's value = 0
     = -1
In Iterate No.1, we calculate the value for state 1 with the same formula, now using the value function from Iterate No.0:
v(1) = 1/4 * (-1 + 1 * -1)  # next state 1's value = -1
     + 1/4 * (-1 + 1 * -1)  # next state 2's value = -1
     + 1/4 * (-1 + 1 * -1)  # next state 5's value = -1
     + 1/4 * (-1 + 1 * 0)   # next state 0's value = 0
     = -1.75
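To double-check these two numbers, here is a minimal, self-contained sketch (not part of the original script; the variable names are my own) that reproduces the hand calculation:

gamma = 1.0
reward = -1                    # reward for leaving any non-terminal state
neighbors_of_1 = [1, 2, 5, 0]  # next states of state 1 for actions n, e, s, w
# Iterate No.0: every state value starts at 0
v0 = {0: 0.0, 1: 0.0, 2: 0.0, 5: 0.0}
print(sum(0.25 * (reward + gamma * v0[s]) for s in neighbors_of_1))  # -1.0
# Iterate No.1: non-terminal neighbors are now -1, terminal state 0 stays 0
v1 = {0: 0.0, 1: -1.0, 2: -1.0, 5: -1.0}
print(sum(0.25 * (reward + gamma * v1[s]) for s in neighbors_of_1))  # -1.75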
Bingo, the answer is correct!!