-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgridworld.py
62 lines (48 loc) · 1.82 KB
/
gridworld.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import sys
from os.path import dirname, join, realpath
dir_path = dirname(dirname(realpath(__file__)))
sys.path.insert(1, join(dir_path, 'utils'))
import numpy as np
from env import GridWorld
def get_true_value(env: 'GridWorld', gamma: float) -> np.ndarray:
    '''
    Compute true value by solving the Bellman equations written as
    a system of linear equations Ax=b.

    Every action of every state in env.state_space is simulated once
    with env.step. Row i of the system (state s) reads
        sum_a p(a) * gamma * v(s'_a) - v(s) = -sum_a p(a) * r_a
    with p(a) = env.transition_probs[a] and (s'_a, r_a) the next state
    and reward of action a.

    env.state is overwritten while simulating; the value it had on
    entry is put back before returning, also when an error is raised.

    Params
    ------
    env: GridWorld
    gamma: discount factor

    Returns
    -------
    1-D array of length len(env.state_space); the value of a state is
    stored at index state[0] * env.height + state[1]

    Raises
    ------
    numpy.linalg.LinAlgError if the linear system is singular
    '''
    n_states = len(env.state_space)
    A = np.zeros((n_states, n_states))
    b = np.zeros(n_states)

    # Remember the caller's position (if the attribute exists at all)
    # so that the simulation below does not leak into the caller's env.
    missing = object()
    saved_state = getattr(env, 'state', missing)

    try:
        for i, state in enumerate(env.state_space):
            coefficients = np.zeros(n_states)
            expected_reward = 0

            for action in env.action_space:
                # Place the agent on a copy so env.step cannot alter
                # the entries of env.state_space.
                env.state = np.copy(state)
                next_state, reward, _ = env.step(action)

                # A step that changes the position and does not start
                # from a state listed in env.states_ is worth nothing;
                # any other step keeps the reward reported by env.step.
                moved = not (state == next_state).all()
                if moved and (state[0], state[1]) not in env.states_:
                    reward = 0

                prob = env.transition_probs[action]
                next_index = next_state[0] * env.height + next_state[1]
                coefficients[next_index] += prob * gamma
                expected_reward += prob * reward

            # NOTE(review): columns are addressed by
            # state[0] * env.height + state[1] while rows follow the
            # enumeration order of env.state_space; the two presumably
            # coincide - confirm against GridWorld.
            coefficients[state[0] * env.height + state[1]] -= 1
            A[i] = coefficients
            b[i] = -expected_reward
    finally:
        if saved_state is not missing:
            env.state = saved_state

    return np.linalg.solve(A, b)
if __name__ == '__main__':
    # Solve the Bellman equations on a 5x5 grid and print the values
    # as a (height, width) table rounded to one decimal.
    height = width = 5

    # NOTE(review): presumably [source cells, destination cells,
    # rewards] - confirm against env.GridWorld.
    special_states = [
        [(0, 1), (0, 3)],
        [(4, 1), (2, 3)],
        [10, 5],
    ]
    gamma = 0.9
    env = GridWorld(height, width, special_states=special_states)

    flat_values = get_true_value(env, gamma)
    value_grid = np.reshape(flat_values, (height, width))
    true_value = np.around(value_grid, decimals=1)
    print(true_value)