value_iteration.py
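"""Evaluate a greedy policy learned with tabular Q-learning on the FourRooms
gridworld, using iterative policy evaluation (the value() function below)."""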
from collections import defaultdict

import numpy as np
import gym

import envs.gridworld  # imported for its side effect of registering the gridworld envs (e.g. FourRooms-v1) with gym
from agents.qlearning.qlearning_agent import QLearningAgent


def value(policy, states, transition_probabilities, reward, discount, threshold=1e-2):
    """Iterative policy evaluation: compute the state-value function of `policy`.

    Sweeps over all states until the largest value change in a sweep falls
    below `threshold`. States are keyed by their string representation.
    """
    value_state = defaultdict(lambda: 0.0)
    for state in states:
        value_state[str(state)] = 0.0

    diff = float("inf")
    while diff > threshold:
        diff = 0
        for state in states:
            state = str(state)
            old_value = value_state[state]
            action = policy[state]
            # Bellman backup for the fixed policy:
            # V(s) = sum_s' P(s' | s, pi(s)) * (R(s') + gamma * V(s'))
            new_value = 0.0
            for new_state in states:
                new_state = str(new_state)
                new_value += transition_probabilities[state][action][new_state] * (
                    reward[new_state] + discount * value_state[new_state])
            value_state[state] = new_value
            diff = max(diff, abs(old_value - new_value))
    return value_state


def q_to_policy(q, offset=0):
    """Extract the greedy deterministic policy from a Q-table: pi(s) = argmax_a Q(s, a)."""
    optimal_policy = {}
    for state in q:
        optimal_policy[state] = np.argmax(q[state]) + offset
    return optimal_policy


if __name__ == '__main__':
    gamma = 0.9
    env = gym.make('FourRooms-v1')

    # Learn a Q-table with tabular Q-learning, then extract the greedy policy.
    agent = QLearningAgent(env, gamma=gamma, alpha=0.12, epsilon=0.1)
    agent.train(1000)
    policy = q_to_policy(agent.q)

    # Evaluate that policy under the environment's true dynamics and rewards.
    rewards = defaultdict(lambda: 0.0)
    for s in env.possibleStates:
        rewards[str(s)] = env._get_reward(s)

    v = value(policy,
              env.possibleStates,
              env.transition_probability,
              rewards, gamma)
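
    # Example follow-up: print the evaluated value of each state, keyed the same
    # way as inside value() above (states stringified with str()).
    for s in env.possibleStates:
        print(s, v[str(s)])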