'''
Inspired by: Moustafa Alzantot
https://medium.com/@m.alzantot/deep-reinforcement-learning-demysitifed-episode-2-policy-iteration-value-iteration-and-q-978f9e89ddaa
'''
import gym
import numpy as np

def random_policy(env):
    '''
    Return a policy that picks a uniformly random action in every state,
    used only as a baseline for comparison.
    '''
    policy = np.zeros(env.env.nS, dtype=np.int8)
    for s in range(env.env.nS):
        policy[s] = np.random.randint(0, env.env.nA)
    return policy
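
# Value iteration computes the optimal state-value function by repeatedly
# sweeping the Bellman optimality backup over all states until the values
# stop changing:
#   V(s) <- max_a sum_{s'} P(s'|s, a) * (R(s, a, s') + gamma * V(s'))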
def value_iteration(env, gamma=0.99, eps=1e-20, max_iterations=100000):
    '''
    Value iteration algorithm.
    It finds the optimal value function V and the optimal Q function.
    '''
    V = np.zeros(env.env.nS)
    Q = np.zeros((env.env.nS, env.env.nA))
    for i in range(max_iterations):
        old_V = np.copy(V)
        for s in range(env.env.nS):
            for a in range(env.env.nA):
                # Bellman optimality backup; env.env.P[s][a] is a list of
                # (probability, next_state, reward, done) transitions.
                Q[s][a] = sum(p * (r + gamma * old_V[s_]) for p, s_, r, _ in env.env.P[s][a])
            V[s] = max(Q[s])
        # Stop once the value function has (numerically) converged.
        if np.sum(np.fabs(V - old_V)) < eps:
            print('Value iteration converged after {} iterations.'.format(i))
            break
        elif i % 100 == 0:
            # This evaluates the current greedy policy during iteration, just for analysis.
            evaluate_policy(env, extract_policy(env, Q),
                            text='Our policy succeeds in {}% of episodes after ' + str(i) + ' iterations.',
                            comparison=False)
    return V, Q
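
# Once value iteration has converged, the optimal policy simply acts greedily
# with respect to Q: pi(s) = argmax_a Q(s, a).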
def extract_policy(env, Q):
    '''
    Extract the greedy policy from the Q function.
    '''
    policy = np.zeros(env.env.nS, dtype=np.int8)
    for s in range(env.env.nS):
        policy[s] = np.argmax(Q[s])
    return policy

def evaluate_policy(env, policy, n=1000, text='Our policy succeeds in {}% of episodes.', comparison=True):
    '''
    Evaluate a policy by measuring the percentage of episodes it wins.
    '''
    # In FrozenLake the only non-zero reward is 1 for reaching the goal,
    # so summing episode rewards counts the successful runs.
    success = 0
    for _ in range(n):
        success += run_episode(env, policy)
    print(text.format(success / n * 100))
    if comparison:
        success = 0
        for _ in range(n):
            success += run_episode(env, random_policy(env))
        print('Just for comparison, a random policy succeeds in {}% of episodes.'.format(success / n * 100))

def run_episode(env, policy, render_steps=False, render_final=False, max_iterations=10000000):
    '''
    Run one episode with the given policy and return the final reward
    (1 if the agent reached the goal, 0 otherwise).
    '''
    obs = env.reset()
    for _ in range(max_iterations):
        if render_steps:
            env.render()
        obs, reward, done, _ = env.step(policy[obs])
        if done:
            if render_final:
                env.render()
            return reward
    return 0
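
# Note: this script assumes the classic gym API, where reset() returns the
# observation and step() returns (obs, reward, done, info); newer gym and
# gymnasium releases changed both signatures.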
if __name__ == '__main__':
    env = gym.make('FrozenLake8x8-v0')
    V, Q = value_iteration(env)
    opt_policy = extract_policy(env, Q)
    evaluate_policy(env, opt_policy)
    print('An example:')
    run_episode(env, opt_policy, render_final=True)