# mountain_car.py
import numpy as np
import gym
import matplotlib.pyplot as plt


def epsilon_greedy(epsilon, Q, n_actions, state):
    '''
    Choose an action according to the epsilon-greedy policy.

    Params
    ------
    epsilon: float
        exploration probability
    Q: np.ndarray
        action-value function
    n_actions: int
        number of actions
    state: (int, int)
        current (discretized) state

    Return
    ------
    action: int
    '''
    if not np.random.binomial(1, epsilon):
        # exploit: greedy action for this state
        action = np.argmax(Q[state[0], state[1]])
    else:
        # explore: uniformly random action
        action = np.random.randint(n_actions)
    return action
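
# Quick sanity check (illustrative only, kept as comments so it does not run
# with the script; Q_demo and its shape are hypothetical):
#
#   Q_demo = np.zeros((19, 15, 3))
#   Q_demo[7, 7, 2] = 1.0
#   epsilon_greedy(0.0, Q_demo, 3, (7, 7))  # -> 2 (always greedy)
#   epsilon_greedy(1.0, Q_demo, 3, (7, 7))  # -> uniform over {0, 1, 2}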


def discretize_state(env, position, velocity):
    '''
    Discretize a continuous state:
    [position, velocity] => [int((position - low) * 10), int((velocity - low) * 100)]

    Params
    ------
    env: OpenAI MountainCar env
    position: float
        position of the car
    velocity: float
        velocity of the car

    Return
    ------
    (discretized_position, discretized_velocity): (int, int)
    '''
    position_low, velocity_low = env.observation_space.low
    discretized_position = int((position - position_low) * 10)
    discretized_velocity = int((velocity - velocity_low) * 100)
    return discretized_position, discretized_velocity
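
# Worked example (hand-checked against the default MountainCar-v0 bounds,
# low = [-1.2, -0.07]): the typical start state (-0.5, 0.0) maps to
#   int((-0.5 - (-1.2)) * 10)  = 7
#   int((0.0 - (-0.07)) * 100) = 7
# i.e. grid cell (7, 7) of the position/velocity grid built in __main__.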


def q_learning(Q, env, epsilon, alpha, gamma, current_ep, n_eps):
    '''
    Run one episode of Q-learning, updating Q in place.

    Params
    ------
    Q: np.ndarray
        action-value function
    env: OpenAI MountainCar env
    epsilon: float
        epsilon-greedy parameter
    alpha: float
        step size
    gamma: float
        discount factor
    current_ep: int
        current episode
    n_eps: int
        total number of episodes (the last 20 are rendered)

    Return
    ------
    total_reward: float
        total reward of the episode
    '''
    # NOTE: uses the pre-0.26 gym API: reset() returns only the observation
    # and step() returns a 4-tuple.
    start_state = env.reset()
    state = discretize_state(env, start_state[0], start_state[1])
    n_actions = env.action_space.n
    total_reward = 0
    while True:
        # render only the last 20 episodes
        if current_ep + 20 >= n_eps:
            env.render()
        action = epsilon_greedy(epsilon, Q, n_actions, state)
        next_state_, reward, terminated, _ = env.step(action)
        next_state = discretize_state(env, next_state_[0], next_state_[1])
        if terminated and next_state_[0] >= 0.5:
            # goal reached: terminal update, no bootstrap term
            Q[state[0], state[1], action] += alpha * (
                reward - Q[state[0], state[1], action])
        else:
            # Q-learning update:
            # Q(s, a) += alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
            delta = alpha * (reward + gamma
                             * np.max(Q[next_state[0], next_state[1]])
                             - Q[state[0], state[1], action])
            Q[state[0], state[1], action] += delta
        total_reward += reward
        state = next_state
        if terminated:
            break
    return total_reward
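
# Sketch of a single-episode call (assumes the same setup as __main__ below;
# the table shape is the one derived there):
#
#   env = gym.make('MountainCar-v0')
#   Q = np.zeros((19, 15, env.action_space.n))
#   ep_reward = q_learning(Q, env, epsilon=0.1, alpha=0.5, gamma=0.9,
#                          current_ep=0, n_eps=5000)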


if __name__ == '__main__':
    # state = [position, velocity]
    # position_range = [-1.2, 0.6]
    # velocity_range = [-0.07, 0.07]
    env = gym.make('MountainCar-v0')
    position_low, velocity_low = env.observation_space.low
    position_high, velocity_high = env.observation_space.high
    max_position, max_velocity = discretize_state(env, position_high, velocity_high)
    min_position, min_velocity = discretize_state(env, position_low, velocity_low)
    n_positions = max_position - min_position + 1
    n_velocities = max_velocity - min_velocity + 1
    n_actions = env.action_space.n
    Q = np.zeros((n_positions, n_velocities, n_actions))
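
    # Under the default MountainCar-v0 bounds this works out to (easy to
    # verify by hand, worth re-checking on your gym version):
    #   positions:  int((0.6 - (-1.2)) * 10)    = 18  -> 19 bins
    #   velocities: int((0.07 - (-0.07)) * 100) = 14  -> 15 bins
    # so Q.shape == (19, 15, 3).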

    n_eps = 5000
    epsilon = 0.1
    alpha = 0.5
    min_alpha = 0.01
    gamma = 0.9
    reward_list = []
    ave_reward_list = []
    # decay alpha linearly from its initial value to min_alpha over n_eps episodes
    reduction = (alpha - min_alpha) / n_eps
    for ep in range(n_eps):
        total_reward = q_learning(Q, env, epsilon, alpha, gamma, ep, n_eps)
        if alpha > min_alpha:
            alpha -= reduction
        reward_list.append(total_reward)
        if (ep + 1) % 100 == 0:
            ave_reward = np.mean(reward_list)
            ave_reward_list.append(ave_reward)
            reward_list = []
            print('Episode {} Average Reward: {}'.format(ep + 1, ave_reward))
    env.close()

    # each point is the average reward over a 100-episode window
    plt.plot(100 * (np.arange(len(ave_reward_list)) + 1), ave_reward_list)
    plt.xlabel('Episodes')
    plt.ylabel('Average Reward')
    plt.title('Average Reward vs Episodes')
    plt.savefig('./mountain_car.png')
    plt.close()