benchmarking_walk.py
# This file contains the code for benchmarking the TD3 and SAC RL algorithms on the walker-walk task from the DeepMind Control Suite.
# Path: benchmarking_walk.py
### Imports
# Basic Libraries
from dm_control import suite, viewer
import numpy as np
import torch
import matplotlib.pyplot as plt
# RL Algorithms
from td3_dm import td3
from core_td3_dm import MLPActorCritic as MLPActorCriticTD3
from sac_dm import sac
from core_sac_dm import MLPActorCritic as MLPActorCriticSAC
### Benchmarking
if __name__ == "__main__":
    # Initialize the hyperparameters for the benchmarking
    r0 = np.random.RandomState(42)

    # Environment function
    def env_fn():
        return suite.load('walker', 'walk', task_kwargs={'random': r0})

    e = env_fn()
    U = e.action_spec()
    udim = U.shape[0]
    X = e.observation_spec()
    # walker-walk observation: orientations (14) + height (1) + velocity (9) = 24
    xdim = 14 + 1 + 9
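    # Sketch (not part of the original script): xdim could instead be derived from
    # the observation spec, avoiding the hard-coded walker-walk layout, e.g.
    #   xdim = sum(int(np.prod(s.shape)) for s in X.values())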
    ac_kwargs = {
        'hidden_sizes': [64, 64],
        'activation': torch.nn.Tanh
    }
    # Define hyperparameters for TD3/SAC
    seed = 42
    steps_per_epoch = 1000
    epochs = 500
    gamma = 0.95       # discount factor
    polyak = 0.99      # Polyak averaging coefficient for the target networks
    clip_ratio = 0.2   # not used by TD3/SAC
    pi_lr = 1e-2       # policy learning rate (TD3)
    vf_lr = 1e-4       # Q-function learning rate (TD3)
    lam = 0.97         # not used by TD3/SAC
    max_ep_len = 1000
    # Run TD3 (using the fixed set of hyperparameters shown above)
    # ac, average_test_returns, total_test_timesteps = td3(env_fn, actor_critic=MLPActorCriticTD3, ac_kwargs=ac_kwargs, seed=seed,
    #                                                      steps_per_epoch=steps_per_epoch, epochs=epochs, replay_size=int(1e6), gamma=gamma,
    #                                                      polyak=polyak, pi_lr=pi_lr, q_lr=vf_lr, batch_size=100, start_steps=10000,
    #                                                      update_after=1000, update_every=50, act_noise=0.1, target_noise=0.2,
    #                                                      noise_clip=0.5, policy_delay=2, num_test_episodes=10, max_ep_len=1000,
    #                                                      logger_kwargs=dict(), save_freq=1)
    # Run SAC (using the fixed set of hyperparameters shown above)
    ac, average_test_returns, total_test_timesteps = sac(env_fn, actor_critic=MLPActorCriticSAC, ac_kwargs=ac_kwargs, seed=seed,
                                                         steps_per_epoch=steps_per_epoch, epochs=epochs, replay_size=int(1e6), gamma=gamma,
                                                         polyak=polyak, lr=1e-2, alpha=0.2, batch_size=100, start_steps=10000,
                                                         update_after=1000, update_every=50, num_test_episodes=10, max_ep_len=1000,
                                                         logger_kwargs=dict(), save_freq=1)
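    # Optional sketch (not in the original script): the trained actor-critic could be
    # checkpointed here for later reuse, e.g.
    #   torch.save(ac.state_dict(), "sac_walker_walk.pt")  # hypothetical filename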
    # Store and plot the results
    plt.figure()
    plt.plot(total_test_timesteps, average_test_returns)
    plt.title("SAC Test Returns over All Timesteps")
    plt.xlabel("Timesteps")
    plt.ylabel("Average Return")
    plt.legend(["Test Returns"])
    plt.savefig("SAC Test Returns over All Timesteps.png")
    # plt.title("TD3 Test Returns over All Timesteps")
    # plt.xlabel("Timesteps")
    # plt.ylabel("Average Return")
    # plt.legend(["Test Returns"])
    # plt.savefig("TD3 Test Returns over All Timesteps.png")
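    # Optional sketch (not in the original script): the raw curves could also be saved
    # for later comparison across runs, e.g.
    #   np.savez("sac_walker_walk_curve.npz",  # hypothetical filename
    #            timesteps=total_test_timesteps, returns=average_test_returns)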
    # For use in the visualization
    vis_env = env_fn()

    # Policy function used by the viewer: it flattens the observation dict into the
    # vector the actor expects and returns the agent's action.
    def policy(time_step):
        if time_step.last():
            return np.zeros(vis_env.action_spec().shape)
        obs = time_step.observation
        obs_array = np.array(obs['orientations'].tolist() + [obs['height']] + obs['velocity'].tolist())
        obs_tensor = torch.as_tensor(obs_array, dtype=torch.float32).unsqueeze(0)
        action = ac.act(obs_tensor)
        return action.squeeze(0)
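    # Sketch (not part of the original script): the policy can also be sanity-checked
    # without the viewer by stepping the environment directly, e.g.
    #   ts = vis_env.reset()
    #   while not ts.last():
    #       ts = vis_env.step(policy(ts))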
    # Launch the viewer
    viewer.launch(vis_env, policy=policy)