Commit b5adf247 authored by Jiakai Song

upload model

parent 5c1926c7
Implementation of RL algorithm PDQN/HPPO for hybrid action space in HFO environment.
### Implementation of the RL algorithms PDQN/HPPO for hybrid action spaces in the HFO environment.
### Train
python run_hppo.py
python run_pdqn.py
### Load a trained model
python load.py --algo='hppo' --load_dir='trained/HPPO' [--render]
python load.py --algo='pdqn' --load_dir='trained/PDQN' [--render]
\ No newline at end of file
@@ -18,7 +18,7 @@ class Agent:
self.low.extend(l)
self.high.extend(h)
def norm_to(self, action, params):
def denormalize(self, action, params):
p = np.zeros_like(params)
for i in range(p.shape[0]):
p[i] = (params[i] + 1.) * (self.high[i] - self.low[i]) / 2 + self.low[i]
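For reference, this method maps each parameter from the policy's [-1, 1] output range back to the environment's bounds via p_i = (params_i + 1) * (high_i - low_i) / 2 + low_i. A minimal standalone sketch (the bounds below are illustrative placeholders, not the actual HFO ranges):

import numpy as np

low, high = [0.0, -180.0], [100.0, 180.0]   # hypothetical bounds, e.g. a kick power and an angle
params = np.array([0.0, 0.5])               # raw network output in [-1, 1]
p = np.zeros_like(params)
for i in range(p.shape[0]):
    p[i] = (params[i] + 1.) * (high[i] - low[i]) / 2 + low[i]
print(p)                                    # -> [50. 90.]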
@@ -84,15 +84,17 @@ class HPPO(Agent):
super(HPPO, self).__init__(env)
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# self.device = torch.device('cpu')
load_dir = None
if not load:
self.frame_stack = args.frame_stack
self.hidden_size = args.hidden_size
else:
info = torch.load('./models/HPPO/info.pkl')
load_dir = args.load_dir if args.load_dir is not None else './models/HPPO/'
info = torch.load(os.path.join(load_dir, 'info.pkl'))
self.frame_stack = info['frame_stack']
self.hidden_size = info['hidden_size']
self.input_size = self.state_size * self.frame_stack
self.actor = Actor(self.input_size, self.n_discrete, self.params_size, self.hidden_size).to(self.device)
self.critic = Critic(self.input_size, self.hidden_size).to(self.device)
@@ -109,9 +111,8 @@ class HPPO(Agent):
self.adv_norm = adv_norm
self.coef_entropy = args.coef_entropy
self.grad_clip = grad_clip
else:
self.load_model()
self.load_model(load_dir)
def choose_action(self, state, explore=True):
with torch.no_grad():
@@ -127,15 +128,12 @@ class HPPO(Agent):
log_prob2 = dist.log_prob(params).sum(-1).item()
action = action.item()
params = params.cpu().squeeze(0).numpy()
return self.norm_to(action, params), action, params, log_prob1, log_prob2
def get_std(self):
return self.actor.log_std.detach().exp().mean().item()
return self.denormalize(action, params), action, params, log_prob1, log_prob2
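For context, choose_action draws the discrete action from a Categorical head and its continuous parameters from a Gaussian head, keeping both log-probabilities for the later PPO ratios. A minimal sketch of that sampling pattern (the head outputs below are placeholders, not the repo's Actor):

import torch
from torch.distributions import Categorical, Normal

logits = torch.tensor([0.1, 0.5, -0.2])       # placeholder scores for 3 discrete actions
mu, log_std = torch.zeros(2), torch.zeros(2)  # placeholder Gaussian head for 2 parameters

categorical = Categorical(logits=logits)
dist = Normal(mu, log_std.exp())
action = categorical.sample()                 # which discrete action to take
params = dist.sample()                        # its continuous parameters, later denormalized
log_prob1 = categorical.log_prob(action)      # stored as old_log_p1
log_prob2 = dist.log_prob(params).sum(-1)     # joint log-prob over all parameters, stored as old_log_p2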
def update_step(self, s_batch, idx_batch, p_batch, old_log_p1, old_log_p2, returns_batch, adv_batch):
def update_step(self, s_batch, a_batch, p_batch, old_log_p1, old_log_p2, returns_batch, adv_batch):
v = self.critic(s_batch)
categorical, dist = self.actor(s_batch)
log_p1 = categorical.log_prob(idx_batch)
log_p1 = categorical.log_prob(a_batch)
log_p2 = dist.log_prob(p_batch).sum(-1)
entropy1 = categorical.entropy().mean()
entropy2 = dist.entropy().mean()
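The remainder of update_step falls outside this hunk; assuming a standard clipped PPO objective with one ratio per head, the losses built from the quantities above would look roughly like this (clip_eps and the exact weighting are assumptions, not the repo's values):

# Sketch only: clipped surrogate terms for the discrete and continuous heads.
clip_eps = 0.2
ratio1 = (log_p1 - old_log_p1).exp()
ratio2 = (log_p2 - old_log_p2).exp()
loss1 = -torch.min(ratio1 * adv_batch,
                   ratio1.clamp(1 - clip_eps, 1 + clip_eps) * adv_batch).mean()
loss2 = -torch.min(ratio2 * adv_batch,
                   ratio2.clamp(1 - clip_eps, 1 + clip_eps) * adv_batch).mean()
critic_loss = (returns_batch - v.squeeze(-1)).pow(2).mean()
actor_loss = loss1 + loss2 - self.coef_entropy * (entropy1 + entropy2)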
@@ -177,7 +175,7 @@ class HPPO(Agent):
r = torch.tensor(buffer_r, dtype=torch.float32).to(self.device)
log_prob1 = torch.tensor(log_p1, dtype=torch.float32).to(self.device)
log_prob2 = torch.tensor(log_p2, dtype=torch.float32).to(self.device)
v_s = self.critic(s).detach().squeeze(dim=0)
v_s = self.critic(s).detach()
action_idx = torch.tensor(buffer_a, dtype=torch.int64).to(self.device)
mask = torch.tensor(buffer_mask, dtype=torch.float32).to(self.device)
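How advantages and returns are computed is not visible in this hunk; one common choice that fits the tensors built above (v_s, r, mask) is GAE, sketched here with assumed gamma/lam hyperparameters and a simplified terminal bootstrap:

# Sketch only: GAE-style advantages; gamma and lam are assumed names and values.
gamma, lam = 0.99, 0.95
values = v_s.squeeze(-1)                     # V(s_t) for every stored step
adv = torch.zeros_like(r)
gae = 0.0
for t in reversed(range(r.shape[0])):
    next_v = values[t + 1] if t + 1 < r.shape[0] else 0.0   # real code would bootstrap the final next state
    delta = r[t] + gamma * next_v * mask[t] - values[t]
    gae = delta + gamma * lam * mask[t] * gae
    adv[t] = gae
returns = adv + values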
@@ -213,28 +211,18 @@ class HPPO(Agent):
s_batch = s[minibatch]
returns_batch = returns[minibatch]
adv_batch = adv[minibatch]
idx_batch = action_idx[minibatch]
a_batch = action_idx[minibatch]
p_batch = p[minibatch]
log_p1_batch = log_prob1[minibatch]
log_p2_batch = log_prob2[minibatch]
e1, e2 = self.update_step(s_batch, idx_batch, p_batch, log_p1_batch, log_p2_batch, returns_batch, adv_batch)
e1, e2 = self.update_step(s_batch, a_batch, p_batch, log_p1_batch, log_p2_batch, returns_batch, adv_batch)
entropy1_record.append(e1)
entropy2_record.append(e2)
return np.mean(entropy1_record), np.mean(entropy2_record)
# for _ in range(self.epochs * n // self.batch_size):
# minibatch = np.random.choice(n, self.batch_size, replace=False)
# s_batch = s[minibatch]
# returns_batch = returns[minibatch]
# adv_batch = adv[minibatch]
# idx_batch = action_idx[minibatch]
# p_batch = p[minibatch]
# log_p1_batch = log_prob1[minibatch]
# log_p2_batch = log_prob2[minibatch]
# self.update_step(s_batch, idx_batch, p_batch, log_p1_batch, log_p2_batch, returns_batch, adv_batch)
def save_model(self):
save_dir = './models/HPPO/'
def save_model(self, save_dir=None):
if save_dir is None:
save_dir = './models/HPPO/'
if not os.path.exists(save_dir):
os.makedirs(save_dir)
info = {'hidden_size': self.hidden_size, 'frame_stack': self.frame_stack}
@@ -242,7 +230,6 @@ class HPPO(Agent):
torch.save(self.actor.state_dict(), os.path.join(save_dir, 'actor.pkl'))
torch.save(self.critic.state_dict(), os.path.join(save_dir, 'critic.pkl'))
def load_model(self):
load_dir = './models/HPPO/'
def load_model(self, load_dir=None):
self.actor.load_state_dict(torch.load(os.path.join(load_dir, 'actor.pkl')))
self.critic.load_state_dict(torch.load(os.path.join(load_dir, 'critic.pkl')))
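With save_dir/load_dir now passed in, checkpoints can live in an arbitrary directory rather than the hard-coded './models/HPPO/'. A usage sketch (env and args construction elided; the directory name is only an example):

# After training: write the checkpoint to a chosen directory (None falls back to './models/HPPO/').
agent = HPPO(env, args, load=False)
agent.save_model('./models/HPPO_run1/')

# Later: point args.load_dir at the same directory, as load.py now does, and reload.
args.load_dir = './models/HPPO_run1/'
agent = HPPO(env, args, load=True)    # the constructor reads info.pkl and calls load_model(load_dir)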
@@ -161,13 +161,15 @@ class PDQN(Agent):
super(PDQN, self).__init__(env)
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# self.device = torch.device('cpu')
load_dir = None
if not load:
self.frame_stack = args.frame_stack
self.hidden_size = args.hidden_size
self.mp = args.mp
else:
info = torch.load('./models/PDQN/info.pkl')
load_dir = args.load_dir if args.load_dir is not None else './models/PDQN'
info = torch.load(os.path.join(load_dir, 'info.pkl'))
self.frame_stack = info['frame_stack']
self.hidden_size = info['hidden_size']
self.mp = info['mp']
@@ -207,7 +209,7 @@ class PDQN(Agent):
self.ou_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros([self.params_size]), sigma=0.005)
else:
self.add_ou_noise = False
self.load_model()
self.load_model(load_dir)
# def get_ave_max_q(self, state):
# with torch.no_grad():
@@ -265,7 +267,7 @@ class PDQN(Agent):
if self.add_ou_noise:
params = params + self.ou_noise()
params = np.clip(params, -1, 1)
return self.norm_to(action, params), action, params
return self.denormalize(action, params), action, params
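For context, PDQN picks the continuous parameters with the deterministic parameter network and then the discrete action greedily from the Q-network; a minimal sketch of that selection step, before the noise and clipping above (epsilon, explore and the exact tensor plumbing are assumptions):

# Sketch only: epsilon-greedy PDQN action selection under assumed names.
with torch.no_grad():
    s = torch.as_tensor(state, dtype=torch.float32, device=self.device).unsqueeze(0)
    params = self.params(s)                                    # all parameters, in [-1, 1]
    q_values = self.q_network(torch.cat([s, params], dim=-1))  # one Q-value per discrete action
    if explore and np.random.rand() < epsilon:
        action = np.random.randint(self.n_discrete)
    else:
        action = q_values.argmax(dim=-1).item()
params = params.cpu().squeeze(0).numpy()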
def train_step(self):
if len(self.replay) > self.memory_start:
@@ -306,16 +308,16 @@ class PDQN(Agent):
self.optim2.step()
self.step += 1
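The two optimizer steps in train_step typically correspond to the standard PDQN losses: a TD error for the Q-network and the negative Q-value for the parameter network. A rough sketch with assumed batch and hyperparameter names (no target network, for brevity):

# Sketch only: s_batch, a_batch, p_batch, r_batch, next_s_batch, done_batch and gamma are assumed names.
with torch.no_grad():
    next_params = self.params(next_s_batch)
    next_q = self.q_network(torch.cat([next_s_batch, next_params], dim=-1)).max(dim=-1).values
    target_q = r_batch + gamma * (1. - done_batch) * next_q
q = self.q_network(torch.cat([s_batch, p_batch], dim=-1)).gather(1, a_batch.unsqueeze(1)).squeeze(1)
q_loss = torch.nn.functional.mse_loss(q, target_q)   # minimized by the Q-network optimizer
param_loss = -self.q_network(torch.cat([s_batch, self.params(s_batch)], dim=-1)).sum(dim=-1).mean()   # minimized by optim2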
def save_model(self):
save_dir = './models/PDQN'
def save_model(self, save_dir=None):
if save_dir is None:
save_dir = './models/PDQN'
if not os.path.exists(save_dir):
os.makedirs(save_dir)
info = {'hidden_size': self.hidden_size, 'frame_stack': self.frame_stack, 'mp':self.mp}
info = {'hidden_size': self.hidden_size, 'frame_stack': self.frame_stack, 'mp': self.mp}
torch.save(info, os.path.join(save_dir, 'info.pkl'))
torch.save(self.q_network.state_dict(), os.path.join(save_dir, 'q_net.pkl'))
torch.save(self.params.state_dict(), os.path.join(save_dir, 'p_net.pkl'))
def load_model(self):
load_dir = './models/PDQN'
def load_model(self, load_dir):
self.q_network.load_state_dict(torch.load(os.path.join(load_dir, 'q_net.pkl')))
self.params.load_state_dict(torch.load(os.path.join(load_dir, 'p_net.pkl')))
@@ -64,11 +64,11 @@ class SoccerEnv(gym.Env, utils.EzPickle):
self._seed = -1
def __del__(self):
self.env.act(hfo_py.QUIT)
self.env.step()
os.kill(self.server_process.pid, signal.SIGINT)
if self.viewer is not None:
os.kill(self.viewer.pid, signal.SIGKILL)
self.env.act(hfo_py.QUIT)
self.env.step()
def _configure_environment(self):
"""
@@ -187,13 +187,6 @@ class SoccerEnv(gym.Env, utils.EzPickle):
else:
if self.viewer is None:
self._start_viewer()
def close(self):
if self.server_process is not None:
try:
os.kill(self.server_process.pid, signal.SIGKILL)
except Exception:
pass
class ServerDownException(Exception):
@@ -9,10 +9,14 @@ from envs.soccer_score_goal import SoccerScoreGoalEnv
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--num_episode', type=int, default=1000)
parser.add_argument('--load_dir', type=str, default=None)
parser.add_argument('--render', default=False, action='store_true')
parser.add_argument('--algo', type=str, default='hppo', choices=['hppo', 'pdqn'])
args = parser.parse_args()
env = SoccerScoreGoalEnv()
if args.render:
env.render()
if args.algo == 'hppo':
agent = HPPO(env, args, load=True)
else:
@@ -37,3 +41,5 @@ if __name__ == '__main__':
break
test_r /= args.num_episode
print('average reward: ', test_r)
@@ -85,9 +85,7 @@ if __name__ == '__main__':
num_steps += (t + 1)
reward_list.append(ep_r)
len_list.append(t + 1)
break
state = next_state
reward_record.append({
'episode': iteration,
@@ -15,8 +15,7 @@ SimpleTransition = namedtuple("Transition", ("state", "action", "params", "rewar
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--num_episode', type=int, default=50000)
parser.add_argument('--eval_interval', type=int, default=1000)
parser.add_argument('--num_episode', type=int, default=100000)
parser.add_argument('--save_interval', type=int, default=100)
parser.add_argument('--frame_stack', type=int, default=1)
parser.add_argument('--lr_q', type=float, default=0.0002)
@@ -47,6 +46,9 @@ if __name__ == '__main__':
state_size = env.observation_space.shape[0]
frame_stack = args.frame_stack
q = deque(maxlen=frame_stack)
ep_returns = []
lens = []
max_return = -np.inf
for ep in range(args.num_episode):
state = env.reset()
ep_r = 0
@@ -63,34 +65,19 @@ if __name__ == '__main__':
trans.append(SimpleTransition(stack_state, action, params, reward, next_stack_state, done))
agent.train_step()
stack_state = next_stack_state
if done:
writer.add_scalar('reward/episode', ep_r, ep)
ep_returns.append(ep_r)
lens.append(t + 1)
break
agent.push_transitions(trans)
if (ep + 1) % args.save_interval == 0:
agent.save_model()
avg_return = np.mean(ep_returns[-args.save_interval:])
writer.add_scalar('reward/episode', avg_return, ep + 1)
writer.add_scalar('length/episode', np.mean(lens[-args.save_interval:]), ep + 1)
if avg_return > max_return:
agent.save_model()
max_return = avg_return
# if (ep + 1) % args.eval_interval == 0:
# test_r = 0
# lens = 0
# for test in range(100):
# state = env.reset()
# for _ in range(frame_stack):
# q.append(state)
# for t in itertools.count():
# stack_state = np.array(q).flatten()
# hybrid_action, action, params = agent.choose_action(stack_state, explore=False)
# next_state, reward, done, info = env.step(hybrid_action)
# q.append(next_state)
# test_r += reward
# if done:
# lens += t + 1
# break
# test_r /= 100
# lens /= 100
# writer.add_scalar('eval_reward/episode', test_r, ep + 1)
# writer.add_scalar('eval_length/episode', lens, ep + 1)
writer.close()