Commit eff3b01f authored by Jiakai Song

add 1v1 model

parent 4b35b90c
## Implementation of the RL algorithms PDQN/HPPO for the hybrid action space in the HFO environment.
### Train
python run_hppo.py
python run_pdqn.py
### Load trained model
python load.py --algo='hppo' --load_dir='models/HPPO_1v0' [--render --no_sync]
python load.py --algo='pdqn' --load_dir='models/PDQN_1v0' [--render --no_sync]

# Implementation of the RL algorithms PDQN/HPPO for the hybrid action space in the HFO environment.
## Action space
### 1v0:
Dash(power, degrees),
Turn(degrees),
Kick(power, degrees)
### 1v1 (NPC):
Kick_To(x, y, speed),
Move_To(x, y),
Dribble_To(x, y),
Shoot
## Train
python run_hppo.py [--defense_npc]
python run_pdqn.py [--defense_npc]
## Load trained model
python load.py --algo=hppo --load_dir=models/HPPO_1v0 [--render --no_sync]
python load.py --algo=pdqn --load_dir=models/PDQN_1v0 [--render --no_sync]
python load.py --algo=hppo --defense_npc --load_dir=models/HPPO_1v1 [--render --no_sync]
python load.py --algo=pdqn --defense_npc --load_dir=models/PDQN_1v1 [--render --no_sync]
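Both trainers drive the environment with a hybrid action: one discrete action index concatenated with the continuous-parameter vector, which the agent keeps in [-1, 1] and denormalizes before the env step. A minimal sketch of that encoding, assuming an illustrative parameter layout for the 1v0 action set (the slice boundaries below are my assumption, not the repository's):

import numpy as np

# Hybrid-action encoding used by both agents: a discrete index plus the full
# parameter vector (the agents build it via np.concatenate(([action], p))).
# The slice layout is illustrative for Dash/Turn/Kick; the real environment
# defines its own parameter ordering and bounds.
PARAM_SLICES = {
    0: slice(0, 2),  # Dash(power, degrees)
    1: slice(2, 3),  # Turn(degrees)
    2: slice(3, 5),  # Kick(power, degrees)
}

def encode(action, params):
    """Pack the discrete index and the parameter vector into one flat array."""
    return np.concatenate(([action], params))

def decode(hybrid_action):
    """Recover the discrete index and the parameters of the chosen action."""
    action = int(hybrid_action[0])
    return action, hybrid_action[1:][PARAM_SLICES[action]]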
......@@ -25,5 +25,5 @@ class Agent:
hybrid_action = np.concatenate(([action], p))
return hybrid_action
def choose_action(self, state):
def choose_action(self, state, avail_actions):
raise NotImplementedError
......@@ -36,11 +36,12 @@ class Actor(nn.Module):
layer_init(self.discrete_action, std=1.0)
layer_init(self.mu, std=1.0)
def forward(self, state):
def forward(self, state, avail_actions):
discrete = state
for hidden_layer in self.discrete_layers:
discrete = F.relu(hidden_layer(discrete))
discrete_action = self.discrete_action(discrete)
discrete_action[avail_actions == 0] = -999999
prob = torch.softmax(discrete_action, dim=-1)
categorical = Categorical(prob)
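The new avail_actions argument in Actor.forward applies the usual invalid-action mask: unavailable actions get a very negative logit, so the softmax gives them effectively zero probability and Categorical never samples them. A self-contained sketch of the same trick (function and tensor names are illustrative, not the module's own):

import torch
from torch.distributions import Categorical

def masked_categorical(logits, avail_actions):
    # avail_actions is a 0/1 mask with the same shape as logits; masked
    # actions get a very negative logit, so softmax assigns them ~0 probability.
    masked_logits = logits.clone()
    masked_logits[avail_actions == 0] = -999999
    return Categorical(probs=torch.softmax(masked_logits, dim=-1))

# Example: three discrete actions, the last one unavailable.
dist = masked_categorical(torch.tensor([[0.2, 1.5, 0.7]]),
                          torch.tensor([[1.0, 1.0, 0.0]]))
print(dist.probs)  # the masked action's probability is ~0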
......@@ -81,6 +82,7 @@ class HPPO(Agent):
super(HPPO, self).__init__(env)
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# self.device = torch.device('cpu')
load_dir = None
if not load:
self.frame_stack = args.frame_stack
self.hidden_size = args.hidden_size
......@@ -112,10 +114,11 @@ class HPPO(Agent):
else:
self.load_model(load_dir)
def choose_action(self, state, explore=True):
def choose_action(self, state, avail_actions, explore=True):
with torch.no_grad():
state = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(self.device)
categorical, dist = self.actor(state)
avail_actions = torch.tensor(avail_actions, dtype=torch.float32).unsqueeze(0).to(self.device)
categorical, dist = self.actor(state, avail_actions)
if explore:
action = categorical.sample()
params = dist.sample().clamp_(-1, 1)
......@@ -128,9 +131,9 @@ class HPPO(Agent):
params = params.cpu().squeeze(0).numpy()
return self.denormalize(action, params), action, params, log_prob1, log_prob2
def update_step(self, s_batch, a_batch, p_batch, old_log_p1, old_log_p2, returns_batch, adv_batch):
def update_step(self, s_batch, a_batch, p_batch, avail_batch, old_log_p1, old_log_p2, returns_batch, adv_batch):
v = self.critic(s_batch)
categorical, dist = self.actor(s_batch)
categorical, dist = self.actor(s_batch, avail_batch)
log_p1 = categorical.log_prob(a_batch)
log_p2 = dist.log_prob(p_batch).sum(-1)
entropy1 = categorical.entropy().mean()
......@@ -165,12 +168,13 @@ class HPPO(Agent):
torch.nn.utils.clip_grad_norm_(self.critic.parameters(), self.grad_clip)
self.optim2.step()
return entropy1.item(), entropy2.data.item()
return entropy1.item(), entropy2.item()
def update_network(self, buffer_s, buffer_a, buffer_p, log_p1, log_p2, buffer_r, buffer_mask, n):
def update_network(self, buffer_s, buffer_a, buffer_p, buffer_avail, log_p1, log_p2, buffer_r, buffer_mask, n):
s = torch.tensor(buffer_s, dtype=torch.float32).to(self.device)
p = torch.tensor(buffer_p, dtype=torch.float32).to(self.device)
r = torch.tensor(buffer_r, dtype=torch.float32).to(self.device)
avail = torch.tensor(buffer_avail, dtype=torch.float32).to(self.device)
log_prob1 = torch.tensor(log_p1, dtype=torch.float32).to(self.device)
log_prob2 = torch.tensor(log_p2, dtype=torch.float32).to(self.device)
v_s = self.critic(s).detach()
......@@ -205,7 +209,7 @@ class HPPO(Agent):
if i == self.mini_batch - 1:
minibatch = shuffle[i * mini_batch_size:n]
else:
minibatch = shuffle[i * mini_batch_size:(i+1)*mini_batch_size]
minibatch = shuffle[i * mini_batch_size:(i + 1) * mini_batch_size]
s_batch = s[minibatch]
returns_batch = returns[minibatch]
adv_batch = adv[minibatch]
......@@ -213,7 +217,8 @@ class HPPO(Agent):
p_batch = p[minibatch]
log_p1_batch = log_prob1[minibatch]
log_p2_batch = log_prob2[minibatch]
e1, e2 = self.update_step(s_batch, a_batch, p_batch, log_p1_batch, log_p2_batch, returns_batch, adv_batch)
avail_batch = avail[minibatch]
e1, e2 = self.update_step(s_batch, a_batch, p_batch, avail_batch, log_p1_batch, log_p2_batch, returns_batch, adv_batch)
entropy1_record.append(e1)
entropy2_record.append(e2)
return np.mean(entropy1_record), np.mean(entropy2_record)
......@@ -228,6 +233,6 @@ class HPPO(Agent):
torch.save(self.actor.state_dict(), os.path.join(save_dir, 'actor.pkl'))
torch.save(self.critic.state_dict(), os.path.join(save_dir, 'critic.pkl'))
def load_model(self, load_dir=None):
def load_model(self, load_dir):
self.actor.load_state_dict(torch.load(os.path.join(load_dir, 'actor.pkl')))
self.critic.load_state_dict(torch.load(os.path.join(load_dir, 'critic.pkl')))
......@@ -11,7 +11,7 @@ from .common import Agent
Transition = namedtuple("Transition",
("state", "action", "params", "n_step_reward", "next_state", "done", "mc_target"))
("state", "action", "params", "n_step_reward", "next_state", "avail_actions_next", "done", "mc_target"))
class ParamsNet(nn.Module):
......@@ -211,40 +211,47 @@ class PDQN(Agent):
action = trans[i].action
params = trans[i].params
r = trans[i].reward
if i + self.n_step < n:
avail_actions_next = trans[i + self.n_step].avail_actions
else:
avail_actions_next = np.zeros([self.n_discrete])
mc_target = r + mc_target * self.gamma
if i + self.n_step >= n:
n_step_reward = n_step_reward * self.gamma + r
look_ahead_state = trans[n-1].next_state
look_ahead_state = trans[n - 1].next_state
done = True
else:
r_ = trans[i + self.n_step].reward
n_step_reward = (n_step_reward - r_ * self.gamma ** (self.n_step - 1)) * self.gamma + r
look_ahead_state = trans[i + self.n_step - 1].next_state
done = False
transitions.append(Transition(state, action, params, n_step_reward, look_ahead_state, done, mc_target))
transitions.append(Transition(state, action, params, n_step_reward, look_ahead_state, avail_actions_next, done, mc_target))
self.replay.extend(transitions)
def sample(self, mini_batch_size):
transitions = random.sample(self.replay, mini_batch_size)
return Transition(*zip(*transitions))
def choose_action(self, state, explore=True):
def choose_action(self, state, avail_actions, explore=True):
if explore:
epsilon = self.epsilon_schedule.get_epsilon()
else:
epsilon = 0
p = random.random()
if p < epsilon:
action = random.randint(0, self.n_discrete - 1)
# action = random.randint(0, self.n_discrete - 1)
avail_agent_action_idx = np.nonzero(avail_actions)[0]
action = np.random.choice(avail_agent_action_idx)
params = np.random.uniform(-1, 1, size=[self.params_size])
# params = np.clip(np.random.normal(0, 1, size=[self.params_size]), -1, 1)
else:
with torch.no_grad():
state = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(self.device)
params = self.p_net(state)
out = self.q_network(state, params).squeeze()
action = out.argmax().item()
q_vals = self.q_network(state, params).squeeze(0)
q_vals[avail_actions == 0] = -float('inf')
action = q_vals.argmax().item()
params = params.cpu().numpy()[0]
if self.add_ou_noise:
params = params + self.ou_noise()
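Taken together, PDQN's exploration branch now samples uniformly over the currently available actions, and the greedy branch masks the Q-values before the argmax. A compact sketch of that masked epsilon-greedy rule, with an illustrative helper name and signature rather than the class's own attributes:

import numpy as np
import torch

def masked_epsilon_greedy(q_values, avail_actions, epsilon):
    # q_values: 1-D tensor of Q(s, a) over the discrete actions
    # avail_actions: 0/1 numpy mask of the same length
    if np.random.random() < epsilon:
        # explore: uniform over the available actions only
        return int(np.random.choice(np.nonzero(avail_actions)[0]))
    # exploit: best available action; unavailable ones get -inf
    masked_q = q_values.clone()
    masked_q[torch.as_tensor(avail_actions) == 0] = -float('inf')
    return int(masked_q.argmax().item())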
......@@ -265,11 +272,14 @@ class PDQN(Agent):
next_state_batch = torch.from_numpy(np.array(minibatch.next_state)).float().to(self.device)
d_batch = torch.tensor(minibatch.done).float().to(self.device)
mc_batch = torch.tensor(minibatch.mc_target).float().to(self.device)
avail_actions_next_batch = torch.from_numpy(np.array(minibatch.avail_actions_next)).float().to(self.device)
Q = self.q_network(state_batch, params_batch)
Q = torch.gather(Q, dim=1, index=a_batch).squeeze()
with torch.no_grad():
params = self.target_p_net(next_state_batch)
target_q = self.target_q_network(next_state_batch, params).max(1)[0]
target_q = self.target_q_network(next_state_batch, params)
target_q[avail_actions_next_batch == 0] = -999999
target_q = target_q.max(1)[0]
target_q = n_step_reward_batch + target_q * (self.gamma ** self.n_step) * (1.0 - d_batch)
target_q = self.coeff_mc * mc_batch + (1 - self.coeff_mc) * target_q
q_loss = F.mse_loss(target_q, Q)
......
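In the update, the masked next-state Q-values feed an n-step bootstrapped target that is then blended with the Monte-Carlo return through coeff_mc. The same computation as in the hunk above, pulled out into a standalone helper with illustrative argument names:

import torch

def mixed_target(n_step_reward, next_q, avail_next, done, mc_target,
                 gamma, n_step, coeff_mc):
    # Mask unavailable next actions before taking the max, as in the update above,
    # then mix the n-step bootstrapped target with the Monte-Carlo return.
    next_q = next_q.clone()
    next_q[avail_next == 0] = -999999
    bootstrap = n_step_reward + next_q.max(dim=1)[0] * (gamma ** n_step) * (1.0 - done)
    return coeff_mc * mc_target + (1 - coeff_mc) * bootstrap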
......@@ -136,6 +136,10 @@ class SoccerEnv(gym.Env, utils.EzPickle):
" --connect --port %d" % (self.server_port)
self.viewer = subprocess.Popen(cmd.split(' '), shell=False)
def get_avail_actions(self):
avail_actions = np.ones([3])
return avail_actions
def step(self, action):
self._take_action(action)
self.status = self.env.step()
......
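The base SoccerEnv.get_avail_actions marks all three 1v0 actions as always available; the 1v1 environment would need at least a four-entry mask for its action set, and this commit does not show that override. A purely hypothetical sketch of what such an override could look like (the class name, the ball_kickable flag, and the action ordering are assumptions, not the repository's API):

import numpy as np

class SoccerAgainstKeeperEnvSketch:
    """Hypothetical 1v1 availability mask; not the repository's implementation."""

    ball_kickable = True  # assumed flag; the real env would read the HFO state

    def get_avail_actions(self):
        avail = np.ones([4])        # [Kick_To, Move_To, Dribble_To, Shoot]
        if not self.ball_kickable:
            avail[[0, 2, 3]] = 0    # off the ball, only Move_To stays available
        return avail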
......@@ -5,6 +5,7 @@ import numpy as np
from algorithms.pdqn import PDQN
from algorithms.hppo import HPPO
from envs.soccer_score_goal import SoccerScoreGoalEnv
from envs.soccer_against_keeper import SoccerAgainstKeeperEnv
if __name__ == '__main__':
parser = argparse.ArgumentParser()
......@@ -13,10 +14,14 @@ if __name__ == '__main__':
parser.add_argument('--render', default=False, action='store_true')
parser.add_argument('--no_sync', default=False, action='store_true')
parser.add_argument('--algo', type=str, default='hppo', choices=['hppo', 'pdqn'])
parser.add_argument('--defense_npc', default=False, action='store_true')
args = parser.parse_args()
sync = not (args.render and args.no_sync)
env = SoccerScoreGoalEnv(sync)
if args.defense_npc:
env = SoccerAgainstKeeperEnv(sync)
else:
env = SoccerScoreGoalEnv(sync)
if args.render:
env.render()
if args.algo == 'hppo':
......@@ -35,7 +40,8 @@ if __name__ == '__main__':
q.append(state)
for t in itertools.count():
stack_state = np.array(q).flatten()
hybrid_action, action, params = agent.choose_action(stack_state, explore=False)[:3]
avail_actions = env.get_avail_actions()
hybrid_action, action, params = agent.choose_action(stack_state, avail_actions, explore=False)[:3]
next_state, reward, done, info = env.step(hybrid_action)
q.append(next_state)
test_r += reward
......
import itertools
from collections import deque
from collections import namedtuple
import numpy as np
from algorithms.hppo import HPPO
Transition = namedtuple('Transition', ('state', 'action', 'params', 'log_prob1', 'log_prob2', 'reward', 'mask'))
from envs.soccer_score_goal import SoccerScoreGoalEnv
from envs.soccer_against_keeper import SoccerAgainstKeeperEnv
from tensorboardX import SummaryWriter
import argparse
import yaml
import os
Transition = namedtuple('Transition', ('state', 'action', 'params', 'avail_actions', 'log_prob1', 'log_prob2', 'reward', 'mask'))
class Memory:
def __init__(self):
......@@ -44,6 +43,7 @@ if __name__ == '__main__':
parser.add_argument('--epsilon', type=float, default=0.2)
parser.add_argument('--coef_entropy', type=float, nargs='+', default=[0.001, 0.001])
parser.add_argument('--hidden_size', type=int, nargs='+', default=[256, 256, 256, 256])
parser.add_argument('--defense_npc', default=False, action='store_true')
args = parser.parse_args()
writer = SummaryWriter()
......@@ -54,7 +54,10 @@ if __name__ == '__main__':
num_iteration = args.num_iteration
batch_size = args.batch_size
frame_stack = args.frame_stack
env = SoccerScoreGoalEnv()
if args.defense_npc:
env = SoccerAgainstKeeperEnv()
else:
env = SoccerScoreGoalEnv()
agent = HPPO(env, args)
reward_record = []
......@@ -76,12 +79,13 @@ if __name__ == '__main__':
for t in itertools.count():
total_steps += 1
stack_state = np.array(q).flatten()
hybrid_action, action, params, log_prob1, log_prob2 = agent.choose_action(stack_state)
avail_actions = env.get_avail_actions()
hybrid_action, action, params, log_prob1, log_prob2 = agent.choose_action(stack_state, avail_actions)
next_state, reward, done, info = env.step(hybrid_action)
q.append(next_state)
ep_r += reward
mask = 0 if done else 1
memory.push(stack_state, action, params, log_prob1, log_prob2, reward, mask)
memory.push(stack_state, action, params, avail_actions, log_prob1, log_prob2, reward, mask)
if done:
num_steps += (t + 1)
reward_list.append(ep_r)
......@@ -103,9 +107,10 @@ if __name__ == '__main__':
log_prob2_batch = batch.log_prob2
reward_batch = batch.reward
mask_batch = batch.mask
avail_batch = batch.avail_actions
e1, e2 = agent.update_network(state_batch, action_batch, params_batch, log_prob1_batch, log_prob2_batch,
reward_batch, mask_batch, n)
e1, e2 = agent.update_network(state_batch, action_batch, params_batch, avail_batch, log_prob1_batch,
log_prob2_batch, reward_batch, mask_batch, n)
mean_ep_reward = reward_record[-1]['mean_ep_reward']
mean_ep_length = reward_record[-1]['mean_ep_length']
......
......@@ -10,8 +10,9 @@ from tensorboardX import SummaryWriter
from algorithms.pdqn import PDQN
from envs.soccer_score_goal import SoccerScoreGoalEnv
from envs.soccer_against_keeper import SoccerAgainstKeeperEnv
SimpleTransition = namedtuple("Transition", ("state", "action", "params", "reward", "next_state", "done"))
SimpleTransition = namedtuple("Transition", ("state", "action", "params", "avail_actions", "reward", "next_state", "done"))
if __name__ == '__main__':
parser = argparse.ArgumentParser()
......@@ -36,14 +37,17 @@ if __name__ == '__main__':
parser.add_argument('--mp', default=False, action='store_true') # multi pass
parser.add_argument('--squash', default=False, action='store_true') # tanh to (-1, 1)
parser.add_argument('--hidden_size', type=int, nargs='+', default=[256, 256, 256, 256])
parser.add_argument('--defense_npc', default=False, action='store_true')
args = parser.parse_args()
writer = SummaryWriter()
logdir = writer.logdir
with open(os.path.join(logdir, 'PDQN_config.yaml'), 'w') as f:
yaml.dump(args.__dict__, f)
env = SoccerScoreGoalEnv()
if args.defense_npc:
env = SoccerAgainstKeeperEnv()
else:
env = SoccerScoreGoalEnv()
agent = PDQN(env, args)
state_size = env.observation_space.shape[0]
frame_stack = args.frame_stack
......@@ -61,12 +65,13 @@ if __name__ == '__main__':
stack_state = np.array(q).flatten()
for t in itertools.count():
total_steps += 1
hybrid_action, action, params = agent.choose_action(stack_state)
avail_actions = env.get_avail_actions()
hybrid_action, action, params = agent.choose_action(stack_state, avail_actions)
next_state, reward, done, info = env.step(hybrid_action)
q.append(next_state)
next_stack_state = np.array(q).flatten()
ep_r += reward
trans.append(SimpleTransition(stack_state, action, params, reward, next_stack_state, done))
trans.append(SimpleTransition(stack_state, action, params, avail_actions, reward, next_stack_state, done))
agent.train_step()
stack_state = next_stack_state
if done:
......