Commit 86eaf68e authored by Jiakai Song

upload model

parent 65fe5cbe
@@ -8,6 +8,6 @@ python run_pdqn.py
### Load trained model
python load.py --algo='hppo' --load_dir='trained/HPPO' [--render]
python load.py --algo='hppo' --load_dir='models/_HPPO' [--render --no_sync]
python load.py --algo='pdqn' --load_dir='trained/PDQN' [--render]
python load.py --algo='pdqn' --load_dir='models/_PDQN' [--render --no_sync]
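Note: as the load.py hunk at the end of this diff shows, --no_sync only takes effect together with --render, because the environment is created with

sync = not (args.render and args.no_sync)
env = SoccerScoreGoalEnv(sync)

so passing --no_sync without --render still starts the HFO server in sync mode.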
@@ -17,11 +17,11 @@ class Agent:
for l, h in zip(self.space_low, self.space_high):
self.low.extend(l)
self.high.extend(h)
self.low = np.array(self.low)
self.high = np.array(self.high)
def denormalize(self, action, params):
p = np.zeros_like(params)
for i in range(p.shape[0]):
p[i] = (params[i] + 1.) * (self.high[i] - self.low[i]) / 2 + self.low[i]
p = (params + 1.) * (self.high - self.low) / 2 + self.low
hybrid_action = np.concatenate(([action], p))
return hybrid_action
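A minimal worked example of the vectorized denormalize above (toy bounds; it assumes the parameter network emits values squashed to [-1, 1], as the tanh heads later in this diff do):

import numpy as np

low = np.array([0., -180.])      # toy lower bounds, e.g. dash power / direction
high = np.array([100., 180.])
params = np.array([-1., 0.])     # network output in [-1, 1]
p = (params + 1.) * (high - low) / 2 + low
print(p)                         # [0. 0.]: -1 maps to the lower bound, 0 to the midpoint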
......
@@ -12,9 +12,6 @@ def layer_init(layer, std=1.0, bias_const=0.0):
torch.nn.init.constant_(layer.bias, bias_const)
hidden_activate = F.relu
class Actor(nn.Module):
def __init__(self, input_size, n_discrete, params_size, hidden_size=None):
super(Actor, self).__init__()
@@ -42,14 +39,14 @@ class Actor(nn.Module):
def forward(self, state):
discrete = state
for hidden_layer in self.discrete_layers:
discrete = hidden_activate(hidden_layer(discrete))
discrete = F.relu(hidden_layer(discrete))
discrete_action = self.discrete_action(discrete)
prob = torch.softmax(discrete_action, dim=-1)
categorical = Categorical(prob)
continuous = state
for hidden_layer in self.continuous_layers:
continuous = hidden_activate(hidden_layer(continuous))
continuous = F.relu(hidden_layer(continuous))
mu = torch.tanh(self.mu(continuous))
# mu = self.mu(continuous)
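A hedged sketch of how a hybrid action could be drawn from this actor at rollout time. The names actor, state, and log_std are assumptions: only the categorical head and the tanh-squashed mu appear in the hunk above, and the Gaussian std is not shown in this diff.

import torch
from torch.distributions import Categorical, Normal

with torch.no_grad():
    prob, mu = actor(state)                        # hypothetical return signature of Actor.forward
    discrete_action = Categorical(prob).sample()   # pick the discrete action type
    std = log_std.exp()                            # assumed learned log-std, not shown above
    params = Normal(mu, std).sample().clamp(-1., 1.)
    # params in [-1, 1] would then go through Agent.denormalize before env.step()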
@@ -74,7 +71,7 @@ class Critic(nn.Module):
def forward(self, state):
out = state
for hidden_layer in self.layers:
out = hidden_activate(hidden_layer(out))
out = F.relu(hidden_layer(out))
v = self.v(out)
return v.squeeze(-1)
@@ -84,13 +81,14 @@ class HPPO(Agent):
super(HPPO, self).__init__(env)
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# self.device = torch.device('cpu')
load_dir = None
if not load:
self.frame_stack = args.frame_stack
self.hidden_size = args.hidden_size
else:
if args.load_dir is not None:
load_dir = args.load_dir
else:
load_dir = './models/HPPO/'
info = torch.load(os.path.join(load_dir, 'info.pkl'))
self.frame_stack = info['frame_stack']
self.hidden_size = info['hidden_size']
......
@@ -9,17 +9,17 @@ import torch.nn.functional as F
from .common import Agent
dueling = False
scale = True
Transition = namedtuple("Transition",
("state", "action", "params", "n_step_reward", "next_state", "done", "mc_target"))
class ParamsNet(nn.Module):
def __init__(self, input_size, params_size, hidden_size=None):
def __init__(self, input_size, params_size, squash=False, hidden_size=None):
super(ParamsNet, self).__init__()
if hidden_size is None:
hidden_size = [256, 128, 64]
self.squash = squash
self.layers = nn.ModuleList([nn.Linear(input_size, hidden_size[0])])
for x, y in zip(hidden_size[:-1], hidden_size[1:]):
self.layers.append(nn.Linear(x, y))
@@ -30,7 +30,7 @@ class ParamsNet(nn.Module):
for hidden_layer in self.layers:
out = F.leaky_relu(hidden_layer(out), 0.01)
out = self.output_layer(out)
if scale:
if self.squash:
out = torch.tanh(out)
return out
@@ -44,22 +44,13 @@ class QNet(nn.Module):
self.layers = nn.ModuleList([nn.Linear(input_size, hidden_size[0])])
for x, y in zip(hidden_size[:-1], hidden_size[1:]):
self.layers.append(nn.Linear(x, y))
if not dueling:
self.q = nn.Linear(hidden_size[-1], n_actions)
else:
self.adv = nn.Linear(hidden_size[-1], n_actions)
self.v = nn.Linear(hidden_size[-1], 1)
self.q = nn.Linear(hidden_size[-1], n_actions)
def forward(self, state, params):
out = torch.cat((state, params), dim=1)
for hidden_layer in self.layers:
out = F.leaky_relu(hidden_layer(out), 0.01)
if not dueling:
q_val = self.q(out)
else:
v = self.v(out)
adv = self.adv(out)
q_val = v + (adv - adv.mean(dim=1, keepdim=True))
q_val = self.q(out)
return q_val
@@ -80,11 +71,8 @@ class MultiPassQNet(nn.Module):
self.layers = nn.ModuleList([nn.Linear(input_size, hidden_size[0])])
for x, y in zip(hidden_size[:-1], hidden_size[1:]):
self.layers.append(nn.Linear(x, y))
if not dueling:
self.q = nn.Linear(hidden_size[-1], n_actions)
else:
self.adv = nn.Linear(hidden_size[-1], n_actions)
self.v = nn.Linear(hidden_size[-1], 1)
self.q = nn.Linear(hidden_size[-1], n_actions)
self.to(self.device)
def forward(self, state, params):
@@ -114,12 +102,7 @@ class MultiPassQNet(nn.Module):
out = torch.cat((s, x), dim=-1)
for hidden_layer in self.layers:
out = F.leaky_relu(hidden_layer(out), 0.01)
if not dueling:
q_val = self.q(out)
else:
v = self.v(out)
adv = self.adv(out)
q_val = v + (adv - adv.mean(dim=1, keepdim=True))
q_val = self.q(out)
q_val = q_val.view(n, -1)
slides = [i * (self.n_actions + 1) for i in range(self.n_actions)]
index = torch.tensor(slides).long().expand(n, -1).to(self.device)
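A toy, standalone illustration of the index arithmetic above: after the multi-pass batch is reshaped to (n, n_actions * n_actions), each sample's row is laid out as [pass 0: Q0 Q1 Q2, pass 1: Q0 Q1 Q2, ...], so the offsets i * (n_actions + 1) pick Q_i from the pass that used only action i's parameters. The final gather is an assumption here, since the hunk is cut off right after the index tensor is built.

import torch

n, n_actions = 2, 3
q_val = torch.arange(n * n_actions * n_actions, dtype=torch.float32).view(n, -1)
slides = [i * (n_actions + 1) for i in range(n_actions)]   # [0, 4, 8]
index = torch.tensor(slides).long().expand(n, -1)
print(q_val.gather(1, index))   # per row, elements 0, 4, 8 -- the 'diagonal' Q-values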
@@ -166,6 +149,7 @@ class PDQN(Agent):
self.frame_stack = args.frame_stack
self.hidden_size = args.hidden_size
self.mp = args.mp
self.squash = args.squash
else:
if args.load_dir is not None:
load_dir = args.load_dir
@@ -173,20 +157,18 @@ class PDQN(Agent):
self.frame_stack = info['frame_stack']
self.hidden_size = info['hidden_size']
self.mp = info['mp']
self.squash = info['squash']
self.input_size = self.state_size * self.frame_stack
self.params = ParamsNet(self.input_size, self.params_size, self.hidden_size).to(self.device)
self.target_params = ParamsNet(self.input_size, self.params_size, self.hidden_size).to(self.device)
self.p_net = ParamsNet(self.input_size, self.params_size, self.squash, self.hidden_size).to(self.device)
self.target_p_net = ParamsNet(self.input_size, self.params_size, self.squash, self.hidden_size).to(self.device)
if not self.mp:
self.q_network = QNet(self.input_size, self.n_discrete, self.params_size, self.hidden_size).to(self.device)
self.target_q_network = QNet(self.input_size, self.n_discrete, self.params_size, self.hidden_size).to(
self.device)
self.target_q_network = QNet(self.input_size, self.n_discrete, self.params_size, self.hidden_size).to(self.device)
else:
self.q_network = MultiPassQNet(self.input_size, self.n_discrete, self.each_param_size, self.device,
self.hidden_size)
self.target_q_network = MultiPassQNet(self.input_size, self.n_discrete, self.each_param_size, self.device,
self.hidden_size)
self.q_network = MultiPassQNet(self.input_size, self.n_discrete, self.each_param_size, self.device, self.hidden_size)
self.target_q_network = MultiPassQNet(self.input_size, self.n_discrete, self.each_param_size, self.device, self.hidden_size)
if not load:
self.lr_q = args.lr_q
@@ -202,7 +184,7 @@ class PDQN(Agent):
self.coeff_out_of_range = args.coeff_out_of_range
self.replay = deque(maxlen=self.replay_size)
self.optim1 = torch.optim.Adam(lr=self.lr_q, params=self.q_network.parameters())
self.optim2 = torch.optim.Adam(lr=self.lr_p, params=self.params.parameters())
self.optim2 = torch.optim.Adam(lr=self.lr_p, params=self.p_net.parameters())
self.replace_freq = args.replace_freq
self.train_freq = args.train_freq
self.step = 0
@@ -214,7 +196,7 @@ class PDQN(Agent):
# def get_ave_max_q(self, state):
# with torch.no_grad():
# state = torch.tensor(state, dtype=torch.float32).to(self.device)
# params = self.params(state)
# params = self.p_net(state)
# out = self.q_network(state, params).squeeze()
# max_q = out.max(dim=-1)[0].mean().cpu().numpy()
# return max_q
@@ -260,7 +242,7 @@ class PDQN(Agent):
else:
with torch.no_grad():
state = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(self.device)
params = self.params(state)
params = self.p_net(state)
out = self.q_network(state, params).squeeze()
action = out.argmax().item()
params = params.cpu().numpy()[0]
@@ -273,7 +255,7 @@ class PDQN(Agent):
if len(self.replay) > self.memory_start:
if self.step % self.replace_freq == 0:
self.target_q_network.load_state_dict(self.q_network.state_dict())
self.target_params.load_state_dict(self.params.state_dict())
self.target_p_net.load_state_dict(self.p_net.state_dict())
if self.step % self.train_freq == 0:
minibatch = self.sample(self.mini_batch_size)
state_batch = torch.from_numpy(np.array(minibatch.state)).float().to(self.device)
@@ -286,7 +268,7 @@ class PDQN(Agent):
Q = self.q_network(state_batch, params_batch)
Q = torch.gather(Q, dim=1, index=a_batch).squeeze()
with torch.no_grad():
params = self.target_params(next_state_batch)
params = self.target_p_net(next_state_batch)
target_q = self.target_q_network(next_state_batch, params).max(1)[0]
target_q = n_step_reward_batch + target_q * (self.gamma ** self.n_step) * (1.0 - d_batch)
target_q = self.coeff_mc * mc_batch + (1 - self.coeff_mc) * target_q
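Written out, the target used above blends the n-step bootstrap with the Monte Carlo return:

y = c_{\mathrm{mc}}\, G^{\mathrm{MC}} + (1 - c_{\mathrm{mc}})\Big[ r^{(n)} + \gamma^{n}(1 - d)\,\max_{a'} Q_{\text{target}}\big(s_{t+n},\, x_{\text{target}}(s_{t+n}),\, a'\big) \Big]

where c_mc is coeff_mc, G^MC the mc_target, r^(n) the n_step_reward, and x_target the target parameter network (target_p_net).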
@@ -297,7 +279,7 @@ class PDQN(Agent):
param.grad.data.clamp_(-1, 1)
self.optim1.step()
params = self.params(state_batch)
params = self.p_net(state_batch)
q_val = self.q_network(state_batch, params)
p_loss = -q_val.mean()
p = params.abs() - 1
@@ -313,11 +295,11 @@ class PDQN(Agent):
save_dir = './models/PDQN'
if not os.path.exists(save_dir):
os.makedirs(save_dir)
info = {'hidden_size': self.hidden_size, 'frame_stack': self.frame_stack, 'mp': self.mp}
info = {'hidden_size': self.hidden_size, 'frame_stack': self.frame_stack, 'mp': self.mp, 'squash': self.squash}
torch.save(info, os.path.join(save_dir, 'info.pkl'))
torch.save(self.q_network.state_dict(), os.path.join(save_dir, 'q_net.pkl'))
torch.save(self.params.state_dict(), os.path.join(save_dir, 'p_net.pkl'))
torch.save(self.p_net.state_dict(), os.path.join(save_dir, 'p_net.pkl'))
def load_model(self, load_dir):
self.q_network.load_state_dict(torch.load(os.path.join(load_dir, 'q_net.pkl')))
self.params.load_state_dict(torch.load(os.path.join(load_dir, 'p_net.pkl')))
self.p_net.load_state_dict(torch.load(os.path.join(load_dir, 'p_net.pkl')))
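A hedged sketch of the save/load round trip implied above. The constructor signature is an assumption inferred from the load flag and args.load_dir handling shown earlier in this diff; the directory name matches the README at the top.

agent = PDQN(env, args, load=True)   # assumed signature; __init__ reads info.pkl for hidden_size / frame_stack / mp / squash
agent.load_model('models/_PDQN')     # restores q_net.pkl into q_network and p_net.pkl into p_net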
import logging
from envs.soccer_empty_goal import SoccerEmptyGoalEnv
from envs.soccer_score_goal import SoccerScoreGoalEnv
import math
import numpy as np
from gym import spaces
import hfo_py
logger = logging.getLogger(__name__)
@@ -16,14 +18,121 @@ class SoccerAgainstKeeperEnv(SoccerScoreGoalEnv):
"""
def __init__(self, ball_x_min=0.0, ball_x_max=0.2):
self.ball_x_min = ball_x_min
self.ball_x_max = ball_x_max
super(SoccerAgainstKeeperEnv, self).__init__()
def __init__(self, sync=True, offense_on_ball=True):
self.offense_on_ball = int(offense_on_ball)
super(SoccerAgainstKeeperEnv, self).__init__(sync=sync)
low0 = np.array([-1, -1, 0], dtype=np.float32)
high0 = np.array([1, 1, 3], dtype=np.float32)
low1 = np.array([-1, -1], dtype=np.float32)
high1 = np.array([1, 1], dtype=np.float32)
low2 = np.array([-1, -1], dtype=np.float32)
high2 = np.array([1, 1], dtype=np.float32)
low3 = -1.0
high3 = 1.0
self.action_space = spaces.Tuple((spaces.Discrete(4),
spaces.Box(low=low0, high=high0, dtype=np.float32),
spaces.Box(low=low1, high=high1, dtype=np.float32),
spaces.Box(low=low2, high=high2, dtype=np.float32),
spaces.Box(low=low3, high=high3, dtype=np.float32, shape=(0,)),
))
def get_avail_actions(self):
avail_actions = np.ones([4])
state = self.env.getState()
if int(state[12]) != 1:
avail_actions[[0, 3]] = 0 # KICK_TO, SHOOT unavailable
return avail_actions
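An assumed usage pattern for this mask at action-selection time; the hunks in this diff do not show where get_avail_actions is consumed, and env / q_values below are hypothetical stand-ins.

import numpy as np

avail = env.get_avail_actions()   # e.g. [0., 1., 1., 0.] while the ball is not kickable
q = q_values.copy()               # hypothetical per-action Q estimates, shape (4,)
q[avail == 0] = -np.inf           # forbid KICK_TO and SHOOT when unavailable
action = int(q.argmax())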
def _take_action(self, action):
""" Converts the action space into an HFO action. """
action_type = ACTION_LOOKUP[action[0]]
if action_type == hfo_py.KICK_TO:
self.env.act(action_type, action[1], action[2], action[3])
elif action_type == hfo_py.MOVE_TO:
self.env.act(action_type, action[4], action[5])
elif action_type == hfo_py.DRIBBLE_TO:
self.env.act(action_type, action[6], action[7])
elif action_type == hfo_py.SHOOT:
self.env.act(action_type)
else:
print('Unrecognized action %d' % action_type)
self.env.act(hfo_py.NOOP)
def _configure_environment(self):
super(SoccerAgainstKeeperEnv, self)._start_hfo_server(defense_npcs=1,
offense_on_ball=1,
ball_x_min=self.ball_x_min,
ball_x_max=self.ball_x_max)
offense_on_ball=self.offense_on_ball)
def _get_reward(self):
"""
Agent is rewarded for minimizing the distance between itself and
the ball, minimizing the distance between the ball and the goal,
and scoring a goal.
"""
current_state = self.env.getState()
# print("State =",current_state)
# print("len State =",len(current_state))
ball_proximity = current_state[53]
goal_proximity = current_state[15]
ball_dist = 1.0 - ball_proximity
goal_dist = 1.0 - goal_proximity
kickable = current_state[12]
ball_ang_sin_rad = current_state[51]
ball_ang_cos_rad = current_state[52]
ball_ang_rad = math.acos(ball_ang_cos_rad)
if ball_ang_sin_rad < 0:
ball_ang_rad *= -1.
goal_ang_sin_rad = current_state[13]
goal_ang_cos_rad = current_state[14]
goal_ang_rad = math.acos(goal_ang_cos_rad)
if goal_ang_sin_rad < 0:
goal_ang_rad *= -1.
alpha = max(ball_ang_rad, goal_ang_rad) - min(ball_ang_rad, goal_ang_rad)
ball_dist_goal = math.sqrt(ball_dist * ball_dist + goal_dist * goal_dist -
2. * ball_dist * goal_dist * math.cos(alpha))
# Compute the difference in ball proximity from the last step
if not self.first_step:
ball_prox_delta = ball_proximity - self.old_ball_prox
kickable_delta = kickable - self.old_kickable
ball_dist_goal_delta = ball_dist_goal - self.old_ball_dist_goal
self.old_ball_prox = ball_proximity
self.old_kickable = kickable
self.old_ball_dist_goal = ball_dist_goal
# print(self.env.playerOnBall())
# print(self.env.playerOnBall().unum)
# print(self.env.getUnum())
reward = 0
if not self.first_step:
'''# Reward the agent for moving towards the ball
reward += ball_prox_delta
if kickable_delta > 0 and not self.got_kickable_reward:
reward += 1.
self.got_kickable_reward = True
# Reward the agent for kicking towards the goal
reward += 0.6 * -ball_dist_goal_delta
# Reward the agent for scoring
if self.status == hfo_py.GOAL:
reward += 5.0'''
'''reward = self.__move_to_ball_reward(kickable_delta, ball_prox_delta) + \
3. * self.__kick_to_goal_reward(ball_dist_goal_delta) + \
self.__EOT_reward();'''
mtb = self._move_to_ball_reward(kickable_delta, ball_prox_delta)
ktg = 3. * self._kick_to_goal_reward(ball_dist_goal_delta)
eot = self._EOT_reward()
reward = ktg + eot
if not self.offense_on_ball:
reward += mtb
# print("mtb: %.06f ktg: %.06f eot: %.06f"%(mtb,ktg,eot))
self.first_step = False
# print("r =",reward)
return reward
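The ball_dist_goal term computed above is the law of cosines: with the agent at the vertex, ball_dist and goal_dist are two sides and alpha the angle between them, so the result is the ball-to-goal distance. A quick sanity check with toy values:

import math

ball_dist, goal_dist, alpha = 0.6, 0.8, math.pi / 2
d = math.sqrt(ball_dist ** 2 + goal_dist ** 2 - 2. * ball_dist * goal_dist * math.cos(alpha))
print(d)   # ~1.0: a right angle gives a 3-4-5 triangle scaled by 0.2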
ACTION_LOOKUP = {
0 : hfo_py.KICK_TO,
1 : hfo_py.MOVE_TO,
2 : hfo_py.DRIBBLE_TO,
3 : hfo_py.SHOOT,
}
@@ -18,8 +18,8 @@ class SoccerEmptyGoalEnv(SoccerEnv):
the ball, kicks the ball towards the goal, and scores a goal.
"""
def __init__(self):
super(SoccerEmptyGoalEnv, self).__init__()
def __init__(self, sync=True):
super(SoccerEmptyGoalEnv, self).__init__(sync=sync)
self.old_ball_prox = 0
self.old_kickable = 0
self.old_ball_dist_goal = 0
......
@@ -34,7 +34,8 @@ def find_free_port():
class SoccerEnv(gym.Env, utils.EzPickle):
metadata = {'render.modes': ['human']}
def __init__(self):
def __init__(self, sync=True):
self.sync = sync
self.viewer = None
self.server_process = None
self.server_port = None
@@ -53,8 +54,6 @@ class SoccerEnv(gym.Env, utils.EzPickle):
high1 = np.array([180], dtype=np.float32)
low2 = np.array([0, -180], dtype=np.float32)
high2 = np.array([100, 180], dtype=np.float32)
low3 = np.array([-180], dtype=np.float32)
high3 = np.array([180], dtype=np.float32)
self.action_space = spaces.Tuple((spaces.Discrete(3),
spaces.Box(low=low0, high=high0, dtype=np.float32),
spaces.Box(low=low1, high=high1, dtype=np.float32),
@@ -62,6 +61,7 @@ class SoccerEnv(gym.Env, utils.EzPickle):
self.status = hfo_py.IN_GAME
self._seed = -1
self.first_episode = True
def __del__(self):
os.kill(self.server_process.pid, signal.SIGINT)
@@ -79,11 +79,10 @@ class SoccerEnv(gym.Env, utils.EzPickle):
self._start_hfo_server()
def _start_hfo_server(self, frames_per_trial=500,
#untouched_time=1000,
untouched_time=100,
offense_agents=1,
defense_agents=0, offense_npcs=0,
defense_npcs=0, sync_mode=True, port=None,
defense_npcs=0, port=None,
offense_on_ball=0, fullstate=True, seed=-1,
ball_x_min=0.0, ball_x_max=0.2,
verbose=False, log_game=False,
@@ -109,7 +108,6 @@ class SoccerEnv(gym.Env, utils.EzPickle):
if port is None:
port = find_free_port()
self.server_port = port
self.offense_on_ball = offense_on_ball
cmd = "/home/sjk1997/2d/HFO/bin/HFO" + \
" --headless --frames-per-trial %i --offense-agents %i"\
" --defense-agents %i --offense-npcs %i --defense-npcs %i"\
@@ -120,7 +118,7 @@ class SoccerEnv(gym.Env, utils.EzPickle):
defense_agents, offense_npcs, defense_npcs, port,
offense_on_ball, seed, ball_x_min, ball_x_max,
log_dir)
if not sync_mode: cmd += " --no-sync"
if not self.sync: cmd += " --no-sync"
if fullstate: cmd += " --fullstate"
if verbose: cmd += " --verbose"
if not log_game: cmd += " --no-logging"
@@ -168,6 +166,9 @@ class SoccerEnv(gym.Env, utils.EzPickle):
def reset(self):
""" Repeats NO-OP action until a new episode begins. """
if self.first_episode:
self.first_episode = False
return self.env.getState()
while self.status == hfo_py.IN_GAME:
self.env.act(hfo_py.NOOP)
self.status = self.env.step()
......
import logging
import math
import numpy as np
from gym import spaces
from envs.soccer_env import SoccerEnv, ACTION_LOOKUP
from envs.soccer_empty_goal import SoccerEmptyGoalEnv
@@ -26,26 +23,11 @@ class SoccerScoreGoalEnv(SoccerEmptyGoalEnv):
Action Spaces".
"""
def __init__(self):
super(SoccerScoreGoalEnv, self).__init__()
def __init__(self, sync=True):
super(SoccerScoreGoalEnv, self).__init__(sync=sync)
# dash, turn, kick, tackle
low0 = np.array([0, -180], dtype=np.float32) # meant to be 0, not -100! (according to original soccer env and dqn-hfo inverting gradients)
high0 = np.array([100, 180], dtype=np.float32)
low1 = np.array([-180], dtype=np.float32)
high1 = np.array([180], dtype=np.float32)
low2 = np.array([0, -180], dtype=np.float32)
high2 = np.array([100, 180], dtype=np.float32)
# low2 = -1.0
# high2 = 1.0
self.action_space = spaces.Tuple((spaces.Discrete(3),
spaces.Box(low=low0, high=high0, dtype=np.float32),
spaces.Box(low=low1, high=high1, dtype=np.float32),
spaces.Box(low=low2, high=high2, dtype=np.float32),
# spaces.Box(low=low2, high=high2, dtype=np.float32, shape=(0, )),
))
self.unum = self.env.getUnum() # uniform number (identifier) of our lone agent
print("UNUM =",self.unum)
print("UNUM =", self.unum)
def _get_reward(self):
"""
@@ -100,37 +82,33 @@ class SoccerScoreGoalEnv(SoccerEmptyGoalEnv):
'''reward = self.__move_to_ball_reward(kickable_delta, ball_prox_delta) + \
3. * self.__kick_to_goal_reward(ball_dist_goal_delta) + \
self.__EOT_reward();'''
mtb = self.__move_to_ball_reward(kickable_delta, ball_prox_delta)
ktg = 3. * self.__kick_to_goal_reward(ball_dist_goal_delta)
eot = self.__EOT_reward()
mtb = self._move_to_ball_reward(kickable_delta, ball_prox_delta)
ktg = 3. * self._kick_to_goal_reward(ball_dist_goal_delta)
eot = self._EOT_reward()
reward = mtb + ktg + eot
# reward = ktg + eot
#print("mtb: %.06f ktg: %.06f eot: %.06f"%(mtb,ktg,eot))
self.first_step = False
#print("r =",reward)
return reward
def __move_to_ball_reward(self, kickable_delta, ball_prox_delta):
def _move_to_ball_reward(self, kickable_delta, ball_prox_delta):
reward = 0.
if self.env.playerOnBall().unum < 0 or self.env.playerOnBall().unum == self.unum:
reward += ball_prox_delta
if kickable_delta >= 1 and not self.got_kickable_reward:
if self.offense_on_ball:
reward += 0.01
else:
reward += 1.
reward += 1.
self.got_kickable_reward = True
return reward
def __kick_to_goal_reward(self, ball_dist_goal_delta):
def _kick_to_goal_reward(self, ball_dist_goal_delta):
if(self.env.playerOnBall().unum == self.unum):
return -ball_dist_goal_delta
elif self.got_kickable_reward == True:
return 0.2 * -ball_dist_goal_delta
return 0.
def __EOT_reward(self):
def _EOT_reward(self):
if self.status == hfo_py.GOAL:
return 5.
#elif self.status == hfo_py.CAPTURED_BY_DEFENSE:
......
@@ -11,10 +11,12 @@ if __name__ == '__main__':
parser.add_argument('--num_episode', type=int, default=1000)
parser.add_argument('--load_dir', type=str, default=None)
parser.add_argument('--render', default=False, action='store_true')
parser.add_argument('--no_sync', default=False, action='store_true')
parser.add_argument('--algo', type=str, default='hppo', choices=['hppo', 'pdqn'])
args = parser.parse_args()
env = SoccerScoreGoalEnv()
sync = not (args.render and args.no_sync)
env = SoccerScoreGoalEnv(sync)
if args.render:
env.render()
if args.algo == 'hppo':
......