Commit 854f2af7 authored by Jiakai Song

first commit

<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6" project-jdk-type="Python SDK" />
</project>
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/soccer_socre_goal.iml" filepath="$PROJECT_DIR$/.idea/soccer_socre_goal.iml" />
</modules>
</component>
</project>
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="Python 3.6" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ChangeListManager">
<list default="true" id="d1f316c6-e5e5-4842-8bc7-098d69dc5fc1" name="Default Changelist" comment="" />
<option name="SHOW_DIALOG" value="false" />
<option name="HIGHLIGHT_CONFLICTS" value="true" />
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
<option name="LAST_RESOLUTION" value="IGNORE" />
</component>
<component name="FileTemplateManagerImpl">
<option name="RECENT_TEMPLATES">
<list>
<option value="Python Script" />
</list>
</option>
</component>
<component name="ProjectId" id="1xNlMpsZNvdtgrhPySoP54Ui6ck" />
<component name="ProjectViewState">
<option name="hideEmptyMiddlePackages" value="true" />
<option name="showExcludedFiles" value="true" />
<option name="showLibraryContents" value="true" />
</component>
<component name="PropertiesComponent">
<property name="RunOnceActivity.ShowReadmeOnStart" value="true" />
<property name="last_opened_file_path" value="$PROJECT_DIR$/../pdqn_RND" />
<property name="settings.editor.selected.configurable" value="com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable" />
</component>
<component name="RunManager" selected="Python.load">
<configuration name="hppo" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<module name="soccer_socre_goal" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$/algorithms" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/algorithms/hppo.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="load" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<module name="soccer_socre_goal" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/load.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="run_hppo" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<module name="soccer_socre_goal" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/run_hppo.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="run_pdqn" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<module name="soccer_socre_goal" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/run_pdqn.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<configuration name="test" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
<module name="soccer_socre_goal" />
<option name="INTERPRETER_OPTIONS" value="" />
<option name="PARENT_ENVS" value="true" />
<envs>
<env name="PYTHONUNBUFFERED" value="1" />
</envs>
<option name="SDK_HOME" value="" />
<option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
<option name="IS_MODULE_SDK" value="true" />
<option name="ADD_CONTENT_ROOTS" value="true" />
<option name="ADD_SOURCE_ROOTS" value="true" />
<option name="SCRIPT_NAME" value="$PROJECT_DIR$/test.py" />
<option name="PARAMETERS" value="" />
<option name="SHOW_COMMAND_LINE" value="false" />
<option name="EMULATE_TERMINAL" value="false" />
<option name="MODULE_MODE" value="false" />
<option name="REDIRECT_INPUT" value="false" />
<option name="INPUT_FILE" value="" />
<method v="2" />
</configuration>
<recent_temporary>
<list>
<item itemvalue="Python.load" />
<item itemvalue="Python.run_hppo" />
<item itemvalue="Python.run_pdqn" />
<item itemvalue="Python.test" />
<item itemvalue="Python.hppo" />
</list>
</recent_temporary>
</component>
<component name="SvnConfiguration">
<configuration />
</component>
<component name="TaskManager">
<task active="true" id="Default" summary="Default task">
<changelist id="d1f316c6-e5e5-4842-8bc7-098d69dc5fc1" name="Default Changelist" comment="" />
<created>1630203381241</created>
<option name="number" value="Default" />
<option name="presentableId" value="Default" />
<updated>1630203381241</updated>
</task>
<servers />
</component>
<component name="WindowStateProjectService">
<state x="483" y="298" width="424" height="482" key="FileChooserDialogImpl" timestamp="1630205035022">
<screen x="65" y="24" width="1855" height="1056" />
</state>
<state x="483" y="298" width="424" height="482" key="FileChooserDialogImpl/65.24.1855.1056@65.24.1855.1056" timestamp="1630205035022" />
<state width="1832" height="296" key="GridCell.Tab.0.bottom" timestamp="1630242701795">
<screen x="65" y="24" width="1855" height="1056" />
</state>
<state width="1832" height="296" key="GridCell.Tab.0.bottom/65.24.1855.1056@65.24.1855.1056" timestamp="1630242701795" />
<state width="1832" height="296" key="GridCell.Tab.0.center" timestamp="1630242701795">
<screen x="65" y="24" width="1855" height="1056" />
</state>
<state width="1832" height="296" key="GridCell.Tab.0.center/65.24.1855.1056@65.24.1855.1056" timestamp="1630242701795" />
<state width="1832" height="296" key="GridCell.Tab.0.left" timestamp="1630242701795">
<screen x="65" y="24" width="1855" height="1056" />
</state>
<state width="1832" height="296" key="GridCell.Tab.0.left/65.24.1855.1056@65.24.1855.1056" timestamp="1630242701795" />
<state width="1832" height="296" key="GridCell.Tab.0.right" timestamp="1630242701795">
<screen x="65" y="24" width="1855" height="1056" />
</state>
<state width="1832" height="296" key="GridCell.Tab.0.right/65.24.1855.1056@65.24.1855.1056" timestamp="1630242701795" />
<state x="486" y="174" key="SettingsEditor" timestamp="1630203426868">
<screen x="65" y="24" width="1855" height="1056" />
</state>
<state x="486" y="174" key="SettingsEditor/65.24.1855.1056@65.24.1855.1056" timestamp="1630203426868" />
</component>
</project>
import numpy as np
class Agent:
def __init__(self, env):
self.env = env
self.action_space = env.action_space
self.observation_space = env.observation_space
self.state_size = env.observation_space.shape[0]
self.n_discrete = env.action_space.spaces[0].n
self.each_param_size = [space.shape[0] for space in env.action_space.spaces[1:]]
self.loc = [sum(self.each_param_size[:i]) for i in range(self.n_discrete + 1)]
self.params_size = sum(self.each_param_size)
self.space_low = [space.low.tolist() for space in self.action_space.spaces[1:]]
self.space_high = [space.high.tolist() for space in self.action_space.spaces[1:]]
self.low, self.high = [], []
for l, h in zip(self.space_low, self.space_high):
self.low.extend(l)
self.high.extend(h)
def norm_to(self, action, params):
p = np.zeros_like(params)
for i in range(p.shape[0]):
p[i] = (params[i] + 1.) * (self.high[i] - self.low[i]) / 2 + self.low[i]
hybrid_action = np.concatenate(([action], p))
return hybrid_action
def choose_action(self, state):
        raise NotImplementedError
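# norm_to rescales each continuous parameter from the policy's [-1, 1] output
# range into the environment's own bounds via p = (x + 1) * (high - low) / 2 + low.
# For example, a hypothetical parameter with low=0 and high=100 maps a network
# output of 0.5 to (0.5 + 1) * 100 / 2 = 75.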
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
from .common import Agent
from torch.distributions import normal, Categorical
def layer_init(layer, std=1.0, bias_const=0.0):
torch.nn.init.orthogonal_(layer.weight, std)
torch.nn.init.constant_(layer.bias, bias_const)
hidden_activate = F.relu
class Actor(nn.Module):
def __init__(self, input_size, n_discrete, params_size, hidden_size=None):
        super(Actor, self).__init__()
if hidden_size is None:
hidden_size = [256, 256, 256]
self.discrete_layers = nn.ModuleList([nn.Linear(input_size, hidden_size[0])])
for x, y in zip(hidden_size[:-1], hidden_size[1:]):
self.discrete_layers.append(nn.Linear(x, y))
self.continuous_layers = nn.ModuleList([nn.Linear(input_size, hidden_size[0])])
for x, y in zip(hidden_size[:-1], hidden_size[1:]):
self.continuous_layers.append(nn.Linear(x, y))
self.discrete_action = nn.Linear(hidden_size[-1], n_discrete)
self.mu = nn.Linear(hidden_size[-1], params_size)
self.log_std = nn.Parameter(-1.0 * torch.ones([1, params_size]), requires_grad=True)
for layer in self.discrete_layers:
layer_init(layer, std=1.0)
for layer in self.continuous_layers:
layer_init(layer, std=1.0)
layer_init(self.discrete_action, std=1.0)
layer_init(self.mu, std=1.0)
def forward(self, state):
discrete = state
for hidden_layer in self.discrete_layers:
discrete = hidden_activate(hidden_layer(discrete))
discrete_action = self.discrete_action(discrete)
prob = torch.softmax(discrete_action, dim=-1)
categorical = Categorical(prob)
continuous = state
for hidden_layer in self.continuous_layers:
continuous = hidden_activate(hidden_layer(continuous))
mu = torch.tanh(self.mu(continuous))
# mu = self.mu(continuous)
std = torch.exp(self.log_std)
dist = normal.Normal(mu, std)
return categorical, dist
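# The actor keeps two separate MLP trunks: the discrete head yields a Categorical
# distribution over the n_discrete action types, while the continuous head yields
# a diagonal Normal over all action parameters with a tanh-squashed mean and a
# state-independent, learnable log_std.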
class Critic(nn.Module):
def __init__(self, input_size, hidden_size=None):
super(Critic, self).__init__()
if hidden_size is None:
hidden_size = [256, 256, 256]
self.layers = nn.ModuleList([nn.Linear(input_size, hidden_size[0])])
for x, y in zip(hidden_size[:-1], hidden_size[1:]):
self.layers.append(nn.Linear(x, y))
self.v = nn.Linear(hidden_size[-1], 1)
for layer in self.layers:
layer_init(layer, std=1.0)
layer_init(self.v, std=1.0)
def forward(self, state):
out = state
for hidden_layer in self.layers:
out = hidden_activate(hidden_layer(out))
v = self.v(out)
return v.squeeze(-1)
class HPPO(Agent):
def __init__(self, env, args, adv_norm=True, grad_clip=10, load=False):
super(HPPO, self).__init__(env)
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# self.device = torch.device('cpu')
if not load:
self.frame_stack = args.frame_stack
self.hidden_size = args.hidden_size
else:
info = torch.load('./models/HPPO/info.pkl')
self.frame_stack = info['frame_stack']
self.hidden_size = info['hidden_size']
self.input_size = self.state_size * self.frame_stack
self.actor = Actor(self.input_size, self.n_discrete, self.params_size, self.hidden_size).to(self.device)
self.critic = Critic(self.input_size, self.hidden_size).to(self.device)
if not load:
self.lr_a = args.lr_a
self.lr_c = args.lr_c
self.gamma, self.gae_lam = args.gamma, args.gae_lam
self.mini_batch = args.mini_batch
self.epsilon = args.epsilon
self.epochs = args.epochs
self.optim1 = torch.optim.Adam(lr=args.lr_a, params=self.actor.parameters())
self.optim2 = torch.optim.Adam(lr=args.lr_c, params=self.critic.parameters())
self.adv_norm = adv_norm
self.coef_entropy = args.coef_entropy
self.grad_clip = grad_clip
else:
self.load_model()
def choose_action(self, state, explore=True):
with torch.no_grad():
state = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(self.device)
categorical, dist = self.actor(state)
if explore:
action = categorical.sample()
params = dist.sample().clamp_(-1, 1)
else:
action = categorical.probs.argmax()
params = dist.mean
log_prob1 = categorical.log_prob(action).item()
log_prob2 = dist.log_prob(params).sum(-1).item()
action = action.item()
params = params.cpu().squeeze(0).numpy()
return self.norm_to(action, params), action, params, log_prob1, log_prob2
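    # choose_action returns the env-scaled hybrid action (via norm_to) together
    # with the raw discrete index, the unscaled parameters, and both log-probs;
    # the log-probs are stored with the rollout and reused as the "old" terms of
    # the PPO ratios in update_step.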
def get_std(self):
return self.actor.log_std.detach().exp().mean().item()
def update_step(self, s_batch, idx_batch, p_batch, old_log_p1, old_log_p2, returns_batch, adv_batch):
v = self.critic(s_batch)
categorical, dist = self.actor(s_batch)
log_p1 = categorical.log_prob(idx_batch)
log_p2 = dist.log_prob(p_batch).sum(-1)
entropy1 = categorical.entropy().mean()
entropy2 = dist.entropy().mean()
# entropy2 = dist.entropy().sum(-1).mean()
ratio1 = torch.exp(log_p1 - old_log_p1)
ratio2 = torch.exp(log_p2 - old_log_p2)
discrete_loss = -torch.mean(torch.min(
torch.clamp(ratio1, 1 - self.epsilon, 1 + self.epsilon) * adv_batch,
ratio1 * adv_batch
))
continuous_loss = -torch.mean(torch.min(
torch.clamp(ratio2, 1 - self.epsilon, 1 + self.epsilon) * adv_batch,
ratio2 * adv_batch
))
action_loss = discrete_loss + continuous_loss
entropy_loss = - (self.coef_entropy[0] * entropy1 + self.coef_entropy[1] * entropy2)
self.optim1.zero_grad()
(action_loss + entropy_loss).backward()
if self.grad_clip is not None:
torch.nn.utils.clip_grad_norm_(self.actor.parameters(), self.grad_clip)
self.optim1.step()
value_loss = F.mse_loss(v, returns_batch)
self.optim2.zero_grad()
value_loss.backward()
if self.grad_clip is not None:
torch.nn.utils.clip_grad_norm_(self.critic.parameters(), self.grad_clip)
self.optim2.step()
return entropy1.item(), entropy2.data.item()
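    # update_step applies the clipped PPO surrogate -E[min(r*A, clip(r, 1-eps, 1+eps)*A)]
    # separately to the discrete and continuous heads, adds an entropy bonus
    # weighted by coef_entropy, and fits the critic with an MSE loss against the
    # discounted returns.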
def update_network(self, buffer_s, buffer_a, buffer_p, log_p1, log_p2, buffer_r, buffer_mask, n):
s = torch.tensor(buffer_s, dtype=torch.float32).to(self.device)
p = torch.tensor(buffer_p, dtype=torch.float32).to(self.device)
r = torch.tensor(buffer_r, dtype=torch.float32).to(self.device)
log_prob1 = torch.tensor(log_p1, dtype=torch.float32).to(self.device)
log_prob2 = torch.tensor(log_p2, dtype=torch.float32).to(self.device)
v_s = self.critic(s).detach().squeeze(dim=0)
action_idx = torch.tensor(buffer_a, dtype=torch.int64).to(self.device)
mask = torch.tensor(buffer_mask, dtype=torch.float32).to(self.device)
adv = torch.zeros([n], dtype=torch.float32).to(self.device)
        deltas = torch.zeros([n], dtype=torch.float32).to(self.device)
returns = torch.zeros([n], dtype=torch.float32).to(self.device)
pre_return = 0
pre_adv = 0
pre_v = 0
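        # Backward GAE pass: delta_t = r_t + gamma * V(s_{t+1}) * mask_t - V(s_t),
        # adv_t = delta_t + gamma * lambda * adv_{t+1} * mask_t; mask_t zeroes the
        # bootstrap at episode boundaries, and returns accumulate the discounted
        # reward-to-go used as the critic target.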
for i in reversed(range(n)):
returns[i] = r[i] + self.gamma * pre_return * mask[i]
            deltas[i] = r[i] + self.gamma * pre_v * mask[i] - v_s[i]
            adv[i] = deltas[i] + self.gamma * self.gae_lam * pre_adv * mask[i]
pre_v = v_s[i]
pre_adv = adv[i]
pre_return = returns[i]
if self.adv_norm:
adv = (adv - adv.mean()) / (adv.std() + 1e-8)
adv.clamp_(-10.0, 10.0)
shuffle = np.random.permutation(n)
mini_batch_size = n // self.mini_batch
entropy1_record = []
entropy2_record = []
for _ in range(self.epochs):
for i in range(self.mini_batch):
if i == self.mini_batch - 1:
minibatch = shuffle[i * mini_batch_size:n]
else:
minibatch = shuffle[i * mini_batch_size:(i+1)*mini_batch_size]
s_batch = s[minibatch]
returns_batch = returns[minibatch]
adv_batch = adv[minibatch]
idx_batch = action_idx[minibatch]
p_batch = p[minibatch]
log_p1_batch = log_prob1[minibatch]
log_p2_batch = log_prob2[minibatch]
e1, e2 = self.update_step(s_batch, idx_batch, p_batch, log_p1_batch, log_p2_batch, returns_batch, adv_batch)
entropy1_record.append(e1)
entropy2_record.append(e2)
return np.mean(entropy1_record), np.mean(entropy2_record)
# for _ in range(self.epochs * n // self.batch_size):
# minibatch = np.random.choice(n, self.batch_size, replace=False)
# s_batch = s[minibatch]
# returns_batch = returns[minibatch]
# adv_batch = adv[minibatch]
# idx_batch = action_idx[minibatch]
# p_batch = p[minibatch]
# log_p1_batch = log_prob1[minibatch]
# log_p2_batch = log_prob2[minibatch]
# self.update_step(s_batch, idx_batch, p_batch, log_p1_batch, log_p2_batch, returns_batch, adv_batch)
def save_model(self):
save_dir = './models/HPPO/'
if not os.path.exists(save_dir):
os.makedirs(save_dir)
info = {'hidden_size': self.hidden_size, 'frame_stack': self.frame_stack}
torch.save(info, os.path.join(save_dir, 'info.pkl'))
torch.save(self.actor.state_dict(), os.path.join(save_dir, 'actor.pkl'))
torch.save(self.critic.state_dict(), os.path.join(save_dir, 'critic.pkl'))
def load_model(self):
load_dir = './models/HPPO/'
self.actor.load_state_dict(torch.load(os.path.join(load_dir, 'actor.pkl')))
self.critic.load_state_dict(torch.load(os.path.join(load_dir, 'critic.pkl')))
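# A minimal usage sketch (hypothetical, assuming a gym-style env exposing a
# (Discrete, Box, ...) action space and an `args` namespace with the
# hyper-parameters read in HPPO.__init__):
#
#   agent = HPPO(env, args)
#   hybrid_action, a, p, lp1, lp2 = agent.choose_action(stacked_state)
#   next_state, reward, done, _ = env.step(hybrid_action)
#   # ...append transitions to the rollout buffers, then:
#   e1, e2 = agent.update_network(buf_s, buf_a, buf_p, buf_lp1, buf_lp2,
#                                 buf_r, buf_mask, len(buf_s))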
import random
from collections import deque, namedtuple
import os
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from .common import Agent
dueling = False
scale = True
Transition = namedtuple("Transition",
("state", "action", "params", "n_step_reward", "next_state", "done", "mc_target"))
class ParamsNet(nn.Module):
def __init__(self, input_size, params_size, hidden_size=None):
super(ParamsNet, self).__init__()
if hidden_size is None:
hidden_size = [256, 128, 64]
self.layers = nn.ModuleList([nn.Linear(input_size, hidden_size[0])])
for x, y in zip(hidden_size[:-1], hidden_size[1:]):
self.layers.append(nn.Linear(x, y))
self.output_layer = nn.Linear(hidden_size[-1], params_size)
def forward(self, state):
out = state
for hidden_layer in self.layers:
out = F.leaky_relu(hidden_layer(out), 0.01)
out = self.output_layer(out)
if scale:
out = torch.tanh(out)
return out
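# ParamsNet maps a state to the full continuous-parameter vector covering every
# discrete action at once; with the module-level `scale` flag set, the outputs
# are tanh-squashed to [-1, 1], matching the range Agent.norm_to expects.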
class QNet(nn.Module):
def __init__(self, input_size, n_actions, params_size, hidden_size=None):
super(QNet, self).__init__()
if hidden_size is None:
hidden_size = [256, 128, 64, 64]
input_size = input_size + params_size
self.layers = nn.ModuleList([nn.Linear(input_size, hidden_size[0])])
for x, y in zip(hidden_size[:-1], hidden_size[1:]):
self.layers.append(nn.Linear(x, y))
if not dueling:
self.q = nn.Linear(hidden_size[-1], n_actions)
else:
self.adv = nn.Linear(hidden_size[-1], n_actions)
self.v = nn.Linear(hidden_size[-1], 1)
def forward(self, state, params):
out = torch.cat((state, params), dim=1)
for hidden_layer in self.layers:
out = F.leaky_relu(hidden_layer(out), 0.01)
if not dueling:
q_val = self.q(out)
else:
v = self.v(out)
adv = self.adv(out)
q_val = v + (adv - adv.mean(dim=1, keepdim=True))
return q_val
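# QNet scores the concatenated (state, all-parameters) input with one Q-value per
# discrete action; when the module-level `dueling` flag is set it instead combines
# value and advantage heads as Q = V + (A - mean(A)).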
class MultiPassQNet(nn.Module):
def __init__(self, state_size, n_actions, each_params_size, device, hidden_size=None):
super(MultiPassQNet, self).__init__()
self.device = device
self.n_actions = n_actions
self.each_params_loc = []
s = 0
for size in each_params_size:
self.each_params_loc.append(list(range(s, s + size)))
s += size
self.params_size = sum(each_params_size)
if hidden_size is None:
hidden_size = [256, 128, 64, 64]
input_size = state_size + self.params_size
self.layers = nn.ModuleList([nn.Linear(input_size, hidden_size[0])])
for x, y in zip(hidden_size[:-1], hidden_size[1:]):
self.layers.append(nn.Linear(x, y))
if not dueling:
self.q = nn.Linear(hidden_size[-1], n_actions)
else:
self.adv = nn.Linear(hidden_size[-1], n_actions)
self.v = nn.Linear(hidden_size[-1], 1)
self.to