Commit 6562b854 authored by hezhiqiang01

initial commit

parent d4b028ab
from mappo import algorithms, envs, runner, scripts, utils, config
__version__ = "0.1.0"
__all__ = [
"algorithms",
"envs",
"runner",
"scripts",
"utils",
"config",
]
"""
# @Time : 2021/7/1 6:49 下午
# @Author : hezhiqiang01
# @Email : hezhiqiang01@baidu.com
# @File : __init__.py
"""
"""
# @Time : 2021/7/1 6:53 下午
# @Author : hezhiqiang01
# @Email : hezhiqiang01@baidu.com
# @File : rMAPPOPolicy.py
"""
import torch
from mappo.algorithms.algorithm.r_actor_critic import R_Actor, R_Critic
from mappo.utils.util import update_linear_schedule
class RMAPPOPolicy:
"""
MAPPO Policy class. Wraps actor and critic networks to compute actions and value function predictions.
:param args: (argparse.Namespace) arguments containing relevant model and policy information.
:param obs_space: (gym.Space) observation space.
:param cent_obs_space: (gym.Space) value function input space (centralized input for MAPPO, decentralized for IPPO).
:param act_space: (gym.Space) action space.
:param device: (torch.device) specifies the device to run on (cpu/gpu).
"""
def __init__(self, args, obs_space, cent_obs_space, act_space, device=torch.device("cpu")):
self.device = device
self.lr = args.lr
self.critic_lr = args.critic_lr
self.opti_eps = args.opti_eps
self.weight_decay = args.weight_decay
self.obs_space = obs_space
self.share_obs_space = cent_obs_space
self.act_space = act_space
self.actor = R_Actor(args, self.obs_space, self.act_space, self.device)
self.critic = R_Critic(args, self.share_obs_space, self.device)
self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
lr=self.lr, eps=self.opti_eps,
weight_decay=self.weight_decay)
self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
lr=self.critic_lr,
eps=self.opti_eps,
weight_decay=self.weight_decay)
def lr_decay(self, episode, episodes):
"""
Decay the actor and critic learning rates.
:param episode: (int) current training episode.
:param episodes: (int) total number of training episodes.
"""
update_linear_schedule(self.actor_optimizer, episode, episodes, self.lr)
update_linear_schedule(self.critic_optimizer, episode, episodes, self.critic_lr)
def get_actions(self, cent_obs, obs, rnn_states_actor, rnn_states_critic, masks, available_actions=None,
deterministic=False):
"""
Compute actions and value function predictions for the given inputs.
:param cent_obs (np.ndarray): centralized input to the critic.
:param obs (np.ndarray): local agent inputs to the actor.
:param rnn_states_actor: (np.ndarray) if actor is RNN, RNN states for actor.
:param rnn_states_critic: (np.ndarray) if critic is RNN, RNN states for critic.
:param masks: (np.ndarray) denotes points at which RNN states should be reset.
:param available_actions: (np.ndarray) denotes which actions are available to agent
(if None, all actions available)
:param deterministic: (bool) whether the action should be the mode of the distribution or sampled from it.
:return values: (torch.Tensor) value function predictions.
:return actions: (torch.Tensor) actions to take.
:return action_log_probs: (torch.Tensor) log probabilities of chosen actions.
:return rnn_states_actor: (torch.Tensor) updated actor network RNN states.
:return rnn_states_critic: (torch.Tensor) updated critic network RNN states.
"""
actions, action_log_probs, rnn_states_actor = self.actor(obs,
rnn_states_actor,
masks,
available_actions,
deterministic)
values, rnn_states_critic = self.critic(cent_obs, rnn_states_critic, masks)
return values, actions, action_log_probs, rnn_states_actor, rnn_states_critic
def get_values(self, cent_obs, rnn_states_critic, masks):
"""
Get value function predictions.
:param cent_obs (np.ndarray): centralized input to the critic.
:param rnn_states_critic: (np.ndarray) if critic is RNN, RNN states for critic.
:param masks: (np.ndarray) denotes points at which RNN states should be reset.
:return values: (torch.Tensor) value function predictions.
"""
values, _ = self.critic(cent_obs, rnn_states_critic, masks)
return values
def evaluate_actions(self, cent_obs, obs, rnn_states_actor, rnn_states_critic, action, masks,
available_actions=None, active_masks=None):
"""
Get action logprobs / entropy and value function predictions for actor update.
:param cent_obs (np.ndarray): centralized input to the critic.
:param obs (np.ndarray): local agent inputs to the actor.
:param rnn_states_actor: (np.ndarray) if actor is RNN, RNN states for actor.
:param rnn_states_critic: (np.ndarray) if critic is RNN, RNN states for critic.
:param action: (np.ndarray) actions whose log probabilities and entropy to compute.
:param masks: (np.ndarray) denotes points at which RNN states should be reset.
:param available_actions: (np.ndarray) denotes which actions are available to agent
(if None, all actions available)
:param active_masks: (torch.Tensor) denotes whether an agent is active or dead.
:return values: (torch.Tensor) value function predictions.
:return action_log_probs: (torch.Tensor) log probabilities of the input actions.
:return dist_entropy: (torch.Tensor) action distribution entropy for the given inputs.
"""
action_log_probs, dist_entropy = self.actor.evaluate_actions(obs,
rnn_states_actor,
action,
masks,
available_actions,
active_masks)
values, _ = self.critic(cent_obs, rnn_states_critic, masks)
return values, action_log_probs, dist_entropy
def act(self, obs, rnn_states_actor, masks, available_actions=None, deterministic=False):
"""
Compute actions using the given inputs.
:param obs (np.ndarray): local agent inputs to the actor.
:param rnn_states_actor: (np.ndarray) if actor is RNN, RNN states for actor.
:param masks: (np.ndarray) denotes points at which RNN states should be reset.
:param available_actions: (np.ndarray) denotes which actions are available to agent
(if None, all actions available)
:param deterministic: (bool) whether the action should be the mode of the distribution or sampled from it.
"""
actions, _, rnn_states_actor = self.actor(obs, rnn_states_actor, masks, available_actions, deterministic)
return actions, rnn_states_actor
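# ---------------------------------------------------------------------------
# Usage sketch (not part of the original file): a minimal, hypothetical example
# of driving RMAPPOPolicy.get_actions() with dummy data. It assumes the package
# is importable as `mappo`, that gym is installed, and that
# mappo.utils.util.get_shape_from_obs_space returns obs_space.shape for Box
# spaces. The argparse.Namespace below lists only the attributes read by this
# file and r_actor_critic.py; the real config defines many more.
# ---------------------------------------------------------------------------
import argparse

import numpy as np
from gym import spaces

from mappo.algorithms.algorithm.rMAPPOPolicy import RMAPPOPolicy

args = argparse.Namespace(
    # optimizer settings read by RMAPPOPolicy.__init__
    lr=5e-4, critic_lr=5e-4, opti_eps=1e-5, weight_decay=0.0,
    # network settings read by R_Actor / R_Critic / MLPBase
    hidden_size=64, layer_N=1, gain=0.01,
    use_orthogonal=True, use_ReLU=True, use_feature_normalization=True,
    use_policy_active_masks=True,
    use_naive_recurrent_policy=False, use_recurrent_policy=True,
    recurrent_N=1, use_popart=False, stacked_frames=1,
)

batch = 8  # one rollout step for 8 parallel agents/envs
obs_space = spaces.Box(low=-1.0, high=1.0, shape=(18,), dtype=np.float32)
cent_obs_space = spaces.Box(low=-1.0, high=1.0, shape=(54,), dtype=np.float32)
act_space = spaces.Discrete(5)

policy = RMAPPOPolicy(args, obs_space, cent_obs_space, act_space)

obs = np.random.rand(batch, 18).astype(np.float32)
cent_obs = np.random.rand(batch, 54).astype(np.float32)
rnn_states_actor = np.zeros((batch, args.recurrent_N, args.hidden_size), dtype=np.float32)
rnn_states_critic = np.zeros_like(rnn_states_actor)
masks = np.ones((batch, 1), dtype=np.float32)  # 0 where an episode just ended

values, actions, action_log_probs, rnn_states_actor, rnn_states_critic = \
    policy.get_actions(cent_obs, obs, rnn_states_actor, rnn_states_critic, masks)
# values: (8, 1), actions: (8, 1), action_log_probs: (8, 1)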
"""
# @Time : 2021/7/1 6:53 下午
# @Author : hezhiqiang01
# @Email : hezhiqiang01@baidu.com
# @File : r_actor_critic.py
"""
import torch
import torch.nn as nn
from mappo.algorithms.utils.util import init, check
from mappo.algorithms.utils.cnn import CNNBase
from mappo.algorithms.utils.mlp import MLPBase
from mappo.algorithms.utils.rnn import RNNLayer
from mappo.algorithms.utils.act import ACTLayer
from mappo.algorithms.utils.popart import PopArt
from mappo.utils.util import get_shape_from_obs_space
class R_Actor(nn.Module):
"""
Actor network class for MAPPO. Outputs actions given observations.
:param args: (argparse.Namespace) arguments containing relevant model information.
:param obs_space: (gym.Space) observation space.
:param action_space: (gym.Space) action space.
:param device: (torch.device) specifies the device to run on (cpu/gpu).
"""
def __init__(self, args, obs_space, action_space, device=torch.device("cpu")):
super(R_Actor, self).__init__()
self.hidden_size = args.hidden_size
self._gain = args.gain
self._use_orthogonal = args.use_orthogonal
self._use_policy_active_masks = args.use_policy_active_masks
self._use_naive_recurrent_policy = args.use_naive_recurrent_policy
self._use_recurrent_policy = args.use_recurrent_policy
self._recurrent_N = args.recurrent_N
self.tpdv = dict(dtype=torch.float32, device=device)
obs_shape = get_shape_from_obs_space(obs_space)
base = CNNBase if len(obs_shape) == 3 else MLPBase
self.base = base(args, obs_shape)
if self._use_naive_recurrent_policy or self._use_recurrent_policy:
self.rnn = RNNLayer(self.hidden_size, self.hidden_size, self._recurrent_N, self._use_orthogonal)
self.act = ACTLayer(action_space, self.hidden_size, self._use_orthogonal, self._gain)
self.to(device)
def forward(self, obs, rnn_states, masks, available_actions=None, deterministic=False):
"""
Compute actions from the given inputs.
:param obs: (np.ndarray / torch.Tensor) observation inputs into network.
:param rnn_states: (np.ndarray / torch.Tensor) if RNN network, hidden states for RNN.
:param masks: (np.ndarray / torch.Tensor) mask tensor denoting if hidden states should be reinitialized to zeros.
:param available_actions: (np.ndarray / torch.Tensor) denotes which actions are available to agent
(if None, all actions available)
:param deterministic: (bool) whether to sample from action distribution or return the mode.
:return actions: (torch.Tensor) actions to take.
:return action_log_probs: (torch.Tensor) log probabilities of taken actions.
:return rnn_states: (torch.Tensor) updated RNN hidden states.
"""
obs = check(obs).to(**self.tpdv)
rnn_states = check(rnn_states).to(**self.tpdv)
masks = check(masks).to(**self.tpdv)
if available_actions is not None:
available_actions = check(available_actions).to(**self.tpdv)
actor_features = self.base(obs)
if self._use_naive_recurrent_policy or self._use_recurrent_policy:
actor_features, rnn_states = self.rnn(actor_features, rnn_states, masks)
actions, action_log_probs = self.act(actor_features, available_actions, deterministic)
return actions, action_log_probs, rnn_states
def evaluate_actions(self, obs, rnn_states, action, masks, available_actions=None, active_masks=None):
"""
Compute log probability and entropy of given actions.
:param obs: (torch.Tensor) observation inputs into network.
:param action: (torch.Tensor) actions whose entropy and log probability to evaluate.
:param rnn_states: (torch.Tensor) if RNN network, hidden states for RNN.
:param masks: (torch.Tensor) mask tensor denoting if hidden states should be reinitialized to zeros.
:param available_actions: (torch.Tensor) denotes which actions are available to agent
(if None, all actions available)
:param active_masks: (torch.Tensor) denotes whether an agent is active or dead.
:return action_log_probs: (torch.Tensor) log probabilities of the input actions.
:return dist_entropy: (torch.Tensor) action distribution entropy for the given inputs.
"""
obs = check(obs).to(**self.tpdv)
rnn_states = check(rnn_states).to(**self.tpdv)
action = check(action).to(**self.tpdv)
masks = check(masks).to(**self.tpdv)
if available_actions is not None:
available_actions = check(available_actions).to(**self.tpdv)
if active_masks is not None:
active_masks = check(active_masks).to(**self.tpdv)
actor_features = self.base(obs)
if self._use_naive_recurrent_policy or self._use_recurrent_policy:
actor_features, rnn_states = self.rnn(actor_features, rnn_states, masks)
action_log_probs, dist_entropy = self.act.evaluate_actions(actor_features,
action, available_actions,
active_masks=
active_masks if self._use_policy_active_masks
else None)
return action_log_probs, dist_entropy
class R_Critic(nn.Module):
"""
Critic network class for MAPPO. Outputs value function predictions given centralized input (MAPPO) or
local observations (IPPO).
:param args: (argparse.Namespace) arguments containing relevant model information.
:param cent_obs_space: (gym.Space) (centralized) observation space.
:param device: (torch.device) specifies the device to run on (cpu/gpu).
"""
def __init__(self, args, cent_obs_space, device=torch.device("cpu")):
super(R_Critic, self).__init__()
self.hidden_size = args.hidden_size
self._use_orthogonal = args.use_orthogonal
self._use_naive_recurrent_policy = args.use_naive_recurrent_policy
self._use_recurrent_policy = args.use_recurrent_policy
self._recurrent_N = args.recurrent_N
self._use_popart = args.use_popart
self.tpdv = dict(dtype=torch.float32, device=device)
init_method = [nn.init.xavier_uniform_, nn.init.orthogonal_][self._use_orthogonal]
cent_obs_shape = get_shape_from_obs_space(cent_obs_space)
base = CNNBase if len(cent_obs_shape) == 3 else MLPBase
self.base = base(args, cent_obs_shape)
if self._use_naive_recurrent_policy or self._use_recurrent_policy:
self.rnn = RNNLayer(self.hidden_size, self.hidden_size, self._recurrent_N, self._use_orthogonal)
def init_(m):
return init(m, init_method, lambda x: nn.init.constant_(x, 0))
if self._use_popart:
self.v_out = init_(PopArt(self.hidden_size, 1, device=device))
else:
self.v_out = init_(nn.Linear(self.hidden_size, 1))
self.to(device)
def forward(self, cent_obs, rnn_states, masks):
"""
Compute value function predictions from the given inputs.
:param cent_obs: (np.ndarray / torch.Tensor) observation inputs into network.
:param rnn_states: (np.ndarray / torch.Tensor) if RNN network, hidden states for RNN.
:param masks: (np.ndarray / torch.Tensor) mask tensor denoting if RNN states should be reinitialized to zeros.
:return values: (torch.Tensor) value function predictions.
:return rnn_states: (torch.Tensor) updated RNN hidden states.
"""
cent_obs = check(cent_obs).to(**self.tpdv)
rnn_states = check(rnn_states).to(**self.tpdv)
masks = check(masks).to(**self.tpdv)
critic_features = self.base(cent_obs)
if self._use_naive_recurrent_policy or self._use_recurrent_policy:
critic_features, rnn_states = self.rnn(critic_features, rnn_states, masks)
values = self.v_out(critic_features)
return values, rnn_states
from .distributions import Bernoulli, Categorical, DiagGaussian
import torch
import torch.nn as nn
class ACTLayer(nn.Module):
"""
MLP Module to compute actions.
:param action_space: (gym.Space) action space.
:param inputs_dim: (int) dimension of network input.
:param use_orthogonal: (bool) whether to use orthogonal initialization.
:param gain: (float) gain of the output layer of the network.
"""
def __init__(self, action_space, inputs_dim, use_orthogonal, gain):
super(ACTLayer, self).__init__()
self.mixed_action = False
self.multi_discrete = False
if action_space.__class__.__name__ == "Discrete":
action_dim = action_space.n
self.action_out = Categorical(inputs_dim, action_dim, use_orthogonal, gain)
elif action_space.__class__.__name__ == "Box":
action_dim = action_space.shape[0]
self.action_out = DiagGaussian(inputs_dim, action_dim, use_orthogonal, gain)
elif action_space.__class__.__name__ == "MultiBinary":
action_dim = action_space.shape[0]
self.action_out = Bernoulli(inputs_dim, action_dim, use_orthogonal, gain)
elif action_space.__class__.__name__ == "MultiDiscrete":
self.multi_discrete = True
action_dims = action_space.high - action_space.low + 1
self.action_outs = []
for action_dim in action_dims:
self.action_outs.append(Categorical(inputs_dim, action_dim, use_orthogonal, gain))
self.action_outs = nn.ModuleList(self.action_outs)
else:  # discrete + continuous
self.mixed_action = True
continuous_dim = action_space[0].shape[0]
discrete_dim = action_space[1].n
self.action_outs = nn.ModuleList([DiagGaussian(inputs_dim, continuous_dim, use_orthogonal, gain), Categorical(
inputs_dim, discrete_dim, use_orthogonal, gain)])
def forward(self, x, available_actions=None, deterministic=False):
"""
Compute actions and action logprobs from given input.
:param x: (torch.Tensor) input to network.
:param available_actions: (torch.Tensor) denotes which actions are available to agent
(if None, all actions available)
:param deterministic: (bool) whether to sample from action distribution or return the mode.
:return actions: (torch.Tensor) actions to take.
:return action_log_probs: (torch.Tensor) log probabilities of taken actions.
"""
if self.mixed_action :
actions = []
action_log_probs = []
for action_out in self.action_outs:
action_logit = action_out(x)
action = action_logit.mode() if deterministic else action_logit.sample()
action_log_prob = action_logit.log_probs(action)
actions.append(action.float())
action_log_probs.append(action_log_prob)
actions = torch.cat(actions, -1)
action_log_probs = torch.sum(torch.cat(action_log_probs, -1), -1, keepdim=True)
elif self.multi_discrete:
actions = []
action_log_probs = []
for action_out in self.action_outs:
action_logit = action_out(x)
action = action_logit.mode() if deterministic else action_logit.sample()
action_log_prob = action_logit.log_probs(action)
actions.append(action)
action_log_probs.append(action_log_prob)
actions = torch.cat(actions, -1)
action_log_probs = torch.cat(action_log_probs, -1)
else:
action_logits = self.action_out(x, available_actions)
actions = action_logits.mode() if deterministic else action_logits.sample()
action_log_probs = action_logits.log_probs(actions)
return actions, action_log_probs
def get_probs(self, x, available_actions=None):
"""
Compute action probabilities from inputs.
:param x: (torch.Tensor) input to network.
:param available_actions: (torch.Tensor) denotes which actions are available to agent
(if None, all actions available)
:return action_probs: (torch.Tensor) probabilities of each action.
"""
if self.mixed_action or self.multi_discrete:
action_probs = []
for action_out in self.action_outs:
action_logit = action_out(x)
action_prob = action_logit.probs
action_probs.append(action_prob)
action_probs = torch.cat(action_probs, -1)
else:
action_logits = self.action_out(x, available_actions)
action_probs = action_logits.probs
return action_probs
def evaluate_actions(self, x, action, available_actions=None, active_masks=None):
"""
Compute log probability and entropy of given actions.
:param x: (torch.Tensor) input to network.
:param action: (torch.Tensor) actions whose entropy and log probability to evaluate.
:param available_actions: (torch.Tensor) denotes which actions are available to agent
(if None, all actions available)
:param active_masks: (torch.Tensor) denotes whether an agent is active or dead.
:return action_log_probs: (torch.Tensor) log probabilities of the input actions.
:return dist_entropy: (torch.Tensor) action distribution entropy for the given inputs.
"""
if self.mixed_action:
a, b = action.split((2, 1), -1)
b = b.long()
action = [a, b]
action_log_probs = []
dist_entropy = []
for action_out, act in zip(self.action_outs, action):
action_logit = action_out(x)
action_log_probs.append(action_logit.log_probs(act))
if active_masks is not None:
if len(action_logit.entropy().shape) == len(active_masks.shape):
dist_entropy.append((action_logit.entropy() * active_masks).sum()/active_masks.sum())
else:
dist_entropy.append((action_logit.entropy() * active_masks.squeeze(-1)).sum()/active_masks.sum())
else:
dist_entropy.append(action_logit.entropy().mean())
action_log_probs = torch.sum(torch.cat(action_log_probs, -1), -1, keepdim=True)
dist_entropy = dist_entropy[0] / 2.0 + dist_entropy[1] / 0.98  #! these hard-coded weights don't make sense
elif self.multi_discrete:
action = torch.transpose(action, 0, 1)
action_log_probs = []
dist_entropy = []
for action_out, act in zip(self.action_outs, action):
action_logit = action_out(x)
action_log_probs.append(action_logit.log_probs(act))
if active_masks is not None:
dist_entropy.append((action_logit.entropy()*active_masks.squeeze(-1)).sum()/active_masks.sum())
else:
dist_entropy.append(action_logit.entropy().mean())
action_log_probs = torch.cat(action_log_probs, -1) # ! could be wrong
dist_entropy = torch.tensor(dist_entropy).mean()
else:
action_logits = self.action_out(x, available_actions)
action_log_probs = action_logits.log_probs(action)
if active_masks is not None:
dist_entropy = (action_logits.entropy()*active_masks.squeeze(-1)).sum()/active_masks.sum()
else:
dist_entropy = action_logits.entropy().mean()
return action_log_probs, dist_entropy
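# ---------------------------------------------------------------------------
# Usage sketch (not part of the original file): a hypothetical example of the
# Discrete branch of ACTLayer, including available-action masking. Assumes the
# package is importable as `mappo` and that gym is installed.
# ---------------------------------------------------------------------------
import torch
from gym import spaces

from mappo.algorithms.utils.act import ACTLayer

layer = ACTLayer(spaces.Discrete(5), inputs_dim=64, use_orthogonal=True, gain=0.01)

x = torch.randn(8, 64)                     # actor features for a batch of 8
available_actions = torch.ones(8, 5)
available_actions[:, 2] = 0                # action 2 is illegal everywhere

actions, action_log_probs = layer(x, available_actions)   # (8, 1), (8, 1)
# the masked logits are set to -1e10, so action 2 gets ~0 probability

# log-probs / entropy of stored actions, as used during the PPO update
log_probs, entropy = layer.evaluate_actions(x, actions, available_actions)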
import torch.nn as nn
from .util import init
"""CNN Modules and utils."""
class Flatten(nn.Module):
def forward(self, x):
return x.view(x.size(0), -1)
class CNNLayer(nn.Module):
def __init__(self, obs_shape, hidden_size, use_orthogonal, use_ReLU, kernel_size=3, stride=1):
super(CNNLayer, self).__init__()
active_func = [nn.Tanh(), nn.ReLU()][use_ReLU]
init_method = [nn.init.xavier_uniform_, nn.init.orthogonal_][use_orthogonal]
gain = nn.init.calculate_gain(['tanh', 'relu'][use_ReLU])
def init_(m):
return init(m, init_method, lambda x: nn.init.constant_(x, 0), gain=gain)
input_channel = obs_shape[0]
input_width = obs_shape[1]
input_height = obs_shape[2]
self.cnn = nn.Sequential(
init_(nn.Conv2d(in_channels=input_channel,
out_channels=hidden_size // 2,
kernel_size=kernel_size,
stride=stride)
),
active_func,
Flatten(),
init_(nn.Linear(hidden_size // 2 * (input_width - kernel_size + stride) * (input_height - kernel_size + stride),
hidden_size)
),
active_func,
init_(nn.Linear(hidden_size, hidden_size)), active_func)
def forward(self, x):
x = x / 255.0
x = self.cnn(x)
return x
class CNNBase(nn.Module):
def __init__(self, args, obs_shape):
super(CNNBase, self).__init__()
self._use_orthogonal = args.use_orthogonal
self._use_ReLU = args.use_ReLU
self.hidden_size = args.hidden_size
self.cnn = CNNLayer(obs_shape, self.hidden_size, self._use_orthogonal, self._use_ReLU)
def forward(self, x):
x = self.cnn(x)
return x
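# ---------------------------------------------------------------------------
# Shape check (not part of the original file): the first Linear in CNNLayer is
# sized as hidden_size // 2 * (W - k + s) * (H - k + s). With the default
# kernel_size=3, stride=1 this equals the usual conv output size
# floor((W - k) / s) + 1 per spatial dim (for stride > 1 the two differ, but
# the default stride here is 1). Hypothetical numbers below, using plain torch.
# ---------------------------------------------------------------------------
import torch
import torch.nn as nn

hidden_size, kernel_size, stride = 64, 3, 1
obs_shape = (4, 84, 84)                          # (C, W, H), e.g. stacked frames

conv = nn.Conv2d(obs_shape[0], hidden_size // 2, kernel_size, stride)
out = conv(torch.zeros(1, *obs_shape))           # -> (1, 32, 82, 82)

flat = out.view(1, -1).size(1)
expected = hidden_size // 2 * (obs_shape[1] - kernel_size + stride) * (obs_shape[2] - kernel_size + stride)
assert flat == expected                          # 32 * 82 * 82 = 215168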
import torch
import torch.nn as nn
from .util import init
"""
Modify standard PyTorch distributions to make them compatible with this codebase.
"""
#
# Standardize distribution interfaces
#
# Categorical
class FixedCategorical(torch.distributions.Categorical):
def sample(self):
return super().sample().unsqueeze(-1)
def log_probs(self, actions):
return (
super()
.log_prob(actions.squeeze(-1))
.view(actions.size(0), -1)
.sum(-1)
.unsqueeze(-1)
)
def mode(self):
return self.probs.argmax(dim=-1, keepdim=True)
# Normal
class FixedNormal(torch.distributions.Normal):
def log_probs(self, actions):
return super().log_prob(actions).sum(-1, keepdim=True)
def entropy(self):
return super().entropy().sum(-1)
def mode(self):
return self.mean
# Bernoulli
class FixedBernoulli(torch.distributions.Bernoulli):
def log_probs(self, actions):
return super().log_prob(actions).view(actions.size(0), -1).sum(-1).unsqueeze(-1)
def entropy(self):
return super().entropy().sum(-1)
def mode(self):
return torch.gt(self.probs, 0.5).float()
class Categorical(nn.Module):
def __init__(self, num_inputs, num_outputs, use_orthogonal=True, gain=0.01):
super(Categorical, self).__init__()
init_method = [nn.init.xavier_uniform_, nn.init.orthogonal_][use_orthogonal]
def init_(m):
return init(m, init_method, lambda x: nn.init.constant_(x, 0), gain)
self.linear = init_(nn.Linear(num_inputs, num_outputs))
def forward(self, x, available_actions=None):
x = self.linear(x)
if available_actions is not None:
x[available_actions == 0] = -1e10
return FixedCategorical(logits=x)
class DiagGaussian(nn.Module):
def __init__(self, num_inputs, num_outputs, use_orthogonal=True, gain=0.01):
super(DiagGaussian, self).__init__()
init_method = [nn.init.xavier_uniform_, nn.init.orthogonal_][use_orthogonal]
def init_(m):
return init(m, init_method, lambda x: nn.init.constant_(x, 0), gain)
self.fc_mean = init_(nn.Linear(num_inputs, num_outputs))
self.logstd = AddBias(torch.zeros(num_outputs))
def forward(self, x):
action_mean = self.fc_mean(x)
# An ugly hack for my KFAC implementation.
zeros = torch.zeros(action_mean.size())
if x.is_cuda:
zeros = zeros.cuda()
action_logstd = self.logstd(zeros)
return FixedNormal(action_mean, action_logstd.exp())
class Bernoulli(nn.Module):
def __init__(self, num_inputs, num_outputs, use_orthogonal=True, gain=0.01):
super(Bernoulli, self).__init__()
init_method = [nn.init.xavier_uniform_, nn.init.orthogonal_][use_orthogonal]
def init_(m):
return init(m, init_method, lambda x: nn.init.constant_(x, 0), gain)
self.linear = init_(nn.Linear(num_inputs, num_outputs))
def forward(self, x):
x = self.linear(x)
return FixedBernoulli(logits=x)
class AddBias(nn.Module):
def __init__(self, bias):
super(AddBias, self).__init__()
self._bias = nn.Parameter(bias.unsqueeze(1))
def forward(self, x):
if x.dim() == 2:
bias = self._bias.t().view(1, -1)
else:
bias = self._bias.t().view(1, -1, 1, 1)
return x + bias
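# ---------------------------------------------------------------------------
# Shape-contract sketch (not part of the original file): the Fixed* wrappers
# keep every per-sample quantity as a trailing size-1 dimension, so actions,
# log-probs, and values can be concatenated along the last axis. Hypothetical
# numbers; assumes the package is importable as `mappo`.
# ---------------------------------------------------------------------------
import torch

from mappo.algorithms.utils.distributions import FixedCategorical

dist = FixedCategorical(logits=torch.randn(8, 5))

actions = dist.sample()              # (8, 1) instead of torch's default (8,)
log_probs = dist.log_probs(actions)  # (8, 1)
mode = dist.mode()                   # (8, 1), argmax of the probabilities
entropy = dist.entropy()             # (8,), unchanged from torch.distributions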
import torch.nn as nn
from .util import init, get_clones
"""MLP modules."""
class MLPLayer(nn.Module):
def __init__(self, input_dim, hidden_size, layer_N, use_orthogonal, use_ReLU):
super(MLPLayer, self).__init__()
self._layer_N = layer_N
active_func = [nn.Tanh(), nn.ReLU()][use_ReLU]
init_method = [nn.init.xavier_uniform_, nn.init.orthogonal_][use_orthogonal]
gain = nn.init.calculate_gain(['tanh', 'relu'][use_ReLU])
def init_(m):
return init(m, init_method, lambda x: nn.init.constant_(x, 0), gain=gain)
self.fc1 = nn.Sequential(
init_(nn.Linear(input_dim, hidden_size)), active_func, nn.LayerNorm(hidden_size))
self.fc_h = nn.Sequential(init_(
nn.Linear(hidden_size, hidden_size)), active_func, nn.LayerNorm(hidden_size))
self.fc2 = get_clones(self.fc_h, self._layer_N)
def forward(self, x):
x = self.fc1(x)
for i in range(self._layer_N):
x = self.fc2[i](x)
return x
class MLPBase(nn.Module):
def __init__(self, args, obs_shape, cat_self=True, attn_internal=False):
super(MLPBase, self).__init__()
self._use_feature_normalization = args.use_feature_normalization
self._use_orthogonal = args.use_orthogonal
self._use_ReLU = args.use_ReLU
self._stacked_frames = args.stacked_frames
self._layer_N = args.layer_N
self.hidden_size = args.hidden_size
obs_dim = obs_shape[0]
if self._use_feature_normalization:
self.feature_norm = nn.LayerNorm(obs_dim)
self.mlp = MLPLayer(obs_dim, self.hidden_size,
self._layer_N, self._use_orthogonal, self._use_ReLU)
def forward(self, x):
if self._use_feature_normalization:
x = self.feature_norm(x)
x = self.mlp(x)
return x
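# ---------------------------------------------------------------------------
# Usage sketch (not part of the original file): MLPBase maps a flat observation
# to a hidden_size feature vector. The argparse.Namespace lists only the
# attributes read here; the real config defines more. Hypothetical numbers;
# assumes the package is importable as `mappo`.
# ---------------------------------------------------------------------------
import argparse

import torch

from mappo.algorithms.utils.mlp import MLPBase

args = argparse.Namespace(use_feature_normalization=True, use_orthogonal=True,
                          use_ReLU=True, stacked_frames=1, layer_N=1,
                          hidden_size=64)

base = MLPBase(args, obs_shape=(18,))
features = base(torch.randn(8, 18))   # -> (8, 64)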
import math
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
class PopArt(torch.nn.Module):
def __init__(self, input_shape, output_shape, norm_axes=1, beta=0.99999, epsilon=1e-5, device=torch.device("cpu")):
super(PopArt, self).__init__()
self.beta = beta
self.epsilon = epsilon
self.norm_axes = norm_axes
self.tpdv = dict(dtype=torch.float32, device=device)
self.input_shape = input_shape
self.output_shape = output_shape
self.weight = nn.Parameter(torch.Tensor(output_shape, input_shape)).to(**self.tpdv)
self.bias = nn.Parameter(torch.Tensor(output_shape)).to(**self.tpdv)
self.stddev = nn.Parameter(torch.ones(output_shape), requires_grad=False).to(**self.tpdv)
self.mean = nn.Parameter(torch.zeros(output_shape), requires_grad=False).to(**self.tpdv)
self.mean_sq = nn.Parameter(torch.zeros(output_shape), requires_grad=False).to(**self.tpdv)
self.debiasing_term = nn.Parameter(torch.tensor(0.0), requires_grad=False).to(**self.tpdv)
self.reset_parameters()
def reset_parameters(self):
torch.nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))
if self.bias is not None:
fan_in, _ = torch.nn.init._calculate_fan_in_and_fan_out(self.weight)
bound = 1 / math.sqrt(fan_in)
torch.nn.init.uniform_(self.bias, -bound, bound)
self.mean.zero_()
self.mean_sq.zero_()
self.debiasing_term.zero_()
def forward(self, input_vector):
if type(input_vector) == np.ndarray:
input_vector = torch.from_numpy(input_vector)
input_vector = input_vector.to(**self.tpdv)
return F.linear(input_vector, self.weight, self.bias)
@torch.no_grad()
def update(self, input_vector):
if type(input_vector) == np.ndarray:
input_vector = torch.from_numpy(input_vector)
input_vector = input_vector.to(**self.tpdv)
old_mean, old_stddev = self.mean, self.stddev
batch_mean = input_vector.mean(dim=tuple(range(self.norm_axes)))
batch_sq_mean = (input_vector ** 2).mean(dim=tuple(range(self.norm_axes)))
self.mean.mul_(self.beta).add_(batch_mean * (1.0 - self.beta))
self.mean_sq.mul_(self.beta).add_(batch_sq_mean * (1.0 - self.beta))
self.debiasing_term.mul_(self.beta).add_(1.0 * (1.0 - self.beta))
self.stddev = (self.mean_sq - self.mean ** 2).sqrt().clamp(min=1e-4)
self.weight = self.weight * old_stddev / self.stddev
self.bias = (old_stddev * self.bias + old_mean - self.mean) / self.stddev
def debiased_mean_var(self):
debiased_mean = self.mean / self.debiasing_term.clamp(min=self.epsilon)
debiased_mean_sq = self.mean_sq / self.debiasing_term.clamp(min=self.epsilon)
debiased_var = (debiased_mean_sq - debiased_mean ** 2).clamp(min=1e-2)
return debiased_mean, debiased_var
def normalize(self, input_vector):
if type(input_vector) == np.ndarray:
input_vector = torch.from_numpy(input_vector)
input_vector = input_vector.to(**self.tpdv)
mean, var = self.debiased_mean_var()
out = (input_vector - mean[(None,) * self.norm_axes]) / torch.sqrt(var)[(None,) * self.norm_axes]
return out
def denormalize(self, input_vector):
if type(input_vector) == np.ndarray:
input_vector = torch.from_numpy(input_vector)
input_vector = input_vector.to(**self.tpdv)
mean, var = self.debiased_mean_var()
out = input_vector * torch.sqrt(var)[(None,) * self.norm_axes] + mean[(None,) * self.norm_axes]
out = out.cpu().numpy()
return out
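# ---------------------------------------------------------------------------
# Math sketch (not part of the original file): a standalone re-derivation of
# the debiased running statistics that PopArt.update() and debiased_mean_var()
# maintain, written with plain tensors so it runs in isolation. beta is the
# EMA decay; the debiasing term corrects the bias of an EMA initialized at
# zero (as in Adam). Numbers are hypothetical.
# ---------------------------------------------------------------------------
import torch

beta = 0.99999
mean = torch.zeros(1)
mean_sq = torch.zeros(1)
debiasing_term = torch.tensor(0.0)

for _ in range(100):
    batch = torch.randn(256, 1) * 3.0 + 10.0   # value targets with mean 10, std 3
    mean = beta * mean + (1.0 - beta) * batch.mean(dim=0)
    mean_sq = beta * mean_sq + (1.0 - beta) * (batch ** 2).mean(dim=0)
    debiasing_term = beta * debiasing_term + (1.0 - beta)

debiased_mean = mean / debiasing_term.clamp(min=1e-5)
debiased_var = (mean_sq / debiasing_term.clamp(min=1e-5) - debiased_mean ** 2).clamp(min=1e-2)
# normalize() returns (x - debiased_mean) / sqrt(debiased_var);
# denormalize() inverts it, so value targets can be learned in normalized space.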
import torch
import torch.nn as nn
"""RNN modules."""
class RNNLayer(nn.Module):
def __init__(self, inputs_dim, outputs_dim, recurrent_N, use_orthogonal):
super(RNNLayer, self).__init__()
self._recurrent_N = recurrent_N
self._use_orthogonal = use_orthogonal
self.rnn = nn.GRU(inputs_dim, outputs_dim, num_layers=self._recurrent_N)
for name, param in self.rnn.named_parameters():
if 'bias' in name:
nn.init.constant_(param, 0)
elif 'weight' in name:
if self._use_orthogonal:
nn.init.orthogonal_(param)
else:
nn.init.xavier_uniform_(param)
self.norm = nn.LayerNorm(outputs_dim)
def forward(self, x, hxs, masks):
if x.size(0) == hxs.size(0):
x, hxs = self.rnn(x.unsqueeze(0),
(hxs * masks.repeat(1, self._recurrent_N).unsqueeze(-1)).transpose(0, 1).contiguous())
x = x.squeeze(0)
hxs = hxs.transpose(0, 1)
else:
# x is a (T, N, -1) tensor that has been flattened to (T * N, -1)
N = hxs.size(0)
T = int(x.size(0) / N)
# unflatten
x = x.view(T, N, x.size(1))
# Same deal with masks
masks = masks.view(T, N)
# Let's figure out which steps in the sequence have a zero for any agent
# We will always assume t=0 has a zero in it as that makes the logic cleaner
has_zeros = ((masks[1:] == 0.0)
.any(dim=-1)
.nonzero()
.squeeze()
.cpu())
# +1 to correct the masks[1:]
if has_zeros.dim() == 0:
# Deal with scalar
has_zeros = [has_zeros.item() + 1]
else:
has_zeros = (has_zeros + 1).numpy().tolist()
# add t=0 and t=T to the list
has_zeros = [0] + has_zeros + [T]
hxs = hxs.transpose(0, 1)
outputs = []
for i in range(len(has_zeros) - 1):
# We can now process steps that don't have any zeros in masks together!
# This is much faster
start_idx = has_zeros[i]
end_idx = has_zeros[i + 1]
temp = (hxs * masks[start_idx].view(1, -1, 1).repeat(self._recurrent_N, 1, 1)).contiguous()
rnn_scores, hxs = self.rnn(x[start_idx:end_idx], temp)
outputs.append(rnn_scores)
# assert len(outputs) == T
# x is a (T, N, -1) tensor
x = torch.cat(outputs, dim=0)
# flatten
x = x.reshape(T * N, -1)
hxs = hxs.transpose(0, 1)
x = self.norm(x)
return x, hxs
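# ---------------------------------------------------------------------------
# Usage sketch (not part of the original file): the two call patterns of
# RNNLayer.forward. During rollout the batch and hidden-state sizes match;
# during training x holds a whole (T, N) sequence flattened to (T * N, -1) and
# zero masks cut the sequence at episode boundaries. Hypothetical numbers;
# assumes the package is importable as `mappo`.
# ---------------------------------------------------------------------------
import torch

from mappo.algorithms.utils.rnn import RNNLayer

hidden_size, recurrent_N, N, T = 64, 1, 8, 10
rnn = RNNLayer(hidden_size, hidden_size, recurrent_N, use_orthogonal=True)

# 1) rollout: one step for N parallel agents/envs
x = torch.randn(N, hidden_size)
hxs = torch.zeros(N, recurrent_N, hidden_size)
masks = torch.ones(N, 1)
x, hxs = rnn(x, hxs, masks)                 # (N, 64), (N, 1, 64)

# 2) training: a flattened (T, N) chunk; a zero mask resets the state at that step
x_seq = torch.randn(T * N, hidden_size)
masks_seq = torch.ones(T * N, 1)
masks_seq[3 * N] = 0.0                      # episode boundary for the first env at t=3
out, hxs = rnn(x_seq, hxs, masks_seq)       # (T * N, 64), (N, 1, 64)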
import copy
import numpy as np
import torch
import torch.nn as nn
def init(module, weight_init, bias_init, gain=1):
weight_init(module.weight.data, gain=gain)
bias_init(module.bias.data)
return module
def get_clones(module, N):
return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
def check(input):
output = torch.from_numpy(input) if type(input) == np.ndarray else input
return output
import socket
from absl import flags
FLAGS = flags.FLAGS
FLAGS(['train_sc.py'])
import time
import wandb
import os
import numpy as np
from itertools import chain
import torch
from tensorboardX import SummaryWriter
from mappo.utils.separated_buffer import SeparatedReplayBuffer
from mappo.utils.util import update_linear_schedule
def _t2n(x):
return x.detach().cpu().numpy()
class Runner(object):
def __init__(self, config):
self.all_args = config['all_args']
self.envs = config['envs']
self.eval_envs = config['eval_envs']
self.device = config['device']
self.num_agents = config['num_agents']
# parameters
self.env_name = self.all_args.env_name
self.algorithm_name = self.all_args.algorithm_name
self.experiment_name = self.all_args.experiment_name
self.use_centralized_V = self.all_args.use_centralized_V
self.use_obs_instead_of_state = self.all_args.use_obs_instead_of_state
self.num_env_steps = self.all_args.num_env_steps
self.episode_length = self.all_args.episode_length
self.n_rollout_threads = self.all_args.n_rollout_threads
self.n_eval_rollout_threads = self.all_args.n_eval_rollout_threads
self.use_linear_lr_decay = self.all_args.use_linear_lr_decay
self.hidden_size = self.all_args.hidden_size
self.use_wandb = self.all_args.use_wandb
self.use_render = self.all_args.use_render
self.recurrent_N = self.all_args.recurrent_N
# interval
self.save_interval = self.all_args.save_interval
self.use_eval = self.all_args.use_eval
self.eval_interval = self.all_args.eval_interval
self.log_interval = self.all_args.log_interval
# dir
self.model_dir = self.all_args.model_dir
if self.use_render:
import imageio
self.run_dir = config["run_dir"]
self.gif_dir = str(self.run_dir / 'gifs')
if not os.path.exists(self.gif_dir):
os.makedirs(self.gif_dir)
else:
if self.use_wandb:
self.save_dir = str(wandb.run.dir)
else:
self.run_dir = config["run_dir"]
self.log_dir = str(self.run_dir / 'logs')
if not os.path.exists(self.log_dir):
os.makedirs(self.log_dir)
self.writter = SummaryWriter(self.log_dir)
self.save_dir = str(self.run_dir / 'models')
if not os.path.exists(self.save_dir):
os.makedirs(self.save_dir)
from mappo.algorithms.algorithm.r_mappo import RMAPPO as TrainAlgo
from mappo.algorithms.algorithm.rMAPPOPolicy import RMAPPOPolicy as Policy
self.policy = []
for agent_id in range(self.num_agents):
share_observation_space = self.envs.share_observation_space[agent_id] if self.use_centralized_V else self.envs.observation_space[agent_id]
# policy network
po = Policy(self.all_args,
self.envs.observation_space[agent_id],
share_observation_space,
self.envs.action_space[agent_id],
device = self.device)
self.policy.append(po)
if self.model_dir is not None:
self.restore()
self.trainer = []
self.buffer = []
for agent_id in range(self.num_agents):
# algorithm
tr = TrainAlgo(self.all_args, self.policy[agent_id], device = self.device)
# buffer
share_observation_space = self.envs.share_observation_space[agent_id] if self.use_centralized_V else self.envs.observation_space[agent_id]
bu = SeparatedReplayBuffer(self.all_args,
self.envs.observation_space[agent_id],
share_observation_space,
self.envs.action_space[agent_id])
self.buffer.append(bu)
self.trainer.append(tr)
def run(self):
raise NotImplementedError
def warmup(self):
raise NotImplementedError
def collect(self, step):
raise NotImplementedError
def insert(self, data):
raise NotImplementedError
@torch.no_grad()
def compute(self):
for agent_id in range(self.num_agents):
self.trainer[agent_id].prep_rollout()
next_value = self.trainer[agent_id].policy.get_values(self.buffer[agent_id].share_obs[-1],
self.buffer[agent_id].rnn_states_critic[-1],
self.buffer[agent_id].masks[-1])
next_value = _t2n(next_value)
self.buffer[agent_id].compute_returns(next_value, self.trainer[agent_id].value_normalizer)
def train(self):
train_infos = []
for agent_id in range(self.num_agents):
self.trainer[agent_id].prep_training()
train_info = self.trainer[agent_id].train(self.buffer[agent_id])
train_infos.append(train_info)
self.buffer[agent_id].after_update()
return train_infos
def save(self):
for agent_id in range(self.num_agents):
policy_actor = self.trainer[agent_id].policy.actor
torch.save(policy_actor.state_dict(), str(self.save_dir) + "/actor_agent" + str(agent_id) + ".pt")
policy_critic = self.trainer[agent_id].policy.critic
torch.save(policy_critic.state_dict(), str(self.save_dir) + "/critic_agent" + str(agent_id) + ".pt")
def restore(self):
for agent_id in range(self.num_agents):
policy_actor_state_dict = torch.load(str(self.model_dir) + '/actor_agent' + str(agent_id) + '.pt')
self.policy[agent_id].actor.load_state_dict(policy_actor_state_dict)
policy_critic_state_dict = torch.load(str(self.model_dir) + '/critic_agent' + str(agent_id) + '.pt')
self.policy[agent_id].critic.load_state_dict(policy_critic_state_dict)
def log_train(self, train_infos, total_num_steps):
for agent_id in range(self.num_agents):
for k, v in train_infos[agent_id].items():
agent_k = "agent%i/" % agent_id + k
if self.use_wandb:
wandb.log({agent_k: v}, step=total_num_steps)
else:
self.writter.add_scalars(agent_k, {agent_k: v}, total_num_steps)
def log_env(self, env_infos, total_num_steps):
for k, v in env_infos.items():
if len(v) > 0:
if self.use_wandb:
wandb.log({k: np.mean(v)}, step=total_num_steps)
else:
self.writter.add_scalars(k, {k: np.mean(v)}, total_num_steps)
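# ---------------------------------------------------------------------------
# Interface sketch (not part of the original file): the config dict this base
# Runner reads in __init__, and the per-agent attributes it expects on the
# vectorized env object. Placeholder values only; in the real scripts these
# come from get_config() and the env wrappers.
# ---------------------------------------------------------------------------
config = {
    "all_args": None,     # argparse.Namespace produced by mappo.config.get_config()
    "envs": None,         # vec env exposing observation_space / share_observation_space /
                          # action_space as per-agent lists (indexed by agent_id above)
    "eval_envs": None,    # evaluation vec env, unused when use_eval is False
    "device": None,       # torch.device("cpu") or torch.device("cuda:0")
    "num_agents": 2,      # number of agents, one policy/trainer/buffer each
    "run_dir": None,      # pathlib.Path used for the logs/, models/ and gifs/ subdirs
}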
import wandb
import os
import numpy as np
import torch
from tensorboardX import SummaryWriter
from mappo.utils.shared_buffer import SharedReplayBuffer
def _t2n(x):
"""Convert torch tensor to a numpy array."""
return x.detach().cpu().numpy()
class Runner(object):
"""
Base class for training recurrent policies.
:param config: (dict) Config dictionary containing parameters for training.
"""
def __init__(self, config):
self.all_args = config['all_args']
self.envs = config['envs']
self.eval_envs = config['eval_envs']
self.device = config['device']
self.num_agents = config['num_agents']
if config.__contains__("render_envs"):
self.render_envs = config['render_envs']
# parameters
self.env_name = self.all_args.env_name
self.algorithm_name = self.all_args.algorithm_name
self.experiment_name = self.all_args.experiment_name
self.use_centralized_V = self.all_args.use_centralized_V
self.use_obs_instead_of_state = self.all_args.use_obs_instead_of_state
self.num_env_steps = self.all_args.num_env_steps
self.episode_length = self.all_args.episode_length
self.n_rollout_threads = self.all_args.n_rollout_threads
self.n_eval_rollout_threads = self.all_args.n_eval_rollout_threads
self.n_render_rollout_threads = self.all_args.n_render_rollout_threads
self.use_linear_lr_decay = self.all_args.use_linear_lr_decay
self.hidden_size = self.all_args.hidden_size
self.use_wandb = self.all_args.use_wandb
self.use_render = self.all_args.use_render
self.recurrent_N = self.all_args.recurrent_N
# interval
self.save_interval = self.all_args.save_interval
self.use_eval = self.all_args.use_eval
self.eval_interval = self.all_args.eval_interval
self.log_interval = self.all_args.log_interval
# dir
self.model_dir = self.all_args.model_dir
if self.use_wandb:
self.save_dir = str(wandb.run.dir)
self.run_dir = str(wandb.run.dir)
else:
self.run_dir = config["run_dir"]
self.log_dir = str(self.run_dir / 'logs')
if not os.path.exists(self.log_dir):
os.makedirs(self.log_dir)
self.writter = SummaryWriter(self.log_dir)
self.save_dir = str(self.run_dir / 'models')
if not os.path.exists(self.save_dir):
os.makedirs(self.save_dir)
from mappo.algorithms.algorithm.r_mappo import RMAPPO as TrainAlgo
from mappo.algorithms.algorithm.rMAPPOPolicy import RMAPPOPolicy as Policy
share_observation_space = self.envs.share_observation_space[0] if self.use_centralized_V else self.envs.observation_space[0]
# policy network
self.policy = Policy(self.all_args,
self.envs.observation_space[0],
share_observation_space,
self.envs.action_space[0],
device = self.device)
if self.model_dir is not None:
self.restore()
# algorithm
self.trainer = TrainAlgo(self.all_args, self.policy, device = self.device)
# buffer
self.buffer = SharedReplayBuffer(self.all_args,
self.num_agents,
self.envs.observation_space[0],
share_observation_space,
self.envs.action_space[0])
def run(self):
"""Collect training data, perform training updates, and evaluate policy."""
raise NotImplementedError
def warmup(self):
"""Collect warmup pre-training data."""
raise NotImplementedError
def collect(self, step):
"""Collect rollouts for training."""
raise NotImplementedError
def insert(self, data):
"""
Insert data into buffer.
:param data: (Tuple) data to insert into training buffer.
"""
raise NotImplementedError
@torch.no_grad()
def compute(self):
"""Calculate returns for the collected data."""
self.trainer.prep_rollout()
next_values = self.trainer.policy.get_values(np.concatenate(self.buffer.share_obs[-1]),
np.concatenate(self.buffer.rnn_states_critic[-1]),
np.concatenate(self.buffer.masks[-1]))
next_values = np.array(np.split(_t2n(next_values), self.n_rollout_threads))
self.buffer.compute_returns(next_values, self.trainer.value_normalizer)
def train(self):
"""Train policies with data in buffer. """
self.trainer.prep_training()
train_infos = self.trainer.train(self.buffer)
self.buffer.after_update()
return train_infos
def save(self):
"""Save policy's actor and critic networks."""
policy_actor = self.trainer.policy.actor
torch.save(policy_actor.state_dict(), str(self.save_dir) + "/actor.pt")
policy_critic = self.trainer.policy.critic
torch.save(policy_critic.state_dict(), str(self.save_dir) + "/critic.pt")
def restore(self):
"""Restore policy's networks from a saved model."""
policy_actor_state_dict = torch.load(str(self.model_dir) + '/actor.pt')
self.policy.actor.load_state_dict(policy_actor_state_dict)
if not self.all_args.use_render:
policy_critic_state_dict = torch.load(str(self.model_dir) + '/critic.pt')
self.policy.critic.load_state_dict(policy_critic_state_dict)
def log_train(self, train_infos, total_num_steps):
"""
Log training info.
:param train_infos: (dict) information about training update.
:param total_num_steps: (int) total number of training env steps.
"""
for k, v in train_infos.items():
if self.use_wandb:
wandb.log({k: v}, step=total_num_steps)
else:
self.writter.add_scalars(k, {k: v}, total_num_steps)
def log_env(self, env_infos, total_num_steps):
"""
Log env info.
:param env_infos: (dict) information about env state.
:param total_num_steps: (int) total number of training env steps.
"""
for k, v in env_infos.items():
if len(v)>0:
if self.use_wandb:
wandb.log({k: np.mean(v)}, step=total_num_steps)
else:
self.writter.add_scalars(k, {k: np.mean(v)}, total_num_steps)
#!/usr/bin/env python
import sys
import os
import wandb
import socket
import setproctitle
import numpy as np
from pathlib import Path
import torch
from mappo.config import get_config
from mappo.envs.mpe.MPE_env import MPEEnv
from mappo.envs.env_wrappers import SubprocVecEnv, DummyVecEnv
def make_render_env(all_args):
def get_env_fn(rank):
def init_env():
if all_args.env_name == "MPE":
env = MPEEnv(all_args)
else:
print("Can not support the " +
all_args.env_name + "environment.")
raise NotImplementedError
env.seed(all_args.seed + rank * 1000)
return env
return init_env
if all_args.n_rollout_threads == 1:
return DummyVecEnv([get_env_fn(0)])
else:
return SubprocVecEnv([get_env_fn(i) for i in range(all_args.n_rollout_threads)])
def parse_args(args, parser):
parser.add_argument('--scenario_name', type=str,
default='simple_spread', help="Which scenario to run on")
parser.add_argument("--num_landmarks", type=int, default=3)
parser.add_argument('--num_agents', type=int,
default=2, help="number of players")
all_args = parser.parse_known_args(args)[0]
return all_args
def main(args):
parser = get_config()
all_args = parse_args(args, parser)
if all_args.algorithm_name == "rmappo" or all_args.algorithm_name == "rmappg":
assert (
all_args.use_recurrent_policy or all_args.use_naive_recurrent_policy), ("check recurrent policy!")
elif all_args.algorithm_name == "mappo" or all_args.algorithm_name == "mappg":
assert (all_args.use_recurrent_policy and all_args.use_naive_recurrent_policy) == False, (
"check recurrent policy!")
else:
raise NotImplementedError
assert (all_args.share_policy == True and all_args.scenario_name == 'simple_speaker_listener') == False, (
"The simple_speaker_listener scenario cannot use a shared policy. Please check config.py.")
assert all_args.use_render, ("you need to set use_render to True")
assert not (all_args.model_dir == None or all_args.model_dir == ""), ("set model_dir first")
assert all_args.n_rollout_threads == 1, ("rendering only supports a single rollout env")
# cuda
if all_args.cuda and torch.cuda.is_available():
print("choose to use gpu...")
device = torch.device("cuda:0")
torch.set_num_threads(all_args.n_training_threads)
if all_args.cuda_deterministic:
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
else:
print("choose to use cpu...")
device = torch.device("cpu")
torch.set_num_threads(all_args.n_training_threads)
# run dir
run_dir = Path(os.path.split(os.path.dirname(os.path.abspath(__file__)))[0] + "/results") / all_args.env_name / all_args.scenario_name / all_args.algorithm_name / all_args.experiment_name
if not run_dir.exists():
os.makedirs(str(run_dir))
if not run_dir.exists():
curr_run = 'run1'
else:
exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in run_dir.iterdir() if str(folder.name).startswith('run')]
if len(exst_run_nums) == 0:
curr_run = 'run1'
else:
curr_run = 'run%i' % (max(exst_run_nums) + 1)
run_dir = run_dir / curr_run
if not run_dir.exists():
os.makedirs(str(run_dir))
setproctitle.setproctitle(str(all_args.algorithm_name) + "-" + \
str(all_args.env_name) + "-" + str(all_args.experiment_name) + "@" + str(all_args.user_name))
# seed
torch.manual_seed(all_args.seed)
torch.cuda.manual_seed_all(all_args.seed)
np.random.seed(all_args.seed)
# env init
envs = make_render_env(all_args)
eval_envs = None
num_agents = all_args.num_agents
config = {
"all_args": all_args,
"envs": envs,
"eval_envs": eval_envs,
"num_agents": num_agents,
"device": device,
"run_dir": run_dir
}
# run experiments
if all_args.share_policy:
from mappo.runner.shared.mpe_runner import MPERunner as Runner
else:
from mappo.runner.separated.mpe_runner import MPERunner as Runner
runner = Runner(config)
runner.render()
# post process
envs.close()
if __name__ == "__main__":
main(sys.argv[1:])