Commit 6562b854 authored by hezhiqiang01

initial commit

parent d4b028ab
from mappo import algorithms, envs, runner, scripts, utils, config
__version__ = "0.1.0"
__all__ = [
"algorithms",
"envs",
"runner",
"scripts",
"utils",
"config",
]
"""
# @Time : 2021/7/1 6:49 PM
# @Author : hezhiqiang01
# @Email : hezhiqiang01@baidu.com
# @File : __init__.py
"""
"""
# @Time : 2021/7/1 6:53 PM
# @Author : hezhiqiang01
# @Email : hezhiqiang01@baidu.com
# @File : rMAPPOPolicy.py
"""
import torch
from mappo.algorithms.algorithm.r_actor_critic import R_Actor, R_Critic
from mappo.utils.util import update_linear_schedule
class RMAPPOPolicy:
"""
MAPPO Policy class. Wraps actor and critic networks to compute actions and value function predictions.
:param args: (argparse.Namespace) arguments containing relevant model and policy information.
:param obs_space: (gym.Space) observation space.
:param cent_obs_space: (gym.Space) value function input space (centralized input for MAPPO, decentralized for IPPO).
:param action_space: (gym.Space) action space.
:param device: (torch.device) specifies the device to run on (cpu/gpu).
"""
def __init__(self, args, obs_space, cent_obs_space, act_space, device=torch.device("cpu")):
self.device = device
self.lr = args.lr
self.critic_lr = args.critic_lr
self.opti_eps = args.opti_eps
self.weight_decay = args.weight_decay
self.obs_space = obs_space
self.share_obs_space = cent_obs_space
self.act_space = act_space
self.actor = R_Actor(args, self.obs_space, self.act_space, self.device)
self.critic = R_Critic(args, self.share_obs_space, self.device)
self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
lr=self.lr, eps=self.opti_eps,
weight_decay=self.weight_decay)
self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
lr=self.critic_lr,
eps=self.opti_eps,
weight_decay=self.weight_decay)
def lr_decay(self, episode, episodes):
"""
Decay the actor and critic learning rates.
:param episode: (int) current training episode.
:param episodes: (int) total number of training episodes.
"""
update_linear_schedule(self.actor_optimizer, episode, episodes, self.lr)
update_linear_schedule(self.critic_optimizer, episode, episodes, self.critic_lr)
def get_actions(self, cent_obs, obs, rnn_states_actor, rnn_states_critic, masks, available_actions=None,
deterministic=False):
"""
Compute actions and value function predictions for the given inputs.
:param cent_obs (np.ndarray): centralized input to the critic.
:param obs (np.ndarray): local agent inputs to the actor.
:param rnn_states_actor: (np.ndarray) if actor is RNN, RNN states for actor.
:param rnn_states_critic: (np.ndarray) if critic is RNN, RNN states for critic.
:param masks: (np.ndarray) denotes points at which RNN states should be reset.
:param available_actions: (np.ndarray) denotes which actions are available to agent
(if None, all actions available)
        :param deterministic: (bool) whether the action should be the mode of the distribution or sampled from it.
:return values: (torch.Tensor) value function predictions.
:return actions: (torch.Tensor) actions to take.
:return action_log_probs: (torch.Tensor) log probabilities of chosen actions.
:return rnn_states_actor: (torch.Tensor) updated actor network RNN states.
:return rnn_states_critic: (torch.Tensor) updated critic network RNN states.
"""
actions, action_log_probs, rnn_states_actor = self.actor(obs,
rnn_states_actor,
masks,
available_actions,
deterministic)
values, rnn_states_critic = self.critic(cent_obs, rnn_states_critic, masks)
return values, actions, action_log_probs, rnn_states_actor, rnn_states_critic
def get_values(self, cent_obs, rnn_states_critic, masks):
"""
Get value function predictions.
:param cent_obs (np.ndarray): centralized input to the critic.
:param rnn_states_critic: (np.ndarray) if critic is RNN, RNN states for critic.
:param masks: (np.ndarray) denotes points at which RNN states should be reset.
:return values: (torch.Tensor) value function predictions.
"""
values, _ = self.critic(cent_obs, rnn_states_critic, masks)
return values
def evaluate_actions(self, cent_obs, obs, rnn_states_actor, rnn_states_critic, action, masks,
available_actions=None, active_masks=None):
"""
Get action logprobs / entropy and value function predictions for actor update.
:param cent_obs (np.ndarray): centralized input to the critic.
:param obs (np.ndarray): local agent inputs to the actor.
:param rnn_states_actor: (np.ndarray) if actor is RNN, RNN states for actor.
:param rnn_states_critic: (np.ndarray) if critic is RNN, RNN states for critic.
        :param action: (np.ndarray) actions whose log probabilities and entropy to compute.
:param masks: (np.ndarray) denotes points at which RNN states should be reset.
:param available_actions: (np.ndarray) denotes which actions are available to agent
(if None, all actions available)
:param active_masks: (torch.Tensor) denotes whether an agent is active or dead.
:return values: (torch.Tensor) value function predictions.
:return action_log_probs: (torch.Tensor) log probabilities of the input actions.
:return dist_entropy: (torch.Tensor) action distribution entropy for the given inputs.
"""
action_log_probs, dist_entropy = self.actor.evaluate_actions(obs,
rnn_states_actor,
action,
masks,
available_actions,
active_masks)
values, _ = self.critic(cent_obs, rnn_states_critic, masks)
return values, action_log_probs, dist_entropy
def act(self, obs, rnn_states_actor, masks, available_actions=None, deterministic=False):
"""
Compute actions using the given inputs.
:param obs (np.ndarray): local agent inputs to the actor.
:param rnn_states_actor: (np.ndarray) if actor is RNN, RNN states for actor.
:param masks: (np.ndarray) denotes points at which RNN states should be reset.
:param available_actions: (np.ndarray) denotes which actions are available to agent
(if None, all actions available)
        :param deterministic: (bool) whether the action should be the mode of the distribution or sampled from it.
"""
actions, _, rnn_states_actor = self.actor(obs, rnn_states_actor, masks, available_actions, deterministic)
return actions, rnn_states_actor
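# Usage sketch (illustrative only): build a policy from the default hyperparameters in
# mappo.config.get_config (assumed import path) and query actions for a toy setup with
# 14-dim local observations, 28-dim centralized observations and 5 discrete actions.
# The shapes below are assumptions for the sketch, not requirements of the class.
if __name__ == "__main__":
    import numpy as np
    from gym import spaces
    from mappo.config import get_config

    example_args = get_config().parse_args([])
    policy = RMAPPOPolicy(example_args,
                          spaces.Box(-np.inf, np.inf, (14,), np.float32),
                          spaces.Box(-np.inf, np.inf, (28,), np.float32),
                          spaces.Discrete(5))
    batch = 8
    rnn_actor = np.zeros((batch, example_args.recurrent_N, example_args.hidden_size), dtype=np.float32)
    rnn_critic = np.zeros_like(rnn_actor)
    masks = np.ones((batch, 1), dtype=np.float32)
    values, actions, log_probs, rnn_actor, rnn_critic = policy.get_actions(
        np.random.rand(batch, 28).astype(np.float32),  # centralized critic input
        np.random.rand(batch, 14).astype(np.float32),  # local actor input
        rnn_actor, rnn_critic, masks)
    # values, actions and log_probs are torch tensors of shape (batch, 1)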
"""
# @Time : 2021/7/1 6:53 PM
# @Author : hezhiqiang01
# @Email : hezhiqiang01@baidu.com
# @File : r_actor_critic.py
"""
import torch
import torch.nn as nn
from mappo.algorithms.utils.util import init, check
from mappo.algorithms.utils.cnn import CNNBase
from mappo.algorithms.utils.mlp import MLPBase
from mappo.algorithms.utils.rnn import RNNLayer
from mappo.algorithms.utils.act import ACTLayer
from mappo.algorithms.utils.popart import PopArt
from mappo.utils.util import get_shape_from_obs_space
class R_Actor(nn.Module):
"""
Actor network class for MAPPO. Outputs actions given observations.
:param args: (argparse.Namespace) arguments containing relevant model information.
:param obs_space: (gym.Space) observation space.
:param action_space: (gym.Space) action space.
:param device: (torch.device) specifies the device to run on (cpu/gpu).
"""
def __init__(self, args, obs_space, action_space, device=torch.device("cpu")):
super(R_Actor, self).__init__()
self.hidden_size = args.hidden_size
self._gain = args.gain
self._use_orthogonal = args.use_orthogonal
self._use_policy_active_masks = args.use_policy_active_masks
self._use_naive_recurrent_policy = args.use_naive_recurrent_policy
self._use_recurrent_policy = args.use_recurrent_policy
self._recurrent_N = args.recurrent_N
self.tpdv = dict(dtype=torch.float32, device=device)
obs_shape = get_shape_from_obs_space(obs_space)
base = CNNBase if len(obs_shape) == 3 else MLPBase
self.base = base(args, obs_shape)
if self._use_naive_recurrent_policy or self._use_recurrent_policy:
self.rnn = RNNLayer(self.hidden_size, self.hidden_size, self._recurrent_N, self._use_orthogonal)
self.act = ACTLayer(action_space, self.hidden_size, self._use_orthogonal, self._gain)
self.to(device)
def forward(self, obs, rnn_states, masks, available_actions=None, deterministic=False):
"""
Compute actions from the given inputs.
:param obs: (np.ndarray / torch.Tensor) observation inputs into network.
:param rnn_states: (np.ndarray / torch.Tensor) if RNN network, hidden states for RNN.
:param masks: (np.ndarray / torch.Tensor) mask tensor denoting if hidden states should be reinitialized to zeros.
:param available_actions: (np.ndarray / torch.Tensor) denotes which actions are available to agent
(if None, all actions available)
:param deterministic: (bool) whether to sample from action distribution or return the mode.
:return actions: (torch.Tensor) actions to take.
:return action_log_probs: (torch.Tensor) log probabilities of taken actions.
:return rnn_states: (torch.Tensor) updated RNN hidden states.
"""
obs = check(obs).to(**self.tpdv)
rnn_states = check(rnn_states).to(**self.tpdv)
masks = check(masks).to(**self.tpdv)
if available_actions is not None:
available_actions = check(available_actions).to(**self.tpdv)
actor_features = self.base(obs)
if self._use_naive_recurrent_policy or self._use_recurrent_policy:
actor_features, rnn_states = self.rnn(actor_features, rnn_states, masks)
actions, action_log_probs = self.act(actor_features, available_actions, deterministic)
return actions, action_log_probs, rnn_states
def evaluate_actions(self, obs, rnn_states, action, masks, available_actions=None, active_masks=None):
"""
Compute log probability and entropy of given actions.
:param obs: (torch.Tensor) observation inputs into network.
:param action: (torch.Tensor) actions whose entropy and log probability to evaluate.
:param rnn_states: (torch.Tensor) if RNN network, hidden states for RNN.
:param masks: (torch.Tensor) mask tensor denoting if hidden states should be reinitialized to zeros.
:param available_actions: (torch.Tensor) denotes which actions are available to agent
(if None, all actions available)
:param active_masks: (torch.Tensor) denotes whether an agent is active or dead.
:return action_log_probs: (torch.Tensor) log probabilities of the input actions.
:return dist_entropy: (torch.Tensor) action distribution entropy for the given inputs.
"""
obs = check(obs).to(**self.tpdv)
rnn_states = check(rnn_states).to(**self.tpdv)
action = check(action).to(**self.tpdv)
masks = check(masks).to(**self.tpdv)
if available_actions is not None:
available_actions = check(available_actions).to(**self.tpdv)
if active_masks is not None:
active_masks = check(active_masks).to(**self.tpdv)
actor_features = self.base(obs)
if self._use_naive_recurrent_policy or self._use_recurrent_policy:
actor_features, rnn_states = self.rnn(actor_features, rnn_states, masks)
action_log_probs, dist_entropy = self.act.evaluate_actions(actor_features,
action, available_actions,
active_masks=
active_masks if self._use_policy_active_masks
else None)
return action_log_probs, dist_entropy
class R_Critic(nn.Module):
"""
Critic network class for MAPPO. Outputs value function predictions given centralized input (MAPPO) or
local observations (IPPO).
:param args: (argparse.Namespace) arguments containing relevant model information.
:param cent_obs_space: (gym.Space) (centralized) observation space.
:param device: (torch.device) specifies the device to run on (cpu/gpu).
"""
def __init__(self, args, cent_obs_space, device=torch.device("cpu")):
super(R_Critic, self).__init__()
self.hidden_size = args.hidden_size
self._use_orthogonal = args.use_orthogonal
self._use_naive_recurrent_policy = args.use_naive_recurrent_policy
self._use_recurrent_policy = args.use_recurrent_policy
self._recurrent_N = args.recurrent_N
self._use_popart = args.use_popart
self.tpdv = dict(dtype=torch.float32, device=device)
init_method = [nn.init.xavier_uniform_, nn.init.orthogonal_][self._use_orthogonal]
cent_obs_shape = get_shape_from_obs_space(cent_obs_space)
base = CNNBase if len(cent_obs_shape) == 3 else MLPBase
self.base = base(args, cent_obs_shape)
if self._use_naive_recurrent_policy or self._use_recurrent_policy:
self.rnn = RNNLayer(self.hidden_size, self.hidden_size, self._recurrent_N, self._use_orthogonal)
def init_(m):
return init(m, init_method, lambda x: nn.init.constant_(x, 0))
if self._use_popart:
self.v_out = init_(PopArt(self.hidden_size, 1, device=device))
else:
self.v_out = init_(nn.Linear(self.hidden_size, 1))
self.to(device)
def forward(self, cent_obs, rnn_states, masks):
"""
        Compute value function predictions from the given inputs.
:param cent_obs: (np.ndarray / torch.Tensor) observation inputs into network.
:param rnn_states: (np.ndarray / torch.Tensor) if RNN network, hidden states for RNN.
:param masks: (np.ndarray / torch.Tensor) mask tensor denoting if RNN states should be reinitialized to zeros.
:return values: (torch.Tensor) value function predictions.
:return rnn_states: (torch.Tensor) updated RNN hidden states.
"""
cent_obs = check(cent_obs).to(**self.tpdv)
rnn_states = check(rnn_states).to(**self.tpdv)
masks = check(masks).to(**self.tpdv)
critic_features = self.base(cent_obs)
if self._use_naive_recurrent_policy or self._use_recurrent_policy:
critic_features, rnn_states = self.rnn(critic_features, rnn_states, masks)
values = self.v_out(critic_features)
return values, rnn_states
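# Minimal shape check (illustrative only; assumes the default hyperparameters from
# mappo.config.get_config and flat observation spaces of dimension 14 / 28).
if __name__ == "__main__":
    import numpy as np
    from gym import spaces
    from mappo.config import get_config

    example_args = get_config().parse_args([])
    actor = R_Actor(example_args, spaces.Box(-np.inf, np.inf, (14,), np.float32), spaces.Discrete(5))
    critic = R_Critic(example_args, spaces.Box(-np.inf, np.inf, (28,), np.float32))
    rnn_states = np.zeros((4, example_args.recurrent_N, example_args.hidden_size), dtype=np.float32)
    masks = np.ones((4, 1), dtype=np.float32)
    actions, log_probs, _ = actor(np.random.rand(4, 14).astype(np.float32), rnn_states, masks)
    values, _ = critic(np.random.rand(4, 28).astype(np.float32), rnn_states.copy(), masks)
    # actions: (4, 1) discrete indices; log_probs: (4, 1); values: (4, 1)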
"""
# @Time : 2021/7/1 6:52 PM
# @Author : hezhiqiang01
# @Email : hezhiqiang01@baidu.com
# @File : r_mappo.py
"""
import numpy as np
import torch
import torch.nn as nn
from mappo.utils.util import get_gard_norm, huber_loss, mse_loss
from mappo.utils.valuenorm import ValueNorm
from mappo.algorithms.utils.util import check
class RMAPPO():
"""
Trainer class for MAPPO to update policies.
:param args: (argparse.Namespace) arguments containing relevant model, policy, and env information.
:param policy: (R_MAPPO_Policy) policy to update.
:param device: (torch.device) specifies the device to run on (cpu/gpu).
"""
def __init__(self,
args,
policy,
device=torch.device("cpu")):
self.device = device
self.tpdv = dict(dtype=torch.float32, device=device)
self.policy = policy
self.clip_param = args.clip_param
self.ppo_epoch = args.ppo_epoch
self.num_mini_batch = args.num_mini_batch
self.data_chunk_length = args.data_chunk_length
self.value_loss_coef = args.value_loss_coef
self.entropy_coef = args.entropy_coef
self.max_grad_norm = args.max_grad_norm
self.huber_delta = args.huber_delta
self._use_recurrent_policy = args.use_recurrent_policy
self._use_naive_recurrent = args.use_naive_recurrent_policy
self._use_max_grad_norm = args.use_max_grad_norm
self._use_clipped_value_loss = args.use_clipped_value_loss
self._use_huber_loss = args.use_huber_loss
self._use_popart = args.use_popart
self._use_valuenorm = args.use_valuenorm
self._use_value_active_masks = args.use_value_active_masks
self._use_policy_active_masks = args.use_policy_active_masks
assert (self._use_popart and self._use_valuenorm) == False, (
"self._use_popart and self._use_valuenorm can not be set True simultaneously")
if self._use_popart:
self.value_normalizer = self.policy.critic.v_out
elif self._use_valuenorm:
self.value_normalizer = ValueNorm(1, device=self.device)
else:
self.value_normalizer = None
def cal_value_loss(self, values, value_preds_batch, return_batch, active_masks_batch):
"""
Calculate value function loss.
:param values: (torch.Tensor) value function predictions.
:param value_preds_batch: (torch.Tensor) "old" value predictions from data batch (used for value clip loss)
:param return_batch: (torch.Tensor) reward to go returns.
        :param active_masks_batch: (torch.Tensor) denotes if agent is active or dead at a given timestep.
:return value_loss: (torch.Tensor) value function loss.
"""
value_pred_clipped = value_preds_batch + (values - value_preds_batch).clamp(-self.clip_param,
self.clip_param)
if self._use_popart or self._use_valuenorm:
self.value_normalizer.update(return_batch)
error_clipped = self.value_normalizer.normalize(return_batch) - value_pred_clipped
error_original = self.value_normalizer.normalize(return_batch) - values
else:
error_clipped = return_batch - value_pred_clipped
error_original = return_batch - values
if self._use_huber_loss:
value_loss_clipped = huber_loss(error_clipped, self.huber_delta)
value_loss_original = huber_loss(error_original, self.huber_delta)
else:
value_loss_clipped = mse_loss(error_clipped)
value_loss_original = mse_loss(error_original)
if self._use_clipped_value_loss:
value_loss = torch.max(value_loss_original, value_loss_clipped)
else:
value_loss = value_loss_original
if self._use_value_active_masks:
value_loss = (value_loss * active_masks_batch).sum() / active_masks_batch.sum()
else:
value_loss = value_loss.mean()
return value_loss
def ppo_update(self, sample, update_actor=True):
"""
Update actor and critic networks.
:param sample: (Tuple) contains data batch with which to update networks.
        :param update_actor: (bool) whether to update the actor network.
:return value_loss: (torch.Tensor) value function loss.
        :return critic_grad_norm: (torch.Tensor) gradient norm from critic update.
        :return policy_loss: (torch.Tensor) actor (policy) loss value.
:return dist_entropy: (torch.Tensor) action entropies.
:return actor_grad_norm: (torch.Tensor) gradient norm from actor update.
:return imp_weights: (torch.Tensor) importance sampling weights.
"""
share_obs_batch, obs_batch, rnn_states_batch, rnn_states_critic_batch, actions_batch, \
value_preds_batch, return_batch, masks_batch, active_masks_batch, old_action_log_probs_batch, \
adv_targ, available_actions_batch = sample
old_action_log_probs_batch = check(old_action_log_probs_batch).to(**self.tpdv)
adv_targ = check(adv_targ).to(**self.tpdv)
value_preds_batch = check(value_preds_batch).to(**self.tpdv)
return_batch = check(return_batch).to(**self.tpdv)
active_masks_batch = check(active_masks_batch).to(**self.tpdv)
# Reshape to do in a single forward pass for all steps
values, action_log_probs, dist_entropy = self.policy.evaluate_actions(share_obs_batch,
obs_batch,
rnn_states_batch,
rnn_states_critic_batch,
actions_batch,
masks_batch,
available_actions_batch,
active_masks_batch)
# actor update
imp_weights = torch.exp(action_log_probs - old_action_log_probs_batch)
surr1 = imp_weights * adv_targ
surr2 = torch.clamp(imp_weights, 1.0 - self.clip_param, 1.0 + self.clip_param) * adv_targ
if self._use_policy_active_masks:
policy_action_loss = (-torch.sum(torch.min(surr1, surr2),
dim=-1,
keepdim=True) * active_masks_batch).sum() / active_masks_batch.sum()
else:
policy_action_loss = -torch.sum(torch.min(surr1, surr2), dim=-1, keepdim=True).mean()
policy_loss = policy_action_loss
self.policy.actor_optimizer.zero_grad()
if update_actor:
(policy_loss - dist_entropy * self.entropy_coef).backward()
if self._use_max_grad_norm:
actor_grad_norm = nn.utils.clip_grad_norm_(self.policy.actor.parameters(), self.max_grad_norm)
else:
actor_grad_norm = get_gard_norm(self.policy.actor.parameters())
self.policy.actor_optimizer.step()
# critic update
value_loss = self.cal_value_loss(values, value_preds_batch, return_batch, active_masks_batch)
self.policy.critic_optimizer.zero_grad()
(value_loss * self.value_loss_coef).backward()
if self._use_max_grad_norm:
critic_grad_norm = nn.utils.clip_grad_norm_(self.policy.critic.parameters(), self.max_grad_norm)
else:
critic_grad_norm = get_gard_norm(self.policy.critic.parameters())
self.policy.critic_optimizer.step()
return value_loss, critic_grad_norm, policy_loss, dist_entropy, actor_grad_norm, imp_weights
def train(self, buffer, update_actor=True):
"""
Perform a training update using minibatch GD.
:param buffer: (SharedReplayBuffer) buffer containing training data.
:param update_actor: (bool) whether to update actor network.
:return train_info: (dict) contains information regarding training update (e.g. loss, grad norms, etc).
"""
if self._use_popart or self._use_valuenorm:
advantages = buffer.returns[:-1] - self.value_normalizer.denormalize(buffer.value_preds[:-1])
else:
advantages = buffer.returns[:-1] - buffer.value_preds[:-1]
advantages_copy = advantages.copy()
advantages_copy[buffer.active_masks[:-1] == 0.0] = np.nan
mean_advantages = np.nanmean(advantages_copy)
std_advantages = np.nanstd(advantages_copy)
advantages = (advantages - mean_advantages) / (std_advantages + 1e-5)
train_info = {}
train_info['value_loss'] = 0
train_info['policy_loss'] = 0
train_info['dist_entropy'] = 0
train_info['actor_grad_norm'] = 0
train_info['critic_grad_norm'] = 0
train_info['ratio'] = 0
for _ in range(self.ppo_epoch):
if self._use_recurrent_policy:
data_generator = buffer.recurrent_generator(advantages, self.num_mini_batch, self.data_chunk_length)
elif self._use_naive_recurrent:
data_generator = buffer.naive_recurrent_generator(advantages, self.num_mini_batch)
else:
data_generator = buffer.feed_forward_generator(advantages, self.num_mini_batch)
for sample in data_generator:
value_loss, critic_grad_norm, policy_loss, dist_entropy, actor_grad_norm, imp_weights \
= self.ppo_update(sample, update_actor)
train_info['value_loss'] += value_loss.item()
train_info['policy_loss'] += policy_loss.item()
train_info['dist_entropy'] += dist_entropy.item()
train_info['actor_grad_norm'] += actor_grad_norm
train_info['critic_grad_norm'] += critic_grad_norm
train_info['ratio'] += imp_weights.mean()
num_updates = self.ppo_epoch * self.num_mini_batch
for k in train_info.keys():
train_info[k] /= num_updates
return train_info
def prep_training(self):
self.policy.actor.train()
self.policy.critic.train()
def prep_rollout(self):
self.policy.actor.eval()
self.policy.critic.eval()
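# Usage sketch (comments only; `buffer` is assumed to be the SharedReplayBuffer used
# elsewhere in this repo, exposing the generators called in train() above):
#     trainer = RMAPPO(all_args, policy, device=torch.device("cpu"))
#     trainer.prep_rollout()              # eval mode while collecting rollouts
#     ... collect an episode of data into `buffer` ...
#     trainer.prep_training()             # back to train mode before the update
#     train_info = trainer.train(buffer)
#     print(train_info["policy_loss"], train_info["value_loss"], train_info["ratio"])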
from .distributions import Bernoulli, Categorical, DiagGaussian
import torch
import torch.nn as nn
class ACTLayer(nn.Module):
"""
MLP Module to compute actions.
:param action_space: (gym.Space) action space.
:param inputs_dim: (int) dimension of network input.
:param use_orthogonal: (bool) whether to use orthogonal initialization.
:param gain: (float) gain of the output layer of the network.
"""
def __init__(self, action_space, inputs_dim, use_orthogonal, gain):
super(ACTLayer, self).__init__()
self.mixed_action = False
self.multi_discrete = False
if action_space.__class__.__name__ == "Discrete":
action_dim = action_space.n
self.action_out = Categorical(inputs_dim, action_dim, use_orthogonal, gain)
elif action_space.__class__.__name__ == "Box":
action_dim = action_space.shape[0]
self.action_out = DiagGaussian(inputs_dim, action_dim, use_orthogonal, gain)
elif action_space.__class__.__name__ == "MultiBinary":
action_dim = action_space.shape[0]
self.action_out = Bernoulli(inputs_dim, action_dim, use_orthogonal, gain)
elif action_space.__class__.__name__ == "MultiDiscrete":
self.multi_discrete = True
action_dims = action_space.high - action_space.low + 1
self.action_outs = []
for action_dim in action_dims:
self.action_outs.append(Categorical(inputs_dim, action_dim, use_orthogonal, gain))
self.action_outs = nn.ModuleList(self.action_outs)
else: # discrete + continous
self.mixed_action = True
continous_dim = action_space[0].shape[0]
discrete_dim = action_space[1].n
self.action_outs = nn.ModuleList([DiagGaussian(inputs_dim, continous_dim, use_orthogonal, gain), Categorical(
inputs_dim, discrete_dim, use_orthogonal, gain)])
def forward(self, x, available_actions=None, deterministic=False):
"""
Compute actions and action logprobs from given input.
:param x: (torch.Tensor) input to network.
:param available_actions: (torch.Tensor) denotes which actions are available to agent
(if None, all actions available)
:param deterministic: (bool) whether to sample from action distribution or return the mode.
:return actions: (torch.Tensor) actions to take.
:return action_log_probs: (torch.Tensor) log probabilities of taken actions.
"""
if self.mixed_action :
actions = []
action_log_probs = []
for action_out in self.action_outs:
action_logit = action_out(x)
action = action_logit.mode() if deterministic else action_logit.sample()
action_log_prob = action_logit.log_probs(action)
actions.append(action.float())
action_log_probs.append(action_log_prob)
actions = torch.cat(actions, -1)
action_log_probs = torch.sum(torch.cat(action_log_probs, -1), -1, keepdim=True)
elif self.multi_discrete:
actions = []
action_log_probs = []
for action_out in self.action_outs:
action_logit = action_out(x)
action = action_logit.mode() if deterministic else action_logit.sample()
action_log_prob = action_logit.log_probs(action)
actions.append(action)
action_log_probs.append(action_log_prob)
actions = torch.cat(actions, -1)
action_log_probs = torch.cat(action_log_probs, -1)
else:
action_logits = self.action_out(x, available_actions)
actions = action_logits.mode() if deterministic else action_logits.sample()
action_log_probs = action_logits.log_probs(actions)
return actions, action_log_probs
def get_probs(self, x, available_actions=None):
"""
Compute action probabilities from inputs.
:param x: (torch.Tensor) input to network.
:param available_actions: (torch.Tensor) denotes which actions are available to agent
(if None, all actions available)
:return action_probs: (torch.Tensor)
"""
if self.mixed_action or self.multi_discrete:
action_probs = []
for action_out in self.action_outs:
action_logit = action_out(x)
action_prob = action_logit.probs
action_probs.append(action_prob)
action_probs = torch.cat(action_probs, -1)
else:
action_logits = self.action_out(x, available_actions)
action_probs = action_logits.probs
return action_probs
def evaluate_actions(self, x, action, available_actions=None, active_masks=None):
"""
Compute log probability and entropy of given actions.
:param x: (torch.Tensor) input to network.
:param action: (torch.Tensor) actions whose entropy and log probability to evaluate.
:param available_actions: (torch.Tensor) denotes which actions are available to agent
(if None, all actions available)
:param active_masks: (torch.Tensor) denotes whether an agent is active or dead.
:return action_log_probs: (torch.Tensor) log probabilities of the input actions.
:return dist_entropy: (torch.Tensor) action distribution entropy for the given inputs.
"""
if self.mixed_action:
a, b = action.split((2, 1), -1)
b = b.long()
action = [a, b]
action_log_probs = []
dist_entropy = []
for action_out, act in zip(self.action_outs, action):
action_logit = action_out(x)
action_log_probs.append(action_logit.log_probs(act))
if active_masks is not None:
if len(action_logit.entropy().shape) == len(active_masks.shape):
dist_entropy.append((action_logit.entropy() * active_masks).sum()/active_masks.sum())
else:
dist_entropy.append((action_logit.entropy() * active_masks.squeeze(-1)).sum()/active_masks.sum())
else:
dist_entropy.append(action_logit.entropy().mean())
action_log_probs = torch.sum(torch.cat(action_log_probs, -1), -1, keepdim=True)
            dist_entropy = dist_entropy[0] / 2.0 + dist_entropy[1] / 0.98  # ! doesn't make sense
elif self.multi_discrete:
action = torch.transpose(action, 0, 1)
action_log_probs = []
dist_entropy = []
for action_out, act in zip(self.action_outs, action):
action_logit = action_out(x)
action_log_probs.append(action_logit.log_probs(act))
if active_masks is not None:
dist_entropy.append((action_logit.entropy()*active_masks.squeeze(-1)).sum()/active_masks.sum())
else:
dist_entropy.append(action_logit.entropy().mean())
action_log_probs = torch.cat(action_log_probs, -1) # ! could be wrong
dist_entropy = torch.tensor(dist_entropy).mean()
else:
action_logits = self.action_out(x, available_actions)
action_log_probs = action_logits.log_probs(action)
if active_masks is not None:
dist_entropy = (action_logits.entropy()*active_masks.squeeze(-1)).sum()/active_masks.sum()
else:
dist_entropy = action_logits.entropy().mean()
return action_log_probs, dist_entropy
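# Standalone check of ACTLayer with a Discrete(5) action space (illustrative only;
# hidden size 64, orthogonal init and gain 0.01 mirror the parser defaults).
if __name__ == "__main__":
    import torch
    from gym import spaces

    act_layer = ACTLayer(spaces.Discrete(5), 64, True, 0.01)
    features = torch.randn(8, 64)
    actions, log_probs = act_layer(features)                         # (8, 1), (8, 1)
    log_probs2, entropy = act_layer.evaluate_actions(features, actions)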
import torch.nn as nn
from .util import init
"""CNN Modules and utils."""
class Flatten(nn.Module):
def forward(self, x):
return x.view(x.size(0), -1)
class CNNLayer(nn.Module):
def __init__(self, obs_shape, hidden_size, use_orthogonal, use_ReLU, kernel_size=3, stride=1):
super(CNNLayer, self).__init__()
active_func = [nn.Tanh(), nn.ReLU()][use_ReLU]
init_method = [nn.init.xavier_uniform_, nn.init.orthogonal_][use_orthogonal]
gain = nn.init.calculate_gain(['tanh', 'relu'][use_ReLU])
def init_(m):
return init(m, init_method, lambda x: nn.init.constant_(x, 0), gain=gain)
input_channel = obs_shape[0]
input_width = obs_shape[1]
input_height = obs_shape[2]
self.cnn = nn.Sequential(
init_(nn.Conv2d(in_channels=input_channel,
out_channels=hidden_size // 2,
kernel_size=kernel_size,
stride=stride)
),
active_func,
Flatten(),
init_(nn.Linear(hidden_size // 2 * (input_width - kernel_size + stride) * (input_height - kernel_size + stride),
hidden_size)
),
active_func,
init_(nn.Linear(hidden_size, hidden_size)), active_func)
def forward(self, x):
x = x / 255.0
x = self.cnn(x)
return x
class CNNBase(nn.Module):
def __init__(self, args, obs_shape):
super(CNNBase, self).__init__()
self._use_orthogonal = args.use_orthogonal
self._use_ReLU = args.use_ReLU
self.hidden_size = args.hidden_size
self.cnn = CNNLayer(obs_shape, self.hidden_size, self._use_orthogonal, self._use_ReLU)
def forward(self, x):
x = self.cnn(x)
return x
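# Shape sketch (illustrative only): a 3x32x32 image observation through CNNBase with
# hidden_size 64. The conv layer gives 32 x 30 x 30 features, which are flattened to
# 28800 and projected down to a 64-dim vector by the two linear layers.
if __name__ == "__main__":
    import argparse
    import torch

    cnn_args = argparse.Namespace(use_orthogonal=True, use_ReLU=True, hidden_size=64)
    cnn = CNNBase(cnn_args, (3, 32, 32))
    features = cnn(torch.rand(4, 3, 32, 32) * 255.0)   # -> (4, 64)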
import torch
import torch.nn as nn
from .util import init
"""
Modify standard PyTorch distributions to make them compatible with this codebase.
"""
#
# Standardize distribution interfaces
#
# Categorical
class FixedCategorical(torch.distributions.Categorical):
def sample(self):
return super().sample().unsqueeze(-1)
def log_probs(self, actions):
return (
super()
.log_prob(actions.squeeze(-1))
.view(actions.size(0), -1)
.sum(-1)
.unsqueeze(-1)
)
def mode(self):
return self.probs.argmax(dim=-1, keepdim=True)
# Normal
class FixedNormal(torch.distributions.Normal):
def log_probs(self, actions):
return super().log_prob(actions).sum(-1, keepdim=True)
    def entropy(self):
        return super().entropy().sum(-1)
def mode(self):
return self.mean
# Bernoulli
class FixedBernoulli(torch.distributions.Bernoulli):
def log_probs(self, actions):
        return super().log_prob(actions).view(actions.size(0), -1).sum(-1).unsqueeze(-1)
def entropy(self):
return super().entropy().sum(-1)
def mode(self):
return torch.gt(self.probs, 0.5).float()
class Categorical(nn.Module):
def __init__(self, num_inputs, num_outputs, use_orthogonal=True, gain=0.01):
super(Categorical, self).__init__()
init_method = [nn.init.xavier_uniform_, nn.init.orthogonal_][use_orthogonal]
def init_(m):
return init(m, init_method, lambda x: nn.init.constant_(x, 0), gain)
self.linear = init_(nn.Linear(num_inputs, num_outputs))
def forward(self, x, available_actions=None):
x = self.linear(x)
if available_actions is not None:
x[available_actions == 0] = -1e10
return FixedCategorical(logits=x)
class DiagGaussian(nn.Module):
def __init__(self, num_inputs, num_outputs, use_orthogonal=True, gain=0.01):
super(DiagGaussian, self).__init__()
init_method = [nn.init.xavier_uniform_, nn.init.orthogonal_][use_orthogonal]
def init_(m):
return init(m, init_method, lambda x: nn.init.constant_(x, 0), gain)
self.fc_mean = init_(nn.Linear(num_inputs, num_outputs))
self.logstd = AddBias(torch.zeros(num_outputs))
def forward(self, x):
action_mean = self.fc_mean(x)
# An ugly hack for my KFAC implementation.
zeros = torch.zeros(action_mean.size())
if x.is_cuda:
zeros = zeros.cuda()
action_logstd = self.logstd(zeros)
return FixedNormal(action_mean, action_logstd.exp())
class Bernoulli(nn.Module):
def __init__(self, num_inputs, num_outputs, use_orthogonal=True, gain=0.01):
super(Bernoulli, self).__init__()
init_method = [nn.init.xavier_uniform_, nn.init.orthogonal_][use_orthogonal]
def init_(m):
return init(m, init_method, lambda x: nn.init.constant_(x, 0), gain)
self.linear = init_(nn.Linear(num_inputs, num_outputs))
def forward(self, x):
x = self.linear(x)
return FixedBernoulli(logits=x)
class AddBias(nn.Module):
def __init__(self, bias):
super(AddBias, self).__init__()
self._bias = nn.Parameter(bias.unsqueeze(1))
def forward(self, x):
if x.dim() == 2:
bias = self._bias.t().view(1, -1)
else:
bias = self._bias.t().view(1, -1, 1, 1)
return x + bias
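# Quick illustration of action masking in the Categorical head (illustrative only):
# unavailable actions receive a -1e10 logit and are effectively never sampled.
if __name__ == "__main__":
    head = Categorical(16, 4)
    feats = torch.randn(2, 16)
    avail = torch.tensor([[1, 1, 0, 1],
                          [0, 1, 1, 1]], dtype=torch.float32)
    dist = head(feats, avail)
    sampled = dist.sample()              # (2, 1); masked actions have ~zero probability
    log_probs = dist.log_probs(sampled)  # (2, 1)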
import torch.nn as nn
from .util import init, get_clones
"""MLP modules."""
class MLPLayer(nn.Module):
def __init__(self, input_dim, hidden_size, layer_N, use_orthogonal, use_ReLU):
super(MLPLayer, self).__init__()
self._layer_N = layer_N
active_func = [nn.Tanh(), nn.ReLU()][use_ReLU]
init_method = [nn.init.xavier_uniform_, nn.init.orthogonal_][use_orthogonal]
gain = nn.init.calculate_gain(['tanh', 'relu'][use_ReLU])
def init_(m):
return init(m, init_method, lambda x: nn.init.constant_(x, 0), gain=gain)
self.fc1 = nn.Sequential(
init_(nn.Linear(input_dim, hidden_size)), active_func, nn.LayerNorm(hidden_size))
self.fc_h = nn.Sequential(init_(
nn.Linear(hidden_size, hidden_size)), active_func, nn.LayerNorm(hidden_size))
self.fc2 = get_clones(self.fc_h, self._layer_N)
def forward(self, x):
x = self.fc1(x)
for i in range(self._layer_N):
x = self.fc2[i](x)
return x
class MLPBase(nn.Module):
def __init__(self, args, obs_shape, cat_self=True, attn_internal=False):
super(MLPBase, self).__init__()
self._use_feature_normalization = args.use_feature_normalization
self._use_orthogonal = args.use_orthogonal
self._use_ReLU = args.use_ReLU
self._stacked_frames = args.stacked_frames
self._layer_N = args.layer_N
self.hidden_size = args.hidden_size
obs_dim = obs_shape[0]
if self._use_feature_normalization:
self.feature_norm = nn.LayerNorm(obs_dim)
self.mlp = MLPLayer(obs_dim, self.hidden_size,
self._layer_N, self._use_orthogonal, self._use_ReLU)
def forward(self, x):
if self._use_feature_normalization:
x = self.feature_norm(x)
x = self.mlp(x)
return x
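# Shape sketch (illustrative only): a flat 14-dim observation through MLPBase with the
# settings below (which mirror the parser defaults) yields a 64-dim feature vector.
if __name__ == "__main__":
    import argparse
    import torch

    mlp_args = argparse.Namespace(use_feature_normalization=True, use_orthogonal=True,
                                  use_ReLU=True, stacked_frames=1, layer_N=1, hidden_size=64)
    mlp = MLPBase(mlp_args, (14,))
    features = mlp(torch.rand(4, 14))   # -> (4, 64)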
import math
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
class PopArt(torch.nn.Module):
def __init__(self, input_shape, output_shape, norm_axes=1, beta=0.99999, epsilon=1e-5, device=torch.device("cpu")):
super(PopArt, self).__init__()
self.beta = beta
self.epsilon = epsilon
self.norm_axes = norm_axes
self.tpdv = dict(dtype=torch.float32, device=device)
self.input_shape = input_shape
self.output_shape = output_shape
self.weight = nn.Parameter(torch.Tensor(output_shape, input_shape)).to(**self.tpdv)
self.bias = nn.Parameter(torch.Tensor(output_shape)).to(**self.tpdv)
self.stddev = nn.Parameter(torch.ones(output_shape), requires_grad=False).to(**self.tpdv)
self.mean = nn.Parameter(torch.zeros(output_shape), requires_grad=False).to(**self.tpdv)
self.mean_sq = nn.Parameter(torch.zeros(output_shape), requires_grad=False).to(**self.tpdv)
self.debiasing_term = nn.Parameter(torch.tensor(0.0), requires_grad=False).to(**self.tpdv)
self.reset_parameters()
def reset_parameters(self):
torch.nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))
if self.bias is not None:
fan_in, _ = torch.nn.init._calculate_fan_in_and_fan_out(self.weight)
bound = 1 / math.sqrt(fan_in)
torch.nn.init.uniform_(self.bias, -bound, bound)
self.mean.zero_()
self.mean_sq.zero_()
self.debiasing_term.zero_()
def forward(self, input_vector):
if type(input_vector) == np.ndarray:
input_vector = torch.from_numpy(input_vector)
input_vector = input_vector.to(**self.tpdv)
return F.linear(input_vector, self.weight, self.bias)
@torch.no_grad()
def update(self, input_vector):
if type(input_vector) == np.ndarray:
input_vector = torch.from_numpy(input_vector)
input_vector = input_vector.to(**self.tpdv)
old_mean, old_stddev = self.mean, self.stddev
batch_mean = input_vector.mean(dim=tuple(range(self.norm_axes)))
batch_sq_mean = (input_vector ** 2).mean(dim=tuple(range(self.norm_axes)))
self.mean.mul_(self.beta).add_(batch_mean * (1.0 - self.beta))
self.mean_sq.mul_(self.beta).add_(batch_sq_mean * (1.0 - self.beta))
self.debiasing_term.mul_(self.beta).add_(1.0 * (1.0 - self.beta))
self.stddev = (self.mean_sq - self.mean ** 2).sqrt().clamp(min=1e-4)
self.weight = self.weight * old_stddev / self.stddev
self.bias = (old_stddev * self.bias + old_mean - self.mean) / self.stddev
def debiased_mean_var(self):
debiased_mean = self.mean / self.debiasing_term.clamp(min=self.epsilon)
debiased_mean_sq = self.mean_sq / self.debiasing_term.clamp(min=self.epsilon)
debiased_var = (debiased_mean_sq - debiased_mean ** 2).clamp(min=1e-2)
return debiased_mean, debiased_var
def normalize(self, input_vector):
if type(input_vector) == np.ndarray:
input_vector = torch.from_numpy(input_vector)
input_vector = input_vector.to(**self.tpdv)
mean, var = self.debiased_mean_var()
out = (input_vector - mean[(None,) * self.norm_axes]) / torch.sqrt(var)[(None,) * self.norm_axes]
return out
def denormalize(self, input_vector):
if type(input_vector) == np.ndarray:
input_vector = torch.from_numpy(input_vector)
input_vector = input_vector.to(**self.tpdv)
mean, var = self.debiased_mean_var()
out = input_vector * torch.sqrt(var)[(None,) * self.norm_axes] + mean[(None,) * self.norm_axes]
out = out.cpu().numpy()
return out
import torch
import torch.nn as nn
"""RNN modules."""
class RNNLayer(nn.Module):
def __init__(self, inputs_dim, outputs_dim, recurrent_N, use_orthogonal):
super(RNNLayer, self).__init__()
self._recurrent_N = recurrent_N
self._use_orthogonal = use_orthogonal
self.rnn = nn.GRU(inputs_dim, outputs_dim, num_layers=self._recurrent_N)
for name, param in self.rnn.named_parameters():
if 'bias' in name:
nn.init.constant_(param, 0)
elif 'weight' in name:
if self._use_orthogonal:
nn.init.orthogonal_(param)
else:
nn.init.xavier_uniform_(param)
self.norm = nn.LayerNorm(outputs_dim)
def forward(self, x, hxs, masks):
if x.size(0) == hxs.size(0):
x, hxs = self.rnn(x.unsqueeze(0),
(hxs * masks.repeat(1, self._recurrent_N).unsqueeze(-1)).transpose(0, 1).contiguous())
x = x.squeeze(0)
hxs = hxs.transpose(0, 1)
else:
            # x is a (T, N, -1) tensor that has been flattened to (T * N, -1)
N = hxs.size(0)
T = int(x.size(0) / N)
# unflatten
x = x.view(T, N, x.size(1))
# Same deal with masks
masks = masks.view(T, N)
# Let's figure out which steps in the sequence have a zero for any agent
# We will always assume t=0 has a zero in it as that makes the logic cleaner
has_zeros = ((masks[1:] == 0.0)
.any(dim=-1)
.nonzero()
.squeeze()
.cpu())
# +1 to correct the masks[1:]
if has_zeros.dim() == 0:
# Deal with scalar
has_zeros = [has_zeros.item() + 1]
else:
has_zeros = (has_zeros + 1).numpy().tolist()
# add t=0 and t=T to the list
has_zeros = [0] + has_zeros + [T]
hxs = hxs.transpose(0, 1)
outputs = []
for i in range(len(has_zeros) - 1):
# We can now process steps that don't have any zeros in masks together!
# This is much faster
start_idx = has_zeros[i]
end_idx = has_zeros[i + 1]
temp = (hxs * masks[start_idx].view(1, -1, 1).repeat(self._recurrent_N, 1, 1)).contiguous()
rnn_scores, hxs = self.rnn(x[start_idx:end_idx], temp)
outputs.append(rnn_scores)
# assert len(outputs) == T
# x is a (T, N, -1) tensor
x = torch.cat(outputs, dim=0)
# flatten
x = x.reshape(T * N, -1)
hxs = hxs.transpose(0, 1)
x = self.norm(x)
return x, hxs
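# Shape sketch (illustrative only) for the single-step path of RNNLayer, where the batch
# dimension of x matches the batch dimension of the hidden states.
if __name__ == "__main__":
    rnn_layer = RNNLayer(inputs_dim=64, outputs_dim=64, recurrent_N=1, use_orthogonal=True)
    x = torch.randn(8, 64)         # one step of features for 8 rollout entries
    hxs = torch.zeros(8, 1, 64)    # (batch, recurrent_N, hidden_size)
    masks = torch.ones(8, 1)       # 0 where an episode has just been reset
    out, hxs = rnn_layer(x, hxs, masks)   # out: (8, 64), hxs: (8, 1, 64)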
import copy
import numpy as np
import torch
import torch.nn as nn
def init(module, weight_init, bias_init, gain=1):
weight_init(module.weight.data, gain=gain)
bias_init(module.bias.data)
return module
def get_clones(module, N):
return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
def check(input):
output = torch.from_numpy(input) if type(input) == np.ndarray else input
return output
import argparse
def get_config():
"""
    The configuration parser for hyperparameters common to all environments.
    Please refer to each `scripts/train/<env>_runner.py` file for the private hyperparameters
    used only in <env>.
Prepare parameters:
--algorithm_name <algorithm_name>
        specify the algorithm, including `["rmappo", "mappo", "rmappg", "mappg", "trpo"]`
--experiment_name <str>
        an identifier to distinguish different experiments.
--seed <int>
set seed for numpy and torch
--cuda
by default True, will use GPU to train; or else will use CPU;
--cuda_deterministic
        by default, make CUDA operations deterministic so the random seed takes effect; if set, bypass this.
--n_training_threads <int>
number of training threads working in parallel. by default 1
--n_rollout_threads <int>
number of parallel envs for training rollout. by default 32
--n_eval_rollout_threads <int>
number of parallel envs for evaluating rollout. by default 1
--n_render_rollout_threads <int>
        number of parallel envs for rendering; can only be set to 1 for some environments.
--num_env_steps <int>
number of env steps to train (default: 10e6)
--user_name <str>
[for wandb usage], to specify user's name for simply collecting training data.
--use_wandb
        [for wandb usage], by default True, will log data to the wandb server; otherwise will use tensorboard to log data.
Env parameters:
--env_name <str>
specify the name of environment
--use_obs_instead_of_state
[only for some env] by default False, will use global state; or else will use concatenated local obs.
Replay Buffer parameters:
--episode_length <int>
the max length of episode in the buffer.
Network parameters:
--share_policy
by default True, all agents will share the same network; set to make training agents use different policies.
--use_centralized_V
        by default True, use centralized training mode; or else use decentralized training mode.
--stacked_frames <int>
        Number of input frames which should be stacked together.
--hidden_size <int>
Dimension of hidden layers for actor/critic networks
--layer_N <int>
Number of layers for actor/critic networks
--use_ReLU
by default True, will use ReLU. or else will use Tanh.
--use_popart
        by default False, use PopArt to normalize rewards.
--use_valuenorm
by default True, use running mean and std to normalize rewards.
--use_feature_normalization
by default True, apply layernorm to normalize inputs.
--use_orthogonal
        by default True, use Orthogonal initialization for weights and 0 initialization for biases; or else, will use xavier uniform initialization.
--gain
        by default 0.01, the gain of the last action layer.
--use_naive_recurrent_policy
by default False, use the whole trajectory to calculate hidden states.
--use_recurrent_policy
by default, use Recurrent Policy. If set, do not use.
--recurrent_N <int>
The number of recurrent layers ( default 1).
--data_chunk_length <int>
Time length of chunks used to train a recurrent_policy, default 10.
Optimizer parameters:
--lr <float>
learning rate parameter, (default: 5e-4, fixed).
--critic_lr <float>
learning rate of critic (default: 5e-4, fixed)
--opti_eps <float>
        Adam optimizer epsilon (default: 1e-5)
--weight_decay <float>
        coefficient of weight decay (default: 0)
PPO parameters:
--ppo_epoch <int>
number of ppo epochs (default: 15)
--use_clipped_value_loss
by default, clip loss value. If set, do not clip loss value.
--clip_param <float>
ppo clip parameter (default: 0.2)
--num_mini_batch <int>
number of batches for ppo (default: 1)
--entropy_coef <float>
entropy term coefficient (default: 0.01)
--use_max_grad_norm
by default, use max norm of gradients. If set, do not use.
--max_grad_norm <float>
max norm of gradients (default: 0.5)
--use_gae
by default, use generalized advantage estimation. If set, do not use gae.
--gamma <float>
discount factor for rewards (default: 0.99)
--gae_lambda <float>
gae lambda parameter (default: 0.95)
--use_proper_time_limits
        by default, compute returns without considering time limits; if set, take time limits into account.
--use_huber_loss
by default, use huber loss. If set, do not use huber loss.
--use_value_active_masks
by default True, whether to mask useless data in value loss.
--huber_delta <float>
coefficient of huber loss.
PPG parameters:
--aux_epoch <int>
number of auxiliary epochs. (default: 4)
--clone_coef <float>
clone term coefficient (default: 0.01)
Run parameters:
--use_linear_lr_decay
by default, do not apply linear decay to learning rate. If set, use a linear schedule on the learning rate
Save & Log parameters:
--save_interval <int>
        interval between two consecutive model saves.
--log_interval <int>
        interval between two consecutive log prints.
Eval parameters:
--use_eval
        by default, do not start evaluation. If set, start evaluation alongside training.
--eval_interval <int>
        interval between two consecutive evaluation runs.
--eval_episodes <int>
number of episodes of a single evaluation.
Render parameters:
--save_gifs
by default, do not save render video. If set, save video.
--use_render
        by default, do not render the env during training. If set, start rendering. Note: some environments have an internal render process which is not controlled by this hyperparameter.
--render_episodes <int>
the number of episodes to render a given env
--ifi <float>
the play interval of each rendered image in saved video.
Pretrained parameters:
--model_dir <str>
by default None. set the path to pretrained model.
"""
parser = argparse.ArgumentParser(
description='onpolicy', formatter_class=argparse.RawDescriptionHelpFormatter)
# prepare parameters
parser.add_argument("--algorithm_name", type=str,
default='mappo', choices=["rmappo", "mappo"])
parser.add_argument("--experiment_name", type=str, default="check", help="an identifier to distinguish different experiment.")
parser.add_argument("--seed", type=int, default=1, help="Random seed for numpy/torch")
parser.add_argument("--cuda", action='store_false', default=True, help="by default True, will use GPU to train; or else will use CPU;")
parser.add_argument("--cuda_deterministic",
action='store_false', default=True, help="by default, make sure random seed effective. if set, bypass such function.")
parser.add_argument("--n_training_threads", type=int,
default=1, help="Number of torch threads for training")
parser.add_argument("--n_rollout_threads", type=int, default=5,
help="Number of parallel envs for training rollouts")
parser.add_argument("--n_eval_rollout_threads", type=int, default=1,
help="Number of parallel envs for evaluating rollouts")
parser.add_argument("--n_render_rollout_threads", type=int, default=1,
help="Number of parallel envs for rendering rollouts")
parser.add_argument("--num_env_steps", type=int, default=10e6,
help='Number of environment steps to train (default: 10e6)')
parser.add_argument("--user_name", type=str, default='marl',help="[for wandb usage], to specify user's name for simply collecting training data.")
parser.add_argument("--use_wandb", action='store_false', default=False, help="[for wandb usage], by default True, will log date to wandb server. or else will use tensorboard to log data.")
# env parameters
parser.add_argument("--env_name", type=str, default='MyEnv', help="specify the name of environment")
parser.add_argument("--use_obs_instead_of_state", action='store_true',
default=False, help="Whether to use global state or concatenated obs")
# replay buffer parameters
parser.add_argument("--episode_length", type=int,
default=200, help="Max length for any episode")
# network parameters
parser.add_argument("--share_policy", action='store_false',
                        default=False, help='Whether agents share the same policy')
parser.add_argument("--use_centralized_V", action='store_false',
default=True, help="Whether to use centralized V function")
parser.add_argument("--stacked_frames", type=int, default=1,
help="Dimension of hidden layers for actor/critic networks")
parser.add_argument("--use_stacked_frames", action='store_true',
default=False, help="Whether to use stacked_frames")
parser.add_argument("--hidden_size", type=int, default=64,
help="Dimension of hidden layers for actor/critic networks")
parser.add_argument("--layer_N", type=int, default=1,
help="Number of layers for actor/critic networks")
parser.add_argument("--use_ReLU", action='store_false',
default=True, help="Whether to use ReLU")
parser.add_argument("--use_popart", action='store_true', default=False, help="by default False, use PopArt to normalize rewards.")
parser.add_argument("--use_valuenorm", action='store_false', default=True, help="by default True, use running mean and std to normalize rewards.")
parser.add_argument("--use_feature_normalization", action='store_false',
default=True, help="Whether to apply layernorm to the inputs")
parser.add_argument("--use_orthogonal", action='store_false', default=True,
help="Whether to use Orthogonal initialization for weights and 0 initialization for biases")
parser.add_argument("--gain", type=float, default=0.01,
help="The gain # of last action layer")
# recurrent parameters
parser.add_argument("--use_naive_recurrent_policy", action='store_true',
default=False, help='Whether to use a naive recurrent policy')
parser.add_argument("--use_recurrent_policy", action='store_false',
default=False, help='use a recurrent policy')
parser.add_argument("--recurrent_N", type=int, default=1, help="The number of recurrent layers.")
parser.add_argument("--data_chunk_length", type=int, default=10,
help="Time length of chunks used to train a recurrent_policy")
# optimizer parameters
parser.add_argument("--lr", type=float, default=5e-4,
help='learning rate (default: 5e-4)')
parser.add_argument("--critic_lr", type=float, default=5e-4,
help='critic learning rate (default: 5e-4)')
parser.add_argument("--opti_eps", type=float, default=1e-5,
help='RMSprop optimizer epsilon (default: 1e-5)')
parser.add_argument("--weight_decay", type=float, default=0)
# ppo parameters
parser.add_argument("--ppo_epoch", type=int, default=15,
help='number of ppo epochs (default: 15)')
parser.add_argument("--use_clipped_value_loss",
action='store_false', default=True, help="by default, clip loss value. If set, do not clip loss value.")
parser.add_argument("--clip_param", type=float, default=0.2,
help='ppo clip parameter (default: 0.2)')
parser.add_argument("--num_mini_batch", type=int, default=1,
help='number of batches for ppo (default: 1)')
parser.add_argument("--entropy_coef", type=float, default=0.01,
help='entropy term coefficient (default: 0.01)')
parser.add_argument("--value_loss_coef", type=float,
default=1, help='value loss coefficient (default: 0.5)')
parser.add_argument("--use_max_grad_norm",
action='store_false', default=True, help="by default, use max norm of gradients. If set, do not use.")
parser.add_argument("--max_grad_norm", type=float, default=10.0,
help='max norm of gradients (default: 0.5)')
parser.add_argument("--use_gae", action='store_false',
default=True, help='use generalized advantage estimation')
parser.add_argument("--gamma", type=float, default=0.99,
help='discount factor for rewards (default: 0.99)')
parser.add_argument("--gae_lambda", type=float, default=0.95,
help='gae lambda parameter (default: 0.95)')
parser.add_argument("--use_proper_time_limits", action='store_true',
default=False, help='compute returns taking into account time limits')
parser.add_argument("--use_huber_loss", action='store_false', default=True, help="by default, use huber loss. If set, do not use huber loss.")
parser.add_argument("--use_value_active_masks",
action='store_false', default=True, help="by default True, whether to mask useless data in value loss.")
parser.add_argument("--use_policy_active_masks",
action='store_false', default=True, help="by default True, whether to mask useless data in policy loss.")
parser.add_argument("--huber_delta", type=float, default=10.0, help=" coefficience of huber loss.")
# run parameters
parser.add_argument("--use_linear_lr_decay", action='store_true',
default=False, help='use a linear schedule on the learning rate')
# save parameters
parser.add_argument("--save_interval", type=int, default=1, help="time duration between contiunous twice models saving.")
# log parameters
parser.add_argument("--log_interval", type=int, default=5, help="time duration between contiunous twice log printing.")
# eval parameters
parser.add_argument("--use_eval", action='store_true', default=False, help="by default, do not start evaluation. If set`, start evaluation alongside with training.")
parser.add_argument("--eval_interval", type=int, default=25, help="time duration between contiunous twice evaluation progress.")
parser.add_argument("--eval_episodes", type=int, default=32, help="number of episodes of a single evaluation.")
# render parameters
parser.add_argument("--save_gifs", action='store_true', default=False, help="by default, do not save render video. If set, save video.")
parser.add_argument("--use_render", action='store_true', default=False, help="by default, do not render the env during training. If set, start render. Note: something, the environment has internal render process which is not controlled by this hyperparam.")
parser.add_argument("--render_episodes", type=int, default=5, help="the number of episodes to render a given env")
parser.add_argument("--ifi", type=float, default=0.1, help="the play interval of each rendered image in saved video.")
# pretrained parameters
parser.add_argument("--model_dir", type=str, default=None, help="by default None. set the path to pretrained model.")
return parser
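# Usage sketch (illustrative only): parse the defaults, overriding a couple of flags.
# parse_known_args is convenient because other libraries (e.g. absl) may also read argv.
if __name__ == "__main__":
    parser = get_config()
    all_args = parser.parse_known_args(["--seed", "3", "--n_rollout_threads", "8"])[0]
    print(all_args.algorithm_name, all_args.seed, all_args.n_rollout_threads)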
import socket
from absl import flags
FLAGS = flags.FLAGS
FLAGS(['train_sc.py'])
"""
# @Time : 2021/7/1 8:44 AM
# @Author : hezhiqiang01
# @Email : hezhiqiang01@baidu.com
# @File : env_wrappers.py
Modified from OpenAI Baselines code to work with multi-agent envs
"""
import numpy as np
import gym
from gym import spaces
class MultiDiscrete(gym.Space):
"""
- The multi-discrete action space consists of a series of discrete action spaces with different parameters
- It can be adapted to both a Discrete action space or a continuous (Box) action space
- It is useful to represent game controllers or keyboards where each key can be represented as a discrete action space
- It is parametrized by passing an array of arrays containing [min, max] for each discrete action space
where the discrete action space can take any integers from `min` to `max` (both inclusive)
    Note: A value of 0 always needs to represent the NOOP action.
e.g. Nintendo Game Controller
- Can be conceptualized as 3 discrete action spaces:
1) Arrow Keys: Discrete 5 - NOOP[0], UP[1], RIGHT[2], DOWN[3], LEFT[4] - params: min: 0, max: 4
2) Button A: Discrete 2 - NOOP[0], Pressed[1] - params: min: 0, max: 1
3) Button B: Discrete 2 - NOOP[0], Pressed[1] - params: min: 0, max: 1
- Can be initialized as
MultiDiscrete([ [0,4], [0,1], [0,1] ])
"""
def __init__(self, array_of_param_array):
super().__init__()
self.low = np.array([x[0] for x in array_of_param_array])
self.high = np.array([x[1] for x in array_of_param_array])
self.num_discrete_space = self.low.shape[0]
self.n = np.sum(self.high) + 2
def sample(self):
""" Returns a array with one sample from each discrete action space """
# For each row: round(random .* (max - min) + min, 0)
random_array = np.random.rand(self.num_discrete_space)
return [int(x) for x in np.floor(np.multiply((self.high - self.low + 1.), random_array) + self.low)]
def contains(self, x):
return len(x) == self.num_discrete_space and (np.array(x) >= self.low).all() and (
np.array(x) <= self.high).all()
@property
def shape(self):
return self.num_discrete_space
def __repr__(self):
return "MultiDiscrete" + str(self.num_discrete_space)
def __eq__(self, other):
return np.array_equal(self.low, other.low) and np.array_equal(self.high, other.high)
class Env(object):
"""
    # Agents in the environment
"""
def __init__(self, i):
        self.agent_num = 2  # number of agents (small aircraft); set to 2 here
        self.obs_dim = 14  # observation dimension of each agent
        self.action_dim = 5  # action dimension of each agent; assumed to be 5-dimensional here
def reset(self):
"""
        When self.agent_num is set to 2 agents, the return value is a list in which
        each element is an observation array of shape (self.obs_dim, ).
"""
sub_agent_obs = []
for i in range(self.agent_num):
sub_obs = np.random.random(size=(14, ))
sub_agent_obs.append(sub_obs)
return sub_agent_obs
def step(self, actions):
"""
        When self.agent_num is set to 2 agents, `actions` is a list with one entry per agent,
        each an action array of shape (self.action_dim, ). With the default parameters this is
        a list of two elements, each of shape (5, ), since the action dimension is 5.
"""
sub_agent_obs = []
sub_agent_reward = []
sub_agent_done = []
sub_agent_info = []
for i in range(self.agent_num):
sub_agent_obs.append(np.random.random(size=(14,)))
sub_agent_reward.append([np.random.rand()])
sub_agent_done.append(False)
sub_agent_info.append({})
return [sub_agent_obs, sub_agent_reward, sub_agent_done, sub_agent_info]
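# Quick sanity check of the toy Env above (illustrative only).
if __name__ == "__main__":
    toy_env = Env(0)
    obs = toy_env.reset()                              # list of 2 arrays, each of shape (14,)
    obs, rewards, dones, infos = toy_env.step([np.zeros(5)] * 2)
    assert len(obs) == 2 and dones == [False, False]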
class SubprocVecEnv(object):
def __init__(self, all_args):
"""
        envs: list of toy Env instances, one per rollout thread (stepped in-process)
"""
self.env_list = [Env(i) for i in range(all_args.n_rollout_threads)]
self.num_envs = all_args.n_rollout_threads
self.num_agent = self.env_list[0].agent_num
self.signal_obs_dim = self.env_list[0].obs_dim
self.signal_action_dim = self.env_list[0].action_dim
self.u_range = 1.0 # control range for continuous control
self.movable = True
# environment parameters
# self.discrete_action_space = True
self.discrete_action_space = True
# if true, action is a number 0...N, otherwise action is a one-hot N-dimensional vector
self.discrete_action_input = False
# if true, even the action is continuous, action will be performed discretely
self.force_discrete_action = False
# configure spaces
self.action_space = []
self.observation_space = []
self.share_observation_space = []
share_obs_dim = 0
for agent in range(self.num_agent):
total_action_space = []
# physical action space
if self.discrete_action_space:
                u_action_space = spaces.Discrete(self.signal_action_dim)  # 5 discrete actions
else:
u_action_space = spaces.Box(low=-self.u_range, high=+self.u_range, shape=(2,), dtype=np.float32) # [-1,1]
if self.movable:
total_action_space.append(u_action_space)
# total action space
if len(total_action_space) > 1:
# all action spaces are discrete, so simplify to MultiDiscrete action space
if all([isinstance(act_space, spaces.Discrete) for act_space in total_action_space]):
act_space = MultiDiscrete([[0, act_space.n - 1] for act_space in total_action_space])
else:
act_space = spaces.Tuple(total_action_space)
self.action_space.append(act_space)
else:
self.action_space.append(total_action_space[0])
# observation space
share_obs_dim += self.signal_obs_dim
self.observation_space.append(spaces.Box(low=-np.inf, high=+np.inf, shape=(self.signal_obs_dim,),
dtype=np.float32)) # [-inf,inf]
self.share_observation_space = [spaces.Box(low=-np.inf, high=+np.inf, shape=(share_obs_dim,),
dtype=np.float32) for _ in range(self.num_agent)]
def step(self, actions):
"""
Assumed dimensions of the input `actions`:
# actions shape = (5, 2, 5)
# 5 parallel environments, each with 2 agents; each agent's action is a one-hot 5-dimensional encoding
"""
results = [env.step(action) for env, action in zip(self.env_list, actions)]
obs, rews, dones, infos = zip(*results)
return np.stack(obs), np.stack(rews), np.stack(dones), infos
def reset(self):
obs = [env.reset() for env in self.env_list]
return np.stack(obs)
def close(self):
pass
def render(self, mode="rgb_array"):
pass
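# --- Hypothetical construction sketch (not part of the original file) ---
# SubprocVecEnv only reads `n_rollout_threads` from the args object, so a bare
# namespace is enough to illustrate the stacked shapes mentioned in step()'s
# docstring: (n_rollout_threads, num_agent, obs_dim) observations and one-hot actions.
def _demo_subproc_vec_env():
    from argparse import Namespace
    vec_env = SubprocVecEnv(Namespace(n_rollout_threads=5))
    obs = vec_env.reset()                                    # shape (5, 2, 14)
    noop_actions = np.eye(5)[np.zeros((5, 2), dtype=int)]    # one-hot NOOPs, shape (5, 2, 5)
    next_obs, rewards, dones, infos = vec_env.step(noop_actions)
    assert next_obs.shape == (5, 2, 14) and rewards.shape == (5, 2, 1)
    return rewards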
# single env
class DummyVecEnv(object):
def __init__(self, all_args):
"""
envs: list of gym environments to run sequentially in the current process (no subprocesses)
"""
self.env_list = [Env(i) for i in range(all_args.n_eval_rollout_threads)]
self.num_envs = all_args.n_eval_rollout_threads
self.num_agent = self.env_list[0].agent_num
self.u_range = 1.0 # control range for continuous control
self.movable = True
# environment parameters
self.discrete_action_space = True
# if true, action is a number 0...N, otherwise action is a one-hot N-dimensional vector
self.discrete_action_input = False
# if true, even the action is continuous, action will be performed discretely
self.force_discrete_action = False
# in this env, force_discrete_action == False because the world does not have discrete_action
# configure spaces
self.action_space = []
self.observation_space = []
self.share_observation_space = []
share_obs_dim = 0
for agent_num in range(self.num_agent):
total_action_space = []
# physical action space
if self.discrete_action_space:
u_action_space = spaces.Discrete(5) # 5 discrete actions
else:
u_action_space = spaces.Box(low=-self.u_range, high=+self.u_range, shape=(2,), dtype=np.float32) # [-1,1]
if self.movable:
total_action_space.append(u_action_space)
# total action space
if len(total_action_space) > 1:
# all action spaces are discrete, so simplify to MultiDiscrete action space
if all([isinstance(act_space, spaces.Discrete) for act_space in total_action_space]):
act_space = MultiDiscrete([[0, act_space.n - 1] for act_space in total_action_space])
else:
act_space = spaces.Tuple(total_action_space)
self.action_space.append(act_space)
else:
self.action_space.append(total_action_space[0])
# observation space
obs_dim = 14 # observation dimension of a single agent
share_obs_dim += obs_dim
self.observation_space.append(spaces.Box(low=-np.inf, high=+np.inf, shape=(obs_dim,), dtype=np.float32)) # [-inf,inf]
self.share_observation_space = [spaces.Box(low=-np.inf, high=+np.inf, shape=(share_obs_dim,),
dtype=np.float32) for _ in range(self.num_agent)]
def step(self, actions):
"""
Assumed dimensions of the input `actions`:
# actions shape = (5, 2, 5)
# 5 parallel environments, each with 2 agents; each agent's action is a one-hot 5-dimensional encoding
"""
results = [env.step(action) for env, action in zip(self.env_list, actions)]
obs, rews, dones, infos = zip(*results)
return np.stack(obs), np.stack(rews), np.stack(dones), infos
def reset(self):
obs = [env.reset() for env in self.env_list]
return np.stack(obs)
def close(self):
pass
def render(self, mode="rgb_array"):
pass
import time
import wandb
import os
import numpy as np
from itertools import chain
import torch
from tensorboardX import SummaryWriter
from mappo.utils.separated_buffer import SeparatedReplayBuffer
from mappo.utils.util import update_linear_schedule
def _t2n(x):
return x.detach().cpu().numpy()
class Runner(object):
def __init__(self, config):
self.all_args = config['all_args']
self.envs = config['envs']
self.eval_envs = config['eval_envs']
self.device = config['device']
self.num_agents = config['num_agents']
# parameters
self.env_name = self.all_args.env_name
self.algorithm_name = self.all_args.algorithm_name
self.experiment_name = self.all_args.experiment_name
self.use_centralized_V = self.all_args.use_centralized_V
self.use_obs_instead_of_state = self.all_args.use_obs_instead_of_state
self.num_env_steps = self.all_args.num_env_steps
self.episode_length = self.all_args.episode_length
self.n_rollout_threads = self.all_args.n_rollout_threads
self.n_eval_rollout_threads = self.all_args.n_eval_rollout_threads
self.use_linear_lr_decay = self.all_args.use_linear_lr_decay
self.hidden_size = self.all_args.hidden_size
self.use_wandb = self.all_args.use_wandb
self.use_render = self.all_args.use_render
self.recurrent_N = self.all_args.recurrent_N
# interval
self.save_interval = self.all_args.save_interval
self.use_eval = self.all_args.use_eval
self.eval_interval = self.all_args.eval_interval
self.log_interval = self.all_args.log_interval
# dir
self.model_dir = self.all_args.model_dir
if self.use_render:
import imageio
self.run_dir = config["run_dir"]
self.gif_dir = str(self.run_dir / 'gifs')
if not os.path.exists(self.gif_dir):
os.makedirs(self.gif_dir)
else:
if self.use_wandb:
self.save_dir = str(wandb.run.dir)
else:
self.run_dir = config["run_dir"]
self.log_dir = str(self.run_dir / 'logs')
if not os.path.exists(self.log_dir):
os.makedirs(self.log_dir)
self.writter = SummaryWriter(self.log_dir)
self.save_dir = str(self.run_dir / 'models')
if not os.path.exists(self.save_dir):
os.makedirs(self.save_dir)
from mappo.algorithms.algorithm.r_mappo import RMAPPO as TrainAlgo
from mappo.algorithms.algorithm.rMAPPOPolicy import RMAPPOPolicy as Policy
self.policy = []
for agent_id in range(self.num_agents):
share_observation_space = self.envs.share_observation_space[agent_id] if self.use_centralized_V else self.envs.observation_space[agent_id]
# policy network
po = Policy(self.all_args,
self.envs.observation_space[agent_id],
share_observation_space,
self.envs.action_space[agent_id],
device = self.device)
self.policy.append(po)
if self.model_dir is not None:
self.restore()
self.trainer = []
self.buffer = []
for agent_id in range(self.num_agents):
# algorithm
tr = TrainAlgo(self.all_args, self.policy[agent_id], device = self.device)
# buffer
share_observation_space = self.envs.share_observation_space[agent_id] if self.use_centralized_V else self.envs.observation_space[agent_id]
bu = SeparatedReplayBuffer(self.all_args,
self.envs.observation_space[agent_id],
share_observation_space,
self.envs.action_space[agent_id])
self.buffer.append(bu)
self.trainer.append(tr)
def run(self):
raise NotImplementedError
def warmup(self):
raise NotImplementedError
def collect(self, step):
raise NotImplementedError
def insert(self, data):
raise NotImplementedError
@torch.no_grad()
def compute(self):
for agent_id in range(self.num_agents):
self.trainer[agent_id].prep_rollout()
next_value = self.trainer[agent_id].policy.get_values(self.buffer[agent_id].share_obs[-1],
self.buffer[agent_id].rnn_states_critic[-1],
self.buffer[agent_id].masks[-1])
next_value = _t2n(next_value)
self.buffer[agent_id].compute_returns(next_value, self.trainer[agent_id].value_normalizer)
def train(self):
train_infos = []
for agent_id in range(self.num_agents):
self.trainer[agent_id].prep_training()
train_info = self.trainer[agent_id].train(self.buffer[agent_id])
train_infos.append(train_info)
self.buffer[agent_id].after_update()
return train_infos
def save(self):
for agent_id in range(self.num_agents):
policy_actor = self.trainer[agent_id].policy.actor
torch.save(policy_actor.state_dict(), str(self.save_dir) + "/actor_agent" + str(agent_id) + ".pt")
policy_critic = self.trainer[agent_id].policy.critic
torch.save(policy_critic.state_dict(), str(self.save_dir) + "/critic_agent" + str(agent_id) + ".pt")
def restore(self):
for agent_id in range(self.num_agents):
policy_actor_state_dict = torch.load(str(self.model_dir) + '/actor_agent' + str(agent_id) + '.pt')
self.policy[agent_id].actor.load_state_dict(policy_actor_state_dict)
policy_critic_state_dict = torch.load(str(self.model_dir) + '/critic_agent' + str(agent_id) + '.pt')
self.policy[agent_id].critic.load_state_dict(policy_critic_state_dict)
def log_train(self, train_infos, total_num_steps):
for agent_id in range(self.num_agents):
for k, v in train_infos[agent_id].items():
agent_k = "agent%i/" % agent_id + k
if self.use_wandb:
wandb.log({agent_k: v}, step=total_num_steps)
else:
self.writter.add_scalars(agent_k, {agent_k: v}, total_num_steps)
def log_env(self, env_infos, total_num_steps):
for k, v in env_infos.items():
if len(v) > 0:
if self.use_wandb:
wandb.log({k: np.mean(v)}, step=total_num_steps)
else:
self.writter.add_scalars(k, {k: np.mean(v)}, total_num_steps)
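# Descriptive note (added comment): in this "separated" base runner every agent
# owns its own Policy, TrainAlgo and SeparatedReplayBuffer; self.policy,
# self.trainer and self.buffer are parallel lists indexed by agent_id, which is
# why train(), save(), restore() and the logging helpers all loop over
# range(self.num_agents).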
"""
# @Time : 2021/7/1 7:14 PM
# @Author : hezhiqiang01
# @Email : hezhiqiang01@baidu.com
# @File : env_runner.py
"""
import time
import wandb
import os
import numpy as np
from itertools import chain
import torch
from mappo.utils.util import update_linear_schedule
from mappo.runner.separated.base_runner import Runner
import imageio
def _t2n(x):
return x.detach().cpu().numpy()
class EnvRunner(Runner):
def __init__(self, config):
super(EnvRunner, self).__init__(config)
def run(self):
self.warmup()
start = time.time()
episodes = int(self.num_env_steps) // self.episode_length // self.n_rollout_threads
for episode in range(episodes):
if self.use_linear_lr_decay:
for agent_id in range(self.num_agents):
self.trainer[agent_id].policy.lr_decay(episode, episodes)
for step in range(self.episode_length):
# Sample actions
values, actions, action_log_probs, rnn_states, rnn_states_critic, actions_env = self.collect(step)
# Observe reward and next obs
obs, rewards, dones, infos = self.envs.step(actions_env)
data = obs, rewards, dones, infos, values, actions, action_log_probs, rnn_states, rnn_states_critic
# insert data into buffer
self.insert(data)
# compute return and update network
self.compute()
train_infos = self.train()
# post process
total_num_steps = (episode + 1) * self.episode_length * self.n_rollout_threads
# save model
if (episode % self.save_interval == 0 or episode == episodes - 1):
self.save()
# log information
if episode % self.log_interval == 0:
end = time.time()
print("\n Scenario {} Algo {} Exp {} updates {}/{} episodes, total num timesteps {}/{}, FPS {}.\n"
.format(self.all_args.scenario_name,
self.algorithm_name,
self.experiment_name,
episode,
episodes,
total_num_steps,
self.num_env_steps,
int(total_num_steps / (end - start))))
if self.env_name == "MPE":
for agent_id in range(self.num_agents):
idv_rews = []
for info in infos:
if 'individual_reward' in info[agent_id].keys():
idv_rews.append(info[agent_id]['individual_reward'])
train_infos[agent_id].update({'individual_rewards': np.mean(idv_rews)})
train_infos[agent_id].update(
{"average_episode_rewards": np.mean(self.buffer[agent_id].rewards) * self.episode_length})
self.log_train(train_infos, total_num_steps)
# eval
if episode % self.eval_interval == 0 and self.use_eval:
self.eval(total_num_steps)
def warmup(self):
# reset env
obs = self.envs.reset()
share_obs = []
for o in obs:
share_obs.append(list(chain(*o)))
share_obs = np.array(share_obs)
for agent_id in range(self.num_agents):
if not self.use_centralized_V:
share_obs = np.array(list(obs[:, agent_id]))
self.buffer[agent_id].share_obs[0] = share_obs.copy()
self.buffer[agent_id].obs[0] = np.array(list(obs[:, agent_id])).copy()
@torch.no_grad()
def collect(self, step):
values = []
actions = []
temp_actions_env = []
action_log_probs = []
rnn_states = []
rnn_states_critic = []
for agent_id in range(self.num_agents):
self.trainer[agent_id].prep_rollout()
value, action, action_log_prob, rnn_state, rnn_state_critic \
= self.trainer[agent_id].policy.get_actions(self.buffer[agent_id].share_obs[step],
self.buffer[agent_id].obs[step],
self.buffer[agent_id].rnn_states[step],
self.buffer[agent_id].rnn_states_critic[step],
self.buffer[agent_id].masks[step])
# [agents, envs, dim]
values.append(_t2n(value))
action = _t2n(action)
# rearrange action
if self.envs.action_space[agent_id].__class__.__name__ == 'MultiDiscrete':
for i in range(self.envs.action_space[agent_id].shape):
uc_action_env = np.eye(self.envs.action_space[agent_id].high[i] + 1)[action[:, i]]
if i == 0:
action_env = uc_action_env
else:
action_env = np.concatenate((action_env, uc_action_env), axis=1)
elif self.envs.action_space[agent_id].__class__.__name__ == 'Discrete':
action_env = np.squeeze(np.eye(self.envs.action_space[agent_id].n)[action], 1)
else:
raise NotImplementedError
actions.append(action)
temp_actions_env.append(action_env)
action_log_probs.append(_t2n(action_log_prob))
rnn_states.append(_t2n(rnn_state))
rnn_states_critic.append(_t2n(rnn_state_critic))
# [envs, agents, dim]
actions_env = []
for i in range(self.n_rollout_threads):
one_hot_action_env = []
for temp_action_env in temp_actions_env:
one_hot_action_env.append(temp_action_env[i])
actions_env.append(one_hot_action_env)
values = np.array(values).transpose(1, 0, 2)
actions = np.array(actions).transpose(1, 0, 2)
action_log_probs = np.array(action_log_probs).transpose(1, 0, 2)
rnn_states = np.array(rnn_states).transpose(1, 0, 2, 3)
rnn_states_critic = np.array(rnn_states_critic).transpose(1, 0, 2, 3)
return values, actions, action_log_probs, rnn_states, rnn_states_critic, actions_env
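# Descriptive note (added comment): the "rearrange action" block above converts
# the integer action indices sampled by the policy into the one-hot encoding the
# environment step expects. For a Discrete(5) space, np.eye(5)[3] gives
# [0, 0, 0, 1, 0]; for MultiDiscrete, the per-dimension one-hot blocks are
# concatenated along the last axis.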
def insert(self, data):
obs, rewards, dones, infos, values, actions, action_log_probs, rnn_states, rnn_states_critic = data
rnn_states[dones == True] = np.zeros(((dones == True).sum(), self.recurrent_N, self.hidden_size),
dtype=np.float32)
rnn_states_critic[dones == True] = np.zeros(((dones == True).sum(), self.recurrent_N, self.hidden_size),
dtype=np.float32)
masks = np.ones((self.n_rollout_threads, self.num_agents, 1), dtype=np.float32)
masks[dones == True] = np.zeros(((dones == True).sum(), 1), dtype=np.float32)
share_obs = []
for o in obs:
share_obs.append(list(chain(*o)))
share_obs = np.array(share_obs)
for agent_id in range(self.num_agents):
if not self.use_centralized_V:
share_obs = np.array(list(obs[:, agent_id]))
self.buffer[agent_id].insert(share_obs,
np.array(list(obs[:, agent_id])),
rnn_states[:, agent_id],
rnn_states_critic[:, agent_id],
actions[:, agent_id],
action_log_probs[:, agent_id],
values[:, agent_id],
rewards[:, agent_id],
masks[:, agent_id])
@torch.no_grad()
def eval(self, total_num_steps):
eval_episode_rewards = []
eval_obs = self.eval_envs.reset()
eval_rnn_states = np.zeros((self.n_eval_rollout_threads, self.num_agents, self.recurrent_N, self.hidden_size),
dtype=np.float32)
eval_masks = np.ones((self.n_eval_rollout_threads, self.num_agents, 1), dtype=np.float32)
for eval_step in range(self.episode_length):
eval_temp_actions_env = []
for agent_id in range(self.num_agents):
self.trainer[agent_id].prep_rollout()
eval_action, eval_rnn_state = self.trainer[agent_id].policy.act(np.array(list(eval_obs[:, agent_id])),
eval_rnn_states[:, agent_id],
eval_masks[:, agent_id],
deterministic=True)
eval_action = eval_action.detach().cpu().numpy()
# rearrange action
if self.eval_envs.action_space[agent_id].__class__.__name__ == 'MultiDiscrete':
for i in range(self.eval_envs.action_space[agent_id].shape):
eval_uc_action_env = np.eye(self.eval_envs.action_space[agent_id].high[i] + 1)[
eval_action[:, i]]
if i == 0:
eval_action_env = eval_uc_action_env
else:
eval_action_env = np.concatenate((eval_action_env, eval_uc_action_env), axis=1)
elif self.eval_envs.action_space[agent_id].__class__.__name__ == 'Discrete':
eval_action_env = np.squeeze(np.eye(self.eval_envs.action_space[agent_id].n)[eval_action], 1)
else:
raise NotImplementedError
eval_temp_actions_env.append(eval_action_env)
eval_rnn_states[:, agent_id] = _t2n(eval_rnn_state)
# [envs, agents, dim]
eval_actions_env = []
for i in range(self.n_eval_rollout_threads):
eval_one_hot_action_env = []
for eval_temp_action_env in eval_temp_actions_env:
eval_one_hot_action_env.append(eval_temp_action_env[i])
eval_actions_env.append(eval_one_hot_action_env)
# Observe reward and next obs
eval_obs, eval_rewards, eval_dones, eval_infos = self.eval_envs.step(eval_actions_env)
eval_episode_rewards.append(eval_rewards)
eval_rnn_states[eval_dones == True] = np.zeros(
((eval_dones == True).sum(), self.recurrent_N, self.hidden_size), dtype=np.float32)
eval_masks = np.ones((self.n_eval_rollout_threads, self.num_agents, 1), dtype=np.float32)
eval_masks[eval_dones == True] = np.zeros(((eval_dones == True).sum(), 1), dtype=np.float32)
eval_episode_rewards = np.array(eval_episode_rewards)
eval_train_infos = []
for agent_id in range(self.num_agents):
eval_average_episode_rewards = np.mean(np.sum(eval_episode_rewards[:, :, agent_id], axis=0))
eval_train_infos.append({'eval_average_episode_rewards': eval_average_episode_rewards})
print("eval average episode rewards of agent%i: " % agent_id + str(eval_average_episode_rewards))
self.log_train(eval_train_infos, total_num_steps)
@torch.no_grad()
def render(self):
all_frames = []
for episode in range(self.all_args.render_episodes):
episode_rewards = []
obs = self.envs.reset()
if self.all_args.save_gifs:
image = self.envs.render('rgb_array')[0][0]
all_frames.append(image)
rnn_states = np.zeros((self.n_rollout_threads, self.num_agents, self.recurrent_N, self.hidden_size),
dtype=np.float32)
masks = np.ones((self.n_rollout_threads, self.num_agents, 1), dtype=np.float32)
for step in range(self.episode_length):
calc_start = time.time()
temp_actions_env = []
for agent_id in range(self.num_agents):
if not self.use_centralized_V:
share_obs = np.array(list(obs[:, agent_id]))
self.trainer[agent_id].prep_rollout()
action, rnn_state = self.trainer[agent_id].policy.act(np.array(list(obs[:, agent_id])),
rnn_states[:, agent_id],
masks[:, agent_id],
deterministic=True)
action = action.detach().cpu().numpy()
# rearrange action
if self.envs.action_space[agent_id].__class__.__name__ == 'MultiDiscrete':
for i in range(self.envs.action_space[agent_id].shape):
uc_action_env = np.eye(self.envs.action_space[agent_id].high[i] + 1)[action[:, i]]
if i == 0:
action_env = uc_action_env
else:
action_env = np.concatenate((action_env, uc_action_env), axis=1)
elif self.envs.action_space[agent_id].__class__.__name__ == 'Discrete':
action_env = np.squeeze(np.eye(self.envs.action_space[agent_id].n)[action], 1)
else:
raise NotImplementedError
temp_actions_env.append(action_env)
rnn_states[:, agent_id] = _t2n(rnn_state)
# [envs, agents, dim]
actions_env = []
for i in range(self.n_rollout_threads):
one_hot_action_env = []
for temp_action_env in temp_actions_env:
one_hot_action_env.append(temp_action_env[i])
actions_env.append(one_hot_action_env)
# Observe reward and next obs
obs, rewards, dones, infos = self.envs.step(actions_env)
episode_rewards.append(rewards)
rnn_states[dones == True] = np.zeros(((dones == True).sum(), self.recurrent_N, self.hidden_size),
dtype=np.float32)
masks = np.ones((self.n_rollout_threads, self.num_agents, 1), dtype=np.float32)
masks[dones == True] = np.zeros(((dones == True).sum(), 1), dtype=np.float32)
if self.all_args.save_gifs:
image = self.envs.render('rgb_array')[0][0]
all_frames.append(image)
calc_end = time.time()
elapsed = calc_end - calc_start
if elapsed < self.all_args.ifi:
time.sleep(self.all_args.ifi - elapsed)
episode_rewards = np.array(episode_rewards)
for agent_id in range(self.num_agents):
average_episode_rewards = np.mean(np.sum(episode_rewards[:, :, agent_id], axis=0))
print("eval average episode rewards of agent%i: " % agent_id + str(average_episode_rewards))
if self.all_args.save_gifs:
imageio.mimsave(str(self.gif_dir) + '/render.gif', all_frames, duration=self.all_args.ifi)
\ No newline at end of file
import wandb
import os
import numpy as np
import torch
from tensorboardX import SummaryWriter
from mappo.utils.shared_buffer import SharedReplayBuffer
def _t2n(x):
"""Convert torch tensor to a numpy array."""
return x.detach().cpu().numpy()
class Runner(object):
"""
Base class for training recurrent policies.
:param config: (dict) Config dictionary containing parameters for training.
"""
def __init__(self, config):
self.all_args = config['all_args']
self.envs = config['envs']
self.eval_envs = config['eval_envs']
self.device = config['device']
self.num_agents = config['num_agents']
if config.__contains__("render_envs"):
self.render_envs = config['render_envs']
# parameters
self.env_name = self.all_args.env_name
self.algorithm_name = self.all_args.algorithm_name
self.experiment_name = self.all_args.experiment_name
self.use_centralized_V = self.all_args.use_centralized_V
self.use_obs_instead_of_state = self.all_args.use_obs_instead_of_state
self.num_env_steps = self.all_args.num_env_steps
self.episode_length = self.all_args.episode_length
self.n_rollout_threads = self.all_args.n_rollout_threads
self.n_eval_rollout_threads = self.all_args.n_eval_rollout_threads
self.n_render_rollout_threads = self.all_args.n_render_rollout_threads
self.use_linear_lr_decay = self.all_args.use_linear_lr_decay
self.hidden_size = self.all_args.hidden_size
self.use_wandb = self.all_args.use_wandb
self.use_render = self.all_args.use_render
self.recurrent_N = self.all_args.recurrent_N
# interval
self.save_interval = self.all_args.save_interval
self.use_eval = self.all_args.use_eval
self.eval_interval = self.all_args.eval_interval
self.log_interval = self.all_args.log_interval
# dir
self.model_dir = self.all_args.model_dir
if self.use_wandb:
self.save_dir = str(wandb.run.dir)
self.run_dir = str(wandb.run.dir)
else:
self.run_dir = config["run_dir"]
self.log_dir = str(self.run_dir / 'logs')
if not os.path.exists(self.log_dir):
os.makedirs(self.log_dir)
self.writter = SummaryWriter(self.log_dir)
self.save_dir = str(self.run_dir / 'models')
if not os.path.exists(self.save_dir):
os.makedirs(self.save_dir)
from mappo.algorithms.algorithm.r_mappo import RMAPPO as TrainAlgo
from mappo.algorithms.algorithm.rMAPPOPolicy import RMAPPOPolicy as Policy
share_observation_space = self.envs.share_observation_space[0] if self.use_centralized_V else self.envs.observation_space[0]
# policy network
self.policy = Policy(self.all_args,
self.envs.observation_space[0],
share_observation_space,
self.envs.action_space[0],
device = self.device)
if self.model_dir is not None:
self.restore()
# algorithm
self.trainer = TrainAlgo(self.all_args, self.policy, device = self.device)
# buffer
self.buffer = SharedReplayBuffer(self.all_args,
self.num_agents,
self.envs.observation_space[0],
share_observation_space,
self.envs.action_space[0])
def run(self):
"""Collect training data, perform training updates, and evaluate policy."""
raise NotImplementedError
def warmup(self):
"""Collect warmup pre-training data."""
raise NotImplementedError
def collect(self, step):
"""Collect rollouts for training."""
raise NotImplementedError
def insert(self, data):
"""
Insert data into buffer.
:param data: (Tuple) data to insert into training buffer.
"""
raise NotImplementedError
@torch.no_grad()
def compute(self):
"""Calculate returns for the collected data."""
self.trainer.prep_rollout()
next_values = self.trainer.policy.get_values(np.concatenate(self.buffer.share_obs[-1]),
np.concatenate(self.buffer.rnn_states_critic[-1]),
np.concatenate(self.buffer.masks[-1]))
next_values = np.array(np.split(_t2n(next_values), self.n_rollout_threads))
self.buffer.compute_returns(next_values, self.trainer.value_normalizer)
def train(self):
"""Train policies with data in buffer. """
self.trainer.prep_training()
train_infos = self.trainer.train(self.buffer)
self.buffer.after_update()
return train_infos
def save(self):
"""Save policy's actor and critic networks."""
policy_actor = self.trainer.policy.actor
torch.save(policy_actor.state_dict(), str(self.save_dir) + "/actor.pt")
policy_critic = self.trainer.policy.critic
torch.save(policy_critic.state_dict(), str(self.save_dir) + "/critic.pt")
def restore(self):
"""Restore policy's networks from a saved model."""
policy_actor_state_dict = torch.load(str(self.model_dir) + '/actor.pt')
self.policy.actor.load_state_dict(policy_actor_state_dict)
if not self.all_args.use_render:
policy_critic_state_dict = torch.load(str(self.model_dir) + '/critic.pt')
self.policy.critic.load_state_dict(policy_critic_state_dict)
def log_train(self, train_infos, total_num_steps):
"""
Log training info.
:param train_infos: (dict) information about training update.
:param total_num_steps: (int) total number of training env steps.
"""
for k, v in train_infos.items():
if self.use_wandb:
wandb.log({k: v}, step=total_num_steps)
else:
self.writter.add_scalars(k, {k: v}, total_num_steps)
def log_env(self, env_infos, total_num_steps):
"""
Log env info.
:param env_infos: (dict) information about env state.
:param total_num_steps: (int) total number of training env steps.
"""
for k, v in env_infos.items():
if len(v)>0:
if self.use_wandb:
wandb.log({k: np.mean(v)}, step=total_num_steps)
else:
self.writter.add_scalars(k, {k: np.mean(v)}, total_num_steps)
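# Descriptive note (added comment): unlike the separated runner above, this
# shared-policy base runner keeps a single Policy, a single TrainAlgo and one
# SharedReplayBuffer covering all agents, so the EnvRunner below concatenates
# data across agents (np.concatenate) before each forward pass and splits it
# back per rollout thread afterwards (np.split).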
"""
# @Time : 2021/7/1 7:15 PM
# @Author : hezhiqiang01
# @Email : hezhiqiang01@baidu.com
# @File : env_runner.py
"""
"""
# @Time : 2021/7/1 7:04 PM
# @Author : hezhiqiang01
# @Email : hezhiqiang01@baidu.com
# @File : huaru_runner.py
"""
import time
import numpy as np
import torch
from mappo.runner.shared.base_runner import Runner
import wandb
import imageio
def _t2n(x):
return x.detach().cpu().numpy()
class EnvRunner(Runner):
"""Runner class to perform training, evaluation. and data collection for the MPEs. See parent class for details."""
def __init__(self, config):
super(EnvRunner, self).__init__(config)
def run(self):
self.warmup()
start = time.time()
episodes = int(self.num_env_steps) // self.episode_length // self.n_rollout_threads
for episode in range(episodes):
if self.use_linear_lr_decay:
self.trainer.policy.lr_decay(episode, episodes)
for step in range(self.episode_length):
# Sample actions
values, actions, action_log_probs, rnn_states, rnn_states_critic, actions_env = self.collect(step)
# Observe reward and next obs
obs, rewards, dones, infos = self.envs.step(actions_env)
data = obs, rewards, dones, infos, values, actions, action_log_probs, rnn_states, rnn_states_critic
# insert data into buffer
self.insert(data)
# compute return and update network
self.compute()
train_infos = self.train()
# post process
total_num_steps = (episode + 1) * self.episode_length * self.n_rollout_threads
# save model
if (episode % self.save_interval == 0 or episode == episodes - 1):
self.save()
# log information
if episode % self.log_interval == 0:
end = time.time()
print("\n Scenario {} Algo {} Exp {} updates {}/{} episodes, total num timesteps {}/{}, FPS {}.\n"
.format(self.all_args.scenario_name,
self.algorithm_name,
self.experiment_name,
episode,
episodes,
total_num_steps,
self.num_env_steps,
int(total_num_steps / (end - start))))
# if self.env_name == "MPE":
# env_infos = {}
# for agent_id in range(self.num_agents):
# idv_rews = []
# for info in infos:
# if 'individual_reward' in info[agent_id].keys():
# idv_rews.append(info[agent_id]['individual_reward'])
# agent_k = 'agent%i/individual_rewards' % agent_id
# env_infos[agent_k] = idv_rews
train_infos["average_episode_rewards"] = np.mean(self.buffer.rewards) * self.episode_length
print("average episode rewards is {}".format(train_infos["average_episode_rewards"]))
self.log_train(train_infos, total_num_steps)
# self.log_env(env_infos, total_num_steps)
# eval
if episode % self.eval_interval == 0 and self.use_eval:
self.eval(total_num_steps)
def warmup(self):
# reset env
obs = self.envs.reset() # shape = (5, 2, 14)
# replay buffer
if self.use_centralized_V:
share_obs = obs.reshape(self.n_rollout_threads, -1) # shape = (5, 28)
share_obs = np.expand_dims(share_obs, 1).repeat(self.num_agents, axis=1) # shape = (5, 2, 28)
else:
share_obs = obs
self.buffer.share_obs[0] = share_obs.copy()
self.buffer.obs[0] = obs.copy()
@torch.no_grad()
def collect(self, step):
self.trainer.prep_rollout()
value, action, action_log_prob, rnn_states, rnn_states_critic \
= self.trainer.policy.get_actions(np.concatenate(self.buffer.share_obs[step]),
np.concatenate(self.buffer.obs[step]),
np.concatenate(self.buffer.rnn_states[step]),
np.concatenate(self.buffer.rnn_states_critic[step]),
np.concatenate(self.buffer.masks[step]))
# [self.envs, agents, dim]
values = np.array(np.split(_t2n(value), self.n_rollout_threads))
actions = np.array(np.split(_t2n(action), self.n_rollout_threads))
action_log_probs = np.array(np.split(_t2n(action_log_prob), self.n_rollout_threads))
rnn_states = np.array(np.split(_t2n(rnn_states), self.n_rollout_threads))
rnn_states_critic = np.array(np.split(_t2n(rnn_states_critic), self.n_rollout_threads))
# rearrange action
if self.envs.action_space[0].__class__.__name__ == 'MultiDiscrete':
for i in range(self.envs.action_space[0].shape):
uc_actions_env = np.eye(self.envs.action_space[0].high[i] + 1)[actions[:, :, i]]
if i == 0:
actions_env = uc_actions_env
else:
actions_env = np.concatenate((actions_env, uc_actions_env), axis=2)
elif self.envs.action_space[0].__class__.__name__ == 'Discrete':
actions_env = np.squeeze(np.eye(self.envs.action_space[0].n)[actions], 2)
else:
raise NotImplementedError
return values, actions, action_log_probs, rnn_states, rnn_states_critic, actions_env
def insert(self, data):
obs, rewards, dones, infos, values, actions, action_log_probs, rnn_states, rnn_states_critic = data
rnn_states[dones == True] = np.zeros(((dones == True).sum(), self.recurrent_N, self.hidden_size),
dtype=np.float32)
rnn_states_critic[dones == True] = np.zeros(((dones == True).sum(), *self.buffer.rnn_states_critic.shape[3:]),
dtype=np.float32)
masks = np.ones((self.n_rollout_threads, self.num_agents, 1), dtype=np.float32)
masks[dones == True] = np.zeros(((dones == True).sum(), 1), dtype=np.float32)
if self.use_centralized_V:
share_obs = obs.reshape(self.n_rollout_threads, -1)
share_obs = np.expand_dims(share_obs, 1).repeat(self.num_agents, axis=1)
else:
share_obs = obs
self.buffer.insert(share_obs, obs, rnn_states, rnn_states_critic, actions, action_log_probs, values, rewards,
masks)
@torch.no_grad()
def eval(self, total_num_steps):
eval_episode_rewards = []
eval_obs = self.eval_envs.reset()
eval_rnn_states = np.zeros((self.n_eval_rollout_threads, *self.buffer.rnn_states.shape[2:]), dtype=np.float32)
eval_masks = np.ones((self.n_eval_rollout_threads, self.num_agents, 1), dtype=np.float32)
for eval_step in range(self.episode_length):
self.trainer.prep_rollout()
eval_action, eval_rnn_states = self.trainer.policy.act(np.concatenate(eval_obs),
np.concatenate(eval_rnn_states),
np.concatenate(eval_masks),
deterministic=True)
eval_actions = np.array(np.split(_t2n(eval_action), self.n_eval_rollout_threads))
eval_rnn_states = np.array(np.split(_t2n(eval_rnn_states), self.n_eval_rollout_threads))
if self.eval_envs.action_space[0].__class__.__name__ == 'MultiDiscrete':
for i in range(self.eval_envs.action_space[0].shape):
eval_uc_actions_env = np.eye(self.eval_envs.action_space[0].high[i] + 1)[eval_actions[:, :, i]]
if i == 0:
eval_actions_env = eval_uc_actions_env
else:
eval_actions_env = np.concatenate((eval_actions_env, eval_uc_actions_env), axis=2)
elif self.eval_envs.action_space[0].__class__.__name__ == 'Discrete':
eval_actions_env = np.squeeze(np.eye(self.eval_envs.action_space[0].n)[eval_actions], 2)
else:
raise NotImplementedError
# Observe reward and next obs
eval_obs, eval_rewards, eval_dones, eval_infos = self.eval_envs.step(eval_actions_env)
eval_episode_rewards.append(eval_rewards)
eval_rnn_states[eval_dones == True] = np.zeros(
((eval_dones == True).sum(), self.recurrent_N, self.hidden_size), dtype=np.float32)
eval_masks = np.ones((self.n_eval_rollout_threads, self.num_agents, 1), dtype=np.float32)
eval_masks[eval_dones == True] = np.zeros(((eval_dones == True).sum(), 1), dtype=np.float32)
eval_episode_rewards = np.array(eval_episode_rewards)
eval_env_infos = {}
eval_env_infos['eval_average_episode_rewards'] = np.sum(np.array(eval_episode_rewards), axis=0)
eval_average_episode_rewards = np.mean(eval_env_infos['eval_average_episode_rewards'])
print("eval average episode rewards of agent: " + str(eval_average_episode_rewards))
self.log_env(eval_env_infos, total_num_steps)
@torch.no_grad()
def render(self):
"""Visualize the env."""
envs = self.envs
all_frames = []
for episode in range(self.all_args.render_episodes):
obs = envs.reset()
if self.all_args.save_gifs:
image = envs.render('rgb_array')[0][0]
all_frames.append(image)
else:
envs.render('human')
rnn_states = np.zeros((self.n_rollout_threads, self.num_agents, self.recurrent_N, self.hidden_size),
dtype=np.float32)
masks = np.ones((self.n_rollout_threads, self.num_agents, 1), dtype=np.float32)
episode_rewards = []
for step in range(self.episode_length):
calc_start = time.time()
self.trainer.prep_rollout()
action, rnn_states = self.trainer.policy.act(np.concatenate(obs),
np.concatenate(rnn_states),
np.concatenate(masks),
deterministic=True)
actions = np.array(np.split(_t2n(action), self.n_rollout_threads))
rnn_states = np.array(np.split(_t2n(rnn_states), self.n_rollout_threads))
if envs.action_space[0].__class__.__name__ == 'MultiDiscrete':
for i in range(envs.action_space[0].shape):
uc_actions_env = np.eye(envs.action_space[0].high[i] + 1)[actions[:, :, i]]
if i == 0:
actions_env = uc_actions_env
else:
actions_env = np.concatenate((actions_env, uc_actions_env), axis=2)
elif envs.action_space[0].__class__.__name__ == 'Discrete':
actions_env = np.squeeze(np.eye(envs.action_space[0].n)[actions], 2)
else:
raise NotImplementedError
# Observe reward and next obs
obs, rewards, dones, infos = envs.step(actions_env)
episode_rewards.append(rewards)
rnn_states[dones == True] = np.zeros(((dones == True).sum(), self.recurrent_N, self.hidden_size),
dtype=np.float32)
masks = np.ones((self.n_rollout_threads, self.num_agents, 1), dtype=np.float32)
masks[dones == True] = np.zeros(((dones == True).sum(), 1), dtype=np.float32)
if self.all_args.save_gifs:
image = envs.render('rgb_array')[0][0]
all_frames.append(image)
calc_end = time.time()
elapsed = calc_end - calc_start
if elapsed < self.all_args.ifi:
time.sleep(self.all_args.ifi - elapsed)
else:
envs.render('human')
print("average episode rewards is: " + str(np.mean(np.sum(np.array(episode_rewards), axis=0))))
if self.all_args.save_gifs:
imageio.mimsave(str(self.gif_dir) + '/render.gif', all_frames, duration=self.all_args.ifi)
#!/usr/bin/env python
import sys
import os
import wandb
import socket
import setproctitle
import numpy as np
from pathlib import Path
import torch
from mappo.config import get_config
from mappo.envs.mpe.MPE_env import MPEEnv
from mappo.envs.env_wrappers import SubprocVecEnv, DummyVecEnv
def make_render_env(all_args):
def get_env_fn(rank):
def init_env():
if all_args.env_name == "MPE":
env = MPEEnv(all_args)
else:
print("Can not support the " +
all_args.env_name + "environment.")
raise NotImplementedError
env.seed(all_args.seed + rank * 1000)
return env
return init_env
if all_args.n_rollout_threads == 1:
return DummyVecEnv([get_env_fn(0)])
else:
return SubprocVecEnv([get_env_fn(i) for i in range(all_args.n_rollout_threads)])
def parse_args(args, parser):
parser.add_argument('--scenario_name', type=str,
default='simple_spread', help="Which scenario to run on")
parser.add_argument("--num_landmarks", type=int, default=3)
parser.add_argument('--num_agents', type=int,
default=2, help="number of players")
all_args = parser.parse_known_args(args)[0]
return all_args
def main(args):
parser = get_config()
all_args = parse_args(args, parser)
if all_args.algorithm_name == "rmappo" or all_args.algorithm_name == "rmappg":
assert (
all_args.use_recurrent_policy or all_args.use_naive_recurrent_policy), ("check recurrent policy!")
elif all_args.algorithm_name == "mappo" or all_args.algorithm_name == "mappg":
assert (all_args.use_recurrent_policy and all_args.use_naive_recurrent_policy) == False, (
"check recurrent policy!")
else:
raise NotImplementedError
assert (all_args.share_policy == True and all_args.scenario_name == 'simple_speaker_listener') == False, (
"The simple_speaker_listener scenario cannot use a shared policy. Please check the config.py.")
assert all_args.use_render, ("u need to set use_render be True")
assert not (all_args.model_dir == None or all_args.model_dir == ""), ("set model_dir first")
assert all_args.n_rollout_threads==1, ("only support to use 1 env to render.")
# cuda
if all_args.cuda and torch.cuda.is_available():
print("choose to use gpu...")
device = torch.device("cuda:0")
torch.set_num_threads(all_args.n_training_threads)
if all_args.cuda_deterministic:
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
else:
print("choose to use cpu...")
device = torch.device("cpu")
torch.set_num_threads(all_args.n_training_threads)
# run dir
run_dir = Path(os.path.split(os.path.dirname(os.path.abspath(__file__)))[0] + "/results") / all_args.env_name / all_args.scenario_name / all_args.algorithm_name / all_args.experiment_name
if not run_dir.exists():
os.makedirs(str(run_dir))
if not run_dir.exists():
curr_run = 'run1'
else:
exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in run_dir.iterdir() if str(folder.name).startswith('run')]
if len(exst_run_nums) == 0:
curr_run = 'run1'
else:
curr_run = 'run%i' % (max(exst_run_nums) + 1)
run_dir = run_dir / curr_run
if not run_dir.exists():
os.makedirs(str(run_dir))
setproctitle.setproctitle(str(all_args.algorithm_name) + "-" + \
str(all_args.env_name) + "-" + str(all_args.experiment_name) + "@" + str(all_args.user_name))
# seed
torch.manual_seed(all_args.seed)
torch.cuda.manual_seed_all(all_args.seed)
np.random.seed(all_args.seed)
# env init
envs = make_render_env(all_args)
eval_envs = None
num_agents = all_args.num_agents
config = {
"all_args": all_args,
"envs": envs,
"eval_envs": eval_envs,
"num_agents": num_agents,
"device": device,
"run_dir": run_dir
}
# run experiments
if all_args.share_policy:
from mappo.runner.shared.env_runner import EnvRunner as Runner
else:
from mappo.runner.separated.env_runner import EnvRunner as Runner
runner = Runner(config)
runner.render()
# post process
envs.close()
if __name__ == "__main__":
main(sys.argv[1:])
"""
# @Time : 2021/6/30 10:07 PM
# @Author : hezhiqiang01
# @Email : hezhiqiang01@baidu.com
# @File : train.py
"""
# !/usr/bin/env python
import sys
import os
import wandb
import socket
import setproctitle
import numpy as np
from pathlib import Path
import torch
from mappo.config import get_config
from mappo.envs.env_wrappers import SubprocVecEnv, DummyVecEnv
"""Train script for MPEs."""
def make_train_env(all_args):
return SubprocVecEnv(all_args)
def make_eval_env(all_args):
return DummyVecEnv(all_args)
def parse_args(args, parser):
parser.add_argument('--scenario_name', type=str, default='MyEnv', help="Which scenario to run on")
parser.add_argument("--num_landmarks", type=int, default=3)
parser.add_argument('--num_agents', type=int, default=2, help="number of players")
all_args = parser.parse_known_args(args)[0]
return all_args
def main(args):
parser = get_config()
all_args = parse_args(args, parser)
if all_args.algorithm_name == "rmappo":
assert (all_args.use_recurrent_policy or all_args.use_naive_recurrent_policy), ("check recurrent policy!")
elif all_args.algorithm_name == "mappo":
assert (all_args.use_recurrent_policy == False and all_args.use_naive_recurrent_policy == False), (
"check recurrent policy!")
else:
raise NotImplementedError
assert (all_args.share_policy == True and all_args.scenario_name == 'simple_speaker_listener') == False, (
"The simple_speaker_listener scenario cannot use a shared policy. Please check the config.py.")
# cuda
if all_args.cuda and torch.cuda.is_available():
print("choose to use gpu...")
device = torch.device("cuda:0")
torch.set_num_threads(all_args.n_training_threads)
if all_args.cuda_deterministic:
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
else:
print("choose to use cpu...")
device = torch.device("cpu")
torch.set_num_threads(all_args.n_training_threads)
# run dir
run_dir = Path(os.path.split(os.path.dirname(os.path.abspath(__file__)))[
0] + "/results") / all_args.env_name / all_args.scenario_name / all_args.algorithm_name / all_args.experiment_name
if not run_dir.exists():
os.makedirs(str(run_dir))
# wandb
if all_args.use_wandb:
run = wandb.init(config=all_args,
project=all_args.env_name,
entity=all_args.user_name,
notes=socket.gethostname(),
name=str(all_args.algorithm_name) + "_" +
str(all_args.experiment_name) +
"_seed" + str(all_args.seed),
group=all_args.scenario_name,
dir=str(run_dir),
job_type="training",
reinit=True)
else:
if not run_dir.exists():
curr_run = 'run1'
else:
exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in run_dir.iterdir() if
str(folder.name).startswith('run')]
if len(exst_run_nums) == 0:
curr_run = 'run1'
else:
curr_run = 'run%i' % (max(exst_run_nums) + 1)
run_dir = run_dir / curr_run
if not run_dir.exists():
os.makedirs(str(run_dir))
setproctitle.setproctitle(str(all_args.algorithm_name) + "-" + \
str(all_args.env_name) + "-" + str(all_args.experiment_name) + "@" + str(
all_args.user_name))
# seed
torch.manual_seed(all_args.seed)
torch.cuda.manual_seed_all(all_args.seed)
np.random.seed(all_args.seed)
# env init
envs = make_train_env(all_args)
eval_envs = make_eval_env(all_args) if all_args.use_eval else None
num_agents = all_args.num_agents
config = {
"all_args": all_args,
"envs": envs,
"eval_envs": eval_envs,
"num_agents": num_agents,
"device": device,
"run_dir": run_dir
}
# run experiments
if all_args.share_policy:
from mappo.runner.shared.env_runner import EnvRunner as Runner
else:
from mappo.runner.separated.env_runner import EnvRunner as Runner
runner = Runner(config)
runner.run()
# post process
envs.close()
if all_args.use_eval and eval_envs is not envs:
eval_envs.close()
if all_args.use_wandb:
run.finish()
else:
runner.writter.export_scalars_to_json(str(runner.log_dir + '/summary.json'))
runner.writter.close()
if __name__ == "__main__":
main(sys.argv[1:])
#!/bin/sh
env="MPE"
scenario="simple_spread" # simple_speaker_listener # simple_reference
num_landmarks=3
num_agents=3
algo="rmappo"
exp="check"
seed_max=1
echo "env is ${env}, scenario is ${scenario}, algo is ${algo}, exp is ${exp}, max seed is ${seed_max}"
for seed in `seq ${seed_max}`;
do
echo "seed is ${seed}:"
CUDA_VISIBLE_DEVICES=0 python train/train_mpe.py --use_valuenorm --use_popart --env_name ${env} --algorithm_name ${algo} --experiment_name ${exp} --scenario_name ${scenario} --num_agents ${num_agents} --num_landmarks ${num_landmarks} --seed ${seed} --n_training_threads 1 --n_rollout_threads 128 --num_mini_batch 1 --episode_length 25 --num_env_steps 20000000 --ppo_epoch 10 --use_ReLU --gain 0.01 --lr 7e-4 --critic_lr 7e-4 --wandb_name "zoeyuchao" --user_name "zoeyuchao"
done
\ No newline at end of file
import torch
import numpy as np
from collections import defaultdict
from mappo.utils.util import check, get_shape_from_obs_space, get_shape_from_act_space
def _flatten(T, N, x):
return x.reshape(T * N, *x.shape[2:])
def _cast(x):
return x.transpose(1,0,2).reshape(-1, *x.shape[2:])
class SeparatedReplayBuffer(object):
def __init__(self, args, obs_space, share_obs_space, act_space):
self.episode_length = args.episode_length
self.n_rollout_threads = args.n_rollout_threads
self.rnn_hidden_size = args.hidden_size
self.recurrent_N = args.recurrent_N
self.gamma = args.gamma
self.gae_lambda = args.gae_lambda
self._use_gae = args.use_gae
self._use_popart = args.use_popart
self._use_valuenorm = args.use_valuenorm
self._use_proper_time_limits = args.use_proper_time_limits
obs_shape = get_shape_from_obs_space(obs_space)
share_obs_shape = get_shape_from_obs_space(share_obs_space)
if type(obs_shape[-1]) == list:
obs_shape = obs_shape[:1]
if type(share_obs_shape[-1]) == list:
share_obs_shape = share_obs_shape[:1]
self.share_obs = np.zeros((self.episode_length + 1, self.n_rollout_threads, *share_obs_shape), dtype=np.float32)
self.obs = np.zeros((self.episode_length + 1, self.n_rollout_threads, *obs_shape), dtype=np.float32)
self.rnn_states = np.zeros((self.episode_length + 1, self.n_rollout_threads, self.recurrent_N, self.rnn_hidden_size), dtype=np.float32)
self.rnn_states_critic = np.zeros_like(self.rnn_states)
self.value_preds = np.zeros((self.episode_length + 1, self.n_rollout_threads, 1), dtype=np.float32)
self.returns = np.zeros((self.episode_length + 1, self.n_rollout_threads, 1), dtype=np.float32)
if act_space.__class__.__name__ == 'Discrete':
self.available_actions = np.ones((self.episode_length + 1, self.n_rollout_threads, act_space.n), dtype=np.float32)
else:
self.available_actions = None
act_shape = get_shape_from_act_space(act_space)
self.actions = np.zeros((self.episode_length, self.n_rollout_threads, act_shape), dtype=np.float32)
self.action_log_probs = np.zeros((self.episode_length, self.n_rollout_threads, act_shape), dtype=np.float32)
self.rewards = np.zeros((self.episode_length, self.n_rollout_threads, 1), dtype=np.float32)
self.masks = np.ones((self.episode_length + 1, self.n_rollout_threads, 1), dtype=np.float32)
self.bad_masks = np.ones_like(self.masks)
self.active_masks = np.ones_like(self.masks)
self.step = 0
def insert(self, share_obs, obs, rnn_states, rnn_states_critic, actions, action_log_probs,
value_preds, rewards, masks, bad_masks=None, active_masks=None, available_actions=None):
self.share_obs[self.step + 1] = share_obs.copy()
self.obs[self.step + 1] = obs.copy()
self.rnn_states[self.step + 1] = rnn_states.copy()
self.rnn_states_critic[self.step + 1] = rnn_states_critic.copy()
self.actions[self.step] = actions.copy()
self.action_log_probs[self.step] = action_log_probs.copy()
self.value_preds[self.step] = value_preds.copy()
self.rewards[self.step] = rewards.copy()
self.masks[self.step + 1] = masks.copy()
if bad_masks is not None:
self.bad_masks[self.step + 1] = bad_masks.copy()
if active_masks is not None:
self.active_masks[self.step + 1] = active_masks.copy()
if available_actions is not None:
self.available_actions[self.step + 1] = available_actions.copy()
self.step = (self.step + 1) % self.episode_length
def chooseinsert(self, share_obs, obs, rnn_states, rnn_states_critic, actions, action_log_probs,
value_preds, rewards, masks, bad_masks=None, active_masks=None, available_actions=None):
self.share_obs[self.step] = share_obs.copy()
self.obs[self.step] = obs.copy()
self.rnn_states[self.step + 1] = rnn_states.copy()
self.rnn_states_critic[self.step + 1] = rnn_states_critic.copy()
self.actions[self.step] = actions.copy()
self.action_log_probs[self.step] = action_log_probs.copy()
self.value_preds[self.step] = value_preds.copy()
self.rewards[self.step] = rewards.copy()
self.masks[self.step + 1] = masks.copy()
if bad_masks is not None:
self.bad_masks[self.step + 1] = bad_masks.copy()
if active_masks is not None:
self.active_masks[self.step] = active_masks.copy()
if available_actions is not None:
self.available_actions[self.step] = available_actions.copy()
self.step = (self.step + 1) % self.episode_length
def after_update(self):
self.share_obs[0] = self.share_obs[-1].copy()
self.obs[0] = self.obs[-1].copy()
self.rnn_states[0] = self.rnn_states[-1].copy()
self.rnn_states_critic[0] = self.rnn_states_critic[-1].copy()
self.masks[0] = self.masks[-1].copy()
self.bad_masks[0] = self.bad_masks[-1].copy()
self.active_masks[0] = self.active_masks[-1].copy()
if self.available_actions is not None:
self.available_actions[0] = self.available_actions[-1].copy()
def chooseafter_update(self):
self.rnn_states[0] = self.rnn_states[-1].copy()
self.rnn_states_critic[0] = self.rnn_states_critic[-1].copy()
self.masks[0] = self.masks[-1].copy()
self.bad_masks[0] = self.bad_masks[-1].copy()
def compute_returns(self, next_value, value_normalizer=None):
if self._use_proper_time_limits:
if self._use_gae:
self.value_preds[-1] = next_value
gae = 0
for step in reversed(range(self.rewards.shape[0])):
if self._use_popart or self._use_valuenorm:
delta = self.rewards[step] + self.gamma * value_normalizer.denormalize(self.value_preds[
step + 1]) * self.masks[step + 1] - value_normalizer.denormalize(self.value_preds[step])
gae = delta + self.gamma * self.gae_lambda * self.masks[step + 1] * gae
gae = gae * self.bad_masks[step + 1]
self.returns[step] = gae + value_normalizer.denormalize(self.value_preds[step])
else:
delta = self.rewards[step] + self.gamma * self.value_preds[step + 1] * self.masks[step + 1] - self.value_preds[step]
gae = delta + self.gamma * self.gae_lambda * self.masks[step + 1] * gae
gae = gae * self.bad_masks[step + 1]
self.returns[step] = gae + self.value_preds[step]
else:
self.returns[-1] = next_value
for step in reversed(range(self.rewards.shape[0])):
if self._use_popart:
self.returns[step] = (self.returns[step + 1] * self.gamma * self.masks[step + 1] + self.rewards[step]) * self.bad_masks[step + 1] \
+ (1 - self.bad_masks[step + 1]) * value_normalizer.denormalize(self.value_preds[step])
else:
self.returns[step] = (self.returns[step + 1] * self.gamma * self.masks[step + 1] + self.rewards[step]) * self.bad_masks[step + 1] \
+ (1 - self.bad_masks[step + 1]) * self.value_preds[step]
else:
if self._use_gae:
self.value_preds[-1] = next_value
gae = 0
for step in reversed(range(self.rewards.shape[0])):
if self._use_popart or self._use_valuenorm:
delta = self.rewards[step] + self.gamma * value_normalizer.denormalize(self.value_preds[step + 1]) * self.masks[step + 1] - value_normalizer.denormalize(self.value_preds[step])
gae = delta + self.gamma * self.gae_lambda * self.masks[step + 1] * gae
self.returns[step] = gae + value_normalizer.denormalize(self.value_preds[step])
else:
delta = self.rewards[step] + self.gamma * self.value_preds[step + 1] * self.masks[step + 1] - self.value_preds[step]
gae = delta + self.gamma * self.gae_lambda * self.masks[step + 1] * gae
self.returns[step] = gae + self.value_preds[step]
else:
self.returns[-1] = next_value
for step in reversed(range(self.rewards.shape[0])):
self.returns[step] = self.returns[step + 1] * self.gamma * self.masks[step + 1] + self.rewards[step]
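# Descriptive note (added comment): with use_gae enabled the loop above implements
#   delta_t  = r_t + gamma * V(s_{t+1}) * mask_{t+1} - V(s_t)
#   A_t      = delta_t + gamma * gae_lambda * mask_{t+1} * A_{t+1}
#   return_t = A_t + V(s_t)
# where V(.) is denormalized first when PopArt/ValueNorm is in use, and
# bad_masks zero the accumulated advantage at true time-limit boundaries.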
def feed_forward_generator(self, advantages, num_mini_batch=None, mini_batch_size=None):
episode_length, n_rollout_threads = self.rewards.shape[0:2]
batch_size = n_rollout_threads * episode_length
if mini_batch_size is None:
assert batch_size >= num_mini_batch, (
"PPO requires the number of processes ({}) "
"* number of steps ({}) = {} "
"to be greater than or equal to the number of PPO mini batches ({})."
"".format(n_rollout_threads, episode_length, n_rollout_threads * episode_length,
num_mini_batch))
mini_batch_size = batch_size // num_mini_batch
rand = torch.randperm(batch_size).numpy()
sampler = [rand[i*mini_batch_size:(i+1)*mini_batch_size] for i in range(num_mini_batch)]
share_obs = self.share_obs[:-1].reshape(-1, *self.share_obs.shape[2:])
obs = self.obs[:-1].reshape(-1, *self.obs.shape[2:])
rnn_states = self.rnn_states[:-1].reshape(-1, *self.rnn_states.shape[2:])
rnn_states_critic = self.rnn_states_critic[:-1].reshape(-1, *self.rnn_states_critic.shape[2:])
actions = self.actions.reshape(-1, self.actions.shape[-1])
if self.available_actions is not None:
available_actions = self.available_actions[:-1].reshape(-1, self.available_actions.shape[-1])
value_preds = self.value_preds[:-1].reshape(-1, 1)
returns = self.returns[:-1].reshape(-1, 1)
masks = self.masks[:-1].reshape(-1, 1)
active_masks = self.active_masks[:-1].reshape(-1, 1)
action_log_probs = self.action_log_probs.reshape(-1, self.action_log_probs.shape[-1])
advantages = advantages.reshape(-1, 1)
for indices in sampler:
# obs size [T+1 N Dim]-->[T N Dim]-->[T*N,Dim]-->[index,Dim]
share_obs_batch = share_obs[indices]
obs_batch = obs[indices]
rnn_states_batch = rnn_states[indices]
rnn_states_critic_batch = rnn_states_critic[indices]
actions_batch = actions[indices]
if self.available_actions is not None:
available_actions_batch = available_actions[indices]
else:
available_actions_batch = None
value_preds_batch = value_preds[indices]
return_batch = returns[indices]
masks_batch = masks[indices]
active_masks_batch = active_masks[indices]
old_action_log_probs_batch = action_log_probs[indices]
if advantages is None:
adv_targ = None
else:
adv_targ = advantages[indices]
yield share_obs_batch, obs_batch, rnn_states_batch, rnn_states_critic_batch, actions_batch, value_preds_batch, return_batch, masks_batch, active_masks_batch, old_action_log_probs_batch, adv_targ, available_actions_batch
def naive_recurrent_generator(self, advantages, num_mini_batch):
n_rollout_threads = self.rewards.shape[1]
assert n_rollout_threads >= num_mini_batch, (
"PPO requires the number of processes ({}) "
"to be greater than or equal to the number of "
"PPO mini batches ({}).".format(n_rollout_threads, num_mini_batch))
num_envs_per_batch = n_rollout_threads // num_mini_batch
perm = torch.randperm(n_rollout_threads).numpy()
for start_ind in range(0, n_rollout_threads, num_envs_per_batch):
share_obs_batch = []
obs_batch = []
rnn_states_batch = []
rnn_states_critic_batch = []
actions_batch = []
available_actions_batch = []
value_preds_batch = []
return_batch = []
masks_batch = []
active_masks_batch = []
old_action_log_probs_batch = []
adv_targ = []
for offset in range(num_envs_per_batch):
ind = perm[start_ind + offset]
share_obs_batch.append(self.share_obs[:-1, ind])
obs_batch.append(self.obs[:-1, ind])
rnn_states_batch.append(self.rnn_states[0:1, ind])
rnn_states_critic_batch.append(self.rnn_states_critic[0:1, ind])
actions_batch.append(self.actions[:, ind])
if self.available_actions is not None:
available_actions_batch.append(self.available_actions[:-1, ind])
value_preds_batch.append(self.value_preds[:-1, ind])
return_batch.append(self.returns[:-1, ind])
masks_batch.append(self.masks[:-1, ind])
active_masks_batch.append(self.active_masks[:-1, ind])
old_action_log_probs_batch.append(self.action_log_probs[:, ind])
adv_targ.append(advantages[:, ind])
# [N[T, dim]]
T, N = self.episode_length, num_envs_per_batch
# These are all from_numpys of size (T, N, -1)
share_obs_batch = np.stack(share_obs_batch, 1)
obs_batch = np.stack(obs_batch, 1)
actions_batch = np.stack(actions_batch, 1)
if self.available_actions is not None:
available_actions_batch = np.stack(available_actions_batch, 1)
value_preds_batch = np.stack(value_preds_batch, 1)
return_batch = np.stack(return_batch, 1)
masks_batch = np.stack(masks_batch, 1)
active_masks_batch = np.stack(active_masks_batch, 1)
old_action_log_probs_batch = np.stack(old_action_log_probs_batch, 1)
adv_targ = np.stack(adv_targ, 1)
# States is just a (N, -1) from_numpy [N[1,dim]]
rnn_states_batch = np.stack(rnn_states_batch, 1).reshape(N, *self.rnn_states.shape[2:])
rnn_states_critic_batch = np.stack(rnn_states_critic_batch, 1).reshape(N, *self.rnn_states_critic.shape[2:])
# Flatten the (T, N, ...) from_numpys to (T * N, ...)
share_obs_batch = _flatten(T, N, share_obs_batch)
obs_batch = _flatten(T, N, obs_batch)
actions_batch = _flatten(T, N, actions_batch)
if self.available_actions is not None:
available_actions_batch = _flatten(T, N, available_actions_batch)
else:
available_actions_batch = None
value_preds_batch = _flatten(T, N, value_preds_batch)
return_batch = _flatten(T, N, return_batch)
masks_batch = _flatten(T, N, masks_batch)
active_masks_batch = _flatten(T, N, active_masks_batch)
old_action_log_probs_batch = _flatten(T, N, old_action_log_probs_batch)
adv_targ = _flatten(T, N, adv_targ)
yield share_obs_batch, obs_batch, rnn_states_batch, rnn_states_critic_batch, actions_batch, value_preds_batch, return_batch, masks_batch, active_masks_batch, old_action_log_probs_batch, adv_targ, available_actions_batch
def recurrent_generator(self, advantages, num_mini_batch, data_chunk_length):
episode_length, n_rollout_threads = self.rewards.shape[0:2]
batch_size = n_rollout_threads * episode_length
data_chunks = batch_size // data_chunk_length # [C=r*T/L]
mini_batch_size = data_chunks // num_mini_batch
assert episode_length * n_rollout_threads >= data_chunk_length, (
"PPO requires the number of processes ({}) * episode length ({}) "
"to be greater than or equal to the number of "
"data chunk length ({}).".format(n_rollout_threads, episode_length, data_chunk_length))
assert data_chunks >= 2, ("need larger batch size")
rand = torch.randperm(data_chunks).numpy()
sampler = [rand[i*mini_batch_size:(i+1)*mini_batch_size] for i in range(num_mini_batch)]
if len(self.share_obs.shape) > 3:
share_obs = self.share_obs[:-1].transpose(1, 0, 2, 3, 4).reshape(-1, *self.share_obs.shape[2:])
obs = self.obs[:-1].transpose(1, 0, 2, 3, 4).reshape(-1, *self.obs.shape[2:])
else:
share_obs = _cast(self.share_obs[:-1])
obs = _cast(self.obs[:-1])
actions = _cast(self.actions)
action_log_probs = _cast(self.action_log_probs)
advantages = _cast(advantages)
value_preds = _cast(self.value_preds[:-1])
returns = _cast(self.returns[:-1])
masks = _cast(self.masks[:-1])
active_masks = _cast(self.active_masks[:-1])
# rnn_states = _cast(self.rnn_states[:-1])
# rnn_states_critic = _cast(self.rnn_states_critic[:-1])
rnn_states = self.rnn_states[:-1].transpose(1, 0, 2, 3).reshape(-1, *self.rnn_states.shape[2:])
rnn_states_critic = self.rnn_states_critic[:-1].transpose(1, 0, 2, 3).reshape(-1, *self.rnn_states_critic.shape[2:])
if self.available_actions is not None:
available_actions = _cast(self.available_actions[:-1])
for indices in sampler:
share_obs_batch = []
obs_batch = []
rnn_states_batch = []
rnn_states_critic_batch = []
actions_batch = []
available_actions_batch = []
value_preds_batch = []
return_batch = []
masks_batch = []
active_masks_batch = []
old_action_log_probs_batch = []
adv_targ = []
for index in indices:
ind = index * data_chunk_length
# size [T+1 N Dim]-->[T N Dim]-->[N T Dim]-->[N*T,Dim]-->[L,Dim]
share_obs_batch.append(share_obs[ind:ind+data_chunk_length])
obs_batch.append(obs[ind:ind+data_chunk_length])
actions_batch.append(actions[ind:ind+data_chunk_length])
if self.available_actions is not None:
available_actions_batch.append(available_actions[ind:ind+data_chunk_length])
value_preds_batch.append(value_preds[ind:ind+data_chunk_length])
return_batch.append(returns[ind:ind+data_chunk_length])
masks_batch.append(masks[ind:ind+data_chunk_length])
active_masks_batch.append(active_masks[ind:ind+data_chunk_length])
old_action_log_probs_batch.append(action_log_probs[ind:ind+data_chunk_length])
adv_targ.append(advantages[ind:ind+data_chunk_length])
# size [T+1 N Dim]-->[T N Dim]-->[T*N,Dim]-->[1,Dim]
rnn_states_batch.append(rnn_states[ind])
rnn_states_critic_batch.append(rnn_states_critic[ind])
L, N = data_chunk_length, mini_batch_size
# These are all numpy arrays of size (N, L, Dim)
share_obs_batch = np.stack(share_obs_batch)
obs_batch = np.stack(obs_batch)
actions_batch = np.stack(actions_batch)
if self.available_actions is not None:
available_actions_batch = np.stack(available_actions_batch)
value_preds_batch = np.stack(value_preds_batch)
return_batch = np.stack(return_batch)
masks_batch = np.stack(masks_batch)
active_masks_batch = np.stack(active_masks_batch)
old_action_log_probs_batch = np.stack(old_action_log_probs_batch)
adv_targ = np.stack(adv_targ)
# RNN states are just (N, -1) numpy arrays
rnn_states_batch = np.stack(rnn_states_batch).reshape(N, *self.rnn_states.shape[2:])
rnn_states_critic_batch = np.stack(rnn_states_critic_batch).reshape(N, *self.rnn_states_critic.shape[2:])
# Flatten the (N, L, ...) arrays to (N * L, ...)
share_obs_batch = _flatten(L, N, share_obs_batch)
obs_batch = _flatten(L, N, obs_batch)
actions_batch = _flatten(L, N, actions_batch)
if self.available_actions is not None:
available_actions_batch = _flatten(L, N, available_actions_batch)
else:
available_actions_batch = None
value_preds_batch = _flatten(L, N, value_preds_batch)
return_batch = _flatten(L, N, return_batch)
masks_batch = _flatten(L, N, masks_batch)
active_masks_batch = _flatten(L, N, active_masks_batch)
old_action_log_probs_batch = _flatten(L, N, old_action_log_probs_batch)
adv_targ = _flatten(L, N, adv_targ)
yield share_obs_batch, obs_batch, rnn_states_batch, rnn_states_critic_batch, actions_batch, value_preds_batch, return_batch, masks_batch, active_masks_batch, old_action_log_probs_batch, adv_targ, available_actions_batch
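# Usage sketch (illustrative only): one way a PPO trainer might consume recurrent_generator
# above. `buffer` is an instance of the buffer class in this file; `trainer.ppo_update` is a
# hypothetical update routine; the normalization and hyperparameter values are assumptions.
def _example_recurrent_update(buffer, trainer, num_mini_batch=1, data_chunk_length=10):
    # advantages over the T collected steps (no value normalizer assumed here)
    advantages = buffer.returns[:-1] - buffer.value_preds[:-1]
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)
    for sample in buffer.recurrent_generator(advantages, num_mini_batch, data_chunk_length):
        # every array in the sample is flattened to (data_chunk_length * mini_batch_size, ...)
        trainer.ppo_update(sample)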
import torch
import numpy as np
from mappo.utils.util import get_shape_from_obs_space, get_shape_from_act_space
def _flatten(T, N, x):
return x.reshape(T * N, *x.shape[2:])
def _cast(x):
return x.transpose(1, 2, 0, 3).reshape(-1, *x.shape[3:])
class SharedReplayBuffer(object):
"""
Buffer to store training data.
:param args: (argparse.Namespace) arguments containing relevant model, policy, and env information.
:param num_agents: (int) number of agents in the env.
:param obs_space: (gym.Space) observation space of agents.
:param cent_obs_space: (gym.Space) centralized observation space of agents.
:param act_space: (gym.Space) action space for agents.
"""
def __init__(self, args, num_agents, obs_space, cent_obs_space, act_space):
self.episode_length = args.episode_length
self.n_rollout_threads = args.n_rollout_threads
self.hidden_size = args.hidden_size
self.recurrent_N = args.recurrent_N
self.gamma = args.gamma
self.gae_lambda = args.gae_lambda
self._use_gae = args.use_gae
self._use_popart = args.use_popart
self._use_valuenorm = args.use_valuenorm
self._use_proper_time_limits = args.use_proper_time_limits
obs_shape = get_shape_from_obs_space(obs_space)
share_obs_shape = get_shape_from_obs_space(cent_obs_space)
if type(obs_shape[-1]) == list:
obs_shape = obs_shape[:1]
if type(share_obs_shape[-1]) == list:
share_obs_shape = share_obs_shape[:1]
self.share_obs = np.zeros((self.episode_length + 1, self.n_rollout_threads, num_agents, *share_obs_shape),
dtype=np.float32)
self.obs = np.zeros((self.episode_length + 1, self.n_rollout_threads, num_agents, *obs_shape), dtype=np.float32)
self.rnn_states = np.zeros(
(self.episode_length + 1, self.n_rollout_threads, num_agents, self.recurrent_N, self.hidden_size),
dtype=np.float32)
self.rnn_states_critic = np.zeros_like(self.rnn_states)
self.value_preds = np.zeros(
(self.episode_length + 1, self.n_rollout_threads, num_agents, 1), dtype=np.float32)
self.returns = np.zeros_like(self.value_preds)
if act_space.__class__.__name__ == 'Discrete':
self.available_actions = np.ones((self.episode_length + 1, self.n_rollout_threads, num_agents, act_space.n),
dtype=np.float32)
else:
self.available_actions = None
act_shape = get_shape_from_act_space(act_space)
self.actions = np.zeros(
(self.episode_length, self.n_rollout_threads, num_agents, act_shape), dtype=np.float32)
self.action_log_probs = np.zeros(
(self.episode_length, self.n_rollout_threads, num_agents, act_shape), dtype=np.float32)
self.rewards = np.zeros(
(self.episode_length, self.n_rollout_threads, num_agents, 1), dtype=np.float32)
self.masks = np.ones((self.episode_length + 1, self.n_rollout_threads, num_agents, 1), dtype=np.float32)
self.bad_masks = np.ones_like(self.masks)
self.active_masks = np.ones_like(self.masks)
self.step = 0
def insert(self, share_obs, obs, rnn_states_actor, rnn_states_critic, actions, action_log_probs,
value_preds, rewards, masks, bad_masks=None, active_masks=None, available_actions=None):
"""
Insert data into the buffer.
:param share_obs: (np.ndarray) centralized observation input for the critic.
:param obs: (np.ndarray) local agent observations.
:param rnn_states_actor: (np.ndarray) RNN states for actor network.
:param rnn_states_critic: (np.ndarray) RNN states for critic network.
:param actions: (np.ndarray) actions taken by agents.
:param action_log_probs: (np.ndarray) log probs of actions taken by agents.
:param value_preds: (np.ndarray) value function prediction at each step.
:param rewards: (np.ndarray) reward collected at each step.
:param masks: (np.ndarray) denotes whether the environment has terminated or not.
:param bad_masks: (np.ndarray) denotes whether an episode ended at a true terminal state or because the time limit was reached.
:param active_masks: (np.ndarray) denotes whether an agent is active or dead in the env.
:param available_actions: (np.ndarray) actions available to each agent. If None, all actions are available.
"""
self.share_obs[self.step + 1] = share_obs.copy()
self.obs[self.step + 1] = obs.copy()
self.rnn_states[self.step + 1] = rnn_states_actor.copy()
self.rnn_states_critic[self.step + 1] = rnn_states_critic.copy()
self.actions[self.step] = actions.copy()
self.action_log_probs[self.step] = action_log_probs.copy()
self.value_preds[self.step] = value_preds.copy()
self.rewards[self.step] = rewards.copy()
self.masks[self.step + 1] = masks.copy()
if bad_masks is not None:
self.bad_masks[self.step + 1] = bad_masks.copy()
if active_masks is not None:
self.active_masks[self.step + 1] = active_masks.copy()
if available_actions is not None:
self.available_actions[self.step + 1] = available_actions.copy()
self.step = (self.step + 1) % self.episode_length
def chooseinsert(self, share_obs, obs, rnn_states, rnn_states_critic, actions, action_log_probs,
value_preds, rewards, masks, bad_masks=None, active_masks=None, available_actions=None):
"""
Insert data into the buffer. This insert function is used specifically for Hanabi, which is turn based.
:param share_obs: (np.ndarray) centralized observation input for the critic.
:param obs: (np.ndarray) local agent observations.
:param rnn_states: (np.ndarray) RNN states for actor network.
:param rnn_states_critic: (np.ndarray) RNN states for critic network.
:param actions: (np.ndarray) actions taken by agents.
:param action_log_probs: (np.ndarray) log probs of actions taken by agents.
:param value_preds: (np.ndarray) value function prediction at each step.
:param rewards: (np.ndarray) reward collected at each step.
:param masks: (np.ndarray) denotes whether the environment has terminated or not.
:param bad_masks: (np.ndarray) denotes whether an episode ended at a true terminal state or because the time limit was reached.
:param active_masks: (np.ndarray) denotes whether an agent is active or dead in the env.
:param available_actions: (np.ndarray) actions available to each agent. If None, all actions are available.
"""
self.share_obs[self.step] = share_obs.copy()
self.obs[self.step] = obs.copy()
self.rnn_states[self.step + 1] = rnn_states.copy()
self.rnn_states_critic[self.step + 1] = rnn_states_critic.copy()
self.actions[self.step] = actions.copy()
self.action_log_probs[self.step] = action_log_probs.copy()
self.value_preds[self.step] = value_preds.copy()
self.rewards[self.step] = rewards.copy()
self.masks[self.step + 1] = masks.copy()
if bad_masks is not None:
self.bad_masks[self.step + 1] = bad_masks.copy()
if active_masks is not None:
self.active_masks[self.step] = active_masks.copy()
if available_actions is not None:
self.available_actions[self.step] = available_actions.copy()
self.step = (self.step + 1) % self.episode_length
def after_update(self):
"""Copy last timestep data to first index. Called after update to model."""
self.share_obs[0] = self.share_obs[-1].copy()
self.obs[0] = self.obs[-1].copy()
self.rnn_states[0] = self.rnn_states[-1].copy()
self.rnn_states_critic[0] = self.rnn_states_critic[-1].copy()
self.masks[0] = self.masks[-1].copy()
self.bad_masks[0] = self.bad_masks[-1].copy()
self.active_masks[0] = self.active_masks[-1].copy()
if self.available_actions is not None:
self.available_actions[0] = self.available_actions[-1].copy()
def chooseafter_update(self):
"""Copy last timestep data to first index. This method is used for Hanabi."""
self.rnn_states[0] = self.rnn_states[-1].copy()
self.rnn_states_critic[0] = self.rnn_states_critic[-1].copy()
self.masks[0] = self.masks[-1].copy()
self.bad_masks[0] = self.bad_masks[-1].copy()
def compute_returns(self, next_value, value_normalizer=None):
"""
Compute returns either as discounted sum of rewards, or using GAE.
:param next_value: (np.ndarray) value predictions for the step after the last episode step.
:param value_normalizer: (PopArt) If not None, PopArt value normalizer instance.
"""
if self._use_proper_time_limits:
if self._use_gae:
self.value_preds[-1] = next_value
gae = 0
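# GAE recursion: delta_t = r_t + gamma * V(s_{t+1}) * mask_{t+1} - V(s_t) and
# A_t = delta_t + gamma * lambda * mask_{t+1} * A_{t+1}; when bad_masks[step + 1] == 0
# (time-limit truncation rather than a true terminal state) the accumulated GAE is
# zeroed so the return target falls back to the value prediction at that step.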
for step in reversed(range(self.rewards.shape[0])):
if self._use_popart or self._use_valuenorm:
# step + 1
delta = self.rewards[step] + self.gamma * value_normalizer.denormalize(
self.value_preds[step + 1]) * self.masks[step + 1] \
- value_normalizer.denormalize(self.value_preds[step])
gae = delta + self.gamma * self.gae_lambda * gae * self.masks[step + 1]
gae = gae * self.bad_masks[step + 1]
self.returns[step] = gae + value_normalizer.denormalize(self.value_preds[step])
else:
delta = self.rewards[step] + self.gamma * self.value_preds[step + 1] * self.masks[step + 1] - \
self.value_preds[step]
gae = delta + self.gamma * self.gae_lambda * self.masks[step + 1] * gae
gae = gae * self.bad_masks[step + 1]
self.returns[step] = gae + self.value_preds[step]
else:
self.returns[-1] = next_value
for step in reversed(range(self.rewards.shape[0])):
if self._use_popart or self._use_valuenorm:
self.returns[step] = (self.returns[step + 1] * self.gamma * self.masks[step + 1] + self.rewards[
step]) * self.bad_masks[step + 1] \
+ (1 - self.bad_masks[step + 1]) * value_normalizer.denormalize(
self.value_preds[step])
else:
self.returns[step] = (self.returns[step + 1] * self.gamma * self.masks[step + 1] + self.rewards[
step]) * self.bad_masks[step + 1] \
+ (1 - self.bad_masks[step + 1]) * self.value_preds[step]
else:
if self._use_gae:
self.value_preds[-1] = next_value
gae = 0
for step in reversed(range(self.rewards.shape[0])):
if self._use_popart or self._use_valuenorm:
delta = self.rewards[step] + self.gamma * value_normalizer.denormalize(
self.value_preds[step + 1]) * self.masks[step + 1] \
- value_normalizer.denormalize(self.value_preds[step])
gae = delta + self.gamma * self.gae_lambda * self.masks[step + 1] * gae
self.returns[step] = gae + value_normalizer.denormalize(self.value_preds[step])
else:
delta = self.rewards[step] + self.gamma * self.value_preds[step + 1] * self.masks[step + 1] - \
self.value_preds[step]
gae = delta + self.gamma * self.gae_lambda * self.masks[step + 1] * gae
self.returns[step] = gae + self.value_preds[step]
else:
self.returns[-1] = next_value
for step in reversed(range(self.rewards.shape[0])):
self.returns[step] = self.returns[step + 1] * self.gamma * self.masks[step + 1] + self.rewards[step]
def feed_forward_generator(self, advantages, num_mini_batch=None, mini_batch_size=None):
"""
Yield training data for MLP policies.
:param advantages: (np.ndarray) advantage estimates.
:param num_mini_batch: (int) number of minibatches to split the batch into.
:param mini_batch_size: (int) number of samples in each minibatch.
"""
episode_length, n_rollout_threads, num_agents = self.rewards.shape[0:3]
batch_size = n_rollout_threads * episode_length * num_agents
if mini_batch_size is None:
assert batch_size >= num_mini_batch, (
"PPO requires the number of processes ({}) "
"* number of steps ({}) * number of agents ({}) = {} "
"to be greater than or equal to the number of PPO mini batches ({})."
"".format(n_rollout_threads, episode_length, num_agents,
n_rollout_threads * episode_length * num_agents,
num_mini_batch))
mini_batch_size = batch_size // num_mini_batch
rand = torch.randperm(batch_size).numpy()
sampler = [rand[i * mini_batch_size:(i + 1) * mini_batch_size] for i in range(num_mini_batch)]
share_obs = self.share_obs[:-1].reshape(-1, *self.share_obs.shape[3:])
obs = self.obs[:-1].reshape(-1, *self.obs.shape[3:])
rnn_states = self.rnn_states[:-1].reshape(-1, *self.rnn_states.shape[3:])
rnn_states_critic = self.rnn_states_critic[:-1].reshape(-1, *self.rnn_states_critic.shape[3:])
actions = self.actions.reshape(-1, self.actions.shape[-1])
if self.available_actions is not None:
available_actions = self.available_actions[:-1].reshape(-1, self.available_actions.shape[-1])
value_preds = self.value_preds[:-1].reshape(-1, 1)
returns = self.returns[:-1].reshape(-1, 1)
masks = self.masks[:-1].reshape(-1, 1)
active_masks = self.active_masks[:-1].reshape(-1, 1)
action_log_probs = self.action_log_probs.reshape(-1, self.action_log_probs.shape[-1])
advantages = advantages.reshape(-1, 1)
for indices in sampler:
# obs size [T+1 N M Dim]-->[T N M Dim]-->[T*N*M,Dim]-->[index,Dim]
share_obs_batch = share_obs[indices]
obs_batch = obs[indices]
rnn_states_batch = rnn_states[indices]
rnn_states_critic_batch = rnn_states_critic[indices]
actions_batch = actions[indices]
if self.available_actions is not None:
available_actions_batch = available_actions[indices]
else:
available_actions_batch = None
value_preds_batch = value_preds[indices]
return_batch = returns[indices]
masks_batch = masks[indices]
active_masks_batch = active_masks[indices]
old_action_log_probs_batch = action_log_probs[indices]
if advantages is None:
adv_targ = None
else:
adv_targ = advantages[indices]
yield share_obs_batch, obs_batch, rnn_states_batch, rnn_states_critic_batch, actions_batch,\
value_preds_batch, return_batch, masks_batch, active_masks_batch, old_action_log_probs_batch,\
adv_targ, available_actions_batch
def naive_recurrent_generator(self, advantages, num_mini_batch):
"""
Yield training data for non-chunked RNN training.
:param advantages: (np.ndarray) advantage estimates.
:param num_mini_batch: (int) number of minibatches to split the batch into.
"""
episode_length, n_rollout_threads, num_agents = self.rewards.shape[0:3]
batch_size = n_rollout_threads * num_agents
assert n_rollout_threads * num_agents >= num_mini_batch, (
"PPO requires the number of processes ({})* number of agents ({}) "
"to be greater than or equal to the number of "
"PPO mini batches ({}).".format(n_rollout_threads, num_agents, num_mini_batch))
num_envs_per_batch = batch_size // num_mini_batch
perm = torch.randperm(batch_size).numpy()
share_obs = self.share_obs.reshape(-1, batch_size, *self.share_obs.shape[3:])
obs = self.obs.reshape(-1, batch_size, *self.obs.shape[3:])
rnn_states = self.rnn_states.reshape(-1, batch_size, *self.rnn_states.shape[3:])
rnn_states_critic = self.rnn_states_critic.reshape(-1, batch_size, *self.rnn_states_critic.shape[3:])
actions = self.actions.reshape(-1, batch_size, self.actions.shape[-1])
if self.available_actions is not None:
available_actions = self.available_actions.reshape(-1, batch_size, self.available_actions.shape[-1])
value_preds = self.value_preds.reshape(-1, batch_size, 1)
returns = self.returns.reshape(-1, batch_size, 1)
masks = self.masks.reshape(-1, batch_size, 1)
active_masks = self.active_masks.reshape(-1, batch_size, 1)
action_log_probs = self.action_log_probs.reshape(-1, batch_size, self.action_log_probs.shape[-1])
advantages = advantages.reshape(-1, batch_size, 1)
for start_ind in range(0, batch_size, num_envs_per_batch):
share_obs_batch = []
obs_batch = []
rnn_states_batch = []
rnn_states_critic_batch = []
actions_batch = []
available_actions_batch = []
value_preds_batch = []
return_batch = []
masks_batch = []
active_masks_batch = []
old_action_log_probs_batch = []
adv_targ = []
for offset in range(num_envs_per_batch):
ind = perm[start_ind + offset]
share_obs_batch.append(share_obs[:-1, ind])
obs_batch.append(obs[:-1, ind])
rnn_states_batch.append(rnn_states[0:1, ind])
rnn_states_critic_batch.append(rnn_states_critic[0:1, ind])
actions_batch.append(actions[:, ind])
if self.available_actions is not None:
available_actions_batch.append(available_actions[:-1, ind])
value_preds_batch.append(value_preds[:-1, ind])
return_batch.append(returns[:-1, ind])
masks_batch.append(masks[:-1, ind])
active_masks_batch.append(active_masks[:-1, ind])
old_action_log_probs_batch.append(action_log_probs[:, ind])
adv_targ.append(advantages[:, ind])
# [N[T, dim]]
T, N = self.episode_length, num_envs_per_batch
# These are all numpy arrays of size (T, N, -1)
share_obs_batch = np.stack(share_obs_batch, 1)
obs_batch = np.stack(obs_batch, 1)
actions_batch = np.stack(actions_batch, 1)
if self.available_actions is not None:
available_actions_batch = np.stack(available_actions_batch, 1)
value_preds_batch = np.stack(value_preds_batch, 1)
return_batch = np.stack(return_batch, 1)
masks_batch = np.stack(masks_batch, 1)
active_masks_batch = np.stack(active_masks_batch, 1)
old_action_log_probs_batch = np.stack(old_action_log_probs_batch, 1)
adv_targ = np.stack(adv_targ, 1)
# RNN states are just (N, dim) numpy arrays [N[1,dim]]
rnn_states_batch = np.stack(rnn_states_batch).reshape(N, *self.rnn_states.shape[3:])
rnn_states_critic_batch = np.stack(rnn_states_critic_batch).reshape(N, *self.rnn_states_critic.shape[3:])
# Flatten the (T, N, ...) arrays to (T * N, ...)
share_obs_batch = _flatten(T, N, share_obs_batch)
obs_batch = _flatten(T, N, obs_batch)
actions_batch = _flatten(T, N, actions_batch)
if self.available_actions is not None:
available_actions_batch = _flatten(T, N, available_actions_batch)
else:
available_actions_batch = None
value_preds_batch = _flatten(T, N, value_preds_batch)
return_batch = _flatten(T, N, return_batch)
masks_batch = _flatten(T, N, masks_batch)
active_masks_batch = _flatten(T, N, active_masks_batch)
old_action_log_probs_batch = _flatten(T, N, old_action_log_probs_batch)
adv_targ = _flatten(T, N, adv_targ)
yield share_obs_batch, obs_batch, rnn_states_batch, rnn_states_critic_batch, actions_batch,\
value_preds_batch, return_batch, masks_batch, active_masks_batch, old_action_log_probs_batch,\
adv_targ, available_actions_batch
def recurrent_generator(self, advantages, num_mini_batch, data_chunk_length):
"""
Yield training data for chunked RNN training.
:param advantages: (np.ndarray) advantage estimates.
:param num_mini_batch: (int) number of minibatches to split the batch into.
:param data_chunk_length: (int) length of sequence chunks with which to train RNN.
"""
episode_length, n_rollout_threads, num_agents = self.rewards.shape[0:3]
batch_size = n_rollout_threads * episode_length * num_agents
data_chunks = batch_size // data_chunk_length # [C=r*T*M/L]
mini_batch_size = data_chunks // num_mini_batch
rand = torch.randperm(data_chunks).numpy()
sampler = [rand[i * mini_batch_size:(i + 1) * mini_batch_size] for i in range(num_mini_batch)]
if len(self.share_obs.shape) > 4:
share_obs = self.share_obs[:-1].transpose(1, 2, 0, 3, 4, 5).reshape(-1, *self.share_obs.shape[3:])
obs = self.obs[:-1].transpose(1, 2, 0, 3, 4, 5).reshape(-1, *self.obs.shape[3:])
else:
share_obs = _cast(self.share_obs[:-1])
obs = _cast(self.obs[:-1])
actions = _cast(self.actions)
action_log_probs = _cast(self.action_log_probs)
advantages = _cast(advantages)
value_preds = _cast(self.value_preds[:-1])
returns = _cast(self.returns[:-1])
masks = _cast(self.masks[:-1])
active_masks = _cast(self.active_masks[:-1])
# rnn_states = _cast(self.rnn_states[:-1])
# rnn_states_critic = _cast(self.rnn_states_critic[:-1])
rnn_states = self.rnn_states[:-1].transpose(1, 2, 0, 3, 4).reshape(-1, *self.rnn_states.shape[3:])
rnn_states_critic = self.rnn_states_critic[:-1].transpose(1, 2, 0, 3, 4).reshape(-1, *self.rnn_states_critic.shape[3:])
if self.available_actions is not None:
available_actions = _cast(self.available_actions[:-1])
for indices in sampler:
share_obs_batch = []
obs_batch = []
rnn_states_batch = []
rnn_states_critic_batch = []
actions_batch = []
available_actions_batch = []
value_preds_batch = []
return_batch = []
masks_batch = []
active_masks_batch = []
old_action_log_probs_batch = []
adv_targ = []
for index in indices:
ind = index * data_chunk_length
# size [T+1 N M Dim]-->[T N M Dim]-->[N,M,T,Dim]-->[N*M*T,Dim]-->[L,Dim]
share_obs_batch.append(share_obs[ind:ind + data_chunk_length])
obs_batch.append(obs[ind:ind + data_chunk_length])
actions_batch.append(actions[ind:ind + data_chunk_length])
if self.available_actions is not None:
available_actions_batch.append(available_actions[ind:ind + data_chunk_length])
value_preds_batch.append(value_preds[ind:ind + data_chunk_length])
return_batch.append(returns[ind:ind + data_chunk_length])
masks_batch.append(masks[ind:ind + data_chunk_length])
active_masks_batch.append(active_masks[ind:ind + data_chunk_length])
old_action_log_probs_batch.append(action_log_probs[ind:ind + data_chunk_length])
adv_targ.append(advantages[ind:ind + data_chunk_length])
# size [T+1 N M Dim]-->[T N M Dim]-->[N M T Dim]-->[N*M*T,Dim]-->[1,Dim]
rnn_states_batch.append(rnn_states[ind])
rnn_states_critic_batch.append(rnn_states_critic[ind])
L, N = data_chunk_length, mini_batch_size
# These are all numpy arrays of size (L, N, Dim)
share_obs_batch = np.stack(share_obs_batch, axis=1)
obs_batch = np.stack(obs_batch, axis=1)
actions_batch = np.stack(actions_batch, axis=1)
if self.available_actions is not None:
available_actions_batch = np.stack(available_actions_batch, axis=1)
value_preds_batch = np.stack(value_preds_batch, axis=1)
return_batch = np.stack(return_batch, axis=1)
masks_batch = np.stack(masks_batch, axis=1)
active_masks_batch = np.stack(active_masks_batch, axis=1)
old_action_log_probs_batch = np.stack(old_action_log_probs_batch, axis=1)
adv_targ = np.stack(adv_targ, axis=1)
# RNN states are just (N, -1) numpy arrays
rnn_states_batch = np.stack(rnn_states_batch).reshape(N, *self.rnn_states.shape[3:])
rnn_states_critic_batch = np.stack(rnn_states_critic_batch).reshape(N, *self.rnn_states_critic.shape[3:])
# Flatten the (L, N, ...) arrays to (L * N, ...)
share_obs_batch = _flatten(L, N, share_obs_batch)
obs_batch = _flatten(L, N, obs_batch)
actions_batch = _flatten(L, N, actions_batch)
if self.available_actions is not None:
available_actions_batch = _flatten(L, N, available_actions_batch)
else:
available_actions_batch = None
value_preds_batch = _flatten(L, N, value_preds_batch)
return_batch = _flatten(L, N, return_batch)
masks_batch = _flatten(L, N, masks_batch)
active_masks_batch = _flatten(L, N, active_masks_batch)
old_action_log_probs_batch = _flatten(L, N, old_action_log_probs_batch)
adv_targ = _flatten(L, N, adv_targ)
yield share_obs_batch, obs_batch, rnn_states_batch, rnn_states_critic_batch, actions_batch,\
value_preds_batch, return_batch, masks_batch, active_masks_batch, old_action_log_probs_batch,\
adv_targ, available_actions_batch
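# Usage sketch (illustrative only): the life cycle of SharedReplayBuffer over one training
# iteration. `next_values` (the critic's value prediction for the state after the last stored
# step, shaped like value_preds[-1]) and `trainer.ppo_update` are assumptions for this example;
# the default no-PopArt / no-ValueNorm path is assumed.
def _example_training_iteration(buffer, trainer, next_values):
    # build return targets (GAE or discounted returns, depending on args.use_gae)
    buffer.compute_returns(next_values)
    # advantages for the T collected steps, normalized across the whole batch
    advantages = buffer.returns[:-1] - buffer.value_preds[:-1]
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)
    # PPO minibatch loop; feed_forward_generator is the MLP-policy path
    for sample in buffer.feed_forward_generator(advantages, num_mini_batch=1):
        trainer.ppo_update(sample)
    # carry the final observations/masks over as step 0 of the next rollout
    buffer.after_update()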
import numpy as np
import math
import torch
def check(input):
"""Convert a numpy array to a torch tensor; return any other input unchanged."""
if type(input) == np.ndarray:
return torch.from_numpy(input)
return input
def get_gard_norm(it):
"""Return the global L2 norm of the gradients of an iterable of parameters."""
sum_grad = 0
for x in it:
if x.grad is None:
continue
sum_grad += x.grad.norm() ** 2
return math.sqrt(sum_grad)
def update_linear_schedule(optimizer, epoch, total_num_epochs, initial_lr):
"""Decreases the learning rate linearly"""
lr = initial_lr - (initial_lr * (epoch / float(total_num_epochs)))
for param_group in optimizer.param_groups:
param_group['lr'] = lr
def huber_loss(e, d):
# quadratic for |e| <= d, linear beyond the threshold d (symmetric in the error)
a = (abs(e) <= d).float()
b = (abs(e) > d).float()
return a*e**2/2 + b*d*(abs(e)-d/2)
def mse_loss(e):
return e**2/2
def get_shape_from_obs_space(obs_space):
if obs_space.__class__.__name__ == 'Box':
obs_shape = obs_space.shape
elif obs_space.__class__.__name__ == 'list':
obs_shape = obs_space
else:
raise NotImplementedError
return obs_shape
def get_shape_from_act_space(act_space):
if act_space.__class__.__name__ == 'Discrete':
act_shape = 1
elif act_space.__class__.__name__ == "MultiDiscrete":
act_shape = act_space.shape
elif act_space.__class__.__name__ == "Box":
act_shape = act_space.shape[0]
elif act_space.__class__.__name__ == "MultiBinary":
act_shape = act_space.shape[0]
else: # agar
act_shape = act_space[0].shape[0] + 1
return act_shape
def tile_images(img_nhwc):
"""
Tile N images into one big PxQ image
(P,Q) are chosen to be as close as possible, and if N
is square, then P=Q.
input: img_nhwc, list or array of images, ndim=4 once turned into array
n = batch index, h = height, w = width, c = channel
returns:
bigim_HWc, ndarray with ndim=3
"""
img_nhwc = np.asarray(img_nhwc)
N, h, w, c = img_nhwc.shape
H = int(np.ceil(np.sqrt(N)))
W = int(np.ceil(float(N)/H))
img_nhwc = np.array(list(img_nhwc) + [img_nhwc[0]*0 for _ in range(N, H*W)])
img_HWhwc = img_nhwc.reshape(H, W, h, w, c)
img_HhWwc = img_HWhwc.transpose(0, 2, 1, 3, 4)
img_Hh_Ww_c = img_HhWwc.reshape(H*h, W*w, c)
return img_Hh_Ww_c
\ No newline at end of file
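# Usage sketch (illustrative only): huber_loss above behaves like mse_loss for small errors
# (|e| <= d) and grows linearly once the error exceeds the threshold d, which is why it is
# commonly preferred for the critic loss. The dummy error values below are assumptions.
if __name__ == "__main__":
    errors = torch.tensor([-3.0, -0.5, 0.0, 0.5, 3.0])
    print(huber_loss(errors, 1.0))  # quadratic inside [-1, 1], linear (capped slope) outside
    print(mse_loss(errors))         # quadratic everywhere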
import numpy as np
import torch
import torch.nn as nn
class ValueNorm(nn.Module):
""" Normalize a vector of observations - across the first norm_axes dimensions"""
def __init__(self, input_shape, norm_axes=1, beta=0.99999, per_element_update=False, epsilon=1e-5, device=torch.device("cpu")):
super(ValueNorm, self).__init__()
self.input_shape = input_shape
self.norm_axes = norm_axes
self.epsilon = epsilon
self.beta = beta
self.per_element_update = per_element_update
self.tpdv = dict(dtype=torch.float32, device=device)
self.running_mean = nn.Parameter(torch.zeros(input_shape), requires_grad=False).to(**self.tpdv)
self.running_mean_sq = nn.Parameter(torch.zeros(input_shape), requires_grad=False).to(**self.tpdv)
self.debiasing_term = nn.Parameter(torch.tensor(0.0), requires_grad=False).to(**self.tpdv)
self.reset_parameters()
def reset_parameters(self):
self.running_mean.zero_()
self.running_mean_sq.zero_()
self.debiasing_term.zero_()
def running_mean_var(self):
debiased_mean = self.running_mean / self.debiasing_term.clamp(min=self.epsilon)
debiased_mean_sq = self.running_mean_sq / self.debiasing_term.clamp(min=self.epsilon)
debiased_var = (debiased_mean_sq - debiased_mean ** 2).clamp(min=1e-2)
return debiased_mean, debiased_var
@torch.no_grad()
def update(self, input_vector):
if type(input_vector) == np.ndarray:
input_vector = torch.from_numpy(input_vector)
input_vector = input_vector.to(**self.tpdv)
batch_mean = input_vector.mean(dim=tuple(range(self.norm_axes)))
batch_sq_mean = (input_vector ** 2).mean(dim=tuple(range(self.norm_axes)))
if self.per_element_update:
batch_size = np.prod(input_vector.size()[:self.norm_axes])
weight = self.beta ** batch_size
else:
weight = self.beta
self.running_mean.mul_(weight).add_(batch_mean * (1.0 - weight))
self.running_mean_sq.mul_(weight).add_(batch_sq_mean * (1.0 - weight))
self.debiasing_term.mul_(weight).add_(1.0 * (1.0 - weight))
def normalize(self, input_vector):
# Make sure input is float32
if type(input_vector) == np.ndarray:
input_vector = torch.from_numpy(input_vector)
input_vector = input_vector.to(**self.tpdv)
mean, var = self.running_mean_var()
out = (input_vector - mean[(None,) * self.norm_axes]) / torch.sqrt(var)[(None,) * self.norm_axes]
return out
def denormalize(self, input_vector):
""" Transform normalized data back into original distribution """
if type(input_vector) == np.ndarray:
input_vector = torch.from_numpy(input_vector)
input_vector = input_vector.to(**self.tpdv)
mean, var = self.running_mean_var()
out = input_vector * torch.sqrt(var)[(None,) * self.norm_axes] + mean[(None,) * self.norm_axes]
out = out.cpu().numpy()
return out
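# Usage sketch (illustrative only): using ValueNorm to keep running statistics of return
# targets and regress the critic against normalized values. The shapes, the dummy data, and
# the simple squared loss below are assumptions for the example.
if __name__ == "__main__":
    value_normalizer = ValueNorm(input_shape=1)
    returns = torch.randn(400, 1) * 50.0 + 10.0      # raw, un-normalized return targets
    value_normalizer.update(returns)                 # update debiased running mean / variance
    normalized_targets = value_normalizer.normalize(returns)
    value_preds = torch.zeros(400, 1)                # stand-in critic outputs
    value_loss = ((normalized_targets - value_preds) ** 2).mean()
    # denormalize() maps normalized predictions back to the original return scale (numpy output)
    rescaled_preds = value_normalizer.denormalize(value_preds)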