Commit ea508a3a authored by hezhiqiang01's avatar hezhiqiang01

fix errors when share_policy = false

parent f2073aa3
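Context for the fix: --share_policy is registered with action="store_false", so passing the flag can only turn sharing off; this commit also flips its default to False so the separated-policy runners are used without any flag. A minimal argparse sketch of that flag semantics (not part of the commit; standalone and illustrative):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--share_policy", action="store_false", default=False,
                    help="Whether agents share the same policy")
print(parser.parse_args([]).share_policy)                  # False (the new default)
print(parser.parse_args(["--share_policy"]).share_policy)  # also False; the flag cannot re-enable sharing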
*.pyc
results
.*
\ No newline at end of file
......@@ -154,133 +154,344 @@ def get_config():
by default None. set the path to pretrained model.
"""
parser = argparse.ArgumentParser(
description='onpolicy', formatter_class=argparse.RawDescriptionHelpFormatter)
description="onpolicy", formatter_class=argparse.RawDescriptionHelpFormatter
)
# prepare parameters
parser.add_argument("--algorithm_name", type=str,
default='mappo', choices=["rmappo", "mappo"])
parser.add_argument("--algorithm_name", type=str, default="mappo", choices=["rmappo", "mappo"])
parser.add_argument("--experiment_name", type=str, default="check", help="an identifier to distinguish different experiment.")
parser.add_argument(
"--experiment_name",
type=str,
default="check",
help="an identifier to distinguish different experiment.",
)
parser.add_argument("--seed", type=int, default=1, help="Random seed for numpy/torch")
parser.add_argument("--cuda", action='store_false', default=True, help="by default True, will use GPU to train; or else will use CPU;")
parser.add_argument("--cuda_deterministic",
action='store_false', default=True, help="by default, make sure random seed effective. if set, bypass such function.")
parser.add_argument("--n_training_threads", type=int,
default=1, help="Number of torch threads for training")
parser.add_argument("--n_rollout_threads", type=int, default=5,
help="Number of parallel envs for training rollouts")
parser.add_argument("--n_eval_rollout_threads", type=int, default=1,
help="Number of parallel envs for evaluating rollouts")
parser.add_argument("--n_render_rollout_threads", type=int, default=1,
help="Number of parallel envs for rendering rollouts")
parser.add_argument("--num_env_steps", type=int, default=10e6,
help='Number of environment steps to train (default: 10e6)')
parser.add_argument("--user_name", type=str, default='marl',help="[for wandb usage], to specify user's name for simply collecting training data.")
parser.add_argument(
"--cuda",
action="store_false",
default=True,
help="by default True, will use GPU to train; or else will use CPU;",
)
parser.add_argument(
"--cuda_deterministic",
action="store_false",
default=True,
help="by default, make sure random seed effective. if set, bypass such function.",
)
parser.add_argument(
"--n_training_threads",
type=int,
default=2,
help="Number of torch threads for training",
)
parser.add_argument(
"--n_rollout_threads",
type=int,
default=5,
help="Number of parallel envs for training rollouts",
)
parser.add_argument(
"--n_eval_rollout_threads",
type=int,
default=2,
help="Number of parallel envs for evaluating rollouts",
)
parser.add_argument(
"--n_render_rollout_threads",
type=int,
default=1,
help="Number of parallel envs for rendering rollouts",
)
parser.add_argument(
"--num_env_steps",
type=int,
default=10e6,
help="Number of environment steps to train (default: 10e6)",
)
parser.add_argument(
"--user_name",
type=str,
default="marl",
help="[for wandb usage], to specify user's name for simply collecting training data.",
)
# env parameters
parser.add_argument("--env_name", type=str, default='MyEnv', help="specify the name of environment")
parser.add_argument("--use_obs_instead_of_state", action='store_true',
default=False, help="Whether to use global state or concatenated obs")
parser.add_argument("--env_name", type=str, default="MyEnv", help="specify the name of environment")
parser.add_argument(
"--use_obs_instead_of_state",
action="store_true",
default=False,
help="Whether to use global state or concatenated obs",
)
# replay buffer parameters
parser.add_argument("--episode_length", type=int,
default=200, help="Max length for any episode")
parser.add_argument("--episode_length", type=int, default=200, help="Max length for any episode")
# network parameters
parser.add_argument("--share_policy", action='store_false',
default=True, help='Whether agent share the same policy')
parser.add_argument("--use_centralized_V", action='store_false',
default=True, help="Whether to use centralized V function")
parser.add_argument("--stacked_frames", type=int, default=1,
help="Dimension of hidden layers for actor/critic networks")
parser.add_argument("--use_stacked_frames", action='store_true',
default=False, help="Whether to use stacked_frames")
parser.add_argument("--hidden_size", type=int, default=64,
help="Dimension of hidden layers for actor/critic networks")
parser.add_argument("--layer_N", type=int, default=1,
help="Number of layers for actor/critic networks")
parser.add_argument("--use_ReLU", action='store_false',
default=True, help="Whether to use ReLU")
parser.add_argument("--use_popart", action='store_true', default=False, help="by default False, use PopArt to normalize rewards.")
parser.add_argument("--use_valuenorm", action='store_false', default=True, help="by default True, use running mean and std to normalize rewards.")
parser.add_argument("--use_feature_normalization", action='store_false',
default=True, help="Whether to apply layernorm to the inputs")
parser.add_argument("--use_orthogonal", action='store_false', default=True,
help="Whether to use Orthogonal initialization for weights and 0 initialization for biases")
parser.add_argument("--gain", type=float, default=0.01,
help="The gain # of last action layer")
parser.add_argument(
"--share_policy",
action="store_false",
default=False,
help="Whether agent share the same policy",
)
parser.add_argument(
"--use_centralized_V",
action="store_false",
default=True,
help="Whether to use centralized V function",
)
parser.add_argument(
"--stacked_frames",
type=int,
default=1,
help="Dimension of hidden layers for actor/critic networks",
)
parser.add_argument(
"--use_stacked_frames",
action="store_true",
default=False,
help="Whether to use stacked_frames",
)
parser.add_argument(
"--hidden_size",
type=int,
default=64,
help="Dimension of hidden layers for actor/critic networks",
)
parser.add_argument(
"--layer_N",
type=int,
default=1,
help="Number of layers for actor/critic networks",
)
parser.add_argument("--use_ReLU", action="store_false", default=True, help="Whether to use ReLU")
parser.add_argument(
"--use_popart",
action="store_true",
default=False,
help="by default False, use PopArt to normalize rewards.",
)
parser.add_argument(
"--use_valuenorm",
action="store_false",
default=True,
help="by default True, use running mean and std to normalize rewards.",
)
parser.add_argument(
"--use_feature_normalization",
action="store_false",
default=True,
help="Whether to apply layernorm to the inputs",
)
parser.add_argument(
"--use_orthogonal",
action="store_false",
default=True,
help="Whether to use Orthogonal initialization for weights and 0 initialization for biases",
)
parser.add_argument("--gain", type=float, default=0.01, help="The gain # of last action layer")
# recurrent parameters
parser.add_argument("--use_naive_recurrent_policy", action='store_true',
default=False, help='Whether to use a naive recurrent policy')
parser.add_argument("--use_recurrent_policy", action='store_false',
default=False, help='use a recurrent policy')
parser.add_argument(
"--use_naive_recurrent_policy",
action="store_true",
default=False,
help="Whether to use a naive recurrent policy",
)
parser.add_argument(
"--use_recurrent_policy",
action="store_false",
default=False,
help="use a recurrent policy",
)
parser.add_argument("--recurrent_N", type=int, default=1, help="The number of recurrent layers.")
parser.add_argument("--data_chunk_length", type=int, default=10,
help="Time length of chunks used to train a recurrent_policy")
parser.add_argument(
"--data_chunk_length",
type=int,
default=10,
help="Time length of chunks used to train a recurrent_policy",
)
# optimizer parameters
parser.add_argument("--lr", type=float, default=5e-4,
help='learning rate (default: 5e-4)')
parser.add_argument("--critic_lr", type=float, default=5e-4,
help='critic learning rate (default: 5e-4)')
parser.add_argument("--opti_eps", type=float, default=1e-5,
help='RMSprop optimizer epsilon (default: 1e-5)')
parser.add_argument("--lr", type=float, default=5e-4, help="learning rate (default: 5e-4)")
parser.add_argument(
"--critic_lr",
type=float,
default=5e-4,
help="critic learning rate (default: 5e-4)",
)
parser.add_argument(
"--opti_eps",
type=float,
default=1e-5,
help="RMSprop optimizer epsilon (default: 1e-5)",
)
parser.add_argument("--weight_decay", type=float, default=0)
# ppo parameters
parser.add_argument("--ppo_epoch", type=int, default=15,
help='number of ppo epochs (default: 15)')
parser.add_argument("--use_clipped_value_loss",
action='store_false', default=True, help="by default, clip loss value. If set, do not clip loss value.")
parser.add_argument("--clip_param", type=float, default=0.2,
help='ppo clip parameter (default: 0.2)')
parser.add_argument("--num_mini_batch", type=int, default=1,
help='number of batches for ppo (default: 1)')
parser.add_argument("--entropy_coef", type=float, default=0.01,
help='entropy term coefficient (default: 0.01)')
parser.add_argument("--value_loss_coef", type=float,
default=1, help='value loss coefficient (default: 0.5)')
parser.add_argument("--use_max_grad_norm",
action='store_false', default=True, help="by default, use max norm of gradients. If set, do not use.")
parser.add_argument("--max_grad_norm", type=float, default=10.0,
help='max norm of gradients (default: 0.5)')
parser.add_argument("--use_gae", action='store_false',
default=True, help='use generalized advantage estimation')
parser.add_argument("--gamma", type=float, default=0.99,
help='discount factor for rewards (default: 0.99)')
parser.add_argument("--gae_lambda", type=float, default=0.95,
help='gae lambda parameter (default: 0.95)')
parser.add_argument("--use_proper_time_limits", action='store_true',
default=False, help='compute returns taking into account time limits')
parser.add_argument("--use_huber_loss", action='store_false', default=True, help="by default, use huber loss. If set, do not use huber loss.")
parser.add_argument("--use_value_active_masks",
action='store_false', default=True, help="by default True, whether to mask useless data in value loss.")
parser.add_argument("--use_policy_active_masks",
action='store_false', default=True, help="by default True, whether to mask useless data in policy loss.")
parser.add_argument("--ppo_epoch", type=int, default=15, help="number of ppo epochs (default: 15)")
parser.add_argument(
"--use_clipped_value_loss",
action="store_false",
default=True,
help="by default, clip loss value. If set, do not clip loss value.",
)
parser.add_argument(
"--clip_param",
type=float,
default=0.2,
help="ppo clip parameter (default: 0.2)",
)
parser.add_argument(
"--num_mini_batch",
type=int,
default=1,
help="number of batches for ppo (default: 1)",
)
parser.add_argument(
"--entropy_coef",
type=float,
default=0.01,
help="entropy term coefficient (default: 0.01)",
)
parser.add_argument(
"--value_loss_coef",
type=float,
default=1,
help="value loss coefficient (default: 0.5)",
)
parser.add_argument(
"--use_max_grad_norm",
action="store_false",
default=True,
help="by default, use max norm of gradients. If set, do not use.",
)
parser.add_argument(
"--max_grad_norm",
type=float,
default=10.0,
help="max norm of gradients (default: 0.5)",
)
parser.add_argument(
"--use_gae",
action="store_false",
default=True,
help="use generalized advantage estimation",
)
parser.add_argument(
"--gamma",
type=float,
default=0.99,
help="discount factor for rewards (default: 0.99)",
)
parser.add_argument(
"--gae_lambda",
type=float,
default=0.95,
help="gae lambda parameter (default: 0.95)",
)
parser.add_argument(
"--use_proper_time_limits",
action="store_true",
default=False,
help="compute returns taking into account time limits",
)
parser.add_argument(
"--use_huber_loss",
action="store_false",
default=True,
help="by default, use huber loss. If set, do not use huber loss.",
)
parser.add_argument(
"--use_value_active_masks",
action="store_false",
default=True,
help="by default True, whether to mask useless data in value loss.",
)
parser.add_argument(
"--use_policy_active_masks",
action="store_false",
default=True,
help="by default True, whether to mask useless data in policy loss.",
)
parser.add_argument("--huber_delta", type=float, default=10.0, help=" coefficience of huber loss.")
# run parameters
parser.add_argument("--use_linear_lr_decay", action='store_true',
default=False, help='use a linear schedule on the learning rate')
parser.add_argument(
"--use_linear_lr_decay",
action="store_true",
default=False,
help="use a linear schedule on the learning rate",
)
# save parameters
parser.add_argument("--save_interval", type=int, default=1, help="time duration between contiunous twice models saving.")
parser.add_argument(
"--save_interval",
type=int,
default=1,
help="time duration between contiunous twice models saving.",
)
# log parameters
parser.add_argument("--log_interval", type=int, default=5, help="time duration between contiunous twice log printing.")
parser.add_argument(
"--log_interval",
type=int,
default=5,
help="time duration between contiunous twice log printing.",
)
# eval parameters
parser.add_argument("--use_eval", action='store_true', default=False, help="by default, do not start evaluation. If set`, start evaluation alongside with training.")
parser.add_argument("--eval_interval", type=int, default=25, help="time duration between contiunous twice evaluation progress.")
parser.add_argument("--eval_episodes", type=int, default=32, help="number of episodes of a single evaluation.")
parser.add_argument(
"--use_eval",
action="store_true",
default=False,
help="by default, do not start evaluation. If set`, start evaluation alongside with training.",
)
parser.add_argument(
"--eval_interval",
type=int,
default=25,
help="time duration between contiunous twice evaluation progress.",
)
parser.add_argument(
"--eval_episodes",
type=int,
default=32,
help="number of episodes of a single evaluation.",
)
# render parameters
parser.add_argument("--save_gifs", action='store_true', default=False, help="by default, do not save render video. If set, save video.")
parser.add_argument("--use_render", action='store_true', default=False, help="by default, do not render the env during training. If set, start render. Note: something, the environment has internal render process which is not controlled by this hyperparam.")
parser.add_argument("--render_episodes", type=int, default=5, help="the number of episodes to render a given env")
parser.add_argument("--ifi", type=float, default=0.1, help="the play interval of each rendered image in saved video.")
parser.add_argument(
"--save_gifs",
action="store_true",
default=False,
help="by default, do not save render video. If set, save video.",
)
parser.add_argument(
"--use_render",
action="store_true",
default=False,
help="by default, do not render the env during training. If set, start render. Note: something, the environment has internal render process which is not controlled by this hyperparam.",
)
parser.add_argument(
"--render_episodes",
type=int,
default=5,
help="the number of episodes to render a given env",
)
parser.add_argument(
"--ifi",
type=float,
default=0.1,
help="the play interval of each rendered image in saved video.",
)
# pretrained parameters
parser.add_argument("--model_dir", type=str, default=None, help="by default None. set the path to pretrained model.")
parser.add_argument(
"--model_dir",
type=str,
default=None,
help="by default None. set the path to pretrained model.",
)
return parser
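A hedged usage sketch of get_config() (the argument values are illustrative): it returns the parser itself, so the env-specific parse_args() further down can append extra flags such as --num_agents before parsing argv.

parser = get_config()
parser.add_argument("--num_agents", type=int, default=2, help="number of players")
all_args = parser.parse_known_args(["--env_name", "MyEnv", "--seed", "3"])[0]
print(all_args.algorithm_name, all_args.num_agents, all_args.share_policy)  # mappo 2 False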
......@@ -36,30 +36,30 @@ class DiscreteActionEnv(object):
share_obs_dim = 0
total_action_space = []
for agent in range(self.num_agent):
for agent_idx in range(self.num_agent):
# physical action space
u_action_space = spaces.Discrete(self.signal_action_dim)  # 5 discrete actions
if self.movable:
# if self.movable:
total_action_space.append(u_action_space)
# total action space
if len(total_action_space) > 1:
# all action spaces are discrete, so simplify to MultiDiscrete action space
if all(
[
isinstance(act_space, spaces.Discrete)
for act_space in total_action_space
]
):
act_space = MultiDiscrete(
[[0, act_space.n - 1] for act_space in total_action_space]
)
else:
act_space = spaces.Tuple(total_action_space)
self.action_space.append(act_space)
else:
self.action_space.append(total_action_space[0])
# if len(total_action_space) > 1:
# # all action spaces are discrete, so simplify to MultiDiscrete action space
# if all(
# [
# isinstance(act_space, spaces.Discrete)
# for act_space in total_action_space
# ]
# ):
# act_space = MultiDiscrete(
# [[0, act_space.n - 1] for act_space in total_action_space]
# )
# else:
# act_space = spaces.Tuple(total_action_space)
# self.action_space.append(act_space)
# else:
self.action_space.append(total_action_space[agent_idx])
# observation space
share_obs_dim += self.signal_obs_dim
......@@ -73,9 +73,7 @@ class DiscreteActionEnv(object):
) # [-inf,inf]
self.share_observation_space = [
spaces.Box(
low=-np.inf, high=+np.inf, shape=(share_obs_dim,), dtype=np.float32
)
spaces.Box(low=-np.inf, high=+np.inf, shape=(share_obs_dim,), dtype=np.float32)
for _ in range(self.num_agent)
]
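A small sketch of the spaces this constructor now builds (assuming num_agent=2, signal_action_dim=5 and signal_obs_dim=14, as implied by the surrounding code): each agent keeps its own Discrete action space instead of the old combined MultiDiscrete, and every agent gets the same concatenated share-observation Box.

import numpy as np
from gym import spaces

num_agent, signal_action_dim, signal_obs_dim = 2, 5, 14
action_space = [spaces.Discrete(signal_action_dim) for _ in range(num_agent)]
share_obs_dim = signal_obs_dim * num_agent
share_observation_space = [
    spaces.Box(low=-np.inf, high=+np.inf, shape=(share_obs_dim,), dtype=np.float32)
    for _ in range(num_agent)
]
print(action_space[0].n, share_observation_space[0].shape)  # 5 (28,)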
......@@ -135,12 +133,7 @@ class MultiDiscrete:
"""Returns a array with one sample from each discrete action space"""
# For each row: round(random .* (max - min) + min, 0)
random_array = np.random.rand(self.num_discrete_space)
return [
int(x)
for x in np.floor(
np.multiply((self.high - self.low + 1.0), random_array) + self.low
)
]
return [int(x) for x in np.floor(np.multiply((self.high - self.low + 1.0), random_array) + self.low)]
def contains(self, x):
return (
......@@ -157,9 +150,7 @@ class MultiDiscrete:
return "MultiDiscrete" + str(self.num_discrete_space)
def __eq__(self, other):
return np.array_equal(self.low, other.low) and np.array_equal(
self.high, other.high
)
return np.array_equal(self.low, other.low) and np.array_equal(self.high, other.high)
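A hedged usage sketch of the MultiDiscrete helper above (the constructor is assumed to take a list of [min, max] pairs, as in the env code): sample() draws one integer per sub-space.

space = MultiDiscrete([[0, 4], [0, 1]])  # two sub-spaces with 5 and 2 actions
print(space.sample())                    # e.g. [3, 0]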
if __name__ == "__main__":
......
......@@ -46,7 +46,7 @@ class DummyVecEnv():
return obs, rews, dones, infos
def reset(self):
obs = [env.reset() for env in self.envs]
obs = [env.reset() for env in self.envs] # [env_num, agent_num, obs_dim]
return np.array(obs)
def close(self):
......
import time
import wandb
import os
import numpy as np
from itertools import chain
......@@ -10,17 +8,18 @@ from tensorboardX import SummaryWriter
from utils.separated_buffer import SeparatedReplayBuffer
from utils.util import update_linear_schedule
def _t2n(x):
return x.detach().cpu().numpy()
class Runner(object):
def __init__(self, config):
self.all_args = config['all_args']
self.envs = config['envs']
self.eval_envs = config['eval_envs']
self.device = config['device']
self.num_agents = config['num_agents']
self.all_args = config["all_args"]
self.envs = config["envs"]
self.eval_envs = config["eval_envs"]
self.device = config["device"]
self.num_agents = config["num_agents"]
# parameters
self.env_name = self.all_args.env_name
......@@ -34,7 +33,6 @@ class Runner(object):
self.n_eval_rollout_threads = self.all_args.n_eval_rollout_threads
self.use_linear_lr_decay = self.all_args.use_linear_lr_decay
self.hidden_size = self.all_args.hidden_size
self.use_wandb = self.all_args.use_wandb
self.use_render = self.all_args.use_render
self.recurrent_N = self.all_args.recurrent_N
......@@ -49,37 +47,42 @@ class Runner(object):
if self.use_render:
import imageio
self.run_dir = config["run_dir"]
self.gif_dir = str(self.run_dir / 'gifs')
self.gif_dir = str(self.run_dir / "gifs")
if not os.path.exists(self.gif_dir):
os.makedirs(self.gif_dir)
else:
if self.use_wandb:
self.save_dir = str(wandb.run.dir)
else:
# if self.use_wandb:
# self.save_dir = str(wandb.run.dir)
# else:
self.run_dir = config["run_dir"]
self.log_dir = str(self.run_dir / 'logs')
self.log_dir = str(self.run_dir / "logs")
if not os.path.exists(self.log_dir):
os.makedirs(self.log_dir)
self.writter = SummaryWriter(self.log_dir)
self.save_dir = str(self.run_dir / 'models')
self.save_dir = str(self.run_dir / "models")
if not os.path.exists(self.save_dir):
os.makedirs(self.save_dir)
from algorithms.algorithm.r_mappo import RMAPPO as TrainAlgo
from algorithms.algorithm.rMAPPOPolicy import RMAPPOPolicy as Policy
self.policy = []
for agent_id in range(self.num_agents):
share_observation_space = self.envs.share_observation_space[agent_id] if self.use_centralized_V else self.envs.observation_space[agent_id]
share_observation_space = (
self.envs.share_observation_space[agent_id]
if self.use_centralized_V
else self.envs.observation_space[agent_id]
)
# policy network
po = Policy(self.all_args,
po = Policy(
self.all_args,
self.envs.observation_space[agent_id],
share_observation_space,
self.envs.action_space[agent_id],
device = self.device)
device=self.device,
)
self.policy.append(po)
if self.model_dir is not None:
......@@ -89,13 +92,19 @@ class Runner(object):
self.buffer = []
for agent_id in range(self.num_agents):
# algorithm
tr = TrainAlgo(self.all_args, self.policy[agent_id], device = self.device)
tr = TrainAlgo(self.all_args, self.policy[agent_id], device=self.device)
# buffer
share_observation_space = self.envs.share_observation_space[agent_id] if self.use_centralized_V else self.envs.observation_space[agent_id]
bu = SeparatedReplayBuffer(self.all_args,
share_observation_space = (
self.envs.share_observation_space[agent_id]
if self.use_centralized_V
else self.envs.observation_space[agent_id]
)
bu = SeparatedReplayBuffer(
self.all_args,
self.envs.observation_space[agent_id],
share_observation_space,
self.envs.action_space[agent_id])
self.envs.action_space[agent_id],
)
self.buffer.append(bu)
self.trainer.append(tr)
......@@ -115,9 +124,11 @@ class Runner(object):
def compute(self):
for agent_id in range(self.num_agents):
self.trainer[agent_id].prep_rollout()
next_value = self.trainer[agent_id].policy.get_values(self.buffer[agent_id].share_obs[-1],
next_value = self.trainer[agent_id].policy.get_values(
self.buffer[agent_id].share_obs[-1],
self.buffer[agent_id].rnn_states_critic[-1],
self.buffer[agent_id].masks[-1])
self.buffer[agent_id].masks[-1],
)
next_value = _t2n(next_value)
self.buffer[agent_id].compute_returns(next_value, self.trainer[agent_id].value_normalizer)
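For reference, a hedged sketch of the return computation compute_returns is expected to perform when use_gae=True (the standard GAE(lambda) recursion matching the gamma/gae_lambda defaults in config.py; the real SeparatedReplayBuffer may additionally apply the value normalizer):

import numpy as np

def gae_returns(rewards, value_preds, masks, gamma=0.99, gae_lambda=0.95):
    # rewards: [T, ...]; value_preds and masks: [T + 1, ...], where the last
    # entries are the bootstrap value and mask for the state after the final step.
    T = rewards.shape[0]
    returns = np.zeros_like(value_preds)
    gae = 0.0
    for step in reversed(range(T)):
        delta = rewards[step] + gamma * value_preds[step + 1] * masks[step + 1] - value_preds[step]
        gae = delta + gamma * gae_lambda * masks[step + 1] * gae
        returns[step] = gae + value_preds[step]
    return returns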
......@@ -134,30 +145,39 @@ class Runner(object):
def save(self):
for agent_id in range(self.num_agents):
policy_actor = self.trainer[agent_id].policy.actor
torch.save(policy_actor.state_dict(), str(self.save_dir) + "/actor_agent" + str(agent_id) + ".pt")
torch.save(
policy_actor.state_dict(),
str(self.save_dir) + "/actor_agent" + str(agent_id) + ".pt",
)
policy_critic = self.trainer[agent_id].policy.critic
torch.save(policy_critic.state_dict(), str(self.save_dir) + "/critic_agent" + str(agent_id) + ".pt")
torch.save(
policy_critic.state_dict(),
str(self.save_dir) + "/critic_agent" + str(agent_id) + ".pt",
)
def restore(self):
for agent_id in range(self.num_agents):
policy_actor_state_dict = torch.load(str(self.model_dir) + '/actor_agent' + str(agent_id) + '.pt')
policy_actor_state_dict = torch.load(str(self.model_dir) + "/actor_agent" + str(agent_id) + ".pt")
self.policy[agent_id].actor.load_state_dict(policy_actor_state_dict)
policy_critic_state_dict = torch.load(str(self.model_dir) + '/critic_agent' + str(agent_id) + '.pt')
policy_critic_state_dict = torch.load(
str(self.model_dir) + "/critic_agent" + str(agent_id) + ".pt"
)
self.policy[agent_id].critic.load_state_dict(policy_critic_state_dict)
def log_train(self, train_infos, total_num_steps):
for agent_id in range(self.num_agents):
for k, v in train_infos[agent_id].items():
agent_k = "agent%i/" % agent_id + k
if self.use_wandb:
wandb.log({agent_k: v}, step=total_num_steps)
else:
# if self.use_wandb:
# pass
# wandb.log({agent_k: v}, step=total_num_steps)
# else:
self.writter.add_scalars(agent_k, {agent_k: v}, total_num_steps)
def log_env(self, env_infos, total_num_steps):
for k, v in env_infos.items():
if len(v) > 0:
if self.use_wandb:
wandb.log({k: np.mean(v)}, step=total_num_steps)
else:
# if self.use_wandb:
# wandb.log({k: np.mean(v)}, step=total_num_steps)
# else:
self.writter.add_scalars(k, {k: np.mean(v)}, total_num_steps)
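With the wandb branches commented out above, metrics now go only to TensorBoard. A hedged sketch of what log_train reduces to (tag names and the step value are illustrative):

from tensorboardX import SummaryWriter

writter = SummaryWriter("logs")                             # self.writter in the runner
train_infos = [{"value_loss": 0.12}, {"value_loss": 0.08}]  # one dict per agent
for agent_id, infos in enumerate(train_infos):
    for k, v in infos.items():
        agent_k = "agent%i/" % agent_id + k
        writter.add_scalars(agent_k, {agent_k: v}, 100)     # 100 stands in for total_num_steps
writter.close()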
"""
# @Time : 2021/7/1 7:14 PM
# @Author : hezhiqiang01
# @Email : hezhiqiang01@baidu.com
# @File : env_runner.py
"""
import time
import wandb
import os
import numpy as np
from itertools import chain
......@@ -14,7 +6,6 @@ import torch
from utils.util import update_linear_schedule
from runner.separated.base_runner import Runner
import imageio
def _t2n(x):
......@@ -29,9 +20,7 @@ class EnvRunner(Runner):
self.warmup()
start = time.time()
episodes = (
int(self.num_env_steps) // self.episode_length // self.n_rollout_threads
)
episodes = int(self.num_env_steps) // self.episode_length // self.n_rollout_threads
for episode in range(episodes):
if self.use_linear_lr_decay:
......@@ -72,9 +61,7 @@ class EnvRunner(Runner):
train_infos = self.train()
# post process
total_num_steps = (
(episode + 1) * self.episode_length * self.n_rollout_threads
)
total_num_steps = (episode + 1) * self.episode_length * self.n_rollout_threads
# save model
if episode % self.save_interval == 0 or episode == episodes - 1:
......@@ -102,14 +89,10 @@ class EnvRunner(Runner):
for info in infos:
if "individual_reward" in info[agent_id].keys():
idv_rews.append(info[agent_id]["individual_reward"])
train_infos[agent_id].update(
{"individual_rewards": np.mean(idv_rews)}
)
train_infos[agent_id].update({"individual_rewards": np.mean(idv_rews)})
train_infos[agent_id].update(
{
"average_episode_rewards": np.mean(
self.buffer[agent_id].rewards
)
"average_episode_rewards": np.mean(self.buffer[agent_id].rewards)
* self.episode_length
}
)
......@@ -121,12 +104,12 @@ class EnvRunner(Runner):
def warmup(self):
# reset env
obs = self.envs.reset()
obs = self.envs.reset() # shape = [env_num, agent_num, obs_dim]
share_obs = []
for o in obs:
share_obs.append(list(chain(*o)))
share_obs = np.array(share_obs)
share_obs = np.array(share_obs) # shape = [env_num, agent_num * obs_dim]
for agent_id in range(self.num_agents):
if not self.use_centralized_V:
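A hedged illustration of the share_obs construction in this warmup (array sizes are illustrative): chain(*o) flattens each env's per-agent observations into one centralized-state vector.

import numpy as np
from itertools import chain

obs = np.arange(2 * 3 * 4).reshape(2, 3, 4)           # [env_num, agent_num, obs_dim]
share_obs = np.array([list(chain(*o)) for o in obs])  # [env_num, agent_num * obs_dim]
assert share_obs.shape == (2, 12)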
......@@ -160,21 +143,17 @@ class EnvRunner(Runner):
# rearrange action
if self.envs.action_space[agent_id].__class__.__name__ == "MultiDiscrete":
for i in range(self.envs.action_space[agent_id].shape):
uc_action_env = np.eye(
self.envs.action_space[agent_id].high[i] + 1
)[action[:, i]]
uc_action_env = np.eye(self.envs.action_space[agent_id].high[i] + 1)[action[:, i]]
if i == 0:
action_env = uc_action_env
else:
action_env = np.concatenate((action_env, uc_action_env), axis=1)
elif self.envs.action_space[agent_id].__class__.__name__ == "Discrete":
action_env = np.squeeze(
np.eye(self.envs.action_space[agent_id].n)[action], 1
)
action_env = np.squeeze(np.eye(self.envs.action_space[agent_id].n)[action], 1)
else:
# TODO: adapt action_env here to whatever form your environment expects
action_env = actions
action_env = action
# raise NotImplementedError
actions.append(action)
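A hedged sketch of the Discrete branch of the action rearrangement above (n and the sampled actions are illustrative): agent actions are converted to one-hot vectors before being handed to the env.

import numpy as np

n = 5                                          # e.g. self.envs.action_space[agent_id].n
action = np.array([[1], [3], [0]])             # [n_rollout_threads, 1]
action_env = np.squeeze(np.eye(n)[action], 1)  # one-hot, shape [n_rollout_threads, n]
assert action_env.shape == (3, 5)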
......@@ -265,9 +244,7 @@ class EnvRunner(Runner):
),
dtype=np.float32,
)
eval_masks = np.ones(
(self.n_eval_rollout_threads, self.num_agents, 1), dtype=np.float32
)
eval_masks = np.ones((self.n_eval_rollout_threads, self.num_agents, 1), dtype=np.float32)
for eval_step in range(self.episode_length):
eval_temp_actions_env = []
......@@ -282,24 +259,16 @@ class EnvRunner(Runner):
eval_action = eval_action.detach().cpu().numpy()
# rearrange action
if (
self.eval_envs.action_space[agent_id].__class__.__name__
== "MultiDiscrete"
):
if self.eval_envs.action_space[agent_id].__class__.__name__ == "MultiDiscrete":
for i in range(self.eval_envs.action_space[agent_id].shape):
eval_uc_action_env = np.eye(
self.eval_envs.action_space[agent_id].high[i] + 1
)[eval_action[:, i]]
eval_uc_action_env = np.eye(self.eval_envs.action_space[agent_id].high[i] + 1)[
eval_action[:, i]
]
if i == 0:
eval_action_env = eval_uc_action_env
else:
eval_action_env = np.concatenate(
(eval_action_env, eval_uc_action_env), axis=1
)
elif (
self.eval_envs.action_space[agent_id].__class__.__name__
== "Discrete"
):
eval_action_env = np.concatenate((eval_action_env, eval_uc_action_env), axis=1)
elif self.eval_envs.action_space[agent_id].__class__.__name__ == "Discrete":
eval_action_env = np.squeeze(
np.eye(self.eval_envs.action_space[agent_id].n)[eval_action], 1
)
......@@ -318,36 +287,23 @@ class EnvRunner(Runner):
eval_actions_env.append(eval_one_hot_action_env)
# Observe reward and next obs
eval_obs, eval_rewards, eval_dones, eval_infos = self.eval_envs.step(
eval_actions_env
)
eval_obs, eval_rewards, eval_dones, eval_infos = self.eval_envs.step(eval_actions_env)
eval_episode_rewards.append(eval_rewards)
eval_rnn_states[eval_dones == True] = np.zeros(
((eval_dones == True).sum(), self.recurrent_N, self.hidden_size),
dtype=np.float32,
)
eval_masks = np.ones(
(self.n_eval_rollout_threads, self.num_agents, 1), dtype=np.float32
)
eval_masks[eval_dones == True] = np.zeros(
((eval_dones == True).sum(), 1), dtype=np.float32
)
eval_masks = np.ones((self.n_eval_rollout_threads, self.num_agents, 1), dtype=np.float32)
eval_masks[eval_dones == True] = np.zeros(((eval_dones == True).sum(), 1), dtype=np.float32)
eval_episode_rewards = np.array(eval_episode_rewards)
eval_train_infos = []
for agent_id in range(self.num_agents):
eval_average_episode_rewards = np.mean(
np.sum(eval_episode_rewards[:, :, agent_id], axis=0)
)
eval_train_infos.append(
{"eval_average_episode_rewards": eval_average_episode_rewards}
)
print(
"eval average episode rewards of agent%i: " % agent_id
+ str(eval_average_episode_rewards)
)
eval_average_episode_rewards = np.mean(np.sum(eval_episode_rewards[:, :, agent_id], axis=0))
eval_train_infos.append({"eval_average_episode_rewards": eval_average_episode_rewards})
print("eval average episode rewards of agent%i: " % agent_id + str(eval_average_episode_rewards))
self.log_train(eval_train_infos, total_num_steps)
......@@ -370,9 +326,7 @@ class EnvRunner(Runner):
),
dtype=np.float32,
)
masks = np.ones(
(self.n_rollout_threads, self.num_agents, 1), dtype=np.float32
)
masks = np.ones((self.n_rollout_threads, self.num_agents, 1), dtype=np.float32)
for step in range(self.episode_length):
calc_start = time.time()
......@@ -391,27 +345,15 @@ class EnvRunner(Runner):
action = action.detach().cpu().numpy()
# rearrange action
if (
self.envs.action_space[agent_id].__class__.__name__
== "MultiDiscrete"
):
if self.envs.action_space[agent_id].__class__.__name__ == "MultiDiscrete":
for i in range(self.envs.action_space[agent_id].shape):
uc_action_env = np.eye(
self.envs.action_space[agent_id].high[i] + 1
)[action[:, i]]
uc_action_env = np.eye(self.envs.action_space[agent_id].high[i] + 1)[action[:, i]]
if i == 0:
action_env = uc_action_env
else:
action_env = np.concatenate(
(action_env, uc_action_env), axis=1
)
elif (
self.envs.action_space[agent_id].__class__.__name__
== "Discrete"
):
action_env = np.squeeze(
np.eye(self.envs.action_space[agent_id].n)[action], 1
)
action_env = np.concatenate((action_env, uc_action_env), axis=1)
elif self.envs.action_space[agent_id].__class__.__name__ == "Discrete":
action_env = np.squeeze(np.eye(self.envs.action_space[agent_id].n)[action], 1)
else:
raise NotImplementedError
......@@ -434,12 +376,8 @@ class EnvRunner(Runner):
((dones == True).sum(), self.recurrent_N, self.hidden_size),
dtype=np.float32,
)
masks = np.ones(
(self.n_rollout_threads, self.num_agents, 1), dtype=np.float32
)
masks[dones == True] = np.zeros(
((dones == True).sum(), 1), dtype=np.float32
)
masks = np.ones((self.n_rollout_threads, self.num_agents, 1), dtype=np.float32)
masks[dones == True] = np.zeros(((dones == True).sum(), 1), dtype=np.float32)
if self.all_args.save_gifs:
image = self.envs.render("rgb_array")[0][0]
......@@ -451,13 +389,8 @@ class EnvRunner(Runner):
episode_rewards = np.array(episode_rewards)
for agent_id in range(self.num_agents):
average_episode_rewards = np.mean(
np.sum(episode_rewards[:, :, agent_id], axis=0)
)
print(
"eval average episode rewards of agent%i: " % agent_id
+ str(average_episode_rewards)
)
average_episode_rewards = np.mean(np.sum(episode_rewards[:, :, agent_id], axis=0))
print("eval average episode rewards of agent%i: " % agent_id + str(average_episode_rewards))
if self.all_args.save_gifs:
imageio.mimsave(
......
......@@ -27,9 +27,7 @@ class EnvRunner(Runner):
self.warmup()
start = time.time()
episodes = (
int(self.num_env_steps) // self.episode_length // self.n_rollout_threads
)
episodes = int(self.num_env_steps) // self.episode_length // self.n_rollout_threads
for episode in range(episodes):
if self.use_linear_lr_decay:
......@@ -69,9 +67,7 @@ class EnvRunner(Runner):
train_infos = self.train()
# post process
total_num_steps = (
(episode + 1) * self.episode_length * self.n_rollout_threads
)
total_num_steps = (episode + 1) * self.episode_length * self.n_rollout_threads
# save model
if episode % self.save_interval == 0 or episode == episodes - 1:
......@@ -103,14 +99,8 @@ class EnvRunner(Runner):
# agent_k = 'agent%i/individual_rewards' % agent_id
# env_infos[agent_k] = idv_rews
train_infos["average_episode_rewards"] = (
np.mean(self.buffer.rewards) * self.episode_length
)
print(
"average episode rewards is {}".format(
train_infos["average_episode_rewards"]
)
)
train_infos["average_episode_rewards"] = np.mean(self.buffer.rewards) * self.episode_length
print("average episode rewards is {}".format(train_infos["average_episode_rewards"]))
self.log_train(train_infos, total_num_steps)
# self.log_env(env_infos, total_num_steps)
......@@ -120,14 +110,14 @@ class EnvRunner(Runner):
def warmup(self):
# reset env
obs = self.envs.reset() # shape = (5, 2, 14)
obs = self.envs.reset() # shape = [env_num, agent_num, obs_dim]
# replay buffer
if self.use_centralized_V:
share_obs = obs.reshape(self.n_rollout_threads, -1) # shape = (5, 28)
share_obs = obs.reshape(self.n_rollout_threads, -1) # shape = [env_num, agent_num * obs_dim]
share_obs = np.expand_dims(share_obs, 1).repeat(
self.num_agents, axis=1
) # shape = (5, 2, 28)
) # shape = [env_num, agent_num, agent_num * obs_dim]
else:
share_obs = obs
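A hedged illustration of the centralized-V branch above (shapes follow the comments in the code): every agent receives the same concatenation of all agents' observations within its env.

import numpy as np

obs = np.arange(5 * 2 * 14).reshape(5, 2, 14)               # [env_num, agent_num, obs_dim]
share_obs = obs.reshape(5, -1)                              # [env_num, agent_num * obs_dim]
share_obs = np.expand_dims(share_obs, 1).repeat(2, axis=1)  # [env_num, agent_num, agent_num * obs_dim]
assert share_obs.shape == (5, 2, 28)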
......@@ -151,21 +141,21 @@ class EnvRunner(Runner):
np.concatenate(self.buffer.masks[step]),
)
# [self.envs, agents, dim]
values = np.array(np.split(_t2n(value), self.n_rollout_threads))
actions = np.array(np.split(_t2n(action), self.n_rollout_threads))
values = np.array(np.split(_t2n(value), self.n_rollout_threads)) # [env_num, agent_num, 1]
actions = np.array(np.split(_t2n(action), self.n_rollout_threads)) # [env_num, agent_num, action_dim]
action_log_probs = np.array(
np.split(_t2n(action_log_prob), self.n_rollout_threads)
)
rnn_states = np.array(np.split(_t2n(rnn_states), self.n_rollout_threads))
) # [env_num, agent_num, 1]
rnn_states = np.array(
np.split(_t2n(rnn_states), self.n_rollout_threads)
) # [env_num, agent_num, 1, hidden_size]
rnn_states_critic = np.array(
np.split(_t2n(rnn_states_critic), self.n_rollout_threads)
)
) # [env_num, agent_num, 1, hidden_size]
# rearrange action
if self.envs.action_space[0].__class__.__name__ == "MultiDiscrete":
for i in range(self.envs.action_space[0].shape):
uc_actions_env = np.eye(self.envs.action_space[0].high[i] + 1)[
actions[:, :, i]
]
uc_actions_env = np.eye(self.envs.action_space[0].high[i] + 1)[actions[:, :, i]]
if i == 0:
actions_env = uc_actions_env
else:
......@@ -239,9 +229,7 @@ class EnvRunner(Runner):
(self.n_eval_rollout_threads, *self.buffer.rnn_states.shape[2:]),
dtype=np.float32,
)
eval_masks = np.ones(
(self.n_eval_rollout_threads, self.num_agents, 1), dtype=np.float32
)
eval_masks = np.ones((self.n_eval_rollout_threads, self.num_agents, 1), dtype=np.float32)
for eval_step in range(self.episode_length):
self.trainer.prep_rollout()
......@@ -251,60 +239,39 @@ class EnvRunner(Runner):
np.concatenate(eval_masks),
deterministic=True,
)
eval_actions = np.array(
np.split(_t2n(eval_action), self.n_eval_rollout_threads)
)
eval_rnn_states = np.array(
np.split(_t2n(eval_rnn_states), self.n_eval_rollout_threads)
)
eval_actions = np.array(np.split(_t2n(eval_action), self.n_eval_rollout_threads))
eval_rnn_states = np.array(np.split(_t2n(eval_rnn_states), self.n_eval_rollout_threads))
if self.eval_envs.action_space[0].__class__.__name__ == "MultiDiscrete":
for i in range(self.eval_envs.action_space[0].shape):
eval_uc_actions_env = np.eye(
self.eval_envs.action_space[0].high[i] + 1
)[eval_actions[:, :, i]]
eval_uc_actions_env = np.eye(self.eval_envs.action_space[0].high[i] + 1)[
eval_actions[:, :, i]
]
if i == 0:
eval_actions_env = eval_uc_actions_env
else:
eval_actions_env = np.concatenate(
(eval_actions_env, eval_uc_actions_env), axis=2
)
eval_actions_env = np.concatenate((eval_actions_env, eval_uc_actions_env), axis=2)
elif self.eval_envs.action_space[0].__class__.__name__ == "Discrete":
eval_actions_env = np.squeeze(
np.eye(self.eval_envs.action_space[0].n)[eval_actions], 2
)
eval_actions_env = np.squeeze(np.eye(self.eval_envs.action_space[0].n)[eval_actions], 2)
else:
raise NotImplementedError
# Observe reward and next obs
eval_obs, eval_rewards, eval_dones, eval_infos = self.eval_envs.step(
eval_actions_env
)
eval_obs, eval_rewards, eval_dones, eval_infos = self.eval_envs.step(eval_actions_env)
eval_episode_rewards.append(eval_rewards)
eval_rnn_states[eval_dones == True] = np.zeros(
((eval_dones == True).sum(), self.recurrent_N, self.hidden_size),
dtype=np.float32,
)
eval_masks = np.ones(
(self.n_eval_rollout_threads, self.num_agents, 1), dtype=np.float32
)
eval_masks[eval_dones == True] = np.zeros(
((eval_dones == True).sum(), 1), dtype=np.float32
)
eval_masks = np.ones((self.n_eval_rollout_threads, self.num_agents, 1), dtype=np.float32)
eval_masks[eval_dones == True] = np.zeros(((eval_dones == True).sum(), 1), dtype=np.float32)
eval_episode_rewards = np.array(eval_episode_rewards)
eval_env_infos = {}
eval_env_infos["eval_average_episode_rewards"] = np.sum(
np.array(eval_episode_rewards), axis=0
)
eval_average_episode_rewards = np.mean(
eval_env_infos["eval_average_episode_rewards"]
)
print(
"eval average episode rewards of agent: "
+ str(eval_average_episode_rewards)
)
eval_env_infos["eval_average_episode_rewards"] = np.sum(np.array(eval_episode_rewards), axis=0)
eval_average_episode_rewards = np.mean(eval_env_infos["eval_average_episode_rewards"])
print("eval average episode rewards of agent: " + str(eval_average_episode_rewards))
self.log_env(eval_env_infos, total_num_steps)
@torch.no_grad()
......@@ -330,9 +297,7 @@ class EnvRunner(Runner):
),
dtype=np.float32,
)
masks = np.ones(
(self.n_rollout_threads, self.num_agents, 1), dtype=np.float32
)
masks = np.ones((self.n_rollout_threads, self.num_agents, 1), dtype=np.float32)
episode_rewards = []
......@@ -347,21 +312,15 @@ class EnvRunner(Runner):
deterministic=True,
)
actions = np.array(np.split(_t2n(action), self.n_rollout_threads))
rnn_states = np.array(
np.split(_t2n(rnn_states), self.n_rollout_threads)
)
rnn_states = np.array(np.split(_t2n(rnn_states), self.n_rollout_threads))
if envs.action_space[0].__class__.__name__ == "MultiDiscrete":
for i in range(envs.action_space[0].shape):
uc_actions_env = np.eye(envs.action_space[0].high[i] + 1)[
actions[:, :, i]
]
uc_actions_env = np.eye(envs.action_space[0].high[i] + 1)[actions[:, :, i]]
if i == 0:
actions_env = uc_actions_env
else:
actions_env = np.concatenate(
(actions_env, uc_actions_env), axis=2
)
actions_env = np.concatenate((actions_env, uc_actions_env), axis=2)
elif envs.action_space[0].__class__.__name__ == "Discrete":
actions_env = np.squeeze(np.eye(envs.action_space[0].n)[actions], 2)
else:
......@@ -375,12 +334,8 @@ class EnvRunner(Runner):
((dones == True).sum(), self.recurrent_N, self.hidden_size),
dtype=np.float32,
)
masks = np.ones(
(self.n_rollout_threads, self.num_agents, 1), dtype=np.float32
)
masks[dones == True] = np.zeros(
((dones == True).sum(), 1), dtype=np.float32
)
masks = np.ones((self.n_rollout_threads, self.num_agents, 1), dtype=np.float32)
masks[dones == True] = np.zeros(((dones == True).sum(), 1), dtype=np.float32)
if self.all_args.save_gifs:
image = envs.render("rgb_array")[0][0]
......@@ -392,10 +347,7 @@ class EnvRunner(Runner):
else:
envs.render("human")
print(
"average episode rewards is: "
+ str(np.mean(np.sum(np.array(episode_rewards), axis=0)))
)
print("average episode rewards is: " + str(np.mean(np.sum(np.array(episode_rewards), axis=0))))
# if self.all_args.save_gifs:
# imageio.mimsave(str(self.gif_dir) + '/render.gif', all_frames, duration=self.all_args.ifi)
......@@ -31,11 +31,15 @@ def make_train_env(all_args):
def init_env():
# TODO: choose a continuous or a discrete action space by commenting/uncommenting the corresponding pair of import/env lines below.
from envs.env_continuous import ContinuousActionEnv
env = ContinuousActionEnv()
# from envs.env_discrete import DiscreteActionEnv
# env = DiscreteActionEnv()
env.seed(all_args.seed + rank * 1000)
return env
......@@ -63,9 +67,7 @@ def make_eval_env(all_args):
def parse_args(args, parser):
parser.add_argument(
"--scenario_name", type=str, default="MyEnv", help="Which scenario to run on"
)
parser.add_argument("--scenario_name", type=str, default="MyEnv", help="Which scenario to run on")
parser.add_argument("--num_landmarks", type=int, default=3)
parser.add_argument("--num_agents", type=int, default=2, help="number of players")
......@@ -79,20 +81,16 @@ def main(args):
all_args = parse_args(args, parser)
if all_args.algorithm_name == "rmappo":
assert (
all_args.use_recurrent_policy or all_args.use_naive_recurrent_policy
), "check recurrent policy!"
assert all_args.use_recurrent_policy or all_args.use_naive_recurrent_policy, "check recurrent policy!"
elif all_args.algorithm_name == "mappo":
assert (
all_args.use_recurrent_policy == False
and all_args.use_naive_recurrent_policy == False
all_args.use_recurrent_policy == False and all_args.use_naive_recurrent_policy == False
), "check recurrent policy!"
else:
raise NotImplementedError
assert (
all_args.share_policy == True
and all_args.scenario_name == "simple_speaker_listener"
all_args.share_policy == True and all_args.scenario_name == "simple_speaker_listener"
) == False, "The simple_speaker_listener scenario can not use shared policy. Please check the config.py."
# cuda
......