Commit ea508a3a authored by hezhiqiang01's avatar hezhiqiang01

fix the error when share_policy = false

parent f2073aa3
*.pyc
results
.*
\ No newline at end of file
...@@ -3,7 +3,7 @@ import argparse ...@@ -3,7 +3,7 @@ import argparse
def get_config(): def get_config():
""" """
The configuration parser for common hyperparameters of all environments. The configuration parser for common hyperparameters of all environments.
Please refer to each `scripts/train/<env>_runner.py` file to find private hyperparameters Please refer to each `scripts/train/<env>_runner.py` file to find private hyperparameters
only used in <env>. only used in <env>.
...@@ -13,9 +13,9 @@ def get_config(): ...@@ -13,9 +13,9 @@ def get_config():
--experiment_name <str> --experiment_name <str>
an identifier to distinguish different experiment. an identifier to distinguish different experiment.
--seed <int> --seed <int>
set seed for numpy and torch set seed for numpy and torch
--cuda --cuda
by default True, will use GPU to train; or else will use CPU; by default True, will use GPU to train; or else will use CPU;
--cuda_deterministic --cuda_deterministic
by default, make sure random seed effective. if set, bypass such function. by default, make sure random seed effective. if set, bypass such function.
--n_training_threads <int> --n_training_threads <int>
...@@ -32,20 +32,20 @@ def get_config(): ...@@ -32,20 +32,20 @@ def get_config():
[for wandb usage], to specify user's name for simply collecting training data. [for wandb usage], to specify user's name for simply collecting training data.
--use_wandb --use_wandb
[for wandb usage], by default True, will log data to wandb server. or else will use tensorboard to log data. [for wandb usage], by default True, will log data to wandb server. or else will use tensorboard to log data.
Env parameters: Env parameters:
--env_name <str> --env_name <str>
specify the name of environment specify the name of environment
--use_obs_instead_of_state --use_obs_instead_of_state
[only for some env] by default False, will use global state; or else will use concatenated local obs. [only for some env] by default False, will use global state; or else will use concatenated local obs.
Replay Buffer parameters: Replay Buffer parameters:
--episode_length <int> --episode_length <int>
the max length of episode in the buffer. the max length of episode in the buffer.
Network parameters: Network parameters:
--share_policy --share_policy
by default True, all agents will share the same network; set to make training agents use different policies. by default True, all agents will share the same network; set to make training agents use different policies.
--use_centralized_V --use_centralized_V
by default True, use centralized training mode; or else will use decentralized training mode. by default True, use centralized training mode; or else will use decentralized training mode.
--stacked_frames <int> --stacked_frames <int>
...@@ -57,11 +57,11 @@ def get_config(): ...@@ -57,11 +57,11 @@ def get_config():
--use_ReLU --use_ReLU
by default True, will use ReLU. or else will use Tanh. by default True, will use ReLU. or else will use Tanh.
--use_popart --use_popart
by default False, use PopArt to normalize rewards. by default False, use PopArt to normalize rewards.
--use_valuenorm --use_valuenorm
by default True, use running mean and std to normalize rewards. by default True, use running mean and std to normalize rewards.
--use_feature_normalization --use_feature_normalization
by default True, apply layernorm to normalize inputs. by default True, apply layernorm to normalize inputs.
--use_orthogonal --use_orthogonal
by default True, use Orthogonal initialization for weights and 0 initialization for biases. or else, will use xavier uniform initialization. by default True, use Orthogonal initialization for weights and 0 initialization for biases. or else, will use xavier uniform initialization.
--gain --gain
...@@ -74,7 +74,7 @@ def get_config(): ...@@ -74,7 +74,7 @@ def get_config():
The number of recurrent layers ( default 1). The number of recurrent layers ( default 1).
--data_chunk_length <int> --data_chunk_length <int>
Time length of chunks used to train a recurrent_policy, default 10. Time length of chunks used to train a recurrent_policy, default 10.
Optimizer parameters: Optimizer parameters:
--lr <float> --lr <float>
learning rate parameter, (default: 5e-4, fixed). learning rate parameter, (default: 5e-4, fixed).
...@@ -84,11 +84,11 @@ def get_config(): ...@@ -84,11 +84,11 @@ def get_config():
RMSprop optimizer epsilon (default: 1e-5) RMSprop optimizer epsilon (default: 1e-5)
--weight_decay <float> --weight_decay <float>
coefficient of weight decay (default: 0) coefficient of weight decay (default: 0)
PPO parameters: PPO parameters:
--ppo_epoch <int> --ppo_epoch <int>
number of ppo epochs (default: 15) number of ppo epochs (default: 15)
--use_clipped_value_loss --use_clipped_value_loss
by default, clip loss value. If set, do not clip loss value. by default, clip loss value. If set, do not clip loss value.
--clip_param <float> --clip_param <float>
ppo clip parameter (default: 0.2) ppo clip parameter (default: 0.2)
...@@ -96,7 +96,7 @@ def get_config(): ...@@ -96,7 +96,7 @@ def get_config():
number of batches for ppo (default: 1) number of batches for ppo (default: 1)
--entropy_coef <float> --entropy_coef <float>
entropy term coefficient (default: 0.01) entropy term coefficient (default: 0.01)
--use_max_grad_norm --use_max_grad_norm
by default, use max norm of gradients. If set, do not use. by default, use max norm of gradients. If set, do not use.
--max_grad_norm <float> --max_grad_norm <float>
max norm of gradients (default: 0.5) max norm of gradients (default: 0.5)
...@@ -111,26 +111,26 @@ def get_config(): ...@@ -111,26 +111,26 @@ def get_config():
--use_huber_loss --use_huber_loss
by default, use huber loss. If set, do not use huber loss. by default, use huber loss. If set, do not use huber loss.
--use_value_active_masks --use_value_active_masks
by default True, whether to mask useless data in value loss. by default True, whether to mask useless data in value loss.
--huber_delta <float> --huber_delta <float>
coefficient of huber loss. coefficient of huber loss.
PPG parameters: PPG parameters:
--aux_epoch <int> --aux_epoch <int>
number of auxiliary epochs. (default: 4) number of auxiliary epochs. (default: 4)
--clone_coef <float> --clone_coef <float>
clone term coefficient (default: 0.01) clone term coefficient (default: 0.01)
Run parameters: Run parameters:
--use_linear_lr_decay --use_linear_lr_decay
by default, do not apply linear decay to learning rate. If set, use a linear schedule on the learning rate by default, do not apply linear decay to learning rate. If set, use a linear schedule on the learning rate
Save & Log parameters: Save & Log parameters:
--save_interval <int> --save_interval <int>
time duration between two consecutive model saves. time duration between two consecutive model saves.
--log_interval <int> --log_interval <int>
time duration between two consecutive log prints. time duration between two consecutive log prints.
Eval parameters: Eval parameters:
--use_eval --use_eval
by default, do not start evaluation. If set, start evaluation alongside training. by default, do not start evaluation. If set, start evaluation alongside training.
...@@ -138,7 +138,7 @@ def get_config(): ...@@ -138,7 +138,7 @@ def get_config():
time duration between two consecutive evaluations. time duration between two consecutive evaluations.
--eval_episodes <int> --eval_episodes <int>
number of episodes of a single evaluation. number of episodes of a single evaluation.
Render parameters: Render parameters:
--save_gifs --save_gifs
by default, do not save render video. If set, save video. by default, do not save render video. If set, save video.
...@@ -148,139 +148,350 @@ def get_config(): ...@@ -148,139 +148,350 @@ def get_config():
the number of episodes to render a given env the number of episodes to render a given env
--ifi <float> --ifi <float>
the play interval of each rendered image in saved video. the play interval of each rendered image in saved video.
Pretrained parameters: Pretrained parameters:
--model_dir <str> --model_dir <str>
by default None. set the path to pretrained model. by default None. set the path to pretrained model.
""" """
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description='onpolicy', formatter_class=argparse.RawDescriptionHelpFormatter) description="onpolicy", formatter_class=argparse.RawDescriptionHelpFormatter
)
# prepare parameters # prepare parameters
parser.add_argument("--algorithm_name", type=str, parser.add_argument("--algorithm_name", type=str, default="mappo", choices=["rmappo", "mappo"])
default='mappo', choices=["rmappo", "mappo"])
parser.add_argument("--experiment_name", type=str, default="check", help="an identifier to distinguish different experiment.") parser.add_argument(
"--experiment_name",
type=str,
default="check",
help="an identifier to distinguish different experiment.",
)
parser.add_argument("--seed", type=int, default=1, help="Random seed for numpy/torch") parser.add_argument("--seed", type=int, default=1, help="Random seed for numpy/torch")
parser.add_argument("--cuda", action='store_false', default=True, help="by default True, will use GPU to train; or else will use CPU;") parser.add_argument(
parser.add_argument("--cuda_deterministic", "--cuda",
action='store_false', default=True, help="by default, make sure random seed effective. if set, bypass such function.") action="store_false",
parser.add_argument("--n_training_threads", type=int, default=True,
default=1, help="Number of torch threads for training") help="by default True, will use GPU to train; or else will use CPU;",
parser.add_argument("--n_rollout_threads", type=int, default=5, )
help="Number of parallel envs for training rollouts") parser.add_argument(
parser.add_argument("--n_eval_rollout_threads", type=int, default=1, "--cuda_deterministic",
help="Number of parallel envs for evaluating rollouts") action="store_false",
parser.add_argument("--n_render_rollout_threads", type=int, default=1, default=True,
help="Number of parallel envs for rendering rollouts") help="by default, make sure random seed effective. if set, bypass such function.",
parser.add_argument("--num_env_steps", type=int, default=10e6, )
help='Number of environment steps to train (default: 10e6)') parser.add_argument(
parser.add_argument("--user_name", type=str, default='marl',help="[for wandb usage], to specify user's name for simply collecting training data.") "--n_training_threads",
type=int,
default=2,
help="Number of torch threads for training",
)
parser.add_argument(
"--n_rollout_threads",
type=int,
default=5,
help="Number of parallel envs for training rollouts",
)
parser.add_argument(
"--n_eval_rollout_threads",
type=int,
default=2,
help="Number of parallel envs for evaluating rollouts",
)
parser.add_argument(
"--n_render_rollout_threads",
type=int,
default=1,
help="Number of parallel envs for rendering rollouts",
)
parser.add_argument(
"--num_env_steps",
type=int,
default=10e6,
help="Number of environment steps to train (default: 10e6)",
)
parser.add_argument(
"--user_name",
type=str,
default="marl",
help="[for wandb usage], to specify user's name for simply collecting training data.",
)
# env parameters # env parameters
parser.add_argument("--env_name", type=str, default='MyEnv', help="specify the name of environment") parser.add_argument("--env_name", type=str, default="MyEnv", help="specify the name of environment")
parser.add_argument("--use_obs_instead_of_state", action='store_true', parser.add_argument(
default=False, help="Whether to use global state or concatenated obs") "--use_obs_instead_of_state",
action="store_true",
default=False,
help="Whether to use global state or concatenated obs",
)
# replay buffer parameters # replay buffer parameters
parser.add_argument("--episode_length", type=int, parser.add_argument("--episode_length", type=int, default=200, help="Max length for any episode")
default=200, help="Max length for any episode")
# network parameters # network parameters
parser.add_argument("--share_policy", action='store_false', parser.add_argument(
default=True, help='Whether agent share the same policy') "--share_policy",
parser.add_argument("--use_centralized_V", action='store_false', action="store_false",
default=True, help="Whether to use centralized V function") default=False,
parser.add_argument("--stacked_frames", type=int, default=1, help="Whether agent share the same policy",
help="Dimension of hidden layers for actor/critic networks") )
parser.add_argument("--use_stacked_frames", action='store_true', parser.add_argument(
default=False, help="Whether to use stacked_frames") "--use_centralized_V",
parser.add_argument("--hidden_size", type=int, default=64, action="store_false",
help="Dimension of hidden layers for actor/critic networks") default=True,
parser.add_argument("--layer_N", type=int, default=1, help="Whether to use centralized V function",
help="Number of layers for actor/critic networks") )
parser.add_argument("--use_ReLU", action='store_false', parser.add_argument(
default=True, help="Whether to use ReLU") "--stacked_frames",
parser.add_argument("--use_popart", action='store_true', default=False, help="by default False, use PopArt to normalize rewards.") type=int,
parser.add_argument("--use_valuenorm", action='store_false', default=True, help="by default True, use running mean and std to normalize rewards.") default=1,
parser.add_argument("--use_feature_normalization", action='store_false', help="Dimension of hidden layers for actor/critic networks",
default=True, help="Whether to apply layernorm to the inputs") )
parser.add_argument("--use_orthogonal", action='store_false', default=True, parser.add_argument(
help="Whether to use Orthogonal initialization for weights and 0 initialization for biases") "--use_stacked_frames",
parser.add_argument("--gain", type=float, default=0.01, action="store_true",
help="The gain # of last action layer") default=False,
help="Whether to use stacked_frames",
)
parser.add_argument(
"--hidden_size",
type=int,
default=64,
help="Dimension of hidden layers for actor/critic networks",
)
parser.add_argument(
"--layer_N",
type=int,
default=1,
help="Number of layers for actor/critic networks",
)
parser.add_argument("--use_ReLU", action="store_false", default=True, help="Whether to use ReLU")
parser.add_argument(
"--use_popart",
action="store_true",
default=False,
help="by default False, use PopArt to normalize rewards.",
)
parser.add_argument(
"--use_valuenorm",
action="store_false",
default=True,
help="by default True, use running mean and std to normalize rewards.",
)
parser.add_argument(
"--use_feature_normalization",
action="store_false",
default=True,
help="Whether to apply layernorm to the inputs",
)
parser.add_argument(
"--use_orthogonal",
action="store_false",
default=True,
help="Whether to use Orthogonal initialization for weights and 0 initialization for biases",
)
parser.add_argument("--gain", type=float, default=0.01, help="The gain # of last action layer")
# recurrent parameters # recurrent parameters
parser.add_argument("--use_naive_recurrent_policy", action='store_true', parser.add_argument(
default=False, help='Whether to use a naive recurrent policy') "--use_naive_recurrent_policy",
parser.add_argument("--use_recurrent_policy", action='store_false', action="store_true",
default=False, help='use a recurrent policy') default=False,
help="Whether to use a naive recurrent policy",
)
parser.add_argument(
"--use_recurrent_policy",
action="store_false",
default=False,
help="use a recurrent policy",
)
parser.add_argument("--recurrent_N", type=int, default=1, help="The number of recurrent layers.") parser.add_argument("--recurrent_N", type=int, default=1, help="The number of recurrent layers.")
parser.add_argument("--data_chunk_length", type=int, default=10, parser.add_argument(
help="Time length of chunks used to train a recurrent_policy") "--data_chunk_length",
type=int,
default=10,
help="Time length of chunks used to train a recurrent_policy",
)
# optimizer parameters # optimizer parameters
parser.add_argument("--lr", type=float, default=5e-4, parser.add_argument("--lr", type=float, default=5e-4, help="learning rate (default: 5e-4)")
help='learning rate (default: 5e-4)') parser.add_argument(
parser.add_argument("--critic_lr", type=float, default=5e-4, "--critic_lr",
help='critic learning rate (default: 5e-4)') type=float,
parser.add_argument("--opti_eps", type=float, default=1e-5, default=5e-4,
help='RMSprop optimizer epsilon (default: 1e-5)') help="critic learning rate (default: 5e-4)",
)
parser.add_argument(
"--opti_eps",
type=float,
default=1e-5,
help="RMSprop optimizer epsilon (default: 1e-5)",
)
parser.add_argument("--weight_decay", type=float, default=0) parser.add_argument("--weight_decay", type=float, default=0)
# ppo parameters # ppo parameters
parser.add_argument("--ppo_epoch", type=int, default=15, parser.add_argument("--ppo_epoch", type=int, default=15, help="number of ppo epochs (default: 15)")
help='number of ppo epochs (default: 15)') parser.add_argument(
parser.add_argument("--use_clipped_value_loss", "--use_clipped_value_loss",
action='store_false', default=True, help="by default, clip loss value. If set, do not clip loss value.") action="store_false",
parser.add_argument("--clip_param", type=float, default=0.2, default=True,
help='ppo clip parameter (default: 0.2)') help="by default, clip loss value. If set, do not clip loss value.",
parser.add_argument("--num_mini_batch", type=int, default=1, )
help='number of batches for ppo (default: 1)') parser.add_argument(
parser.add_argument("--entropy_coef", type=float, default=0.01, "--clip_param",
help='entropy term coefficient (default: 0.01)') type=float,
parser.add_argument("--value_loss_coef", type=float, default=0.2,
default=1, help='value loss coefficient (default: 0.5)') help="ppo clip parameter (default: 0.2)",
parser.add_argument("--use_max_grad_norm", )
action='store_false', default=True, help="by default, use max norm of gradients. If set, do not use.") parser.add_argument(
parser.add_argument("--max_grad_norm", type=float, default=10.0, "--num_mini_batch",
help='max norm of gradients (default: 0.5)') type=int,
parser.add_argument("--use_gae", action='store_false', default=1,
default=True, help='use generalized advantage estimation') help="number of batches for ppo (default: 1)",
parser.add_argument("--gamma", type=float, default=0.99, )
help='discount factor for rewards (default: 0.99)') parser.add_argument(
parser.add_argument("--gae_lambda", type=float, default=0.95, "--entropy_coef",
help='gae lambda parameter (default: 0.95)') type=float,
parser.add_argument("--use_proper_time_limits", action='store_true', default=0.01,
default=False, help='compute returns taking into account time limits') help="entropy term coefficient (default: 0.01)",
parser.add_argument("--use_huber_loss", action='store_false', default=True, help="by default, use huber loss. If set, do not use huber loss.") )
parser.add_argument("--use_value_active_masks", parser.add_argument(
action='store_false', default=True, help="by default True, whether to mask useless data in value loss.") "--value_loss_coef",
parser.add_argument("--use_policy_active_masks", type=float,
action='store_false', default=True, help="by default True, whether to mask useless data in policy loss.") default=1,
help="value loss coefficient (default: 0.5)",
)
parser.add_argument(
"--use_max_grad_norm",
action="store_false",
default=True,
help="by default, use max norm of gradients. If set, do not use.",
)
parser.add_argument(
"--max_grad_norm",
type=float,
default=10.0,
help="max norm of gradients (default: 0.5)",
)
parser.add_argument(
"--use_gae",
action="store_false",
default=True,
help="use generalized advantage estimation",
)
parser.add_argument(
"--gamma",
type=float,
default=0.99,
help="discount factor for rewards (default: 0.99)",
)
parser.add_argument(
"--gae_lambda",
type=float,
default=0.95,
help="gae lambda parameter (default: 0.95)",
)
parser.add_argument(
"--use_proper_time_limits",
action="store_true",
default=False,
help="compute returns taking into account time limits",
)
parser.add_argument(
"--use_huber_loss",
action="store_false",
default=True,
help="by default, use huber loss. If set, do not use huber loss.",
)
parser.add_argument(
"--use_value_active_masks",
action="store_false",
default=True,
help="by default True, whether to mask useless data in value loss.",
)
parser.add_argument(
"--use_policy_active_masks",
action="store_false",
default=True,
help="by default True, whether to mask useless data in policy loss.",
)
parser.add_argument("--huber_delta", type=float, default=10.0, help=" coefficience of huber loss.") parser.add_argument("--huber_delta", type=float, default=10.0, help=" coefficience of huber loss.")
# run parameters # run parameters
parser.add_argument("--use_linear_lr_decay", action='store_true', parser.add_argument(
default=False, help='use a linear schedule on the learning rate') "--use_linear_lr_decay",
action="store_true",
default=False,
help="use a linear schedule on the learning rate",
)
# save parameters # save parameters
parser.add_argument("--save_interval", type=int, default=1, help="time duration between contiunous twice models saving.") parser.add_argument(
"--save_interval",
type=int,
default=1,
help="time duration between contiunous twice models saving.",
)
# log parameters # log parameters
parser.add_argument("--log_interval", type=int, default=5, help="time duration between contiunous twice log printing.") parser.add_argument(
"--log_interval",
type=int,
default=5,
help="time duration between contiunous twice log printing.",
)
# eval parameters # eval parameters
parser.add_argument("--use_eval", action='store_true', default=False, help="by default, do not start evaluation. If set`, start evaluation alongside with training.") parser.add_argument(
parser.add_argument("--eval_interval", type=int, default=25, help="time duration between contiunous twice evaluation progress.") "--use_eval",
parser.add_argument("--eval_episodes", type=int, default=32, help="number of episodes of a single evaluation.") action="store_true",
default=False,
help="by default, do not start evaluation. If set`, start evaluation alongside with training.",
)
parser.add_argument(
"--eval_interval",
type=int,
default=25,
help="time duration between contiunous twice evaluation progress.",
)
parser.add_argument(
"--eval_episodes",
type=int,
default=32,
help="number of episodes of a single evaluation.",
)
# render parameters # render parameters
parser.add_argument("--save_gifs", action='store_true', default=False, help="by default, do not save render video. If set, save video.") parser.add_argument(
parser.add_argument("--use_render", action='store_true', default=False, help="by default, do not render the env during training. If set, start render. Note: something, the environment has internal render process which is not controlled by this hyperparam.") "--save_gifs",
parser.add_argument("--render_episodes", type=int, default=5, help="the number of episodes to render a given env") action="store_true",
parser.add_argument("--ifi", type=float, default=0.1, help="the play interval of each rendered image in saved video.") default=False,
help="by default, do not save render video. If set, save video.",
)
parser.add_argument(
"--use_render",
action="store_true",
default=False,
help="by default, do not render the env during training. If set, start render. Note: something, the environment has internal render process which is not controlled by this hyperparam.",
)
parser.add_argument(
"--render_episodes",
type=int,
default=5,
help="the number of episodes to render a given env",
)
parser.add_argument(
"--ifi",
type=float,
default=0.1,
help="the play interval of each rendered image in saved video.",
)
# pretrained parameters # pretrained parameters
parser.add_argument("--model_dir", type=str, default=None, help="by default None. set the path to pretrained model.") parser.add_argument(
"--model_dir",
type=str,
default=None,
help="by default None. set the path to pretrained model.",
)
return parser return parser
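For orientation, a minimal sketch of how the returned parser is typically consumed (the module name `config` is an assumption based on the usual repo layout):

```python
# Minimal usage sketch (assumes this file is importable as `config`).
from config import get_config

parser = get_config()
# Parse with a couple of overrides; any flag defined above can be passed the same way.
all_args = parser.parse_args(["--env_name", "MyEnv", "--seed", "3"])
print(all_args.algorithm_name, all_args.n_rollout_threads)  # mappo 5
```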
...@@ -36,30 +36,30 @@ class DiscreteActionEnv(object): ...@@ -36,30 +36,30 @@ class DiscreteActionEnv(object):
share_obs_dim = 0 share_obs_dim = 0
total_action_space = [] total_action_space = []
for agent in range(self.num_agent): for agent_idx in range(self.num_agent):
# physical action space # physical action space
u_action_space = spaces.Discrete(self.signal_action_dim)  # 5 discrete actions u_action_space = spaces.Discrete(self.signal_action_dim)  # 5 discrete actions
if self.movable: # if self.movable:
total_action_space.append(u_action_space) total_action_space.append(u_action_space)
# total action space # total action space
if len(total_action_space) > 1: # if len(total_action_space) > 1:
# all action spaces are discrete, so simplify to MultiDiscrete action space # # all action spaces are discrete, so simplify to MultiDiscrete action space
if all( # if all(
[ # [
isinstance(act_space, spaces.Discrete) # isinstance(act_space, spaces.Discrete)
for act_space in total_action_space # for act_space in total_action_space
] # ]
): # ):
act_space = MultiDiscrete( # act_space = MultiDiscrete(
[[0, act_space.n - 1] for act_space in total_action_space] # [[0, act_space.n - 1] for act_space in total_action_space]
) # )
else: # else:
act_space = spaces.Tuple(total_action_space) # act_space = spaces.Tuple(total_action_space)
self.action_space.append(act_space) # self.action_space.append(act_space)
else: # else:
self.action_space.append(total_action_space[0]) self.action_space.append(total_action_space[agent_idx])
# observation space # observation space
share_obs_dim += self.signal_obs_dim share_obs_dim += self.signal_obs_dim
...@@ -73,9 +73,7 @@ class DiscreteActionEnv(object): ...@@ -73,9 +73,7 @@ class DiscreteActionEnv(object):
) # [-inf,inf] ) # [-inf,inf]
self.share_observation_space = [ self.share_observation_space = [
spaces.Box( spaces.Box(low=-np.inf, high=+np.inf, shape=(share_obs_dim,), dtype=np.float32)
low=-np.inf, high=+np.inf, shape=(share_obs_dim,), dtype=np.float32
)
for _ in range(self.num_agent) for _ in range(self.num_agent)
] ]
...@@ -135,12 +133,7 @@ class MultiDiscrete: ...@@ -135,12 +133,7 @@ class MultiDiscrete:
"""Returns a array with one sample from each discrete action space""" """Returns a array with one sample from each discrete action space"""
# For each row: round(random .* (max - min) + min, 0) # For each row: round(random .* (max - min) + min, 0)
random_array = np.random.rand(self.num_discrete_space) random_array = np.random.rand(self.num_discrete_space)
return [ return [int(x) for x in np.floor(np.multiply((self.high - self.low + 1.0), random_array) + self.low)]
int(x)
for x in np.floor(
np.multiply((self.high - self.low + 1.0), random_array) + self.low
)
]
def contains(self, x): def contains(self, x):
return ( return (
...@@ -157,9 +150,7 @@ class MultiDiscrete: ...@@ -157,9 +150,7 @@ class MultiDiscrete:
return "MultiDiscrete" + str(self.num_discrete_space) return "MultiDiscrete" + str(self.num_discrete_space)
def __eq__(self, other): def __eq__(self, other):
return np.array_equal(self.low, other.low) and np.array_equal( return np.array_equal(self.low, other.low) and np.array_equal(self.high, other.high)
self.high, other.high
)
if __name__ == "__main__": if __name__ == "__main__":
......
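As a side note, the `sample()` formula kept above maps a uniform draw into each `[low, high]` range; a standalone sketch of the same arithmetic (toy bounds, not the class itself):

```python
import numpy as np

# Two discrete sub-spaces, each allowing values 0..4 (mirrors MultiDiscrete([[0, 4], [0, 4]])).
low = np.array([0.0, 0.0])
high = np.array([4.0, 4.0])

random_array = np.random.rand(2)  # uniform in [0, 1)
sample = [int(x) for x in np.floor((high - low + 1.0) * random_array + low)]
print(sample)  # e.g. [3, 1]; every entry lies in [0, 4]
```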
...@@ -46,7 +46,7 @@ class DummyVecEnv(): ...@@ -46,7 +46,7 @@ class DummyVecEnv():
return obs, rews, dones, infos return obs, rews, dones, infos
def reset(self): def reset(self):
obs = [env.reset() for env in self.envs] obs = [env.reset() for env in self.envs] # [env_num, agent_num, obs_dim]
return np.array(obs) return np.array(obs)
def close(self): def close(self):
......
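A small shape sketch of the `reset()` comment above: stacking the per-env lists of per-agent observations yields a `[env_num, agent_num, obs_dim]` array (sizes here are illustrative):

```python
import numpy as np

env_num, agent_num, obs_dim = 5, 2, 14  # illustrative sizes
# Each env.reset() returns one observation vector per agent.
obs = [[np.zeros(obs_dim) for _ in range(agent_num)] for _ in range(env_num)]
obs = np.array(obs)
print(obs.shape)  # (5, 2, 14) -> [env_num, agent_num, obs_dim]
```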
import time import time
import wandb
import os import os
import numpy as np import numpy as np
from itertools import chain from itertools import chain
...@@ -10,17 +8,18 @@ from tensorboardX import SummaryWriter ...@@ -10,17 +8,18 @@ from tensorboardX import SummaryWriter
from utils.separated_buffer import SeparatedReplayBuffer from utils.separated_buffer import SeparatedReplayBuffer
from utils.util import update_linear_schedule from utils.util import update_linear_schedule
def _t2n(x): def _t2n(x):
return x.detach().cpu().numpy() return x.detach().cpu().numpy()
class Runner(object): class Runner(object):
def __init__(self, config): def __init__(self, config):
self.all_args = config["all_args"]
self.all_args = config['all_args'] self.envs = config["envs"]
self.envs = config['envs'] self.eval_envs = config["eval_envs"]
self.eval_envs = config['eval_envs'] self.device = config["device"]
self.device = config['device'] self.num_agents = config["num_agents"]
self.num_agents = config['num_agents']
# parameters # parameters
self.env_name = self.all_args.env_name self.env_name = self.all_args.env_name
...@@ -34,7 +33,6 @@ class Runner(object): ...@@ -34,7 +33,6 @@ class Runner(object):
self.n_eval_rollout_threads = self.all_args.n_eval_rollout_threads self.n_eval_rollout_threads = self.all_args.n_eval_rollout_threads
self.use_linear_lr_decay = self.all_args.use_linear_lr_decay self.use_linear_lr_decay = self.all_args.use_linear_lr_decay
self.hidden_size = self.all_args.hidden_size self.hidden_size = self.all_args.hidden_size
self.use_wandb = self.all_args.use_wandb
self.use_render = self.all_args.use_render self.use_render = self.all_args.use_render
self.recurrent_N = self.all_args.recurrent_N self.recurrent_N = self.all_args.recurrent_N
...@@ -49,37 +47,42 @@ class Runner(object): ...@@ -49,37 +47,42 @@ class Runner(object):
if self.use_render: if self.use_render:
import imageio import imageio
self.run_dir = config["run_dir"] self.run_dir = config["run_dir"]
self.gif_dir = str(self.run_dir / 'gifs') self.gif_dir = str(self.run_dir / "gifs")
if not os.path.exists(self.gif_dir): if not os.path.exists(self.gif_dir):
os.makedirs(self.gif_dir) os.makedirs(self.gif_dir)
else: else:
if self.use_wandb: # if self.use_wandb:
self.save_dir = str(wandb.run.dir) # self.save_dir = str(wandb.run.dir)
else: # else:
self.run_dir = config["run_dir"] self.run_dir = config["run_dir"]
self.log_dir = str(self.run_dir / 'logs') self.log_dir = str(self.run_dir / "logs")
if not os.path.exists(self.log_dir): if not os.path.exists(self.log_dir):
os.makedirs(self.log_dir) os.makedirs(self.log_dir)
self.writter = SummaryWriter(self.log_dir) self.writter = SummaryWriter(self.log_dir)
self.save_dir = str(self.run_dir / 'models') self.save_dir = str(self.run_dir / "models")
if not os.path.exists(self.save_dir): if not os.path.exists(self.save_dir):
os.makedirs(self.save_dir) os.makedirs(self.save_dir)
from algorithms.algorithm.r_mappo import RMAPPO as TrainAlgo from algorithms.algorithm.r_mappo import RMAPPO as TrainAlgo
from algorithms.algorithm.rMAPPOPolicy import RMAPPOPolicy as Policy from algorithms.algorithm.rMAPPOPolicy import RMAPPOPolicy as Policy
self.policy = [] self.policy = []
for agent_id in range(self.num_agents): for agent_id in range(self.num_agents):
share_observation_space = self.envs.share_observation_space[agent_id] if self.use_centralized_V else self.envs.observation_space[agent_id] share_observation_space = (
self.envs.share_observation_space[agent_id]
if self.use_centralized_V
else self.envs.observation_space[agent_id]
)
# policy network # policy network
po = Policy(self.all_args, po = Policy(
self.envs.observation_space[agent_id], self.all_args,
share_observation_space, self.envs.observation_space[agent_id],
self.envs.action_space[agent_id], share_observation_space,
device = self.device) self.envs.action_space[agent_id],
device=self.device,
)
self.policy.append(po) self.policy.append(po)
if self.model_dir is not None: if self.model_dir is not None:
...@@ -89,16 +92,22 @@ class Runner(object): ...@@ -89,16 +92,22 @@ class Runner(object):
self.buffer = [] self.buffer = []
for agent_id in range(self.num_agents): for agent_id in range(self.num_agents):
# algorithm # algorithm
tr = TrainAlgo(self.all_args, self.policy[agent_id], device = self.device) tr = TrainAlgo(self.all_args, self.policy[agent_id], device=self.device)
# buffer # buffer
share_observation_space = self.envs.share_observation_space[agent_id] if self.use_centralized_V else self.envs.observation_space[agent_id] share_observation_space = (
bu = SeparatedReplayBuffer(self.all_args, self.envs.share_observation_space[agent_id]
self.envs.observation_space[agent_id], if self.use_centralized_V
share_observation_space, else self.envs.observation_space[agent_id]
self.envs.action_space[agent_id]) )
bu = SeparatedReplayBuffer(
self.all_args,
self.envs.observation_space[agent_id],
share_observation_space,
self.envs.action_space[agent_id],
)
self.buffer.append(bu) self.buffer.append(bu)
self.trainer.append(tr) self.trainer.append(tr)
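With `share_policy` disabled, each agent keeps its own policy, trainer, and buffer; a toy, runnable sketch of that layout (stand-in classes, not the real RMAPPO objects):

```python
# Toy stand-ins to show the separated-policy bookkeeping: parallel lists indexed by agent_id.
class ToyPolicy:
    def __init__(self, agent_id):
        self.agent_id = agent_id

class ToyTrainer:
    def __init__(self, policy):
        self.policy = policy

num_agents = 2
policies = [ToyPolicy(i) for i in range(num_agents)]
trainers = [ToyTrainer(p) for p in policies]
buffers = [[] for _ in range(num_agents)]  # one replay buffer per agent

# Training code then always indexes the three lists with the same agent_id.
assert trainers[1].policy is policies[1]
```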
def run(self): def run(self):
raise NotImplementedError raise NotImplementedError
...@@ -110,14 +119,16 @@ class Runner(object): ...@@ -110,14 +119,16 @@ class Runner(object):
def insert(self, data): def insert(self, data):
raise NotImplementedError raise NotImplementedError
@torch.no_grad() @torch.no_grad()
def compute(self): def compute(self):
for agent_id in range(self.num_agents): for agent_id in range(self.num_agents):
self.trainer[agent_id].prep_rollout() self.trainer[agent_id].prep_rollout()
next_value = self.trainer[agent_id].policy.get_values(self.buffer[agent_id].share_obs[-1], next_value = self.trainer[agent_id].policy.get_values(
self.buffer[agent_id].rnn_states_critic[-1], self.buffer[agent_id].share_obs[-1],
self.buffer[agent_id].masks[-1]) self.buffer[agent_id].rnn_states_critic[-1],
self.buffer[agent_id].masks[-1],
)
next_value = _t2n(next_value) next_value = _t2n(next_value)
self.buffer[agent_id].compute_returns(next_value, self.trainer[agent_id].value_normalizer) self.buffer[agent_id].compute_returns(next_value, self.trainer[agent_id].value_normalizer)
...@@ -126,7 +137,7 @@ class Runner(object): ...@@ -126,7 +137,7 @@ class Runner(object):
for agent_id in range(self.num_agents): for agent_id in range(self.num_agents):
self.trainer[agent_id].prep_training() self.trainer[agent_id].prep_training()
train_info = self.trainer[agent_id].train(self.buffer[agent_id]) train_info = self.trainer[agent_id].train(self.buffer[agent_id])
train_infos.append(train_info) train_infos.append(train_info)
self.buffer[agent_id].after_update() self.buffer[agent_id].after_update()
return train_infos return train_infos
...@@ -134,30 +145,39 @@ class Runner(object): ...@@ -134,30 +145,39 @@ class Runner(object):
def save(self): def save(self):
for agent_id in range(self.num_agents): for agent_id in range(self.num_agents):
policy_actor = self.trainer[agent_id].policy.actor policy_actor = self.trainer[agent_id].policy.actor
torch.save(policy_actor.state_dict(), str(self.save_dir) + "/actor_agent" + str(agent_id) + ".pt") torch.save(
policy_actor.state_dict(),
str(self.save_dir) + "/actor_agent" + str(agent_id) + ".pt",
)
policy_critic = self.trainer[agent_id].policy.critic policy_critic = self.trainer[agent_id].policy.critic
torch.save(policy_critic.state_dict(), str(self.save_dir) + "/critic_agent" + str(agent_id) + ".pt") torch.save(
policy_critic.state_dict(),
str(self.save_dir) + "/critic_agent" + str(agent_id) + ".pt",
)
def restore(self): def restore(self):
for agent_id in range(self.num_agents): for agent_id in range(self.num_agents):
policy_actor_state_dict = torch.load(str(self.model_dir) + '/actor_agent' + str(agent_id) + '.pt') policy_actor_state_dict = torch.load(str(self.model_dir) + "/actor_agent" + str(agent_id) + ".pt")
self.policy[agent_id].actor.load_state_dict(policy_actor_state_dict) self.policy[agent_id].actor.load_state_dict(policy_actor_state_dict)
policy_critic_state_dict = torch.load(str(self.model_dir) + '/critic_agent' + str(agent_id) + '.pt') policy_critic_state_dict = torch.load(
str(self.model_dir) + "/critic_agent" + str(agent_id) + ".pt"
)
self.policy[agent_id].critic.load_state_dict(policy_critic_state_dict) self.policy[agent_id].critic.load_state_dict(policy_critic_state_dict)
def log_train(self, train_infos, total_num_steps): def log_train(self, train_infos, total_num_steps):
for agent_id in range(self.num_agents): for agent_id in range(self.num_agents):
for k, v in train_infos[agent_id].items(): for k, v in train_infos[agent_id].items():
agent_k = "agent%i/" % agent_id + k agent_k = "agent%i/" % agent_id + k
if self.use_wandb: # if self.use_wandb:
wandb.log({agent_k: v}, step=total_num_steps) # pass
else: # wandb.log({agent_k: v}, step=total_num_steps)
self.writter.add_scalars(agent_k, {agent_k: v}, total_num_steps) # else:
self.writter.add_scalars(agent_k, {agent_k: v}, total_num_steps)
def log_env(self, env_infos, total_num_steps): def log_env(self, env_infos, total_num_steps):
for k, v in env_infos.items(): for k, v in env_infos.items():
if len(v) > 0: if len(v) > 0:
if self.use_wandb: # if self.use_wandb:
wandb.log({k: np.mean(v)}, step=total_num_steps) # wandb.log({k: np.mean(v)}, step=total_num_steps)
else: # else:
self.writter.add_scalars(k, {k: np.mean(v)}, total_num_steps) self.writter.add_scalars(k, {k: np.mean(v)}, total_num_steps)
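With the wandb branch commented out, every metric goes through tensorboardX; a minimal standalone sketch of the same logging pattern (log directory and numbers are illustrative):

```python
from tensorboardX import SummaryWriter

writter = SummaryWriter("/tmp/example_logs")                # illustrative log dir
train_infos = [{"value_loss": 0.12}, {"value_loss": 0.08}]  # one dict per agent, toy values
total_num_steps = 1000

for agent_id, info in enumerate(train_infos):
    for k, v in info.items():
        agent_k = "agent%i/" % agent_id + k                 # e.g. "agent0/value_loss"
        writter.add_scalars(agent_k, {agent_k: v}, total_num_steps)
writter.close()
```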
"""
# @Time : 2021/7/1 7:14 PM
# @Author : hezhiqiang01
# @Email : hezhiqiang01@baidu.com
# @File : env_runner.py
"""
import time import time
import wandb
import os import os
import numpy as np import numpy as np
from itertools import chain from itertools import chain
...@@ -14,7 +6,6 @@ import torch ...@@ -14,7 +6,6 @@ import torch
from utils.util import update_linear_schedule from utils.util import update_linear_schedule
from runner.separated.base_runner import Runner from runner.separated.base_runner import Runner
import imageio
def _t2n(x): def _t2n(x):
...@@ -29,9 +20,7 @@ class EnvRunner(Runner): ...@@ -29,9 +20,7 @@ class EnvRunner(Runner):
self.warmup() self.warmup()
start = time.time() start = time.time()
episodes = ( episodes = int(self.num_env_steps) // self.episode_length // self.n_rollout_threads
int(self.num_env_steps) // self.episode_length // self.n_rollout_threads
)
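A quick check of the episode count computed above, using the defaults from config.py (10e6 env steps, episode_length 200, 5 rollout threads):

```python
num_env_steps, episode_length, n_rollout_threads = 10e6, 200, 5
episodes = int(num_env_steps) // episode_length // n_rollout_threads
print(episodes)  # 10000 iterations of the outer training loop
```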
for episode in range(episodes): for episode in range(episodes):
if self.use_linear_lr_decay: if self.use_linear_lr_decay:
...@@ -72,9 +61,7 @@ class EnvRunner(Runner): ...@@ -72,9 +61,7 @@ class EnvRunner(Runner):
train_infos = self.train() train_infos = self.train()
# post process # post process
total_num_steps = ( total_num_steps = (episode + 1) * self.episode_length * self.n_rollout_threads
(episode + 1) * self.episode_length * self.n_rollout_threads
)
# save model # save model
if episode % self.save_interval == 0 or episode == episodes - 1: if episode % self.save_interval == 0 or episode == episodes - 1:
...@@ -102,14 +89,10 @@ class EnvRunner(Runner): ...@@ -102,14 +89,10 @@ class EnvRunner(Runner):
for info in infos: for info in infos:
if "individual_reward" in info[agent_id].keys(): if "individual_reward" in info[agent_id].keys():
idv_rews.append(info[agent_id]["individual_reward"]) idv_rews.append(info[agent_id]["individual_reward"])
train_infos[agent_id].update( train_infos[agent_id].update({"individual_rewards": np.mean(idv_rews)})
{"individual_rewards": np.mean(idv_rews)}
)
train_infos[agent_id].update( train_infos[agent_id].update(
{ {
"average_episode_rewards": np.mean( "average_episode_rewards": np.mean(self.buffer[agent_id].rewards)
self.buffer[agent_id].rewards
)
* self.episode_length * self.episode_length
} }
) )
...@@ -121,12 +104,12 @@ class EnvRunner(Runner): ...@@ -121,12 +104,12 @@ class EnvRunner(Runner):
def warmup(self): def warmup(self):
# reset env # reset env
obs = self.envs.reset() obs = self.envs.reset() # shape = [env_num, agent_num, obs_dim]
share_obs = [] share_obs = []
for o in obs: for o in obs:
share_obs.append(list(chain(*o))) share_obs.append(list(chain(*o)))
share_obs = np.array(share_obs) share_obs = np.array(share_obs) # shape = [env_num, agent_num * obs_dim]
for agent_id in range(self.num_agents): for agent_id in range(self.num_agents):
if not self.use_centralized_V: if not self.use_centralized_V:
...@@ -160,21 +143,17 @@ class EnvRunner(Runner): ...@@ -160,21 +143,17 @@ class EnvRunner(Runner):
# rearrange action # rearrange action
if self.envs.action_space[agent_id].__class__.__name__ == "MultiDiscrete": if self.envs.action_space[agent_id].__class__.__name__ == "MultiDiscrete":
for i in range(self.envs.action_space[agent_id].shape): for i in range(self.envs.action_space[agent_id].shape):
uc_action_env = np.eye( uc_action_env = np.eye(self.envs.action_space[agent_id].high[i] + 1)[action[:, i]]
self.envs.action_space[agent_id].high[i] + 1
)[action[:, i]]
if i == 0: if i == 0:
action_env = uc_action_env action_env = uc_action_env
else: else:
action_env = np.concatenate((action_env, uc_action_env), axis=1) action_env = np.concatenate((action_env, uc_action_env), axis=1)
elif self.envs.action_space[agent_id].__class__.__name__ == "Discrete": elif self.envs.action_space[agent_id].__class__.__name__ == "Discrete":
action_env = np.squeeze( action_env = np.squeeze(np.eye(self.envs.action_space[agent_id].n)[action], 1)
np.eye(self.envs.action_space[agent_id].n)[action], 1
)
else: else:
# TODO Here, change action_env to the form your own environment needs # TODO Here, change action_env to the form your own environment needs
action_env = actions action_env = action
# raise NotImplementedError # raise NotImplementedError
actions.append(action) actions.append(action)
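The Discrete branch above builds one-hot environment actions via an identity-matrix lookup; a standalone sketch of that trick with toy sizes:

```python
import numpy as np

n = 5                               # size of the Discrete action space
action = np.array([[2], [0], [4]])  # integer actions from 3 rollout threads, shape (3, 1)

# Indexing the identity matrix turns each integer into a one-hot row: shape (3, 1, 5);
# squeezing the middle axis gives one one-hot vector per thread.
action_env = np.squeeze(np.eye(n)[action], 1)
print(action_env.shape)             # (3, 5)
print(action_env[0])                # [0. 0. 1. 0. 0.]
```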
...@@ -265,9 +244,7 @@ class EnvRunner(Runner): ...@@ -265,9 +244,7 @@ class EnvRunner(Runner):
), ),
dtype=np.float32, dtype=np.float32,
) )
eval_masks = np.ones( eval_masks = np.ones((self.n_eval_rollout_threads, self.num_agents, 1), dtype=np.float32)
(self.n_eval_rollout_threads, self.num_agents, 1), dtype=np.float32
)
for eval_step in range(self.episode_length): for eval_step in range(self.episode_length):
eval_temp_actions_env = [] eval_temp_actions_env = []
...@@ -282,24 +259,16 @@ class EnvRunner(Runner): ...@@ -282,24 +259,16 @@ class EnvRunner(Runner):
eval_action = eval_action.detach().cpu().numpy() eval_action = eval_action.detach().cpu().numpy()
# rearrange action # rearrange action
if ( if self.eval_envs.action_space[agent_id].__class__.__name__ == "MultiDiscrete":
self.eval_envs.action_space[agent_id].__class__.__name__
== "MultiDiscrete"
):
for i in range(self.eval_envs.action_space[agent_id].shape): for i in range(self.eval_envs.action_space[agent_id].shape):
eval_uc_action_env = np.eye( eval_uc_action_env = np.eye(self.eval_envs.action_space[agent_id].high[i] + 1)[
self.eval_envs.action_space[agent_id].high[i] + 1 eval_action[:, i]
)[eval_action[:, i]] ]
if i == 0: if i == 0:
eval_action_env = eval_uc_action_env eval_action_env = eval_uc_action_env
else: else:
eval_action_env = np.concatenate( eval_action_env = np.concatenate((eval_action_env, eval_uc_action_env), axis=1)
(eval_action_env, eval_uc_action_env), axis=1 elif self.eval_envs.action_space[agent_id].__class__.__name__ == "Discrete":
)
elif (
self.eval_envs.action_space[agent_id].__class__.__name__
== "Discrete"
):
eval_action_env = np.squeeze( eval_action_env = np.squeeze(
np.eye(self.eval_envs.action_space[agent_id].n)[eval_action], 1 np.eye(self.eval_envs.action_space[agent_id].n)[eval_action], 1
) )
...@@ -318,36 +287,23 @@ class EnvRunner(Runner): ...@@ -318,36 +287,23 @@ class EnvRunner(Runner):
eval_actions_env.append(eval_one_hot_action_env) eval_actions_env.append(eval_one_hot_action_env)
# Observe reward and next obs # Observe reward and next obs
eval_obs, eval_rewards, eval_dones, eval_infos = self.eval_envs.step( eval_obs, eval_rewards, eval_dones, eval_infos = self.eval_envs.step(eval_actions_env)
eval_actions_env
)
eval_episode_rewards.append(eval_rewards) eval_episode_rewards.append(eval_rewards)
eval_rnn_states[eval_dones == True] = np.zeros( eval_rnn_states[eval_dones == True] = np.zeros(
((eval_dones == True).sum(), self.recurrent_N, self.hidden_size), ((eval_dones == True).sum(), self.recurrent_N, self.hidden_size),
dtype=np.float32, dtype=np.float32,
) )
eval_masks = np.ones( eval_masks = np.ones((self.n_eval_rollout_threads, self.num_agents, 1), dtype=np.float32)
(self.n_eval_rollout_threads, self.num_agents, 1), dtype=np.float32 eval_masks[eval_dones == True] = np.zeros(((eval_dones == True).sum(), 1), dtype=np.float32)
)
eval_masks[eval_dones == True] = np.zeros(
((eval_dones == True).sum(), 1), dtype=np.float32
)
eval_episode_rewards = np.array(eval_episode_rewards) eval_episode_rewards = np.array(eval_episode_rewards)
eval_train_infos = [] eval_train_infos = []
for agent_id in range(self.num_agents): for agent_id in range(self.num_agents):
eval_average_episode_rewards = np.mean( eval_average_episode_rewards = np.mean(np.sum(eval_episode_rewards[:, :, agent_id], axis=0))
np.sum(eval_episode_rewards[:, :, agent_id], axis=0) eval_train_infos.append({"eval_average_episode_rewards": eval_average_episode_rewards})
) print("eval average episode rewards of agent%i: " % agent_id + str(eval_average_episode_rewards))
eval_train_infos.append(
{"eval_average_episode_rewards": eval_average_episode_rewards}
)
print(
"eval average episode rewards of agent%i: " % agent_id
+ str(eval_average_episode_rewards)
)
self.log_train(eval_train_infos, total_num_steps) self.log_train(eval_train_infos, total_num_steps)
...@@ -370,9 +326,7 @@ class EnvRunner(Runner): ...@@ -370,9 +326,7 @@ class EnvRunner(Runner):
), ),
dtype=np.float32, dtype=np.float32,
) )
masks = np.ones( masks = np.ones((self.n_rollout_threads, self.num_agents, 1), dtype=np.float32)
(self.n_rollout_threads, self.num_agents, 1), dtype=np.float32
)
for step in range(self.episode_length): for step in range(self.episode_length):
calc_start = time.time() calc_start = time.time()
...@@ -391,27 +345,15 @@ class EnvRunner(Runner): ...@@ -391,27 +345,15 @@ class EnvRunner(Runner):
action = action.detach().cpu().numpy() action = action.detach().cpu().numpy()
# rearrange action # rearrange action
if ( if self.envs.action_space[agent_id].__class__.__name__ == "MultiDiscrete":
self.envs.action_space[agent_id].__class__.__name__
== "MultiDiscrete"
):
for i in range(self.envs.action_space[agent_id].shape): for i in range(self.envs.action_space[agent_id].shape):
uc_action_env = np.eye( uc_action_env = np.eye(self.envs.action_space[agent_id].high[i] + 1)[action[:, i]]
self.envs.action_space[agent_id].high[i] + 1
)[action[:, i]]
if i == 0: if i == 0:
action_env = uc_action_env action_env = uc_action_env
else: else:
action_env = np.concatenate( action_env = np.concatenate((action_env, uc_action_env), axis=1)
(action_env, uc_action_env), axis=1 elif self.envs.action_space[agent_id].__class__.__name__ == "Discrete":
) action_env = np.squeeze(np.eye(self.envs.action_space[agent_id].n)[action], 1)
elif (
self.envs.action_space[agent_id].__class__.__name__
== "Discrete"
):
action_env = np.squeeze(
np.eye(self.envs.action_space[agent_id].n)[action], 1
)
else: else:
raise NotImplementedError raise NotImplementedError
...@@ -434,12 +376,8 @@ class EnvRunner(Runner): ...@@ -434,12 +376,8 @@ class EnvRunner(Runner):
((dones == True).sum(), self.recurrent_N, self.hidden_size), ((dones == True).sum(), self.recurrent_N, self.hidden_size),
dtype=np.float32, dtype=np.float32,
) )
masks = np.ones( masks = np.ones((self.n_rollout_threads, self.num_agents, 1), dtype=np.float32)
(self.n_rollout_threads, self.num_agents, 1), dtype=np.float32 masks[dones == True] = np.zeros(((dones == True).sum(), 1), dtype=np.float32)
)
masks[dones == True] = np.zeros(
((dones == True).sum(), 1), dtype=np.float32
)
if self.all_args.save_gifs: if self.all_args.save_gifs:
image = self.envs.render("rgb_array")[0][0] image = self.envs.render("rgb_array")[0][0]
...@@ -451,13 +389,8 @@ class EnvRunner(Runner): ...@@ -451,13 +389,8 @@ class EnvRunner(Runner):
episode_rewards = np.array(episode_rewards) episode_rewards = np.array(episode_rewards)
for agent_id in range(self.num_agents): for agent_id in range(self.num_agents):
average_episode_rewards = np.mean( average_episode_rewards = np.mean(np.sum(episode_rewards[:, :, agent_id], axis=0))
np.sum(episode_rewards[:, :, agent_id], axis=0) print("eval average episode rewards of agent%i: " % agent_id + str(average_episode_rewards))
)
print(
"eval average episode rewards of agent%i: " % agent_id
+ str(average_episode_rewards)
)
if self.all_args.save_gifs: if self.all_args.save_gifs:
imageio.mimsave( imageio.mimsave(
......
...@@ -27,9 +27,7 @@ class EnvRunner(Runner): ...@@ -27,9 +27,7 @@ class EnvRunner(Runner):
self.warmup() self.warmup()
start = time.time() start = time.time()
episodes = ( episodes = int(self.num_env_steps) // self.episode_length // self.n_rollout_threads
int(self.num_env_steps) // self.episode_length // self.n_rollout_threads
)
for episode in range(episodes): for episode in range(episodes):
if self.use_linear_lr_decay: if self.use_linear_lr_decay:
...@@ -69,9 +67,7 @@ class EnvRunner(Runner): ...@@ -69,9 +67,7 @@ class EnvRunner(Runner):
train_infos = self.train() train_infos = self.train()
# post process # post process
total_num_steps = ( total_num_steps = (episode + 1) * self.episode_length * self.n_rollout_threads
(episode + 1) * self.episode_length * self.n_rollout_threads
)
# save model # save model
if episode % self.save_interval == 0 or episode == episodes - 1: if episode % self.save_interval == 0 or episode == episodes - 1:
...@@ -103,14 +99,8 @@ class EnvRunner(Runner): ...@@ -103,14 +99,8 @@ class EnvRunner(Runner):
# agent_k = 'agent%i/individual_rewards' % agent_id # agent_k = 'agent%i/individual_rewards' % agent_id
# env_infos[agent_k] = idv_rews # env_infos[agent_k] = idv_rews
train_infos["average_episode_rewards"] = ( train_infos["average_episode_rewards"] = np.mean(self.buffer.rewards) * self.episode_length
np.mean(self.buffer.rewards) * self.episode_length print("average episode rewards is {}".format(train_infos["average_episode_rewards"]))
)
print(
"average episode rewards is {}".format(
train_infos["average_episode_rewards"]
)
)
self.log_train(train_infos, total_num_steps) self.log_train(train_infos, total_num_steps)
# self.log_env(env_infos, total_num_steps) # self.log_env(env_infos, total_num_steps)
...@@ -120,14 +110,14 @@ class EnvRunner(Runner): ...@@ -120,14 +110,14 @@ class EnvRunner(Runner):
def warmup(self): def warmup(self):
# reset env # reset env
obs = self.envs.reset() # shape = (5, 2, 14) obs = self.envs.reset() # shape = [env_num, agent_num, obs_dim]
# replay buffer # replay buffer
if self.use_centralized_V: if self.use_centralized_V:
share_obs = obs.reshape(self.n_rollout_threads, -1) # shape = (5, 28) share_obs = obs.reshape(self.n_rollout_threads, -1) # shape = [env_num, agent_num * obs_dim]
share_obs = np.expand_dims(share_obs, 1).repeat( share_obs = np.expand_dims(share_obs, 1).repeat(
self.num_agents, axis=1 self.num_agents, axis=1
) # shape = (5, 2, 28) ) # shape = [env_num, agent_num, agent_num * obs_dim]
else: else:
share_obs = obs share_obs = obs
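A shape-only sketch of the centralized-V branch above: per-env observations are concatenated across agents and then repeated so every agent receives the same joint observation (sizes are illustrative):

```python
import numpy as np

n_rollout_threads, num_agents, obs_dim = 5, 2, 14  # illustrative sizes
obs = np.zeros((n_rollout_threads, num_agents, obs_dim))

share_obs = obs.reshape(n_rollout_threads, -1)     # (5, 28): agents' obs concatenated per env
share_obs = np.expand_dims(share_obs, 1).repeat(num_agents, axis=1)
print(share_obs.shape)                             # (5, 2, 28)
```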
...@@ -151,21 +141,21 @@ class EnvRunner(Runner): ...@@ -151,21 +141,21 @@ class EnvRunner(Runner):
np.concatenate(self.buffer.masks[step]), np.concatenate(self.buffer.masks[step]),
) )
# [self.envs, agents, dim] # [self.envs, agents, dim]
values = np.array(np.split(_t2n(value), self.n_rollout_threads)) values = np.array(np.split(_t2n(value), self.n_rollout_threads)) # [env_num, agent_num, 1]
actions = np.array(np.split(_t2n(action), self.n_rollout_threads)) actions = np.array(np.split(_t2n(action), self.n_rollout_threads)) # [env_num, agent_num, action_dim]
action_log_probs = np.array( action_log_probs = np.array(
np.split(_t2n(action_log_prob), self.n_rollout_threads) np.split(_t2n(action_log_prob), self.n_rollout_threads)
) ) # [env_num, agent_num, 1]
rnn_states = np.array(np.split(_t2n(rnn_states), self.n_rollout_threads)) rnn_states = np.array(
np.split(_t2n(rnn_states), self.n_rollout_threads)
) # [env_num, agent_num, 1, hidden_size]
rnn_states_critic = np.array( rnn_states_critic = np.array(
np.split(_t2n(rnn_states_critic), self.n_rollout_threads) np.split(_t2n(rnn_states_critic), self.n_rollout_threads)
) ) # [env_num, agent_num, 1, hidden_size]
# rearrange action # rearrange action
if self.envs.action_space[0].__class__.__name__ == "MultiDiscrete": if self.envs.action_space[0].__class__.__name__ == "MultiDiscrete":
for i in range(self.envs.action_space[0].shape): for i in range(self.envs.action_space[0].shape):
uc_actions_env = np.eye(self.envs.action_space[0].high[i] + 1)[ uc_actions_env = np.eye(self.envs.action_space[0].high[i] + 1)[actions[:, :, i]]
actions[:, :, i]
]
if i == 0: if i == 0:
actions_env = uc_actions_env actions_env = uc_actions_env
else: else:
...@@ -239,9 +229,7 @@ class EnvRunner(Runner): ...@@ -239,9 +229,7 @@ class EnvRunner(Runner):
(self.n_eval_rollout_threads, *self.buffer.rnn_states.shape[2:]), (self.n_eval_rollout_threads, *self.buffer.rnn_states.shape[2:]),
dtype=np.float32, dtype=np.float32,
) )
eval_masks = np.ones( eval_masks = np.ones((self.n_eval_rollout_threads, self.num_agents, 1), dtype=np.float32)
(self.n_eval_rollout_threads, self.num_agents, 1), dtype=np.float32
)
for eval_step in range(self.episode_length): for eval_step in range(self.episode_length):
self.trainer.prep_rollout() self.trainer.prep_rollout()
...@@ -251,60 +239,39 @@ class EnvRunner(Runner): ...@@ -251,60 +239,39 @@ class EnvRunner(Runner):
np.concatenate(eval_masks), np.concatenate(eval_masks),
deterministic=True, deterministic=True,
) )
eval_actions = np.array( eval_actions = np.array(np.split(_t2n(eval_action), self.n_eval_rollout_threads))
np.split(_t2n(eval_action), self.n_eval_rollout_threads) eval_rnn_states = np.array(np.split(_t2n(eval_rnn_states), self.n_eval_rollout_threads))
)
eval_rnn_states = np.array(
np.split(_t2n(eval_rnn_states), self.n_eval_rollout_threads)
)
if self.eval_envs.action_space[0].__class__.__name__ == "MultiDiscrete": if self.eval_envs.action_space[0].__class__.__name__ == "MultiDiscrete":
for i in range(self.eval_envs.action_space[0].shape): for i in range(self.eval_envs.action_space[0].shape):
eval_uc_actions_env = np.eye( eval_uc_actions_env = np.eye(self.eval_envs.action_space[0].high[i] + 1)[
self.eval_envs.action_space[0].high[i] + 1 eval_actions[:, :, i]
)[eval_actions[:, :, i]] ]
if i == 0: if i == 0:
eval_actions_env = eval_uc_actions_env eval_actions_env = eval_uc_actions_env
else: else:
eval_actions_env = np.concatenate( eval_actions_env = np.concatenate((eval_actions_env, eval_uc_actions_env), axis=2)
(eval_actions_env, eval_uc_actions_env), axis=2
)
elif self.eval_envs.action_space[0].__class__.__name__ == "Discrete": elif self.eval_envs.action_space[0].__class__.__name__ == "Discrete":
eval_actions_env = np.squeeze( eval_actions_env = np.squeeze(np.eye(self.eval_envs.action_space[0].n)[eval_actions], 2)
np.eye(self.eval_envs.action_space[0].n)[eval_actions], 2
)
else: else:
raise NotImplementedError raise NotImplementedError
# Observe reward and next obs # Observe reward and next obs
eval_obs, eval_rewards, eval_dones, eval_infos = self.eval_envs.step( eval_obs, eval_rewards, eval_dones, eval_infos = self.eval_envs.step(eval_actions_env)
eval_actions_env
)
eval_episode_rewards.append(eval_rewards) eval_episode_rewards.append(eval_rewards)
eval_rnn_states[eval_dones == True] = np.zeros( eval_rnn_states[eval_dones == True] = np.zeros(
((eval_dones == True).sum(), self.recurrent_N, self.hidden_size), ((eval_dones == True).sum(), self.recurrent_N, self.hidden_size),
dtype=np.float32, dtype=np.float32,
) )
eval_masks = np.ones( eval_masks = np.ones((self.n_eval_rollout_threads, self.num_agents, 1), dtype=np.float32)
(self.n_eval_rollout_threads, self.num_agents, 1), dtype=np.float32 eval_masks[eval_dones == True] = np.zeros(((eval_dones == True).sum(), 1), dtype=np.float32)
)
eval_masks[eval_dones == True] = np.zeros(
((eval_dones == True).sum(), 1), dtype=np.float32
)
eval_episode_rewards = np.array(eval_episode_rewards) eval_episode_rewards = np.array(eval_episode_rewards)
eval_env_infos = {} eval_env_infos = {}
eval_env_infos["eval_average_episode_rewards"] = np.sum( eval_env_infos["eval_average_episode_rewards"] = np.sum(np.array(eval_episode_rewards), axis=0)
np.array(eval_episode_rewards), axis=0 eval_average_episode_rewards = np.mean(eval_env_infos["eval_average_episode_rewards"])
) print("eval average episode rewards of agent: " + str(eval_average_episode_rewards))
eval_average_episode_rewards = np.mean(
eval_env_infos["eval_average_episode_rewards"]
)
print(
"eval average episode rewards of agent: "
+ str(eval_average_episode_rewards)
)
self.log_env(eval_env_infos, total_num_steps) self.log_env(eval_env_infos, total_num_steps)
@torch.no_grad() @torch.no_grad()
...@@ -330,9 +297,7 @@ class EnvRunner(Runner): ...@@ -330,9 +297,7 @@ class EnvRunner(Runner):
), ),
dtype=np.float32, dtype=np.float32,
) )
masks = np.ones( masks = np.ones((self.n_rollout_threads, self.num_agents, 1), dtype=np.float32)
(self.n_rollout_threads, self.num_agents, 1), dtype=np.float32
)
episode_rewards = [] episode_rewards = []
...@@ -347,21 +312,15 @@ class EnvRunner(Runner): ...@@ -347,21 +312,15 @@ class EnvRunner(Runner):
deterministic=True, deterministic=True,
) )
actions = np.array(np.split(_t2n(action), self.n_rollout_threads)) actions = np.array(np.split(_t2n(action), self.n_rollout_threads))
rnn_states = np.array( rnn_states = np.array(np.split(_t2n(rnn_states), self.n_rollout_threads))
np.split(_t2n(rnn_states), self.n_rollout_threads)
)
if envs.action_space[0].__class__.__name__ == "MultiDiscrete": if envs.action_space[0].__class__.__name__ == "MultiDiscrete":
for i in range(envs.action_space[0].shape): for i in range(envs.action_space[0].shape):
uc_actions_env = np.eye(envs.action_space[0].high[i] + 1)[ uc_actions_env = np.eye(envs.action_space[0].high[i] + 1)[actions[:, :, i]]
actions[:, :, i]
]
if i == 0: if i == 0:
actions_env = uc_actions_env actions_env = uc_actions_env
else: else:
actions_env = np.concatenate( actions_env = np.concatenate((actions_env, uc_actions_env), axis=2)
(actions_env, uc_actions_env), axis=2
)
elif envs.action_space[0].__class__.__name__ == "Discrete": elif envs.action_space[0].__class__.__name__ == "Discrete":
actions_env = np.squeeze(np.eye(envs.action_space[0].n)[actions], 2) actions_env = np.squeeze(np.eye(envs.action_space[0].n)[actions], 2)
else: else:
...@@ -375,12 +334,8 @@ class EnvRunner(Runner): ...@@ -375,12 +334,8 @@ class EnvRunner(Runner):
((dones == True).sum(), self.recurrent_N, self.hidden_size), ((dones == True).sum(), self.recurrent_N, self.hidden_size),
dtype=np.float32, dtype=np.float32,
) )
masks = np.ones( masks = np.ones((self.n_rollout_threads, self.num_agents, 1), dtype=np.float32)
(self.n_rollout_threads, self.num_agents, 1), dtype=np.float32 masks[dones == True] = np.zeros(((dones == True).sum(), 1), dtype=np.float32)
)
masks[dones == True] = np.zeros(
((dones == True).sum(), 1), dtype=np.float32
)
if self.all_args.save_gifs: if self.all_args.save_gifs:
image = envs.render("rgb_array")[0][0] image = envs.render("rgb_array")[0][0]
...@@ -392,10 +347,7 @@ class EnvRunner(Runner): ...@@ -392,10 +347,7 @@ class EnvRunner(Runner):
else: else:
envs.render("human") envs.render("human")
print( print("average episode rewards is: " + str(np.mean(np.sum(np.array(episode_rewards), axis=0))))
"average episode rewards is: "
+ str(np.mean(np.sum(np.array(episode_rewards), axis=0)))
)
# if self.all_args.save_gifs: # if self.all_args.save_gifs:
# imageio.mimsave(str(self.gif_dir) + '/render.gif', all_frames, duration=self.all_args.ifi) # imageio.mimsave(str(self.gif_dir) + '/render.gif', all_frames, duration=self.all_args.ifi)
...@@ -31,11 +31,15 @@ def make_train_env(all_args): ...@@ -31,11 +31,15 @@ def make_train_env(all_args):
def init_env(): def init_env():
# TODO Important: choose a continuous or discrete action space by commenting/uncommenting the two lines above or the two lines below. # TODO Important: choose a continuous or discrete action space by commenting/uncommenting the two lines above or the two lines below.
from envs.env_continuous import ContinuousActionEnv from envs.env_continuous import ContinuousActionEnv
env = ContinuousActionEnv() env = ContinuousActionEnv()
# from envs.env_discrete import DiscreteActionEnv # from envs.env_discrete import DiscreteActionEnv
# env = DiscreteActionEnv() # env = DiscreteActionEnv()
env.seed(all_args.seed + rank * 1000) env.seed(all_args.seed + rank * 1000)
return env return env
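For completeness, a sketch of the discrete variant referred to in the TODO above (it reuses `all_args` and `rank` from the enclosing make_train_env scope, exactly like the continuous version):

```python
# Discrete variant of init_env (sketch): swap the import and the constructor.
def init_env():
    from envs.env_discrete import DiscreteActionEnv

    env = DiscreteActionEnv()
    env.seed(all_args.seed + rank * 1000)  # all_args and rank come from the enclosing closure
    return env
```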
...@@ -63,9 +67,7 @@ def make_eval_env(all_args): ...@@ -63,9 +67,7 @@ def make_eval_env(all_args):
def parse_args(args, parser): def parse_args(args, parser):
parser.add_argument( parser.add_argument("--scenario_name", type=str, default="MyEnv", help="Which scenario to run on")
"--scenario_name", type=str, default="MyEnv", help="Which scenario to run on"
)
parser.add_argument("--num_landmarks", type=int, default=3) parser.add_argument("--num_landmarks", type=int, default=3)
parser.add_argument("--num_agents", type=int, default=2, help="number of players") parser.add_argument("--num_agents", type=int, default=2, help="number of players")
...@@ -79,20 +81,16 @@ def main(args): ...@@ -79,20 +81,16 @@ def main(args):
all_args = parse_args(args, parser) all_args = parse_args(args, parser)
if all_args.algorithm_name == "rmappo": if all_args.algorithm_name == "rmappo":
assert ( assert all_args.use_recurrent_policy or all_args.use_naive_recurrent_policy, "check recurrent policy!"
all_args.use_recurrent_policy or all_args.use_naive_recurrent_policy
), "check recurrent policy!"
elif all_args.algorithm_name == "mappo": elif all_args.algorithm_name == "mappo":
assert ( assert (
all_args.use_recurrent_policy == False all_args.use_recurrent_policy == False and all_args.use_naive_recurrent_policy == False
and all_args.use_naive_recurrent_policy == False
), "check recurrent policy!" ), "check recurrent policy!"
else: else:
raise NotImplementedError raise NotImplementedError
assert ( assert (
all_args.share_policy == True all_args.share_policy == True and all_args.scenario_name == "simple_speaker_listener"
and all_args.scenario_name == "simple_speaker_listener"
) == False, "The simple_speaker_listener scenario can not use shared policy. Please check the config.py." ) == False, "The simple_speaker_listener scenario can not use shared policy. Please check the config.py."
# cuda # cuda
......
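For context, a hedged sketch of how these checks behave with the defaults from config.py (the programmatic call is illustrative; the actual entry script wires get_config through its own parse_args first):

```python
# Illustrative only: exercise the same asserts as main() using the default config values.
from config import get_config  # assumed module name

all_args = get_config().parse_known_args([])[0]
if all_args.algorithm_name == "rmappo":
    assert all_args.use_recurrent_policy or all_args.use_naive_recurrent_policy, "check recurrent policy!"
elif all_args.algorithm_name == "mappo":  # default
    assert not (all_args.use_recurrent_policy or all_args.use_naive_recurrent_policy), "check recurrent policy!"
print(all_args.algorithm_name, all_args.share_policy)  # mappo False
```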