Commit 90258d12 authored by hzq

add continuous action space

parent 04fa328f
......@@ -22,7 +22,7 @@ The original MAPPO code wraps the environment in an overly complicated way; this project wraps the environment directly
## Usage
- The environment part is an empty implementation; the environment implementation lives in the file `light_mappo/envs/env_wrappers.py`: [Code](https://github.com/tinyzqh/light_mappo/blob/main/envs/env_wrappers.py)
- The environment part is an empty implementation; the environment implementation lives in the file `light_mappo/envs/env_core.py`: [Code](https://github.com/tinyzqh/light_mappo/blob/main/envs/env_wrappers.py)
```python
class Env(object):
......@@ -63,7 +63,7 @@ class Env(object):
```
You only need to write this part of the code to integrate seamlessly with MAPPO. This is an initial version; this part will be factored out separately later.
You only need to write this part of the code to integrate seamlessly with MAPPO. Following env_core.py, two separate files, env_discrete.py and env_continuous.py, have been factored out to wrap the discrete and continuous action spaces. The `elif self.continuous_action:` branch in algorithms/utils/act.py handles the continuous action space, as does the spot marked `# TODO` in runner/shared/env_runner.py ("adapt this into whatever form your own environment needs"); a sketch of such a continuous wrapper is shown below.
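env_continuous.py itself is not expanded in this commit view, so the following is only a minimal sketch of what a continuous-action wrapper around EnvCore could look like, assuming one gym `Box` action space per agent; the class body and the [-1, 1] action range are illustrative, not the repository's exact implementation.
```python
# Minimal sketch of a continuous-action wrapper (hypothetical; the real
# env_continuous.py is not expanded in this diff). Assumes EnvCore exposes
# agent_num, obs_dim and action_dim like the dummy environment below.
import numpy as np
from gym import spaces
from envs.env_core import EnvCore


class ContinuousActionEnv(object):
    """Wrapper that exposes a Box action space per agent."""

    def __init__(self):
        self.env = EnvCore()
        self.num_agent = self.env.agent_num
        self.signal_obs_dim = self.env.obs_dim
        self.signal_action_dim = self.env.action_dim

        self.action_space = []
        self.observation_space = []
        share_obs_dim = 0
        for _ in range(self.num_agent):
            # Continuous actions in [-1, 1]; the range is an assumption.
            self.action_space.append(
                spaces.Box(low=-1.0, high=+1.0, shape=(self.signal_action_dim,), dtype=np.float32))
            share_obs_dim += self.signal_obs_dim
            self.observation_space.append(
                spaces.Box(low=-np.inf, high=+np.inf, shape=(self.signal_obs_dim,), dtype=np.float32))
        self.share_observation_space = [
            spaces.Box(low=-np.inf, high=+np.inf, shape=(share_obs_dim,), dtype=np.float32)
            for _ in range(self.num_agent)]

    def step(self, actions):
        obs, rews, dones, infos = self.env.step(actions)
        return np.stack(obs), np.stack(rews), np.stack(dones), infos

    def reset(self):
        return np.stack(self.env.reset())

    def seed(self, seed):
        pass
```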
## Related Efforts
......
......@@ -14,11 +14,13 @@ class ACTLayer(nn.Module):
super(ACTLayer, self).__init__()
self.mixed_action = False
self.multi_discrete = False
self.continuous_action = False
if action_space.__class__.__name__ == "Discrete":
action_dim = action_space.n
self.action_out = Categorical(inputs_dim, action_dim, use_orthogonal, gain)
elif action_space.__class__.__name__ == "Box":
self.continuous_action = True
action_dim = action_space.shape[0]
self.action_out = DiagGaussian(inputs_dim, action_dim, use_orthogonal, gain)
elif action_space.__class__.__name__ == "MultiBinary":
......@@ -49,7 +51,7 @@ class ACTLayer(nn.Module):
:return actions: (torch.Tensor) actions to take.
:return action_log_probs: (torch.Tensor) log probabilities of taken actions.
"""
if self.mixed_action :
if self.mixed_action:
actions = []
action_log_probs = []
for action_out in self.action_outs:
......@@ -74,7 +76,16 @@ class ACTLayer(nn.Module):
actions = torch.cat(actions, -1)
action_log_probs = torch.cat(action_log_probs, -1)
elif self.continuous_action:
# actions = []
# action_log_probs = []
action_logit = self.action_out(x)
actions = action_logit.mode() if deterministic else action_logit.sample()
action_log_probs = action_logit.log_probs(actions)
# actions.append(action.float())
# action_log_probs.append(action_log_prob)
# actions = torch.cat(actions, -1)
# action_log_probs = torch.sum(torch.cat(action_log_probs, -1), -1, keepdim=True)
else:
action_logits = self.action_out(x, available_actions)
actions = action_logits.mode() if deterministic else action_logits.sample()
......@@ -151,6 +162,27 @@ class ACTLayer(nn.Module):
action_log_probs = torch.cat(action_log_probs, -1) # ! could be wrong
dist_entropy = torch.tensor(dist_entropy).mean()
elif self.continuous_action:
# a, b = action.split((2, 1), -1)
# b = b.long()
# action = [a, b]
action_log_probs = []
dist_entropy = []
# for action_out, act in zip(self.action_outs, action):
action_logit = self.action_out(x)
action_log_probs.append(action_logit.log_probs(action))
if active_masks is not None:
if len(action_logit.entropy().shape) == len(active_masks.shape):
dist_entropy.append((action_logit.entropy() * active_masks).sum() / active_masks.sum())
else:
dist_entropy.append(
(action_logit.entropy() * active_masks.squeeze(-1)).sum() / active_masks.sum())
else:
dist_entropy.append(action_logit.entropy().mean())
action_log_probs = torch.sum(torch.cat(action_log_probs, -1), -1, keepdim=True)
dist_entropy = dist_entropy[0] # / 2.0 + dist_entropy[1] / 0.98 # ! doesn't make sense
else:
action_logits = self.action_out(x, available_actions)
action_log_probs = action_logits.log_probs(action)
......
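The `continuous_action` branches above assume that the `DiagGaussian` head returns a distribution object exposing `mode()`, `sample()`, `log_probs()`, and `entropy()`. The repository's own implementation is not shown in this diff; the snippet below is only a minimal sketch of a diagonal-Gaussian head with that interface (`SimpleDiagGaussian` and `FixedNormal` are illustrative names, not the project's classes).
```python
# Minimal sketch of a diagonal-Gaussian action head with the interface the
# continuous_action branch expects (mode/sample/log_probs/entropy).
# Illustration only, not the repository's DiagGaussian implementation.
import torch
import torch.nn as nn


class FixedNormal(torch.distributions.Normal):
    def log_probs(self, actions):
        # Sum per-dimension log-probs into a single column per sample.
        return super().log_prob(actions).sum(-1, keepdim=True)

    def entropy(self):
        return super().entropy().sum(-1)

    def mode(self):
        return self.mean


class SimpleDiagGaussian(nn.Module):
    def __init__(self, num_inputs, num_outputs):
        super().__init__()
        self.fc_mean = nn.Linear(num_inputs, num_outputs)
        self.log_std = nn.Parameter(torch.zeros(num_outputs))

    def forward(self, x):
        mean = self.fc_mean(x)
        return FixedNormal(mean, self.log_std.exp())


# Usage mirroring the forward() branch above:
# dist = self.action_out(x)
# actions = dist.mode() if deterministic else dist.sample()
# action_log_probs = dist.log_probs(actions)
```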
......@@ -176,7 +176,6 @@ def get_config():
parser.add_argument("--num_env_steps", type=int, default=10e6,
help='Number of environment steps to train (default: 10e6)')
parser.add_argument("--user_name", type=str, default='marl',help="[for wandb usage], to specify user's name for simply collecting training data.")
parser.add_argument("--use_wandb", action='store_false', default=False, help="[for wandb usage], by default True, will log date to wandb server. or else will use tensorboard to log data.")
# env parameters
parser.add_argument("--env_name", type=str, default='MyEnv', help="specify the name of environment")
......
"""
# @Time : 2021/7/2 5:22 PM
# @Author : hezhiqiang01
# @Email : hezhiqiang01@baidu.com
# @File : env.py
"""
import numpy as np


class Env(object):
    """
    # Agents in the environment
    """

    def __init__(self, i):
        self.agent_num = 2  # number of agents (small aircraft); set to 2 here
        self.obs_dim = 14  # observation dimension of each agent
        self.action_dim = 5  # action dimension of each agent; assumed to be 5 here

    def reset(self):
        """
        # With self.agent_num set to 2 agents, the return value is a list in which
        # each element is an observation array of shape (self.obs_dim, )
        """
        sub_agent_obs = []
        for i in range(self.agent_num):
            sub_obs = np.random.random(size=(14, ))
            sub_agent_obs.append(sub_obs)
        return sub_agent_obs

    def step(self, actions):
        """
        # With self.agent_num set to 2 agents, the input actions is a 2-element list in which
        # each element is an action array of shape (self.action_dim, )
        # With the default parameters, the input is a list containing two elements; since the
        # action dimension is 5, each element has shape (5, )
        """
        sub_agent_obs = []
        sub_agent_reward = []
        sub_agent_done = []
        sub_agent_info = []
        for i in range(self.agent_num):
            sub_agent_obs.append(np.random.random(size=(14,)))
            sub_agent_reward.append([np.random.rand()])
            sub_agent_done.append(False)
            sub_agent_info.append({})
        return [sub_agent_obs, sub_agent_reward, sub_agent_done, sub_agent_info]
\ No newline at end of file
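A quick usage sketch for the dummy Env above (hypothetical smoke test, not part of the commit), assuming the Env class is in scope: reset and step return per-agent lists whose shapes follow obs_dim and agent_num.
```python
# Hypothetical smoke test for the dummy Env above (illustration only).
import numpy as np

env = Env(0)                                    # the unused `i` argument is kept as defined above
obs = env.reset()                               # list of 2 arrays, each of shape (14,)
actions = [np.random.random(5) for _ in range(env.agent_num)]
obs, rewards, dones, infos = env.step(actions)
print(len(obs), obs[0].shape, rewards[0])       # 2 (14,) [<float>]
```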
"""
# @Time : 2021/7/2 5:22 PM
# @Author : hezhiqiang
# @Email : tinyzqh@163.com
# @File : env_discrete.py
"""
import gym
from gym import spaces
import numpy as np
from envs.env_core import EnvCore


class DiscreteActionEnv(object):
    """Wrapper for discrete-action environments"""

    def __init__(self):
        self.env = EnvCore()
        self.num_agent = self.env.agent_num
        self.signal_obs_dim = self.env.obs_dim
        self.signal_action_dim = self.env.action_dim

        # if true, action is a number 0...N, otherwise action is a one-hot N-dimensional vector
        self.discrete_action_input = False
        self.movable = True

        # configure spaces
        self.action_space = []
        self.observation_space = []
        self.share_observation_space = []
        share_obs_dim = 0
        for agent in range(self.num_agent):
            total_action_space = []

            # physical action space
            u_action_space = spaces.Discrete(self.signal_action_dim)  # 5 discrete actions
            if self.movable:
                total_action_space.append(u_action_space)

            # total action space
            if len(total_action_space) > 1:
                # all action spaces are discrete, so simplify to MultiDiscrete action space
                if all([isinstance(act_space, spaces.Discrete) for act_space in total_action_space]):
                    act_space = MultiDiscrete([[0, act_space.n - 1] for act_space in total_action_space])
                else:
                    act_space = spaces.Tuple(total_action_space)
                self.action_space.append(act_space)
            else:
                self.action_space.append(total_action_space[0])

            # observation space
            share_obs_dim += self.signal_obs_dim
            self.observation_space.append(spaces.Box(low=-np.inf, high=+np.inf, shape=(self.signal_obs_dim,),
                                                     dtype=np.float32))  # [-inf, inf]

        self.share_observation_space = [spaces.Box(low=-np.inf, high=+np.inf, shape=(share_obs_dim,),
                                                   dtype=np.float32) for _ in range(self.num_agent)]

    def step(self, actions):
        """
        Assumed input dimensions for actions:
        # actions shape = (5, 2, 5)
        # 5 threaded environments, each containing 2 agents; each agent's action is a one-hot 5-dimensional encoding
        """
        results = self.env.step(actions)
        obs, rews, dones, infos = results
        return np.stack(obs), np.stack(rews), np.stack(dones), infos

    def reset(self):
        obs = self.env.reset()
        return np.stack(obs)

    def close(self):
        pass

    def render(self, mode="rgb_array"):
        pass

    def seed(self, seed):
        pass


class MultiDiscrete(gym.Space):
    """
    - The multi-discrete action space consists of a series of discrete action spaces with different parameters
    - It can be adapted to both a Discrete action space or a continuous (Box) action space
    - It is useful to represent game controllers or keyboards where each key can be represented as a discrete action space
    - It is parametrized by passing an array of arrays containing [min, max] for each discrete action space
      where the discrete action space can take any integers from `min` to `max` (both inclusive)
    Note: A value of 0 always needs to represent the NOOP action.
    e.g. Nintendo Game Controller
    - Can be conceptualized as 3 discrete action spaces:
        1) Arrow Keys: Discrete 5 - NOOP[0], UP[1], RIGHT[2], DOWN[3], LEFT[4] - params: min: 0, max: 4
        2) Button A: Discrete 2 - NOOP[0], Pressed[1] - params: min: 0, max: 1
        3) Button B: Discrete 2 - NOOP[0], Pressed[1] - params: min: 0, max: 1
    - Can be initialized as
        MultiDiscrete([ [0,4], [0,1], [0,1] ])
    """

    def __init__(self, array_of_param_array):
        super().__init__()
        self.low = np.array([x[0] for x in array_of_param_array])
        self.high = np.array([x[1] for x in array_of_param_array])
        self.num_discrete_space = self.low.shape[0]
        self.n = np.sum(self.high) + 2

    def sample(self):
        """Returns an array with one sample from each discrete action space"""
        # For each row: round(random .* (max - min) + min, 0)
        random_array = np.random.rand(self.num_discrete_space)
        return [int(x) for x in np.floor(np.multiply((self.high - self.low + 1.), random_array) + self.low)]

    def contains(self, x):
        return len(x) == self.num_discrete_space and (np.array(x) >= self.low).all() and (
            np.array(x) <= self.high).all()

    @property
    def shape(self):
        return self.num_discrete_space

    def __repr__(self):
        return "MultiDiscrete" + str(self.num_discrete_space)

    def __eq__(self, other):
        return np.array_equal(self.low, other.low) and np.array_equal(self.high, other.high)


if __name__ == "__main__":
    DiscreteActionEnv().step(actions=None)
\ No newline at end of file
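A brief usage sketch (hypothetical, assuming EnvCore mirrors the dummy Env above with agent_num=2, obs_dim=14, action_dim=5): each agent ends up with a Discrete(5) action space and a Box observation space of shape (14,).
```python
# Hypothetical inspection of the wrapper's spaces (illustration only;
# assumes EnvCore matches the dummy environment defined earlier).
from envs.env_discrete import DiscreteActionEnv

env = DiscreteActionEnv()
print(env.action_space)                # [Discrete(5), Discrete(5)]
print(env.observation_space[0].shape)  # (14,)
obs = env.reset()
print(obs.shape)                       # (2, 14): stacked per-agent observations
```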
This diff is collapsed.
import wandb
import os
import numpy as np
import torch
......@@ -37,7 +36,6 @@ class Runner(object):
self.n_render_rollout_threads = self.all_args.n_render_rollout_threads
self.use_linear_lr_decay = self.all_args.use_linear_lr_decay
self.hidden_size = self.all_args.hidden_size
self.use_wandb = self.all_args.use_wandb
self.use_render = self.all_args.use_render
self.recurrent_N = self.all_args.recurrent_N
......@@ -50,10 +48,6 @@ class Runner(object):
# dir
self.model_dir = self.all_args.model_dir
if self.use_wandb:
self.save_dir = str(wandb.run.dir)
self.run_dir = str(wandb.run.dir)
else:
self.run_dir = config["run_dir"]
self.log_dir = str(self.run_dir / 'logs')
if not os.path.exists(self.log_dir):
......@@ -146,9 +140,6 @@ class Runner(object):
:param total_num_steps: (int) total number of training env steps.
"""
for k, v in train_infos.items():
if self.use_wandb:
wandb.log({k: v}, step=total_num_steps)
else:
self.writter.add_scalars(k, {k: v}, total_num_steps)
def log_env(self, env_infos, total_num_steps):
......@@ -159,7 +150,4 @@ class Runner(object):
"""
for k, v in env_infos.items():
if len(v)>0:
if self.use_wandb:
wandb.log({k: np.mean(v)}, step=total_num_steps)
else:
self.writter.add_scalars(k, {k: np.mean(v)}, total_num_steps)
......@@ -16,7 +16,6 @@ import time
import numpy as np
import torch
from runner.shared.base_runner import Runner
import wandb
import imageio
......@@ -133,9 +132,12 @@ class EnvRunner(Runner):
else:
actions_env = np.concatenate((actions_env, uc_actions_env), axis=2)
elif self.envs.action_space[0].__class__.__name__ == 'Discrete':
# actions --> actions_env : shape:[10, 1] --> [5, 2, 5]
actions_env = np.squeeze(np.eye(self.envs.action_space[0].n)[actions], 2)
else:
raise NotImplementedError
# TODO: adapt this into whatever form your own environment needs
actions_env = actions
# raise NotImplementedError
return values, actions, action_log_probs, rnn_states, rnn_states_critic, actions_env
......
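For the Discrete branch above, the np.eye trick converts per-thread, per-agent action indices into one-hot vectors. Below is a small, hedged standalone illustration of that shape change (thread, agent, and action counts are assumed to match the defaults used in this repo: 5 rollout threads, 2 agents, 5 actions).
```python
# Standalone illustration of the one-hot conversion used in the Discrete
# branch of collect() (shapes assumed: 5 rollout threads, 2 agents, 5 actions).
import numpy as np

n_threads, n_agents, n_actions = 5, 2, 5
actions = np.random.randint(n_actions, size=(n_threads, n_agents, 1))

actions_env = np.squeeze(np.eye(n_actions)[actions], 2)
print(actions_env.shape)  # (5, 2, 5), one-hot along the last axis
```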
"""
# @Time : 2021/6/30 10:07 PM
# @Author : hezhiqiang01
# @Email : hezhiqiang01@baidu.com
# @Author : hezhiqiang
# @Email : tinyzqh@163.com
# @File : train.py
"""
#!/usr/bin/env python
import sys
import os
import wandb
import socket
import setproctitle
import numpy as np
......@@ -21,11 +20,27 @@ from envs.env_wrappers import SubprocVecEnv, DummyVecEnv
def make_train_env(all_args):
return SubprocVecEnv(all_args)
def get_env_fn(rank):
def init_env():
# from envs.env_continuous import ContinuousActionEnv
# env = ContinuousActionEnv()
from envs.env_discrete import DiscreteActionEnv
env = DiscreteActionEnv()
env.seed(all_args.seed + rank * 1000)
return env
return init_env
return DummyVecEnv([get_env_fn(i) for i in range(all_args.n_rollout_threads)])
def make_eval_env(all_args):
return DummyVecEnv(all_args)
def get_env_fn(rank):
def init_env():
from envs.env_discrete import DiscreteActionEnv
env = DiscreteActionEnv()
env.seed(all_args.seed + rank * 1000)
return env
return init_env
return DummyVecEnv([get_env_fn(i) for i in range(all_args.n_rollout_threads)])
def parse_args(args, parser):
......@@ -72,20 +87,6 @@ def main(args):
if not run_dir.exists():
os.makedirs(str(run_dir))
# wandb
if all_args.use_wandb:
run = wandb.init(config=all_args,
project=all_args.env_name,
entity=all_args.user_name,
notes=socket.gethostname(),
name=str(all_args.algorithm_name) + "_" +
str(all_args.experiment_name) +
"_seed" + str(all_args.seed),
group=all_args.scenario_name,
dir=str(run_dir),
job_type="training",
reinit=True)
else:
if not run_dir.exists():
curr_run = 'run1'
else:
......@@ -136,9 +137,6 @@ def main(args):
if all_args.use_eval and eval_envs is not envs:
eval_envs.close()
if all_args.use_wandb:
run.finish()
else:
runner.writter.export_scalars_to_json(str(runner.log_dir + '/summary.json'))
runner.writter.close()
......