Commit 90258d12 authored by hzq

add continuous action space

parent 04fa328f
......@@ -22,7 +22,7 @@ The original MAPPO code wraps the environment in an overly complicated way; this project wraps the environment directly
## Usage
- The environment part is an empty implementation; the environment implementation lives in the file `light_mappo/envs/env_wrappers.py`: [Code](https://github.com/tinyzqh/light_mappo/blob/main/envs/env_wrappers.py)
- The environment part is an empty implementation; the environment implementation lives in the file `light_mappo/envs/env_core.py`: [Code](https://github.com/tinyzqh/light_mappo/blob/main/envs/env_wrappers.py)
```python
class Env(object):
......@@ -63,7 +63,7 @@ class Env(object):
```
You only need to write this part of the code to integrate seamlessly with MAPPO. This is an initial version; this part will be factored out separately later.
You only need to write this part of the code to integrate seamlessly with MAPPO. Following env_core.py, two separate files, env_discrete.py and env_continuous.py, have been factored out to wrap the discrete and continuous action spaces. The `elif self.continuous_action:` branch in algorithms/utils/act.py handles the continuous action space, as does the spot marked `# TODO` in runner/shared/env_runner.py ("adapt this into whatever form your own environment needs"); a sketch of such a continuous wrapper is shown below.
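env_continuous.py itself is not expanded in this commit view, so the following is only a minimal sketch of what a continuous-action wrapper around EnvCore could look like, assuming one gym `Box` action space per agent; the class body and the [-1, 1] action range are illustrative, not the repository's exact implementation.
```python
# Minimal sketch of a continuous-action wrapper (hypothetical; the real
# env_continuous.py is not expanded in this diff). Assumes EnvCore exposes
# agent_num, obs_dim and action_dim like the dummy environment below.
import numpy as np
from gym import spaces
from envs.env_core import EnvCore


class ContinuousActionEnv(object):
    """Wrapper that exposes a Box action space per agent."""

    def __init__(self):
        self.env = EnvCore()
        self.num_agent = self.env.agent_num
        self.signal_obs_dim = self.env.obs_dim
        self.signal_action_dim = self.env.action_dim

        self.action_space = []
        self.observation_space = []
        share_obs_dim = 0
        for _ in range(self.num_agent):
            # Continuous actions in [-1, 1]; the range is an assumption.
            self.action_space.append(
                spaces.Box(low=-1.0, high=+1.0, shape=(self.signal_action_dim,), dtype=np.float32))
            share_obs_dim += self.signal_obs_dim
            self.observation_space.append(
                spaces.Box(low=-np.inf, high=+np.inf, shape=(self.signal_obs_dim,), dtype=np.float32))
        self.share_observation_space = [
            spaces.Box(low=-np.inf, high=+np.inf, shape=(share_obs_dim,), dtype=np.float32)
            for _ in range(self.num_agent)]

    def step(self, actions):
        obs, rews, dones, infos = self.env.step(actions)
        return np.stack(obs), np.stack(rews), np.stack(dones), infos

    def reset(self):
        return np.stack(self.env.reset())

    def seed(self, seed):
        pass
```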
## Related Efforts
......
......@@ -14,11 +14,13 @@ class ACTLayer(nn.Module):
super(ACTLayer, self).__init__()
self.mixed_action = False
self.multi_discrete = False
self.continuous_action = False
if action_space.__class__.__name__ == "Discrete":
action_dim = action_space.n
self.action_out = Categorical(inputs_dim, action_dim, use_orthogonal, gain)
elif action_space.__class__.__name__ == "Box":
self.continuous_action = True
action_dim = action_space.shape[0]
self.action_out = DiagGaussian(inputs_dim, action_dim, use_orthogonal, gain)
elif action_space.__class__.__name__ == "MultiBinary":
......@@ -49,7 +51,7 @@ class ACTLayer(nn.Module):
:return actions: (torch.Tensor) actions to take.
:return action_log_probs: (torch.Tensor) log probabilities of taken actions.
"""
if self.mixed_action :
if self.mixed_action:
actions = []
action_log_probs = []
for action_out in self.action_outs:
......@@ -74,7 +76,16 @@ class ACTLayer(nn.Module):
actions = torch.cat(actions, -1)
action_log_probs = torch.cat(action_log_probs, -1)
elif self.continuous_action:
# actions = []
# action_log_probs = []
action_logit = self.action_out(x)
actions = action_logit.mode() if deterministic else action_logit.sample()
action_log_probs = action_logit.log_probs(actions)
# actions.append(action.float())
# action_log_probs.append(action_log_prob)
# actions = torch.cat(actions, -1)
# action_log_probs = torch.sum(torch.cat(action_log_probs, -1), -1, keepdim=True)
else:
action_logits = self.action_out(x, available_actions)
actions = action_logits.mode() if deterministic else action_logits.sample()
......@@ -151,6 +162,27 @@ class ACTLayer(nn.Module):
action_log_probs = torch.cat(action_log_probs, -1) # ! could be wrong
dist_entropy = torch.tensor(dist_entropy).mean()
elif self.continuous_action:
# a, b = action.split((2, 1), -1)
# b = b.long()
# action = [a, b]
action_log_probs = []
dist_entropy = []
# for action_out, act in zip(self.action_outs, action):
action_logit = self.action_out(x)
action_log_probs.append(action_logit.log_probs(action))
if active_masks is not None:
if len(action_logit.entropy().shape) == len(active_masks.shape):
dist_entropy.append((action_logit.entropy() * active_masks).sum() / active_masks.sum())
else:
dist_entropy.append(
(action_logit.entropy() * active_masks.squeeze(-1)).sum() / active_masks.sum())
else:
dist_entropy.append(action_logit.entropy().mean())
action_log_probs = torch.sum(torch.cat(action_log_probs, -1), -1, keepdim=True)
dist_entropy = dist_entropy[0] # / 2.0 + dist_entropy[1] / 0.98 # ! doesn't make sense
else:
action_logits = self.action_out(x, available_actions)
action_log_probs = action_logits.log_probs(action)
......
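The `continuous_action` branches above assume that the `DiagGaussian` head returns a distribution object exposing `mode()`, `sample()`, `log_probs()`, and `entropy()`. The repository's own implementation is not shown in this diff; the snippet below is only a minimal sketch of a diagonal-Gaussian head with that interface (`SimpleDiagGaussian` and `FixedNormal` are illustrative names, not the project's classes).
```python
# Minimal sketch of a diagonal-Gaussian action head with the interface the
# continuous_action branch expects (mode/sample/log_probs/entropy).
# Illustration only, not the repository's DiagGaussian implementation.
import torch
import torch.nn as nn


class FixedNormal(torch.distributions.Normal):
    def log_probs(self, actions):
        # Sum per-dimension log-probs into a single column per sample.
        return super().log_prob(actions).sum(-1, keepdim=True)

    def entropy(self):
        return super().entropy().sum(-1)

    def mode(self):
        return self.mean


class SimpleDiagGaussian(nn.Module):
    def __init__(self, num_inputs, num_outputs):
        super().__init__()
        self.fc_mean = nn.Linear(num_inputs, num_outputs)
        self.log_std = nn.Parameter(torch.zeros(num_outputs))

    def forward(self, x):
        mean = self.fc_mean(x)
        return FixedNormal(mean, self.log_std.exp())


# Usage mirroring the forward() branch above:
# dist = self.action_out(x)
# actions = dist.mode() if deterministic else dist.sample()
# action_log_probs = dist.log_probs(actions)
```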
......@@ -176,7 +176,6 @@ def get_config():
parser.add_argument("--num_env_steps", type=int, default=10e6,
help='Number of environment steps to train (default: 10e6)')
parser.add_argument("--user_name", type=str, default='marl',help="[for wandb usage], to specify user's name for simply collecting training data.")
parser.add_argument("--use_wandb", action='store_false', default=False, help="[for wandb usage], by default True, will log date to wandb server. or else will use tensorboard to log data.")
# env parameters
parser.add_argument("--env_name", type=str, default='MyEnv', help="specify the name of environment")
......
"""
# @Time : 2021/7/2 5:22 PM
# @Author : hezhiqiang01
# @Email : hezhiqiang01@baidu.com
# @File : env.py
"""
import numpy as np


class Env(object):
    """
    # Agents in the environment
    """

    def __init__(self, i):
        self.agent_num = 2  # number of agents (small aircraft); set to 2 here
        self.obs_dim = 14  # observation dimension of each agent
        self.action_dim = 5  # action dimension of each agent; assumed to be 5 here

    def reset(self):
        """
        # With self.agent_num set to 2 agents, the return value is a list in which
        # each element is an observation array of shape (self.obs_dim, )
        """
        sub_agent_obs = []
        for i in range(self.agent_num):
            sub_obs = np.random.random(size=(14, ))
            sub_agent_obs.append(sub_obs)
        return sub_agent_obs

    def step(self, actions):
        """
        # With self.agent_num set to 2 agents, the input actions is a 2-element list in which
        # each element is an action array of shape (self.action_dim, )
        # With the default parameters, the input is a list containing two elements; since the
        # action dimension is 5, each element has shape (5, )
        """
        sub_agent_obs = []
        sub_agent_reward = []
        sub_agent_done = []
        sub_agent_info = []
        for i in range(self.agent_num):
            sub_agent_obs.append(np.random.random(size=(14,)))
            sub_agent_reward.append([np.random.rand()])
            sub_agent_done.append(False)
            sub_agent_info.append({})
        return [sub_agent_obs, sub_agent_reward, sub_agent_done, sub_agent_info]
\ No newline at end of file
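A quick usage sketch for the dummy Env above (hypothetical smoke test, not part of the commit), assuming the Env class is in scope: reset and step return per-agent lists whose shapes follow obs_dim and agent_num.
```python
# Hypothetical smoke test for the dummy Env above (illustration only).
import numpy as np

env = Env(0)                                    # the unused `i` argument is kept as defined above
obs = env.reset()                               # list of 2 arrays, each of shape (14,)
actions = [np.random.random(5) for _ in range(env.agent_num)]
obs, rewards, dones, infos = env.step(actions)
print(len(obs), obs[0].shape, rewards[0])       # 2 (14,) [<float>]
```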
"""
# @Time : 2021/7/2 5:22 PM
# @Author : hezhiqiang
# @Email : tinyzqh@163.com
# @File : env_discrete.py
"""
import gym
from gym import spaces
import numpy as np
from envs.env_core import EnvCore


class DiscreteActionEnv(object):
    """Wrapper for discrete-action environments"""

    def __init__(self):
        self.env = EnvCore()
        self.num_agent = self.env.agent_num
        self.signal_obs_dim = self.env.obs_dim
        self.signal_action_dim = self.env.action_dim

        # if true, action is a number 0...N, otherwise action is a one-hot N-dimensional vector
        self.discrete_action_input = False
        self.movable = True

        # configure spaces
        self.action_space = []
        self.observation_space = []
        self.share_observation_space = []
        share_obs_dim = 0
        for agent in range(self.num_agent):
            total_action_space = []

            # physical action space
            u_action_space = spaces.Discrete(self.signal_action_dim)  # 5 discrete actions
            if self.movable:
                total_action_space.append(u_action_space)

            # total action space
            if len(total_action_space) > 1:
                # all action spaces are discrete, so simplify to MultiDiscrete action space
                if all([isinstance(act_space, spaces.Discrete) for act_space in total_action_space]):
                    act_space = MultiDiscrete([[0, act_space.n - 1] for act_space in total_action_space])
                else:
                    act_space = spaces.Tuple(total_action_space)
                self.action_space.append(act_space)
            else:
                self.action_space.append(total_action_space[0])

            # observation space
            share_obs_dim += self.signal_obs_dim
            self.observation_space.append(spaces.Box(low=-np.inf, high=+np.inf, shape=(self.signal_obs_dim,),
                                                     dtype=np.float32))  # [-inf, inf]

        self.share_observation_space = [spaces.Box(low=-np.inf, high=+np.inf, shape=(share_obs_dim,),
                                                   dtype=np.float32) for _ in range(self.num_agent)]

    def step(self, actions):
        """
        Assumed input dimensions for actions:
        # actions shape = (5, 2, 5)
        # 5 threaded environments, each containing 2 agents; each agent's action is a one-hot 5-dimensional encoding
        """
        results = self.env.step(actions)
        obs, rews, dones, infos = results
        return np.stack(obs), np.stack(rews), np.stack(dones), infos

    def reset(self):
        obs = self.env.reset()
        return np.stack(obs)

    def close(self):
        pass

    def render(self, mode="rgb_array"):
        pass

    def seed(self, seed):
        pass


class MultiDiscrete(gym.Space):
    """
    - The multi-discrete action space consists of a series of discrete action spaces with different parameters
    - It can be adapted to both a Discrete action space or a continuous (Box) action space
    - It is useful to represent game controllers or keyboards where each key can be represented as a discrete action space
    - It is parametrized by passing an array of arrays containing [min, max] for each discrete action space
      where the discrete action space can take any integers from `min` to `max` (both inclusive)
    Note: A value of 0 always needs to represent the NOOP action.
    e.g. Nintendo Game Controller
    - Can be conceptualized as 3 discrete action spaces:
        1) Arrow Keys: Discrete 5 - NOOP[0], UP[1], RIGHT[2], DOWN[3], LEFT[4] - params: min: 0, max: 4
        2) Button A: Discrete 2 - NOOP[0], Pressed[1] - params: min: 0, max: 1
        3) Button B: Discrete 2 - NOOP[0], Pressed[1] - params: min: 0, max: 1
    - Can be initialized as
        MultiDiscrete([ [0,4], [0,1], [0,1] ])
    """

    def __init__(self, array_of_param_array):
        super().__init__()
        self.low = np.array([x[0] for x in array_of_param_array])
        self.high = np.array([x[1] for x in array_of_param_array])
        self.num_discrete_space = self.low.shape[0]
        self.n = np.sum(self.high) + 2

    def sample(self):
        """Returns an array with one sample from each discrete action space"""
        # For each row: round(random .* (max - min) + min, 0)
        random_array = np.random.rand(self.num_discrete_space)
        return [int(x) for x in np.floor(np.multiply((self.high - self.low + 1.), random_array) + self.low)]

    def contains(self, x):
        return len(x) == self.num_discrete_space and (np.array(x) >= self.low).all() and (
            np.array(x) <= self.high).all()

    @property
    def shape(self):
        return self.num_discrete_space

    def __repr__(self):
        return "MultiDiscrete" + str(self.num_discrete_space)

    def __eq__(self, other):
        return np.array_equal(self.low, other.low) and np.array_equal(self.high, other.high)


if __name__ == "__main__":
    DiscreteActionEnv().step(actions=None)
\ No newline at end of file
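A brief usage sketch (hypothetical, assuming EnvCore mirrors the dummy Env above with agent_num=2, obs_dim=14, action_dim=5): each agent ends up with a Discrete(5) action space and a Box observation space of shape (14,).
```python
# Hypothetical inspection of the wrapper's spaces (illustration only;
# assumes EnvCore matches the dummy environment defined earlier).
from envs.env_discrete import DiscreteActionEnv

env = DiscreteActionEnv()
print(env.action_space)                # [Discrete(5), Discrete(5)]
print(env.observation_space[0].shape)  # (14,)
obs = env.reset()
print(obs.shape)                       # (2, 14): stacked per-agent observations
```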
This diff is collapsed.
import wandb
import os
import numpy as np
import torch
......@@ -37,7 +36,6 @@ class Runner(object):
self.n_render_rollout_threads = self.all_args.n_render_rollout_threads
self.use_linear_lr_decay = self.all_args.use_linear_lr_decay
self.hidden_size = self.all_args.hidden_size
self.use_wandb = self.all_args.use_wandb
self.use_render = self.all_args.use_render
self.recurrent_N = self.all_args.recurrent_N
......@@ -50,10 +48,6 @@ class Runner(object):
# dir
self.model_dir = self.all_args.model_dir
if self.use_wandb:
self.save_dir = str(wandb.run.dir)
self.run_dir = str(wandb.run.dir)
else:
self.run_dir = config["run_dir"]
self.log_dir = str(self.run_dir / 'logs')
if not os.path.exists(self.log_dir):
......@@ -146,9 +140,6 @@ class Runner(object):
:param total_num_steps: (int) total number of training env steps.
"""
for k, v in train_infos.items():
if self.use_wandb:
wandb.log({k: v}, step=total_num_steps)
else:
self.writter.add_scalars(k, {k: v}, total_num_steps)
def log_env(self, env_infos, total_num_steps):
......@@ -159,7 +150,4 @@ class Runner(object):
"""
for k, v in env_infos.items():
if len(v)>0:
if self.use_wandb:
wandb.log({k: np.mean(v)}, step=total_num_steps)
else:
self.writter.add_scalars(k, {k: np.mean(v)}, total_num_steps)
......@@ -16,7 +16,6 @@ import time
import numpy as np
import torch
from runner.shared.base_runner import Runner
import wandb
import imageio
......@@ -133,9 +132,12 @@ class EnvRunner(Runner):
else:
actions_env = np.concatenate((actions_env, uc_actions_env), axis=2)
elif self.envs.action_space[0].__class__.__name__ == 'Discrete':
# actions --> actions_env : shape:[10, 1] --> [5, 2, 5]
actions_env = np.squeeze(np.eye(self.envs.action_space[0].n)[actions], 2)
else:
raise NotImplementedError
# TODO: adapt this into whatever form your own environment needs
actions_env = actions
# raise NotImplementedError
return values, actions, action_log_probs, rnn_states, rnn_states_critic, actions_env
......
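For the Discrete branch above, the np.eye trick converts per-thread, per-agent action indices into one-hot vectors. Below is a small, hedged standalone illustration of that shape change (thread, agent, and action counts are assumed to match the defaults used in this repo: 5 rollout threads, 2 agents, 5 actions).
```python
# Standalone illustration of the one-hot conversion used in the Discrete
# branch of collect() (shapes assumed: 5 rollout threads, 2 agents, 5 actions).
import numpy as np

n_threads, n_agents, n_actions = 5, 2, 5
actions = np.random.randint(n_actions, size=(n_threads, n_agents, 1))

actions_env = np.squeeze(np.eye(n_actions)[actions], 2)
print(actions_env.shape)  # (5, 2, 5), one-hot along the last axis
```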
"""
# @Time : 2021/6/30 10:07 PM
# @Author : hezhiqiang01
# @Email : hezhiqiang01@baidu.com
# @Author : hezhiqiang
# @Email : tinyzqh@163.com
# @File : train.py
"""
#!/usr/bin/env python
import sys
import os
import wandb
import socket
import setproctitle
import numpy as np
......@@ -21,11 +20,27 @@ from envs.env_wrappers import SubprocVecEnv, DummyVecEnv
def make_train_env(all_args):
return SubprocVecEnv(all_args)
def get_env_fn(rank):
def init_env():
# from envs.env_continuous import ContinuousActionEnv
# env = ContinuousActionEnv()
from envs.env_discrete import DiscreteActionEnv
env = DiscreteActionEnv()
env.seed(all_args.seed + rank * 1000)
return env
return init_env
return DummyVecEnv([get_env_fn(i) for i in range(all_args.n_rollout_threads)])
def make_eval_env(all_args):
return DummyVecEnv(all_args)
def get_env_fn(rank):
def init_env():
from envs.env_discrete import DiscreteActionEnv
env = DiscreteActionEnv()
env.seed(all_args.seed + rank * 1000)
return env
return init_env
return DummyVecEnv([get_env_fn(i) for i in range(all_args.n_rollout_threads)])
def parse_args(args, parser):
......@@ -72,20 +87,6 @@ def main(args):
if not run_dir.exists():
os.makedirs(str(run_dir))
# wandb
if all_args.use_wandb:
run = wandb.init(config=all_args,
project=all_args.env_name,
entity=all_args.user_name,
notes=socket.gethostname(),
name=str(all_args.algorithm_name) + "_" +
str(all_args.experiment_name) +
"_seed" + str(all_args.seed),
group=all_args.scenario_name,
dir=str(run_dir),
job_type="training",
reinit=True)
else:
if not run_dir.exists():
curr_run = 'run1'
else:
......@@ -136,9 +137,6 @@ def main(args):
if all_args.use_eval and eval_envs is not envs:
eval_envs.close()
if all_args.use_wandb:
run.finish()
else:
runner.writter.export_scalars_to_json(str(runner.log_dir + '/summary.json'))
runner.writter.close()
......