Commit a562d9db authored by 张逸鸣

111

parent ea508a3a
......@@ -9,7 +9,7 @@ import torch
import torch.nn as nn
from algorithms.utils.util import init, check
from algorithms.utils.cnn import CNNBase
from algorithms.utils.mlp import MLPBase
from algorithms.utils.mlp import MLPBase, MLPBaseWithTrans, MLPBaseGPT2
from algorithms.utils.rnn import RNNLayer
from algorithms.utils.act import ACTLayer
from algorithms.utils.popart import PopArt
......@@ -37,7 +37,7 @@ class R_Actor(nn.Module):
self.tpdv = dict(dtype=torch.float32, device=device)
obs_shape = get_shape_from_obs_space(obs_space)
base = CNNBase if len(obs_shape) == 3 else MLPBase
base = CNNBase if len(obs_shape) == 3 else MLPBaseGPT2
self.base = base(args, obs_shape)
if self._use_naive_recurrent_policy or self._use_recurrent_policy:
......@@ -134,7 +134,9 @@ class R_Critic(nn.Module):
init_method = [nn.init.xavier_uniform_, nn.init.orthogonal_][self._use_orthogonal]
cent_obs_shape = get_shape_from_obs_space(cent_obs_space)
base = CNNBase if len(cent_obs_shape) == 3 else MLPBase
base = CNNBase if len(cent_obs_shape) == 3 else MLPBaseGPT2
print("cent_obs_shape:" )
print(cent_obs_shape)
self.base = base(args, cent_obs_shape)
if self._use_naive_recurrent_policy or self._use_recurrent_policy:
......
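Both the actor and the critic above now hard-code MLPBaseGPT2 as the backbone for vector observations. A minimal sketch of making that choice configurable instead of hard-coded, assuming the three MLP variants imported above share MLPBase's (args, obs_shape) constructor; the mlp_base_type flag is hypothetical and not part of this commit:

from algorithms.utils.cnn import CNNBase
from algorithms.utils.mlp import MLPBase, MLPBaseWithTrans, MLPBaseGPT2

def build_base(args, obs_shape):
    # Image observations keep the CNN backbone; vector observations pick an MLP variant.
    if len(obs_shape) == 3:
        return CNNBase(args, obs_shape)
    variants = {"mlp": MLPBase, "trans": MLPBaseWithTrans, "gpt2": MLPBaseGPT2}
    # Defaulting to "gpt2" reproduces the hard-coded choice made in this commit.
    return variants[getattr(args, "mlp_base_type", "gpt2")](args, obs_shape)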
......@@ -15,9 +15,13 @@ class ACTLayer(nn.Module):
self.mixed_action = False
self.multi_discrete = False
self.continuous_action = False
self.tanh = nn.Tanh()
print(action_space)
if action_space.__class__.__name__ == "Discrete":
action_dim = action_space.n
print(action_dim)
self.action_out = Categorical(inputs_dim, action_dim, use_orthogonal, gain)
elif action_space.__class__.__name__ == "Box":
self.continuous_action = True
......@@ -30,10 +34,12 @@ class ACTLayer(nn.Module):
self.multi_discrete = True
action_dims = action_space.high - action_space.low + 1
self.action_outs = []
print(action_dims)
for action_dim in action_dims:
self.action_outs.append(Categorical(inputs_dim, action_dim, use_orthogonal, gain))
self.action_outs = nn.ModuleList(self.action_outs)
else: # discrete + continuous
print("mixed_action")
self.mixed_action = True
continous_dim = action_space[0].shape[0]
discrete_dim = action_space[1].n
......@@ -82,6 +88,8 @@ class ACTLayer(nn.Module):
action_logit = self.action_out(x)
actions = action_logit.mode() if deterministic else action_logit.sample()
action_log_probs = action_logit.log_probs(actions)
#actions = torch.sigmoid(actions)
actions = self.tanh(actions)
# actions.append(action.float())
# action_log_probs.append(action_log_prob)
# actions = torch.cat(actions, -1)
......
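The new `actions = self.tanh(actions)` bounds sampled continuous actions to (-1, 1), but the stored action_log_probs still describe the unsquashed sample. A sketch of the standard tanh change-of-variables correction, offered as an assumption about what a complete fix would look like rather than as part of this commit:

import torch

def squash_with_logprob(action, action_log_prob, eps=1e-6):
    # u = tanh(a); log pi(u) = log pi(a) - sum_i log(1 - tanh(a_i)^2)
    squashed = torch.tanh(action)
    correction = torch.log(1.0 - squashed.pow(2) + eps).sum(-1, keepdim=True)
    return squashed, action_log_prob - correction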
......@@ -166,11 +166,11 @@ def get_config():
default="check",
help="an identifier to distinguish different experiment.",
)
parser.add_argument("--seed", type=int, default=1, help="Random seed for numpy/torch")
parser.add_argument("--seed", type=int, default=10, help="Random seed for numpy/torch")
parser.add_argument(
"--cuda",
action="store_false",
default=True,
default=False,
help="by default True, will use GPU to train; or else will use CPU;",
)
parser.add_argument(
......@@ -182,13 +182,13 @@ def get_config():
parser.add_argument(
"--n_training_threads",
type=int,
default=2,
default=10,
help="Number of torch threads for training",
)
parser.add_argument(
"--n_rollout_threads",
type=int,
default=5,
default=1,
help="Number of parallel envs for training rollouts",
)
parser.add_argument(
......@@ -206,7 +206,7 @@ def get_config():
parser.add_argument(
"--num_env_steps",
type=int,
default=10e6,
default=2*10e6,
help="Number of environment steps to train (default: 10e6)",
)
parser.add_argument(
......@@ -256,13 +256,13 @@ def get_config():
parser.add_argument(
"--hidden_size",
type=int,
default=64,
default=128,
help="Dimension of hidden layers for actor/critic networks",
)
parser.add_argument(
"--layer_N",
type=int,
default=1,
default=3,
help="Number of layers for actor/critic networks",
)
parser.add_argument("--use_ReLU", action="store_false", default=True, help="Whether to use ReLU")
......@@ -314,11 +314,11 @@ def get_config():
)
# optimizer parameters
parser.add_argument("--lr", type=float, default=5e-4, help="learning rate (default: 5e-4)")
parser.add_argument("--lr", type=float, default=1e-5, help="learning rate (default: 5e-4)")
parser.add_argument(
"--critic_lr",
type=float,
default=5e-4,
default=1e-4,
help="critic learning rate (default: 5e-4)",
)
parser.add_argument(
......
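The defaults above are edited in place; note that 10e6 is 1e7, so 2*10e6 means 2e7 environment steps, and --cuda is registered with action="store_false", so passing the flag disables the GPU. A sketch of reproducing the same run configuration at parse time instead of editing config.py, assuming get_config() returns the parser built above:

from config import get_config  # module name assumed from the file being edited

parser = get_config()
all_args = parser.parse_args([
    "--seed", "10",
    "--n_training_threads", "10",
    "--n_rollout_threads", "1",
    "--num_env_steps", "20000000",   # 2 * 10e6
    "--hidden_size", "128",
    "--layer_N", "3",
    "--lr", "1e-5",
    "--critic_lr", "1e-4",
    "--cuda",                        # store_false: passing the flag turns CUDA off
])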
import math
import random
import numpy as np
class AerialVehicle(object):
def __init__(self, CustomerNum, TotalContentNum, ContentSize, Hight,
EnvA, EnvB, Frequency, Bandwidth, TransmitPower, SpeedOfLight,
AvgLOS, AvgNLOS, ConstrainLOS, Noise, MaxPlaceX, MinPlaceX,
MaxPlaceY, MinPlaceY):
self.CustomerNum = CustomerNum # number of users in the area
self.TotalContentNum = TotalContentNum # total number of content items
self.ContentSize = ContentSize # size of a single content item
self.K = 0 # number of slices per content item
self.PlaceX = 0 # horizontal x coordinate of the UAV
self.PlaceY = 0 # horizontal y coordinate of the UAV
self.PlaceH = Hight # UAV flight altitude H
self.EnvA = EnvA # environment parameter a
self.EnvB = EnvB # environment parameter b
self.Frequency = Frequency # UAV carrier frequency
self.SpeedOfLight = SpeedOfLight # speed of light
self.AvgLOS = AvgLOS # average excess path loss for LOS links
self.AvgNLOS = AvgNLOS # average excess path loss for NLOS links
self.ConstrainLOS = ConstrainLOS # minimum required LOS probability
self.Bandwidth = Bandwidth # total UAV bandwidth
self.TransmitPower = TransmitPower # total UAV transmit power
self.Noise = Noise # Gaussian white noise power
self.ServiceNum = 0 # number of users served in this round
self.ServiceList = [] # users served in this round
self.MaxPlaceX = MaxPlaceX # service area boundaries
self.MinPlaceX = MinPlaceX
self.MaxPlaceY = MaxPlaceY
self.MinPlaceY = MinPlaceY
self.TotalContentNum = TotalContentNum # duplicate of the assignment above
self.ServiceRadius = self.getRadiusOfUAV() # UAV service radius
print("RadiusOfUAV " + str(self.ServiceRadius))
self.CachedContentList = [0 for _ in range(TotalContentNum)] # UAV cache indicator list F
def cacheContent(self, i):
self.CachedContentList[i] = 1
def clearCacheContent(self):
self.CachedContentList = [0 for _ in range(self.TotalContentNum)]
def whetherCacheContent(self, i):
return self.CachedContentList[i] == 1
def getCacheList(self):
return self.CachedContentList
def setK(self, K):
self.K = K
def getPlaceX(self):
return self.PlaceX
def getPlaceY(self):
return self.PlaceY
# Move the UAV to a new position
def moveTo(self,x, y):
self.PlaceX = x
self.PlaceY = y
def moveToByDist(self, dist, direction):
x = self.PlaceX + dist * math.cos(direction)
y = self.PlaceY + dist * math.sin(direction)
if self.isBeyond(x, y):
return 5
self.PlaceX = x
self.PlaceY = y
return self.InPlace()
def isBeyond(self, x, y):
if x > self.MaxPlaceX or x < self.MinPlaceX or y > self.MaxPlaceY or y < self.MinPlaceY:
return True
return False
# Keep the UAV inside the allowed flight area (returns a penalty)
def InPlace(self):
punish = 0
if self.PlaceX > self.MaxPlaceX:
# self.PlaceX = self.MaxPlaceX
punish+=3
if self.PlaceX > self.MaxPlaceX - 5:
punish += 1
if self.PlaceX < self.MinPlaceX:
# self.PlaceX = self.MinPlaceX
punish += 3
if self.PlaceX < self.MinPlaceX + 5:
punish += 1
if self.PlaceY > self.MaxPlaceY:
# self.PlaceY = self.MaxPlaceY
punish += 3
if self.PlaceY > self.MaxPlaceY - 5:
punish += 1
if self.PlaceY < self.MinPlaceY:
# self.PlaceY = self.MinPlaceY
punish += 3
if self.PlaceY < self.MinPlaceY + 5:
punish += 1
return punish
# Compute the horizontal distance to (x, y)
def getDist(self,x, y):
return math.sqrt(math.pow(self.PlaceX - x, 2) + math.pow(self.PlaceY - y, 2))
# Compute the LOS probability for a user at (x, y)
def getPossOfLos(self, x, y):
var1 = 180 / math.pi * math.atan(self.PlaceH / self.getDist(x, y))
var2 = - self.EnvB * (var1 - self.EnvA)
var3 = 1 + self.EnvA * math.exp(var2)
return 1 / var3
# Compute the LOS probability given a horizontal distance
def getPossOfLosByDist(self, dist):
var1 = 180 / math.pi * math.atan(self.PlaceH / dist)
var2 = - self.EnvB * (var1 - self.EnvA)
var3 = 1 + self.EnvA * math.exp(var2)
return 1 / var3
# Compute the common (free-space) part of the path loss
def getNormalLoss(self, x, y):
var1 = math.log10(4 * math.pi * self.Frequency * self.getDist(x, y) / self.SpeedOfLight)
if var1 <= 0:
return 0
return 20 * var1
# Compute the LOS path loss
def getLOS(self, x, y):
return self.getNormalLoss(x, y) + self.AvgLOS
# Compute the NLOS path loss
def getNLOS(self, x, y):
return self.getNormalLoss(x, y) + self.AvgNLOS
# Compute the average path loss (LOS/NLOS mixture)
def getAvgLOS(self, x, y):
return self.getLOS(x, y) * self.getPossOfLos(x, y) + self.getNLOS(x, y) * (1 - self.getPossOfLos(x, y))
# Compute the UAV service radius
def getRadiusOfUAV(self):
var1 = math.log10((1 - self.ConstrainLOS) / (self.EnvA * self.ConstrainLOS))
var2 = math.tan(self.EnvA - (1 / self.EnvB) * var1)
return self.PlaceH / var2
# Allocate UAV transmit power
def allocTransmitPower(self):
return self.TransmitPower
# Allocate UAV bandwidth (split evenly across served users)
def allocBandWidth(self):
return self.Bandwidth / self.ServiceNum
# Check whether a user at (x, y) can be served by this UAV
def tryGetService(self, x, y, requestIndex, UserIndex):
print("tryGetService")
if self.getDist(x, y) > self.ServiceRadius:
# horizontal distance exceeds the service radius
return False
elif not self.whetherCacheContent(requestIndex):
# this UAV has not cached the requested content
print("nonononono")
return False
else:
self.ServiceNum = self.ServiceNum + 1
self.ServiceList.append(UserIndex)
return True
def addService(self, UserIndex):
self.ServiceNum = self.ServiceNum + 1
self.ServiceList.append(UserIndex)
# Compute the downlink transmission rate
def getTransSpeed(self, x, y):
var1 = self.allocTransmitPower() / (self.Noise * np.power(10, self.getAvgLOS(x, y) / 10))
return self.allocBandWidth() * math.log2(var1 + 1) / (1024 * 1024 * 8)
# Compute the size of one slice
def getSizeOFSlice(self):
return self.ContentSize / self.K
# Check whether a slice can be fetched from this UAV in time
def tryGetContent(self, x, y, time, UserIndex, transSpeedBaseLine, requestIndex):
if self.CachedContentList[requestIndex] != 1:
return -1
if UserIndex not in self.ServiceList:
return -1
speed = self.getTransSpeed(x, y)
if speed < transSpeedBaseLine:
return False
if (self.getSizeOFSlice() / speed) > time:
return False
return True
def clearServiceList(self):
self.ServiceNum = 0
return self.ServiceList.clear()
####################################leader
def setLeader(self, Role):
self.Role = Role
def isLeader(self):
return self.Role
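getPossOfLos above implements the common air-to-ground model P_LOS = 1 / (1 + a * exp(-b * (theta - a))), with theta the elevation angle in degrees, and getAvgLOS mixes the LOS and NLOS path losses by that probability. A usage sketch of the class follows; every numeric value is an illustrative assumption, not taken from the commit:

uav = AerialVehicle(
    CustomerNum=5, TotalContentNum=10, ContentSize=50, Hight=100,
    EnvA=9.61, EnvB=0.16, Frequency=2e9, Bandwidth=1e6, TransmitPower=1.0,
    SpeedOfLight=3e8, AvgLOS=1.0, AvgNLOS=20.0, ConstrainLOS=0.8, Noise=1e-13,
    MaxPlaceX=100, MinPlaceX=-100, MaxPlaceY=100, MinPlaceY=-100,
)
uav.moveTo(0, 0)
uav.setK(4)              # each content item is split into 4 slices
uav.cacheContent(3)      # mark content item 3 as cached
if uav.tryGetService(30, 40, requestIndex=3, UserIndex=0):
    # allocBandWidth divides the total bandwidth by ServiceNum, so at least one
    # user must be registered before calling getTransSpeed
    print(uav.getTransSpeed(30, 40))   # downlink rate, converted to MB/s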
import random
from numpy import random as r
class UserForUAV(object):
def __init__(self, UserId, TotalContentNum, FalvorNum, K):
self.UserId = UserId # user index
self.TotalContentNum = TotalContentNum # total number of content items
self.FalvorNum = FalvorNum # number of content items this user favours
self.PlaceX = 0 # user horizontal x position
self.PlaceY = 0 # user horizontal y position
self.K = K
self.RequestIndex = 0
self.UAVList = []
if FalvorNum > TotalContentNum:
self.FalvorNum = TotalContentNum
self.FalvorList = random.sample(range(1, 9), FalvorNum) # content items this user favours (indices drawn from the fixed range 1..8)
# Simulate a user request following a Zipf distribution
def genRequestIndex(self):
x = r.zipf(a=2, size=1)[0]
x = x - 1
if x >= len(self.FalvorList):
self.RequestIndex = self.FalvorList[len(self.FalvorList)-1]
else:
self.RequestIndex = self.FalvorList[x]
return self.RequestIndex
def getRequestIndex(self):
return self.RequestIndex
# Move the user to a new position
def moveTo(self,x, y):
self.PlaceX = x
self.PlaceY = y
def setUAVList(self, UAVList):
self.UAVList = UAVList
def tryGetservice(self):
for i in range(0, len(self.UAVList)):
self.UAVList[i].tryGetService(self.PlaceX, self.PlaceY, self.RequestIndex, self.UserId)
def tryGetCache(self, transSpeedBaseLine, time):
vehiclesAbleToTrans = 0
self.genRequestIndex()
for i in range(0, len(self.UAVList)):
if self.UAVList[i].tryGetContent(self.PlaceX, self.PlaceY, time, self.UserId, transSpeedBaseLine, self.RequestIndex):
vehiclesAbleToTrans = vehiclesAbleToTrans + 1
return vehiclesAbleToTrans, self.RequestIndex
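A matching usage sketch for UserForUAV, reusing the uav object from the AerialVehicle sketch above; the numbers are illustrative assumptions. Note that tryGetCache regenerates the request internally before probing the UAV list:

user = UserForUAV(UserId=0, TotalContentNum=10, FalvorNum=5, K=4)
user.moveTo(30, 40)
user.setUAVList([uav])
user.genRequestIndex()     # Zipf(a=2) draw, clamped to the user's favourite items
user.tryGetservice()       # register with every UAV that covers the user and caches the item
reachable, request = user.tryGetCache(transSpeedBaseLine=0.5, time=2.0)
print(reachable, request)  # UAVs able to deliver one slice within the time budget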
......@@ -2,6 +2,7 @@ import gym
from gym import spaces
import numpy as np
from envs.env_core import EnvCore
from envs.env_discrete import MultiDiscrete
class ContinuousActionEnv(object):
......@@ -12,10 +13,11 @@ class ContinuousActionEnv(object):
def __init__(self):
self.env = EnvCore()
self.num_agent = self.env.agent_num
self.num_agent = self.env.AerialVehiclesNum + 2
self.signal_obs_dim = self.env.obs_dim
self.signal_action_dim = self.env.action_dim
print( self.num_agent)
# if true, action is a number 0...N, otherwise action is a one-hot N-dimensional vector
self.discrete_action_input = False
......@@ -29,7 +31,8 @@ class ContinuousActionEnv(object):
share_obs_dim = 0
total_action_space = []
for agent in range(self.num_agent):
for agent in range(self.env.AerialVehiclesNum):
# physical action space
u_action_space = spaces.Box(
low=-np.inf,
......@@ -54,7 +57,60 @@ class ContinuousActionEnv(object):
dtype=np.float32,
)
) # [-inf,inf]
print('section1')
mu_action = []
for i in range(self.env.CustomerNum * self.env.AerialVehiclesNum):
mu_action.append([0, 1])
u_action_space = MultiDiscrete(mu_action)
if self.movable:
total_action_space.append(u_action_space)
# total action space
self.action_space.append(u_action_space)
# observation space
share_obs_dim += self.env.getDimension2()
self.observation_space.append(
spaces.Box(
low=-np.inf,
high=+np.inf,
shape=(self.env.getDimension2(),),
dtype=np.float32,
)
)
print('section2')
mu_action = []
mu_action.append([0, self.env.ContentNum - 1])
u_action_space = spaces.Discrete(self.env.ContentNum)
if self.movable:
total_action_space.append(u_action_space)
# total action space
self.action_space.append(u_action_space)
# observation space
share_obs_dim += (self.env.CacheNum + self.env.CustomerNum)
self.observation_space.append(
spaces.Box(
low=-np.inf,
high=+np.inf,
shape=(self.env.CacheNum + self.env.CustomerNum,),
dtype=np.float32,
)
)
print("ons")
print(share_obs_dim)
self.share_observation_space = [
spaces.Box(
low=-np.inf, high=+np.inf, shape=(share_obs_dim,), dtype=np.float32
......@@ -62,6 +118,7 @@ class ContinuousActionEnv(object):
for _ in range(self.num_agent)
]
def step(self, actions):
"""
Assumed dimensions of the input actions:
......@@ -75,11 +132,11 @@ class ContinuousActionEnv(object):
results = self.env.step(actions)
obs, rews, dones, infos = results
return np.stack(obs), np.stack(rews), np.stack(dones), infos
return obs, np.stack(rews), np.stack(dones), infos
def reset(self):
obs = self.env.reset()
return np.stack(obs)
return obs
def close(self):
pass
......
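The hunk above gives the first AerialVehiclesNum agents a Box movement action, one agent a MultiDiscrete action with a [0, 1] range per (customer, UAV) pair, and one agent a Discrete caching action. A sketch, offered as an assumption about the intent, of decoding the flat MultiDiscrete action back into a customer-by-UAV association matrix:

import numpy as np

def decode_association(flat_action, customer_num, uav_num):
    # One {0, 1} entry per (customer, UAV) pair, in the same order the
    # MultiDiscrete ranges were appended above.
    return np.asarray(flat_action, dtype=np.int64).reshape(customer_num, uav_num)

# e.g. decode_association(action, env.CustomerNum, env.AerialVehiclesNum)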
......@@ -47,7 +47,7 @@ class DummyVecEnv():
def reset(self):
obs = [env.reset() for env in self.envs] # [env_num, agent_num, obs_dim]
return np.array(obs)
return np.array(obs, dtype=object)
def close(self):
for env in self.envs:
......
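reset() now builds the observation batch with dtype=object because the agents no longer share one observation dimension, and a rectangular float array cannot hold ragged per-agent vectors. A minimal demonstration of the difference:

import numpy as np

ragged = [[np.zeros(6), np.zeros(6), np.zeros(4)]]   # one env, three agents, mixed dims
obs = np.array(ragged, dtype=object)                  # shape (1, 3); elements stay ragged
# np.array(ragged) without dtype=object raises a ValueError on recent NumPy versions
print(obs.shape)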
......@@ -26,7 +26,6 @@ class EnvRunner(Runner):
if self.use_linear_lr_decay:
for agent_id in range(self.num_agents):
self.trainer[agent_id].policy.lr_decay(episode, episodes)
for step in range(self.episode_length):
# Sample actions
(
......@@ -63,15 +62,17 @@ class EnvRunner(Runner):
# post process
total_num_steps = (episode + 1) * self.episode_length * self.n_rollout_threads
self.envs.reset()
# save model
if episode % self.save_interval == 0 or episode == episodes - 1:
self.save()
#if episode % self.save_interval == 0 or episode == episodes - 1:
# self.save()
# log information
if episode % self.log_interval == 0:
end = time.time()
print(
"\n Scenario {} Algo {} Exp {} updates {}/{} episodes, total num timesteps {}/{}, FPS {}.\n".format(
"\n Scenario2 {} Algo {} Exp {} updates {}/{} episodes, total num timesteps {}/{}, FPS {}.\n".format(
self.all_args.scenario_name,
self.algorithm_name,
self.experiment_name,
......@@ -111,6 +112,7 @@ class EnvRunner(Runner):
share_obs.append(list(chain(*o)))
share_obs = np.array(share_obs) # shape = [env_num, agent_num * obs_dim]
for agent_id in range(self.num_agents):
if not self.use_centralized_V:
share_obs = np.array(list(obs[:, agent_id]))
......@@ -137,6 +139,7 @@ class EnvRunner(Runner):
self.buffer[agent_id].rnn_states_critic[step],
self.buffer[agent_id].masks[step],
)
# [agents, envs, dim]
values.append(_t2n(value))
action = _t2n(action)
......@@ -171,8 +174,8 @@ class EnvRunner(Runner):
actions_env.append(one_hot_action_env)
values = np.array(values).transpose(1, 0, 2)
actions = np.array(actions).transpose(1, 0, 2)
action_log_probs = np.array(action_log_probs).transpose(1, 0, 2)
# actions = np.array(actions).transpose(1, 0, 2)
# action_log_probs = np.array(action_log_probs).transpose(1, 0, 2)
rnn_states = np.array(rnn_states).transpose(1, 0, 2, 3)
rnn_states_critic = np.array(rnn_states_critic).transpose(1, 0, 2, 3)
......@@ -223,8 +226,8 @@ class EnvRunner(Runner):
np.array(list(obs[:, agent_id])),
rnn_states[:, agent_id],
rnn_states_critic[:, agent_id],
actions[:, agent_id],
action_log_probs[:, agent_id],
actions[agent_id],
action_log_probs[agent_id],
values[:, agent_id],
rewards[:, agent_id],
masks[:, agent_id],
......
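With the transposes of actions and action_log_probs commented out above, those arrays stay in [agent, env, dim] order, so the insert step indexes actions[agent_id] instead of actions[:, agent_id]. A small shape check of the two equivalent slicings, with illustrative sizes:

import numpy as np

actions = np.zeros((3, 4, 2))                              # [agents, envs, dim]
assert actions[1].shape == (4, 2)                          # per-agent slice, no transpose
assert actions.transpose(1, 0, 2)[:, 1].shape == (4, 2)    # same slice after transposing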
......@@ -77,7 +77,7 @@ class EnvRunner(Runner):
if episode % self.log_interval == 0:
end = time.time()
print(
"\n Scenario {} Algo {} Exp {} updates {}/{} episodes, total num timesteps {}/{}, FPS {}.\n".format(
"\n Sce2nario {} Algo {} Exp {} updates {}/{} episodes, total num timesteps {}/{}, FPS {}.\n".format(
self.all_args.scenario_name,
self.algorithm_name,
self.experiment_name,
......@@ -88,17 +88,7 @@ class EnvRunner(Runner):
int(total_num_steps / (end - start)),
)
)
# if self.env_name == "MPE":
# env_infos = {}
# for agent_id in range(self.num_agents):
# idv_rews = []
# for info in infos:
# if 'individual_reward' in info[agent_id].keys():
# idv_rews.append(info[agent_id]['individual_reward'])
# agent_k = 'agent%i/individual_rewards' % agent_id
# env_infos[agent_k] = idv_rews
print('11111')
train_infos["average_episode_rewards"] = np.mean(self.buffer.rewards) * self.episode_length
print("average episode rewards is {}".format(train_infos["average_episode_rewards"]))
self.log_train(train_infos, total_num_steps)
......
......@@ -67,9 +67,9 @@ def make_eval_env(all_args):
def parse_args(args, parser):
parser.add_argument("--scenario_name", type=str, default="MyEnv", help="Which scenario to run on")
parser.add_argument("--scenario_name", type=str, default="MPE", help="Which scenario to run on")
parser.add_argument("--num_landmarks", type=int, default=3)
parser.add_argument("--num_agents", type=int, default=2, help="number of players")
parser.add_argument("--num_agents", type=int, default=9, help="number of players")
all_args = parser.parse_known_args(args)[0]
......