Commit 90258d12 authored by hzq

add continuous action space

parent 04fa328f
......@@ -22,7 +22,7 @@ The original MAPPO code wraps the environment in an overly complex way; this project wraps the environment directly
## Usage
- The environment part is an empty implementation; the environment implementation lives in the file `light_mappo/envs/env_wrappers.py`: [Code](https://github.com/tinyzqh/light_mappo/blob/main/envs/env_wrappers.py)
- The environment part is an empty implementation; the environment implementation lives in the file `light_mappo/envs/env_core.py`: [Code](https://github.com/tinyzqh/light_mappo/blob/main/envs/env_wrappers.py)
```python
class Env(object):
......@@ -63,7 +63,7 @@ class Env(object):
```
Only this part of the code needs to be written to plug seamlessly into MAPPO. This is the initial version; this part will be split out separately later.
Only this part of the code needs to be written to plug seamlessly into MAPPO. Building on env_core.py, two separate files, env_discrete.py and env_continuous.py, have been split out to wrap the discrete and continuous action spaces. In algorithms/utils/act.py, the `elif self.continuous_action:` branch handles the continuous action space, as does the `# TODO` section in runner/shared/env_runner.py (adapt it there to the form your own environment needs). A rough sketch of such a continuous wrapper is given below.
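The continuous wrapper itself is not shown in this commit, so the sketch below is only an assumption-level illustration modeled on the DiscreteActionEnv wrapper further down in this diff; the class name `ContinuousActionEnv` and the module `envs.env_continuous` are taken from the code in train.py, while the attribute names and the [-1, 1] action range are assumptions.
```python
# Illustrative sketch only -- the real envs/env_continuous.py is not part of this diff.
import numpy as np
from gym import spaces
from envs.env_core import EnvCore


class ContinuousActionEnv(object):
    """Wrapper exposing a Box (continuous) action space per agent."""

    def __init__(self):
        self.env = EnvCore()
        self.num_agent = self.env.agent_num
        self.signal_obs_dim = self.env.obs_dim
        self.signal_action_dim = self.env.action_dim

        self.action_space = []
        self.observation_space = []
        share_obs_dim = 0
        for _ in range(self.num_agent):
            # continuous actions in [-1, 1] for each action dimension (assumed range)
            self.action_space.append(
                spaces.Box(low=-1.0, high=+1.0, shape=(self.signal_action_dim,), dtype=np.float32))
            self.observation_space.append(
                spaces.Box(low=-np.inf, high=+np.inf, shape=(self.signal_obs_dim,), dtype=np.float32))
            share_obs_dim += self.signal_obs_dim
        self.share_observation_space = [
            spaces.Box(low=-np.inf, high=+np.inf, shape=(share_obs_dim,), dtype=np.float32)
            for _ in range(self.num_agent)]

    def step(self, actions):
        obs, rews, dones, infos = self.env.step(actions)
        return np.stack(obs), np.stack(rews), np.stack(dones), infos

    def reset(self):
        return np.stack(self.env.reset())

    def seed(self, seed):
        pass
```
The key difference from the discrete wrapper is that each agent's action space is a `spaces.Box`, which is what makes `ACTLayer` take the `elif self.continuous_action:` branch and use a Gaussian action head.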
## Related Efforts
......
......@@ -14,11 +14,13 @@ class ACTLayer(nn.Module):
super(ACTLayer, self).__init__()
self.mixed_action = False
self.multi_discrete = False
self.continuous_action = False
if action_space.__class__.__name__ == "Discrete":
action_dim = action_space.n
self.action_out = Categorical(inputs_dim, action_dim, use_orthogonal, gain)
elif action_space.__class__.__name__ == "Box":
self.continuous_action = True
action_dim = action_space.shape[0]
self.action_out = DiagGaussian(inputs_dim, action_dim, use_orthogonal, gain)
elif action_space.__class__.__name__ == "MultiBinary":
......@@ -49,7 +51,7 @@ class ACTLayer(nn.Module):
:return actions: (torch.Tensor) actions to take.
:return action_log_probs: (torch.Tensor) log probabilities of taken actions.
"""
if self.mixed_action :
if self.mixed_action:
actions = []
action_log_probs = []
for action_out in self.action_outs:
......@@ -74,7 +76,16 @@ class ACTLayer(nn.Module):
actions = torch.cat(actions, -1)
action_log_probs = torch.cat(action_log_probs, -1)
elif self.continuous_action:
# continuous (Box) action space: a single diagonal-Gaussian head; take its mode when
# acting deterministically, otherwise sample, then evaluate the log-probabilities
action_logit = self.action_out(x)
actions = action_logit.mode() if deterministic else action_logit.sample()
action_log_probs = action_logit.log_probs(actions)
else:
action_logits = self.action_out(x, available_actions)
actions = action_logits.mode() if deterministic else action_logits.sample()
......@@ -150,7 +161,28 @@ class ACTLayer(nn.Module):
action_log_probs = torch.cat(action_log_probs, -1) # ! could be wrong
dist_entropy = torch.tensor(dist_entropy).mean()
elif self.continuous_action:
# continuous (Box) action space: evaluate log-probabilities and entropy of the given actions
action_log_probs = []
dist_entropy = []
action_logit = self.action_out(x)
action_log_probs.append(action_logit.log_probs(action))
if active_masks is not None:
if len(action_logit.entropy().shape) == len(active_masks.shape):
dist_entropy.append((action_logit.entropy() * active_masks).sum() / active_masks.sum())
else:
dist_entropy.append(
(action_logit.entropy() * active_masks.squeeze(-1)).sum() / active_masks.sum())
else:
dist_entropy.append(action_logit.entropy().mean())
action_log_probs = torch.sum(torch.cat(action_log_probs, -1), -1, keepdim=True)
dist_entropy = dist_entropy[0]  # only one action head for a Box space
else:
action_logits = self.action_out(x, available_actions)
action_log_probs = action_logits.log_probs(action)
......
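For reference, here is a minimal sketch of what the Box branch above wires up conceptually: a diagonal-Gaussian head whose log-probabilities are summed over action dimensions. It is illustrative only and is not the repo's actual `DiagGaussian` class; the name `DiagGaussianSketch` and its internals are assumptions.
```python
# Illustrative stand-in for the Gaussian head selected when action_space is a Box.
import torch
import torch.nn as nn


class DiagGaussianSketch(nn.Module):
    def __init__(self, inputs_dim, action_dim):
        super().__init__()
        self.fc_mean = nn.Linear(inputs_dim, action_dim)
        self.log_std = nn.Parameter(torch.zeros(action_dim))  # state-independent std

    def forward(self, x):
        # returns a Normal distribution over each action dimension
        return torch.distributions.Normal(self.fc_mean(x), self.log_std.exp())


# usage sketch: take the mean as the "mode" or sample, then sum log-probs over action dims
# dist = head(features)
# action = dist.mean if deterministic else dist.sample()
# log_prob = dist.log_prob(action).sum(-1, keepdim=True)
```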
......@@ -176,7 +176,6 @@ def get_config():
parser.add_argument("--num_env_steps", type=int, default=10e6,
help='Number of environment steps to train (default: 10e6)')
parser.add_argument("--user_name", type=str, default='marl',help="[for wandb usage], to specify user's name for simply collecting training data.")
parser.add_argument("--use_wandb", action='store_false', default=False, help="[for wandb usage], by default True, will log date to wandb server. or else will use tensorboard to log data.")
# env parameters
parser.add_argument("--env_name", type=str, default='MyEnv', help="specify the name of environment")
......
"""
# @Time : 2021/7/2 5:22 PM
# @Author : hezhiqiang01
# @Email : hezhiqiang01@baidu.com
# @File : env.py
"""
import numpy as np
class Env(object):
"""
# Agents in the environment
"""
def __init__(self, i):
self.agent_num = 2 # number of agents (small aircraft); set to 2 here
self.obs_dim = 14 # observation dimension of each agent
self.action_dim = 5 # action dimension of each agent; assumed to be 5 here
def reset(self):
"""
# When self.agent_num is set to 2 agents, the return value is a list in which each element is an observation array of shape = (self.obs_dim, )
"""
sub_agent_obs = []
for i in range(self.agent_num):
sub_obs = np.random.random(size=(14, ))
sub_agent_obs.append(sub_obs)
return sub_agent_obs
def step(self, actions):
"""
# When self.agent_num is set to 2 agents, the input actions is a 2-element list in which each element is an action array of shape = (self.action_dim, )
# With the default parameters, the input is a list of two elements; since the action dimension is 5, each element has shape = (5, )
"""
sub_agent_obs = []
sub_agent_reward = []
sub_agent_done = []
sub_agent_info = []
for i in range(self.agent_num):
sub_agent_obs.append(np.random.random(size=(14,)))
sub_agent_reward.append([np.random.rand()])
sub_agent_done.append(False)
sub_agent_info.append({})
return [sub_agent_obs, sub_agent_reward, sub_agent_done, sub_agent_info]
\ No newline at end of file
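A quick, assumption-level driver loop for the per-agent list interface above (random actions; not part of the commit, purely to show the expected shapes):
```python
# Illustrative usage of the placeholder Env defined above; shapes only, no real dynamics.
import numpy as np
from envs.env import Env

env = Env(0)
obs = env.reset()  # list of per-agent observations, each of shape (obs_dim,)
for _ in range(10):
    # one random action vector per agent, each of shape (action_dim,)
    actions = [np.random.random(size=(env.action_dim,)) for _ in range(env.agent_num)]
    obs, rewards, dones, infos = env.step(actions)
```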
"""
# @Time : 2021/7/2 5:22 PM
# @Author : hezhiqiang
# @Email : tinyzqh@163.com
# @File : env_discrete.py
"""
import gym
from gym import spaces
import numpy as np
from envs.env_core import EnvCore
class DiscreteActionEnv(object):
"""对于离散动作环境的封装"""
def __init__(self):
self.env = EnvCore()
self.num_agent = self.env.agent_num
self.signal_obs_dim = self.env.obs_dim
self.signal_action_dim = self.env.action_dim
# if true, action is a number 0...N, otherwise action is a one-hot N-dimensional vector
self.discrete_action_input = False
self.movable = True
# configure spaces
self.action_space = []
self.observation_space = []
self.share_observation_space = []
share_obs_dim = 0
for agent in range(self.num_agent):
total_action_space = []
# physical action space
u_action_space = spaces.Discrete(self.signal_action_dim) # 5 discrete actions
if self.movable:
total_action_space.append(u_action_space)
# total action space
if len(total_action_space) > 1:
# all action spaces are discrete, so simplify to MultiDiscrete action space
if all([isinstance(act_space, spaces.Discrete) for act_space in total_action_space]):
act_space = MultiDiscrete([[0, act_space.n - 1] for act_space in total_action_space])
else:
act_space = spaces.Tuple(total_action_space)
self.action_space.append(act_space)
else:
self.action_space.append(total_action_space[0])
# observation space
share_obs_dim += self.signal_obs_dim
self.observation_space.append(spaces.Box(low=-np.inf, high=+np.inf, shape=(self.signal_obs_dim,),
dtype=np.float32)) # [-inf,inf]
self.share_observation_space = [spaces.Box(low=-np.inf, high=+np.inf, shape=(share_obs_dim,),
dtype=np.float32) for _ in range(self.num_agent)]
def step(self, actions):
"""
Assumed dimensions of the input actions:
# actions shape = (5, 2, 5)
# 5 parallel (threaded) envs, each with 2 agents; each agent's action is a one-hot 5-dimensional encoding
"""
results = self.env.step(actions)
obs, rews, dones, infos = results
return np.stack(obs), np.stack(rews), np.stack(dones), infos
def reset(self):
obs = self.env.reset()
return np.stack(obs)
def close(self):
pass
def render(self, mode="rgb_array"):
pass
def seed(self, seed):
pass
class MultiDiscrete(gym.Space):
"""
- The multi-discrete action space consists of a series of discrete action spaces with different parameters
- It can be adapted to both a Discrete action space or a continuous (Box) action space
- It is useful to represent game controllers or keyboards where each key can be represented as a discrete action space
- It is parametrized by passing an array of arrays containing [min, max] for each discrete action space
where the discrete action space can take any integers from `min` to `max` (both inclusive)
Note: A value of 0 always needs to represent the NOOP action.
e.g. Nintendo Game Controller
- Can be conceptualized as 3 discrete action spaces:
1) Arrow Keys: Discrete 5 - NOOP[0], UP[1], RIGHT[2], DOWN[3], LEFT[4] - params: min: 0, max: 4
2) Button A: Discrete 2 - NOOP[0], Pressed[1] - params: min: 0, max: 1
3) Button B: Discrete 2 - NOOP[0], Pressed[1] - params: min: 0, max: 1
- Can be initialized as
MultiDiscrete([ [0,4], [0,1], [0,1] ])
"""
def __init__(self, array_of_param_array):
super().__init__()
self.low = np.array([x[0] for x in array_of_param_array])
self.high = np.array([x[1] for x in array_of_param_array])
self.num_discrete_space = self.low.shape[0]
self.n = np.sum(self.high) + 2
def sample(self):
""" Returns a array with one sample from each discrete action space """
# For each row: round(random .* (max - min) + min, 0)
random_array = np.random.rand(self.num_discrete_space)
return [int(x) for x in np.floor(np.multiply((self.high - self.low + 1.), random_array) + self.low)]
def contains(self, x):
return len(x) == self.num_discrete_space and (np.array(x) >= self.low).all() and (
np.array(x) <= self.high).all()
@property
def shape(self):
return self.num_discrete_space
def __repr__(self):
return "MultiDiscrete" + str(self.num_discrete_space)
def __eq__(self, other):
return np.array_equal(self.low, other.low) and np.array_equal(self.high, other.high)
if __name__ == "__main__":
DiscreteActionEnv().step(actions=None)
\ No newline at end of file
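A minimal, assumption-level usage sketch for the wrapper above, feeding one-hot actions in the per-agent layout described in the `step` docstring (the underlying EnvCore is random, so this only exercises shapes):
```python
# Illustrative only: drive a single DiscreteActionEnv instance with one-hot actions.
import numpy as np
from envs.env_discrete import DiscreteActionEnv

env = DiscreteActionEnv()
obs = env.reset()          # stacked per-agent observations, shape (num_agent, obs_dim)
n = env.action_space[0].n  # 5 discrete actions per agent
# one one-hot action per agent: shape (num_agent, n)
actions = np.eye(n)[np.random.randint(n, size=env.num_agent)]
obs, rewards, dones, infos = env.step(actions)
```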
......@@ -7,214 +7,294 @@ Modified from OpenAI Baselines code to work with multi-agent envs
"""
import numpy as np
import gym
from gym import spaces
from envs.env import Env
import torch
from multiprocessing import Process, Pipe
from abc import ABC, abstractmethod
class MultiDiscrete(gym.Space):
def tile_images(img_nhwc):
"""
- The multi-discrete action space consists of a series of discrete action spaces with different parameters
- It can be adapted to both a Discrete action space or a continuous (Box) action space
- It is useful to represent game controllers or keyboards where each key can be represented as a discrete action space
- It is parametrized by passing an array of arrays containing [min, max] for each discrete action space
where the discrete action space can take any integers from `min` to `max` (both inclusive)
Note: A value of 0 always needs to represent the NOOP action.
e.g. Nintendo Game Controller
- Can be conceptualized as 3 discrete action spaces:
1) Arrow Keys: Discrete 5 - NOOP[0], UP[1], RIGHT[2], DOWN[3], LEFT[4] - params: min: 0, max: 4
2) Button A: Discrete 2 - NOOP[0], Pressed[1] - params: min: 0, max: 1
3) Button B: Discrete 2 - NOOP[0], Pressed[1] - params: min: 0, max: 1
- Can be initialized as
MultiDiscrete([ [0,4], [0,1], [0,1] ])
Tile N images into one big PxQ image
(P,Q) are chosen to be as close as possible, and if N
is square, then P=Q.
input: img_nhwc, list or array of images, ndim=4 once turned into array
n = batch index, h = height, w = width, c = channel
returns:
bigim_HWc, ndarray with ndim=3
"""
img_nhwc = np.asarray(img_nhwc)
N, h, w, c = img_nhwc.shape
H = int(np.ceil(np.sqrt(N)))
W = int(np.ceil(float(N)/H))
img_nhwc = np.array(list(img_nhwc) + [img_nhwc[0]*0 for _ in range(N, H*W)])
img_HWhwc = img_nhwc.reshape(H, W, h, w, c)
img_HhWwc = img_HWhwc.transpose(0, 2, 1, 3, 4)
img_Hh_Ww_c = img_HhWwc.reshape(H*h, W*w, c)
return img_Hh_Ww_c
class CloudpickleWrapper(object):
"""
Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle)
"""
def __init__(self, array_of_param_array):
super().__init__()
self.low = np.array([x[0] for x in array_of_param_array])
self.high = np.array([x[1] for x in array_of_param_array])
self.num_discrete_space = self.low.shape[0]
self.n = np.sum(self.high) + 2
def sample(self):
""" Returns a array with one sample from each discrete action space """
# For each row: round(random .* (max - min) + min, 0)
random_array = np.random.rand(self.num_discrete_space)
return [int(x) for x in np.floor(np.multiply((self.high - self.low + 1.), random_array) + self.low)]
def contains(self, x):
return len(x) == self.num_discrete_space and (np.array(x) >= self.low).all() and (
np.array(x) <= self.high).all()
@property
def shape(self):
return self.num_discrete_space
def __repr__(self):
return "MultiDiscrete" + str(self.num_discrete_space)
def __init__(self, x):
self.x = x
def __getstate__(self):
import cloudpickle
return cloudpickle.dumps(self.x)
def __setstate__(self, ob):
import pickle
self.x = pickle.loads(ob)
def worker(remote, parent_remote, env_fn_wrapper):
parent_remote.close()
env = env_fn_wrapper.x()
while True:
cmd, data = remote.recv()
if cmd == 'step':
ob, reward, done, info = env.step(data)
if 'bool' in done.__class__.__name__:
if done:
ob = env.reset()
else:
if np.all(done):
ob = env.reset()
remote.send((ob, reward, done, info))
elif cmd == 'reset':
ob = env.reset()
remote.send((ob))
elif cmd == 'render':
if data == "rgb_array":
fr = env.render(mode=data)
remote.send(fr)
elif data == "human":
env.render(mode=data)
elif cmd == 'reset_task':
ob = env.reset_task()
remote.send(ob)
elif cmd == 'close':
env.close()
remote.close()
break
elif cmd == 'get_spaces':
remote.send((env.observation_space, env.share_observation_space, env.action_space))
else:
raise NotImplementedError
class ShareVecEnv(ABC):
"""
An abstract asynchronous, vectorized environment.
Used to batch data from multiple copies of an environment, so that
each observation becomes a batch of observations, and the expected action is a batch of actions to
be applied per-environment.
"""
closed = False
viewer = None
def __eq__(self, other):
return np.array_equal(self.low, other.low) and np.array_equal(self.high, other.high)
metadata = {
'render.modes': ['human', 'rgb_array']
}
def __init__(self, num_envs, observation_space, share_observation_space, action_space):
self.num_envs = num_envs
self.observation_space = observation_space
self.share_observation_space = share_observation_space
self.action_space = action_space
class SubprocVecEnv(object):
def __init__(self, all_args):
@abstractmethod
def reset(self):
"""
envs: list of gym environments to run in subprocesses
Reset all the environments and return an array of
observations, or a dict of observation arrays.
If step_async is still doing work, that work will
be cancelled and step_wait() should not be called
until step_async() is invoked again.
"""
pass
self.env_list = [Env(i) for i in range(all_args.n_rollout_threads)]
self.num_envs = all_args.n_rollout_threads
self.num_agent = self.env_list[0].agent_num
self.signal_obs_dim = self.env_list[0].obs_dim
self.signal_action_dim = self.env_list[0].action_dim
self.u_range = 1.0 # control range for continuous control
self.movable = True
# environment parameters
# self.discrete_action_space = True
self.discrete_action_space = True
# if true, action is a number 0...N, otherwise action is a one-hot N-dimensional vector
self.discrete_action_input = False
# if true, even the action is continuous, action will be performed discretely
self.force_discrete_action = False
# configure spaces
self.action_space = []
self.observation_space = []
self.share_observation_space = []
share_obs_dim = 0
for agent in range(self.num_agent):
total_action_space = []
@abstractmethod
def step_async(self, actions):
"""
Tell all the environments to start taking a step
with the given actions.
Call step_wait() to get the results of the step.
You should not call this if a step_async run is
already pending.
"""
pass
# physical action space
if self.discrete_action_space:
u_action_space = spaces.Discrete(self.signal_action_dim) # 5 discrete actions
else:
u_action_space = spaces.Box(low=-self.u_range, high=+self.u_range, shape=(2,), dtype=np.float32) # [-1,1]
if self.movable:
total_action_space.append(u_action_space)
# total action space
if len(total_action_space) > 1:
# all action spaces are discrete, so simplify to MultiDiscrete action space
if all([isinstance(act_space, spaces.Discrete) for act_space in total_action_space]):
act_space = MultiDiscrete([[0, act_space.n - 1] for act_space in total_action_space])
else:
act_space = spaces.Tuple(total_action_space)
self.action_space.append(act_space)
else:
self.action_space.append(total_action_space[0])
@abstractmethod
def step_wait(self):
"""
Wait for the step taken with step_async().
Returns (obs, rews, dones, infos):
- obs: an array of observations, or a dict of
arrays of observations.
- rews: an array of rewards
- dones: an array of "episode done" booleans
- infos: a sequence of info objects
"""
pass
# observation space
share_obs_dim += self.signal_obs_dim
self.observation_space.append(spaces.Box(low=-np.inf, high=+np.inf, shape=(self.signal_obs_dim,),
dtype=np.float32)) # [-inf,inf]
def close_extras(self):
"""
Clean up the extra resources, beyond what's in this base class.
Only runs when not self.closed.
"""
pass
self.share_observation_space = [spaces.Box(low=-np.inf, high=+np.inf, shape=(share_obs_dim,),
dtype=np.float32) for _ in range(self.num_agent)]
def close(self):
if self.closed:
return
if self.viewer is not None:
self.viewer.close()
self.close_extras()
self.closed = True
def step(self, actions):
"""
Assumed dimensions of the input actions:
# actions shape = (5, 2, 5)
# 5 parallel (threaded) envs, each with 2 agents; each agent's action is a one-hot 5-dimensional encoding
Step the environments synchronously.
This is available for backwards compatibility.
"""
self.step_async(actions)
return self.step_wait()
def render(self, mode='human'):
imgs = self.get_images()
bigimg = tile_images(imgs)
if mode == 'human':
self.get_viewer().imshow(bigimg)
return self.get_viewer().isopen
elif mode == 'rgb_array':
return bigimg
else:
raise NotImplementedError
def get_images(self):
"""
Return RGB images from each environment
"""
raise NotImplementedError
results = [env.step(action) for env, action in zip(self.env_list, actions)]
@property
def unwrapped(self):
if isinstance(self, VecEnvWrapper):
return self.venv.unwrapped
else:
return self
def get_viewer(self):
if self.viewer is None:
from gym.envs.classic_control import rendering
self.viewer = rendering.SimpleImageViewer()
return self.viewer
class SubprocVecEnv(ShareVecEnv):
def __init__(self, env_fns, spaces=None):
"""
envs: list of gym environments to run in subprocesses
"""
self.waiting = False
self.closed = False
nenvs = len(env_fns)
self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)])
self.ps = [Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn)))
for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)]
for p in self.ps:
p.daemon = True # if the main process crashes, we should not cause things to hang
p.start()
for remote in self.work_remotes:
remote.close()
self.remotes[0].send(('get_spaces', None))
observation_space, share_observation_space, action_space = self.remotes[0].recv()
ShareVecEnv.__init__(self, len(env_fns), observation_space,
share_observation_space, action_space)
def step_async(self, actions):
for remote, action in zip(self.remotes, actions):
remote.send(('step', action))
self.waiting = True
def step_wait(self):
results = [remote.recv() for remote in self.remotes]
self.waiting = False
obs, rews, dones, infos = zip(*results)
return np.stack(obs), np.stack(rews), np.stack(dones), infos
def reset(self):
obs = [env.reset() for env in self.env_list]
for remote in self.remotes:
remote.send(('reset', None))
obs = [remote.recv() for remote in self.remotes]
return np.stack(obs)
def reset_task(self):
for remote in self.remotes:
remote.send(('reset_task', None))
return np.stack([remote.recv() for remote in self.remotes])
def close(self):
pass
if self.closed:
return
if self.waiting:
for remote in self.remotes:
remote.recv()
for remote in self.remotes:
remote.send(('close', None))
for p in self.ps:
p.join()
self.closed = True
def render(self, mode="rgb_array"):
pass
for remote in self.remotes:
remote.send(('render', mode))
if mode == "rgb_array":
frame = [remote.recv() for remote in self.remotes]
return np.stack(frame)
# single env
class DummyVecEnv(object):
def __init__(self, all_args):
"""
envs: list of gym environments to run in subprocesses
"""
self.env_list = [Env(i) for i in range(all_args.n_eval_rollout_threads)]
self.num_envs = all_args.n_rollout_threads
self.num_agent = self.env_list[0].agent_num
self.u_range = 1.0 # control range for continuous control
self.movable = True
# environment parameters
self.discrete_action_space = True
# if true, action is a number 0...N, otherwise action is a one-hot N-dimensional vector
self.discrete_action_input = False
# if true, even the action is continuous, action will be performed discretely
self.force_discrete_action = False
# in this env, force_discrete_action == False, because the world does not have discrete actions
# configure spaces
self.action_space = []
self.observation_space = []
self.share_observation_space = []
share_obs_dim = 0
for agent_num in range(self.num_agent):
total_action_space = []
# physical action space
if self.discrete_action_space:
u_action_space = spaces.Discrete(5) # 5 discrete actions
class DummyVecEnv(ShareVecEnv):
def __init__(self, env_fns):
self.envs = [fn() for fn in env_fns]
env = self.envs[0]
ShareVecEnv.__init__(self, len(
env_fns), env.observation_space, env.share_observation_space, env.action_space)
self.actions = None
def step_async(self, actions):
self.actions = actions
def step_wait(self):
results = [env.step(a) for (a, env) in zip(self.actions, self.envs)]
obs, rews, dones, infos = map(np.array, zip(*results))
for (i, done) in enumerate(dones):
if 'bool' in done.__class__.__name__:
if done:
obs[i] = self.envs[i].reset()
else:
u_action_space = spaces.Box(low=-self.u_range, high=+self.u_range, shape=(2,), dtype=np.float32) # [-1,1]
if self.movable:
total_action_space.append(u_action_space)
# total action space
if len(total_action_space) > 1:
# all action spaces are discrete, so simplify to MultiDiscrete action space
if all([isinstance(act_space, spaces.Discrete) for act_space in total_action_space]):
act_space = MultiDiscrete([[0, act_space.n - 1] for act_space in total_action_space])
else:
act_space = spaces.Tuple(total_action_space)
self.action_space.append(act_space)
else:
self.action_space.append(total_action_space[0])
# observation space
obs_dim = 14 # observation dimension of a single agent
share_obs_dim += obs_dim
self.observation_space.append(spaces.Box(low=-np.inf, high=+np.inf, shape=(obs_dim,), dtype=np.float32)) # [-inf,inf]
self.share_observation_space = [spaces.Box(low=-np.inf, high=+np.inf, shape=(share_obs_dim,),
dtype=np.float32) for _ in range(self.num_agent)]
def step(self, actions):
"""
Assumed dimensions of the input actions:
# actions shape = (5, 2, 5)
# 5 parallel (threaded) envs, each with 2 agents; each agent's action is a one-hot 5-dimensional encoding
"""
if np.all(done):
obs[i] = self.envs[i].reset()
results = [env.step(action) for env, action in zip(self.env_list, actions)]
obs, rews, dones, infos = zip(*results)
return np.stack(obs), np.stack(rews), np.stack(dones), infos
self.actions = None
return obs, rews, dones, infos
def reset(self):
obs = [env.reset() for env in self.env_list]
return np.stack(obs)
obs = [env.reset() for env in self.envs]
return np.array(obs)
def close(self):
pass
def render(self, mode="rgb_array"):
pass
for env in self.envs:
env.close()
def render(self, mode="human"):
if mode == "rgb_array":
return np.array([env.render(mode=mode) for env in self.envs])
elif mode == "human":
for env in self.envs:
env.render(mode=mode)
else:
raise NotImplementedError
\ No newline at end of file
import wandb
import os
import numpy as np
import torch
......@@ -37,7 +36,6 @@ class Runner(object):
self.n_render_rollout_threads = self.all_args.n_render_rollout_threads
self.use_linear_lr_decay = self.all_args.use_linear_lr_decay
self.hidden_size = self.all_args.hidden_size
self.use_wandb = self.all_args.use_wandb
self.use_render = self.all_args.use_render
self.recurrent_N = self.all_args.recurrent_N
......@@ -50,18 +48,14 @@ class Runner(object):
# dir
self.model_dir = self.all_args.model_dir
if self.use_wandb:
self.save_dir = str(wandb.run.dir)
self.run_dir = str(wandb.run.dir)
else:
self.run_dir = config["run_dir"]
self.log_dir = str(self.run_dir / 'logs')
if not os.path.exists(self.log_dir):
os.makedirs(self.log_dir)
self.writter = SummaryWriter(self.log_dir)
self.save_dir = str(self.run_dir / 'models')
if not os.path.exists(self.save_dir):
os.makedirs(self.save_dir)
self.run_dir = config["run_dir"]
self.log_dir = str(self.run_dir / 'logs')
if not os.path.exists(self.log_dir):
os.makedirs(self.log_dir)
self.writter = SummaryWriter(self.log_dir)
self.save_dir = str(self.run_dir / 'models')
if not os.path.exists(self.save_dir):
os.makedirs(self.save_dir)
from algorithms.algorithm.r_mappo import RMAPPO as TrainAlgo
from algorithms.algorithm.rMAPPOPolicy import RMAPPOPolicy as Policy
......@@ -146,10 +140,7 @@ class Runner(object):
:param total_num_steps: (int) total number of training env steps.
"""
for k, v in train_infos.items():
if self.use_wandb:
wandb.log({k: v}, step=total_num_steps)
else:
self.writter.add_scalars(k, {k: v}, total_num_steps)
self.writter.add_scalars(k, {k: v}, total_num_steps)
def log_env(self, env_infos, total_num_steps):
"""
......@@ -159,7 +150,4 @@ class Runner(object):
"""
for k, v in env_infos.items():
if len(v)>0:
if self.use_wandb:
wandb.log({k: np.mean(v)}, step=total_num_steps)
else:
self.writter.add_scalars(k, {k: np.mean(v)}, total_num_steps)
self.writter.add_scalars(k, {k: np.mean(v)}, total_num_steps)
......@@ -16,7 +16,6 @@ import time
import numpy as np
import torch
from runner.shared.base_runner import Runner
import wandb
import imageio
......@@ -133,9 +132,12 @@ class EnvRunner(Runner):
else:
actions_env = np.concatenate((actions_env, uc_actions_env), axis=2)
elif self.envs.action_space[0].__class__.__name__ == 'Discrete':
# actions --> actions_env : shape:[10, 1] --> [5, 2, 5]
actions_env = np.squeeze(np.eye(self.envs.action_space[0].n)[actions], 2)
else:
raise NotImplementedError
# TODO: adapt this to whatever form your own environment needs
actions_env = actions
# raise NotImplementedError
return values, actions, action_log_probs, rnn_states, rnn_states_critic, actions_env
......
"""
# @Time : 2021/6/30 10:07 PM
# @Author : hezhiqiang01
# @Email : hezhiqiang01@baidu.com
# @Author : hezhiqiang
# @Email : tinyzqh@163.com
# @File : train.py
"""
# !/usr/bin/env python
import sys
import os
import wandb
import socket
import setproctitle
import numpy as np
......@@ -21,11 +20,27 @@ from envs.env_wrappers import SubprocVecEnv, DummyVecEnv
def make_train_env(all_args):
return SubprocVecEnv(all_args)
def get_env_fn(rank):
def init_env():
# from envs.env_continuous import ContinuousActionEnv
# env = ContinuousActionEnv()
from envs.env_discrete import DiscreteActionEnv
env = DiscreteActionEnv()
env.seed(all_args.seed + rank * 1000)
return env
return init_env
return DummyVecEnv([get_env_fn(i) for i in range(all_args.n_rollout_threads)])
def make_eval_env(all_args):
return DummyVecEnv(all_args)
def get_env_fn(rank):
def init_env():
from envs.env_discrete import DiscreteActionEnv
env = DiscreteActionEnv()
env.seed(all_args.seed + rank * 1000)
return env
return init_env
return DummyVecEnv([get_env_fn(i) for i in range(all_args.n_rollout_threads)])
def parse_args(args, parser):
......@@ -72,32 +87,18 @@ def main(args):
if not run_dir.exists():
os.makedirs(str(run_dir))
# wandb
if all_args.use_wandb:
run = wandb.init(config=all_args,
project=all_args.env_name,
entity=all_args.user_name,
notes=socket.gethostname(),
name=str(all_args.algorithm_name) + "_" +
str(all_args.experiment_name) +
"_seed" + str(all_args.seed),
group=all_args.scenario_name,
dir=str(run_dir),
job_type="training",
reinit=True)
if not run_dir.exists():
curr_run = 'run1'
else:
if not run_dir.exists():
exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in run_dir.iterdir() if
str(folder.name).startswith('run')]
if len(exst_run_nums) == 0:
curr_run = 'run1'
else:
exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in run_dir.iterdir() if
str(folder.name).startswith('run')]
if len(exst_run_nums) == 0:
curr_run = 'run1'
else:
curr_run = 'run%i' % (max(exst_run_nums) + 1)
run_dir = run_dir / curr_run
if not run_dir.exists():
os.makedirs(str(run_dir))
curr_run = 'run%i' % (max(exst_run_nums) + 1)
run_dir = run_dir / curr_run
if not run_dir.exists():
os.makedirs(str(run_dir))
setproctitle.setproctitle(str(all_args.algorithm_name) + "-" + \
str(all_args.env_name) + "-" + str(all_args.experiment_name) + "@" + str(
......@@ -136,11 +137,8 @@ def main(args):
if all_args.use_eval and eval_envs is not envs:
eval_envs.close()
if all_args.use_wandb:
run.finish()
else:
runner.writter.export_scalars_to_json(str(runner.log_dir + '/summary.json'))
runner.writter.close()
runner.writter.export_scalars_to_json(str(runner.log_dir + '/summary.json'))
runner.writter.close()
if __name__ == "__main__":
......