import sys
import time
import numpy as np
import torch
import torch.nn as nn
from transformers import GPT2Config, GPT2Model
import psutil  # 导入用于监控 CPU 使用率的库

from ptflops import get_model_complexity_info


# Seed the global torch and NumPy random generators so that repeated runs
# draw the same random values.
torch.manual_seed(42)
np.random.seed(42)


# Actor network: a linear encoder, a GPT-2 backbone and a linear output head (not a plain MLP)
class Actor(nn.Module):
    """Network that passes one observation through a GPT-2 backbone.

    The observation is projected by ``fc1``, activated with ReLU, fed to
    ``GPT2Model`` as a length-1 sequence of input embeddings, and the
    resulting hidden state is projected by ``fc2``.

    Args:
        obs_dim: Input width of ``fc1``.
        embd_dim: Output width of ``fc1`` and input width of ``fc2``. The
            ``fc1`` output is handed to GPT-2 as ``inputs_embeds``, so this
            presumably has to equal ``config.n_embd`` (TODO confirm).
        output_dim: Output width of ``fc2``.
        config: Configuration object passed straight to ``GPT2Model``.
    """

    def __init__(self, obs_dim, embd_dim, output_dim, config):
        super().__init__()
        self.gpt = GPT2Model(config)
        self.fc1 = nn.Linear(obs_dim, embd_dim)
        self.fc2 = nn.Linear(embd_dim, output_dim)
        self.activation = nn.ReLU()

    def forward(self, x):
        """Return ``fc2`` applied to the GPT-2 hidden state of ``x``."""
        embedded = self.activation(self.fc1(x))
        # GPT-2 is given a sequence axis of length 1, inserted at position 1.
        as_sequence = embedded.unsqueeze(1)
        hidden = self.gpt(inputs_embeds=as_sequence).last_hidden_state
        # Remove the length-1 sequence axis again before the output head.
        return self.fc2(hidden.squeeze(1))



def generate_neural_net_input(num_drones=5, num_users=20):
    """Build a flat, randomly generated observation vector.

    Layout of the returned array:
      - ``2 * num_drones`` values: drone (x, y) coordinates, one drone
        after another
      - ``2 * num_users`` values: user (x, y) coordinates, one user after
        another
      - 4 values: Euclidean distances for the drone pairs 0-1, 0-2, 1-3
        and 2-4

    With the default arguments this is 10 + 40 + 4 = 54 values. All
    coordinates are drawn uniformly from [0, 100) using NumPy's global
    random generator (drones first, then users).

    Args:
        num_drones: Number of drones. Must be at least 5 because the
            distance pairs refer to drones 0 through 4.
        num_users: Number of users.

    Returns:
        1-D NumPy array of length ``2 * num_drones + 2 * num_users + 4``.

    Raises:
        ValueError: If ``num_drones`` is smaller than 5.
    """
    # Drone index pairs whose distance is appended to the observation.
    distance_pairs = ((0, 1), (0, 2), (1, 3), (2, 4))
    min_drones = max(max(pair) for pair in distance_pairs) + 1
    if num_drones < min_drones:
        # Fail early with a clear message instead of an IndexError raised
        # from the middle of the distance computation.
        raise ValueError(
            f"num_drones must be at least {min_drones} "
            f"(distance pairs use drones 0-{min_drones - 1}), got {num_drones}"
        )

    # Draw order (drones, then users) is kept so seeded runs are unchanged.
    drone_positions = np.random.uniform(0, 100, (num_drones, 2))
    user_positions = np.random.uniform(0, 100, (num_users, 2))

    distances = [
        np.linalg.norm(drone_positions[a] - drone_positions[b])
        for a, b in distance_pairs
    ]

    return np.concatenate([
        drone_positions.flatten(),
        user_positions.flatten(),
        distances,
    ])


def print_input_description(input_array, num_drones=5, num_users=20):
    """Print a readable breakdown of a flat observation vector.

    The vector is read with this layout: ``2 * num_drones`` drone
    coordinates, then ``2 * num_users`` user coordinates, then four
    drone-to-drone distances (labelled 0-1, 0-2, 1-3, 2-4). All drones, at
    most the first five users, and the four distances are printed.

    Args:
        input_array: Indexable sequence of numbers holding the observation.
        num_drones: Number of drones encoded at the start of the vector.
        num_users: Number of users encoded after the drones.
    """
    # Section offsets are derived from the counts rather than hard-coded, so
    # the printout stays correct for layouts other than 5 drones / 20 users.
    user_start = 2 * num_drones
    dist_start = user_start + 2 * num_users
    shown_users = min(5, num_users)

    # Drone positions
    print(f"无人机位置 ({num_drones} drones):")
    for drone_idx in range(num_drones):
        base = 2 * drone_idx
        x, y = input_array[base], input_array[base + 1]
        print(f"  无人机{drone_idx}: x={x:.2f}, y={y:.2f}")

    # User positions (only the first few are shown)
    print(f"\n用户位置 (前{shown_users}个):")
    for user_idx in range(shown_users):
        base = user_start + 2 * user_idx
        x, y = input_array[base], input_array[base + 1]
        print(f"  用户{user_idx}: x={x:.2f}, y={y:.2f}")

    # Distances between drones
    print("\n无人机间距离:")
    dist_labels = ["无人机0-1", "无人机0-2", "无人机1-3", "无人机2-4"]
    for offset, label in enumerate(dist_labels):
        print(f"  {label}: {input_array[dist_start + offset]:.2f}")


# Configuration
obs_dim = 54  # 2*5 + 20*2 + 4
batch_size = 32
embd_dim = 256
config = GPT2Config(n_embd=embd_dim, n_layer=2, n_head=4)

# Everything in this script runs on the CPU.
device = torch.device("cpu")
print(f"使用设备: {device}")

# Build the model and load the saved weights.
model = Actor(obs_dim, embd_dim, 2, config).to(device)
model.load_state_dict(torch.load('actor2-4-54.pt', map_location='cpu'))
# A freshly constructed nn.Module is in training mode; switch to eval mode so
# the dropout layers inside the GPT-2 backbone are disabled for inference.
model.eval()

# Smoke test: run a single generated observation through the model.
print(f"测试神经网络输入: {device}")
sample_obs = generate_neural_net_input(5, 20)
print_input_description(sample_obs)
# Convert to float32 and add a leading batch axis: (obs_dim,) -> (1, obs_dim).
sample_batch = torch.as_tensor(sample_obs, dtype=torch.float32).unsqueeze(0).to(device)
with torch.no_grad():  # inference only, no autograd graph needed
    out = model(sample_batch)
print(out)
print(f"  无人机移动: x={out[0][0]:.2f}, y={out[0][1]:.2f}")
# 生成测试数据
def generate_batch(batch_size):
    """Return a ``(batch_size, obs_dim)`` tensor of standard-normal noise.

    ``obs_dim`` and ``device`` are read from module scope; the tensor is
    moved to ``device`` before it is returned.
    """
    noise = torch.randn((batch_size, obs_dim))
    return noise.to(device)


# Warm-up: a few untimed forward passes before any measurement is taken.
print("进行预热运行...")
# Benchmark inference behaviour: eval mode disables the dropout layers of the
# GPT-2 backbone, which would otherwise be active (modules start in training
# mode). Calling eval() again on a model already in eval mode is harmless.
model.eval()
with torch.no_grad():
    for _ in range(10):
        data = generate_batch(batch_size)
        _ = model(data)

# Peak process CPU usage seen for each batch size, in batch_sizes order.
cpu_usages = []

# Time the forward pass for a range of batch sizes.
batch_sizes = [1, 2, 4, 8, 16, 32, 64, 128]
print("\n开始性能测试...")
print(
    f"{'批量大小':<10} | {'平均时间(ms)':<12} | {'最小时间(ms)':<12} | {'最大时间(ms)':<12} | {'标准差':<10} | {'CPU峰值(%)':<10}")
print("-" * 90)

results = {}
n_runs = 100  # timed forward passes per batch size
# One handle for the current process is enough for the whole benchmark.
p = psutil.Process()
for bs in batch_sizes:
    execution_times = []
    cpu_usage_peak = 0.0

    # cpu_percent(interval=None) reports usage since the previous call; take a
    # throw-away reading here so every sample below covers only this batch
    # size (and the very first sample is not the meaningless initial 0.0).
    p.cpu_percent(interval=None)

    for _ in range(n_runs):
        # Input generation is deliberately kept outside the timed region.
        data = generate_batch(bs)

        start_time = time.perf_counter_ns()

        with torch.no_grad():
            output = model(data)

        end_time = time.perf_counter_ns()
        elapsed_ms = (end_time - start_time) / 1_000_000  # ns -> ms
        execution_times.append(elapsed_ms)

        # Track the highest process CPU usage observed between iterations.
        cpu_usage_peak = max(cpu_usage_peak, p.cpu_percent(interval=None))

    cpu_usages.append(cpu_usage_peak)

    # Summary statistics over the n_runs timings, in milliseconds.
    avg_time = np.mean(execution_times)
    min_time = np.min(execution_times)
    max_time = np.max(execution_times)
    std_dev = np.std(execution_times)

    results[bs] = {
        'avg': avg_time,
        'min': min_time,
        'max': max_time,
        'std': std_dev,
        'cpu_peak': cpu_usage_peak
    }

    print(
        f"{bs:<10} | {avg_time:.6f} ms   | {min_time:.6f} ms   | {max_time:.6f} ms   | {std_dev:.6f} | {cpu_usage_peak:.1f}%")

# Time 100 more forward passes at batch size 32, this time reusing one fixed
# input tensor for every run.
# NOTE(review): bs32_times is not read again in the visible part of the file.
bs32_times = []
data = generate_batch(32)
for _ in range(100):
    start_time = time.perf_counter_ns()
    with torch.no_grad():
        output = model(data)
    end_time = time.perf_counter_ns()
    elapsed_ns = end_time - start_time
    bs32_times.append(elapsed_ns / 1_000_000)  # ns -> ms


# Final summary table: one row per benchmarked batch size.
summary_header = f"{'批量大小':<10} | {'平均时间(ms)':<12} | {'CPU峰值(%)':<10}"
print("\n测试摘要:")
print(summary_header)
print("-" * 50)
for bs in batch_sizes:
    row = results[bs]
    print(f"{bs:<10} | {row['avg']:.6f} ms   | {row['cpu_peak']:.1f}%")


# Model information: total parameter count, reported in millions.
n_params = sum(param.numel() for param in model.parameters())
print("\n模型信息:")
print(f"总参数数量: {n_params / 1e6:.2f} M")

# System resource figures.
bytes_per_gb = 1024 ** 3
print("\n测试期间峰值系统资源使用情况:")
print(f"CPU峰值使用率: {max(cpu_usages):.1f}%")
# NOTE: the memory figures below are sampled once, at this point in the
# script; they are not tracked while the benchmark runs.
print(f"内存峰值使用率: {psutil.virtual_memory().percent}%")
used_gb = psutil.virtual_memory().used / bytes_per_gb
total_gb = psutil.virtual_memory().total / bytes_per_gb
print(
    f"当前内存使用: {used_gb:.2f} GB / {total_gb:.2f} GB")

# Start of the ptflops-based FLOPs section.
print("\n使用 ptflops 计算模型FLOPs:")


# 我们需要创建一个模型实例的包装器类，而不是直接使用函数
class ModelWrapper(nn.Module):
    """Thin ``nn.Module`` shell that forwards every call to the wrapped model.

    Args:
        model: The module to wrap; stored as the ``model`` attribute.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, x):
        """Return the wrapped model's output for ``x`` unchanged."""
        wrapped = self.model
        return wrapped(x)


# Wrap the model so it can be handed to ptflops.
model_wrapper = ModelWrapper(model)

# Per-sample input shape; the model is profiled on a single sample,
# i.e. an input of shape (1, obs_dim).
input_shape = (obs_dim,)


macs, params = get_model_complexity_info(
    model_wrapper,
    input_shape,
    as_strings=False,
    print_per_layer_stat=False,
    verbose=False
)

# Estimate FLOPs from MACs: one multiply-accumulate is counted as two
# floating-point operations, so this is an approximation.
total_flops = 2 * macs
print(total_flops)

# Report the single-sample figures.
print(f"模型参数量: {params / 1e6:.2f} M")
print(f"MACs (乘法累加操作): {macs / 1e6:.2f} MMACs")
print(f"FLOPs 估计: {total_flops / 1e6:.2f} MFLOPs (单精度, 单样本)")


# Total FLOPs for one batched forward pass. The batch size used here must be
# the same one whose measured time is used below; mixing two different batch
# sizes would skew the throughput figure by their ratio.
batch_size_for_flops = 128
total_flops_batch = total_flops * batch_size_for_flops  # linear in batch size
print(f"\n批量大小 {batch_size_for_flops} 时的总FLOPs估计: {total_flops_batch / 1e9:.2f} GFLOPs")

# Achieved throughput: FLOPs of one forward pass over its mean duration.
avg_time_sec = results[batch_size_for_flops]['avg'] / 1000  # ms -> s
gflops_per_forward = total_flops_batch / 1e9
gflops_per_sec = gflops_per_forward / avg_time_sec
print(f"前向传播计算效率: {gflops_per_sec:.2f} GFLOPS")

# Theoretical CPU peak.
# cpu_count(logical=False) can return None when the physical core count is
# undetermined; fall back to the logical count, then to a single core.
cpu_count = psutil.cpu_count(logical=False) or psutil.cpu_count(logical=True) or 1

default_freq_ghz = 3.0  # used when the clock frequency cannot be determined
cpu_freq = psutil.cpu_freq()
if cpu_freq and cpu_freq.current > 0:
    cpu_freq_ghz = cpu_freq.current / 1000  # MHz -> GHz
else:
    # Fallback when psutil reports no (or a zero) frequency.
    import platform

    cpu_freq_ghz = default_freq_ghz
    if platform.system() == 'Darwin':  # macOS
        import subprocess

        try:
            sysctl_out = subprocess.check_output(
                ["sysctl", "-n", "hw.cpufrequency"],
                stderr=subprocess.DEVNULL,
            ).decode().strip()
            cpu_freq_ghz = float(sysctl_out) / 1000000000  # Hz -> GHz
        except (subprocess.CalledProcessError, OSError, ValueError):
            # The sysctl key is missing or unreadable: keep the default
            # instead of aborting the whole report.
            cpu_freq_ghz = default_freq_ghz

# Assumption: each core performs 8 floating-point operations per cycle
# (a rough allowance for AVX vectorisation).
per_cycle_flops = 8
theoretical_gflops = cpu_count * cpu_freq_ghz * per_cycle_flops

print(f"CPU理论峰值: {theoretical_gflops:.1f} GFLOPS (单精度浮点)")

# Achieved throughput as a percentage of the theoretical peak.
utilization = gflops_per_sec / theoretical_gflops * 100
print(f"CPU利用率: {utilization:.1f}%")

