vLLM-10-CSRC Module - Data Structures

Overview of Key Data Structures

The CSRC module's data structures are designed around efficient GPU computation and fall into four layers: memory-layout optimization, data-type support, kernel-parameter management, and performance monitoring.

classDiagram
    class KernelParams {
        <<struct>>
        +dim3 grid_size
        +dim3 block_size
        +int shared_mem_size
        +cudaStream_t stream
        +void* workspace
        +size_t workspace_size
    }
    
    class PagedAttentionParams {
        <<struct>>
        +scalar_t* query
        +scalar_t* key_cache
        +scalar_t* value_cache
        +scalar_t* output
        +int* block_tables
        +int* seq_lens
        +int num_seqs
        +int num_heads
        +int num_kv_heads
        +int head_size
        +float scale
        +int max_seq_len
        +float* alibi_slopes
    }
    
    class QuantizationParams {
        <<struct>>
        +torch::Tensor qweight
        +torch::Tensor qzeros
        +torch::Tensor scales
        +torch::Tensor bias
        +int group_size
        +int bits
        +QuantizationType quant_type
        +void* workspace
    }
    
    class MOEParams {
        <<struct>>
        +float* expert_weights
        +int* selected_experts
        +float* routing_weights
        +int* expert_tokens
        +int num_experts
        +int top_k
        +int hidden_size
        +int expert_hidden_size
    }
    
    class CacheBlock {
        <<struct>>
        +int block_id
        +int sequence_id
        +int num_tokens
        +int capacity
        +bool is_full
        +CacheBlock* next_in_sequence
        +T* key_data
        +T* value_data
    }
    
    class DeviceMemoryPool {
        +void* base_ptr
        +size_t total_size
        +size_t allocated_size
        +vector~MemoryBlock~ free_blocks
        +vector~MemoryBlock~ used_blocks
        +mutex allocation_mutex
        +allocate() void*
        +deallocate() void
        +get_memory_usage() size_t
    }
    
    class TensorWrapper {
        +torch::Tensor tensor
        +void* data_ptr
        +vector~int64_t~ shape
        +torch::ScalarType dtype
        +torch::Device device
        +int64_t numel
        +vector~int64_t~ strides
        +bool is_contiguous
        +to_device() TensorWrapper
        +reshape() TensorWrapper
    }
    
    KernelParams <|-- PagedAttentionParams
    KernelParams <|-- QuantizationParams
    KernelParams <|-- MOEParams
    DeviceMemoryPool --> CacheBlock : manages
    TensorWrapper --> KernelParams : provides data

Core Data Structure Definitions

1. Base Kernel Parameter Structure

// Headers assumed by the code excerpts in this section (not exhaustive)
#include <cuda_runtime.h>   // dim3, cudaStream_t, cudaEvent_t, cudaMalloc/cudaFree
#include <cstddef>          // size_t

// Common base for kernel parameters
struct KernelParams {
    // CUDA launch configuration
    dim3 grid_size;              // grid dimensions
    dim3 block_size;             // thread-block dimensions
    int shared_mem_size;         // dynamic shared memory (bytes)
    cudaStream_t stream;         // CUDA stream
    
    // workspace management
    void* workspace;             // scratch workspace pointer
    size_t workspace_size;       // workspace size in bytes
    
    // performance monitoring
    cudaEvent_t start_event;     // timing start event
    cudaEvent_t end_event;       // timing stop event
    
    KernelParams() : shared_mem_size(0), stream(nullptr),
                     workspace(nullptr), workspace_size(0) {
        cudaEventCreate(&start_event);
        cudaEventCreate(&end_event);
    }
    
    // Non-copyable: the destructor owns the workspace and the CUDA events
    KernelParams(const KernelParams&) = delete;
    KernelParams& operator=(const KernelParams&) = delete;
    
    ~KernelParams() {
        if (workspace) {
            cudaFree(workspace);
        }
        cudaEventDestroy(start_event);
        cudaEventDestroy(end_event);
    }
    
    // Compute a launch configuration for a 1-D problem
    void compute_launch_config(int problem_size, int threads_per_block = 256) {
        block_size = dim3(threads_per_block);
        grid_size = dim3((problem_size + threads_per_block - 1) / threads_per_block);
        
        // Fold an oversized 1-D grid into 2-D to stay within hardware limits
        const unsigned int max_grid_size = 65535;
        if (grid_size.x > max_grid_size) {
            grid_size.y = (grid_size.x + max_grid_size - 1) / max_grid_size;
            grid_size.x = max_grid_size;
        }
    }
    
    // Grow the workspace if the current one is too small
    void allocate_workspace(size_t size) {
        if (workspace_size < size) {
            if (workspace) cudaFree(workspace);
            if (cudaMalloc(&workspace, size) != cudaSuccess) {
                workspace = nullptr;   // leave the object in a consistent state on failure
                workspace_size = 0;
                return;
            }
            workspace_size = size;
        }
    }
};
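Putting the pieces together, the intended call pattern is: size the launch, size the workspace, then bracket the launch with the two timing events. A minimal sketch, where dummy_kernel and the wrapper are illustrative rather than part of vLLM:

__global__ void dummy_kernel(float* data, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;  // flat 1-D index
    if (i < n) data[i] *= 2.0f;
}

void launch_with_params(float* d_data, int n) {
    KernelParams params;
    params.compute_launch_config(n);   // fills grid_size / block_size
    params.allocate_workspace(0);      // this kernel needs no scratch space

    cudaEventRecord(params.start_event, params.stream);
    dummy_kernel<<<params.grid_size, params.block_size,
                   params.shared_mem_size, params.stream>>>(d_data, n);
    cudaEventRecord(params.end_event, params.stream);

    cudaEventSynchronize(params.end_event);
    float ms = 0.0f;
    cudaEventElapsedTime(&ms, params.start_event, params.end_event);  // elapsed ms
}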

2. PagedAttention Parameter Structure

// PagedAttention kernel parameters
template<typename scalar_t>
struct PagedAttentionParams : public KernelParams {
    // input tensors
    scalar_t* query;                    // query tensor [num_seqs, num_heads, head_size]
    scalar_t* key_cache;                // key cache [num_blocks, num_kv_heads, head_size/x, block_size, x]
    scalar_t* value_cache;              // value cache [num_blocks, num_kv_heads, head_size, block_size]
    scalar_t* output;                   // output tensor [num_seqs, num_heads, head_size]
    
    // index and length information
    int* block_tables;                  // block tables [num_seqs, max_num_blocks_per_seq]
    int* seq_lens;                      // sequence lengths [num_seqs]
    
    // dimension parameters
    int num_seqs;                       // number of sequences
    int num_heads;                      // number of query heads
    int num_kv_heads;                   // number of KV heads
    int head_size;                      // per-head dimension
    int block_size;                     // KV-cache block size (shadows KernelParams::block_size, the dim3 launch config)
    int max_seq_len;                    // maximum sequence length
    int max_num_blocks_per_seq;         // maximum blocks per sequence
    
    // computation parameters
    float scale;                        // attention scaling factor
    float* alibi_slopes;                // ALiBi positional-encoding slopes
    
    // quantization parameters
    float kv_scale;                     // KV-cache quantization scale
    
    // tensor strides
    int q_stride;                       // query tensor stride
    int kv_block_stride;                // KV block stride
    int kv_head_stride;                 // KV head stride
    
    PagedAttentionParams() {
        query = nullptr;
        key_cache = nullptr;
        value_cache = nullptr;
        output = nullptr;
        block_tables = nullptr;
        seq_lens = nullptr;
        alibi_slopes = nullptr;
        
        // zero the dimensions so validate() never reads uninitialized values
        num_seqs = num_heads = num_kv_heads = 0;
        head_size = block_size = 0;
        max_seq_len = max_num_blocks_per_seq = 0;
        q_stride = kv_block_stride = kv_head_stride = 0;
        
        scale = 1.0f;
        kv_scale = 1.0f;
    }
    
    // parameter validation
    bool validate() const {
        if (!query || !key_cache || !value_cache || !output) {
            return false;
        }
        if (!block_tables || !seq_lens) {
            return false;
        }
        if (num_seqs <= 0 || num_heads <= 0 || num_kv_heads <= 0) {
            return false;
        }
        if (head_size <= 0 || block_size <= 0) {
            return false;
        }
        return true;
    }
    
    // compute the dynamic shared-memory requirement
    int compute_shared_memory_size() const {
        int q_vec_size = head_size * sizeof(scalar_t);
        int k_vec_size = head_size * sizeof(scalar_t);
        int logits_size = block_size * sizeof(float);
        // the reduction buffer is sized by the thread-block width, i.e. the
        // (shadowed) launch configuration in the base class
        int reduction_size = KernelParams::block_size.x * sizeof(float);
        
        return q_vec_size + k_vec_size + logits_size + reduction_size;
    }
};
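A sketch of how a host-side wrapper might populate and check these parameters from PyTorch tensors; the shapes follow the layout comments above, and everything below is illustrative rather than vLLM's actual binding code:

template<typename scalar_t>
void setup_paged_attention(PagedAttentionParams<scalar_t>& p,
                           torch::Tensor query, torch::Tensor key_cache,
                           torch::Tensor value_cache, torch::Tensor out,
                           torch::Tensor block_tables, torch::Tensor seq_lens) {
    p.query = query.data_ptr<scalar_t>();
    p.key_cache = key_cache.data_ptr<scalar_t>();
    p.value_cache = value_cache.data_ptr<scalar_t>();
    p.output = out.data_ptr<scalar_t>();
    p.block_tables = block_tables.data_ptr<int>();
    p.seq_lens = seq_lens.data_ptr<int>();

    p.num_seqs = static_cast<int>(query.size(0));        // [num_seqs, num_heads, head_size]
    p.num_heads = static_cast<int>(query.size(1));
    p.head_size = static_cast<int>(query.size(2));
    p.num_kv_heads = static_cast<int>(key_cache.size(1));
    p.block_size = static_cast<int>(key_cache.size(3));  // per the layout comment above
    p.max_num_blocks_per_seq = static_cast<int>(block_tables.size(1));
    p.q_stride = static_cast<int>(query.stride(0));
    p.scale = 1.0f / std::sqrt(static_cast<float>(p.head_size));

    TORCH_CHECK(p.validate(), "invalid PagedAttention parameters");
    // pick a launch config first: the reduction buffer is sized from the
    // thread-block width chosen here
    p.compute_launch_config(p.num_seqs * p.num_heads, /*threads_per_block=*/128);
    p.shared_mem_size = p.compute_shared_memory_size();
}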

3. Quantization Parameter Structures

// quantization type enum
enum class QuantizationType {
    NONE = 0,
    INT8 = 8,
    INT4 = 4,
    AWQ = 100,      // AWQ INT4 quantization
    GPTQ = 101,     // GPTQ quantization
    SmoothQuant = 102  // SmoothQuant
};

#include <torch/extension.h>   // torch::Tensor (assumed header)
#include <cuda_fp16.h>         // half
#include <cstring>             // memset

// quantization kernel parameters
struct QuantizationParams : public KernelParams {
    // quantized weights and parameters
    torch::Tensor qweight;              // quantized weights
    torch::Tensor qzeros;               // quantization zero points
    torch::Tensor scales;               // scaling factors
    torch::Tensor bias;                 // bias (optional)
    
    // quantization configuration
    QuantizationType quant_type;        // quantization type
    int bits;                          // bit width
    int group_size;                    // group size
    
    // matrix dimensions
    int m, n, k;                       // GEMM dimensions: (m,k) x (k,n)
    
    // split-K parameters
    int split_k_iters;                 // number of split-K iterations
    
    // workspace layout
    struct WorkspaceLayout {
        void* dequant_weights;         // dequantized-weight buffer
        void* partial_results;         // partial-result buffer
        void* reduction_workspace;     // reduction workspace
        
        size_t dequant_weights_size;
        size_t partial_results_size;
        size_t reduction_workspace_size;
    } workspace_layout;
    
    QuantizationParams() {
        quant_type = QuantizationType::NONE;
        bits = 16;
        group_size = 128;
        split_k_iters = 1;
        m = n = k = 0;
        memset(&workspace_layout, 0, sizeof(workspace_layout));
    }
    
    // compute the required workspace size
    size_t compute_workspace_size() const {
        size_t dequant_size = 0;
        size_t partial_size = 0;
        size_t reduction_size = 0;
        
        if (quant_type == QuantizationType::AWQ || quant_type == QuantizationType::GPTQ) {
            // AWQ/GPTQ need a dequantization buffer
            dequant_size = static_cast<size_t>(k) * n * sizeof(half);
        }
        
        if (split_k_iters > 1) {
            // split-K needs storage for partial results
            partial_size = static_cast<size_t>(m) * n * split_k_iters * sizeof(float);
            reduction_size = static_cast<size_t>(m) * n * sizeof(float);
        }
        
        return dequant_size + partial_size + reduction_size;
    }
    
    // carve the workspace into sub-buffers; must mirror compute_workspace_size()
    void setup_workspace_layout() {
        if (!workspace) return;
        
        char* workspace_ptr = static_cast<char*>(workspace);
        size_t offset = 0;
        
        // dequantized-weight buffer (only reserved for AWQ/GPTQ,
        // matching compute_workspace_size above)
        if (quant_type == QuantizationType::AWQ || quant_type == QuantizationType::GPTQ) {
            workspace_layout.dequant_weights_size = static_cast<size_t>(k) * n * sizeof(half);
            workspace_layout.dequant_weights = workspace_ptr + offset;
            offset += workspace_layout.dequant_weights_size;
        }
        
        // partial-result buffers
        if (split_k_iters > 1) {
            workspace_layout.partial_results_size = static_cast<size_t>(m) * n * split_k_iters * sizeof(float);
            workspace_layout.partial_results = workspace_ptr + offset;
            offset += workspace_layout.partial_results_size;
            
            workspace_layout.reduction_workspace_size = static_cast<size_t>(m) * n * sizeof(float);
            workspace_layout.reduction_workspace = workspace_ptr + offset;
            offset += workspace_layout.reduction_workspace_size;
        }
    }
};
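The intended three-step flow is: compute the size, grow the inherited workspace, then carve it into sub-buffers. A sketch, where the AWQ settings and the split-K heuristic are illustrative rather than vLLM's actual policy:

void prepare_awq_gemm(QuantizationParams& q, int m, int n, int k) {
    q.quant_type = QuantizationType::AWQ;
    q.bits = 4;
    q.m = m; q.n = n; q.k = k;
    q.split_k_iters = (k > 8192) ? 4 : 1;   // illustrative heuristic

    q.allocate_workspace(q.compute_workspace_size());  // inherited from KernelParams
    q.setup_workspace_layout();    // sub-buffers now point into the workspace
}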

4. MOE Parameter Structure

// MOE (Mixture of Experts) parameter structure
struct MOEParams : public KernelParams {
    // routing
    float* expert_weights;              // router logits [num_tokens, num_experts]
    int* selected_experts;              // selected experts [num_tokens, top_k]
    float* routing_weights;             // routing weights [num_tokens, top_k]
    int* expert_tokens;                 // tokens per expert [num_experts]
    int* token_to_expert_map;           // token-to-expert mapping
    
    // FFN weights
    float* gate_weights;                // gate projection [num_experts, hidden_size, expert_hidden_size]
    float* up_weights;                  // up projection [num_experts, hidden_size, expert_hidden_size]
    float* down_weights;                // down projection [num_experts, expert_hidden_size, hidden_size]
    
    // dimensions
    int num_tokens;                     // total number of tokens
    int num_experts;                    // total number of experts
    int top_k;                         // experts selected per token
    int hidden_size;                    // hidden size
    int expert_hidden_size;             // expert hidden size
    
    // load balancing
    float load_balance_loss_coeff;      // load-balancing loss coefficient
    float* load_balance_loss;           // load-balancing loss output
    
    // workspace layout
    struct MOEWorkspace {
        float* gate_outputs;            // gate-projection output buffer
        float* up_outputs;              // up-projection output buffer
        float* expert_inputs;           // permuted expert inputs
        float* expert_outputs;          // expert output buffer
        int* sorted_token_indices;      // token indices sorted by expert
        int* expert_offsets;            // per-expert offsets into the batch
    } workspace;
    
    MOEParams() {
        expert_weights = nullptr;
        selected_experts = nullptr;
        routing_weights = nullptr;
        expert_tokens = nullptr;
        token_to_expert_map = nullptr;
        
        gate_weights = nullptr;
        up_weights = nullptr;
        down_weights = nullptr;
        
        load_balance_loss = nullptr;
        load_balance_loss_coeff = 0.01f;
        
        num_tokens = num_experts = top_k = 0;
        hidden_size = expert_hidden_size = 0;
        
        memset(&workspace, 0, sizeof(workspace));
    }
    
    // compute the workspace size; each token is dispatched to top_k experts,
    // so the permuted buffers hold num_tokens * top_k rows
    size_t compute_workspace_size() const {
        size_t expanded = static_cast<size_t>(num_tokens) * top_k;
        size_t gate_size = expanded * expert_hidden_size * sizeof(float);
        size_t up_size = expanded * expert_hidden_size * sizeof(float);
        size_t expert_inputs_size = expanded * hidden_size * sizeof(float);
        size_t expert_outputs_size = expanded * hidden_size * sizeof(float);
        size_t indices_size = expanded * sizeof(int);
        size_t offsets_size = (num_experts + 1) * sizeof(int);
        
        return gate_size + up_size + expert_inputs_size + expert_outputs_size + 
               indices_size + offsets_size;
    }
};
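The expert_offsets buffer is the exclusive prefix sum of expert_tokens: expert e owns rows [offsets[e], offsets[e+1]) of the permuted buffers. A CPU-side sketch of that bookkeeping (a hypothetical helper, shown on the host for clarity):

#include <vector>

// Build per-expert offsets from per-expert token counts (exclusive prefix sum).
std::vector<int> build_expert_offsets(const std::vector<int>& expert_tokens) {
    std::vector<int> offsets(expert_tokens.size() + 1, 0);
    for (size_t e = 0; e < expert_tokens.size(); ++e) {
        offsets[e + 1] = offsets[e] + expert_tokens[e];
    }
    return offsets;  // expert e owns rows [offsets[e], offsets[e+1])
}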

Memory Management Data Structures

1. Device Memory Pool

// memory block descriptor
struct MemoryBlock {
    void* ptr;                          // pointer to the block
    size_t size;                        // block size
    size_t alignment;                   // alignment requirement
    bool is_free;                       // free flag
    MemoryBlock* prev;                  // previous block
    MemoryBlock* next;                  // next block
    
    MemoryBlock(void* p, size_t s, size_t align = 256) 
        : ptr(p), size(s), alignment(align), is_free(true), prev(nullptr), next(nullptr) {}
};

#include <mutex>
#include <atomic>
#include <algorithm>   // std::max

// device memory pool manager
class DeviceMemoryPool {
private:
    void* base_ptr_;                    // base pointer of the pool
    size_t total_size_;                 // total pool size
    size_t allocated_size_;             // bytes currently allocated
    size_t peak_usage_;                 // peak usage
    
    // block bookkeeping
    MemoryBlock* free_list_head_;       // free-list head (kept sorted by address)
    MemoryBlock* used_list_head_;       // used-list head
    
    // thread safety; mutable so const accessors can lock without const_cast
    mutable std::mutex allocation_mutex_;
    
    // statistics
    std::atomic<int> num_allocations_;
    std::atomic<int> num_deallocations_;
    std::atomic<size_t> current_usage_;
    
public:
    explicit DeviceMemoryPool(size_t pool_size) : total_size_(pool_size), allocated_size_(0), peak_usage_(0) {
        // one large contiguous device allocation backs the whole pool
        cudaMalloc(&base_ptr_, total_size_);
        
        // the pool starts as a single free block
        free_list_head_ = new MemoryBlock(base_ptr_, total_size_);
        used_list_head_ = nullptr;
        
        num_allocations_ = 0;
        num_deallocations_ = 0;
        current_usage_ = 0;
    }
    
    ~DeviceMemoryPool() {
        cudaFree(base_ptr_);
        cleanup_block_list(free_list_head_);
        cleanup_block_list(used_list_head_);
    }
    
    // allocate from the pool
    void* allocate(size_t size, size_t alignment = 256) {
        std::lock_guard<std::mutex> lock(allocation_mutex_);
        
        // round the request up to the alignment boundary
        size_t aligned_size = (size + alignment - 1) & ~(alignment - 1);
        
        // find a suitable free block
        MemoryBlock* block = find_free_block(aligned_size, alignment);
        if (!block) {
            // out of space: defragment and retry once
            if (!defragment()) {
                return nullptr;  // allocation failed
            }
            block = find_free_block(aligned_size, alignment);
            if (!block) {
                return nullptr;
            }
        }
        
        // split the block if it is substantially larger than the request
        if (block->size > aligned_size + sizeof(MemoryBlock) + alignment) {
            split_block(block, aligned_size);
        }
        
        // move it to the used list
        move_to_used_list(block);
        
        // update statistics
        allocated_size_ += block->size;
        peak_usage_ = std::max(peak_usage_, allocated_size_);
        current_usage_ += block->size;
        num_allocations_++;
        
        return block->ptr;
    }
    
    // return a block to the pool
    void deallocate(void* ptr) {
        if (!ptr) return;
        
        std::lock_guard<std::mutex> lock(allocation_mutex_);
        
        // locate the block in the used list
        MemoryBlock* block = find_used_block(ptr);
        if (!block) {
            return;  // not a pointer we handed out
        }
        
        // capture the size first: merging below may absorb this block into a neighbor
        size_t freed_size = block->size;
        
        // move it back to the free list
        move_to_free_list(block);
        
        // coalesce with adjacent free blocks
        merge_adjacent_blocks(block);
        
        // update statistics
        allocated_size_ -= freed_size;
        current_usage_ -= freed_size;
        num_deallocations_++;
    }
    
    // memory usage statistics
    struct MemoryStats {
        size_t total_size;
        size_t allocated_size;
        size_t free_size;
        size_t peak_usage;
        int num_allocations;
        int num_deallocations;
        int num_free_blocks;
        int num_used_blocks;
        float fragmentation_ratio;
    };
    
    MemoryStats get_memory_stats() const {
        std::lock_guard<std::mutex> lock(allocation_mutex_);
        
        MemoryStats stats;
        stats.total_size = total_size_;
        stats.allocated_size = allocated_size_;
        stats.free_size = total_size_ - allocated_size_;
        stats.peak_usage = peak_usage_;
        stats.num_allocations = num_allocations_;
        stats.num_deallocations = num_deallocations_;
        
        // block counts and fragmentation ratio
        stats.num_free_blocks = count_blocks(free_list_head_);
        stats.num_used_blocks = count_blocks(used_list_head_);
        stats.fragmentation_ratio = calculate_fragmentation_ratio();
        
        return stats;
    }
    
private:
    // find a free block that can satisfy size + alignment
    MemoryBlock* find_free_block(size_t size, size_t alignment) {
        MemoryBlock* current = free_list_head_;
        while (current) {
            // account for the padding needed to reach the aligned address
            void* aligned_ptr = align_pointer(current->ptr, alignment);
            size_t available_size = current->size - (static_cast<char*>(aligned_ptr) - static_cast<char*>(current->ptr));
            
            if (available_size >= size) {
                return current;
            }
            current = current->next;
        }
        return nullptr;
    }
    
    // simple defragmentation: merge all address-adjacent free blocks
    // (assumes the free list is kept sorted by address)
    bool defragment() {
        MemoryBlock* current = free_list_head_;
        bool merged_any = false;
        
        while (current && current->next) {
            if (static_cast<char*>(current->ptr) + current->size == current->next->ptr) {
                // merge the neighbor into the current block
                MemoryBlock* next_block = current->next;
                current->size += next_block->size;
                current->next = next_block->next;
                if (next_block->next) {
                    next_block->next->prev = current;
                }
                delete next_block;
                merged_any = true;
            } else {
                current = current->next;
            }
        }
        
        return merged_any;
    }
    
    // fragmentation ratio: 1 - (largest free block / total free memory);
    // 0 means one contiguous free region, values near 1 mean heavy fragmentation
    float calculate_fragmentation_ratio() const {
        if (allocated_size_ == 0) return 0.0f;
        
        size_t largest_free_block = 0;
        MemoryBlock* current = free_list_head_;
        while (current) {
            largest_free_block = std::max(largest_free_block, current->size);
            current = current->next;
        }
        
        size_t total_free = total_size_ - allocated_size_;
        if (total_free == 0) return 0.0f;
        
        return 1.0f - static_cast<float>(largest_free_block) / total_free;
    }
    
    // remaining helpers (align_pointer, split_block, move_to_used_list,
    // move_to_free_list, find_used_block, merge_adjacent_blocks,
    // count_blocks, cleanup_block_list) are omitted here for brevity
};
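Typical usage, as a sketch: create one pool per device at startup, route scratch allocations through it, and poll the stats to watch fragmentation. The sizes below are illustrative:

#include <cstdio>

void demo_pool_usage() {
    DeviceMemoryPool pool(1ull << 30);              // 1 GiB pool (illustrative)

    void* a = pool.allocate(16 * 1024 * 1024);      // 16 MiB scratch buffer
    void* b = pool.allocate(4 * 1024 * 1024, 512);  // 4 MiB, 512-byte aligned
    pool.deallocate(a);                             // frees and merges neighbors

    auto s = pool.get_memory_stats();
    std::printf("used=%zu free=%zu frag=%.2f\n",
                s.allocated_size, s.free_size, s.fragmentation_ratio);
    pool.deallocate(b);
}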

2. KV Cache Block Management

// KV-cache block information
template<typename T>
struct CacheBlock {
    int block_id;                       // block ID
    int sequence_id;                    // owning sequence ID
    int start_token_idx;                // index of the first token
    int num_tokens;                     // tokens currently stored
    int capacity;                       // block capacity
    
    // data pointers (device memory carved from the pool)
    T* key_data;                        // key data
    T* value_data;                      // value data
    
    // list pointers
    CacheBlock* next_in_sequence;       // next block in the sequence
    CacheBlock* prev_in_sequence;       // previous block in the sequence
    CacheBlock* next_free;              // next block in the free list
    
    // state
    bool is_allocated;                  // allocated flag
    bool is_full;                       // full flag
    int ref_count;                      // reference count (for copy-on-write)
    
    CacheBlock() : block_id(-1), sequence_id(-1), start_token_idx(0), num_tokens(0), 
                   capacity(0), key_data(nullptr), value_data(nullptr),
                   next_in_sequence(nullptr), prev_in_sequence(nullptr), next_free(nullptr),
                   is_allocated(false), is_full(false), ref_count(0) {}
    
    // append a token's key/value vectors to the block
    bool add_token(int token_idx, const T* key_vec, const T* value_vec, int head_size) {
        if (is_full || num_tokens >= capacity) {
            return false;
        }
        
        // key_data/value_data live in device memory, so use cudaMemcpy;
        // cudaMemcpyDefault lets the runtime infer the direction (requires UVA)
        int offset = num_tokens * head_size;
        cudaMemcpy(key_data + offset, key_vec, head_size * sizeof(T), cudaMemcpyDefault);
        cudaMemcpy(value_data + offset, value_vec, head_size * sizeof(T), cudaMemcpyDefault);
        
        num_tokens++;
        is_full = (num_tokens == capacity);
        
        return true;
    }
    
    // get the key/value pointers for a token
    std::pair<T*, T*> get_token_kv(int token_idx, int head_size) {
        if (token_idx >= num_tokens) {
            return {nullptr, nullptr};
        }
        
        int offset = token_idx * head_size;
        return {key_data + offset, value_data + offset};
    }
};
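A small host-side sketch of the block-filling loop, where key_vec/value_vec are assumed to be buffers of length head_size:

// Fill a block with tokens until it reports full.
template<typename T>
int fill_block(CacheBlock<T>& block, const T* key_vec, const T* value_vec,
               int head_size) {
    int added = 0;
    while (block.add_token(block.num_tokens, key_vec, value_vec, head_size)) {
        ++added;  // stops once num_tokens reaches capacity
    }
    return added;
}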

#include <unordered_map>
#include <vector>

// KV-cache manager
template<typename T>
class KVCacheManager {
private:
    // memory pool
    DeviceMemoryPool* memory_pool_;
    
    // cache configuration
    int num_layers_;
    int num_heads_;
    int head_size_;
    int block_size_;
    int max_blocks_;
    
    // block management
    std::vector<CacheBlock<T>*> all_blocks_;        // every block
    CacheBlock<T>* free_list_head_;                 // free-block list
    std::unordered_map<int, CacheBlock<T>*> sequence_blocks_;  // sequence -> head block
    
    // statistics
    std::atomic<int> allocated_blocks_;
    std::atomic<int> peak_blocks_;
    
public:
    KVCacheManager(DeviceMemoryPool* pool, int num_layers, int num_heads, 
                   int head_size, int block_size, int max_blocks)
        : memory_pool_(pool), num_layers_(num_layers), num_heads_(num_heads),
          head_size_(head_size), block_size_(block_size), max_blocks_(max_blocks),
          free_list_head_(nullptr), allocated_blocks_(0), peak_blocks_(0) {
        
        initialize_blocks();
    }
    
    // release the per-block allocations made in initialize_blocks()
    ~KVCacheManager() {
        for (CacheBlock<T>* b : all_blocks_) {
            memory_pool_->deallocate(b->key_data);
            memory_pool_->deallocate(b->value_data);
            delete b;
        }
    }
    
    // allocate a block for a sequence
    CacheBlock<T>* allocate_block(int sequence_id) {
        if (!free_list_head_) {
            return nullptr;  // no block available
        }
        
        // pop a block off the free list
        CacheBlock<T>* block = free_list_head_;
        free_list_head_ = block->next_free;
        
        // initialize the block
        block->sequence_id = sequence_id;
        block->start_token_idx = 0;
        block->num_tokens = 0;
        block->is_allocated = true;
        block->is_full = false;
        block->ref_count = 1;
        block->next_free = nullptr;
        
        // link into the sequence chain as the new head; free_sequence_blocks
        // walks next_in_sequence, so existing blocks must stay reachable
        auto it = sequence_blocks_.find(sequence_id);
        if (it != sequence_blocks_.end()) {
            block->next_in_sequence = it->second;
            it->second->prev_in_sequence = block;
        }
        sequence_blocks_[sequence_id] = block;
        
        // update statistics
        allocated_blocks_++;
        peak_blocks_ = std::max(peak_blocks_.load(), allocated_blocks_.load());
        
        return block;
    }
    
    // release every block owned by a sequence
    void free_sequence_blocks(int sequence_id) {
        auto it = sequence_blocks_.find(sequence_id);
        if (it == sequence_blocks_.end()) {
            return;
        }
        
        CacheBlock<T>* current = it->second;
        while (current) {
            CacheBlock<T>* next = current->next_in_sequence;
            
            // reset block state
            current->sequence_id = -1;
            current->num_tokens = 0;
            current->is_allocated = false;
            current->is_full = false;
            current->ref_count = 0;
            current->next_in_sequence = nullptr;
            current->prev_in_sequence = nullptr;
            
            // push back onto the free list
            current->next_free = free_list_head_;
            free_list_head_ = current;
            
            allocated_blocks_--;
            current = next;
        }
        
        sequence_blocks_.erase(it);
    }
    
    // cache statistics
    struct CacheStats {
        int total_blocks;
        int allocated_blocks;
        int free_blocks;
        int peak_blocks;
        float utilization_ratio;
        size_t memory_usage;
    };
    
    CacheStats get_cache_stats() const {
        CacheStats stats;
        stats.total_blocks = max_blocks_;
        stats.allocated_blocks = allocated_blocks_;
        stats.free_blocks = max_blocks_ - allocated_blocks_;
        stats.peak_blocks = peak_blocks_;
        stats.utilization_ratio = static_cast<float>(allocated_blocks_) / max_blocks_;
        
        // memory footprint: key + value across all layers of each block
        size_t block_memory = static_cast<size_t>(num_layers_) * num_heads_ * head_size_ * block_size_ * sizeof(T) * 2;  // *2 for key and value
        stats.memory_usage = allocated_blocks_ * block_memory;
        
        return stats;
    }
    
private:
    // carve out and initialize every block up front
    void initialize_blocks() {
        all_blocks_.resize(max_blocks_);
        
        for (int i = 0; i < max_blocks_; ++i) {
            all_blocks_[i] = new CacheBlock<T>();
            all_blocks_[i]->block_id = i;
            all_blocks_[i]->capacity = block_size_;
            
            // allocate key and value storage (all layers of this block)
            size_t kv_size = static_cast<size_t>(num_layers_) * num_heads_ * head_size_ * block_size_ * sizeof(T);
            all_blocks_[i]->key_data = static_cast<T*>(memory_pool_->allocate(kv_size));
            all_blocks_[i]->value_data = static_cast<T*>(memory_pool_->allocate(kv_size));
            
            // push onto the free list
            all_blocks_[i]->next_free = free_list_head_;
            free_list_head_ = all_blocks_[i];
        }
    }
};
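A sketch of the intended lifecycle, with illustrative dimensions: take blocks as a sequence grows, then release them all when it finishes.

void demo_kv_cache() {
    DeviceMemoryPool pool(1ull << 30);
    KVCacheManager<half> mgr(&pool, /*num_layers=*/32, /*num_heads=*/32,
                             /*head_size=*/128, /*block_size=*/16,
                             /*max_blocks=*/1024);

    const int seq_id = 7;
    CacheBlock<half>* block = mgr.allocate_block(seq_id);  // first block
    if (block && block->is_full) {
        block = mgr.allocate_block(seq_id);  // chain a new block once full
    }

    mgr.free_sequence_blocks(seq_id);        // sequence finished
    auto stats = mgr.get_cache_stats();      // utilization drops back to 0
    (void)stats;
}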

Performance Monitoring Data Structures

1. Kernel Performance Statistics

#include <vector>
#include <algorithm>
#include <cfloat>   // FLT_MAX
#include <cmath>    // std::sqrt

// per-kernel execution statistics
struct KernelProfiler {
    // timing
    std::vector<float> execution_times;    // recent execution times (sliding window)
    float total_time;                      // cumulative execution time
    float avg_time;                        // average execution time
    float min_time;                        // minimum execution time
    float max_time;                        // maximum execution time
    
    // call counters
    int num_calls;                         // number of invocations
    int num_failures;                      // number of failures
    
    // resource usage
    int registers_per_thread;              // registers per thread
    int shared_memory_per_block;           // shared memory per block
    float occupancy;                       // occupancy
    
    // memory bandwidth
    size_t bytes_read;                     // bytes read
    size_t bytes_written;                  // bytes written
    float memory_bandwidth_gbps;           // memory bandwidth in GB/s
    
    KernelProfiler() : total_time(0.0f), avg_time(0.0f), min_time(FLT_MAX), max_time(0.0f),
                       num_calls(0), num_failures(0), registers_per_thread(0),
                       shared_memory_per_block(0), occupancy(0.0f),
                       bytes_read(0), bytes_written(0), memory_bandwidth_gbps(0.0f) {
        execution_times.reserve(1000);  // preallocate the sample window
    }
    
    // record one execution-time sample
    void record_execution_time(float time_ms) {
        execution_times.push_back(time_ms);
        total_time += time_ms;
        num_calls++;
        
        min_time = std::min(min_time, time_ms);
        max_time = std::max(max_time, time_ms);
        avg_time = total_time / num_calls;
        
        // keep only the most recent samples; the aggregate stats above
        // remain cumulative over all calls
        if (execution_times.size() > 1000) {
            execution_times.erase(execution_times.begin());
        }
    }
    
    // record a memory-traffic sample
    void record_memory_access(size_t read_bytes, size_t write_bytes, float time_ms) {
        bytes_read += read_bytes;
        bytes_written += write_bytes;
        
        // effective bandwidth of this call: bytes / seconds
        if (time_ms > 0) {
            float total_bytes = static_cast<float>(read_bytes + write_bytes);
            float bandwidth = (total_bytes / (1024 * 1024 * 1024)) / (time_ms / 1000.0f);
            memory_bandwidth_gbps = bandwidth;
        }
    }
    
    // summary of the collected statistics
    struct ProfileSummary {
        float avg_time_ms;
        float min_time_ms;
        float max_time_ms;
        float std_dev_ms;
        int total_calls;
        float success_rate;
        float memory_bandwidth_gbps;
        float occupancy;
    };
    
    ProfileSummary get_summary() const {
        ProfileSummary summary;
        summary.avg_time_ms = avg_time;
        summary.min_time_ms = (num_calls > 0) ? min_time : 0.0f;  // avoid reporting FLT_MAX before any call
        summary.max_time_ms = max_time;
        summary.total_calls = num_calls;
        summary.success_rate = (num_calls > 0) ? 1.0f - static_cast<float>(num_failures) / num_calls : 0.0f;
        summary.memory_bandwidth_gbps = memory_bandwidth_gbps;
        summary.occupancy = occupancy;
        
        // standard deviation over the retained window
        if (!execution_times.empty()) {
            float variance = 0.0f;
            for (float time : execution_times) {
                float diff = time - avg_time;
                variance += diff * diff;
            }
            variance /= execution_times.size();
            summary.std_dev_ms = std::sqrt(variance);
        } else {
            summary.std_dev_ms = 0.0f;
        }
        
        return summary;
    }
};
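Feeding a profiler from the CUDA events in KernelParams; a sketch reusing the hypothetical dummy_kernel from the earlier example:

void profile_launch(KernelParams& params, KernelProfiler& prof,
                    float* d_data, int n) {
    cudaEventRecord(params.start_event, params.stream);
    dummy_kernel<<<params.grid_size, params.block_size, 0, params.stream>>>(d_data, n);
    cudaEventRecord(params.end_event, params.stream);
    cudaEventSynchronize(params.end_event);

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, params.start_event, params.end_event);
    prof.record_execution_time(ms);
    // dummy_kernel reads and writes each element once: n * sizeof(float) each way
    prof.record_memory_access(n * sizeof(float), n * sizeof(float), ms);
}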

2. Global Performance Monitoring

#include <unordered_map>
#include <memory>
#include <mutex>
#include <atomic>
#include <sstream>
#include <string>

// global performance monitor
class GlobalProfiler {
private:
    // per-kernel profilers
    std::unordered_map<std::string, std::unique_ptr<KernelProfiler>> kernel_profilers_;
    
    // system-level statistics
    std::atomic<size_t> total_gpu_memory_allocated_;
    std::atomic<size_t> peak_gpu_memory_usage_;
    std::atomic<int> active_cuda_streams_;
    
    // thread safety; mutable so const accessors can lock without const_cast
    mutable std::mutex profiler_mutex_;
    
public:
    // Meyers singleton
    static GlobalProfiler& instance() {
        static GlobalProfiler instance;
        return instance;
    }
    
    // get, or lazily create, the profiler for a kernel
    KernelProfiler* get_kernel_profiler(const std::string& kernel_name) {
        std::lock_guard<std::mutex> lock(profiler_mutex_);
        
        auto it = kernel_profilers_.find(kernel_name);
        if (it == kernel_profilers_.end()) {
            it = kernel_profilers_.emplace(kernel_name, std::make_unique<KernelProfiler>()).first;
        }
        
        return it->second.get();
    }
    
    // record a memory allocation
    void record_memory_allocation(size_t bytes) {
        total_gpu_memory_allocated_ += bytes;
        size_t current_usage = total_gpu_memory_allocated_.load();
        
        // atomically raise the peak if we exceeded it (CAS loop)
        size_t expected_peak = peak_gpu_memory_usage_.load();
        while (current_usage > expected_peak && 
               !peak_gpu_memory_usage_.compare_exchange_weak(expected_peak, current_usage)) {
            // retry until the update sticks
        }
    }
    
    // record a memory deallocation
    void record_memory_deallocation(size_t bytes) {
        total_gpu_memory_allocated_ -= bytes;
    }
    
    // global statistics
    struct GlobalStats {
        size_t total_memory_allocated;
        size_t peak_memory_usage;
        int active_streams;
        int num_kernel_types;
        float total_kernel_time;
        std::vector<std::pair<std::string, KernelProfiler::ProfileSummary>> kernel_summaries;
    };
    
    GlobalStats get_global_stats() const {
        std::lock_guard<std::mutex> lock(profiler_mutex_);
        
        GlobalStats stats;
        stats.total_memory_allocated = total_gpu_memory_allocated_;
        stats.peak_memory_usage = peak_gpu_memory_usage_;
        stats.active_streams = active_cuda_streams_;
        stats.num_kernel_types = static_cast<int>(kernel_profilers_.size());
        stats.total_kernel_time = 0.0f;
        
        // collect per-kernel summaries
        for (const auto& pair : kernel_profilers_) {
            auto summary = pair.second->get_summary();
            stats.kernel_summaries.emplace_back(pair.first, summary);
            // avg * calls reconstructs the cumulative time for this kernel
            stats.total_kernel_time += summary.avg_time_ms * summary.total_calls;
        }
        
        return stats;
    }
    
    // build a human-readable performance report
    std::string generate_performance_report() const {
        auto stats = get_global_stats();
        std::stringstream report;
        
        report << "=== vLLM CSRC Performance Report ===\n";
        report << "Total GPU Memory: " << stats.total_memory_allocated / (1024*1024) << " MB\n";
        report << "Peak Memory Usage: " << stats.peak_memory_usage / (1024*1024) << " MB\n";
        report << "Active CUDA Streams: " << stats.active_streams << "\n";
        report << "Total Kernel Time: " << stats.total_kernel_time << " ms\n\n";
        
        report << "Kernel Performance:\n";
        for (const auto& kernel_stat : stats.kernel_summaries) {
            const auto& summary = kernel_stat.second;
            report << "  " << kernel_stat.first << ":\n";
            report << "    Calls: " << summary.total_calls << "\n";
            report << "    Avg Time: " << summary.avg_time_ms << " ms\n";
            report << "    Min/Max: " << summary.min_time_ms << "/" << summary.max_time_ms << " ms\n";
            report << "    Success Rate: " << (summary.success_rate * 100) << "%\n";
            report << "    Memory Bandwidth: " << summary.memory_bandwidth_gbps << " GB/s\n";
            report << "    Occupancy: " << (summary.occupancy * 100) << "%\n\n";
        }
        
        return report.str();
    }
};
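A sketch of call-site instrumentation, under the assumption that kernel names are free-form strings; the name and numbers below are illustrative:

void instrumented_call() {
    KernelProfiler* prof =
        GlobalProfiler::instance().get_kernel_profiler("paged_attention_v1");
    prof->record_execution_time(0.42f);   // ms, as measured via CUDA events

    GlobalProfiler::instance().record_memory_allocation(64ull << 20);  // 64 MiB
    std::string report = GlobalProfiler::instance().generate_performance_report();
    (void)report;
}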

Together, these data structures give the CSRC module a complete scheme for memory management, parameter passing, and performance monitoring, supporting efficient GPU computation and resource management.