PyTorch-08: Data Structure UML and Interaction Diagrams#
Core Data Structure Relationships#
Complete System UML Class Diagram#
classDiagram
%% c10 core layer
class intrusive_ptr_target {
<<interface>>
+std::atomic~uint64_t~ combined_refcount_
+uint32_t refcount()
+uint32_t weakcount()
+void incref()
+void decref()
}
class TensorImpl {
-Storage storage_
-SizesAndStrides sizes_and_strides_
-int64_t storage_offset_
-int64_t numel_
-TypeMeta data_type_
-optional~Device~ device_opt_
-DispatchKeySet key_set_
-VariableVersion version_counter_
-unique_ptr~AutogradMetaInterface~ autograd_meta_
-unique_ptr~ExtraMeta~ extra_meta_
-PyObjectSlot pyobj_slot_
+IntArrayRef sizes()
+IntArrayRef strides()
+int64_t dim()
+int64_t numel()
+ScalarType dtype()
+Device device()
+Storage storage()
+bool is_contiguous()
+void set_sizes_contiguous(IntArrayRef)
+TensorImpl* as_view()
}
class StorageImpl {
-DataPtr data_ptr_
-SymInt size_bytes_
-Allocator* allocator_
-bool resizable_
-bool received_cuda_
+const DataPtr& data_ptr()
+DataPtr& mutable_data_ptr()
+size_t nbytes()
+bool resizable()
+void set_nbytes(size_t)
+void reset()
}
class Allocator {
<<interface>>
+virtual DataPtr allocate(size_t) = 0
+virtual void copy_data(void*, const void*, size_t) = 0
+virtual DeleterFnPtr raw_deleter()
+DataPtr clone(const void*, size_t)
}
class CPUAllocator {
+DataPtr allocate(size_t) override
+void copy_data(void*, const void*, size_t) override
}
class CUDACachingAllocator {
-DeviceAllocator base_allocator_
-std::vector~Block*~ free_blocks_
-std::unordered_set~Block*~ active_blocks_
+DataPtr allocate(size_t) override
+void free_block(Block*)
+void empty_cache()
+DeviceStats get_stats()
}
class DispatchKeySet {
-uint64_t repr_
+bool has(DispatchKey)
+DispatchKeySet add(DispatchKey)
+DispatchKeySet remove(DispatchKey)
+DispatchKey highestPriorityTypeId()
+int getDispatchTableIndexForDispatchKeySet()
}
%% ATen layer
class Tensor {
-intrusive_ptr~TensorImpl~ impl_
+Tensor add(const Tensor&, const Scalar&)
+Tensor& add_(const Tensor&, const Scalar&)
+Tensor matmul(const Tensor&)
+Tensor view(IntArrayRef)
+Tensor transpose(int64_t, int64_t)
+void backward()
+Tensor grad()
}
class TensorBase {
-intrusive_ptr~TensorImpl~ impl_
+IntArrayRef sizes()
+IntArrayRef strides()
+int64_t numel()
+ScalarType dtype()
+Device device()
+bool is_same(const TensorBase&)
}
class Dispatcher {
-std::array~OperatorEntry~ operators_
-std::vector~BackendFallbackKernel~ backend_fallback_kernels_
+static Dispatcher& singleton()
+Return call(const TypedOperatorHandle~Return(Args...)~&, Args...)
+RegistrationHandleRAII registerDef(FunctionSchema)
+RegistrationHandleRAII registerImpl(OperatorHandle, DispatchKey, KernelFunction)
}
class OperatorEntry {
-FunctionSchema schema_
-std::array~AnnotatedKernel~ dispatchTable_
-DispatchKeyExtractor dispatchKeyExtractor_
+const KernelFunction& lookup(DispatchKeySet)
+bool hasKernelForDispatchKey(DispatchKey)
+void updateDispatchTable(DispatchKey, KernelFunction)
}
class TensorIterator {
-SmallVector~OperandInfo~ operands_
-DimVector shape_
-int64_t numel_
-bool is_reduction_
-bool all_ops_same_shape_
+void add_output(const Tensor&)
+void add_input(const Tensor&)
+TensorIteratorConfig& build()
+char* data_ptr(int arg)
+void for_each(loop2d_t loop)
}
%% Autograd layer
class Node {
<<interface>>
+edge_list next_edges_
+std::vector~SavedVariable~ saved_variables_
+uint64_t sequence_nr_
+virtual variable_list apply(variable_list&&) = 0
+virtual std::string name() = 0
+void save_variables(TensorList)
}
class AddBackward0 {
+variable_list apply(variable_list&&) override
+std::string name() override
}
class MulBackward0 {
+SavedVariable saved_self
+SavedVariable saved_other
+variable_list apply(variable_list&&) override
+std::string name() override
}
class AutogradMeta {
-Variable grad_
-std::shared_ptr~Node~ grad_fn_
-std::weak_ptr~Node~ grad_accumulator_
-std::vector~std::shared_ptr~FunctionPreHook~~ hooks_
-bool requires_grad_
-bool is_view_
+void set_requires_grad(bool, TensorImpl*)
+bool requires_grad()
+Variable& mutable_grad()
+const std::shared_ptr~Node~& grad_fn()
}
class Engine {
-std::vector~std::shared_ptr~ReadyQueue~~ ready_queues_
-std::vector~std::thread~ workers_
+static Engine& get_default_engine()
+variable_list execute(const edge_list&, const variable_list&, bool, bool, const edge_list&)
+void thread_main(std::shared_ptr~GraphTask~)
}
%% torch.nn layer
class Module {
-OrderedDict~std::string, std::shared_ptr~Module~~ _modules
-OrderedDict~std::string, Parameter~ _parameters
-OrderedDict~std::string, Tensor~ _buffers
-bool training_
+virtual Tensor forward(TensorList) = 0
+Tensor operator()(TensorList)
+void train(bool)
+void eval()
+std::vector~Tensor~ parameters()
+void to(Device)
}
class Linear {
+Parameter weight
+Parameter bias
+int64_t in_features
+int64_t out_features
+Tensor forward(const Tensor&) override
+void reset_parameters()
}
class Conv2d {
+Parameter weight
+Parameter bias
+int64_t in_channels
+int64_t out_channels
+std::pair~int64_t, int64_t~ kernel_size
+Tensor forward(const Tensor&) override
}
%% Inheritance relationships
intrusive_ptr_target <|-- TensorImpl
intrusive_ptr_target <|-- StorageImpl
Allocator <|-- CPUAllocator
Allocator <|-- CUDACachingAllocator
TensorBase <|-- Tensor
Node <|-- AddBackward0
Node <|-- MulBackward0
Module <|-- Linear
Module <|-- Conv2d
%% Composition relationships
TensorImpl *-- StorageImpl : storage_
TensorImpl *-- DispatchKeySet : key_set_
TensorImpl *-- AutogradMeta : autograd_meta_
StorageImpl *-- Allocator : allocator_
Tensor *-- TensorImpl : impl_
Dispatcher *-- OperatorEntry : operators_
AutogradMeta *-- Node : grad_fn_
%% Dependency relationships
TensorIterator ..> Tensor : operates on
Engine ..> Node : executes
Linear ..> Tensor : processes
Conv2d ..> Tensor : processes
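The ownership chain Tensor → TensorImpl → StorageImpl in the diagram can be observed from Python: a view creates a new TensorImpl with its own sizes and strides but shares the same StorageImpl. A minimal sketch, assuming PyTorch 2.x where `Tensor.untyped_storage()` is available:

```python
import torch

t = torch.arange(12, dtype=torch.float32)
v = t.view(3, 4)          # new TensorImpl (own sizes/strides), same StorageImpl
w = v.transpose(0, 1)     # another view: strides change, storage stays shared

# All three tensors point at the same underlying StorageImpl buffer.
print(t.untyped_storage().data_ptr() == v.untyped_storage().data_ptr())  # True
print(v.untyped_storage().data_ptr() == w.untyped_storage().data_ptr())  # True

# sizes_and_strides_ / is_contiguous() live on the TensorImpl, not the storage.
print(v.size(), v.stride(), v.is_contiguous())   # torch.Size([3, 4]) (4, 1) True
print(w.size(), w.stride(), w.is_contiguous())   # torch.Size([4, 3]) (1, 4) False
```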
Module Interaction State Diagram#
stateDiagram-v2
[*] --> TensorCreated
TensorCreated --> WithGrad : requires_grad=True
TensorCreated --> WithoutGrad : requires_grad=False
WithGrad --> ComputationGraph : participates in an op
WithoutGrad --> SimpleComputation : participates in an op
ComputationGraph --> GraphBuilt : graph construction complete
SimpleComputation --> ResultReady : computation complete
GraphBuilt --> BackwardReady : backward() called
BackwardReady --> GradientsComputed : gradients computed
GradientsComputed --> OptimizationStep : optimizer.step() called
OptimizationStep --> ParametersUpdated : parameter update complete
ParametersUpdated --> [*] : training step finished
ResultReady --> [*] : inference finished
note right of ComputationGraph
Build the autograd graph
Create Function nodes
Save the forward context
end note
note right of BackwardReady
Start the Engine
Traverse the computation graph
Compute gradients
end note
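The states above map directly onto observable tensor attributes. A minimal sketch of one training step in Python (the SGD optimizer here is an illustrative choice, not part of the diagram):

```python
import torch

w = torch.randn(3, requires_grad=True)   # TensorCreated -> WithGrad
x = torch.randn(3)                       # TensorCreated -> WithoutGrad

loss = (w * x).sum()                     # WithGrad participates in an op: ComputationGraph
print(loss.grad_fn)                      # graph built, e.g. <SumBackward0 ...>

loss.backward()                          # GraphBuilt -> BackwardReady -> GradientsComputed
print(w.grad)                            # gradients accumulated into .grad

opt = torch.optim.SGD([w], lr=0.1)
opt.step()                               # OptimizationStep -> ParametersUpdated
opt.zero_grad()                          # ready for the next training step
```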
Memory Management Lifecycle Diagram#
stateDiagram-v2
[*] --> Allocated : Allocator::allocate()
Allocated --> InUse : Tensor created
InUse --> Shared : shared by multiple Tensors
InUse --> Released : refcount reaches 0
Shared --> InUse : other references released
Shared --> Released : all references released
Released --> Cached : cached by the CachingAllocator
Released --> Deallocated : freed directly
Cached --> InUse : cache hit, block reused
Cached --> Deallocated : freed under memory pressure
Deallocated --> [*]
note right of Cached
GPU memory pool
Reduces cudaMalloc calls
Blocks bucketed by size
end note
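The Shared/Released transitions can be seen by letting two tensors reference one StorageImpl. A small sketch with CPU tensors (so deallocation is immediate rather than going through the CUDA caching pool):

```python
import torch

a = torch.zeros(1000)
b = a.view(10, 100)       # InUse -> Shared: two TensorImpls, one StorageImpl
print(a.untyped_storage().data_ptr() == b.untyped_storage().data_ptr())  # True

del a                     # Shared -> InUse: one strong reference remains
print(b.sum().item())     # storage is still alive, data intact

del b                     # refcount reaches 0 -> Released -> Deallocated
                          # (on CUDA the block would go to the caching pool instead)
```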
Key Interaction Sequence Diagrams#
Complete Sequence for Training One Batch#
sequenceDiagram
autonumber
participant App as Application code
participant DataLoader as DataLoader
participant Model as nn.Module
participant Tensor as Tensor
participant Autograd as Autograd Engine
participant Optimizer as Optimizer
participant Allocator as GPU Allocator
App->>DataLoader: next(iter(dataloader))
DataLoader->>DataLoader: Load batch data
DataLoader->>Allocator: Allocate GPU memory
Allocator-->>DataLoader: Return memory address
DataLoader-->>App: Batch data
App->>Model: model(batch_input)
loop Forward pass
Model->>Tensor: Operator calls
Tensor->>Tensor: Build autograd graph
Note over Tensor: Create Function nodes<br/>Save forward context
end
Model-->>App: Loss value
App->>Autograd: loss.backward()
Autograd->>Autograd: Create GraphTask
Autograd->>Autograd: Topological sort
loop Backward pass
Autograd->>Autograd: Execute Function::apply()
Note over Autograd: Compute gradients<br/>Propagate to inputs
end
Autograd->>Tensor: Accumulate gradients into .grad
Autograd-->>App: Backward pass complete
App->>Optimizer: optimizer.step()
Optimizer->>Tensor: Update parameters
Optimizer-->>App: Parameter update complete
App->>Model: model.zero_grad()
Model->>Tensor: Zero out gradients
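Steps 1-20 above correspond to an ordinary training loop. A minimal sketch with a toy model and dataset (all names here are illustrative, not part of the diagram):

```python
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset

device = "cuda" if torch.cuda.is_available() else "cpu"
dataset = TensorDataset(torch.randn(256, 10), torch.randn(256, 1))
loader = DataLoader(dataset, batch_size=32)
model = nn.Linear(10, 1).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
loss_fn = nn.MSELoss()

for xb, yb in loader:                       # steps 1-5: DataLoader produces a batch
    xb, yb = xb.to(device), yb.to(device)   # batch data lands in GPU memory
    loss = loss_fn(model(xb), yb)           # steps 6-10: forward pass builds the graph
    loss.backward()                         # steps 11-15: Engine computes/accumulates grads
    optimizer.step()                        # steps 16-18: parameters updated
    model.zero_grad()                       # steps 19-20: gradients cleared
```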
Detailed Operator Dispatch Sequence#
sequenceDiagram
autonumber
participant User as User call
participant PyBind as Python bindings
participant Dispatcher as Dispatcher
participant OpEntry as OperatorEntry
participant Kernel as Concrete kernel
participant CUDA as CUDA Runtime
User->>PyBind: tensor.add(other)
PyBind->>PyBind: Parse Python arguments
PyBind->>Dispatcher: at::add(self, other, alpha)
Dispatcher->>Dispatcher: Extract DispatchKeySet
Note over Dispatcher: self.key_set() | other.key_set()<br/>{CUDA, AutogradCUDA}
Dispatcher->>OpEntry: lookup(DispatchKeySet)
OpEntry->>OpEntry: getDispatchTableIndex()
Note over OpEntry: Compute table index with bit operations<br/>Look up the KernelFunction
OpEntry-->>Dispatcher: KernelFunction
Dispatcher->>Kernel: call(op, DispatchKeySet, args)
alt Autograd mode
Kernel->>Kernel: AutogradCUDA kernel
Note over Kernel: Record AddBackward node<br/>Save forward context
Kernel->>Dispatcher: Redispatch with Autograd keys removed
Dispatcher->>Kernel: CUDA kernel
else Inference mode
Kernel->>Kernel: CUDA kernel
end
Kernel->>CUDA: cudaLaunchKernel()
CUDA->>CUDA: GPU executes the kernel
CUDA-->>Kernel: Execution complete
Kernel-->>Dispatcher: Return result
Dispatcher-->>PyBind: Return result
PyBind-->>User: Python Tensor object
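One way to watch this dispatch from Python is a `TorchDispatchMode`, which intercepts every ATen operator as it passes through the dispatcher's Python dispatch key. A sketch, assuming `torch.utils._python_dispatch` as shipped in recent PyTorch releases:

```python
import torch
from torch.utils._python_dispatch import TorchDispatchMode

class LogOps(TorchDispatchMode):
    def __torch_dispatch__(self, func, types, args=(), kwargs=None):
        print(f"dispatched: {func}")          # e.g. aten.add.Tensor
        return func(*args, **(kwargs or {}))  # forward to the real kernel

a = torch.randn(2, 2)
b = torch.randn(2, 2)
with LogOps():
    c = a.add(b)    # the dispatcher resolves aten::add; the mode logs it here
```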
Memory Allocation and Reclamation Sequence#
sequenceDiagram
autonumber
participant Tensor as Tensor creation
participant Storage as StorageImpl
participant Allocator as CUDACachingAllocator
participant Pool as Memory pool
participant CUDA as CUDA Driver
participant GC as Garbage collection
Tensor->>Storage: Create StorageImpl
Storage->>Allocator: allocate(nbytes)
Allocator->>Pool: Look up a free block
alt Cache hit
Pool-->>Allocator: Return cached block
else Cache miss
Allocator->>CUDA: cudaMalloc(nbytes)
CUDA-->>Allocator: device_ptr
Allocator->>Pool: Record the newly allocated block
end
Allocator-->>Storage: DataPtr
Storage-->>Tensor: Creation complete
Note over Tensor: Tensor in use...
Tensor->>GC: Refcount reaches 0
GC->>Storage: ~StorageImpl()
Storage->>Allocator: DataPtr析构
Allocator->>Pool: Return block to the free list
Note over Pool: Deferred release<br/>Waits for memory pressure
opt Under high memory pressure
Pool->>CUDA: cudaFree(device_ptr)
CUDA-->>Pool: Free complete
end
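The cache hit/miss behaviour is visible through the allocator statistics: `torch.cuda.memory_allocated()` tracks live blocks while `torch.cuda.memory_reserved()` tracks segments obtained via cudaMalloc. A sketch; the exact equalities depend on allocator rounding, so treat them as typical rather than guaranteed:

```python
import torch

if torch.cuda.is_available():
    x = torch.empty(4 << 20, device="cuda")          # likely a cache miss: the pool calls cudaMalloc
    reserved = torch.cuda.memory_reserved()

    del x                                            # block returns to the free list, not cudaFree'd
    y = torch.empty(4 << 20, device="cuda")          # same size: expected cache hit, block reused
    print(torch.cuda.memory_reserved() == reserved)  # typically True: no new segment allocated

    del y
    torch.cuda.empty_cache()                         # manual "memory pressure": cudaFree cached segments
    print(torch.cuda.memory_reserved() <= reserved)
```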
Performance Bottleneck Analysis#
CPU Time Distribution#
pie title CPU time distribution (training one epoch)
"CUDA Kernels" : 65.2
"Python解释器" : 12.3
"内存管理" : 8.7
"Dispatcher分发" : 6.1
"Autograd引擎" : 4.2
"数据加载" : 2.8
"其他" : 0.7
内存使用分布图#
pie title GPU内存使用分布
"模型参数" : 35.5
"激活值" : 28.2
"梯度" : 15.8
"优化器状态" : 12.3
"临时缓冲区" : 6.1
"内存池开销" : 2.1
调用频次热力图#
graph TD
A[at::add - 45.2%] --> A1[add_cpu: 12.3%]
A --> A2[add_cuda: 32.9%]
B[at::matmul - 23.1%] --> B1[mm_cpu: 3.2%]
B --> B2[mm_cuda: 19.9%]
C[at::conv2d - 18.7%] --> C1[cudnn_conv: 18.7%]
D[autograd::backward - 8.9%] --> D1[Engine::execute: 8.9%]
E[Other operators - 4.1%]
style A fill:#ff6b6b
style B fill:#ff8e53
style C fill:#ff8e53
style D fill:#4ecdc4
style E fill:#45b7d1
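Distributions like the ones above are usually collected with `torch.profiler`. A minimal sketch that gathers per-operator CPU/CUDA time for one forward/backward pass; the percentages in the charts are illustrative example values, not measurements reproduced by this snippet:

```python
import torch
from torch.profiler import profile, ProfilerActivity

model = torch.nn.Sequential(torch.nn.Linear(256, 256), torch.nn.ReLU())
x = torch.randn(64, 256)

activities = [ProfilerActivity.CPU]
if torch.cuda.is_available():
    model, x = model.cuda(), x.cuda()
    activities.append(ProfilerActivity.CUDA)

with profile(activities=activities, record_shapes=True) as prof:
    model(x).sum().backward()

# Per-operator time table: the raw data behind pie charts like the ones above.
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))
```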
Data Flow Diagrams#
Training Data Flow#
flowchart LR
subgraph Data loading
A1[Raw data] --> A2[DataLoader]
A2 --> A3[Batch data]
A3 --> A4[GPU memory]
end
subgraph Forward pass
A4 --> B1[Input Layer]
B1 --> B2[Hidden Layers]
B2 --> B3[Output Layer]
B3 --> B4[Loss Function]
end
subgraph Backward pass
B4 --> C1[Loss.backward]
C1 --> C2[Autograd Engine]
C2 --> C3[Gradient computation]
C3 --> C4[Gradient accumulation]
end
subgraph Parameter update
C4 --> D1[Optimizer]
D1 --> D2[Parameter update]
D2 --> D3[Zero gradients]
end
D3 --> A2
style A4 fill:#e1f5ff
style B4 fill:#e8f5e9
style C2 fill:#fff4e1
style D1 fill:#fce4ec
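For the "batch data → GPU memory" edge, pinned host memory plus non_blocking copies let the host-to-device transfer overlap with compute. A sketch; the dataset and sizes are illustrative:

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

dataset = TensorDataset(torch.randn(1024, 3, 32, 32), torch.randint(0, 10, (1024,)))
# pin_memory=True stages batches in page-locked host memory
loader = DataLoader(dataset, batch_size=64, num_workers=2, pin_memory=True)

device = "cuda" if torch.cuda.is_available() else "cpu"
for images, labels in loader:
    # non_blocking=True makes the H2D copy asynchronous when the source is pinned
    images = images.to(device, non_blocking=True)
    labels = labels.to(device, non_blocking=True)
    break  # one batch is enough for the illustration
```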
Memory Data Flow#
flowchart TD
subgraph CPU memory
A1[Python objects] --> A2[numpy arrays]
A2 --> A3[torch.Tensor]
end
subgraph GPU memory
B1[Device Tensor]
B2[Kernel inputs]
B3[Kernel outputs]
B4[Cache pool]
end
subgraph Compute units
C1[CUDA Cores]
C2[Tensor Cores]
C3[cuDNN]
end
A3 -->|"to(device)"| B1
B1 -->|memory layout conversion| B2
B2 -->|general compute| C1
B2 -->|mixed precision| C2
B2 -->|convolution/RNN| C3
C1 --> B3
C2 --> B3
C3 --> B3
B3 -->|result stored| B1
B1 -->|released to cache| B4
B4 -->|cache reuse| B1
B1 -->|"to('cpu')"| A3
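The "mixed precision → Tensor Cores" edge is normally driven by autocast: eligible matmuls and convolutions run in fp16/bf16 so they can map onto Tensor Cores, and results move back to host memory with `.cpu()`. A sketch, assuming a CUDA device (on CPU, autocast uses bfloat16 instead):

```python
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
x = torch.randn(64, 128, device=device)            # A3 -> B1: tensor lives in device memory
layer = torch.nn.Linear(128, 128).to(device)

# B2 -> C2: under autocast, the matmul runs in reduced precision (Tensor Cores on CUDA)
amp_dtype = torch.float16 if device == "cuda" else torch.bfloat16
with torch.autocast(device_type=device, dtype=amp_dtype):
    y = layer(x)

out = y.float().cpu()                              # B1 -> A3: result copied back to host memory
print(out.shape, y.dtype)                          # y.dtype is float16/bfloat16 under autocast
```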