namespace at::native::cudnn {
// cuDNN handle management (based on the resource-management analysis)
class CuDNNHandlePool {
private:
struct HandleEntry {
cudnnHandle_t handle;
c10::cuda::CUDAStream stream;
bool in_use;
HandleEntry() : handle(nullptr), stream(c10::cuda::getCurrentCUDAStream()), in_use(false) {
C10_CUDNN_CHECK(cudnnCreate(&handle));
}
~HandleEntry() {
if (handle) {
cudnnDestroy(handle);
}
}
};
// Per-device, per-stream handle pools
std::unordered_map<c10::DeviceIndex,
std::unordered_map<c10::cuda::CUDAStream,
std::vector<std::unique_ptr<HandleEntry>>>> handle_pools_;
std::mutex pool_mutex_;
public:
// Acquire a cuDNN handle bound to the given stream
cudnnHandle_t acquire_handle(c10::cuda::CUDAStream stream) {
auto device = c10::cuda::current_device();
std::lock_guard<std::mutex> lock(pool_mutex_);
auto& device_pools = handle_pools_[device];
auto& stream_pool = device_pools[stream];
// Look for an idle handle in the pool
for (auto& entry : stream_pool) {
if (!entry->in_use) {
entry->in_use = true;
entry->stream = stream;
// Bind the handle to the requested stream
C10_CUDNN_CHECK(cudnnSetStream(entry->handle, stream));
return entry->handle;
}
}
// No idle handle available: create a new one
auto new_entry = std::make_unique<HandleEntry>();
new_entry->in_use = true;
new_entry->stream = stream;
C10_CUDNN_CHECK(cudnnSetStream(new_entry->handle, stream));
auto handle = new_entry->handle;
stream_pool.push_back(std::move(new_entry));
return handle;
}
void release_handle(cudnnHandle_t handle) {
std::lock_guard<std::mutex> lock(pool_mutex_);
// Find the handle and mark it as free again
for (auto& [device, device_pools] : handle_pools_) {
for (auto& [stream, stream_pool] : device_pools) {
for (auto& entry : stream_pool) {
if (entry->handle == handle) {
entry->in_use = false;
return;
}
}
}
}
}
};
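// Usage sketch (an illustration added here, not an existing PyTorch class):
// a small RAII guard so callers cannot forget release_handle().
class CuDNNHandleGuard {
public:
CuDNNHandleGuard(CuDNNHandlePool& pool, c10::cuda::CUDAStream stream)
: pool_(pool), handle_(pool.acquire_handle(stream)) {}
~CuDNNHandleGuard() { pool_.release_handle(handle_); }
cudnnHandle_t get() const { return handle_; }
private:
CuDNNHandlePool& pool_;
cudnnHandle_t handle_;
};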
// cuDNN convolution optimization (based on the algorithm-selection analysis)
class CuDNNConvolutionOptimizer {
public:
// Cache of measured algorithm performance
struct AlgorithmCache {
struct CacheKey {
cudnnConvolutionDescriptor_t conv_desc;
cudnnFilterDescriptor_t filter_desc;
cudnnTensorDescriptor_t input_desc;
cudnnTensorDescriptor_t output_desc;
bool operator==(const CacheKey& other) const {
// Simplified: a real implementation would compare the contents of each descriptor
return memcmp(this, &other, sizeof(CacheKey)) == 0;
}
};
struct CacheValue {
cudnnConvolutionFwdAlgo_t algorithm;
size_t workspace_size;
float execution_time;
};
// Hash the descriptor handles by pointer value, consistent with the simplified operator== above
struct CacheKeyHash {
size_t operator()(const CacheKey& key) const {
std::hash<const void*> h;
return h(key.conv_desc) ^ (h(key.filter_desc) << 1) ^ (h(key.input_desc) << 2) ^ (h(key.output_desc) << 3);
}
};
std::unordered_map<CacheKey, CacheValue, CacheKeyHash> cache;
std::mutex cache_mutex;
// Look up a previously measured algorithm for this configuration
std::optional<CacheValue> find_best_algorithm(const CacheKey& key) {
std::lock_guard<std::mutex> lock(cache_mutex);
auto it = cache.find(key);
return (it != cache.end()) ? std::make_optional(it->second) : std::nullopt;
}
// Record the measured algorithm performance
void cache_algorithm(const CacheKey& key, const CacheValue& value) {
std::lock_guard<std::mutex> lock(cache_mutex);
cache[key] = value;
}
};
static AlgorithmCache algorithm_cache_;
// Automatically select the best forward convolution algorithm
static cudnnConvolutionFwdAlgo_t select_best_algorithm(
cudnnHandle_t handle,
const cudnnTensorDescriptor_t input_desc,
const cudnnFilterDescriptor_t filter_desc,
const cudnnConvolutionDescriptor_t conv_desc,
const cudnnTensorDescriptor_t output_desc,
size_t max_workspace_size) {
// Build the cache key
AlgorithmCache::CacheKey key{conv_desc, filter_desc, input_desc, output_desc};
// Check the cache first
if (auto cached = algorithm_cache_.find_best_algorithm(key)) {
if (cached->workspace_size <= max_workspace_size) {
return cached->algorithm;
}
}
// Find and time the candidate algorithms
int algo_count;
C10_CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithmMaxCount(handle, &algo_count));
std::vector<cudnnConvolutionFwdAlgoPerf_t> algo_perfs(algo_count);
int returned_algo_count;
C10_CUDNN_CHECK(cudnnFindConvolutionForwardAlgorithm(
handle, input_desc, filter_desc, conv_desc, output_desc,
algo_count, &returned_algo_count, algo_perfs.data()
));
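// cudnnFindConvolutionForwardAlgorithm returns its results sorted from fastest
// to slowest, so the first entry below that also fits the workspace limit is
// the fastest viable choice.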
// Pick the fastest algorithm that fits within the workspace limit
for (int i = 0; i < returned_algo_count; ++i) {
const auto& perf = algo_perfs[i];
if (perf.status == CUDNN_STATUS_SUCCESS &&
perf.memory <= max_workspace_size) {
// Cache the result
AlgorithmCache::CacheValue value{
perf.algo, perf.memory, perf.time
};
algorithm_cache_.cache_algorithm(key, value);
return perf.algo;
}
}
// Fall back to the default algorithm
return CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
}
// Workspace management
class WorkspaceManager {
private:
std::unordered_map<c10::DeviceIndex, void*> workspace_ptrs_;
std::unordered_map<c10::DeviceIndex, size_t> workspace_sizes_;
std::mutex workspace_mutex_;
public:
void* get_workspace(c10::DeviceIndex device, size_t required_size) {
std::lock_guard<std::mutex> lock(workspace_mutex_);
auto size_it = workspace_sizes_.find(device);
auto ptr_it = workspace_ptrs_.find(device);
if (size_it != workspace_sizes_.end() &&
ptr_it != workspace_ptrs_.end() &&
size_it->second >= required_size) {
// The existing workspace is large enough
return ptr_it->second;
}
// Need to allocate a larger workspace
auto allocator = c10::cuda::CUDACachingAllocator::get();
if (ptr_it != workspace_ptrs_.end()) {
// Free the old workspace first
allocator->raw_deallocate(ptr_it->second);
}
// Allocate a new workspace of the required size
void* new_workspace = allocator->raw_allocate(required_size);
workspace_ptrs_[device] = new_workspace;
workspace_sizes_[device] = required_size;
return new_workspace;
}
};
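// Note: the manager only ever grows the cached buffer; a request smaller than
// the current workspace reuses it, trading memory for fewer allocations.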
static WorkspaceManager workspace_manager_;
};
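// Out-of-class definitions for the static members declared above; these are
// required when the members are not declared `inline` (C++17 and later).
CuDNNConvolutionOptimizer::AlgorithmCache CuDNNConvolutionOptimizer::algorithm_cache_;
CuDNNConvolutionOptimizer::WorkspaceManager CuDNNConvolutionOptimizer::workspace_manager_;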
// A complete cuDNN convolution forward implementation
Tensor cudnn_convolution(
const Tensor& input, // [N, C, H, W]
const Tensor& weight, // [O, C/groups, kH, kW]
const c10::optional<Tensor>& bias,
IntArrayRef stride,
IntArrayRef padding,
IntArrayRef dilation,
int64_t groups,
bool benchmark,
bool deterministic) {
// 1. Validate the arguments
checkInputDims(input, weight, bias, stride, padding, dilation, groups);
// 2. Create the cuDNN descriptors
auto input_desc = TensorDescriptor(input);
auto weight_desc = FilterDescriptor(weight);
auto conv_desc = ConvolutionDescriptor(stride, padding, dilation, groups);
// 3. Compute the output shape and allocate the output tensor
auto output_sizes = conv_output_size(
input.sizes(), weight.sizes(), stride, padding, dilation
);
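// For each spatial dim: out = floor((in + 2*pad - dilation*(k - 1) - 1) / stride) + 1,
// e.g. in=224, k=7, pad=3, stride=2, dilation=1 gives out=112.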
auto output = at::empty(output_sizes, input.options());
auto output_desc = TensorDescriptor(output);
// 4. Get a cuDNN handle
auto handle = getCudnnHandle();
// 5. Select the best algorithm
size_t max_workspace_size = getMaxWorkspaceSize();
auto algorithm = CuDNNConvolutionOptimizer::select_best_algorithm(
handle, input_desc.desc(), weight_desc.desc(), conv_desc.desc(),
output_desc.desc(), max_workspace_size
);
// 6. Query the workspace size and obtain a workspace buffer
size_t workspace_size;
C10_CUDNN_CHECK(cudnnGetConvolutionForwardWorkspaceSize(
handle, input_desc.desc(), weight_desc.desc(), conv_desc.desc(),
output_desc.desc(), algorithm, &workspace_size
));
auto workspace = CuDNNConvolutionOptimizer::workspace_manager_
.get_workspace(input.device().index(), workspace_size);
// 7. Run the convolution
AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "cudnn_convolution", [&] {
auto alpha = scalar_to_cudnn_type<scalar_t>(1.0);
auto beta = scalar_to_cudnn_type<scalar_t>(0.0);
C10_CUDNN_CHECK(cudnnConvolutionForward(
handle,
&alpha, input_desc.desc(), input.data_ptr<scalar_t>(),
weight_desc.desc(), weight.data_ptr<scalar_t>(),
conv_desc.desc(), algorithm,
workspace, workspace_size,
&beta, output_desc.desc(), output.mutable_data_ptr<scalar_t>()
));
// Add the bias, if present
if (bias.has_value()) {
auto bias_desc = TensorDescriptor(*bias);
C10_CUDNN_CHECK(cudnnAddTensor(
handle,
&alpha, bias_desc.desc(), bias->data_ptr<scalar_t>(),
&alpha, output_desc.desc(), output.mutable_data_ptr<scalar_t>()
));
}
});
return output;
}
} // namespace at::native::cudnn
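To make the call flow above concrete, here is a minimal caller sketch. It assumes the cudnn_convolution function defined in this listing is declared and linked into the build; the tensor shapes and flag values are purely illustrative.

#include <ATen/ATen.h>
at::Tensor run_example_convolution() {
// An illustrative ResNet-style stem convolution: 7x7 kernel, stride 2, padding 3.
auto input = at::randn({8, 3, 224, 224}, at::TensorOptions().device(at::kCUDA).dtype(at::kFloat));
auto weight = at::randn({64, 3, 7, 7}, at::TensorOptions().device(at::kCUDA).dtype(at::kFloat));
auto bias = at::randn({64}, at::TensorOptions().device(at::kCUDA).dtype(at::kFloat));
// Output shape: [8, 64, 112, 112], per the output-size formula noted in step 3.
return at::native::cudnn::cudnn_convolution(
input, weight, bias,
/*stride=*/{2, 2}, /*padding=*/{3, 3}, /*dilation=*/{1, 1},
/*groups=*/1, /*benchmark=*/false, /*deterministic=*/false);
}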