加载中...
加载中...
cuDNN(CUDA Deep Neural Network library)是NVIDIA开发的GPU加速深度神经网络库,为深度学习应用提供了高度优化的基础原语。本文将深入探讨cuDNN的核心功能、API使用、性能优化策略和实际应用案例。
cuDNN是一个基于CUDA的GPU加速库,专门为深度神经网络计算而设计。它提供了以下核心功能:
// Print the cuDNN version this binary was compiled against.
// Fix: the macros CUDNN_MAJOR_VERSION / CUDNN_MINOR_VERSION do not exist in
// cudnn.h — the compile-time macros are CUDNN_MAJOR, CUDNN_MINOR and
// CUDNN_PATCHLEVEL. cudnnGetVersion() additionally reports the version of
// the library actually loaded at run time, which may differ from the headers.
#include <cudnn.h>
void printCudnnVersion() {
    printf("cuDNN compile-time version: %d.%d.%d\n",
           CUDNN_MAJOR, CUDNN_MINOR, CUDNN_PATCHLEVEL);
    printf("cuDNN runtime version: %zu\n", (size_t)cudnnGetVersion());
}
主要版本特性:
// Demo: describe a fully-packed NCHW float tensor (32 x 3 x 224 x 224).
cudnnTensorDescriptor_t tensorDesc;
cudnnCreateTensorDescriptor(&tensorDesc);
// Packed NCHW strides: innermost (width) dimension is contiguous.
const int n = 32, c = 3, h = 224, w = 224;
const int rank = 4;
int dims[4]    = {n, c, h, w};
int strides[4] = {c * h * w, h * w, w, 1};
cudnnSetTensorNdDescriptor(tensorDesc, CUDNN_DATA_FLOAT, rank, dims, strides);
// Descriptors must be released when no longer needed.
cudnnDestroyTensorDescriptor(tensorDesc);
// Demo: the precisions cuDNN supports (each assignment overwrites the last).
cudnnDataType_t dataType;
dataType = CUDNN_DATA_FLOAT;   // 32-bit single-precision float
dataType = CUDNN_DATA_HALF;    // 16-bit half-precision float (FP16)
dataType = CUDNN_DATA_INT8;    // 8-bit quantized integer
dataType = CUDNN_DATA_INT32;   // 32-bit integer
dataType = CUDNN_DATA_DOUBLE;  // 64-bit double-precision float
void performConvolution() {
// 创建句柄
cudnnHandle_t handle;
cudnnCreate(&handle);
// 创建描述符
cudnnTensorDescriptor_t inputDesc, outputDesc;
cudnnFilterDescriptor_t filterDesc;
cudnnConvolutionDescriptor_t convDesc;
// 初始化描述符...
// 选择算法
int requestedAlgoCount = 1;
int returnedAlgoCount;
cudnnConvolutionFwdAlgoPerf_t perfResults;
cudnnFindConvolutionForwardAlgorithm(handle, inputDesc, filterDesc, convDesc,
outputDesc, requestedAlgoCount,
&returnedAlgoCount, &perfResults);
// 分配工作空间
size_t workspaceSize = 0;
cudnnGetConvolutionForwardWorkspaceSize(handle, inputDesc, filterDesc,
convDesc, outputDesc,
perfResults.algo, &workspaceSize);
void* workspace = nullptr;
if (workspaceSize > 0) {
cudaMalloc(&workspace, workspaceSize);
}
// 执行卷积
float alpha = 1.0f, beta = 0.0f;
cudnnConvolutionForward(handle, &alpha, inputDesc, d_input,
filterDesc, d_filter, convDesc,
perfResults.algo, workspace, workspaceSize,
&beta, outputDesc, d_output);
// 清理资源
cudaFree(workspace);
cudnnDestroy(handle);
}
// Backward pass of a convolution layer: propagates gradients w.r.t. the
// input data and w.r.t. the filter weights.
// NOTE(review): handle, alpha, beta, algo, workspace, workspaceSize and the
// descriptors / device buffers are not defined in this snippet — they are
// assumed to come from the surrounding setup code (see performConvolution).
// The `algo` here must be a backward-data / backward-filter algorithm
// respectively, not the forward algorithm — TODO confirm at the call site.
void performConvolutionBackward() {
// Gradient w.r.t. the input: dX = alpha * conv_bwd_data(W, dY) + beta * dX
cudnnConvolutionBackwardData(handle, &alpha, filterDesc, d_filter,
outputDesc, d_output, convDesc,
algo, workspace, workspaceSize,
&beta, inputDesc, d_inputGrad);
// Gradient w.r.t. the filter: dW = alpha * conv_bwd_filter(X, dY) + beta * dW
cudnnConvolutionBackwardFilter(handle, &alpha, inputDesc, d_input,
outputDesc, d_output, convDesc,
algo, workspace, workspaceSize,
&beta, filterDesc, d_filterGrad);
}
cuDNN提供多种卷积算法:
// Forward convolution algorithm choices.
cudnnConvolutionFwdAlgo_t algo;
// Explicit GEMM-based algorithm (general purpose).
algo = CUDNN_CONVOLUTION_FWD_ALGO_GEMM;
// FFT-based algorithm (tends to win for large filter sizes).
algo = CUDNN_CONVOLUTION_FWD_ALGO_FFT;
// Winograd algorithm (fast for small filters such as 3x3).
algo = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD;
// Direct convolution. NOTE(review): this enum value exists but cuDNN does
// not ship an implementation for it — calls using it will fail; prefer
// CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM as a safe default.
algo = CUDNN_CONVOLUTION_FWD_ALGO_DIRECT;
void performPooling() {
// 创建池化描述符
cudnnPoolingDescriptor_t poolDesc;
cudnnCreatePoolingDescriptor(&poolDesc);
// 设置池化参数
int windowHeight = 2, windowWidth = 2;
int verticalPadding = 0, horizontalPadding = 0;
int verticalStride = 2, horizontalStride = 2;
cudnnSetPooling2dDescriptor(poolDesc,
CUDNN_POOLING_MAX, // 最大池化
CUDNN_PROPAGATE_NAN, // NaN传播方式
windowHeight, windowWidth,
verticalPadding, horizontalPadding,
verticalStride, horizontalStride);
// 执行池化
cudnnPoolingForward(handle, poolDesc, &alpha, inputDesc, d_input,
&beta, outputDesc, d_output);
cudnnDestroyPoolingDescriptor(poolDesc);
}
// ReLU activation: forward pass plus backward (gradient) pass.
// Assumes handle, alpha/beta, descriptors and device buffers come from the
// surrounding code; d_outputGrad holds dL/dy, d_inputGrad receives dL/dx.
void performActivation() {
    cudnnActivationDescriptor_t activDesc;
    cudnnCreateActivationDescriptor(&activDesc);

    // ReLU; the trailing coef argument is only used by CLIPPED_RELU / ELU.
    cudnnSetActivationDescriptor(activDesc, CUDNN_ACTIVATION_RELU,
                                 CUDNN_PROPAGATE_NAN, 0.0);

    // Forward: y = relu(x)
    cudnnActivationForward(handle, activDesc, &alpha, inputDesc, d_input,
                           &beta, outputDesc, d_output);

    // Backward: dx = relu'(x) * dy.
    // Fix: the API takes (yDesc, y, dyDesc, dy, xDesc, x, beta, dxDesc, dx);
    // the original passed the gradient tensors in the wrong positions.
    cudnnActivationBackward(handle, activDesc, &alpha,
                            outputDesc, d_output,      // y
                            outputDesc, d_outputGrad,  // dy
                            inputDesc, d_input,        // x
                            &beta,
                            inputDesc, d_inputGrad);   // dx
    cudnnDestroyActivationDescriptor(activDesc);
}
void performBatchNormalization() {
// 创建归一化描述符
cudnnBatchNormDescriptor_t bnDesc;
cudnnCreateBatchNormDescriptor(&bnDesc);
// 设置归一化模式
cudnnSetBatchNormalizationDescriptor(bnDesc, CUDNN_BATCHNORM_PER_ACTIVATION,
CUDNN_BATCHNORM_SPATIAL, 0.0, 0.0);
// 训练模式:计算均值和方差
cudnnBatchNormalizationForwardTraining(handle, bnDesc, &alpha, &beta,
inputDesc, d_input, outputDesc, d_output,
bnScaleBiasMeanVarDesc, d_bnScale,
d_bnBias, d_runningMean, d_runningVar,
epsilon, d_resultSaveMean, d_resultSaveVar);
// 推理模式:使用预计算的均值和方差
cudnnBatchNormalizationForwardInference(handle, bnDesc, &alpha, &beta,
inputDesc, d_input, outputDesc, d_output,
bnScaleBiasMeanVarDesc, d_bnScale,
d_bnBias, d_runningMean, d_runningVar, epsilon);
cudnnDestroyBatchNormDescriptor(bnDesc);
}
// LSTM forward pass using the (legacy) cuDNN RNN API.
// NOTE(review): cudnnSetRNNDescriptor (the v6 form) also expects an
// algorithm argument (e.g. CUDNN_RNN_ALGO_STANDARD) and a math-precision
// argument; the call below is missing the algo parameter — verify against
// the cudnn header in use. The whole cudnnRNNForwardTraining path is
// deprecated since cuDNN 8 (replaced by cudnnSetRNNDescriptor_v8 /
// cudnnRNNForward).
void performRNN() {
// Create the RNN descriptor.
cudnnRNNDescriptor_t rnnDesc;
cudnnCreateRNNDescriptor(&rnnDesc);
// RNN hyper-parameters.
int hiddenSize = 512;
int numLayers = 2;
cudnnDropoutDescriptor_t dropoutDesc;
// ... set up the dropout descriptor
cudnnSetRNNDescriptor(handle, rnnDesc, hiddenSize, numLayers,
dropoutDesc, CUDNN_LINEAR_INPUT, // input mode
CUDNN_BIDIRECTIONAL, // bidirectional RNN
CUDNN_LSTM, // LSTM cell
CUDNN_DATA_FLOAT);
// RNN data descriptors.
cudnnRNNDataDescriptor_t inputDesc, outputDesc;
// ... set data dimensions and layout
// Forward (training) pass.
// NOTE(review): the argument list below does not match the real
// cudnnRNNForwardTraining signature (it takes a seqLength count plus arrays
// of per-step tensor descriptors, and a reserve-space buffer in addition to
// the workspace) — treat this as illustrative pseudo-code.
cudnnRNNForwardTraining(handle, rnnDesc, seqLengthArray,
inputDesc, d_input,
hDesc, d_hx, cDesc, d_cx, // initial hidden and cell states
filterDesc, d_weights, // weights
outputDesc, d_output,
hDesc, d_hy, cDesc, d_cy, // final hidden and cell states
workspace, workspaceSize);
cudnnDestroyRNNDescriptor(rnnDesc);
}
// Multi-head attention forward pass.
// NOTE(review): the names used here (cudnnAttentionDescriptor_t,
// cudnnCreateAttentionDescriptor, cudnnSetAttentionDescriptor) are not the
// actual cuDNN API — the real type is cudnnAttnDescriptor_t with
// cudnnCreateAttnDescriptor / cudnnSetAttnDescriptor, and
// cudnnSetAttnDescriptor takes many more parameters (data/compute types,
// projection sizes, beam sizes, dropout descriptors, ...). Likewise the
// cudnnMultiHeadAttnForward call below is abbreviated pseudo-code; treat
// this block as illustrative only.
void performAttention() {
// Create the attention descriptor.
cudnnAttentionDescriptor_t attnDesc;
cudnnCreateAttentionDescriptor(&attnDesc);
// Attention hyper-parameters.
int qSize = 64, kSize = 64, vSize = 64;
int maxSeqLengthQ = 128, maxSeqLengthKV = 128;
int numHeads = 8;
cudnnSetAttentionDescriptor(attnDesc, CUDNN_ATTN_QKV_PROJ_BIASED,
CUDNN_DATA_FLOAT, qSize, kSize, vSize,
maxSeqLengthQ, maxSeqLengthKV, numHeads);
// Per-sequence length arrays (and optional local-attention windows).
int loWinOffset[128], hiWinOffset[128]; // optional attention window bounds
int qoSeqlens[128], kvSeqlens[128];
// Run the attention forward pass.
cudnnMultiHeadAttnForward(handle, attnDesc,
loWinOffset, hiWinOffset,
qoSeqlens, kvSeqlens,
/* input tensors */ /* output tensors */);
cudnnDestroyAttentionDescriptor(attnDesc);
}
// Benchmark forward-convolution algorithms and report the fastest.
// cudnnFindConvolutionForwardAlgorithm returns its results sorted by
// execution time, so perfResults[0] is the best performer.
// Assumes handle and the four descriptors come from the surrounding code.
void findBestAlgorithm() {
    // Fix: must be a compile-time constant — the original used a runtime
    // int as an array bound (a variable-length array, not legal C++).
    constexpr int kMaxAlgoCount = 10;
    int algoCount = 0;
    cudnnConvolutionFwdAlgoPerf_t perfResults[kMaxAlgoCount];
    cudnnFindConvolutionForwardAlgorithm(handle, inputDesc, filterDesc,
                                         convDesc, outputDesc, kMaxAlgoCount,
                                         &algoCount, perfResults);
    // Guard against an empty result set before touching perfResults[0].
    if (algoCount == 0) {
        fprintf(stderr, "No convolution algorithm found\n");
        return;
    }
    // Entry 0 is the fastest algorithm found.
    cudnnConvolutionFwdAlgo_t bestAlgo = perfResults[0].algo;
    size_t bestWorkspaceSize = perfResults[0].memory;
    printf("Best algorithm: %d, time: %f, memory: %zu\n",
           bestAlgo, perfResults[0].time, bestWorkspaceSize);
}
class CudnnWorkspaceManager {
private:
void* workspace_;
size_t currentSize_;
public:
CudnnWorkspaceManager() : workspace_(nullptr), currentSize_(0) {}
~CudnnWorkspaceManager() {
if (workspace_) {
cudaFree(workspace_);
}
}
void* getWorkspace(size_t requiredSize) {
if (requiredSize > currentSize_) {
// 重新分配更大的工作空间
if (workspace_) {
cudaFree(workspace_);
}
cudaMalloc(&workspace_, requiredSize);
currentSize_ = requiredSize;
}
return workspace_;
}
};
void useMixedPrecision() {
// 创建张量描述符(FP16)
cudnnTensorDescriptor_t inputDescHalf, outputDescHalf;
cudnnCreateTensorDescriptor(&inputDescHalf);
cudnnCreateTensorDescriptor(&outputDescHalf);
// 设置FP16格式
cudnnSetTensor4dDescriptor(inputDescHalf, CUDNN_TENSOR_NCHW,
CUDNN_DATA_HALF, N, C, H, W);
// 使用FP32计算(更高精度)
cudnnSetConvolutionMathType(convDesc, CUDNN_TENSOR_OP_MATH);
// 执行卷积(内部使用Tensor Cores)
cudnnConvolutionForward(handle, &alpha, inputDescHalf, d_inputHalf,
filterDesc, d_filterHalf, convDesc, algo,
workspace, workspaceSize, &beta,
outputDescHalf, d_outputHalf);
}
void optimizeBatchSize() {
// 测试不同批大小
std::vector<int> batchSizes = {1, 2, 4, 8, 16, 32, 64, 128};
float bestTime = FLT_MAX;
int bestBatchSize = 1;
for (int batchSize : batchSizes) {
// 更新张量描述符
cudnnSetTensor4dDescriptor(inputDesc, CUDNN_TENSOR_NCHW,
CUDNN_DATA_FLOAT, batchSize, C, H, W);
// 计时
auto start = std::chrono::high_resolution_clock::now();
performConvolution();
auto end = std::chrono::high_resolution_clock::now();
float time = std::chrono::duration<float>(end - start).count();
float throughput = batchSize / time;
printf("Batch size: %d, Time: %f, Throughput: %f samples/sec\n",
batchSize, time, throughput);
if (throughput > bestThroughput) {
bestThroughput = throughput;
bestBatchSize = batchSize;
}
}
printf("Best batch size: %d\n", bestBatchSize);
}
// Run a convolution split across all visible GPUs, one host thread per GPU.
// Each device gets its own cuDNN handle (handles are bound to the device
// that was current when they were created). Assumes totalBatchSize and
// performConvolutionOnGPU are defined by the surrounding code.
void multiGpuConvolution() {
    int gpuCount;
    cudaGetDeviceCount(&gpuCount);

    // One cuDNN handle per device, created with that device current.
    std::vector<cudnnHandle_t> handles(gpuCount);
    for (int dev = 0; dev < gpuCount; ++dev) {
        cudaSetDevice(dev);
        cudnnCreate(&handles[dev]);
    }

    // Even split of the batch across devices.
    const int batchSizePerGPU = totalBatchSize / gpuCount;

    // Launch one worker thread per GPU and process the shards in parallel.
    std::vector<std::thread> workers;
    for (int dev = 0; dev < gpuCount; ++dev) {
        workers.emplace_back([&, dev]() {
            cudaSetDevice(dev);  // bind this host thread to its GPU
            performConvolutionOnGPU(handles[dev], dev, batchSizePerGPU);
        });
    }

    // Wait for every shard to finish.
    for (auto& worker : workers) {
        worker.join();
    }

    // Tear down the per-device handles.
    for (int dev = 0; dev < gpuCount; ++dev) {
        cudnnDestroy(handles[dev]);
    }
}
cuDNN 8.0引入了后端图API(Backend/Graph API),并在cuDNN 9.x中得到进一步增强,用于构建和执行复杂的融合计算图:
// Build and execute a small fused graph (convolution + pointwise activation)
// via the cuDNN backend ("graph") API.
// NOTE(review): several calls here do not exist in the real backend API —
// descriptors are created typed via cudnnBackendCreateDescriptor(type, &d)
// (there is no cudnnSetBackendDescriptor), attributes are set with
// cudnnBackendSetAttribute, each descriptor must be finalized with
// cudnnBackendFinalize, and cudnnBackendExecute takes a finalized
// VARIANT_PACK descriptor rather than a raw pointer array. Treat this block
// as illustrative pseudo-code and verify every call against cudnn_backend.h.
void buildCudnnGraph() {
// Create the operation-graph descriptor.
cudnnBackendDescriptor_t graphBuilder;
cudnnCreateBackendDescriptor(&graphBuilder);
cudnnSetBackendDescriptor(graphBuilder, CUDNN_BACKEND_GRAPH_DESCRIPTOR);
// Create the operation descriptors.
cudnnBackendDescriptor_t convOp, activationOp;
// Convolution-forward operation.
cudnnCreateBackendDescriptor(&convOp);
cudnnSetBackendDescriptor(convOp, CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR);
// ... set input / output / filter parameters here
// Pointwise (activation) operation.
cudnnCreateBackendDescriptor(&activationOp);
cudnnSetBackendDescriptor(activationOp, CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR);
// ... set activation-function parameters here
// Attach the operations to the graph.
std::array<void*, 2> ops = {convOp, activationOp};
cudnnSetAttribute(graphBuilder, CUDNN_ATTR_GRAPH_OPS, CUDNN_TYPE_PTR, ops.size(), ops.data());
// Build an execution plan for the graph.
cudnnBackendDescriptor_t executePlan;
cudnnCreateBackendDescriptor(&executePlan);
cudnnSetBackendDescriptor(executePlan, CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR);
// Execute the plan with the bound tensors.
std::array<void*, 4> variantPacks = {/* input tensors, output tensors, etc. */};
cudnnBackendExecute(handle, executePlan, variantPacks.data());
// Release all backend descriptors.
cudnnDestroyBackendDescriptor(graphBuilder);
cudnnDestroyBackendDescriptor(convOp);
cudnnDestroyBackendDescriptor(activationOp);
cudnnDestroyBackendDescriptor(executePlan);
}
cuDNN 9.x集成了Flash Attention算法,用于高效的自注意力计算:
// Flash-Attention-style fused attention.
// NOTE(review): this is pseudo-code — there is no CUDN/CUDNN
// _ATTN_FLASH_ATTENTION flag, and cudnnSetAttentionDescriptor is not a real
// cuDNN entry point (see performAttention above). In current cuDNN, fused
// flash attention is exposed through the graph/backend API (scaled
// dot-product attention in the cuDNN frontend), not through attnMode flags.
void flashAttention() {
// Attention descriptor intended to select the Flash Attention path.
cudnnAttentionDescriptor_t flashAttnDesc;
cudnnCreateAttentionDescriptor(&flashAttnDesc);
// "Enable" Flash Attention (illustrative only — flag does not exist).
cudnnSetAttentionDescriptor(flashAttnDesc,
CUDNN_ATTN_QKV_PROJ_BIASED | CUDNN_ATTN_FLASH_ATTENTION,
CUDNN_DATA_HALF, qSize, kSize, vSize,
maxSeqLen, maxSeqLen, numHeads);
// Lower memory use and higher throughput than naive attention.
cudnnMultiHeadAttnForward(handle, flashAttnDesc, /*...*/);
}
// A two-convolution residual block (conv -> ReLU -> conv -> add -> ReLU)
// implemented with cuDNN primitives and a shared workspace buffer.
// NOTE(review): `d_temp` used in forward() is never declared — the block
// needs an intermediate device buffer member for the first conv's output;
// confirm where it is allocated. createDescriptors/destroyDescriptors are
// defined elsewhere. The class owns raw handles/pointers but is copyable,
// which risks double-free — consider deleting the copy operations.
class ResNetBlock {
private:
cudnnHandle_t handle_;
cudnnTensorDescriptor_t inputDesc_, outputDesc_;
cudnnTensorDescriptor_t relu1Desc_, relu2Desc_;
cudnnFilterDescriptor_t conv1Weight_, conv2Weight_;
cudnnConvolutionDescriptor_t conv1Desc_, conv2Desc_;
cudnnActivationDescriptor_t reluDesc_;
void* workspace_;        // shared scratch buffer for both convolutions
size_t workspaceSize_;   // bytes allocated in workspace_
public:
ResNetBlock(int channels, int height, int width) : workspace_(nullptr) {
cudnnCreate(&handle_);
// Create all tensor/filter/conv/activation descriptors (defined elsewhere).
createDescriptors(channels, height, width);
// Size the workspace as the max requirement of the two convolutions so a
// single buffer can serve both.
size_t ws1, ws2;
cudnnGetConvolutionForwardWorkspaceSize(handle_, inputDesc_, conv1Weight_,
conv1Desc_, relu1Desc_,
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM, &ws1);
cudnnGetConvolutionForwardWorkspaceSize(handle_, relu1Desc_, conv2Weight_,
conv2Desc_, outputDesc_,
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM, &ws2);
workspaceSize_ = std::max(ws1, ws2);
cudaMalloc(&workspace_, workspaceSize_);
}
void forward(float* d_input, float* d_output,
float* d_conv1Weight, float* d_conv2Weight) {
float alpha = 1.0f, beta = 0.0f;
// First convolution into the intermediate buffer (d_temp — see class note).
cudnnConvolutionForward(handle_, &alpha, inputDesc_, d_input,
conv1Weight_, d_conv1Weight, conv1Desc_,
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM,
workspace_, workspaceSize_, &beta, relu1Desc_, d_temp);
// First ReLU, applied in place on the intermediate buffer.
cudnnActivationForward(handle_, reluDesc_, &alpha, relu1Desc_, d_temp,
&beta, relu1Desc_, d_temp);
// Second convolution. Note beta here is &alpha (1.0), so the result is
// accumulated into d_output — presumably d_output already holds the
// shortcut branch so this implements the residual add; TODO confirm the
// caller pre-loads d_output with the skip-connection input.
cudnnConvolutionForward(handle_, &alpha, relu1Desc_, d_temp,
conv2Weight_, d_conv2Weight, conv2Desc_,
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM,
workspace_, workspaceSize_, &alpha, outputDesc_, d_output);
// Final ReLU, in place on the output.
cudnnActivationForward(handle_, reluDesc_, &alpha, outputDesc_, d_output,
&beta, outputDesc_, d_output);
}
~ResNetBlock() {
cudaFree(workspace_);
destroyDescriptors();
cudnnDestroy(handle_);
}
};
// One transformer encoder layer built on cuDNN multi-head attention.
// NOTE(review): cudnnAttentionDescriptor_t / cudnnSetAttentionDescriptor are
// not the real cuDNN names (the actual API is cudnnAttnDescriptor_t with
// cudnnSetAttnDescriptor, which takes many more parameters), and the
// cudnnMultiHeadAttnForward call below omits most of the real signature
// (device sequence-length arrays, per-tensor descriptors, weights,
// workspace/reserve buffers). Treat the attention calls as pseudo-code.
// createTensorDescriptors/destroyTensorDescriptors are defined elsewhere.
class BERTLayer {
private:
cudnnHandle_t handle_;
cudnnAttentionDescriptor_t attnDesc_;
cudnnTensorDescriptor_t qDesc_, kDesc_, vDesc_;
public:
BERTLayer(int hiddenSize, int numHeads, int seqLength) {
cudnnCreate(&handle_);
// Multi-head attention descriptor (FP16 Q/K/V of width hiddenSize).
cudnnCreateAttentionDescriptor(&attnDesc_);
cudnnSetAttentionDescriptor(attnDesc_, CUDNN_ATTN_QKV_PROJ_BIASED,
CUDNN_DATA_HALF, hiddenSize, hiddenSize, hiddenSize,
seqLength, seqLength, numHeads);
// Q/K/V tensor descriptors (defined elsewhere).
createTensorDescriptors(hiddenSize, numHeads, seqLength);
}
void multiHeadAttention(half* d_q, half* d_k, half* d_v,
half* d_attnOutput) {
float alpha = 1.0f, beta = 0.0f;
// Self-attention forward pass (abbreviated argument list — see class note).
cudnnMultiHeadAttnForward(handle_, attnDesc_, nullptr, nullptr,
nullptr, nullptr,
/* Q */ d_q, /* K */ d_k, /* V */ d_v,
nullptr, nullptr, d_attnOutput);
// Optional: residual connection and layer normalization would follow here.
// ...
}
void feedForward(half* d_input, half* d_output) {
// Feed-forward sub-layer: Linear -> GELU -> Linear.
// Could be implemented with cuDNN pointwise operations (not shown).
}
~BERTLayer() {
cudnnDestroyAttentionDescriptor(attnDesc_);
destroyTensorDescriptors();
cudnnDestroy(handle_);
}
};
void profileCudnnOperations() {
// 启用profiling
cudnnProfilerStart();
// 执行操作
performConvolution();
// 停止profiling
cudnnProfilerStop();
// 输出profiling信息
cudnnPrintProfilerState();
}
# 使用Nsight Systems分析
nsys profile ./my_cudnn_app
# 使用Nsight Compute分析kernel(CLI 现名为 ncu,旧版本中曾叫 nv-nsight-cu-cli)
ncu ./my_cudnn_app
// Measures the average wall-clock time (in seconds) of one invocation of a
// callable, after a warmup phase. Device synchronization brackets the timed
// region so asynchronous GPU work is included in the measurement.
class CudnnBenchmarker {
public:
    template<typename Func>
    float benchmark(Func func, int warmup = 10, int iterations = 100) {
        // Warm up caches / algorithm selection before timing.
        for (int i = 0; i < warmup; ++i) {
            func();
        }
        cudaDeviceSynchronize();

        // Timed region.
        const auto begin = std::chrono::high_resolution_clock::now();
        for (int i = 0; i < iterations; ++i) {
            func();
        }
        cudaDeviceSynchronize();
        const auto finish = std::chrono::high_resolution_clock::now();

        const float totalSeconds =
            std::chrono::duration<float>(finish - begin).count();
        return totalSeconds / iterations;
    }
};
// Abort-on-error wrapper for cuDNN calls: checks the returned status and
// prints file/line plus the human-readable message before exiting.
// Wrapped in do { } while (0) so it behaves as a single statement after
// `if`/`else` without braces. (No comments inside the macro body — they
// would break the backslash line continuations.)
#define CUDNN_CHECK(call) \
do { \
cudnnStatus_t status = call; \
if (status != CUDNN_STATUS_SUCCESS) { \
const char* msg = cudnnGetErrorString(status); \
fprintf(stderr, "cuDNN error: %s:%d - %s\n", \
__FILE__, __LINE__, msg); \
exit(EXIT_FAILURE); \
} \
} while (0)
// Usage examples:
CUDNN_CHECK(cudnnCreate(&handle));
CUDNN_CHECK(cudnnConvolutionForward(handle, &alpha, /* ... */));
// Print a tensor's dimensions and a few leading values for debugging.
// NOTE(review): assumes `data` points to device memory holding floats; if
// the descriptor's dataType is not CUDNN_DATA_FLOAT the printed values will
// be garbage — extend the copy/print logic per data type if needed.
void debugTensor(cudnnTensorDescriptor_t desc, void* data, const char* name) {
    int nbDims, dims[8], strides[8];
    cudnnDataType_t dataType;
    cudnnGetTensorNdDescriptor(desc, 8, &dataType, &nbDims, dims, strides);

    printf("Tensor %s:\n", name);
    printf("  Dimensions: [");
    for (int i = 0; i < nbDims; i++) {
        printf("%d", dims[i]);
        if (i < nbDims - 1) printf(", ");
    }
    printf("]\n");

    // Copy a small prefix of the data to the host.
    // Fix: the original copied dims[0] elements but unconditionally printed
    // five, reading past the host buffer whenever dims[0] < 5.
    int count = dims[0] < 5 ? dims[0] : 5;
    if (count <= 0) return;
    std::vector<float> hostData(count);
    cudaMemcpy(hostData.data(), data, count * sizeof(float),
               cudaMemcpyDeviceToHost);
    printf("  First %d values: [", count);
    for (int i = 0; i < count; i++) {
        printf("%.4f%s", hostData[i], i < count - 1 ? ", " : "");
    }
    printf("]\n");
}
class CudnnMemoryPool {
private:
std::unordered_map<size_t, std::vector<void*>> pool_;
public:
void* allocate(size_t size) {
auto it = pool_.find(size);
if (it != pool_.end() && !it->second.empty()) {
void* ptr = it->second.back();
it->second.pop_back();
return ptr;
}
void* ptr;
cudaMalloc(&ptr, size);
return ptr;
}
void deallocate(void* ptr, size_t size) {
pool_[size].push_back(ptr);
}
~CudnnMemoryPool() {
for (auto& pair : pool_) {
for (void* ptr : pair.second) {
cudaFree(ptr);
}
}
}
};
// Heuristic forward-convolution algorithm selection based on filter and
// input geometry. For a measured answer prefer
// cudnnFindConvolutionForwardAlgorithm instead of these heuristics.
class AdaptiveAlgorithmSelector {
public:
    cudnnConvolutionFwdAlgo_t selectAlgorithm(
        cudnnTensorDescriptor_t inputDesc,
        cudnnFilterDescriptor_t filterDesc,
        cudnnConvolutionDescriptor_t convDesc,
        cudnnTensorDescriptor_t outputDesc) {
        // Query input geometry. Fix: the getters write through every out
        // parameter, so real variables must be supplied — the original
        // passed nullptr for several of them, which is invalid.
        cudnnDataType_t dataType;
        int n, c, h, w;
        int nStride, cStride, hStride, wStride;
        cudnnGetTensor4dDescriptor(inputDesc, &dataType, &n, &c, &h, &w,
                                   &nStride, &cStride, &hStride, &wStride);

        // Query filter geometry: k output channels, c_in input channels,
        // h_k x w_k spatial extent.
        cudnnDataType_t filterType;
        cudnnTensorFormat_t filterFormat;
        int k, c_in, h_k, w_k;
        cudnnGetFilter4dDescriptor(filterDesc, &filterType, &filterFormat,
                                   &k, &c_in, &h_k, &w_k);

        if (h_k == 1 && w_k == 1) {
            // 1x1 convolution: effectively a GEMM.
            return CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
        } else if (h_k == 3 && w_k == 3 && h >= 32 && w >= 32) {
            // 3x3 filter on a reasonably large feature map: Winograd.
            return CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD;
        } else if (h_k >= 7 && w_k >= 7) {
            // Fix: FFT pays off for large *spatial* filters; the original
            // keyed this branch on k, the output-channel count.
            return CUDNN_CONVOLUTION_FWD_ALGO_FFT;
        } else {
            // Safe general-purpose default. Fix: ALGO_DIRECT has no cuDNN
            // implementation and would fail at run time.
            return CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
        }
    }
};
cuDNN作为深度学习GPU加速的核心库,提供了以下关键价值:
通过合理使用cuDNN的API和优化技术,可以显著提升深度学习应用的训练和推理性能。随着深度学习模型的不断发展,cuDNN将继续在GPU加速计算领域发挥重要作用。
发表评论
请登录后发表评论
评论 (0)