cuBLAS (CUDA Basic Linear Algebra Subprograms) is NVIDIA's implementation of the BLAS (Basic Linear Algebra Subprograms) standard on the CUDA architecture, providing high-performance vector and matrix routines on the GPU. This article takes a close look at cuBLAS's core functionality, API usage, performance-optimization strategies, and practical application examples.
cuBLAS is NVIDIA's GPU-accelerated linear algebra library and implements the standard BLAS interface. By exploiting the massive parallelism of the GPU, it accelerates scientific computing, machine learning, and deep learning workloads.
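As a quick orientation, here is a minimal end-to-end sketch (error handling omitted) that scales a device vector with the Level-1 routine cublasSscal: allocate, copy in, compute, copy out.

#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <cstdio>

int main() {
    const int n = 4;
    float h_x[n] = {1.0f, 2.0f, 3.0f, 4.0f};
    float* d_x;
    cudaMalloc(&d_x, n * sizeof(float));
    cudaMemcpy(d_x, h_x, n * sizeof(float), cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    cublasCreate(&handle);
    float alpha = 2.0f;
    cublasSscal(handle, n, &alpha, d_x, 1);  // x = 2 * x

    cudaMemcpy(h_x, d_x, n * sizeof(float), cudaMemcpyDeviceToHost);
    printf("%f %f %f %f\n", h_x[0], h_x[1], h_x[2], h_x[3]);

    cublasDestroy(handle);
    cudaFree(d_x);
    return 0;
}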
// Core architecture of cuBLAS
/*
  CPU (Host)                       GPU (Device)
  ├── cuBLAS API                   ├── cuBLAS kernels
  │   ├── Create handle            │   ├── Level 1: vector ops
  │   ├── Set parameters           │   ├── Level 2: matrix-vector ops
  │   ├── Launch computation       │   └── Level 3: matrix-matrix ops
  │   └── Fetch results            └── Memory hierarchy
  └── Memory transfers                 ├── Global memory
      ├── Host -> Device               ├── Shared memory
      └── Device -> Host               └── Registers
*/
// Query the cuBLAS version through an existing handle
#include <cublas_v2.h>
#include <cstdio>

void printCublasVersion(cublasHandle_t handle) {
    int version = 0;
    cublasGetVersion(handle, &version);
    printf("cuBLAS version: %d\n", version);
}
#include <cublas_v2.h>

class CublasHandle {
private:
    cublasHandle_t handle_;
    bool is_initialized_;
public:
    CublasHandle() : is_initialized_(false) {
        cublasStatus_t status = cublasCreate(&handle_);
        if (status == CUBLAS_STATUS_SUCCESS) {
            is_initialized_ = true;
        }
    }
    ~CublasHandle() {
        if (is_initialized_) {
            cublasDestroy(handle_);
        }
    }
    cublasHandle_t get() const { return handle_; }
    operator cublasHandle_t() const { return handle_; }
};
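A minimal usage sketch of the RAII wrapper above; d_x and d_y are assumed to be device buffers allocated elsewhere. The implicit conversion operator lets the wrapper be passed directly to cuBLAS calls.

void raiiHandleExample(const float* d_x, float* d_y, int n) {
    CublasHandle handle;  // created here, destroyed automatically on scope exit
    float alpha = 1.0f;
    cublasSaxpy(handle, n, &alpha, d_x, 1, d_y, 1);  // implicit conversion
}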
void configureCublasHandle(cublasHandle_t handle, cudaStream_t stream = 0) {
    // Attach a CUDA stream
    if (stream != 0) {
        cublasSetStream(handle, stream);
    }
    // Math mode: allow Tensor Core code paths where available
    cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH);
    // Pointer mode controls where alpha/beta and scalar results live.
    // CUBLAS_POINTER_MODE_HOST (the default) is what the examples below
    // assume; CUBLAS_POINTER_MODE_DEVICE expects scalars in device memory.
    cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
    // Query the current atomics mode
    cublasAtomicsMode_t atomicsMode;
    cublasGetAtomicsMode(handle, &atomicsMode);
}
void vectorAdd(cublasHandle_t handle,
               const float* x, const float* y, float* result,
               int n, float alpha = 1.0f) {
    // result = alpha * x + y (SAXPY has no beta; copy y first, then accumulate)
    cublasScopy(handle, n, y, 1, result, 1);
    cublasStatus_t status = cublasSaxpy(handle, n, &alpha, x, 1, result, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        printf("Vector addition failed: %d\n", status);
    }
}
// Extended version: out-of-place vector addition z = x + y
void customVectorAdd(cublasHandle_t handle,
                     const float* x, const float* y, float* z,
                     int n) {
    const float one = 1.0f;
    // z = x, then z = 1.0 * y + z
    cublasScopy(handle, n, x, 1, z, 1);
    cublasSaxpy(handle, n, &one, y, 1, z, 1);
}
float vectorDot(cublasHandle_t handle,
                const float* x, const float* y, int n) {
    float result = 0.0f;
    // With the default CUBLAS_POINTER_MODE_HOST, the scalar result is
    // written to a host pointer and the call blocks until it is ready
    cublasStatus_t status = cublasSdot(handle, n, x, 1, y, 1, &result);
    if (status != CUBLAS_STATUS_SUCCESS) {
        printf("Dot product failed: %d\n", status);
    }
    return result;
}
// Batched vector dot products (one synchronizing call per pair; see the
// strided batched GEMM sketch below for a device-side alternative)
void batchVectorDot(cublasHandle_t handle,
                    const float* x, const float* y, float* results,
                    int n, int batchSize) {
    for (int i = 0; i < batchSize; i++) {
        const float* x_i = x + i * n;
        const float* y_i = y + i * n;
        results[i] = vectorDot(handle, x_i, y_i, n);
    }
}
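Because each vectorDot call above round-trips a scalar to the host, the whole batch can instead be expressed as a strided batched GEMM with m = n = 1 (the strided batched API is covered later in this article). A hedged sketch, assuming x, y, and d_results are device pointers:

void batchVectorDotGemm(cublasHandle_t handle,
                        const float* x, const float* y, float* d_results,
                        int n, int batchSize) {
    float alpha = 1.0f, beta = 0.0f;
    // Each 1x1 "matrix product" (x_i)^T * y_i is one dot product
    cublasSgemmStridedBatched(handle,
                              CUBLAS_OP_T, CUBLAS_OP_N,
                              1, 1, n,
                              &alpha,
                              x, n, (long long)n,
                              y, n, (long long)n,
                              &beta,
                              d_results, 1, 1LL,
                              batchSize);
}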
float vectorNorm(cublasHandle_t handle, const float* x, int n) {
    float result = 0.0f;
    // cublasSnrm2 computes the Euclidean (L2) norm; for the L1 norm
    // (sum of absolute values) use cublasSasum instead
    cublasStatus_t status = cublasSnrm2(handle, n, x, 1, &result);
    if (status != CUBLAS_STATUS_SUCCESS) {
        printf("Norm computation failed: %d\n", status);
    }
    return result;
}
// Index of the element with the largest absolute value
int vectorAmax(cublasHandle_t handle, const float* x, int n) {
    int result = 0;
    cublasIsamax(handle, n, x, 1, &result);
    return result - 1;  // cuBLAS returns 1-based indices
}
void matrixVectorMultiply(cublasHandle_t handle,
                          const float* A, const float* x, float* y,
                          int m, int n, float alpha = 1.0f, float beta = 0.0f) {
    // y = alpha * A * x + beta * y, with A stored column-major (m x n, lda = m)
    cublasOperation_t trans = CUBLAS_OP_N;  // no transpose
    cublasStatus_t status = cublasSgemv(handle, trans, m, n,
                                        &alpha, A, m, x, 1,
                                        &beta, y, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        printf("Matrix-vector multiplication failed: %d\n", status);
    }
}
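Like all BLAS libraries, cuBLAS assumes column-major storage. If your data is row-major (as in C/C++ arrays), a row-major m x n matrix is, byte for byte, a column-major n x m matrix, so you can ask for the transpose instead of copying. A small sketch under that assumption:

// y = A_rowmajor * x, where A is row-major m x n (so lda = n)
void rowMajorGemv(cublasHandle_t handle, const float* A, const float* x,
                  float* y, int m, int n) {
    float alpha = 1.0f, beta = 0.0f;
    // The row-major matrix is a column-major n x m matrix; transpose it back
    cublasSgemv(handle, CUBLAS_OP_T, n, m, &alpha, A, n, x, 1, &beta, y, 1);
}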
// Transposed matrix-vector multiplication: y = A^T * x
// (A is m x n; x has length m, y has length n)
void transposedMatrixVectorMultiply(cublasHandle_t handle,
                                    const float* A, const float* x, float* y,
                                    int m, int n) {
    float alpha = 1.0f, beta = 0.0f;
    cublasStatus_t status = cublasSgemv(handle, CUBLAS_OP_T, m, n,
                                        &alpha, A, m, x, 1,
                                        &beta, y, 1);
    if (status != CUBLAS_STATUS_SUCCESS) {
        printf("Transposed matrix-vector multiplication failed: %d\n", status);
    }
}
void triangularMatrixVectorMultiply(cublasHandle_t handle,
                                    const float* A, float* x,
                                    int n, bool upper = true,
                                    bool unitDiag = false) {
    cublasFillMode_t uplo = upper ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER;
    cublasDiagType_t diag = unitDiag ? CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT;
    // x = A * x, where A is triangular (computed in place)
    cublasStrmv(handle, uplo, CUBLAS_OP_N, diag, n, A, n, x, 1);
}
void symmetricMatrixVectorMultiply(cublasHandle_t handle,
                                   const float* A, const float* x, float* y,
                                   int n, float alpha = 1.0f, float beta = 0.0f) {
    // y = alpha * A * x + beta * y, where A is symmetric and only its
    // upper triangle is referenced
    cublasFillMode_t uplo = CUBLAS_FILL_MODE_UPPER;
    cublasSsymv(handle, uplo, n, &alpha, A, n, x, 1, &beta, y, 1);
}
void matrixMultiply(cublasHandle_t handle,
                    const float* A, const float* B, float* C,
                    int m, int n, int k,
                    float alpha = 1.0f, float beta = 0.0f) {
    // C = alpha * A * B + beta * C (all matrices column-major)
    cublasOperation_t transA = CUBLAS_OP_N;
    cublasOperation_t transB = CUBLAS_OP_N;
    cublasStatus_t status = cublasSgemm(handle, transA, transB,
                                        m, n, k,
                                        &alpha, A, m, B, k,
                                        &beta, C, m);
    if (status != CUBLAS_STATUS_SUCCESS) {
        printf("Matrix multiplication failed: %d\n", status);
    }
}
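The same column-major caveat applies to GEMM. For row-major data there is a standard identity: a row-major M x N matrix is a column-major N x M matrix, so computing C^T = B^T * A^T in column-major terms yields the row-major C directly, with no transposes or copies. A sketch:

// C = A * B with A, B, C all row-major (A: m x k, B: k x n, C: m x n)
void rowMajorMatrixMultiply(cublasHandle_t handle,
                            const float* A, const float* B, float* C,
                            int m, int n, int k) {
    float alpha = 1.0f, beta = 0.0f;
    // Column-major view: (n x k) * (k x m) = (n x m), which is row-major C
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                n, m, k,
                &alpha, B, n, A, k,
                &beta, C, n);
}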
// Batched matrix multiplication; A_array, B_array, and C_array are arrays
// of device pointers and must themselves reside in device memory
void batchedMatrixMultiply(cublasHandle_t handle,
                           const float** A_array, const float** B_array, float** C_array,
                           int m, int n, int k, int batchSize) {
    float alpha = 1.0f, beta = 0.0f;
    cublasStatus_t status = cublasSgemmBatched(handle,
                                               CUBLAS_OP_N, CUBLAS_OP_N,
                                               m, n, k,
                                               &alpha,
                                               A_array, m,
                                               B_array, k,
                                               &beta,
                                               C_array, m,
                                               batchSize);
    if (status != CUBLAS_STATUS_SUCCESS) {
        printf("Batched GEMM failed: %d\n", status);
    }
}
// Strided batched matrix multiplication (contiguous memory layout,
// usually more efficient than the pointer-array variant)
void stridedBatchedMatrixMultiply(cublasHandle_t handle,
                                  const float* A, const float* B, float* C,
                                  int m, int n, int k, int batchSize,
                                  long long strideA, long long strideB, long long strideC) {
    float alpha = 1.0f, beta = 0.0f;
    cublasStatus_t status = cublasSgemmStridedBatched(handle,
                                                      CUBLAS_OP_N, CUBLAS_OP_N,
                                                      m, n, k,
                                                      &alpha,
                                                      A, m, strideA,
                                                      B, k, strideB,
                                                      &beta,
                                                      C, m, strideC,
                                                      batchSize);
    if (status != CUBLAS_STATUS_SUCCESS) {
        printf("Strided batched GEMM failed: %d\n", status);
    }
}
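If the batchSize matrices are packed back-to-back in memory, the strides are simply the per-matrix element counts. A hypothetical call, assuming device buffers d_A, d_B, and d_C laid out that way:

// A: [batchSize][m*k], B: [batchSize][k*n], C: [batchSize][m*n], contiguous
stridedBatchedMatrixMultiply(handle, d_A, d_B, d_C,
                             m, n, k, batchSize,
                             (long long)m * k,
                             (long long)k * n,
                             (long long)m * n);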
void symmetricMatrixMultiply(cublasHandle_t handle,
                             const float* A, const float* B, float* C,
                             int m, int n, bool leftSide = true) {
    float alpha = 1.0f, beta = 0.0f;
    cublasSideMode_t side = leftSide ? CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT;
    cublasFillMode_t uplo = CUBLAS_FILL_MODE_UPPER;
    // A is m x m (left side) or n x n (right side), so lda follows the side
    int lda = leftSide ? m : n;
    // C = alpha * A * B + beta * C  or  C = alpha * B * A + beta * C
    cublasStatus_t status = cublasSsymm(handle, side, uplo, m, n,
                                        &alpha, A, lda, B, m,
                                        &beta, C, m);
    if (status != CUBLAS_STATUS_SUCCESS) {
        printf("Symmetric matrix multiplication failed: %d\n", status);
    }
}
void triangularMatrixMultiply(cublasHandle_t handle,
                              const float* A, float* B,
                              int m, int n, bool leftSide = true,
                              bool upper = true) {
    float alpha = 1.0f;
    cublasSideMode_t side = leftSide ? CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT;
    cublasFillMode_t uplo = upper ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER;
    cublasOperation_t trans = CUBLAS_OP_N;
    cublasDiagType_t diag = CUBLAS_DIAG_NON_UNIT;
    int lda = leftSide ? m : n;
    // B = alpha * op(A) * B  or  B = alpha * B * op(A).
    // The v2 API takes a separate output matrix; passing B as both input
    // and output performs the operation in place
    cublasStrmm(handle, side, uplo, trans, diag,
                m, n, &alpha, A, lda, B, m, B, m);
}
void triangularMatrixSolve(cublasHandle_t handle,
                           const float* A, float* B,
                           int m, int n, bool leftSide = true,
                           bool upper = true) {
    float alpha = 1.0f;
    cublasSideMode_t side = leftSide ? CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT;
    cublasFillMode_t uplo = upper ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER;
    cublasOperation_t trans = CUBLAS_OP_N;
    cublasDiagType_t diag = CUBLAS_DIAG_NON_UNIT;
    int lda = leftSide ? m : n;
    // Solve op(A) * X = alpha * B  or  X * op(A) = alpha * B; X overwrites B
    cublasStrsm(handle, side, uplo, trans, diag,
                m, n, &alpha, A, lda, B, m);
}
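A common use of TRSM is solving A * X = B after a Cholesky factorization A = L * L^T (the factorization itself comes from cuSOLVER's potrf, not cuBLAS). A hedged sketch, assuming L already holds the lower-triangular Cholesky factor:

// Forward solve L * Y = B, then backward solve L^T * X = Y; X overwrites B
void choleskySolve(cublasHandle_t handle, const float* L, float* B,
                   int n, int nrhs) {
    float alpha = 1.0f;
    cublasStrsm(handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER,
                CUBLAS_OP_N, CUBLAS_DIAG_NON_UNIT,
                n, nrhs, &alpha, L, n, B, n);
    cublasStrsm(handle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_LOWER,
                CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT,
                n, nrhs, &alpha, L, n, B, n);
}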
void findBestGemmAlgorithm(cublasHandle_t handle,
                           int m, int n, int k) {
    // Allocate test matrices
    float *A, *B, *C;
    cudaMalloc(&A, m * k * sizeof(float));
    cudaMalloc(&B, k * n * sizeof(float));
    cudaMalloc(&C, m * n * sizeof(float));
    // Candidate algorithms (a representative subset; CUBLAS_GEMM_ALGO0
    // through CUBLAS_GEMM_ALGO23 are available)
    cublasGemmAlgo_t algorithms[] = {
        CUBLAS_GEMM_DEFAULT,
        CUBLAS_GEMM_DEFAULT_TENSOR_OP,
        CUBLAS_GEMM_ALGO0, CUBLAS_GEMM_ALGO1, CUBLAS_GEMM_ALGO2,
        CUBLAS_GEMM_ALGO3, CUBLAS_GEMM_ALGO23
    };
    int algoCount = sizeof(algorithms) / sizeof(algorithms[0]);
    float bestTime = FLT_MAX;
    cublasGemmAlgo_t bestAlgo = CUBLAS_GEMM_DEFAULT;
    for (int i = 0; i < algoCount; i++) {
        float alpha = 1.0f, beta = 0.0f;
        // Drain pending work before timing
        cudaDeviceSynchronize();
        auto start = std::chrono::high_resolution_clock::now();
        cublasStatus_t status = cublasGemmEx(handle,
                                             CUBLAS_OP_N, CUBLAS_OP_N,
                                             m, n, k,
                                             &alpha, A, CUDA_R_32F, m,
                                             B, CUDA_R_32F, k,
                                             &beta, C, CUDA_R_32F, m,
                                             CUBLAS_COMPUTE_32F,
                                             algorithms[i]);
        cudaDeviceSynchronize();
        auto end = std::chrono::high_resolution_clock::now();
        if (status == CUBLAS_STATUS_SUCCESS) {
            float time = std::chrono::duration<float>(end - start).count();
            if (time < bestTime) {
                bestTime = time;
                bestAlgo = algorithms[i];
            }
        }
    }
    printf("Best algorithm: %d, Time: %f ms\n", (int)bestAlgo, bestTime * 1000);
    cudaFree(A); cudaFree(B); cudaFree(C);
}
class CublasMemoryPool {
private:
    std::unordered_map<size_t, std::vector<void*>> pool_;
    std::mutex mutex_;
public:
    void* allocate(size_t size) {
        std::lock_guard<std::mutex> lock(mutex_);
        auto it = pool_.find(size);
        if (it != pool_.end() && !it->second.empty()) {
            void* ptr = it->second.back();
            it->second.pop_back();
            return ptr;
        }
        void* ptr;
        cudaMalloc(&ptr, size);
        return ptr;
    }
    void deallocate(void* ptr, size_t size) {
        std::lock_guard<std::mutex> lock(mutex_);
        pool_[size].push_back(ptr);
    }
    ~CublasMemoryPool() {
        for (auto& pair : pool_) {
            for (void* ptr : pair.second) {
                cudaFree(ptr);
            }
        }
    }
};
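A hypothetical usage sketch of the pool: buffers returned via deallocate are cached per size for reuse, avoiding repeated cudaMalloc/cudaFree in hot loops.

void poolExample(int m, int n) {
    CublasMemoryPool pool;
    size_t bytes = (size_t)m * n * sizeof(float);
    float* d_C = static_cast<float*>(pool.allocate(bytes));
    // ... use d_C in cuBLAS calls ...
    pool.deallocate(d_C, bytes);  // cached for reuse; freed in the destructor
}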
void optimizedMatrixMultiply(cublasHandle_t handle,
                             const float* A, const float* B, float* C,
                             int m, int n, int k) {
    // Use pitched allocations so each column starts at an aligned address.
    // cuBLAS is column-major, so a matrix with r rows is allocated as
    // "width = r floats, height = number of columns"
    size_t pitchA, pitchB, pitchC;
    float *d_A, *d_B, *d_C;
    cudaMallocPitch(&d_A, &pitchA, m * sizeof(float), k);  // A: m x k
    cudaMallocPitch(&d_B, &pitchB, k * sizeof(float), n);  // B: k x n
    cudaMallocPitch(&d_C, &pitchC, m * sizeof(float), n);  // C: m x n
    // Copy tightly packed column-major host data onto the padded layout
    cudaMemcpy2D(d_A, pitchA, A, m * sizeof(float), m * sizeof(float), k,
                 cudaMemcpyHostToDevice);
    cudaMemcpy2D(d_B, pitchB, B, k * sizeof(float), k * sizeof(float), n,
                 cudaMemcpyHostToDevice);
    // The pitch (in elements) becomes the leading dimension
    float alpha = 1.0f, beta = 0.0f;
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                m, n, k, &alpha,
                d_A, pitchA / sizeof(float),
                d_B, pitchB / sizeof(float),
                &beta, d_C, pitchC / sizeof(float));
    // Copy the result back, dropping the padding
    cudaMemcpy2D(C, m * sizeof(float), d_C, pitchC, m * sizeof(float), n,
                 cudaMemcpyDeviceToHost);
    cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
}
void streamParallelMatrixMultiply(const float* A, const float* B, float* C,
                                  int m, int n, int k, int numStreams) {
    std::vector<cublasHandle_t> handles(numStreams);
    std::vector<cudaStream_t> streams(numStreams);
    // Create handles and streams
    for (int i = 0; i < numStreams; i++) {
        cublasCreate(&handles[i]);
        cudaStreamCreate(&streams[i]);
        cublasSetStream(handles[i], streams[i]);
    }
    // Partition the rows of A (and C) across streams; with column-major
    // storage a row block starts at offset startRow and keeps lda = m
    int rowsPerStream = m / numStreams;
    for (int i = 0; i < numStreams; i++) {
        int startRow = i * rowsPerStream;
        int endRow = (i == numStreams - 1) ? m : (i + 1) * rowsPerStream;
        int currentRows = endRow - startRow;
        const float* A_sub = A + startRow;
        float* C_sub = C + startRow;
        // Issue each block GEMM on its own stream
        float alpha = 1.0f, beta = 0.0f;
        cublasSgemm(handles[i], CUBLAS_OP_N, CUBLAS_OP_N,
                    currentRows, n, k, &alpha,
                    A_sub, m, B, k, &beta, C_sub, m);
    }
    // Synchronize all streams and clean up
    for (int i = 0; i < numStreams; i++) {
        cudaStreamSynchronize(streams[i]);
        cublasDestroy(handles[i]);
        cudaStreamDestroy(streams[i]);
    }
}
#include <cuda_fp16.h>

// cuBLAS has no float<->half conversion routine, so small helper kernels
// perform the casts
__global__ void floatToHalf(const float* in, half* out, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) out[i] = __float2half(in[i]);
}

__global__ void halfToFloat(const half* in, float* out, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) out[i] = __half2float(in[i]);
}

void mixedPrecisionMatrixMultiply(cublasHandle_t handle,
                                  const float* A, const float* B, float* C,
                                  int m, int n, int k) {
    // Allocate FP16 buffers
    half *d_A_fp16, *d_B_fp16, *d_C_fp16;
    cudaMalloc(&d_A_fp16, m * k * sizeof(half));
    cudaMalloc(&d_B_fp16, k * n * sizeof(half));
    cudaMalloc(&d_C_fp16, m * n * sizeof(half));
    // Convert the (device-resident) inputs to FP16
    int threads = 256;
    floatToHalf<<<(m * k + threads - 1) / threads, threads>>>(A, d_A_fp16, m * k);
    floatToHalf<<<(k * n + threads - 1) / threads, threads>>>(B, d_B_fp16, k * n);
    // FP16 GEMM with FP32 accumulation, eligible for Tensor Cores
    float alpha = 1.0f, beta = 0.0f;
    cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                 m, n, k, &alpha,
                 d_A_fp16, CUDA_R_16F, m,
                 d_B_fp16, CUDA_R_16F, k,
                 &beta, d_C_fp16, CUDA_R_16F, m,
                 CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
    // Convert the result back to FP32
    halfToFloat<<<(m * n + threads - 1) / threads, threads>>>(d_C_fp16, C, m * n);
    cudaFree(d_A_fp16); cudaFree(d_B_fp16); cudaFree(d_C_fp16);
}
// Grouped GEMM: one GEMM per group, where sizes may differ between groups.
// (Recent cuBLAS releases also offer cublasSgemmGroupedBatched for a single
// grouped call; the loop below is the portable formulation.)
void groupedGemm(cublasHandle_t handle,
                 const float* const A[], const float* const B[], float* const C[],
                 const int m[], const int n[], const int k[],
                 int groupCount) {
    float alpha = 1.0f, beta = 0.0f;
    for (int i = 0; i < groupCount; i++) {
        // Leading dimensions for column-major, non-transposed operands
        cublasStatus_t status = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                                            m[i], n[i], k[i], &alpha,
                                            A[i], m[i], B[i], k[i],
                                            &beta, C[i], m[i]);
        if (status != CUBLAS_STATUS_SUCCESS) {
            printf("Grouped GEMM failed at group %d: %d\n", i, status);
        }
    }
}
void cublasLtMatrixMultiply(cublasLtHandle_t ltHandle,
                            const float* A, const float* B, float* C,
                            int m, int n, int k) {
    cublasLtMatmulDesc_t matmulDesc;
    cublasLtMatrixLayout_t Adesc, Bdesc, Cdesc;
    // Matrix layout descriptors (column-major)
    cublasLtMatrixLayoutCreate(&Adesc, CUDA_R_32F, m, k, m);
    cublasLtMatrixLayoutCreate(&Bdesc, CUDA_R_32F, k, n, k);
    cublasLtMatrixLayoutCreate(&Cdesc, CUDA_R_32F, m, n, m);
    // Matmul descriptor (attributes need an addressable variable)
    cublasLtMatmulDescCreate(&matmulDesc, CUBLAS_COMPUTE_32F, CUDA_R_32F);
    cublasOperation_t opN = CUBLAS_OP_N;
    cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_TRANSA,
                                   &opN, sizeof(opN));
    cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB,
                                   &opN, sizeof(opN));
    // Ask the heuristic for the best algorithm within a workspace budget
    size_t workspaceSize = 4 * 1024 * 1024;
    void* workspace;
    cudaMalloc(&workspace, workspaceSize);
    cublasLtMatmulPreference_t preference;
    cublasLtMatmulPreferenceCreate(&preference);
    cublasLtMatmulPreferenceSetAttribute(preference,
        CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES,
        &workspaceSize, sizeof(workspaceSize));
    cublasLtMatmulHeuristicResult_t heuristic;
    int returnedResults = 0;
    cublasLtMatmulAlgoGetHeuristic(ltHandle, matmulDesc, Adesc, Bdesc, Cdesc, Cdesc,
                                   preference, 1, &heuristic, &returnedResults);
    // Run the matmul with the chosen algorithm
    float alpha = 1.0f, beta = 0.0f;
    cublasLtMatmul(ltHandle, matmulDesc,
                   &alpha, A, Adesc, B, Bdesc, &beta,
                   C, Cdesc, C, Cdesc,
                   returnedResults > 0 ? &heuristic.algo : nullptr,
                   workspace, workspaceSize, 0);
    // Cleanup
    cublasLtMatmulPreferenceDestroy(preference);
    cudaFree(workspace);
    cublasLtMatmulDescDestroy(matmulDesc);
    cublasLtMatrixLayoutDestroy(Adesc);
    cublasLtMatrixLayoutDestroy(Bdesc);
    cublasLtMatrixLayoutDestroy(Cdesc);
}
void integrateWithCudaGraphs(const float* d_A, const float* d_B, float* d_C,
                             int m, int n, int k, int iterations) {
    cudaGraph_t graph;
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    // Create the handle before capture begins (handle creation allocates
    // resources and cannot itself be captured)
    cublasHandle_t handle;
    cublasCreate(&handle);
    cublasSetStream(handle, stream);
    // Begin capture: cuBLAS calls issued on this stream are recorded
    cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
    float alpha = 1.0f, beta = 0.0f;
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                m, n, k, &alpha, d_A, m, d_B, k, &beta, d_C, m);
    // End capture
    cudaStreamEndCapture(stream, &graph);
    // Instantiate the graph
    cudaGraphExec_t graphExec;
    cudaGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0);
    // Replay the graph with near-zero launch overhead
    for (int i = 0; i < iterations; i++) {
        cudaGraphLaunch(graphExec, stream);
    }
    cudaStreamSynchronize(stream);
    // Cleanup
    cudaGraphExecDestroy(graphExec);
    cudaGraphDestroy(graph);
    cudaStreamDestroy(stream);
    cublasDestroy(handle);
}
class LinearLayer {
private:
    cublasHandle_t handle_;
    float* weights_;  // [out_features, in_features]
    float* bias_;     // [out_features]
    int in_features_, out_features_;
    void initializeWeights();  // e.g. random init via cuRAND (not shown)
public:
    LinearLayer(int in_features, int out_features)
        : in_features_(in_features), out_features_(out_features) {
        cublasCreate(&handle_);
        // Allocate weights and bias
        cudaMalloc(&weights_, out_features * in_features * sizeof(float));
        cudaMalloc(&bias_, out_features * sizeof(float));
        initializeWeights();
    }
    void forward(const float* input, float* output, int batch_size) {
        float alpha = 1.0f, beta = 0.0f;
        const float one = 1.0f;
        // output = input * weights^T; input is batch_size x in_features and
        // output is batch_size x out_features, column-major with ld = batch_size
        cublasSgemm(handle_, CUBLAS_OP_N, CUBLAS_OP_T,
                    batch_size, out_features_, in_features_,
                    &alpha, input, batch_size, weights_, out_features_,
                    &beta, output, batch_size);
        // Add the bias to each sample (row i of output has stride batch_size)
        for (int i = 0; i < batch_size; i++) {
            cublasSaxpy(handle_, out_features_, &one, bias_, 1,
                        output + i, batch_size);
        }
    }
    void backward(const float* grad_output, const float* input,
                  float* grad_input, float* grad_weight, float* grad_bias,
                  int batch_size) {
        float alpha = 1.0f, beta = 0.0f;
        const float one = 1.0f;
        // Weight gradient: grad_weight = grad_output^T * input
        cublasSgemm(handle_, CUBLAS_OP_T, CUBLAS_OP_N,
                    out_features_, in_features_, batch_size,
                    &alpha, grad_output, batch_size, input, batch_size,
                    &beta, grad_weight, out_features_);
        // Input gradient: grad_input = grad_output * weights
        cublasSgemm(handle_, CUBLAS_OP_N, CUBLAS_OP_N,
                    batch_size, in_features_, out_features_,
                    &alpha, grad_output, batch_size, weights_, out_features_,
                    &beta, grad_input, batch_size);
        // Bias gradient: sum grad_output over the batch (zero it first)
        cudaMemset(grad_bias, 0, out_features_ * sizeof(float));
        for (int i = 0; i < batch_size; i++) {
            cublasSaxpy(handle_, out_features_, &one,
                        grad_output + i, batch_size, grad_bias, 1);
        }
    }
    ~LinearLayer() {
        cudaFree(weights_);
        cudaFree(bias_);
        cublasDestroy(handle_);
    }
};
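A hypothetical usage sketch: one forward pass for a batch of 32 samples through a 784-to-256 layer, with input and output buffers in device memory.

void linearLayerExample() {
    LinearLayer layer(784, 256);
    float *d_in, *d_out;
    cudaMalloc(&d_in, 32 * 784 * sizeof(float));
    cudaMalloc(&d_out, 32 * 256 * sizeof(float));
    // ... fill d_in with column-major [batch, features] data ...
    layer.forward(d_in, d_out, 32);
    cudaFree(d_in); cudaFree(d_out);
}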
class MatrixSolver {
private:
    cublasHandle_t handle_;
public:
    MatrixSolver() {
        cublasCreate(&handle_);
    }
    // Solve A * x = b for upper-triangular A using the device-side solver.
    // A general system first needs an LU factorization (cuSOLVER's
    // cusolverDnSgetrf/getrs); cuBLAS itself only provides the triangular
    // solves used afterwards. A, x, and b are device pointers.
    void solveTriangularSystem(const float* A, float* x, const float* b, int n) {
        // x = b
        cublasScopy(handle_, n, b, 1, x, 1);
        // x = A^{-1} * x, A upper triangular
        cublasStrsv(handle_, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N,
                    CUBLAS_DIAG_NON_UNIT, n, A, n, x, 1);
    }
    // Least squares via QR (simplified): assumes A already holds the QR
    // factorization from cuSOLVER's geqrf, with R in its upper triangle.
    void leastSquares(const float* A, float* x, const float* b, int m, int n) {
        float alpha = 1.0f, beta = 0.0f;
        // x = Q^T * b (illustrative; a real implementation applies Q via ormqr)
        cublasSgemv(handle_, CUBLAS_OP_T, m, n,
                    &alpha, A, m, b, 1, &beta, x, 1);
        // Solve R * x = Q^T * b, with R upper triangular
        cublasStrsv(handle_, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N,
                    CUBLAS_DIAG_NON_UNIT, n, A, m, x, 1);
    }
    ~MatrixSolver() {
        cublasDestroy(handle_);
    }
};
class SignalProcessor {
private:
    cublasHandle_t handle_;
public:
    SignalProcessor() {
        cublasCreate(&handle_);
    }
    // Autocorrelation by direct computation. An FFT-based approach (cuFFT)
    // is asymptotically faster; the cuBLAS version is simple but O(n^2).
    // signal is a device pointer; result is a host array (host pointer mode).
    void autoCorrelation(const float* signal, float* result, int n) {
        for (int lag = 0; lag < n; lag++) {
            float dot_product = 0.0f;
            const float* x = signal;
            const float* y = signal + lag;
            int length = n - lag;
            // Dot product of the signal with its shifted copy
            cublasSdot(handle_, length, x, 1, y, 1, &dot_product);
            result[lag] = dot_product;
        }
    }
    // Convolution expressed as a matrix-vector product
    void convolution(const float* signal, const float* kernel,
                     float* output, int signal_len, int kernel_len) {
        int output_len = signal_len + kernel_len - 1;
        // Build a Toeplitz matrix of size output_len x kernel_len from the
        // signal (filling it requires a custom kernel, omitted here)
        float* toeplitz;
        cudaMalloc(&toeplitz, output_len * kernel_len * sizeof(float));
        // ... fill the Toeplitz matrix ...
        // Convolution as GEMV: output = Toeplitz * kernel
        float alpha = 1.0f, beta = 0.0f;
        cublasSgemv(handle_, CUBLAS_OP_N,
                    output_len, kernel_len,
                    &alpha, toeplitz, output_len,
                    kernel, 1, &beta, output, 1);
        cudaFree(toeplitz);
    }
};
class CublasBenchmark {
private:
    cublasHandle_t handle_;
public:
    CublasBenchmark() {
        cublasCreate(&handle_);
    }
    template<typename Func>
    float benchmark(Func func, int warmup = 10, int iterations = 100) {
        // Warmup
        for (int i = 0; i < warmup; i++) {
            func();
        }
        cudaDeviceSynchronize();
        // Benchmark
        auto start = std::chrono::high_resolution_clock::now();
        for (int i = 0; i < iterations; i++) {
            func();
        }
        cudaDeviceSynchronize();
        auto end = std::chrono::high_resolution_clock::now();
        float totalTime = std::chrono::duration<float>(end - start).count();
        return totalTime / iterations;
    }
    void benchmarkGemmSizes() {
        std::vector<std::tuple<int, int, int>> sizes = {
            {64, 64, 64},
            {128, 128, 128},
            {256, 256, 256},
            {512, 512, 512},
            {1024, 1024, 1024},
            {2048, 2048, 2048}
        };
        for (auto [m, n, k] : sizes) {
            // Allocate test matrices
            float *A, *B, *C;
            cudaMalloc(&A, m * k * sizeof(float));
            cudaMalloc(&B, k * n * sizeof(float));
            cudaMalloc(&C, m * n * sizeof(float));
            // Benchmark
            float time = benchmark([&]() {
                float alpha = 1.0f, beta = 0.0f;
                cublasSgemm(handle_, CUBLAS_OP_N, CUBLAS_OP_N,
                            m, n, k, &alpha, A, m, B, k, &beta, C, m);
            });
            // A GEMM performs 2*m*n*k floating-point operations
            double gflops = 2.0 * m * n * k / (time * 1e9);
            printf("Size: %dx%dx%d, Time: %.3f ms, GFLOPS: %.2f\n",
                   m, n, k, time * 1000, gflops);
            cudaFree(A); cudaFree(B); cudaFree(C);
        }
    }
    ~CublasBenchmark() {
        cublasDestroy(handle_);
    }
};
void analyzeMemoryAccess() {
    cublasHandle_t handle;
    cublasCreate(&handle);
    int m = 1024, n = 1024, k = 1024;
    // Compare all four transpose combinations
    std::vector<cublasOperation_t> trans_ops = {CUBLAS_OP_N, CUBLAS_OP_T};
    for (auto transA : trans_ops) {
        for (auto transB : trans_ops) {
            // The leading dimension is the row count of the matrix as stored,
            // i.e. before the transpose is applied
            int lda = (transA == CUBLAS_OP_N) ? m : k;
            int ldb = (transB == CUBLAS_OP_N) ? k : n;
            float *A, *B, *C;
            cudaMalloc(&A, m * k * sizeof(float));
            cudaMalloc(&B, k * n * sizeof(float));
            cudaMalloc(&C, m * n * sizeof(float));
            // Time a single GEMM (drain pending work first)
            cudaDeviceSynchronize();
            auto start = std::chrono::high_resolution_clock::now();
            float alpha = 1.0f, beta = 0.0f;
            cublasSgemm(handle, transA, transB,
                        m, n, k, &alpha, A, lda, B, ldb, &beta, C, m);
            cudaDeviceSynchronize();
            auto end = std::chrono::high_resolution_clock::now();
            float time = std::chrono::duration<float>(end - start).count();
            printf("TransA: %c, TransB: %c, Time: %.3f ms\n",
                   transA == CUBLAS_OP_N ? 'N' : 'T',
                   transB == CUBLAS_OP_N ? 'N' : 'T',
                   time * 1000);
            cudaFree(A); cudaFree(B); cudaFree(C);
        }
    }
    cublasDestroy(handle);
}
#define CUBLAS_CHECK(call) \
    do { \
        cublasStatus_t status = call; \
        if (status != CUBLAS_STATUS_SUCCESS) { \
            printf("cuBLAS error %d at %s:%d\n", status, __FILE__, __LINE__); \
            exit(EXIT_FAILURE); \
        } \
    } while (0)

// Usage example
CUBLAS_CHECK(cublasCreate(&handle_));
CUBLAS_CHECK(cublasSgemm(handle_, CUBLAS_OP_N, CUBLAS_OP_N,
                         m, n, k, &alpha, A, m, B, k, &beta, C, m));
class CublasResourceManager {
private:
    std::vector<void*> allocated_memory_;
    cublasHandle_t handle_;
public:
    CublasResourceManager() {
        cublasCreate(&handle_);
    }
    void* allocate(size_t size) {
        void* ptr;
        cudaError_t error = cudaMalloc(&ptr, size);
        if (error == cudaSuccess) {
            allocated_memory_.push_back(ptr);
            return ptr;
        }
        return nullptr;
    }
    ~CublasResourceManager() {
        for (void* ptr : allocated_memory_) {
            cudaFree(ptr);
        }
        cublasDestroy(handle_);
    }
};
As the core library for GPU-accelerated linear algebra, cuBLAS provides highly tuned implementations of the full BLAS interface, batched and mixed-precision extensions, and fine-grained control through cuBLASLt.
Used with the APIs and optimization techniques described above, it can significantly improve the performance of scientific computing, machine learning, and deep learning applications, and as GPU hardware continues to evolve, cuBLAS will remain a cornerstone of high-performance computing.