日韩性视频-久久久蜜桃-www中文字幕-在线中文字幕av-亚洲欧美一区二区三区四区-撸久久-香蕉视频一区-久久无码精品丰满人妻-国产高潮av-激情福利社-日韩av网址大全-国产精品久久999-日本五十路在线-性欧美在线-久久99精品波多结衣一区-男女午夜免费视频-黑人极品ⅴideos精品欧美棵-人人妻人人澡人人爽精品欧美一区-日韩一区在线看-欧美a级在线免费观看

歡迎訪問 生活随笔!

生活随笔

當前位置: 首頁 > 编程资源 > 编程问答 >内容正文

编程问答

GPU与CPU版本的矩阵乘法对比

發布時間:2024/4/18 编程问答 27 豆豆
生活随笔 收集整理的這篇文章主要介紹了 GPU与CPU版本的矩阵乘法对比 小編覺得挺不錯的,現在分享給大家,幫大家做個參考.

由于剛剛開始學習Cuda,還沒有整理出一個完整的Cuda類,只是在Nvidia提供的kernel架構上做修改。

  但用于初體驗GPU給我們帶來的好處也綽綽有余了。

?

  直接貼代碼:

/*
 * Matrix multiplication: CPU reference implementation versus two GPU kernels.
 *
 * All matrices are stored row-major in flat float arrays.
 * C(HA x WB) = A(HA x WA) * B(HB x WB), which requires WA == HB.
 * The launch configuration assumes WB and HA are exact multiples of the
 * 16x16 block size (true for the 1024x1024 matrices used in main).
 */
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <Windows.h>
#include <string>
#include <malloc.h>

// Selects which GPU kernel variant addWithCuda launches.
enum Type
{
    Mode1 = 1, // one thread computes one complete output element C[i][j]
    Mode2 = 2  // tiled kernel: C(i,j) = sum over sub-blocks of
               // A(i, ksub+offsetA) * B(ksub+offsetB, j), 0 <= ksub < BLOCK_SIZE.
               // Each block multiplies BLOCK_SIZE x BLOCK_SIZE tiles of A and B
               // staged through shared memory and accumulates the partial sums.
};

cudaError_t addWithCuda(float *c, const float *a, const float *b,
                        unsigned int WA, unsigned int HA,
                        unsigned int WB, unsigned int HB, Type mode);

// Naive kernel: thread (row, col) computes C[row][col] directly from
// global memory. Expects a (WB/blockDim.x) x (HA/blockDim.y) grid.
__global__ void MatrixMulGPU_1(float *c, const float *a, const float *b,
                               unsigned int WA, unsigned int WB)
{
    // Global row/column this thread is responsible for.
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    float sum = 0.0f;
    // Reduction runs over the inner dimension WA (== HB). The original code
    // iterated up to WB, which only happened to work for square matrices.
    for (int k = 0; k < WA; ++k)
    {
        sum += a[row * WA + k] * b[k * WB + col];
    }
    c[row * WB + col] = sum;
}

// Tiled kernel: each block walks matching BLOCK_SIZE-wide strips of A and B,
// staging one tile of each in shared memory per iteration so every element
// is read from global memory only once per block instead of once per thread.
// Requires blockDim == (BLOCK_SIZE, BLOCK_SIZE).
template<int BLOCK_SIZE>
__global__ void MatrixMulGPU_2(float *c, const float *a, const float *b,
                               unsigned int WA, unsigned int WB)
{
    // Block and thread indices.
    int bx = blockIdx.x;
    int by = blockIdx.y;
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    // First and last sub-matrix of A processed by this block, and the step
    // sizes used to advance through the strips of A (row-wise) and B (column-wise).
    int aBegin = WA * BLOCK_SIZE * by;
    int aEnd   = aBegin + WA - 1;
    int aStep  = BLOCK_SIZE;
    int bBegin = BLOCK_SIZE * bx;
    int bStep  = BLOCK_SIZE * WB;

    // Partial result for the single output element this thread owns.
    float Csub = 0.0f;

    // Loop over all the sub-matrices of A and B required for this block's tile.
    for (int i = aBegin, j = bBegin; i <= aEnd; i += aStep, j += bStep)
    {
        // Shared-memory staging tiles for A and B.
        __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
        __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

        // Each thread loads one element of each tile.
        As[ty][tx] = a[i + WA * ty + tx];
        Bs[ty][tx] = b[j + WB * ty + tx];

        // All loads must complete before any thread reads the tiles.
        __syncthreads();

        // Multiply the two tiles; each thread accumulates one element.
#pragma unroll
        for (int k = 0; k < BLOCK_SIZE; ++k)
        {
            Csub += As[ty][k] * Bs[k][tx];
        }

        // Tiles must not be overwritten while any thread is still reading them.
        __syncthreads();
    }

    // Write the finished element of the block's output tile.
    int cBase = WB * BLOCK_SIZE * by + BLOCK_SIZE * bx;
    c[cBase + WB * ty + tx] = Csub;
}

// CPU reference version (the original comment mislabeled this "GPU version").
// _C must be zero-initialized by the caller; results are accumulated into it.
void MatrixMulCPU(float *_C, const float* _A, const float* _B,
                  int WA, int HA, int WB, int HB)
{
    if (WA != HB)
    {
        printf("the matrix A and B cannot be multipled!");
        exit(0);
    }
    for (int i = 0; i < HA; ++i)
    {
        for (int j = 0; j < WB; ++j)
        {
            for (int k = 0; k < WA; ++k)
            {
                // Fixed: C has width WB, so its rows are strided by WB, not WA.
                _C[i * WB + j] += _A[i * WA + k] * _B[k * WB + j];
            }
        }
    }
}

// Fill a matrix with uniform random values in [0, 100].
void randomInit(float* _data, int _size)
{
    for (int i = 0; i < _size; ++i)
    {
        _data[i] = rand() / (float)RAND_MAX * 100;
    }
}

// Print a W x H matrix, one row per line.
void printMatrix(float* m_Matrix, int W, int H)
{
    for (int i = 0; i < W * H; ++i)
    {
        printf("%2.1f ", m_Matrix[i]);
        // Fixed: break after the last element of each row ((i+1) % W == 0);
        // the original broke one element late.
        if ((i + 1) % W == 0)
            printf("\n");
    }
    printf("\n");
}

// Compare two result matrices element-wise. CPU and GPU accumulate the
// products in different orders, so bit-exact float equality is not expected;
// a small relative-plus-absolute tolerance is used instead (the original
// exact comparison reported false mismatches).
bool CheckAnswer(const float* _C, const float* _D, unsigned int size)
{
    for (unsigned int i = 0; i < size; ++i)
    {
        float diff = fabsf(_C[i] - _D[i]);
        float tol  = 1e-3f * fabsf(_C[i]) + 1e-3f;
        if (diff > tol)
            return false;
    }
    return true;
}

int main()
{
    const int width_A  = 1024;
    const int height_A = 1024;
    const int width_B  = 1024;
    const int height_B = 1024;

    float *B = (float *)malloc(sizeof(float) * height_B * width_B);
    float *A = (float *)malloc(sizeof(float) * height_A * width_A);
    float *C = (float *)malloc(sizeof(float) * height_A * width_B);
    float *D = (float *)malloc(sizeof(float) * height_A * width_B);
    float *E = (float *)malloc(sizeof(float) * height_A * width_B);

    memset(A, 0, sizeof(float) * height_A * width_A);
    memset(B, 0, sizeof(float) * height_B * width_B);
    memset(C, 0, sizeof(float) * height_A * width_B);
    memset(D, 0, sizeof(float) * height_A * width_B);
    memset(E, 0, sizeof(float) * height_A * width_B);

    // Seed the random generator and fill the inputs.
    srand((unsigned)time(0));
    randomInit(B, height_B * width_B);
    randomInit(A, height_A * width_A);
    //printMatrix(B, width_B, height_B);
    //printMatrix(A, width_A, height_A);

    // CPU timing.
    unsigned int tick1 = GetTickCount();
    MatrixMulCPU(C, A, B, width_A, height_A, width_B, height_B);
    printf("CPU use time : %dms\n", GetTickCount() - tick1);

    // GPU, naive kernel.
    Type m_Mode = Mode1;
    unsigned int tick2 = GetTickCount();
    cudaError_t cudaStatus = addWithCuda(D, A, B, width_A, height_A, width_B, height_B, m_Mode);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "addWithCuda failed!\n");
        return 1;
    }
    printf("GPU mode1 use time : %dms\n", GetTickCount() - tick2);

    // GPU, shared-memory tiled kernel.
    m_Mode = Mode2;
    unsigned int tick3 = GetTickCount();
    cudaStatus = addWithCuda(E, A, B, width_A, height_A, width_B, height_B, m_Mode);
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "addWithCuda failed!\n");
        return 1;
    }
    printf("GPU mode2 use time : %dms\n", GetTickCount() - tick3);

    // Both GPU results must match the CPU reference. Fixed: the original used
    // "&&", which only reported an error when BOTH kernels disagreed.
    if (!CheckAnswer(C, D, height_A * width_B) || !CheckAnswer(C, E, height_A * width_B))
        printf("The answer is wrong!");
    else
        printf("The answer is right!");

    // Fixed: release host buffers (the original leaked all five).
    free(A);
    free(B);
    free(C);
    free(D);
    free(E);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess)
    {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }
    return 0;
}

// Host wrapper: allocates device buffers, copies A and B over, launches the
// kernel selected by 'mode' on a 16x16-thread grid, synchronizes, and copies
// the result back into c. Returns the first CUDA error encountered.
// Assumes WB and HA are multiples of 16.
cudaError_t addWithCuda(float *c, const float *a, const float *b,
                        unsigned int WA, unsigned int HA,
                        unsigned int WB, unsigned int HB, Type mode)
{
    float *dev_a = 0;
    float *dev_b = 0;
    float *dev_c = 0;
    cudaError_t cudaStatus;

    // Choose which GPU to run on; change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate device buffers for the two inputs and the output.
    cudaStatus = cudaMalloc((void**)&dev_c, HA * WB * sizeof(float));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_a, HA * WA * sizeof(float));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }
    cudaStatus = cudaMalloc((void**)&dev_b, HB * WB * sizeof(float));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy the input matrices from host memory to device memory.
    cudaStatus = cudaMemcpy(dev_a, a, HA * WA * sizeof(float), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    cudaStatus = cudaMemcpy(dev_b, b, HB * WB * sizeof(float), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // One thread per output element; grid covers the WB x HA output.
    {
        const int block_size = 16;
        dim3 Threads(block_size, block_size);
        dim3 Blocks(WB / block_size, HA / block_size);

        if (mode == Mode1)
        {
            MatrixMulGPU_1<<<Blocks, Threads>>>(dev_c, dev_a, dev_b, WA, WB);
        }
        if (mode == Mode2)
        {
            MatrixMulGPU_2<16><<<Blocks, Threads>>>(dev_c, dev_a, dev_b, WA, WB);
        }
    }

    // Check for any errors launching the kernel (bad launch configuration).
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // Wait for the kernel to finish and surface any execution errors.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy the result back to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, HA * WB * sizeof(float), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);
    return cudaStatus;
}

代碼中,總共使用了CPU的計算和兩種GPU的運算,最終的運行結果如下:

  

可以明顯的看出,GPU的運行速度比CPU快很多,并且將任務越細分,運行的速度也更快。

后續我還想通過更多的方式(比如texture binding)來繼續進行優化。


原文鏈接:http://www.cnblogs.com/stormhan/p/5467187.html




總結

以上是生活随笔為你收集整理的GPU与CPU版本的矩阵乘法对比的全部內容,希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯,歡迎將生活随笔推薦給好友。