CUDA时长统计
技術背景
前面的一篇文章中介紹了在CUDA中使用宏來監測CUDA C函數或者Kernel函數的運行報錯問題。同樣的思路,我們可以寫一個用于統計函數運行時長的宏,這樣不需要使用額外的工具來對函數體的性能進行測試。
文件準備
因為這里的宏改動,主要涉及CUDA頭文件和CUDA文件的修改,所以Cython文件和Python文件還有異常捕獲宏我們還是復用這篇文章里面用到的。測試內容是,定義一個原始數組和一個索引數組,輸出索引的結果數組。
wrapper.pyx
# cythonize -i -f wrapper.pyx
import numpy as np
cimport numpy as np
cimport cython

cdef extern from "<dlfcn.h>" nogil:
    void *dlopen(const char *, int)
    char *dlerror()
    void *dlsym(void *, const char *)
    int dlclose(void *)
    enum:
        RTLD_LAZY

# C signature of the "Gather" symbol exported by libcuindex.so.
ctypedef int (*GatherFunc)(float *source, int *index, float *res, int N, int M) noexcept nogil

# Load the CUDA shared library once, at module import time.
cdef void* handle = dlopen('/path/to/libcuindex.so', RTLD_LAZY)

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef float[:] cuda_gather(float[:] x, int[:] idx):
    """Gather x[idx] on the GPU and return the result as a float memoryview.

    x   -- source array of M float32 values
    idx -- index array of N int32 values (assumed in [0, M) -- not checked here)
    """
    cdef:
        GatherFunc Gather
        int success
        int N = idx.shape[0]
        int M = x.shape[0]
        float[:] res = np.zeros((N, ), dtype=np.float32)
    Gather = <GatherFunc>dlsym(handle, "Gather")
    success = Gather(&x[0], &idx[0], &res[0], N, M)
    return res

# Intentionally unreachable: the library handle is kept open for the whole
# lifetime of the process. NOTE(review): dlclose is never actually called.
while not True:
    dlclose(handle)
test_gather.py
# Driver script: builds random inputs, runs the CUDA gather through the
# Cython wrapper, and checks the result against NumPy fancy indexing x[idx].
import numpy as np
np.random.seed(0)
from wrapper import cuda_gather
# M: source array length (128M float32 values, i.e. 512 MB on the host).
M = 1024 * 1024 * 128
# N: number of indices to gather (1M elements).
N = 1024 * 1024
x = np.random.random((M,)).astype(np.float32)
idx = np.random.randint(0, M, (N,)).astype(np.int32)
res = np.asarray(cuda_gather(x, idx))
print (res.shape)
# Expected to equal N: every gathered element matches the reference x[idx].
print ((res==x[idx]).sum())
error.cuh
#pragma once
#include <stdio.h>

// Evaluate a CUDA runtime call; on failure, report where it happened and
// why, then terminate the process. The do/while(0) wrapper makes the macro
// behave like a single statement after a trailing semicolon.
#define CHECK(call)                                                          \
do {                                                                         \
    const cudaError_t error_code = call;                                     \
    if (error_code != cudaSuccess) {                                         \
        printf("CUDA Error:\n");                                             \
        printf(" File: %s\n", __FILE__);                                     \
        printf(" Line: %d\n", __LINE__);                                     \
        printf(" Error code: %d\n", error_code);                             \
        printf(" Error text: %s\n", cudaGetErrorString(error_code));         \
        exit(1);                                                             \
    }                                                                        \
} while (0)
計時宏
這里增加一個用于計時的record.cuh頭文件,里面寫一個TIME_CUDA_FUNCTION宏,然后在CUDA中需要統計的函數前調用,就可以輸出CUDA函數的運行時長了。
#pragma once
#include <stdio.h>
#include <cuda_runtime.h>
// Macro for measuring the execution time of a CUDA call.
// Usage: TIME_CUDA_FUNCTION((kernel<<<grid, block>>>(args)));  -- the extra
// parentheses keep the <<<...>>> commas from splitting the macro argument.
// Brackets `func` between two cudaEvent records, synchronizes on the stop
// event, and prints the elapsed GPU time in milliseconds along with the
// stringified call (#func).
// NOTE(review): the cudaEvent* return codes are not checked here.
#define TIME_CUDA_FUNCTION(func) \
do { \
cudaEvent_t start, stop; \
float elapsedTime; \
cudaEventCreate(&start); \
cudaEventCreate(&stop); \
cudaEventRecord(start, NULL); \
\
func; \
\
cudaEventRecord(stop, NULL); \
cudaEventSynchronize(stop); \
cudaEventElapsedTime(&elapsedTime, start, stop); \
printf("Time taken by function %s is: %f ms\n", #func, elapsedTime); \
\
cudaEventDestroy(start); \
cudaEventDestroy(stop); \
} while (0)
計時宏的使用
我們在CUDA文件cuda_index.cu中調用record.cuh里面的計時宏,這里用來統計一個CUDA核函數的執行時間:
// nvcc -shared ./cuda_index.cu -Xcompiler -fPIC -o ./libcuindex.so
#include <stdio.h>
#include "cuda_index.cuh"
#include "error.cuh"
#include "record.cuh"
// One thread per output element: res[i] = source[index[i]] for i < N.
// Threads with a global id >= N do nothing (grid may overshoot N).
void __global__ GatherKernel(float *source, int *index, float *res, int N){
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < N){
res[idx] = source[index[idx]];
}
}
// Host entry point (C linkage so dlsym can find the unmangled symbol).
// Gathers res[i] = source[index[i]] on the GPU and copies the result back.
//   source: host array of M floats; index: host array of N ints,
//   assumed in [0, M); res: host output buffer of N floats.
// Returns 1 on success; any CUDA error makes CHECK terminate the process.
extern "C" int Gather(float *source, int *index, float *res, int N, int M){
    float *source_device, *res_device;
    int *index_device;
    CHECK(cudaMalloc((void **)&source_device, M * sizeof(float)));
    CHECK(cudaMalloc((void **)&res_device, N * sizeof(float)));
    CHECK(cudaMalloc((void **)&index_device, N * sizeof(int)));
    CHECK(cudaMemcpy(source_device, source, M * sizeof(float), cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(index_device, index, N * sizeof(int), cudaMemcpyHostToDevice));
    // res_device is write-only for the kernel (every element i < N is
    // assigned), so no host-to-device copy of the result buffer is needed.
    int block_size = 1024;
    int grid_size = (N + block_size - 1) / block_size;
    // Extra parentheses: the <<<...>>> commas must not split the macro argument.
    TIME_CUDA_FUNCTION((GatherKernel<<<grid_size, block_size>>>(source_device, index_device, res_device, N)));
    CHECK(cudaGetLastError());
    CHECK(cudaDeviceSynchronize());
    CHECK(cudaMemcpy(res, res_device, N * sizeof(float), cudaMemcpyDeviceToHost));
    CHECK(cudaFree(source_device));
    CHECK(cudaFree(index_device));
    CHECK(cudaFree(res_device));
    // NOTE(review): cudaDeviceReset() destroys the whole CUDA context.
    // Acceptable in this one-shot demo, but avoid it in a reusable library.
    CHECK(cudaDeviceReset());
    return 1;
}
需要注意的是,TIME_CUDA_FUNCTION宏只能有一個輸入,但是使用CUDA核函數的時候實際上會被當作是兩個輸入,因此我們需要將CUDA核函數用括號再封裝起來。
輸出結果
最終按照這篇文章中的運行流程,可以得到這樣的輸出結果:
Time taken by function (GatherKernel<<<grid_size, block_size>>>(souce_device, index_device, res_device, N)) is: 0.584224 ms
(1048576,)
1048576
這里CUDA核函數的運行時長被正確地格式化輸出了。
返回耗時數值
除了在CUDA中直接打印耗時的數值,我們還可以修改record.cuh中的宏,讓其返回耗時數值:
#pragma once
#include <stdio.h>
#include <cuda_runtime.h>
// Macro for measuring the execution time of a CUDA call (prints the result).
// Usage: TIME_CUDA_FUNCTION((kernel<<<grid, block>>>(args)));  -- the extra
// parentheses keep the <<<...>>> commas from splitting the macro argument.
// NOTE(review): the cudaEvent* return codes are not checked here.
#define TIME_CUDA_FUNCTION(func) \
do { \
cudaEvent_t start, stop; \
float elapsedTime; \
cudaEventCreate(&start); \
cudaEventCreate(&stop); \
cudaEventRecord(start, NULL); \
\
func; \
\
cudaEventRecord(stop, NULL); \
cudaEventSynchronize(stop); \
cudaEventElapsedTime(&elapsedTime, start, stop); \
printf("Time taken by function %s is: %f ms\n", #func, elapsedTime); \
\
cudaEventDestroy(start); \
cudaEventDestroy(stop); \
} while (0)
// 宏定義,用于測量CUDA函數的執行時間并返回該時間
#define GET_CUDA_TIME(func) \
({ \
cudaEvent_t start, stop; \
float elapsedTime = 0.0f; \
cudaEventCreate(&start); \
cudaEventCreate(&stop); \
cudaEventRecord(start, NULL); \
\
func; \
\
cudaEventRecord(stop, NULL); \
cudaEventSynchronize(stop); \
cudaEventElapsedTime(&elapsedTime, start, stop); \
\
cudaEventDestroy(start); \
cudaEventDestroy(stop); \
\
elapsedTime; \
})
修改頭文件cuda_index.cuh,因為這里我們需要返回一個運行時長的float數值,不再是int類型了:
#include <stdio.h>
// Exported with C linkage so dlsym can resolve the unmangled symbol
// "Gather". Returns the kernel execution time in milliseconds.
extern "C" float Gather(float *source, int *index, float *res, int N, int M);
最后再對應修改下cuda_index.cu中的內容:
// nvcc -shared ./cuda_index.cu -Xcompiler -fPIC -o ./libcuindex.so
#include <stdio.h>
#include "cuda_index.cuh"
#include "error.cuh"
#include "record.cuh"
// One thread per output element: res[i] = source[index[i]] for i < N.
// Threads with a global id >= N do nothing (grid may overshoot N).
void __global__ GatherKernel(float *source, int *index, float *res, int N){
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < N){
res[idx] = source[index[idx]];
}
}
// Host entry point (C linkage so dlsym can find the unmangled symbol).
// Gathers res[i] = source[index[i]] on the GPU and copies the result back.
//   source: host array of M floats; index: host array of N ints,
//   assumed in [0, M); res: host output buffer of N floats.
// Returns the kernel execution time in milliseconds as measured by
// GET_CUDA_TIME; any CUDA error makes CHECK terminate the process.
extern "C" float Gather(float *source, int *index, float *res, int N, int M){
    float *source_device, *res_device;
    int *index_device;
    CHECK(cudaMalloc((void **)&source_device, M * sizeof(float)));
    CHECK(cudaMalloc((void **)&res_device, N * sizeof(float)));
    CHECK(cudaMalloc((void **)&index_device, N * sizeof(int)));
    CHECK(cudaMemcpy(source_device, source, M * sizeof(float), cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(index_device, index, N * sizeof(int), cudaMemcpyHostToDevice));
    // res_device is write-only for the kernel (every element i < N is
    // assigned), so no host-to-device copy of the result buffer is needed.
    int block_size = 1024;
    int grid_size = (N + block_size - 1) / block_size;
    // Extra parentheses: the <<<...>>> commas must not split the macro argument.
    float timeTaken = GET_CUDA_TIME((GatherKernel<<<grid_size, block_size>>>(source_device, index_device, res_device, N)));
    CHECK(cudaGetLastError());
    CHECK(cudaDeviceSynchronize());
    CHECK(cudaMemcpy(res, res_device, N * sizeof(float), cudaMemcpyDeviceToHost));
    CHECK(cudaFree(source_device));
    CHECK(cudaFree(index_device));
    CHECK(cudaFree(res_device));
    // NOTE(review): cudaDeviceReset() destroys the whole CUDA context.
    // Acceptable in this one-shot demo, but avoid it in a reusable library.
    CHECK(cudaDeviceReset());
    return timeTaken;
}
這樣就可以把函數運行耗時的數值返回給Cython文件,然后在Cython文件wrapper.pyx中打印耗時:
# cythonize -i -f wrapper.pyx
import numpy as np
cimport numpy as np
cimport cython

cdef extern from "<dlfcn.h>" nogil:
    void *dlopen(const char *, int)
    char *dlerror()
    void *dlsym(void *, const char *)
    int dlclose(void *)
    enum:
        RTLD_LAZY

# Matches the float-returning "Gather" in the updated libcuindex.so:
# the return value is the kernel execution time in milliseconds.
ctypedef float (*GatherFunc)(float *source, int *index, float *res, int N, int M) noexcept nogil

# Load the CUDA shared library once, at module import time.
cdef void* handle = dlopen('/home/dechin/projects/gitee/dechin/tests/cuda/libcuindex.so', RTLD_LAZY)

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef float[:] cuda_gather(float[:] x, int[:] idx):
    """Gather x[idx] on the GPU, print the kernel time (ms), return the result.

    x   -- source array of M float32 values
    idx -- index array of N int32 values (assumed in [0, M) -- not checked here)
    """
    cdef:
        GatherFunc Gather
        float timeTaken
        int N = idx.shape[0]
        int M = x.shape[0]
        float[:] res = np.zeros((N, ), dtype=np.float32)
    Gather = <GatherFunc>dlsym(handle, "Gather")
    timeTaken = Gather(&x[0], &idx[0], &res[0], N, M)
    print (timeTaken)
    return res

# Intentionally unreachable: the library handle is kept open for the whole
# lifetime of the process. NOTE(review): dlclose is never actually called.
while not True:
    dlclose(handle)
最后再通過Python模塊調用(無需改動),輸出結果為:
0.6107839941978455
(1048576,)
1048576
這里的單位是ms。
總結概要
這篇文章主要介紹了一個CUDA入門的技術:使用CUDA頭文件寫一個專門用于CUDA函數運行時長統計的宏,這樣就可以統計目標Kernel函數的運行時長。可以直接在CUDA中打印相應的數值,也可以回傳到Cython或者Python中進行打印。
版權聲明
本文首發鏈接為:https://www.cnblogs.com/dechinphy/p/cuda-time-record.html
作者ID:DechinPhy
更多原著文章:https://www.cnblogs.com/dechinphy/
請博主喝咖啡:https://www.cnblogs.com/dechinphy/gallery/image/379634.html
總結
- 上一篇: 使用exe4j将java项目打成exe执
- 下一篇: ES6中的模板字符串---反引号``