OpenCL简单入门
一、前言
最近在做三維點云處理方面的項目,對于三維數據方面的處理來說是非常耗時的,為了加快項目算法的處理速度,于是充分發揮計算機的GPU處理性能,在對項目算法中的不同模塊采用了Opencl和CUDA加速技術。這篇只在這里記錄Opencl部分,后續會更新CUDA部分。一如既往,從簡單的入門開始,下面將分小節開始。
二、OpenCL的相關概念
OpenCL是一個為異構平臺編寫程序的框架,此異構平臺可由CPU、GPU或其他類型的處理器組成。OpenCL由一門用于編寫kernels (在OpenCL設備上運行的核函數)的語言(基于C99)和一組用于定義并控制平臺的API組成。OpenCL提供了兩種層面的并行機制:數據并行和任務并行。一個完整的OpenCL加速技術過程涉及到平臺(Platform)、設備(Device)、上下文(Context)、OpenCL程序(Program)、指令隊列(Command)、核函數(Kernel)、內存對象(Memory Object)、調用設備接口(NDRange),下面將分別進行做簡單的介紹,后面也會給出參考資料的相關鏈接。
三、OpenCL編程的步驟
Opencl編程的步驟比較繁瑣,但是都比較固定,下面集合代碼進行介紹,利用理解操作。
1.平臺查找和初始化
調用兩次clGetPlatformIDs函數,第一次獲取可用的平臺數量,第二次獲取一個可用的平臺。代碼參考如下:
int getPlatform(cl_platform_id &platform) {platform = NULL;//the chosen platformcl_uint numPlatforms;//the NO. of platformscl_int status = clGetPlatformIDs(0, NULL, &numPlatforms);if (status != CL_SUCCESS){cout << "Error: Getting platforms!" << endl;return -1;}/**For clarity, choose the first available platform. */if (numPlatforms > 0){cl_platform_id* platforms =(cl_platform_id*)malloc(numPlatforms * sizeof(cl_platform_id));status = clGetPlatformIDs(numPlatforms, platforms, NULL);platform = platforms[0];free(platforms);}elsereturn -1; }2.設備查找和初始化
調用兩次clGetDeviceIDs函數,第一次獲取可用的設備數量,第二次獲取一個可用的設備。代碼參考如下:
cl_device_id *getCl_device_id(cl_platform_id &platform) {cl_uint numDevices = 0;cl_device_id *devices = NULL;cl_int status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);if (numDevices > 0) //GPU available.{devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id));status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL);}return devices; }3.創建上下文
調用clCreateContext函數,上下文context可能會管理多個設備device。代碼參考如下:
oclContext=clCreateContext(NULL,1,&oclComputeDeviceID,NULL,NULL,&ret_ocl);4.創建命令隊列
調用clCreateCommandQueue函數,一個設備device對應一個command queue。上下文conetxt將命令發送到設備對應的command queue,設備就可以執行命令隊列里的命令。代碼參考如下:
oclCommandQueue=clCreateCommandQueue(oclContext,oclComputeDeviceID,0,&ret_ocl);5.創建內存對象
調用clCreateBuffer函數,Buffer中保存的是數據對象,就是設備執行程序需要的數據保存在其中。Buffer由上下文conetxt創建,這樣上下文管理的多個設備就會共享Buffer中的數據。代碼參考如下:
deviceInput1=clCreateBuffer(oclContext,CL_MEM_READ_ONLY,size,NULL,&ret_ocl);6.創建程序對象
創建程序對象,程序對象就代表你的程序源文件或者二進制代碼數據。這里將要調用Opencl內置函數和一個自己編寫的工具函數,工具函數用于讀取Kernel核函數。文件后綴為.cl。代碼參考如下:
ret_ocl=clBuildProgram(oclProgram,0,NULL,NULL,NULL,NULL); char * ReadKernelSourceFile(const char* filename, size_t* length) {FILE *file = NULL;size_t sourcesLength;char* sourcesString;int ret;file = fopen(filename, "rb");if (file==NULL){//printf("%s at %d:Can't open %s\n", _FILE, _LINE_ - 2, filename);return NULL;}fseek(file, 0, SEEK_END);sourcesLength = ftell(file);//the file length;fseek(file, 0, SEEK_SET);sourcesString = (char*)malloc(sourcesLength + 1);//sourcesString[0] = '\0';ret = fread(sourcesString, sourcesLength, 1, file);//read file failif (ret==0){//printf("%s at %d:Can't open %s\n", _FILE, _LINE_ - 2, filename);return NULL;}fclose(file);if (length!=0){*length = sourcesLength;}sourcesString[sourcesLength] = '\0';//最后一位加0表示結束return sourcesString; }7.創建核函數Kernel
調用clCreateKernel函數,根據你的程序對象,生成kernel對象,表示設備程序的入口。參考代碼如下:
// create OpenCL kernel by passing kernel function name that we used in .cl fileoclKernel=clCreateKernel(oclProgram,"vecAdd",&ret_ocl);8.設置Kernel參數
調用clSetKernelArg函數,參考代碼如下:
ret_ocl=clSetKernelArg(oclKernel,0,sizeof(cl_mem),(void *)&deviceInput1); // 'deviceInput1' maps to 'in1' param of kernel function in .cl file9.設置工作項大小并執行核函數
設置工作項的東西(worksize),可以簡單的理解為線程的多少。核函數的執行,調用調用clEnqueueNDRangeKernel函數,參考代碼如下:
ret_ocl=clEnqueueNDRangeKernel(oclCommandQueue,oclKernel,1,NULL,&globalWorkSize,&localWorkSize,0,NULL,NULL);10.讀取結果到主機端
在設備端運行完結果后,需要將結果拷貝到主機端,參與下一步的相應計算,調用調用clEnqueueReadBuffer函數。參考代碼如下:
ret_ocl=clEnqueueReadBuffer(oclCommandQueue,deviceOutput,CL_TRUE,0,size,hostOutput,0,NULL,NULL);11.資源釋放。
當完成所需加速部分得到結果后,需要將設備端的資源進行釋放,才能完成完成整個運行過程。資源釋opencl有內置的函數,參考代碼如下:
void cleanup(void) {// code// OpenCL cleanupif(oclSourceCode){free((void *)oclSourceCode);oclSourceCode=NULL;}if(oclKernel){clReleaseKernel(oclKernel);oclKernel=NULL;}if(oclProgram){clReleaseProgram(oclProgram);oclProgram=NULL;}if(oclCommandQueue){clReleaseCommandQueue(oclCommandQueue);oclCommandQueue=NULL;}if(oclContext){clReleaseContext(oclContext);oclContext=NULL;}// free allocated device-memoryif(deviceInput1){clReleaseMemObject(deviceInput1);deviceInput1=NULL;}if(deviceInput2){clReleaseMemObject(deviceInput2);deviceInput2=NULL;}if(deviceOutput){clReleaseMemObject(deviceOutput);deviceOutput=NULL;}// free allocated host-memoryif(hostInput1){free(hostInput1);hostInput1=NULL;}if(hostInput2){free(hostInput2);hostInput2=NULL;}if(hostOutput){free(hostOutput);hostOutput=NULL;}if(gold){free(gold);gold=NULL;} }至此完成opencl整個過程,下面將以向量的相加給出完整的例子,僅供參考。
四、程序實例向量相加
例子包含一個工具文件、函數核函數文件,主程序,工具文件tool.h和tool.cpp,核函數VceAdd.cl ,主程序代碼參考如下,另外兩個下載鏈接例子工程文件。
// headers #include <stdio.h> #include <stdlib.h> // exit() #include <string.h> // strlen() #include <math.h> // fabs() #include <iostream>#include "CL/opencl.h"#include "helper_timer.h"// global OpenCL variables cl_int ret_ocl; cl_platform_id oclPlatformID; cl_device_id oclComputeDeviceID; // compute device id cl_context oclContext; // compute context cl_command_queue oclCommandQueue; // compute command queue cl_program oclProgram; // compute program cl_kernel oclKernel; // compute kernelchar *oclSourceCode=NULL; size_t sizeKernelCodeLength;// odd number 11444777 is deliberate illustration ( Nvidia OpenCL Samples ) int iNumberOfArrayElements = 11444777; size_t localWorkSize=256; size_t globalWorkSize;float *hostInput1=NULL; float *hostInput2=NULL; float *hostOutput=NULL; float *gold=NULL;cl_mem deviceInput1=NULL; cl_mem deviceInput2=NULL; cl_mem deviceOutput=NULL;float timeOnCPU; float timeOnGPU;int main(void) {// function declarationsvoid fillFloatArrayWithRandomNumbers(float *, int);size_t roundGlobalSizeToNearestMultipleOfLocalSize(int, unsigned int);void vecAddHost(const float *, const float *, float *, int);char* loadOclProgramSource(const char *,const char *,size_t *);void cleanup(void);void FileInit(float *, int );// code// allocate host-memoryhostInput1=(float *)malloc(sizeof(float) * iNumberOfArrayElements);if(hostInput1== NULL){printf("CPU Memory Fatal Error = Can Not Allocate Enough Memory For Host Input Array 1.\nExitting ...\n");cleanup();exit(EXIT_FAILURE);}hostInput2=(float *)malloc(sizeof(float) * iNumberOfArrayElements);if(hostInput2== NULL){printf("CPU Memory Fatal Error = Can Not Allocate Enough Memory For Host Input Array 2.\nExitting ...\n");cleanup();exit(EXIT_FAILURE);}// allocate host-memory to hold 'float' type host vector hostOutputhostOutput=(float *)malloc(sizeof(float) * iNumberOfArrayElements);if(hostOutput== NULL){printf("CPU Memory Fatal Error = Can Not Allocate Enough Memory For Host Output Array.\nExitting ...\n");cleanup();exit(EXIT_FAILURE);}gold=(float *)malloc(sizeof(float) * iNumberOfArrayElements);if(gold== NULL){printf("CPU Memory Fatal Error = Can Not Allocate Enough Memory For Gold Output Array.\nExitting ...\n");cleanup();exit(EXIT_FAILURE);}// fill above input host vectors with arbitary but hard-coded data// fillFloatArrayWithRandomNumbers(hostInput1,iNumberOfArrayElements);// fillFloatArrayWithRandomNumbers(hostInput2,iNumberOfArrayElements);FileInit(hostInput1, iNumberOfArrayElements);FileInit(hostInput2, iNumberOfArrayElements);// get OpenCL supporting platform's IDret_ocl=clGetPlatformIDs(1,&oclPlatformID,NULL);if(ret_ocl != CL_SUCCESS){printf("OpenCL Error - clGetDeviceIDs() Failed : %d. Exitting Now ...\n",ret_ocl);cleanup();exit(EXIT_FAILURE);}// get OpenCL supporting GPU device's IDret_ocl=clGetDeviceIDs(oclPlatformID,CL_DEVICE_TYPE_GPU,1,&oclComputeDeviceID,NULL);if(ret_ocl != CL_SUCCESS){printf("OpenCL Error - clGetDeviceIDs() Failed : %d. Exitting Now ...\n",ret_ocl);cleanup();exit(EXIT_FAILURE);}char gpu_name[255];clGetDeviceInfo(oclComputeDeviceID,CL_DEVICE_NAME,sizeof(gpu_name),&gpu_name,NULL);printf("%s\n",gpu_name);// create OpenCL compute contextoclContext=clCreateContext(NULL,1,&oclComputeDeviceID,NULL,NULL,&ret_ocl);if(ret_ocl!=CL_SUCCESS){printf("OpenCL Error - clCreateContext() Failed : %d. Exitting Now ...\n",ret_ocl);cleanup();exit(EXIT_FAILURE);}// create command queueoclCommandQueue=clCreateCommandQueue(oclContext,oclComputeDeviceID,0,&ret_ocl);if(ret_ocl!=CL_SUCCESS){printf("OpenCL Error - clCreateCommandQueue() Failed : %d. Exitting Now ...\n",ret_ocl);cleanup();exit(EXIT_FAILURE);}// create OpenCL program from .cloclSourceCode=loadOclProgramSource("VecAddenw.cl","",&sizeKernelCodeLength);cl_int status=0;oclProgram = clCreateProgramWithSource(oclContext, 1, (const char **)&oclSourceCode, &sizeKernelCodeLength, &ret_ocl);if(ret_ocl!=CL_SUCCESS){printf("OpenCL Error - clCreateProgramWithSource() Failed : %d. Exitting Now ...\n",ret_ocl);cleanup();exit(0);}// build OpenCL programret_ocl=clBuildProgram(oclProgram,0,NULL,NULL,NULL,NULL);if(ret_ocl!=CL_SUCCESS){printf("OpenCL Error - clBuildProgram() Failed : %d. Exitting Now ...\n",ret_ocl);size_t len;char buffer[2048];clGetProgramBuildInfo(oclProgram,oclComputeDeviceID,CL_PROGRAM_BUILD_LOG,sizeof(buffer),buffer,&len);printf("OpenCL Program Build Log : %s\n",buffer);cleanup();exit(EXIT_FAILURE);}// create OpenCL kernel by passing kernel function name that we used in .cl fileoclKernel=clCreateKernel(oclProgram,"vecAdd",&ret_ocl);if(ret_ocl!=CL_SUCCESS){printf("OpenCL Error - clCreateKernel() Failed : %d. Exitting Now ...\n",ret_ocl);cleanup();exit(EXIT_FAILURE);}int size=iNumberOfArrayElements * sizeof(cl_float);// allocate device-memorydeviceInput1=clCreateBuffer(oclContext,CL_MEM_READ_ONLY,size,NULL,&ret_ocl);if(ret_ocl!=CL_SUCCESS){printf("OpenCL Error - clCreateBuffer() Failed For 1st Input Array : %d. Exitting Now ...\n",ret_ocl);cleanup();exit(EXIT_FAILURE);}deviceInput2=clCreateBuffer(oclContext,CL_MEM_READ_ONLY,size,NULL,&ret_ocl);if(ret_ocl!=CL_SUCCESS){printf("OpenCL Error - clCreateBuffer() Failed For 2nd Input Array : %d. Exitting Now ...\n",ret_ocl);cleanup();exit(EXIT_FAILURE);}deviceOutput=clCreateBuffer(oclContext,CL_MEM_WRITE_ONLY,size,NULL,&ret_ocl);if(ret_ocl!=CL_SUCCESS){printf("OpenCL Error - clCreateBuffer() Failed For 2nd Input Array : %d. Exitting Now ...\n",ret_ocl);cleanup();exit(EXIT_FAILURE);}// set OpenCL kernel arguments. Our OpenCL kernel has 4 arguments 0,1,2,3// set 0 based 0th argument i.e. deviceInput1ret_ocl=clSetKernelArg(oclKernel,0,sizeof(cl_mem),(void *)&deviceInput1); // 'deviceInput1' maps to 'in1' param of kernel function in .cl fileif(ret_ocl != CL_SUCCESS){printf("OpenCL Error - clSetKernelArg() Failed For 1st Argument : %d. Exitting Now ...\n",ret_ocl);cleanup();exit(EXIT_FAILURE);}// set 0 based 1st argument i.e. deviceInput2ret_ocl=clSetKernelArg(oclKernel,1,sizeof(cl_mem),(void *)&deviceInput2); // 'deviceInput2' maps to 'in2' param of kernel function in .cl fileif(ret_ocl != CL_SUCCESS){printf("OpenCL Error - clSetKernelArg() Failed For 2nd Argument : %d. Exitting Now ...\n",ret_ocl);cleanup();exit(EXIT_FAILURE);}// set 0 based 2nd argument i.e. deviceOutputret_ocl=clSetKernelArg(oclKernel,2,sizeof(cl_mem),(void *)&deviceOutput); // 'deviceOutput' maps to 'out' param of kernel function in .cl fileif(ret_ocl != CL_SUCCESS){printf("OpenCL Error - clSetKernelArg() Failed For 3rd Argument : %d. Exitting Now ...\n",ret_ocl);cleanup();exit(EXIT_FAILURE);}// set 0 based 3rd argument i.e. lenret_ocl=clSetKernelArg(oclKernel,3,sizeof(cl_int),(void *)&iNumberOfArrayElements); // 'iNumberOfArrayElements' maps to 'len' param of kernel function in .cl fileif(ret_ocl != CL_SUCCESS){printf("OpenCL Error - clSetKernelArg() Failed For 4th Argument : %d. Exitting Now ...\n",ret_ocl);cleanup();exit(EXIT_FAILURE);}// write abve 'input' device buffer to device memoryret_ocl=clEnqueueWriteBuffer(oclCommandQueue,deviceInput1,CL_FALSE,0,size,hostInput1,0,NULL,NULL);if(ret_ocl != CL_SUCCESS){printf("OpenCL Error - clEnqueueWriteBuffer() Failed For 1st Input Device Buffer : %d. Exitting Now ...\n",ret_ocl);cleanup();exit(EXIT_FAILURE);}ret_ocl=clEnqueueWriteBuffer(oclCommandQueue,deviceInput2,CL_FALSE,0,size,hostInput2,0,NULL,NULL);if(ret_ocl != CL_SUCCESS){printf("OpenCL Error - clEnqueueWriteBuffer() Failed For 2nd Input Device Buffer : %d. Exitting Now ...\n",ret_ocl);cleanup();exit(EXIT_FAILURE);}// run the kernelglobalWorkSize=roundGlobalSizeToNearestMultipleOfLocalSize(localWorkSize, iNumberOfArrayElements);// start timerStopWatchInterface *timer = NULL;sdkCreateTimer(&timer);sdkStartTimer(&timer);ret_ocl=clEnqueueNDRangeKernel(oclCommandQueue,oclKernel,1,NULL,&globalWorkSize,&localWorkSize,0,NULL,NULL);if(ret_ocl != CL_SUCCESS){printf("OpenCL Error - clEnqueueNDRangeKernel() Failed : %d. Exitting Now ...\n",ret_ocl);cleanup();exit(EXIT_FAILURE);}// finish OpenCL command queueclFinish(oclCommandQueue);// stop timersdkStopTimer(&timer);timeOnGPU = sdkGetTimerValue(&timer);sdkDeleteTimer(&timer);// read back result from the device (i.e from deviceOutput) into cpu variable (i.e hostOutput)ret_ocl=clEnqueueReadBuffer(oclCommandQueue,deviceOutput,CL_TRUE,0,size,hostOutput,0,NULL,NULL);if(ret_ocl != CL_SUCCESS){printf("OpenCL Error - clEnqueueReadBuffer() Failed : %d. Exitting Now ...\n",ret_ocl);cleanup();exit(EXIT_FAILURE);}vecAddHost(hostInput1, hostInput2, gold, iNumberOfArrayElements);// compare results for golden-hostconst float epsilon = 0.000001f;bool bAccuracy=true;int breakValue=0;int i;for(i=0;i<iNumberOfArrayElements;i++){/*float val1 = gold[i];float val2 = hostOutput[i];if(fabs(val1-val2) > epsilon){bAccuracy = false;breakValue=i;break;}*///std::cout << "HostOutPut:" << hostOutput[i] << std::endl;}if(bAccuracy==false){printf("Break Value = %d\n",breakValue);}char str[125];if(bAccuracy==true)sprintf(str,"%s","Comparison Of Output Arrays On CPU And GPU Are Accurate Within The Limit Of 0.000001");elsesprintf(str,"%s","Not All Comparison Of Output Arrays On CPU And GPU Are Accurate Within The Limit Of 0.000001");printf("1st Array Is From 0th Element %.6f To %dth Element %.6f\n",hostInput1[0], iNumberOfArrayElements-1, hostInput1[iNumberOfArrayElements-1]);printf("2nd Array Is From 0th Element %.6f To %dth Element %.6f\n",hostInput2[0], iNumberOfArrayElements-1, hostInput2[iNumberOfArrayElements-1]);printf("Global Work Size = %u And Local Work Size Size = %u\n",(unsigned int)globalWorkSize, (unsigned int)localWorkSize);printf("Sum Of Each Element From Above 2 Arrays Creates 3rd Array As :\n");printf("3nd Array Is From 0th Element %.6f To %dth Element %.6f\n",hostOutput[0], iNumberOfArrayElements-1, hostOutput[iNumberOfArrayElements-1]);printf("The Time Taken To Do Above Addition On CPU = %.6f (ms)\n",timeOnCPU);printf("The Time Taken To Do Above Addition On GPU = %.6f (ms)\n",timeOnGPU);printf("%s\n",str);// total cleanupcleanup();system("pause");return(0); }void cleanup(void) {// code// OpenCL cleanupif(oclSourceCode){free((void *)oclSourceCode);oclSourceCode=NULL;}if(oclKernel){clReleaseKernel(oclKernel);oclKernel=NULL;}if(oclProgram){clReleaseProgram(oclProgram);oclProgram=NULL;}if(oclCommandQueue){clReleaseCommandQueue(oclCommandQueue);oclCommandQueue=NULL;}if(oclContext){clReleaseContext(oclContext);oclContext=NULL;}// free allocated device-memoryif(deviceInput1){clReleaseMemObject(deviceInput1);deviceInput1=NULL;}if(deviceInput2){clReleaseMemObject(deviceInput2);deviceInput2=NULL;}if(deviceOutput){clReleaseMemObject(deviceOutput);deviceOutput=NULL;}// free allocated host-memoryif(hostInput1){free(hostInput1);hostInput1=NULL;}if(hostInput2){free(hostInput2);hostInput2=NULL;}if(hostOutput){free(hostOutput);hostOutput=NULL;}if(gold){free(gold);gold=NULL;} }void fillFloatArrayWithRandomNumbers(float *pFloatArray, int iSize) {// codeint i;const float fScale = 1.0f / (float)RAND_MAX;for (i = 0; i < iSize; ++i){pFloatArray[i] = fScale * rand();} }void FileInit(float *p, int N) {for (int i = 0; i < N; i++){p[i] = i;} }size_t roundGlobalSizeToNearestMultipleOfLocalSize(int local_size, unsigned int global_size) {// codeunsigned int r = global_size % local_size;if(r == 0){return(global_size);}else{return(global_size + local_size - r);} }// "Golden" Host processing vector addition function for comparison purposes void vecAddHost(const float* pFloatData1, const float* pFloatData2, float* pFloatResult, int iNumElements) {int i;StopWatchInterface *timer = NULL;sdkCreateTimer(&timer);sdkStartTimer(&timer);for (i = 0; i < iNumElements; i++){pFloatResult[i] = pFloatData1[i] + pFloatData2[i];}sdkStopTimer(&timer);timeOnCPU = sdkGetTimerValue(&timer);sdkDeleteTimer(&timer); }char* loadOclProgramSource(const char *filename, const char *preamble, size_t *sizeFinalLength) {// localsFILE *pFile=NULL;size_t sizeSourceLength;pFile=fopen(filename,"rb"); // binary readif(pFile==NULL)return(NULL);size_t sizePreambleLength=(size_t)strlen(preamble);// get the length of the source codefseek(pFile,0,SEEK_END);sizeSourceLength=ftell(pFile);fseek(pFile,0,SEEK_SET); // reset to beginning// allocate a buffer for the source code string and read it inchar *sourceString=(char *)malloc(sizeSourceLength+sizePreambleLength+1);memcpy(sourceString, preamble, sizePreambleLength);if(fread((sourceString)+sizePreambleLength,sizeSourceLength,1,pFile)!=1){fclose(pFile);free(sourceString);return(0);}// close the file and return the total length of the combined (preamble + source) stringfclose(pFile);if(sizeFinalLength != 0){*sizeFinalLength = sizeSourceLength + sizePreambleLength;}sourceString[sizeSourceLength + sizePreambleLength]='\0';return(sourceString); }由于工作任務緊張,寫的有點倉促,難免有錯誤之處,望大家指正,相互學習。
?
總結
以上是生活随笔為你收集整理的OpenCL简单入门的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 实现简单的shell sed替换功能
- 下一篇: (转)GDB 使用方法