OpenCL(一)——Hello World
过了一个十一长假,算是舒服了一下,又要给自己充电了。自己最近把 OpenCL 的东西了解了一下,贴出程序和大家分享。
对于openCL的安装和配置,网上已经很多了,我也有转载。
OpenCL 的初始化配置比较繁琐,不过 AMD 的 demo 里提供了一个初始化模板,包含一个头文件和一个 CPP 文件。
一、头文件 Template.h:包含了函数声明和一些全局变量的声明。这里不做说明,将在 cpp 中介绍。
#ifndef TEMPLATE_H_#define TEMPLATE_H_#include <CL/cl.h>#include <string.h>#include <cstdlib>#include <iostream>#include <string>#include <fstream>// GLOBALS #define SDK_SUCCESS 0#define SDK_FAILURE 1/* * Input data is stored here. */cl_uint *input;/* * Output data is stored here. */cl_uint *output;/* * Multiplier is stored in this variable */cl_uint multiplier;/* problem size for 1D algorithm and width of problem size for 2D algorithm */cl_uint width;/* The memory buffer that is used as input/output for OpenCL kernel */cl_mem inputBuffer;cl_mem outputBuffer;cl_context context;cl_device_id *devices;cl_command_queue commandQueue;cl_program program;/* This program uses only one kernel and this serves as a handle to it */cl_kernel kernel;// FUNCTION DECLARATIONS /* * OpenCL related initialisations are done here. * Context, Device list, Command Queue are set up. * Calls are made to set up OpenCL memory buffers that this program uses * and to load the programs into memory and get kernel handles. */int initializeCL(void);/* * Convert char* to string */std::string convertToString(const char * filename);/* * This is called once the OpenCL context, memory etc. are set up, * the program is loaded into memory and the kernel handles are ready. * * It sets the values for kernels' arguments and enqueues calls to the kernels * on to the command queue and waits till the calls have finished execution. * * It also gets kernel start and end time if profiling is enabled. * @return returns SDK_SUCCESS on success and SDK_FAILURE otherwise */int runCLKernels(void);/** * Releases OpenCL resources (Context, Memory etc.) * @return returns SDK_SUCCESS on success and SDK_FAILURE otherwise */int cleanupCL(void);/** * Releases program's resources * @return returns SDK_SUCCESS on success and SDK_FAILURE otherwise */void cleanupHost(void);/* * Prints no more than 256 elements of the given array. * Prints full array if length is less than 256. * * Prints Array name followed by elements. 
*/void print1DArray( const std::string arrayName, const unsigned int * arrayData, const unsigned int length);#endif /* #ifndef TEMPLATE_H_ */
二、Template.cpp 文件:包括了函数实现和 main 调用
首先,在 OpenCL 中,大致的流程如下:
1.把openCl装置初始化
2.在 OpenCL 装置上配置两块记忆体(输入和输出),以存放数据
3.将数据复制到 OpenCL 装置上
4.编译要执行的程序
5.执行编译好的程序
6.将计算结果从openCL装置上复制到内存,显示或存入文件
下面将在程序中一步一步实现上面的流程
/* ============================================================#include "Template.h"/* * \brief Host Initialization * Allocate and initialize memory * on the host. Print input array. */intinitializeHost(void)//初始化需要处理的数据和将要存储的空间,即生成两个数组:input,output{ width = 256; input = NULL; output = NULL; multiplier = 2; ///////////////////////////////////////////////////////////////// // Allocate and initialize memory used by host ///////////////////////////////////////////////////////////////// cl_uint sizeInBytes = width * sizeof(cl_uint); input = (cl_uint *) malloc(sizeInBytes); if(!input) { std::cout << "Error: Failed to allocate input memory on host\n"; return SDK_FAILURE; } output = (cl_uint *) malloc(sizeInBytes); if(!output) { std::cout << "Error: Failed to allocate input memory on host\n"; return SDK_FAILURE; } for(cl_uint i = 0; i < width; i++) input[i] = i; // print input array print1DArray(std::string("Input").c_str(), input, width); return SDK_SUCCESS;}/* * Converts the contents of a file into a string */std::stringconvertToString(const char *filename)//将kernel源码,即自己写的并行化的函数,转化成字符串{ size_t size; char* str; std::string s; std::fstream f(filename, (std::fstream::in | std::fstream::binary)); if(f.is_open()) { size_t fileSize; f.seekg(0, std::fstream::end); size = fileSize = (size_t)f.tellg(); f.seekg(0, std::fstream::beg); str = new char[size+1]; if(!str) { f.close(); std::cout << "Memory allocation failed"; return NULL; } f.read(str, fileSize); f.close(); str[size] = '\0'; s = str; delete[] str; return s; } else { std::cout << "\nFile containg the kernel code(\".cl\") not found. 
Please copy the required file in the folder containg the executable.\n"; exit(1); } return NULL;}/* * \brief OpenCL related initialization * Create Context, Device list, Command Queue * Create OpenCL memory buffer objects * Load CL file, compile, link CL source * Build program and kernel objects */intinitializeCL(void)//openCL装置初始化{ cl_int status = 0; size_t deviceListSize; //////////////////////////////////////////////////////////////////// // STEP 1 Getting Platform.获得系统山所有可支持openCL的装置,我的电脑室AMD显卡 //////////////////////////////////////////////////////////////////// /* * Have a look at the available platforms and pick either * the AMD one if available or a reasonable default. */ cl_uint numPlatforms;//平台数目 cl_platform_id platform = NULL;//平台ID status = clGetPlatformIDs(0, NULL, &numPlatforms);//获取平台数目 if(status != CL_SUCCESS) { std::cout << "Error: Getting Platforms. (clGetPlatformsIDs)\n"; return SDK_FAILURE; } if(numPlatforms > 0) { cl_platform_id* platforms = new cl_platform_id[numPlatforms]; status = clGetPlatformIDs(numPlatforms, platforms, NULL);//测试是否能获得平台信息 if(status != CL_SUCCESS) { std::cout << "Error: Getting Platform Ids. (clGetPlatformsIDs)\n"; return SDK_FAILURE; } for(unsigned int i=0; i < numPlatforms; ++i)//获得每个平台的信息 { char pbuff[100]; status = clGetPlatformInfo( platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuff), pbuff, NULL); if(status != CL_SUCCESS) { std::cout << "Error: Getting Platform Info.(clGetPlatformInfo)\n"; return SDK_FAILURE; } platform = platforms[i]; if(!strcmp(pbuff, "Advanced Micro Devices, Inc.")) { break; } } delete platforms; } if(NULL == platform) { std::cout << "NULL platform found so Exiting Application." << std::endl; return SDK_FAILURE; } //////////////////////////////////////////////////////////////////// // STEP 2 Creating context using the platform selected // Context created from type includes all available // devices of the specified type from the selected platform // 从指定的装置类别中,建立一个openCL context. 
//////////////////////////////////////////////////////////////////// /* * If we could find our platform, use it. Otherwise use just available platform. */ cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 };///////////////////////////////////////////////////////////////////////////////函数参数介绍//第一个参数:cl_context_properties,包含三个数据的数组:property种类(这里 // CL_CONTEXT_PLATFORM为使用平台ID,第二个是平台ID,)//第二个参数:指定要使用的装置的类别: /* CL_DEVICE_TYPE_CPU:使用CPU装置 CL_DEVICE_TYPE_GPU:使用显示晶片装置 CL_DEVICE_TYPE_ACCELERATOR:特定的OPENCL加速装置,例如CELL CL_DEVICE_TYPE_DEFAULT:系统预定的openCL装置 CL_DEVICE_TYPE_ALL:所有系统统计中的openCL装置 *///其他参数不知道了。。。。 context = clCreateContextFromType(cps, CL_DEVICE_TYPE_CPU, NULL, NULL, &status); if(status != CL_SUCCESS) { std::cout << "Error: Creating Context. (clCreateContextFromType)\n"; return SDK_FAILURE; } //////////////////////////////////////////////////////////////////// // STEP 3 // 3.1 Query context for the device list size, // 3.2 Allocate that much memory using malloc or new // 3.3 Again query context info to get the array of device // available in the created context// 3.1获得装置列表大小// 3.2开辟空间为列表大小// 3.3获得所有装置的列表 //////////////////////////////////////////////////////////////////// // First, get the size of device list data获得装置的列表的大小 status = clGetContextInfo(context, CL_CONTEXT_DEVICES, //表示要取得的装置的列表 0, NULL, &deviceListSize); if(status != CL_SUCCESS) { std::cout << "Error: Getting Context Info \ (device list size, clGetContextInfo)\n"; return SDK_FAILURE; } devices = (cl_device_id *)malloc(deviceListSize); if(devices == 0) { std::cout << "Error: No devices found.\n"; return SDK_FAILURE; } // Now, get the device list data获得装置的列表信息 status = clGetContextInfo( context, CL_CONTEXT_DEVICES, deviceListSize, devices, NULL); if(status != CL_SUCCESS) { std::cout << "Error: Getting Context Info \ (device list, clGetContextInfo)\n"; return SDK_FAILURE; } //////////////////////////////////////////////////////////////////// // STEP 4 Creating command 
queue for a single device // Each device in the context can have a // dedicated commandqueue object for itself// 建立Command Queue命令队列,接受对openCL的各种操作 //////////////////////////////////////////////////////////////////// commandQueue = clCreateCommandQueue( context, //之前获得装置的列表信息 devices[0], //自己选择的装置 0, &status); if(status != CL_SUCCESS) { std::cout << "Creating Command Queue. (clCreateCommandQueue)\n"; return SDK_FAILURE; } ///////////////////////////////////////////////////////////////// // STEP 5 Creating cl_buffer objects from host buffer // These buffer objects can be passed to the kernel // as kernel arguments// 配置记忆体并复制资料到装置 ///////////////////////////////////////////////////////////////////分别创建两个记忆体,输入和输出 inputBuffer = clCreateBuffer( context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(cl_uint) * width, input, &status); if(status != CL_SUCCESS) { std::cout << "Error: clCreateBuffer (inputBuffer)\n"; return SDK_FAILURE; } outputBuffer = clCreateBuffer( context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(cl_uint) * width, output, &status); if(status != CL_SUCCESS) { std::cout << "Error: clCreateBuffer (outputBuffer)\n"; return SDK_FAILURE; } ///////////////////////////////////////////////////////////////// // STEP 6. 
Building Kernel // 6.1 Load CL file, using basic file i/o // 6.2 Build CL program object // 6.3 Create CL kernel object// 6.1导入CL文件,即自己写的并行化的函数// 6.2建立程序// 6.3编译程序 ///////////////////////////////////////////////////////////////// const char * filename = "Template_Kernels.cl"; std::string sourceStr = convertToString(filename); const char * source = sourceStr.c_str(); size_t sourceSize[] = { strlen(source) };//直接将CL文件读到记忆体 program = clCreateProgramWithSource( context, 1, &source, sourceSize, &status); if(status != CL_SUCCESS) { std::cout << "Error: Loading Binary into cl_program \ (clCreateProgramWithBinary)\n"; return SDK_FAILURE; } // create a cl program executable for all the devices specified status = clBuildProgram(program, 1, devices, NULL, NULL, NULL); if(status != CL_SUCCESS) { std::cout << "Error: Building Program (clBuildProgram)\n"; return SDK_FAILURE; } // get a kernel object handle for a kernel with the given name kernel = clCreateKernel(program, "templateKernel", &status); if(status != CL_SUCCESS) { std::cout << "Error: Creating Kernel from program. (clCreateKernel)\n"; return SDK_FAILURE; } return SDK_SUCCESS;}/* * \brief Run OpenCL program * * Bind host variables to kernel arguments * Run the CL kernel */int runCLKernels(void){ cl_int status; cl_uint maxDims; cl_event events[2]; size_t globalThreads[1]; size_t localThreads[1]; size_t maxWorkGroupSize; size_t maxWorkItemSizes[3]; //////////////////////////////////////////////////////////////////// // STEP 7 Analyzing proper workgroup size for the kernel // by querying device information // 7.1 Device Info CL_DEVICE_MAX_WORK_GROUP_SIZE // 7.2 Device Info CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS // 7.3 Device Info CL_DEVICE_MAX_WORK_ITEM_SIZES //////////////////////////////////////////////////////////////////// /** * Query device capabilities. 
Maximum * work item dimensions and the maximmum * work item sizes */ status = clGetDeviceInfo( devices[0], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), (void*)&maxWorkGroupSize, NULL); if(status != CL_SUCCESS) { std::cout << "Error: Getting Device Info. (clGetDeviceInfo)\n"; return SDK_FAILURE; } status = clGetDeviceInfo( devices[0], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), (void*)&maxDims, NULL); if(status != CL_SUCCESS) { std::cout << "Error: Getting Device Info. (clGetDeviceInfo)\n"; return SDK_FAILURE; } status = clGetDeviceInfo( devices[0], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*maxDims, (void*)maxWorkItemSizes, NULL); if(status != CL_SUCCESS) { std::cout << "Error: Getting Device Info. (clGetDeviceInfo)\n"; return SDK_FAILURE; } globalThreads[0] = width; localThreads[0] = 1; if(localThreads[0] > maxWorkGroupSize || localThreads[0] > maxWorkItemSizes[0]) { std::cout << "Unsupported: Device does not support requested number of work items."; return SDK_FAILURE; } //////////////////////////////////////////////////////////////////// // STEP 8 Set appropriate arguments to the kernel // 8.1 Kernel Arg outputBuffer ( cl_mem object) // 8.2 Kernel Arg inputBuffer (cl_mem object) // 8.3 Kernel Arg multiplier (cl_uint) //////////////////////////////////////////////////////////////////// // the output array to the kernel status = clSetKernelArg( kernel, 0, sizeof(cl_mem), (void *)&outputBuffer); if(status != CL_SUCCESS) { std::cout << "Error: Setting kernel argument. (output)\n"; return SDK_FAILURE; } // the input array to the kernel status = clSetKernelArg( kernel, 1, sizeof(cl_mem), (void *)&inputBuffer); if(status != CL_SUCCESS) { std::cout << "Error: Setting kernel argument. (input)\n"; return SDK_FAILURE; } // multiplier status = clSetKernelArg( kernel, 2, sizeof(cl_uint), (void *)&multiplier); if(status != CL_SUCCESS) { std::cout << "Error: Setting kernel argument. 
(multiplier)\n"; return SDK_FAILURE; } //////////////////////////////////////////////////////////////////// // STEP 9 Enqueue a kernel run call. // Wait till the event completes and release the event //////////////////////////////////////////////////////////////////// status = clEnqueueNDRangeKernel( commandQueue, kernel, 1, NULL, globalThreads, localThreads, 0, NULL, &events[0]); if(status != CL_SUCCESS) { std::cout << "Error: Enqueueing kernel onto command queue. \ (clEnqueueNDRangeKernel)\n"; return SDK_FAILURE; } // wait for the kernel call to finish execution status = clWaitForEvents(1, &events[0]); if(status != CL_SUCCESS) { std::cout << "Error: Waiting for kernel run to finish. \ (clWaitForEvents)\n"; return SDK_FAILURE; } status = clReleaseEvent(events[0]); if(status != CL_SUCCESS) { std::cout << "Error: Release event object. \ (clReleaseEvent)\n"; return SDK_FAILURE; } //////////////////////////////////////////////////////////////////// // STEP 10 Enqueue readBuffer to read the output back // Wait for the event and release the event //////////////////////////////////////////////////////////////////// status = clEnqueueReadBuffer( commandQueue, outputBuffer, CL_TRUE, 0, width * sizeof(cl_uint), output, 0, NULL, &events[1]); if(status != CL_SUCCESS) { std::cout << "Error: clEnqueueReadBuffer failed. \ (clEnqueueReadBuffer)\n"; return SDK_FAILURE; } // Wait for the read buffer to finish execution status = clWaitForEvents(1, &events[1]); if(status != CL_SUCCESS) { std::cout << "Error: Waiting for read buffer call to finish. \ (clWaitForEvents)\n"; return SDK_FAILURE; } status = clReleaseEvent(events[1]); if(status != CL_SUCCESS) { std::cout << "Error: Release event object. \ (clReleaseEvent)\n"; return SDK_FAILURE; } return SDK_SUCCESS;}/* * \brief Release OpenCL resources (Context, Memory etc.) 
*/int cleanupCL(void){ cl_int status; //////////////////////////////////////////////////////////////////// // STEP 11 CLean up the opencl resources used //////////////////////////////////////////////////////////////////// status = clReleaseKernel(kernel); if(status != CL_SUCCESS) { std::cout << "Error: In clReleaseKernel \n"; return SDK_FAILURE; } status = clReleaseProgram(program); if(status != CL_SUCCESS) { std::cout << "Error: In clReleaseProgram\n"; return SDK_FAILURE; } status = clReleaseMemObject(inputBuffer); if(status != CL_SUCCESS) { std::cout << "Error: In clReleaseMemObject (inputBuffer)\n"; return SDK_FAILURE; } status = clReleaseMemObject(outputBuffer); if(status != CL_SUCCESS) { std::cout << "Error: In clReleaseMemObject (outputBuffer)\n"; return SDK_FAILURE; } status = clReleaseCommandQueue(commandQueue); if(status != CL_SUCCESS) { std::cout << "Error: In clReleaseCommandQueue\n"; return SDK_FAILURE; } status = clReleaseContext(context); if(status != CL_SUCCESS) { std::cout << "Error: In clReleaseContext\n"; return SDK_FAILURE; } return SDK_SUCCESS;}/* * \brief Releases program's resources */voidcleanupHost(void){ if(input != NULL) { free(input); input = NULL; } if(output != NULL) { free(output); output = NULL; } if(devices != NULL) { free(devices); devices = NULL; }}/* * \brief Print no more than 256 elements of the given array. * * Print Array name followed by elements. */void print1DArray( const std::string arrayName, const unsigned int * arrayData, const unsigned int length){ cl_uint i; cl_uint numElementsToPrint = (256 < length) ? 
256 : length; std::cout << std::endl; std::cout << arrayName << ":" << std::endl; for(i = 0; i < numElementsToPrint; ++i) { std::cout << arrayData[i] << " "; } std::cout << std::endl;}void verify(){ bool passed = true; for(unsigned int i = 0; i < width; ++i) if(input[i] * multiplier != output[i]) passed = false; if(passed == true) std::cout << "Passed!\n" << std::endl; else std::cout << "Failed!\n" << std::endl;}int main(int argc, char * argv[]){ // Initialize Host application if(initializeHost() != SDK_SUCCESS) return SDK_FAILURE; // Initialize OpenCL resources if(initializeCL() != SDK_SUCCESS) return SDK_FAILURE; // Run the CL program if(runCLKernels() != SDK_SUCCESS) return SDK_FAILURE; // Print output array print1DArray(std::string("Output"), output, width); // Verify output verify(); // Releases OpenCL resources if(cleanupCL()!= SDK_SUCCESS) return SDK_FAILURE; // Release host resources cleanupHost(); return SDK_SUCCESS;}