我们使用矢量加操作作为第一个OpenCL程序,进入OpenCL编程讲解。
内核程序vecAdd.cl内容:
/*
 * Element-wise vector addition: result[i] = a[i] + b[i].
 *
 * Each work-item processes the single element selected by its global ID in
 * dimension 0. There is no bounds check, so the global work size must not
 * exceed the number of elements in the three buffers.
 */
__kernel void vector_add(__global const float *a,
                         __global const float *b,
                         __global float *result)
{
    int gid = get_global_id(0);
    result[gid] = a[gid] + b[gid];
}
主机端代码:
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
/*
 * Number of float elements in each input vector and in the result vector.
 * Declared as an enum constant because, in C, a file-scope `const int` is not
 * an integer constant expression; arrays sized by it would be variable-length
 * arrays instead of ordinary fixed-size arrays.
 */
enum { ARRAY_SIZE = 1000 };
/*
 * Read an entire kernel source file into a newly allocated, NUL-terminated
 * buffer.
 *
 * filename: path of the file to read (opened in binary mode).
 * length:   optional out-parameter; when non-NULL it receives the number of
 *           bytes read, not counting the terminating NUL.
 *
 * Returns the buffer on success; the caller owns it and must free() it.
 * Returns NULL, after printing a diagnostic, if the file cannot be opened,
 * measured or fully read, or if the buffer cannot be allocated. An empty file
 * is valid and yields an empty string.
 */
char *ReadKernelSourceFile(const char *filename, size_t *length)
{
    FILE *file = fopen(filename, "rb");
    if (file == NULL)
    {
        printf("%s at %d :Can't open %s\n", __FILE__, __LINE__, filename);
        return NULL;
    }

    /* Determine the file size by seeking to the end and back. */
    long fileSize = -1;
    if (fseek(file, 0, SEEK_END) == 0)
    {
        fileSize = ftell(file);
    }
    if (fileSize < 0 || fseek(file, 0, SEEK_SET) != 0)
    {
        printf("%s at %d : Can't read source %s\n", __FILE__, __LINE__,
               filename);
        fclose(file);
        return NULL;
    }

    size_t sourceLength = (size_t)fileSize;
    char *sourceString = (char *)malloc(sourceLength + 1); /* +1 for the NUL */
    if (sourceString == NULL)
    {
        printf("%s at %d : Out of memory reading %s\n", __FILE__, __LINE__,
               filename);
        fclose(file);
        return NULL;
    }

    /* Read byte-wise so a short read is detected and an empty file succeeds. */
    if (fread(sourceString, 1, sourceLength, file) != sourceLength)
    {
        printf("%s at %d : Can't read source %s\n", __FILE__, __LINE__,
               filename);
        free(sourceString);
        fclose(file);
        return NULL;
    }
    fclose(file);

    sourceString[sourceLength] = '\0';
    if (length != NULL)
    {
        *length = sourceLength;
    }
    return sourceString;
}
/*
 * Create an OpenCL context:
 *   1. query the first available platform,
 *   2. select one device on it (a GPU if present, otherwise a CPU),
 *   3. create a context for that device.
 *
 * device: out-parameter that receives the selected device ID.
 *
 * Returns the new context, or NULL after printing a diagnostic if no
 * platform or device is found or the context cannot be created.
 */
cl_context CreateContext(cl_device_id *device)
{
    cl_int errNum;
    cl_uint numPlatforms = 0;
    cl_platform_id firstPlatformId;
    cl_context context = NULL;

    errNum = clGetPlatformIDs(1, &firstPlatformId, &numPlatforms);
    if (errNum != CL_SUCCESS || numPlatforms == 0)
    {
        printf( "Failed to find any OpenCL platforms." );
        return NULL;
    }

    /* Prefer a GPU; fall back to a CPU device on the same platform. */
    errNum = clGetDeviceIDs(firstPlatformId, CL_DEVICE_TYPE_GPU, 1,
                            device, NULL);
    if (errNum != CL_SUCCESS)
    {
        printf( "There is no GPU, trying CPU..." );
        errNum = clGetDeviceIDs(firstPlatformId,
                                CL_DEVICE_TYPE_CPU, 1, device, NULL);
    }
    if (errNum != CL_SUCCESS)
    {
        printf( "There is NO GPU or CPU" );
        return NULL;
    }

    /* clCreateContext takes a pointer to the device list, not a device handle. */
    context = clCreateContext(NULL, 1, device, NULL, NULL, &errNum);
    if (errNum != CL_SUCCESS)
    {
        printf( " create context error\n" );
        return NULL;
    }
    return context;
}
/*
 * Create a command queue on the given device within the given context.
 *
 * context: context the queue belongs to.
 * device:  device the queued commands will run on.
 *
 * Returns the new queue, or NULL after printing a diagnostic on failure.
 */
cl_command_queue CreateCommandQueue(cl_context context,
                                    cl_device_id device)
{
    cl_int errNum = CL_SUCCESS;
    cl_command_queue commandQueue = NULL;

    /* Properties 0: default queue settings; errNum receives the error code. */
    commandQueue = clCreateCommandQueue(context, device, 0, &errNum);
    if (errNum != CL_SUCCESS || commandQueue == NULL)
    {
        printf("Failed to create commandQueue for device 0");
        return NULL;
    }
    return commandQueue;
}
/*
 * Read kernel source from a file, create an OpenCL program object from it
 * and build the program.
 *
 * context:  context the program is created in.
 * device:   device whose build log is printed when the build fails.
 * fileName: path of the kernel source file.
 *
 * Returns the built program, or NULL after printing a diagnostic if the
 * source cannot be read, the program cannot be created, or the build fails.
 */
cl_program CreateProgram(cl_context context,
                         cl_device_id device,
                         const char *fileName)
{
    cl_int errNum;
    cl_program program;
    size_t program_length = 0;
    char *source = ReadKernelSourceFile(fileName, &program_length);

    if (source == NULL)
    {
        printf("Failed to create CL program from source." );
        return NULL;
    }

    const char *sources[1] = { source };
    program = clCreateProgramWithSource(context, 1, sources,
                                        &program_length, &errNum);
    /* The source text is loaded into the program object, so the host copy
       is no longer needed whether or not creation succeeded. */
    free(source);
    if (errNum != CL_SUCCESS || program == NULL)
    {
        printf("Failed to create CL program from source." );
        return NULL;
    }

    /* Build for all devices associated with the program, default options. */
    errNum = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (errNum != CL_SUCCESS)
    {
        char buildLog[16384];
        buildLog[0] = '\0'; /* stays printable if the log query itself fails */
        clGetProgramBuildInfo(program, device,
                              CL_PROGRAM_BUILD_LOG,
                              sizeof(buildLog),
                              buildLog, NULL);
        printf("Error in kernel:%s ", buildLog);
        clReleaseProgram(program);
        return NULL;
    }
    return program;
}
/*
 * Create the three device buffers used by the vector-add kernel.
 *
 * context:    context the buffers are created in.
 * memObjects: receives the buffers; [0] and [1] are read-only copies of the
 *             host arrays a and b, [2] is a read-write buffer for the result.
 * a, b:       host input arrays, each holding ARRAY_SIZE floats.
 *
 * Returns true when all three buffers were created; otherwise prints a
 * diagnostic and returns false. Buffers that were created are left in
 * memObjects in that case.
 */
bool CreateMemObjects(cl_context context, cl_mem memObjects[3],
                      float * a, float * b)
{
    const size_t bufferBytes = sizeof(float) * ARRAY_SIZE;

    /* Inputs: initialised from host memory at creation time. */
    memObjects[0] = clCreateBuffer(context,
                                   CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                   bufferBytes, a, NULL);
    memObjects[1] = clCreateBuffer(context,
                                   CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                   bufferBytes, b, NULL);
    /* Output: no host pointer supplied. */
    memObjects[2] = clCreateBuffer(context, CL_MEM_READ_WRITE,
                                   bufferBytes, NULL, NULL);

    if (memObjects[0] != NULL && memObjects[1] != NULL
        && memObjects[2] != NULL)
    {
        return true;
    }

    printf("Error creating memory objects." );
    return false;
}
/*
 * Release every OpenCL resource that was successfully created.
 *
 * Each handle is released only when it is non-zero, so this can be called
 * from any point in main() where some of the objects do not exist yet.
 * memObjects must have exactly three entries.
 */
void Cleanup(cl_context context, cl_command_queue commandQueue,
             cl_program program, cl_kernel kernel,
             cl_mem memObjects[3])
{
    for (int i = 0; i < 3; i++)
    {
        if (memObjects[i] != 0)
            clReleaseMemObject(memObjects[i]);
    }
    if (commandQueue != 0)
        clReleaseCommandQueue(commandQueue);
    if (kernel != 0)
        clReleaseKernel(kernel);
    if (program != 0)
        clReleaseProgram(program);
    if (context != 0)
        clReleaseContext(context);
}
/*
 * Host entry point: adds two ARRAY_SIZE-element float vectors on an OpenCL
 * device and prints every element of the result.
 *
 * Returns 0 on success and 1 if any setup, enqueue or read-back step fails.
 * All OpenCL objects created so far are released before returning, except
 * when context creation itself fails (nothing exists to release then).
 */
int main(int argc, char **argv)
{
    cl_context context = 0;
    cl_command_queue commandQueue = 0;
    cl_program program = 0;
    cl_device_id device = 0;
    cl_kernel kernel = 0;
    cl_mem memObjects[3] = { 0, 0, 0 };
    cl_int errNum;

    (void)argc; /* command-line arguments are not used */
    (void)argv;

    /* Create the OpenCL context; this also selects the device. */
    context = CreateContext(&device);
    if (context == NULL)
    {
        printf("Failed to create OpenCL context." );
        return 1;
    }

    /* Create a command queue on the selected device.
       The device handle is passed by value, not by address. */
    commandQueue = CreateCommandQueue(context, device);
    if (commandQueue == NULL)
    {
        Cleanup(context, commandQueue, program, kernel,
                memObjects);
        return 1;
    }

    /* Create and build the OpenCL program from the kernel source file. */
    program = CreateProgram(context, device, "vecAdd.cl");
    if (program == NULL)
    {
        Cleanup(context, commandQueue, program,
                kernel, memObjects);
        return 1;
    }

    /* Create the kernel object for the vector_add kernel function. */
    kernel = clCreateKernel(program, "vector_add", NULL);
    if (kernel == NULL)
    {
        printf( "Failed to create kernel");
        Cleanup(context, commandQueue, program,
                kernel, memObjects);
        return 1;
    }

    /* Prepare host data and create the device memory objects. */
    float result[ARRAY_SIZE];
    float a[ARRAY_SIZE];
    float b[ARRAY_SIZE];
    for (int i = 0; i < ARRAY_SIZE; i++)
    {
        a[i] = (float)i;
        b[i] = (float)(i * 2);
    }
    if (!CreateMemObjects(context, memObjects, a, b))
    {
        Cleanup(context, commandQueue, program,
                kernel, memObjects);
        return 1;
    }

    /* Bind the buffers to kernel arguments: a, b, result. */
    errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]);
    errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem),
                             &memObjects[1]);
    errNum |= clSetKernelArg(kernel, 2, sizeof(cl_mem),
                             &memObjects[2]);
    if (errNum != CL_SUCCESS)
    {
        printf("Error setting kernel arguments.");
        Cleanup(context, commandQueue, program,
                kernel, memObjects);
        return 1;
    }

    /* One work-item per element, in work-groups of a single work-item. */
    size_t globalWorkSize[1] = { ARRAY_SIZE };
    size_t localWorkSize[1] = { 1 };

    /* Enqueue the kernel for execution. */
    errNum = clEnqueueNDRangeKernel(commandQueue, kernel,
                                    1, NULL,
                                    globalWorkSize,
                                    localWorkSize, 0, NULL,
                                    NULL);
    if (errNum != CL_SUCCESS)
    {
        printf( "Error queuing kernel for execution." );
        Cleanup(context, commandQueue, program, kernel,
                memObjects);
        return 1;
    }

    /* Copy the result back to the host; CL_TRUE makes the read blocking. */
    errNum = clEnqueueReadBuffer(commandQueue, memObjects[2],
                                 CL_TRUE, 0,
                                 ARRAY_SIZE * sizeof(float),
                                 result, 0, NULL, NULL);
    if (errNum != CL_SUCCESS)
    {
        printf( "Error reading result buffer." );
        Cleanup(context, commandQueue, program, kernel,
                memObjects);
        return 1;
    }

    for (int i = 0; i < ARRAY_SIZE; i++)
    {
        printf("i=%d:%f\n", i, result[i]);
    }
    printf("Executed program succesfully." );
    Cleanup(context, commandQueue, program, kernel, memObjects);
    return 0;
}
我们以向量加法作为OpenCL编程的第一个例子。从例子中可以看出OpenCL编程的标准开发流程。
支持OpenCL的平台和设备很多,为了兼顾不同设备,OpenCL程序的第一步就是确定OpenCL执行的平台,在确定平台之后再确定执行OpenCL计算的设备。确定设备后创建上下文,上下文中包含上一步查询的OpenCL计算的设备(允许包含多个计算设备对象),以及接下来创建的内核、程序对象和内存对象。
创建上下文以后,需要创建命令队列。一个OpenCL设备可以对应多个命令队列,但一个命令队列只能对应一个OpenCL设备。例如,如果需要使用上下文中包含的两个计算设备,就要为每个设备创建各自的命令队列。主机与OpenCL设备间的数据传输、内核执行等交互操作都是通过把命令入队到命令队列中完成的,命令队列中的各个命令再交给OpenCL驱动或相应的硬件单元去执行。
由于在运行时才知道OpenCL设备信息,所以在OpenCL主机端程序中读取内核源码并创建程序对象,根据OpenCL设备编译、构建程序对象,最后创建内核对象。通过这三步操作,把OpenCL设备上执行的内核代码编译完成。
对于在OpenCL设备上执行的内核函数需要输入参数,在主机端调用设置内核参数函数。除此之外,还需要设置在设备上用于执行的工作组和工作项的参数。
当上述操作都完成以后,我们就可以把内核函数入队到命令队列中,命令队列提交给相应的设备执行。
OpenCL设备执行完计算后,把数据拷贝回主机端,并销毁分配的资源。
上述整个流程,如下图所示。
通过向量加法的例子,相信读者对于OpenCL开发有了一定的了解。