OpenCL 第一个程序

我们使用矢量加操作作为第一个OpenCL程序,进入OpenCL编程讲解。

内核程序vecAdd.cl内容:

 /*
  * Element-wise vector addition: result[i] = a[i] + b[i].
  *
  * Each work-item processes the single element at its global id in
  * dimension 0. There is no bounds check, so the host must launch exactly as
  * many work-items as there are elements in the buffers.
  */
 __kernel void vector_add(__global const float *a,
                          __global const float *b,
                          __global float *result)
 {
     size_t gid = get_global_id(0);
     result[gid] = a[gid] + b[gid];
 }

主机端代码:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
/* Number of float elements in each vector. Declared as an enum constant so it
 * is a true integer constant expression in C (a `const int` is not), which
 * keeps arrays sized by it from becoming variable-length arrays. */
enum { ARRAY_SIZE = 1000 };
/**
 * Reads an entire file into a newly allocated, NUL-terminated buffer.
 *
 * @param filename  Path of the file to read; it is opened in binary mode.
 * @param length    If non-NULL, receives the number of bytes read (not
 *                  counting the terminating NUL). Left untouched on failure.
 * @return Heap buffer holding the file contents, or NULL if filename is NULL
 *         or the file could not be opened, measured, allocated for, or fully
 *         read. An empty file yields an empty string. The caller owns the
 *         buffer and must release it with free().
 */
char *ReadKernelSourceFile(const char *filename, size_t *length)
{
    FILE *file;
    char *sourceString = NULL;
    long fileSize;
    size_t sourceLength;

    if (filename == NULL)
    {
        return NULL;
    }

    file = fopen(filename, "rb");
    if (file == NULL)
    {
        /* __LINE__ - 3 points back at the fopen() call above. */
        printf("%s at %d :Can't open %s\n", __FILE__, __LINE__ - 3, filename);
        return NULL;
    }

    /* Measure the file by seeking to its end, then rewind for reading. */
    if (fseek(file, 0, SEEK_END) != 0)
    {
        goto fail;
    }
    fileSize = ftell(file);
    if (fileSize < 0 || fseek(file, 0, SEEK_SET) != 0)
    {
        goto fail;
    }
    sourceLength = (size_t)fileSize;

    /* One extra byte for the terminating NUL. */
    sourceString = (char *)malloc(sourceLength + 1);
    if (sourceString == NULL)
    {
        goto fail;
    }

    /* Read byte-wise so a short read is detected and an empty file succeeds. */
    if (fread(sourceString, 1, sourceLength, file) != sourceLength)
    {
        goto fail;
    }

    fclose(file);
    sourceString[sourceLength] = '\0';
    if (length != NULL)
    {
        *length = sourceLength;
    }
    return sourceString;

fail:
    /* Single failure exit: report, then release everything acquired so far. */
    printf("%s at %d : Can't read source %s\n", __FILE__, __LINE__, filename);
    free(sourceString);
    fclose(file);
    return NULL;
}
/**
 * Sets up the OpenCL context in three steps:
 *   1. query the first available platform,
 *   2. pick one device on it (a GPU if one is found, otherwise a CPU),
 *   3. create a context for that device.
 *
 * @param device  Out-parameter that receives the selected device id.
 * @return The newly created context, or NULL on failure (a diagnostic is
 *         printed). The caller releases it with clReleaseContext().
 */
cl_context CreateContext(cl_device_id *device)
{
    cl_int errNum;
    cl_uint numPlatforms = 0;
    cl_platform_id firstPlatformId;
    cl_context context = NULL;

    /* Only the first platform is requested. */
    errNum = clGetPlatformIDs(1, &firstPlatformId, &numPlatforms);
    if (errNum != CL_SUCCESS || numPlatforms == 0)
    {
        printf( "Failed to find any OpenCL platforms." );
        return NULL;
    }

    /* Prefer a GPU device; fall back to a CPU device if that query fails. */
    errNum = clGetDeviceIDs(firstPlatformId, CL_DEVICE_TYPE_GPU, 1,
                            device, NULL);
    if (errNum != CL_SUCCESS)
    {
        printf( "There is no GPU, trying CPU..." );
        errNum = clGetDeviceIDs(firstPlatformId,
                                CL_DEVICE_TYPE_CPU, 1, device, NULL);
    }
    if (errNum != CL_SUCCESS)
    {
        printf( "There is NO GPU or CPU" );
        return NULL;
    }

    /* clCreateContext expects a pointer to the device list, i.e. `device`
     * itself, not the dereferenced id. */
    context = clCreateContext(NULL, 1, device, NULL, NULL, &errNum);
    if (errNum != CL_SUCCESS)
    {
        printf( " create context error\n" );
        return NULL;
    }
    return context;
}
 /**
  * Creates a command queue on the given device within the given context,
  * using default queue properties (0).
  *
  * @param context  Context the queue belongs to.
  * @param device   Device the queue submits commands to.
  * @return The new command queue, or NULL on failure (a diagnostic is
  *         printed). The caller releases it with clReleaseCommandQueue().
  */
 cl_command_queue CreateCommandQueue(cl_context context,
                                     cl_device_id device)
 {
     cl_int errNum = CL_SUCCESS;
     cl_command_queue commandQueue = NULL;

     /* Capture the error code rather than discarding it, and treat either a
      * reported error or a NULL handle as failure. */
     commandQueue = clCreateCommandQueue(context, device, 0, &errNum);
     if (errNum != CL_SUCCESS || commandQueue == NULL)
     {
         printf("Failed to create commandQueue for device 0");
         return NULL;
     }
     return commandQueue;
 }
 /**
  * Reads kernel source from a file, creates an OpenCL program object from it
  * and builds the program.
  *
  * @param context   Context the program is created in.
  * @param device    Device whose build log is queried if the build fails.
  * @param fileName  Path of the kernel source file.
  * @return The built program, or NULL if the source could not be read, the
  *         program object could not be created, or the build failed (the
  *         build log is printed in that case). The caller releases it with
  *         clReleaseProgram().
  */
 cl_program CreateProgram(cl_context context,
                          cl_device_id device,
                          const char *fileName)
 {
     cl_int errNum;
     cl_program program;
     size_t program_length = 0;
     const char *sources[1];
     char *source = ReadKernelSourceFile(fileName, &program_length);

     if (source == NULL)
     {
         /* The reader has already reported why; keep the caller-visible
          * message the same as for any other creation failure. */
         printf("Failed to create CL program from source." );
         return NULL;
     }

     sources[0] = source;
     program = clCreateProgramWithSource(context, 1, sources,
                                         &program_length, NULL);
     /* The source text is loaded into the program object by the call above,
      * so the host buffer is no longer needed on either outcome. */
     free(source);
     if (program == NULL)
     {
         printf("Failed to create CL program from source." );
         return NULL;
     }

     /* A NULL device list builds for every device associated with the program. */
     errNum = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
     if (errNum != CL_SUCCESS)
     {
         char buildLog[16384];
         /* Keep the log printable even if the query below fails. */
         buildLog[0] = '\0';
         clGetProgramBuildInfo(program, device,
                               CL_PROGRAM_BUILD_LOG,
                               sizeof(buildLog),
                               buildLog, NULL);
         printf("Error in kernel:%s ", buildLog);
         clReleaseProgram(program);
         return NULL;
     }
     return program;
 }
/**
 * Creates the three device buffers used by the vector-add kernel.
 *
 * memObjects[0] and memObjects[1] are read-only buffers initialised by
 * copying ARRAY_SIZE floats from a and b; memObjects[2] is a read-write
 * buffer of the same size for the result.
 *
 * @param context     Context the buffers are created in.
 * @param memObjects  Receives the three buffer handles. On failure some
 *                    entries may still hold valid buffers; the caller is
 *                    expected to release them.
 * @param a           First input vector (ARRAY_SIZE floats).
 * @param b           Second input vector (ARRAY_SIZE floats).
 * @return true if all three buffers were created, false otherwise.
 */
bool CreateMemObjects(cl_context context, cl_mem memObjects[3],
                      float * a, float * b)
{
    const size_t bufferBytes = sizeof(float) * ARRAY_SIZE;

    /* Inputs: copied from host memory at creation time. */
    memObjects[0] = clCreateBuffer(context,
                                   CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                   bufferBytes, a, NULL);
    memObjects[1] = clCreateBuffer(context,
                                   CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                   bufferBytes, b, NULL);
    /* Output: no host pointer. */
    memObjects[2] = clCreateBuffer(context, CL_MEM_READ_WRITE,
                                   bufferBytes, NULL, NULL);

    if (memObjects[0] != NULL && memObjects[1] != NULL
        && memObjects[2] != NULL)
    {
        return true;
    }

    printf("Error creating memory objects." );
    return false;
}
/**
 * Releases every OpenCL resource that was created; handles that are still 0
 * are skipped, so this can be called from any point of a partially completed
 * setup.
 *
 * @param context       Context to release, or 0.
 * @param commandQueue  Command queue to release, or 0.
 * @param program       Program to release, or 0.
 * @param kernel        Kernel to release, or 0.
 * @param memObjects    Three buffer handles; each non-zero entry is released.
 */
void Cleanup(cl_context context, cl_command_queue commandQueue,
             cl_program program, cl_kernel kernel,
             cl_mem memObjects[3])
{
    for (int i = 0; i < 3; i++)
    {
        if (memObjects[i] != 0)
            clReleaseMemObject(memObjects[i]);
    }
    if (commandQueue != 0)
        clReleaseCommandQueue(commandQueue);
    if (kernel != 0)
        clReleaseKernel(kernel);
    if (program != 0)
        clReleaseProgram(program);
    if (context != 0)
        clReleaseContext(context);
}
/**
 * Host program for the vector-add example.
 *
 * Creates the context, command queue, program and kernel, uploads two input
 * vectors (a[i] = i, b[i] = 2 * i), runs the "vector_add" kernel from
 * vecAdd.cl over ARRAY_SIZE work-items, reads the result back and prints it.
 *
 * @return 0 on success, 1 if any step fails (resources created so far are
 *         released first).
 */
int main(int argc, char **argv)
{
    cl_context context = 0;
    cl_command_queue commandQueue = 0;
    cl_program program = 0;
    cl_device_id device = 0;
    cl_kernel kernel = 0;
    cl_mem memObjects[3] = { 0, 0, 0 };
    cl_int errNum;

    /* Command-line arguments are not used. */
    (void)argc;
    (void)argv;

    /* Create the OpenCL context (this also selects the device). */
    context = CreateContext(&device);
    if (context == NULL)
    {
        printf("Failed to create OpenCL context." );
        return 1;
    }

    /* Create a command queue on the selected device. The device id is passed
     * by value, not by address. */
    commandQueue = CreateCommandQueue(context, device);
    if (commandQueue == NULL)
    {
        Cleanup(context, commandQueue, program, kernel,
                memObjects);
        return 1;
    }

    /* Create and build the OpenCL program from the kernel source file. */
    program = CreateProgram(context, device, "vecAdd.cl");
    if (program == NULL)
    {
        Cleanup(context, commandQueue, program,
                kernel, memObjects);
        return 1;
    }

    /* Create the kernel object for the vector_add entry point. */
    kernel = clCreateKernel(program, "vector_add", NULL);
    if (kernel == NULL)
    {
        printf( "Failed to create kernel");
        Cleanup(context, commandQueue, program,
                kernel, memObjects);
        return 1;
    }

    /* Prepare host data and create the device memory objects. */
    float result[ARRAY_SIZE];
    float a[ARRAY_SIZE];
    float b[ARRAY_SIZE];
    for (int i = 0; i < ARRAY_SIZE; i++)
    {
        a[i] = (float)i;
        b[i] = (float)(i * 2);
    }
    if (!CreateMemObjects(context, memObjects, a, b))
    {
        Cleanup(context, commandQueue, program,
                kernel, memObjects);
        return 1;
    }

    /* Bind the three buffers to the kernel arguments. The error codes are
     * OR-ed together so a single check covers all three calls. */
    errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]);
    errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem),
                             &memObjects[1]);
    errNum |= clSetKernelArg(kernel, 2, sizeof(cl_mem),
                             &memObjects[2]);
    if (errNum != CL_SUCCESS)
    {
        printf("Error setting kernel arguments.");
        Cleanup(context, commandQueue, program,
                kernel, memObjects);
        return 1;
    }

    /* One work-item per element, with a work-group size of 1. */
    size_t globalWorkSize[1] = { ARRAY_SIZE };
    size_t localWorkSize[1] = { 1 };

    /* Enqueue the kernel for execution. */
    errNum = clEnqueueNDRangeKernel(commandQueue, kernel,
                                    1, NULL,
                                    globalWorkSize,
                                    localWorkSize, 0, NULL,
                                    NULL);
    if (errNum != CL_SUCCESS)
    {
        printf( "Error queuing kernel for execution." );
        Cleanup(context, commandQueue, program, kernel,
                memObjects);
        return 1;
    }

    /* Copy the result back to the host; CL_TRUE makes this a blocking read. */
    errNum = clEnqueueReadBuffer(commandQueue, memObjects[2],
                                 CL_TRUE, 0,
                                 ARRAY_SIZE * sizeof(float),
                                 result, 0, NULL, NULL);
    if (errNum != CL_SUCCESS)
    {
        printf( "Error reading result buffer." );
        Cleanup(context, commandQueue, program, kernel,
                memObjects);
        return 1;
    }

    for (int i = 0; i < ARRAY_SIZE; i++)
    {
        printf("i=%d:%f\n", i, result[i]);
    }
    printf("Executed program succesfully." );
    Cleanup(context, commandQueue, program, kernel, memObjects);
    return 0;
}

我们以向量加法作为OpenCL编程的第一个例子。从例子中可以看出OpenCL编程的标准开发流程。

支持OpenCL的平台和设备很多,为了兼顾不同设备,OpenCL程序的第一步就是确定OpenCL执行的平台,在确定平台之后再确定执行OpenCL计算的设备。确定设备后创建上下文,上下文中包含上一步查询到的OpenCL计算设备(允许包含多个计算设备对象),以及接下来创建的内核、程序对象和内存对象。

创建上下文以后,需要创建命令队列。一个命令队列只对应一个OpenCL设备,但一个OpenCL设备可以对应多个命令队列。例如,如果需要使用上下文中包含的两个计算设备,就要为每个设备创建各自的命令队列。主机与OpenCL设备间的数据传输、内核执行等交互操作都是通过入队到命令队列中完成的,命令队列中的各个命令再交给OpenCL驱动或相应的硬件单元去执行。

由于在运行时才知道OpenCL设备信息,所以在OpenCL主机端程序中读取内核源码并创建程序对象,根据OpenCL设备编译、构建程序对象,最后创建内核对象。通过这三步操作,把OpenCL设备上执行的内核代码编译完成。

对于在OpenCL设备上执行的内核函数需要输入参数,在主机端调用设置内核参数函数。除此之外,还需要设置在设备上用于执行的工作组和工作项的参数。

当上述操作都完成以后,我们就可以把内核函数入队到命令队列中,命令队列提交给相应的设备执行。

OpenCL设备执行完计算后,把数据拷贝回主机端,并销毁分配的资源。

上述整个流程,如下图所示。

OpenCL执行流程

通过向量加法的例子,相信读者对于OpenCL开发有了一定的了解。

Camera课程

Python教程

Java教程

Web教程

数据库教程

图形图像教程

办公软件教程

Linux教程

计算机教程

大数据教程

开发工具教程