OpenCL 设备队列的使用示例

设备队列作为OpenCL 2.0中一个比较重要的特性,我们将会用一个详细的示例演示设备队列的使用。主机端和设备端代码如下:
主机端代码:

1. #include〈stdio.h>
2. #include〈stdlib.h>
3. #include〈CL/cl.h>
4. char* readKernelFile(const char* filename, size_t* length)
5. {
6.          FILE* file = NULL;
7.          size_t sourceLength;
8.          char* sourceString;
9.          int ret;
10.         file = fopen(filename, "rb");
11.         if(file == NULL) {
12.                 printf("%s at %d :Can't open %s\n",
13.                        _FILE_,_LINE_-2, filename);
14.              return NULL;
15.         }
16.         fseek(file, 0, SEEK_END);
17.         sourceLength = ftell(file);
18.         fseek(file, 0, SEEK_SET);
19.         sourceString = (char *)malloc(sourceLength + 1);
20.         ret= fread((sourceString), sourceLength, 1, file);
21.         if(ret == 0) {
22.               printf("%s at %d : Can't read source %s\n",
23.                      _FILE_,_LINE_-2, filename);
24.            return NULL;
25.         }
26.         fclose(file);
27.         if(length != 0){
28.              *length = sourceLength;
29.         }
30.         sourceString[sourceLength] = '\0';
31.         return sourceString;
32. }
33. int main()
34. {
35.     cl_platform_id platform;
36.     cl_device_id device;
37.     cl_context context;
38.     cl_command_queue cmdqueue;
39.     cl_command_queue devicequeue;
40.     cl_program program;
41.     cl_kernel kernel_saxpy_dp_no_wait;
42.     int glbSize;
43.      float factor;
44.      err=clGetPlatformIDs(1,&platform,NULL);
45.      checkErr(err,_LINE_);
46.      err=clGetDeviceIDs(platform,CL_DEVICE_TYPE_GPU,
47.                           1,&device,NULL);
48.      context=clCreateContext(NULL,1,&device,NULL,NULL,&err);
49.      cmdqueue=clCreateCommandQueue(context,device,NULL,&err);
50.      cl_queue_properties props[] = {
51.                  CL_QUEUE_PROPERTIES,
52.                  CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE
53.                  |CL_QUEUE_ON_DEVICE|CL_QUEUE_ON_DEVICE_DEFAULT,
54.                  0
55.     };
56.     devicequeue= clCreateCommandQueueWithProperties(
57.                      context,device,props,&err);
58.     size_t program_length;
59.     char* const source=readKernelFile("dp_Kernels.cl",
60.                                         &program_length);
61.     program=clCreateProgramWithSource(context,1,
62.                                  (const char**)&source,NULL,&err);
63.     err=clBuildProgram(program,1,&device,
64.                              "-cl-std=CL2.0 ",NULL,NULL);
65.     if(CL_SUCCESS!=err){
66.             char *buffer;
67.             printf("Error: Failed to build program executable!");
68.             size_t logsize;
69.             err=clGetProgramBuildInfo(program,device,
70.                             CL_PROGRAM_BUILD_LOG,0,NULL,&logsize);
71.             buffer=(char*)malloc(logsize*sizeof(char));
72.             err=clGetProgramBuildInfo(program,device,
73.                   CL_PROGRAM_BUILD_LOG,logsize,buffer,NULL);
74.             printf("log:%s\n",buffer);
75.             free(buffer);
76.       }
77.       kernel_saxpy_dp_no_wait=clCreateKernel(program,
78.                                    "saxpy_dp_no_wait", &err);
79.      float *saxpy_src_0;
80.      float *saxpy_src_1;
81.      float *saxpy_dst_0;
82.
83.       glbSize=8192;
84.       factor=2.3f;
85.       size_t glbSizeBytes = glbSize * sizeof(float);
86.       saxpy_dst_0=(float*)clSVMAlloc(context,
87.                   CL_MEM_READ_WRITE,glbSizeBytes,0);
88.       saxpy_src_0=(float*)clSVMAlloc(context,
89.                    CL_MEM_READ_ONLY,glbSizeBytes,0);
90.       saxpy_src_1=(float*)clSVMAlloc(context,
91.                    CL_MEM_READ_ONLY,glbSizeBytes,0);
92.      float one=1.0f;
93.      float two=2.0f;
94.      float three=3.0f;
95.      err=clEnqueueSVMMemFill(cmdqueue,saxpy_src_0,
96.                           (const void *)&one, sizeof(float),
97.     glbSizeBytes, 0, NULL, NULL);
98.     err=clEnqueueSVMMemFill(cmdqueue,saxpy_src_1,
99.                            (const void *)&two, sizeof(float),
100.                            glbSizeBytes, 0, NULL, NULL);
101.    err=clEnqueueSVMMemFill(cmdqueue,saxpy_dst_0,
102.                            (const void *)&three, sizeof(float),
103.                             glbSizeBytes, 0, NULL, NULL);
104.    err=clFinish(cmdqueue);
105.    err = clSetKernelArg(kernel_saxpy_dp_no_wait,
106.                            0, sizeof(int), (void *)&glbSize);
107.    err |= clSetKernelArg(kernel_saxpy_dp_no_wait,
108.                            1, sizeof(float), (void *)&factor);
109.    err |= clSetKernelArgSVMPointer(
110.                     kernel_saxpy_dp_no_wait, 2, saxpy_src_0);
111.    err |= clSetKernelArgSVMPointer(
112.                     kernel_saxpy_dp_no_wait, 3, saxpy_src_1);
113.   err |= clSetKernelArgSVMPointer(
114.                     kernel_saxpy_dp_no_wait, 4, saxpy_dst_0);
115.
116.   size_t globalsize[1]={glbSize/localsize[0]};
117.
118.   err=clEnqueueNDRangeKernel(
119.                 cmdqueue,kernel_saxpy_dp_no_wait,1,
120.                 NULL,globalsize,localsize,0,NULL,NULL);
121.   err = clFinish(cmdqueue);
122.
123.   clEnqueueSVMMap(cmdqueue, CL_TRUE, CL_MAP_READ,
124.                    saxpy_dst_0, glbSizeBytes, 0, NULL, NULL);
125.   err = clFinish(cmdqueue);
126.   for (int i = 0; i 〈 100; i++ )
127.    printf("i=%d,%f \n",i, saxpy_dst_0[i]);
128.    clEnqueueSVMUnmap(cmdqueue, saxpy_dst_0, 0, NULL, NULL);
129.   err = clFinish(cmdqueue);
130.   err=clReleaseKernel(kernel_saxpy_dp_no_wait);
131.   err=clReleaseProgram(program);
132.   err=clReleaseCommandQueue(cmdqueue);
133.   clSVMFree(context,saxpy_dst_0);
134.   clSVMFree(context,saxpy_src_0);
135.   clSVMFree(context,saxpy_src_1);
136.   err=clReleaseContext(context);
137.}

设备端代码:

138.__kernel void saxpy_dp_child(const int    numElems,
139.                        const float factor,
140.                        __global const float *src_0,
141.                        __global const float *src_1,
142.                        __global       float *dst_0)
143.{
144.           uint gid = get_global_id(0);
145.           if (gid 〈 numElems)
146.           dst_0[gid] = factor * src_0[gid] * src_1[gid];
147.}
148.
149.__kernel void saxpy_dp_no_wait(
150.                             const int    numElems,
151.                             const float factor,
152.                             __global const float *src_0,
153.                             __global const float *src_1,
154.                             __global       float *dst_0)
155.{
156.      uint global_id = get_global_id(0);
157.      uint global_sz = get_global_size(0);
158.
159.      uint child_global_sz = numElems / global_sz;
160.      uint child_offset = global_id * child_global_sz;
161.
162.      __global const float *src_0_child =
163.      &src_0[child_offset];
164.      _global const float *src_1_child = &src_1[child_offset];
165.
166.       _global       float *dst_0_child = &dst_0[child_offset];
167.
168.      queue_t defQ = get_default_queue();
169.      ndrange_t ndrange = ndrange_1D(child_global_sz);
170.      void (^saxpy_dp_child_wrapper)(void) =
171.                   ^{saxpy_dp_child(child_global_sz,
172.                    factor,
173.                    src_0_child,
174.                    src_1_child,
175.                    dst_0_child);};
176.       int err_ret = enqueue_kernel(defQ,
177.                                      CLK_ENQUEUE_FLAGS_NO_WAIT,
178.                                      ndrange, saxpy_dp_child_wrapper);
179.}

上述代码实现的是长度为glbSize=8192两个矢量src_0和src_相乘,结果再乘以factor常量。代码116行设置父内核的全局工作项个数为8192/256=32个。父内核中32个工作项每个都启动一个子内核,从代码129~160行可以看到每个子内核中的全局工作项为256个。
代码粗体部分都是关于设备队列的操作。代码56行创建了一个在设备命令队列,代码168行通过get_default_queue()在设备中获得了这个设备命令队列。代码169行,使用ndrange_1D()函数设置子内核全局工作项大小。代码170~175行创建子内核的块。代码176~179行,父内核中每个工作项入队一个子内核。

Camera课程

Python教程

Java教程

Web教程

数据库教程

图形图像教程

办公软件教程

Linux教程

计算机教程

大数据教程

开发工具教程