设备队列作为OpenCL 2.0中一个比较重要的特性,我们将会用一个详细的示例演示设备队列的使用。主机端和设备端代码如下:
主机端代码:
1. #include〈stdio.h>
2. #include〈stdlib.h>
3. #include〈CL/cl.h>
4. char* readKernelFile(const char* filename, size_t* length)
5. {
6. FILE* file = NULL;
7. size_t sourceLength;
8. char* sourceString;
9. int ret;
10. file = fopen(filename, "rb");
11. if(file == NULL) {
12. printf("%s at %d :Can't open %s\n",
13. _FILE_,_LINE_-2, filename);
14. return NULL;
15. }
16. fseek(file, 0, SEEK_END);
17. sourceLength = ftell(file);
18. fseek(file, 0, SEEK_SET);
19. sourceString = (char *)malloc(sourceLength + 1);
20. ret= fread((sourceString), sourceLength, 1, file);
21. if(ret == 0) {
22. printf("%s at %d : Can't read source %s\n",
23. _FILE_,_LINE_-2, filename);
24. return NULL;
25. }
26. fclose(file);
27. if(length != 0){
28. *length = sourceLength;
29. }
30. sourceString[sourceLength] = '\0';
31. return sourceString;
32. }
33. int main()
34. {
35. cl_platform_id platform;
36. cl_device_id device;
37. cl_context context;
38. cl_command_queue cmdqueue;
39. cl_command_queue devicequeue;
40. cl_program program;
41. cl_kernel kernel_saxpy_dp_no_wait;
42. int glbSize;
43. float factor;
44. err=clGetPlatformIDs(1,&platform,NULL);
45. checkErr(err,_LINE_);
46. err=clGetDeviceIDs(platform,CL_DEVICE_TYPE_GPU,
47. 1,&device,NULL);
48. context=clCreateContext(NULL,1,&device,NULL,NULL,&err);
49. cmdqueue=clCreateCommandQueue(context,device,NULL,&err);
50. cl_queue_properties props[] = {
51. CL_QUEUE_PROPERTIES,
52. CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE
53. |CL_QUEUE_ON_DEVICE|CL_QUEUE_ON_DEVICE_DEFAULT,
54. 0
55. };
56. devicequeue= clCreateCommandQueueWithProperties(
57. context,device,props,&err);
58. size_t program_length;
59. char* const source=readKernelFile("dp_Kernels.cl",
60. &program_length);
61. program=clCreateProgramWithSource(context,1,
62. (const char**)&source,NULL,&err);
63. err=clBuildProgram(program,1,&device,
64. "-cl-std=CL2.0 ",NULL,NULL);
65. if(CL_SUCCESS!=err){
66. char *buffer;
67. printf("Error: Failed to build program executable!");
68. size_t logsize;
69. err=clGetProgramBuildInfo(program,device,
70. CL_PROGRAM_BUILD_LOG,0,NULL,&logsize);
71. buffer=(char*)malloc(logsize*sizeof(char));
72. err=clGetProgramBuildInfo(program,device,
73. CL_PROGRAM_BUILD_LOG,logsize,buffer,NULL);
74. printf("log:%s\n",buffer);
75. free(buffer);
76. }
77. kernel_saxpy_dp_no_wait=clCreateKernel(program,
78. "saxpy_dp_no_wait", &err);
79. float *saxpy_src_0;
80. float *saxpy_src_1;
81. float *saxpy_dst_0;
82.
83. glbSize=8192;
84. factor=2.3f;
85. size_t glbSizeBytes = glbSize * sizeof(float);
86. saxpy_dst_0=(float*)clSVMAlloc(context,
87. CL_MEM_READ_WRITE,glbSizeBytes,0);
88. saxpy_src_0=(float*)clSVMAlloc(context,
89. CL_MEM_READ_ONLY,glbSizeBytes,0);
90. saxpy_src_1=(float*)clSVMAlloc(context,
91. CL_MEM_READ_ONLY,glbSizeBytes,0);
92. float one=1.0f;
93. float two=2.0f;
94. float three=3.0f;
95. err=clEnqueueSVMMemFill(cmdqueue,saxpy_src_0,
96. (const void *)&one, sizeof(float),
97. glbSizeBytes, 0, NULL, NULL);
98. err=clEnqueueSVMMemFill(cmdqueue,saxpy_src_1,
99. (const void *)&two, sizeof(float),
100. glbSizeBytes, 0, NULL, NULL);
101. err=clEnqueueSVMMemFill(cmdqueue,saxpy_dst_0,
102. (const void *)&three, sizeof(float),
103. glbSizeBytes, 0, NULL, NULL);
104. err=clFinish(cmdqueue);
105. err = clSetKernelArg(kernel_saxpy_dp_no_wait,
106. 0, sizeof(int), (void *)&glbSize);
107. err |= clSetKernelArg(kernel_saxpy_dp_no_wait,
108. 1, sizeof(float), (void *)&factor);
109. err |= clSetKernelArgSVMPointer(
110. kernel_saxpy_dp_no_wait, 2, saxpy_src_0);
111. err |= clSetKernelArgSVMPointer(
112. kernel_saxpy_dp_no_wait, 3, saxpy_src_1);
113. err |= clSetKernelArgSVMPointer(
114. kernel_saxpy_dp_no_wait, 4, saxpy_dst_0);
115.
116. size_t globalsize[1]={glbSize/localsize[0]};
117.
118. err=clEnqueueNDRangeKernel(
119. cmdqueue,kernel_saxpy_dp_no_wait,1,
120. NULL,globalsize,localsize,0,NULL,NULL);
121. err = clFinish(cmdqueue);
122.
123. clEnqueueSVMMap(cmdqueue, CL_TRUE, CL_MAP_READ,
124. saxpy_dst_0, glbSizeBytes, 0, NULL, NULL);
125. err = clFinish(cmdqueue);
126. for (int i = 0; i 〈 100; i++ )
127. printf("i=%d,%f \n",i, saxpy_dst_0[i]);
128. clEnqueueSVMUnmap(cmdqueue, saxpy_dst_0, 0, NULL, NULL);
129. err = clFinish(cmdqueue);
130. err=clReleaseKernel(kernel_saxpy_dp_no_wait);
131. err=clReleaseProgram(program);
132. err=clReleaseCommandQueue(cmdqueue);
133. clSVMFree(context,saxpy_dst_0);
134. clSVMFree(context,saxpy_src_0);
135. clSVMFree(context,saxpy_src_1);
136. err=clReleaseContext(context);
137.}
设备端代码:
138.__kernel void saxpy_dp_child(const int numElems,
139. const float factor,
140. __global const float *src_0,
141. __global const float *src_1,
142. __global float *dst_0)
143.{
144. uint gid = get_global_id(0);
145. if (gid 〈 numElems)
146. dst_0[gid] = factor * src_0[gid] * src_1[gid];
147.}
148.
149.__kernel void saxpy_dp_no_wait(
150. const int numElems,
151. const float factor,
152. __global const float *src_0,
153. __global const float *src_1,
154. __global float *dst_0)
155.{
156. uint global_id = get_global_id(0);
157. uint global_sz = get_global_size(0);
158.
159. uint child_global_sz = numElems / global_sz;
160. uint child_offset = global_id * child_global_sz;
161.
162. __global const float *src_0_child =
163. &src_0[child_offset];
164. _global const float *src_1_child = &src_1[child_offset];
165.
166. _global float *dst_0_child = &dst_0[child_offset];
167.
168. queue_t defQ = get_default_queue();
169. ndrange_t ndrange = ndrange_1D(child_global_sz);
170. void (^saxpy_dp_child_wrapper)(void) =
171. ^{saxpy_dp_child(child_global_sz,
172. factor,
173. src_0_child,
174. src_1_child,
175. dst_0_child);};
176. int err_ret = enqueue_kernel(defQ,
177. CLK_ENQUEUE_FLAGS_NO_WAIT,
178. ndrange, saxpy_dp_child_wrapper);
179.}
上述代码实现的是长度为glbSize=8192两个矢量src_0和src_相乘,结果再乘以factor常量。代码116行设置父内核的全局工作项个数为8192/256=32个。父内核中32个工作项每个都启动一个子内核,从代码129~160行可以看到每个子内核中的全局工作项为256个。
代码粗体部分都是关于设备队列的操作。代码56行创建了一个在设备命令队列,代码168行通过get_default_queue()在设备中获得了这个设备命令队列。代码169行,使用ndrange_1D()函数设置子内核全局工作项大小。代码170~175行创建子内核的块。代码176~179行,父内核中每个工作项入队一个子内核。