Cycles/OpenCL:
[blender.git] / intern / cycles / device / device_opencl.cpp
1 /*
2  * Copyright 2011, Blender Foundation.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public License
6  * as published by the Free Software Foundation; either version 2
7  * of the License, or (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write to the Free Software Foundation,
16  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17  */
18
19 #ifdef WITH_OPENCL
20
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include <string.h>
24
25 #include "device.h"
26 #include "device_intern.h"
27
28 #include "util_foreach.h"
29 #include "util_map.h"
30 #include "util_math.h"
31 #include "util_md5.h"
32 #include "util_opencl.h"
33 #include "util_opengl.h"
34 #include "util_path.h"
35 #include "util_time.h"
36
37 CCL_NAMESPACE_BEGIN
38
39 #define CL_MEM_PTR(p) ((cl_mem)(unsigned long)(p))
40
41 class OpenCLDevice : public Device
42 {
43 public:
44         cl_context cxContext;
45         cl_command_queue cqCommandQueue;
46         cl_platform_id cpPlatform;
47         cl_device_id cdDevice;
48         cl_program cpProgram;
49         cl_kernel ckPathTraceKernel;
50         cl_kernel ckFilmConvertKernel;
51         cl_int ciErr;
52         map<string, device_vector<uchar>*> const_mem_map;
53         map<string, device_memory*> mem_map;
54         device_ptr null_mem;
55         bool device_initialized;
56         string platform_name;
57
58         const char *opencl_error_string(cl_int err)
59         {
60                 switch (err) {
61                         case CL_SUCCESS: return "Success!";
62                         case CL_DEVICE_NOT_FOUND: return "Device not found.";
63                         case CL_DEVICE_NOT_AVAILABLE: return "Device not available";
64                         case CL_COMPILER_NOT_AVAILABLE: return "Compiler not available";
65                         case CL_MEM_OBJECT_ALLOCATION_FAILURE: return "Memory object allocation failure";
66                         case CL_OUT_OF_RESOURCES: return "Out of resources";
67                         case CL_OUT_OF_HOST_MEMORY: return "Out of host memory";
68                         case CL_PROFILING_INFO_NOT_AVAILABLE: return "Profiling information not available";
69                         case CL_MEM_COPY_OVERLAP: return "Memory copy overlap";
70                         case CL_IMAGE_FORMAT_MISMATCH: return "Image format mismatch";
71                         case CL_IMAGE_FORMAT_NOT_SUPPORTED: return "Image format not supported";
72                         case CL_BUILD_PROGRAM_FAILURE: return "Program build failure";
73                         case CL_MAP_FAILURE: return "Map failure";
74                         case CL_INVALID_VALUE: return "Invalid value";
75                         case CL_INVALID_DEVICE_TYPE: return "Invalid device type";
76                         case CL_INVALID_PLATFORM: return "Invalid platform";
77                         case CL_INVALID_DEVICE: return "Invalid device";
78                         case CL_INVALID_CONTEXT: return "Invalid context";
79                         case CL_INVALID_QUEUE_PROPERTIES: return "Invalid queue properties";
80                         case CL_INVALID_COMMAND_QUEUE: return "Invalid command queue";
81                         case CL_INVALID_HOST_PTR: return "Invalid host pointer";
82                         case CL_INVALID_MEM_OBJECT: return "Invalid memory object";
83                         case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: return "Invalid image format descriptor";
84                         case CL_INVALID_IMAGE_SIZE: return "Invalid image size";
85                         case CL_INVALID_SAMPLER: return "Invalid sampler";
86                         case CL_INVALID_BINARY: return "Invalid binary";
87                         case CL_INVALID_BUILD_OPTIONS: return "Invalid build options";
88                         case CL_INVALID_PROGRAM: return "Invalid program";
89                         case CL_INVALID_PROGRAM_EXECUTABLE: return "Invalid program executable";
90                         case CL_INVALID_KERNEL_NAME: return "Invalid kernel name";
91                         case CL_INVALID_KERNEL_DEFINITION: return "Invalid kernel definition";
92                         case CL_INVALID_KERNEL: return "Invalid kernel";
93                         case CL_INVALID_ARG_INDEX: return "Invalid argument index";
94                         case CL_INVALID_ARG_VALUE: return "Invalid argument value";
95                         case CL_INVALID_ARG_SIZE: return "Invalid argument size";
96                         case CL_INVALID_KERNEL_ARGS: return "Invalid kernel arguments";
97                         case CL_INVALID_WORK_DIMENSION: return "Invalid work dimension";
98                         case CL_INVALID_WORK_GROUP_SIZE: return "Invalid work group size";
99                         case CL_INVALID_WORK_ITEM_SIZE: return "Invalid work item size";
100                         case CL_INVALID_GLOBAL_OFFSET: return "Invalid global offset";
101                         case CL_INVALID_EVENT_WAIT_LIST: return "Invalid event wait list";
102                         case CL_INVALID_EVENT: return "Invalid event";
103                         case CL_INVALID_OPERATION: return "Invalid operation";
104                         case CL_INVALID_GL_OBJECT: return "Invalid OpenGL object";
105                         case CL_INVALID_BUFFER_SIZE: return "Invalid buffer size";
106                         case CL_INVALID_MIP_LEVEL: return "Invalid mip-map level";
107                         default: return "Unknown";
108                 }
109         }
110
111         bool opencl_error(cl_int err)
112         {
113                 if(err != CL_SUCCESS) {
114                         string message = string_printf("OpenCL error (%d): %s", err, opencl_error_string(err));
115                         if(error_msg == "")
116                                 error_msg = message;
117                         fprintf(stderr, "%s\n", message.c_str());
118                         return true;
119                 }
120
121                 return false;
122         }
123
124         void opencl_error(const string& message)
125         {
126                 if(error_msg == "")
127                         error_msg = message;
128                 fprintf(stderr, "%s\n", message.c_str());
129         }
130
131         void opencl_assert(cl_int err)
132         {
133                 if(err != CL_SUCCESS) {
134                         string message = string_printf("OpenCL error (%d): %s", err, opencl_error_string(err));
135                         if(error_msg == "")
136                                 error_msg = message;
137                         fprintf(stderr, "%s\n", message.c_str());
138 #ifndef NDEBUG
139                         abort();
140 #endif
141                 }
142         }
143
144         OpenCLDevice(DeviceInfo& info, bool background_)
145         {
146                 background = background_;
147                 cpPlatform = NULL;
148                 cxContext = NULL;
149                 cqCommandQueue = NULL;
150                 cpProgram = NULL;
151                 ckPathTraceKernel = NULL;
152                 ckFilmConvertKernel = NULL;
153                 null_mem = 0;
154                 device_initialized = false;
155
156                 /* setup platform */
157                 cl_uint num_platforms;
158
159                 ciErr = clGetPlatformIDs(0, NULL, &num_platforms);
160                 if(opencl_error(ciErr))
161                         return;
162
163                 if(num_platforms == 0) {
164                         opencl_error("OpenCL: no platforms found.");
165                         return;
166                 }
167
168                 ciErr = clGetPlatformIDs(1, &cpPlatform, NULL);
169                 if(opencl_error(ciErr))
170                         return;
171
172                 char name[256];
173                 clGetPlatformInfo(cpPlatform, CL_PLATFORM_NAME, sizeof(name), &name, NULL);
174                 platform_name = name;
175
176                 /* get devices */
177                 vector<cl_device_id> device_ids;
178                 cl_uint num_devices;
179
180                 if(opencl_error(clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU|CL_DEVICE_TYPE_ACCELERATOR, 0, NULL, &num_devices)))
181                         return;
182
183                 if(info.num > num_devices) {
184                         if(num_devices == 0)
185                                 opencl_error("OpenCL: no devices found.");
186                         else
187                                 opencl_error("OpenCL: specified device not found.");
188                         return;
189                 }
190
191                 device_ids.resize(num_devices);
192                 
193                 if(opencl_error(clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU|CL_DEVICE_TYPE_ACCELERATOR, num_devices, &device_ids[0], NULL)))
194                         return;
195
196                 cdDevice = device_ids[info.num];
197
198                 /* create context */
199                 cxContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErr);
200                 if(opencl_error(ciErr))
201                         return;
202
203                 cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr);
204                 if(opencl_error(ciErr))
205                         return;
206
207                 null_mem = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_ONLY, 1, NULL, &ciErr);
208                 device_initialized = true;
209         }
210
211         bool opencl_version_check()
212         {
213                 char version[256];
214
215                 int major, minor, req_major = 1, req_minor = 1;
216
217                 clGetPlatformInfo(cpPlatform, CL_PLATFORM_VERSION, sizeof(version), &version, NULL);
218
219                 if(sscanf(version, "OpenCL %d.%d", &major, &minor) < 2) {
220                         opencl_error(string_printf("OpenCL: failed to parse platform version string (%s).", version));
221                         return false;
222                 }
223
224                 if(!((major == req_major && minor >= req_minor) || (major > req_major))) {
225                         opencl_error(string_printf("OpenCL: platform version 1.1 or later required, found %d.%d", major, minor));
226                         return false;
227                 }
228
229                 clGetDeviceInfo(cdDevice, CL_DEVICE_OPENCL_C_VERSION, sizeof(version), &version, NULL);
230
231                 if(sscanf(version, "OpenCL C %d.%d", &major, &minor) < 2) {
232                         opencl_error(string_printf("OpenCL: failed to parse OpenCL C version string (%s).", version));
233                         return false;
234                 }
235
236                 if(!((major == req_major && minor >= req_minor) || (major > req_major))) {
237                         opencl_error(string_printf("OpenCL: C version 1.1 or later required, found %d.%d", major, minor));
238                         return false;
239                 }
240
241                 /* we don't check CL_DEVICE_VERSION since for e.g. nvidia sm 1.3 cards this is
242                         1.0 even if the language features are there, just limited shared memory */
243
244                 return true;
245         }
246
247         bool load_binary(const string& kernel_path, const string& clbin)
248         {
249                 /* read binary into memory */
250                 vector<uint8_t> binary;
251
252                 if(!path_read_binary(clbin, binary)) {
253                         opencl_error(string_printf("OpenCL failed to read cached binary %s.", clbin.c_str()));
254                         return false;
255                 }
256
257                 /* create program */
258                 cl_int status;
259                 size_t size = binary.size();
260                 const uint8_t *bytes = &binary[0];
261
262                 cpProgram = clCreateProgramWithBinary(cxContext, 1, &cdDevice,
263                         &size, &bytes, &status, &ciErr);
264
265                 if(opencl_error(status) || opencl_error(ciErr)) {
266                         opencl_error(string_printf("OpenCL failed create program from cached binary %s.", clbin.c_str()));
267                         return false;
268                 }
269
270                 if(!build_kernel(kernel_path))
271                         return false;
272
273                 return true;
274         }
275
276         bool save_binary(const string& clbin)
277         {
278                 size_t size = 0;
279                 clGetProgramInfo(cpProgram, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL);
280
281                 if(!size)
282                         return false;
283
284                 vector<uint8_t> binary(size);
285                 uint8_t *bytes = &binary[0];
286
287                 clGetProgramInfo(cpProgram, CL_PROGRAM_BINARIES, sizeof(uint8_t*), &bytes, NULL);
288
289                 if(!path_write_binary(clbin, binary)) {
290                         opencl_error(string_printf("OpenCL failed to write cached binary %s.", clbin.c_str()));
291                         return false;
292                 }
293
294                 return true;
295         }
296
297         string kernel_build_options()
298         {
299                 string build_options = " -cl-fast-relaxed-math ";
300                 
301                 /* Multi Closure for nVidia cards */
302                 if(platform_name == "NVIDIA CUDA")
303                         build_options += "-D__KERNEL_SHADING__ -D__MULTI_CLOSURE__ -cl-nv-maxrregcount=24 -cl-nv-verbose ";
304                         
305                 /* No Float3 for Apple */
306                 else if(platform_name == "Apple")
307                         build_options += "-D__CL_NO_FLOAT3__ ";
308                         
309                 /* Basic shading for AMD cards (non Apple) */
310                 else if(platform_name == "AMD Accelerated Parallel Processing")
311                         build_options += "-D__KERNEL_SHADING__ -D__CL_NO_FLOAT3__ ";
312
313                 return build_options;
314         }
315
316         bool build_kernel(const string& kernel_path)
317         {
318                 string build_options = kernel_build_options();
319         
320                 ciErr = clBuildProgram(cpProgram, 0, NULL, build_options.c_str(), NULL, NULL);
321
322                 if(ciErr != CL_SUCCESS) {
323                         /* show build errors */
324                         char *build_log;
325                         size_t ret_val_size;
326
327                         clGetProgramBuildInfo(cpProgram, cdDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
328
329                         build_log = new char[ret_val_size+1];
330                         clGetProgramBuildInfo(cpProgram, cdDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
331
332                         build_log[ret_val_size] = '\0';
333                         opencl_error("OpenCL build failed: errors in console");
334                         fprintf(stderr, "%s\n", build_log);
335
336                         delete[] build_log;
337
338                         return false;
339                 }
340
341                 return true;
342         }
343
344         bool compile_kernel(const string& kernel_path, const string& kernel_md5)
345         {
346                 /* we compile kernels consisting of many files. unfortunately opencl
347                    kernel caches do not seem to recognize changes in included files.
348                    so we force recompile on changes by adding the md5 hash of all files */
349                 string source = "#include \"kernel.cl\" // " + kernel_md5 + "\n";
350                 source = path_source_replace_includes(source, kernel_path);
351
352                 size_t source_len = source.size();
353                 const char *source_str = source.c_str();
354
355                 cpProgram = clCreateProgramWithSource(cxContext, 1, &source_str, &source_len, &ciErr);
356
357                 if(opencl_error(ciErr))
358                         return false;
359
360                 double starttime = time_dt();
361                 printf("Compiling OpenCL kernel ...\n");
362
363                 if(!build_kernel(kernel_path))
364                         return false;
365
366                 printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);
367
368                 return true;
369         }
370
371         string device_md5_hash()
372         {
373                 MD5Hash md5;
374                 char version[256], driver[256], name[256], vendor[256];
375
376                 clGetPlatformInfo(cpPlatform, CL_PLATFORM_VENDOR, sizeof(vendor), &vendor, NULL);
377                 clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL);
378                 clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(name), &name, NULL);
379                 clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(driver), &driver, NULL);
380
381                 md5.append((uint8_t*)vendor, strlen(vendor));
382                 md5.append((uint8_t*)version, strlen(version));
383                 md5.append((uint8_t*)name, strlen(name));
384                 md5.append((uint8_t*)driver, strlen(driver));
385
386                 string options = kernel_build_options();
387                 md5.append((uint8_t*)options.c_str(), options.size());
388
389                 return md5.get_hex();
390         }
391
392         bool load_kernels(bool experimental)
393         {
394                 /* verify if device was initialized */
395                 if(!device_initialized) {
396                         fprintf(stderr, "OpenCL: failed to initialize device.\n");
397                         return false;
398                 }
399
400                 /* verify we have right opencl version */
401                 if(!opencl_version_check())
402                         return false;
403
404                 /* md5 hash to detect changes */
405                 string kernel_path = path_get("kernel");
406                 string kernel_md5 = path_files_md5_hash(kernel_path);
407                 string device_md5 = device_md5_hash();
408
409                 /* try to use cache binary */
410                 string clbin = string_printf("cycles_kernel_%s_%s.clbin", device_md5.c_str(), kernel_md5.c_str());;
411                 clbin = path_user_get(path_join("cache", clbin));
412
413                 if(path_exists(clbin)) {
414                         /* if exists already, try use it */
415                         if(!load_binary(kernel_path, clbin))
416                                 return false;
417                 }
418                 else {
419                         /* compile kernel */
420                         if(!compile_kernel(kernel_path, kernel_md5))
421                                 return false;
422
423                         /* save binary for reuse */
424                         save_binary(clbin);
425                 }
426
427                 /* find kernels */
428                 ckPathTraceKernel = clCreateKernel(cpProgram, "kernel_ocl_path_trace", &ciErr);
429                 if(opencl_error(ciErr))
430                         return false;
431
432                 ckFilmConvertKernel = clCreateKernel(cpProgram, "kernel_ocl_tonemap", &ciErr);
433                 if(opencl_error(ciErr))
434                         return false;
435
436                 return true;
437         }
438
439         ~OpenCLDevice()
440         {
441                 if(null_mem)
442                         clReleaseMemObject(CL_MEM_PTR(null_mem));
443
444                 map<string, device_vector<uchar>*>::iterator mt;
445                 for(mt = const_mem_map.begin(); mt != const_mem_map.end(); mt++) {
446                         mem_free(*(mt->second));
447                         delete mt->second;
448                 }
449
450                 if(ckPathTraceKernel)
451                         clReleaseKernel(ckPathTraceKernel);  
452                 if(ckFilmConvertKernel)
453                         clReleaseKernel(ckFilmConvertKernel);  
454                 if(cpProgram)
455                         clReleaseProgram(cpProgram);
456                 if(cqCommandQueue)
457                         clReleaseCommandQueue(cqCommandQueue);
458                 if(cxContext)
459                         clReleaseContext(cxContext);
460         }
461
462         void mem_alloc(device_memory& mem, MemoryType type)
463         {
464                 size_t size = mem.memory_size();
465
466                 if(type == MEM_READ_ONLY)
467                         mem.device_pointer = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_ONLY, size, NULL, &ciErr);
468                 else if(type == MEM_WRITE_ONLY)
469                         mem.device_pointer = (device_ptr)clCreateBuffer(cxContext, CL_MEM_WRITE_ONLY, size, NULL, &ciErr);
470                 else
471                         mem.device_pointer = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_WRITE, size, NULL, &ciErr);
472
473                 opencl_assert(ciErr);
474         }
475
476         void mem_copy_to(device_memory& mem)
477         {
478                 /* this is blocking */
479                 size_t size = mem.memory_size();
480                 ciErr = clEnqueueWriteBuffer(cqCommandQueue, CL_MEM_PTR(mem.device_pointer), CL_TRUE, 0, size, (void*)mem.data_pointer, 0, NULL, NULL);
481                 opencl_assert(ciErr);
482         }
483
484         void mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
485         {
486                 size_t offset = elem*y*w;
487                 size_t size = elem*w*h;
488
489                 ciErr = clEnqueueReadBuffer(cqCommandQueue, CL_MEM_PTR(mem.device_pointer), CL_TRUE, offset, size, (uchar*)mem.data_pointer + offset, 0, NULL, NULL);
490                 opencl_assert(ciErr);
491         }
492
493         void mem_zero(device_memory& mem)
494         {
495                 if(mem.device_pointer) {
496                         memset((void*)mem.data_pointer, 0, mem.memory_size());
497                         mem_copy_to(mem);
498                 }
499         }
500
501         void mem_free(device_memory& mem)
502         {
503                 if(mem.device_pointer) {
504                         ciErr = clReleaseMemObject(CL_MEM_PTR(mem.device_pointer));
505                         mem.device_pointer = 0;
506                         opencl_assert(ciErr);
507                 }
508         }
509
510         void const_copy_to(const char *name, void *host, size_t size)
511         {
512                 if(const_mem_map.find(name) == const_mem_map.end()) {
513                         device_vector<uchar> *data = new device_vector<uchar>();
514                         data->copy((uchar*)host, size);
515
516                         mem_alloc(*data, MEM_READ_ONLY);
517                         const_mem_map[name] = data;
518                 }
519                 else {
520                         device_vector<uchar> *data = const_mem_map[name];
521                         data->copy((uchar*)host, size);
522                 }
523
524                 mem_copy_to(*const_mem_map[name]);
525         }
526
527         void tex_alloc(const char *name, device_memory& mem, bool interpolation, bool periodic)
528         {
529                 mem_alloc(mem, MEM_READ_ONLY);
530                 mem_copy_to(mem);
531                 mem_map[name] = &mem;
532         }
533
534         void tex_free(device_memory& mem)
535         {
536                 if(mem.data_pointer)
537                         mem_free(mem);
538         }
539
540         size_t global_size_round_up(int group_size, int global_size)
541         {
542                 int r = global_size % group_size;
543                 return global_size + ((r == 0)? 0: group_size - r);
544         }
545
546         void path_trace(DeviceTask& task)
547         {
548                 /* cast arguments to cl types */
549                 cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
550                 cl_mem d_buffer = CL_MEM_PTR(task.buffer);
551                 cl_mem d_rng_state = CL_MEM_PTR(task.rng_state);
552                 cl_int d_x = task.x;
553                 cl_int d_y = task.y;
554                 cl_int d_w = task.w;
555                 cl_int d_h = task.h;
556                 cl_int d_sample = task.sample;
557                 cl_int d_offset = task.offset;
558                 cl_int d_stride = task.stride;
559
560                 /* sample arguments */
561                 int narg = 0;
562                 ciErr = 0;
563
564                 ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_data), (void*)&d_data);
565                 ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_buffer), (void*)&d_buffer);
566                 ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_rng_state), (void*)&d_rng_state);
567
568 #define KERNEL_TEX(type, ttype, name) \
569         ciErr |= set_kernel_arg_mem(ckPathTraceKernel, &narg, #name);
570 #include "kernel_textures.h"
571
572                 ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_sample), (void*)&d_sample);
573                 ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_x), (void*)&d_x);
574                 ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_y), (void*)&d_y);
575                 ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_w), (void*)&d_w);
576                 ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_h), (void*)&d_h);
577                 ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_offset), (void*)&d_offset);
578                 ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_stride), (void*)&d_stride);
579
580                 opencl_assert(ciErr);
581
582                 size_t workgroup_size;
583
584                 clGetKernelWorkGroupInfo(ckPathTraceKernel, cdDevice,
585                         CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL);
586         
587                 workgroup_size = max(sqrt((double)workgroup_size), 1.0);
588
589                 size_t local_size[2] = {workgroup_size, workgroup_size};
590                 size_t global_size[2] = {global_size_round_up(local_size[0], d_w), global_size_round_up(local_size[1], d_h)};
591
592                 /* run kernel */
593                 ciErr = clEnqueueNDRangeKernel(cqCommandQueue, ckPathTraceKernel, 2, NULL, global_size, local_size, 0, NULL, NULL);
594                 opencl_assert(ciErr);
595                 opencl_assert(clFinish(cqCommandQueue));
596         }
597
598         cl_int set_kernel_arg_mem(cl_kernel kernel, int *narg, const char *name)
599         {
600                 cl_mem ptr;
601                 cl_int err = 0;
602
603                 if(mem_map.find(name) != mem_map.end()) {
604                         device_memory *mem = mem_map[name];
605                 
606                         ptr = CL_MEM_PTR(mem->device_pointer);
607                 }
608                 else {
609                         /* work around NULL not working, even though the spec says otherwise */
610                         ptr = CL_MEM_PTR(null_mem);
611                 }
612                 
613                 err |= clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void*)&ptr);
614                 opencl_assert(err);
615
616                 return err;
617         }
618
619         void tonemap(DeviceTask& task)
620         {
621                 /* cast arguments to cl types */
622                 cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
623                 cl_mem d_rgba = CL_MEM_PTR(task.rgba);
624                 cl_mem d_buffer = CL_MEM_PTR(task.buffer);
625                 cl_int d_x = task.x;
626                 cl_int d_y = task.y;
627                 cl_int d_w = task.w;
628                 cl_int d_h = task.h;
629                 cl_int d_sample = task.sample;
630                 cl_int d_resolution = task.resolution;
631                 cl_int d_offset = task.offset;
632                 cl_int d_stride = task.stride;
633
634                 /* sample arguments */
635                 int narg = 0;
636                 ciErr = 0;
637
638                 ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_data), (void*)&d_data);
639                 ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_rgba), (void*)&d_rgba);
640                 ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_buffer), (void*)&d_buffer);
641
642 #define KERNEL_TEX(type, ttype, name) \
643         ciErr |= set_kernel_arg_mem(ckFilmConvertKernel, &narg, #name);
644 #include "kernel_textures.h"
645
646                 ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_sample), (void*)&d_sample);
647                 ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_resolution), (void*)&d_resolution);
648                 ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_x), (void*)&d_x);
649                 ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_y), (void*)&d_y);
650                 ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_w), (void*)&d_w);
651                 ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_h), (void*)&d_h);
652                 ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_offset), (void*)&d_offset);
653                 ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_stride), (void*)&d_stride);
654
655                 opencl_assert(ciErr);
656
657                 size_t workgroup_size;
658
659                 clGetKernelWorkGroupInfo(ckFilmConvertKernel, cdDevice,
660                         CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL);
661         
662                 workgroup_size = max(sqrt((double)workgroup_size), 1.0);
663
664                 size_t local_size[2] = {workgroup_size, workgroup_size};
665                 size_t global_size[2] = {global_size_round_up(local_size[0], d_w), global_size_round_up(local_size[1], d_h)};
666
667                 /* run kernel */
668                 ciErr = clEnqueueNDRangeKernel(cqCommandQueue, ckFilmConvertKernel, 2, NULL, global_size, local_size, 0, NULL, NULL);
669                 opencl_assert(ciErr);
670                 opencl_assert(clFinish(cqCommandQueue));
671         }
672
673         void task_add(DeviceTask& maintask)
674         {
675                 list<DeviceTask> tasks;
676
677                 /* arbitrary limit to work around apple ATI opencl issue */
678                 if(platform_name == "Apple")
679                         maintask.split_max_size(tasks, 76800);
680                 else
681                         tasks.push_back(maintask);
682
683                 DeviceTask task;
684
685                 foreach(DeviceTask& task, tasks) {
686                         if(task.type == DeviceTask::TONEMAP)
687                                 tonemap(task);
688                         else if(task.type == DeviceTask::PATH_TRACE)
689                                 path_trace(task);
690                 }
691         }
692
693         void task_wait()
694         {
695         }
696
697         void task_cancel()
698         {
699         }
700 };
701
702 Device *device_opencl_create(DeviceInfo& info, bool background)
703 {
704         return new OpenCLDevice(info, background);
705 }
706
707 void device_opencl_info(vector<DeviceInfo>& devices)
708 {
709         vector<cl_device_id> device_ids;
710         cl_uint num_devices;
711         cl_platform_id platform_id;
712         cl_uint num_platforms;
713
714         /* get devices */
715         if(clGetPlatformIDs(0, NULL, &num_platforms) != CL_SUCCESS || num_platforms == 0)
716                 return;
717
718         if(clGetPlatformIDs(1, &platform_id, NULL) != CL_SUCCESS)
719                 return;
720
721         if(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU|CL_DEVICE_TYPE_ACCELERATOR, 0, NULL, &num_devices) != CL_SUCCESS)
722                 return;
723         
724         device_ids.resize(num_devices);
725
726         if(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU|CL_DEVICE_TYPE_ACCELERATOR, num_devices, &device_ids[0], NULL) != CL_SUCCESS)
727                 return;
728         
729         /* add devices */
730         for(int num = 0; num < num_devices; num++) {
731                 cl_device_id device_id = device_ids[num];
732                 char name[1024];
733
734                 if(clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(name), &name, NULL) != CL_SUCCESS)
735                         continue;
736
737                 DeviceInfo info;
738
739                 info.type = DEVICE_OPENCL;
740                 info.description = string(name);
741                 info.id = string_printf("OPENCL_%d", num);
742                 info.num = num;
743                 /* we don't know if it's used for display, but assume it is */
744                 info.display_device = true;
745                 info.advanced_shading = false;
746
747                 devices.push_back(info);
748         }
749 }
750
751 CCL_NAMESPACE_END
752
753 #endif /* WITH_OPENCL */
754