9a55f95789507f1fd92a71f78751fc7c8d87c756
[blender.git] / intern / cycles / device / device_opencl.cpp
1 /*
2  * Copyright 2011, Blender Foundation.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public License
6  * as published by the Free Software Foundation; either version 2
7  * of the License, or (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write to the Free Software Foundation,
16  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17  */
18
19 #ifdef WITH_OPENCL
20
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include <string.h>
24
25 #include "device.h"
26 #include "device_intern.h"
27
28 #include "util_foreach.h"
29 #include "util_map.h"
30 #include "util_math.h"
31 #include "util_md5.h"
32 #include "util_opencl.h"
33 #include "util_opengl.h"
34 #include "util_path.h"
35 #include "util_time.h"
36
37 CCL_NAMESPACE_BEGIN
38
39 #define CL_MEM_PTR(p) ((cl_mem)(unsigned long)(p))
40
41 class OpenCLDevice : public Device
42 {
43 public:
44         cl_context cxContext;
45         cl_command_queue cqCommandQueue;
46         cl_platform_id cpPlatform;
47         cl_device_id cdDevice;
48         cl_program cpProgram;
49         cl_kernel ckPathTraceKernel;
50         cl_kernel ckFilmConvertKernel;
51         cl_int ciErr;
52         map<string, device_vector<uchar>*> const_mem_map;
53         map<string, device_memory*> mem_map;
54         device_ptr null_mem;
55         bool device_initialized;
56         string platform_name;
57
58         const char *opencl_error_string(cl_int err)
59         {
60                 switch (err) {
61                         case CL_SUCCESS: return "Success!";
62                         case CL_DEVICE_NOT_FOUND: return "Device not found.";
63                         case CL_DEVICE_NOT_AVAILABLE: return "Device not available";
64                         case CL_COMPILER_NOT_AVAILABLE: return "Compiler not available";
65                         case CL_MEM_OBJECT_ALLOCATION_FAILURE: return "Memory object allocation failure";
66                         case CL_OUT_OF_RESOURCES: return "Out of resources";
67                         case CL_OUT_OF_HOST_MEMORY: return "Out of host memory";
68                         case CL_PROFILING_INFO_NOT_AVAILABLE: return "Profiling information not available";
69                         case CL_MEM_COPY_OVERLAP: return "Memory copy overlap";
70                         case CL_IMAGE_FORMAT_MISMATCH: return "Image format mismatch";
71                         case CL_IMAGE_FORMAT_NOT_SUPPORTED: return "Image format not supported";
72                         case CL_BUILD_PROGRAM_FAILURE: return "Program build failure";
73                         case CL_MAP_FAILURE: return "Map failure";
74                         case CL_INVALID_VALUE: return "Invalid value";
75                         case CL_INVALID_DEVICE_TYPE: return "Invalid device type";
76                         case CL_INVALID_PLATFORM: return "Invalid platform";
77                         case CL_INVALID_DEVICE: return "Invalid device";
78                         case CL_INVALID_CONTEXT: return "Invalid context";
79                         case CL_INVALID_QUEUE_PROPERTIES: return "Invalid queue properties";
80                         case CL_INVALID_COMMAND_QUEUE: return "Invalid command queue";
81                         case CL_INVALID_HOST_PTR: return "Invalid host pointer";
82                         case CL_INVALID_MEM_OBJECT: return "Invalid memory object";
83                         case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: return "Invalid image format descriptor";
84                         case CL_INVALID_IMAGE_SIZE: return "Invalid image size";
85                         case CL_INVALID_SAMPLER: return "Invalid sampler";
86                         case CL_INVALID_BINARY: return "Invalid binary";
87                         case CL_INVALID_BUILD_OPTIONS: return "Invalid build options";
88                         case CL_INVALID_PROGRAM: return "Invalid program";
89                         case CL_INVALID_PROGRAM_EXECUTABLE: return "Invalid program executable";
90                         case CL_INVALID_KERNEL_NAME: return "Invalid kernel name";
91                         case CL_INVALID_KERNEL_DEFINITION: return "Invalid kernel definition";
92                         case CL_INVALID_KERNEL: return "Invalid kernel";
93                         case CL_INVALID_ARG_INDEX: return "Invalid argument index";
94                         case CL_INVALID_ARG_VALUE: return "Invalid argument value";
95                         case CL_INVALID_ARG_SIZE: return "Invalid argument size";
96                         case CL_INVALID_KERNEL_ARGS: return "Invalid kernel arguments";
97                         case CL_INVALID_WORK_DIMENSION: return "Invalid work dimension";
98                         case CL_INVALID_WORK_GROUP_SIZE: return "Invalid work group size";
99                         case CL_INVALID_WORK_ITEM_SIZE: return "Invalid work item size";
100                         case CL_INVALID_GLOBAL_OFFSET: return "Invalid global offset";
101                         case CL_INVALID_EVENT_WAIT_LIST: return "Invalid event wait list";
102                         case CL_INVALID_EVENT: return "Invalid event";
103                         case CL_INVALID_OPERATION: return "Invalid operation";
104                         case CL_INVALID_GL_OBJECT: return "Invalid OpenGL object";
105                         case CL_INVALID_BUFFER_SIZE: return "Invalid buffer size";
106                         case CL_INVALID_MIP_LEVEL: return "Invalid mip-map level";
107                         default: return "Unknown";
108                 }
109         }
110
111         bool opencl_error(cl_int err)
112         {
113                 if(err != CL_SUCCESS) {
114                         string message = string_printf("OpenCL error (%d): %s", err, opencl_error_string(err));
115                         if(error_msg == "")
116                                 error_msg = message;
117                         fprintf(stderr, "%s\n", message.c_str());
118                         return true;
119                 }
120
121                 return false;
122         }
123
124         void opencl_error(const string& message)
125         {
126                 if(error_msg == "")
127                         error_msg = message;
128                 fprintf(stderr, "%s\n", message.c_str());
129         }
130
131         void opencl_assert(cl_int err)
132         {
133                 if(err != CL_SUCCESS) {
134                         string message = string_printf("OpenCL error (%d): %s", err, opencl_error_string(err));
135                         if(error_msg == "")
136                                 error_msg = message;
137                         fprintf(stderr, "%s\n", message.c_str());
138 #ifndef NDEBUG
139                         abort();
140 #endif
141                 }
142         }
143
144         OpenCLDevice(DeviceInfo& info, bool background_)
145         {
146                 background = background_;
147                 cpPlatform = NULL;
148                 cxContext = NULL;
149                 cqCommandQueue = NULL;
150                 cpProgram = NULL;
151                 ckPathTraceKernel = NULL;
152                 ckFilmConvertKernel = NULL;
153                 null_mem = 0;
154                 device_initialized = false;
155
156                 /* setup platform */
157                 cl_uint num_platforms;
158
159                 ciErr = clGetPlatformIDs(0, NULL, &num_platforms);
160                 if(opencl_error(ciErr))
161                         return;
162
163                 if(num_platforms == 0) {
164                         opencl_error("OpenCL: no platforms found.");
165                         return;
166                 }
167
168                 ciErr = clGetPlatformIDs(num_platforms, &cpPlatform, NULL);
169                 if(opencl_error(ciErr))
170                         return;
171
172                 char name[256];
173                 clGetPlatformInfo(cpPlatform, CL_PLATFORM_NAME, sizeof(name), &name, NULL);
174                 platform_name = name;
175
176                 /* get devices */
177                 vector<cl_device_id> device_ids;
178                 cl_uint num_devices;
179
180                 if(opencl_error(clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU|CL_DEVICE_TYPE_ACCELERATOR, 0, NULL, &num_devices)))
181                         return;
182
183                 if(info.num > num_devices) {
184                         if(num_devices == 0)
185                                 opencl_error("OpenCL: no devices found.");
186                         else
187                                 opencl_error("OpenCL: specified device not found.");
188                         return;
189                 }
190
191                 device_ids.resize(num_devices);
192                 
193                 if(opencl_error(clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU|CL_DEVICE_TYPE_ACCELERATOR, num_devices, &device_ids[0], NULL)))
194                         return;
195
196                 cdDevice = device_ids[info.num];
197
198                 /* create context */
199                 cxContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErr);
200                 if(opencl_error(ciErr))
201                         return;
202
203                 cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr);
204                 if(opencl_error(ciErr))
205                         return;
206
207                 null_mem = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_ONLY, 1, NULL, &ciErr);
208                 device_initialized = true;
209         }
210
211         bool opencl_version_check()
212         {
213                 char version[256];
214
215                 int major, minor, req_major = 1, req_minor = 1;
216
217                 clGetPlatformInfo(cpPlatform, CL_PLATFORM_VERSION, sizeof(version), &version, NULL);
218
219                 if(sscanf(version, "OpenCL %d.%d", &major, &minor) < 2) {
220                         opencl_error(string_printf("OpenCL: failed to parse platform version string (%s).", version));
221                         return false;
222                 }
223
224                 if(!((major == req_major && minor >= req_minor) || (major > req_major))) {
225                         opencl_error(string_printf("OpenCL: platform version 1.1 or later required, found %d.%d", major, minor));
226                         return false;
227                 }
228
229                 clGetDeviceInfo(cdDevice, CL_DEVICE_OPENCL_C_VERSION, sizeof(version), &version, NULL);
230
231                 if(sscanf(version, "OpenCL C %d.%d", &major, &minor) < 2) {
232                         opencl_error(string_printf("OpenCL: failed to parse OpenCL C version string (%s).", version));
233                         return false;
234                 }
235
236                 if(!((major == req_major && minor >= req_minor) || (major > req_major))) {
237                         opencl_error(string_printf("OpenCL: C version 1.1 or later required, found %d.%d", major, minor));
238                         return false;
239                 }
240
241                 /* we don't check CL_DEVICE_VERSION since for e.g. nvidia sm 1.3 cards this is
242                         1.0 even if the language features are there, just limited shared memory */
243
244                 return true;
245         }
246
247         bool load_binary(const string& kernel_path, const string& clbin)
248         {
249                 /* read binary into memory */
250                 vector<uint8_t> binary;
251
252                 if(!path_read_binary(clbin, binary)) {
253                         opencl_error(string_printf("OpenCL failed to read cached binary %s.", clbin.c_str()));
254                         return false;
255                 }
256
257                 /* create program */
258                 cl_int status;
259                 size_t size = binary.size();
260                 const uint8_t *bytes = &binary[0];
261
262                 cpProgram = clCreateProgramWithBinary(cxContext, 1, &cdDevice,
263                         &size, &bytes, &status, &ciErr);
264
265                 if(opencl_error(status) || opencl_error(ciErr)) {
266                         opencl_error(string_printf("OpenCL failed create program from cached binary %s.", clbin.c_str()));
267                         return false;
268                 }
269
270                 if(!build_kernel(kernel_path))
271                         return false;
272
273                 return true;
274         }
275
276         bool save_binary(const string& clbin)
277         {
278                 size_t size = 0;
279                 clGetProgramInfo(cpProgram, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL);
280
281                 if(!size)
282                         return false;
283
284                 vector<uint8_t> binary(size);
285                 uint8_t *bytes = &binary[0];
286
287                 clGetProgramInfo(cpProgram, CL_PROGRAM_BINARIES, sizeof(uint8_t*), &bytes, NULL);
288
289                 if(!path_write_binary(clbin, binary)) {
290                         opencl_error(string_printf("OpenCL failed to write cached binary %s.", clbin.c_str()));
291                         return false;
292                 }
293
294                 return true;
295         }
296
297         string kernel_build_options()
298         {
299                 string build_options = " -cl-fast-relaxed-math ";
300                 
301                 /* full shading only on NVIDIA cards at the moment */
302                 if(platform_name == "NVIDIA CUDA")
303                         build_options += "-D__KERNEL_SHADING__ -D__MULTI_CLOSURE__ -cl-nv-maxrregcount=24 -cl-nv-verbose ";
304                 if(platform_name == "Apple")
305                         build_options += " -D__CL_NO_FLOAT3__ ";
306
307                 return build_options;
308         }
309
310         bool build_kernel(const string& kernel_path)
311         {
312                 string build_options = kernel_build_options();
313         
314                 ciErr = clBuildProgram(cpProgram, 0, NULL, build_options.c_str(), NULL, NULL);
315
316                 if(ciErr != CL_SUCCESS) {
317                         /* show build errors */
318                         char *build_log;
319                         size_t ret_val_size;
320
321                         clGetProgramBuildInfo(cpProgram, cdDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
322
323                         build_log = new char[ret_val_size+1];
324                         clGetProgramBuildInfo(cpProgram, cdDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
325
326                         build_log[ret_val_size] = '\0';
327                         opencl_error("OpenCL build failed: errors in console");
328                         fprintf(stderr, "%s\n", build_log);
329
330                         delete[] build_log;
331
332                         return false;
333                 }
334
335                 return true;
336         }
337
338         bool compile_kernel(const string& kernel_path, const string& kernel_md5)
339         {
340                 /* we compile kernels consisting of many files. unfortunately opencl
341                    kernel caches do not seem to recognize changes in included files.
342                    so we force recompile on changes by adding the md5 hash of all files */
343                 string source = "#include \"kernel.cl\" // " + kernel_md5 + "\n";
344                 source = path_source_replace_includes(source, kernel_path);
345
346                 size_t source_len = source.size();
347                 const char *source_str = source.c_str();
348
349                 cpProgram = clCreateProgramWithSource(cxContext, 1, &source_str, &source_len, &ciErr);
350
351                 if(opencl_error(ciErr))
352                         return false;
353
354                 double starttime = time_dt();
355                 printf("Compiling OpenCL kernel ...\n");
356
357                 if(!build_kernel(kernel_path))
358                         return false;
359
360                 printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);
361
362                 return true;
363         }
364
365         string device_md5_hash()
366         {
367                 MD5Hash md5;
368                 char version[256], driver[256], name[256], vendor[256];
369
370                 clGetPlatformInfo(cpPlatform, CL_PLATFORM_VENDOR, sizeof(vendor), &vendor, NULL);
371                 clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL);
372                 clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(name), &name, NULL);
373                 clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(driver), &driver, NULL);
374
375                 md5.append((uint8_t*)vendor, strlen(vendor));
376                 md5.append((uint8_t*)version, strlen(version));
377                 md5.append((uint8_t*)name, strlen(name));
378                 md5.append((uint8_t*)driver, strlen(driver));
379
380                 string options = kernel_build_options();
381                 md5.append((uint8_t*)options.c_str(), options.size());
382
383                 return md5.get_hex();
384         }
385
386         bool load_kernels(bool experimental)
387         {
388                 /* verify if device was initialized */
389                 if(!device_initialized) {
390                         fprintf(stderr, "OpenCL: failed to initialize device.\n");
391                         return false;
392                 }
393
394                 /* verify we have right opencl version */
395                 if(!opencl_version_check())
396                         return false;
397
398                 /* md5 hash to detect changes */
399                 string kernel_path = path_get("kernel");
400                 string kernel_md5 = path_files_md5_hash(kernel_path);
401                 string device_md5 = device_md5_hash();
402
403                 /* try to use cache binary */
404                 string clbin = string_printf("cycles_kernel_%s_%s.clbin", device_md5.c_str(), kernel_md5.c_str());;
405                 clbin = path_user_get(path_join("cache", clbin));
406
407                 if(path_exists(clbin)) {
408                         /* if exists already, try use it */
409                         if(!load_binary(kernel_path, clbin))
410                                 return false;
411                 }
412                 else {
413                         /* compile kernel */
414                         if(!compile_kernel(kernel_path, kernel_md5))
415                                 return false;
416
417                         /* save binary for reuse */
418                         save_binary(clbin);
419                 }
420
421                 /* find kernels */
422                 ckPathTraceKernel = clCreateKernel(cpProgram, "kernel_ocl_path_trace", &ciErr);
423                 if(opencl_error(ciErr))
424                         return false;
425
426                 ckFilmConvertKernel = clCreateKernel(cpProgram, "kernel_ocl_tonemap", &ciErr);
427                 if(opencl_error(ciErr))
428                         return false;
429
430                 return true;
431         }
432
433         ~OpenCLDevice()
434         {
435                 if(null_mem)
436                         clReleaseMemObject(CL_MEM_PTR(null_mem));
437
438                 map<string, device_vector<uchar>*>::iterator mt;
439                 for(mt = const_mem_map.begin(); mt != const_mem_map.end(); mt++) {
440                         mem_free(*(mt->second));
441                         delete mt->second;
442                 }
443
444                 if(ckPathTraceKernel)
445                         clReleaseKernel(ckPathTraceKernel);  
446                 if(ckFilmConvertKernel)
447                         clReleaseKernel(ckFilmConvertKernel);  
448                 if(cpProgram)
449                         clReleaseProgram(cpProgram);
450                 if(cqCommandQueue)
451                         clReleaseCommandQueue(cqCommandQueue);
452                 if(cxContext)
453                         clReleaseContext(cxContext);
454         }
455
456         void mem_alloc(device_memory& mem, MemoryType type)
457         {
458                 size_t size = mem.memory_size();
459
460                 if(type == MEM_READ_ONLY)
461                         mem.device_pointer = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_ONLY, size, NULL, &ciErr);
462                 else if(type == MEM_WRITE_ONLY)
463                         mem.device_pointer = (device_ptr)clCreateBuffer(cxContext, CL_MEM_WRITE_ONLY, size, NULL, &ciErr);
464                 else
465                         mem.device_pointer = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_WRITE, size, NULL, &ciErr);
466
467                 opencl_assert(ciErr);
468         }
469
470         void mem_copy_to(device_memory& mem)
471         {
472                 /* this is blocking */
473                 size_t size = mem.memory_size();
474                 ciErr = clEnqueueWriteBuffer(cqCommandQueue, CL_MEM_PTR(mem.device_pointer), CL_TRUE, 0, size, (void*)mem.data_pointer, 0, NULL, NULL);
475                 opencl_assert(ciErr);
476         }
477
478         void mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
479         {
480                 size_t offset = elem*y*w;
481                 size_t size = elem*w*h;
482
483                 ciErr = clEnqueueReadBuffer(cqCommandQueue, CL_MEM_PTR(mem.device_pointer), CL_TRUE, offset, size, (uchar*)mem.data_pointer + offset, 0, NULL, NULL);
484                 opencl_assert(ciErr);
485         }
486
487         void mem_zero(device_memory& mem)
488         {
489                 if(mem.device_pointer) {
490                         memset((void*)mem.data_pointer, 0, mem.memory_size());
491                         mem_copy_to(mem);
492                 }
493         }
494
495         void mem_free(device_memory& mem)
496         {
497                 if(mem.device_pointer) {
498                         ciErr = clReleaseMemObject(CL_MEM_PTR(mem.device_pointer));
499                         mem.device_pointer = 0;
500                         opencl_assert(ciErr);
501                 }
502         }
503
504         void const_copy_to(const char *name, void *host, size_t size)
505         {
506                 if(const_mem_map.find(name) == const_mem_map.end()) {
507                         device_vector<uchar> *data = new device_vector<uchar>();
508                         data->copy((uchar*)host, size);
509
510                         mem_alloc(*data, MEM_READ_ONLY);
511                         const_mem_map[name] = data;
512                 }
513                 else {
514                         device_vector<uchar> *data = const_mem_map[name];
515                         data->copy((uchar*)host, size);
516                 }
517
518                 mem_copy_to(*const_mem_map[name]);
519         }
520
521         void tex_alloc(const char *name, device_memory& mem, bool interpolation, bool periodic)
522         {
523                 mem_alloc(mem, MEM_READ_ONLY);
524                 mem_copy_to(mem);
525                 mem_map[name] = &mem;
526         }
527
528         void tex_free(device_memory& mem)
529         {
530                 if(mem.data_pointer)
531                         mem_free(mem);
532         }
533
534         size_t global_size_round_up(int group_size, int global_size)
535         {
536                 int r = global_size % group_size;
537                 return global_size + ((r == 0)? 0: group_size - r);
538         }
539
540         void path_trace(DeviceTask& task)
541         {
542                 /* cast arguments to cl types */
543                 cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
544                 cl_mem d_buffer = CL_MEM_PTR(task.buffer);
545                 cl_mem d_rng_state = CL_MEM_PTR(task.rng_state);
546                 cl_int d_x = task.x;
547                 cl_int d_y = task.y;
548                 cl_int d_w = task.w;
549                 cl_int d_h = task.h;
550                 cl_int d_sample = task.sample;
551                 cl_int d_offset = task.offset;
552                 cl_int d_stride = task.stride;
553
554                 /* sample arguments */
555                 int narg = 0;
556                 ciErr = 0;
557
558                 ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_data), (void*)&d_data);
559                 ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_buffer), (void*)&d_buffer);
560                 ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_rng_state), (void*)&d_rng_state);
561
562 #define KERNEL_TEX(type, ttype, name) \
563         ciErr |= set_kernel_arg_mem(ckPathTraceKernel, &narg, #name);
564 #include "kernel_textures.h"
565
566                 ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_sample), (void*)&d_sample);
567                 ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_x), (void*)&d_x);
568                 ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_y), (void*)&d_y);
569                 ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_w), (void*)&d_w);
570                 ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_h), (void*)&d_h);
571                 ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_offset), (void*)&d_offset);
572                 ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_stride), (void*)&d_stride);
573
574                 opencl_assert(ciErr);
575
576                 size_t workgroup_size;
577
578                 clGetKernelWorkGroupInfo(ckPathTraceKernel, cdDevice,
579                         CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL);
580         
581                 workgroup_size = max(sqrt((double)workgroup_size), 1.0);
582
583                 size_t local_size[2] = {workgroup_size, workgroup_size};
584                 size_t global_size[2] = {global_size_round_up(local_size[0], d_w), global_size_round_up(local_size[1], d_h)};
585
586                 /* run kernel */
587                 ciErr = clEnqueueNDRangeKernel(cqCommandQueue, ckPathTraceKernel, 2, NULL, global_size, local_size, 0, NULL, NULL);
588                 opencl_assert(ciErr);
589                 opencl_assert(clFinish(cqCommandQueue));
590         }
591
592         cl_int set_kernel_arg_mem(cl_kernel kernel, int *narg, const char *name)
593         {
594                 cl_mem ptr;
595                 cl_int err = 0;
596
597                 if(mem_map.find(name) != mem_map.end()) {
598                         device_memory *mem = mem_map[name];
599                 
600                         ptr = CL_MEM_PTR(mem->device_pointer);
601                 }
602                 else {
603                         /* work around NULL not working, even though the spec says otherwise */
604                         ptr = CL_MEM_PTR(null_mem);
605                 }
606                 
607                 err |= clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void*)&ptr);
608                 opencl_assert(err);
609
610                 return err;
611         }
612
613         void tonemap(DeviceTask& task)
614         {
615                 /* cast arguments to cl types */
616                 cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
617                 cl_mem d_rgba = CL_MEM_PTR(task.rgba);
618                 cl_mem d_buffer = CL_MEM_PTR(task.buffer);
619                 cl_int d_x = task.x;
620                 cl_int d_y = task.y;
621                 cl_int d_w = task.w;
622                 cl_int d_h = task.h;
623                 cl_int d_sample = task.sample;
624                 cl_int d_resolution = task.resolution;
625                 cl_int d_offset = task.offset;
626                 cl_int d_stride = task.stride;
627
628                 /* sample arguments */
629                 int narg = 0;
630                 ciErr = 0;
631
632                 ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_data), (void*)&d_data);
633                 ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_rgba), (void*)&d_rgba);
634                 ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_buffer), (void*)&d_buffer);
635
636 #define KERNEL_TEX(type, ttype, name) \
637         ciErr |= set_kernel_arg_mem(ckFilmConvertKernel, &narg, #name);
638 #include "kernel_textures.h"
639
640                 ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_sample), (void*)&d_sample);
641                 ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_resolution), (void*)&d_resolution);
642                 ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_x), (void*)&d_x);
643                 ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_y), (void*)&d_y);
644                 ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_w), (void*)&d_w);
645                 ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_h), (void*)&d_h);
646                 ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_offset), (void*)&d_offset);
647                 ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_stride), (void*)&d_stride);
648
649                 opencl_assert(ciErr);
650
651                 size_t workgroup_size;
652
653                 clGetKernelWorkGroupInfo(ckFilmConvertKernel, cdDevice,
654                         CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL);
655         
656                 workgroup_size = max(sqrt((double)workgroup_size), 1.0);
657
658                 size_t local_size[2] = {workgroup_size, workgroup_size};
659                 size_t global_size[2] = {global_size_round_up(local_size[0], d_w), global_size_round_up(local_size[1], d_h)};
660
661                 /* run kernel */
662                 ciErr = clEnqueueNDRangeKernel(cqCommandQueue, ckFilmConvertKernel, 2, NULL, global_size, local_size, 0, NULL, NULL);
663                 opencl_assert(ciErr);
664                 opencl_assert(clFinish(cqCommandQueue));
665         }
666
667         void task_add(DeviceTask& maintask)
668         {
669                 list<DeviceTask> tasks;
670
671                 /* arbitrary limit to work around apple ATI opencl issue */
672                 if(platform_name == "Apple")
673                         maintask.split_max_size(tasks, 76800);
674                 else
675                         tasks.push_back(maintask);
676
677                 DeviceTask task;
678
679                 foreach(DeviceTask& task, tasks) {
680                         if(task.type == DeviceTask::TONEMAP)
681                                 tonemap(task);
682                         else if(task.type == DeviceTask::PATH_TRACE)
683                                 path_trace(task);
684                 }
685         }
686
687         void task_wait()
688         {
689         }
690
691         void task_cancel()
692         {
693         }
694 };
695
696 Device *device_opencl_create(DeviceInfo& info, bool background)
697 {
698         return new OpenCLDevice(info, background);
699 }
700
701 void device_opencl_info(vector<DeviceInfo>& devices)
702 {
703         vector<cl_device_id> device_ids;
704         cl_uint num_devices;
705         cl_platform_id platform_id;
706         cl_uint num_platforms;
707
708         /* get devices */
709         if(clGetPlatformIDs(0, NULL, &num_platforms) != CL_SUCCESS || num_platforms == 0)
710                 return;
711
712         if(clGetPlatformIDs(num_platforms, &platform_id, NULL) != CL_SUCCESS)
713                 return;
714
715         if(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU|CL_DEVICE_TYPE_ACCELERATOR, 0, NULL, &num_devices) != CL_SUCCESS)
716                 return;
717         
718         device_ids.resize(num_devices);
719
720         if(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU|CL_DEVICE_TYPE_ACCELERATOR, num_devices, &device_ids[0], NULL) != CL_SUCCESS)
721                 return;
722         
723         /* add devices */
724         for(int num = 0; num < num_devices; num++) {
725                 cl_device_id device_id = device_ids[num];
726                 char name[1024];
727
728                 if(clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(name), &name, NULL) != CL_SUCCESS)
729                         continue;
730
731                 DeviceInfo info;
732
733                 info.type = DEVICE_OPENCL;
734                 info.description = string(name);
735                 info.id = string_printf("OPENCL_%d", num);
736                 info.num = num;
737                 /* we don't know if it's used for display, but assume it is */
738                 info.display_device = true;
739                 info.advanced_shading = false;
740
741                 devices.push_back(info);
742         }
743 }
744
745 CCL_NAMESPACE_END
746
747 #endif /* WITH_OPENCL */
748