Merge branch 'blender2.7'
[blender.git] / intern / cycles / device / opencl / opencl_split.cpp
1 /*
2  * Copyright 2011-2013 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #ifdef WITH_OPENCL
18
19 #include "device/opencl/opencl.h"
20
21 #include "kernel/kernel_types.h"
22 #include "kernel/split/kernel_split_data_types.h"
23
24 #include "util/util_algorithm.h"
25 #include "util/util_debug.h"
26 #include "util/util_foreach.h"
27 #include "util/util_logging.h"
28 #include "util/util_md5.h"
29 #include "util/util_path.h"
30 #include "util/util_time.h"
31
32 CCL_NAMESPACE_BEGIN
33
34 struct texture_slot_t {
35         texture_slot_t(const string& name, int slot)
36                 : name(name),
37                   slot(slot) {
38         }
39         string name;
40         int slot;
41 };
42
43 static const string fast_compiled_kernels =
44         "data_init "
45         "path_init "
46         "state_buffer_size "
47         "scene_intersect "
48         "queue_enqueue "
49         "shader_setup "
50         "shader_sort "
51         "enqueue_inactive "
52         "next_iteration_setup "
53         "indirect_subsurface "
54         "buffer_update";
55
56 const string OpenCLDevice::get_opencl_program_name(bool single_program, const string& kernel_name)
57 {
58         if (single_program) {
59                 return "split";
60         }
61         else {
62                 if (fast_compiled_kernels.find(kernel_name) != std::string::npos) {
63                         return "split_bundle";
64                 }
65                 else {
66                         return "split_" + kernel_name;
67                 }
68         }
69 }
70
71 const string OpenCLDevice::get_opencl_program_filename(bool single_program, const string& kernel_name)
72 {
73         if (single_program) {
74                 return "kernel_split.cl";
75         }
76         else {
77                 if (fast_compiled_kernels.find(kernel_name) != std::string::npos) {
78                         return "kernel_split_bundle.cl";
79                 }
80                 else {
81                         return "kernel_" + kernel_name + ".cl";
82                 }
83         }
84 }
85
86 string OpenCLDevice::get_build_options(const DeviceRequestedFeatures& requested_features, const string& opencl_program_name)
87 {
88         /* first check for non-split kernel programs */
89         if (opencl_program_name == "base" || opencl_program_name == "denoising") {
90                 return "";
91         }
92         else if (opencl_program_name == "bake") {
93                 /* Note: get_build_options for bake is only requested when baking is enabled.
94                    displace and background are always requested.
95                    `__SPLIT_KERNEL__` must not be present in the compile directives for bake */
96                 DeviceRequestedFeatures features(requested_features);
97                 features.use_denoising = false;
98                 features.use_object_motion = false;
99                 features.use_camera_motion = false;
100                 return features.get_build_options();
101         }
102         else if (opencl_program_name == "displace") {
103                 /* As displacement does not use any nodes from the Shading group (eg BSDF).
104                    We disable all features that are related to shading. */
105                 DeviceRequestedFeatures features(requested_features);
106                 features.use_denoising = false;
107                 features.use_object_motion = false;
108                 features.use_camera_motion = false;
109                 features.use_baking = false;
110                 features.use_transparent = false;
111                 features.use_shadow_tricks = false;
112                 features.use_subsurface = false;
113                 features.use_volume = false;
114                 features.nodes_features &= ~NODE_FEATURE_VOLUME;
115                 features.use_denoising = false;
116                 features.use_principled = false;
117                 return features.get_build_options();
118         }
119         else if (opencl_program_name == "background") {
120                 /* Background uses Background shading
121                    It is save to disable shadow features, subsurface and volumetric. */
122                 DeviceRequestedFeatures features(requested_features);
123                 features.use_baking = false;
124                 features.use_transparent = false;
125                 features.use_shadow_tricks = false;
126                 features.use_denoising = false;
127                 /* NOTE: currently possible to use surface nodes like `Hair Info`, `Bump` node.
128                    Perhaps we should remove them in UI as it does not make any sense when
129                    rendering background. */
130                 features.nodes_features &= ~NODE_FEATURE_VOLUME;
131                 features.use_subsurface = false;
132                 features.use_volume = false;
133                 return features.get_build_options();
134         }
135
136         string build_options = "-D__SPLIT_KERNEL__ ";
137         DeviceRequestedFeatures nofeatures;
138         /* Set compute device build option. */
139         cl_device_type device_type;
140         OpenCLInfo::get_device_type(this->cdDevice, &device_type, &this->ciErr);
141         assert(this->ciErr == CL_SUCCESS);
142         if(device_type == CL_DEVICE_TYPE_GPU) {
143                 build_options += "-D__COMPUTE_DEVICE_GPU__ ";
144         }
145
146         /* Add program specific optimized compile directives */
147         if (opencl_program_name == "split_do_volume" && !requested_features.use_volume) {
148                 build_options += nofeatures.get_build_options();
149         }
150         else if (opencl_program_name == "split_subsurface_scatter" && !requested_features.use_subsurface) {
151                 /* When subsurface is off, the kernel updates indexes and does not need any
152                    Compile directives */
153                 build_options += nofeatures.get_build_options();
154         }
155         else {
156                 DeviceRequestedFeatures features(requested_features);
157
158                 /* Always turn off baking at this point. Baking is only usefull when building the bake kernel.
159                    this also makes sure that the kernels that are build during baking can be reused
160                    when not doing any baking. */
161                 features.use_baking = false;
162
163                 /* Do not vary on shaders when program doesn't do any shading.
164                    We have bundled them in a single program. */
165                 if (opencl_program_name == "split_bundle") {
166                         features.max_nodes_group = 0;
167                         features.nodes_features = 0;
168                 }
169
170                 /* No specific settings, just add the regular ones */
171                 build_options += features.get_build_options();
172         }
173
174         return build_options;
175 }
176
177 namespace {
178
179 /* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to
180  * fetch its size.
181  */
182 typedef struct KernelGlobalsDummy {
183         ccl_constant KernelData *data;
184         ccl_global char *buffers[8];
185
186 #define KERNEL_TEX(type, name) \
187         TextureInfo name;
188 #  include "kernel/kernel_textures.h"
189 #undef KERNEL_TEX
190         SplitData split_data;
191         SplitParams split_param_data;
192 } KernelGlobalsDummy;
193
194 }  // namespace
195
196
197 struct CachedSplitMemory {
198         int id;
199         device_memory *split_data;
200         device_memory *ray_state;
201         device_memory *queue_index;
202         device_memory *use_queues_flag;
203         device_memory *work_pools;
204         device_ptr *buffer;
205 };
206
207 class OpenCLSplitKernelFunction : public SplitKernelFunction {
208 public:
209         OpenCLDevice* device;
210         OpenCLDevice::OpenCLProgram program;
211         CachedSplitMemory& cached_memory;
212         int cached_id;
213
214         OpenCLSplitKernelFunction(OpenCLDevice* device, CachedSplitMemory& cached_memory) :
215                         device(device), cached_memory(cached_memory), cached_id(cached_memory.id-1)
216         {
217         }
218
219         ~OpenCLSplitKernelFunction()
220         {
221                 program.release();
222         }
223
224         virtual bool enqueue(const KernelDimensions& dim, device_memory& kg, device_memory& data)
225         {
226                 if(cached_id != cached_memory.id) {
227                         cl_uint start_arg_index =
228                                 device->kernel_set_args(program(),
229                                                         0,
230                                                         kg,
231                                                         data,
232                                                         *cached_memory.split_data,
233                                                         *cached_memory.ray_state);
234
235                                 device->set_kernel_arg_buffers(program(), &start_arg_index);
236
237                         start_arg_index +=
238                                 device->kernel_set_args(program(),
239                                                         start_arg_index,
240                                                         *cached_memory.queue_index,
241                                                         *cached_memory.use_queues_flag,
242                                                         *cached_memory.work_pools,
243                                                         *cached_memory.buffer);
244
245                         cached_id = cached_memory.id;
246                 }
247
248                 device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
249                                                        program(),
250                                                        2,
251                                                        NULL,
252                                                        dim.global_size,
253                                                        dim.local_size,
254                                                        0,
255                                                        NULL,
256                                                        NULL);
257
258                 device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
259
260                 if(device->ciErr != CL_SUCCESS) {
261                         string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
262                                                        clewErrorString(device->ciErr));
263                         device->opencl_error(message);
264                         return false;
265                 }
266
267                 return true;
268         }
269 };
270
271 class OpenCLSplitKernel : public DeviceSplitKernel {
272         OpenCLDevice *device;
273         CachedSplitMemory cached_memory;
274 public:
275         explicit OpenCLSplitKernel(OpenCLDevice *device) : DeviceSplitKernel(device), device(device) {
276         }
277
278         virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name,
279                                                                const DeviceRequestedFeatures& requested_features)
280         {
281                 OpenCLSplitKernelFunction* kernel = new OpenCLSplitKernelFunction(device, cached_memory);
282
283                 bool single_program = OpenCLInfo::use_single_program();
284                 const string program_name = device->get_opencl_program_name(single_program, kernel_name);
285                 kernel->program =
286                         OpenCLDevice::OpenCLProgram(device,
287                                                     program_name,
288                                                     device->get_opencl_program_filename(single_program, kernel_name),
289                                                     device->get_build_options(requested_features, program_name));
290
291                 kernel->program.add_kernel(ustring("path_trace_" + kernel_name));
292                 kernel->program.load();
293
294                 if(!kernel->program.is_loaded()) {
295                         delete kernel;
296                         return NULL;
297                 }
298
299                 return kernel;
300         }
301
302         virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads)
303         {
304                 device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE);
305                 size_buffer.alloc(1);
306                 size_buffer.zero_to_device();
307
308                 uint threads = num_threads;
309                 cl_kernel kernel_state_buffer_size = device->program_split(ustring("path_trace_state_buffer_size"));
310                 device->kernel_set_args(kernel_state_buffer_size, 0, kg, data, threads, size_buffer);
311
312                 size_t global_size = 64;
313                 device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
314                                                        kernel_state_buffer_size,
315                                                        1,
316                                                        NULL,
317                                                        &global_size,
318                                                        NULL,
319                                                        0,
320                                                        NULL,
321                                                        NULL);
322
323                 device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
324
325                 size_buffer.copy_from_device(0, 1, 1);
326                 size_t size = size_buffer[0];
327                 size_buffer.free();
328
329                 if(device->ciErr != CL_SUCCESS) {
330                         string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
331                                                        clewErrorString(device->ciErr));
332                         device->opencl_error(message);
333                         return 0;
334                 }
335
336                 return size;
337         }
338
339         virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
340                                                     RenderTile& rtile,
341                                                     int num_global_elements,
342                                                     device_memory& kernel_globals,
343                                                     device_memory& kernel_data,
344                                                     device_memory& split_data,
345                                                     device_memory& ray_state,
346                                                     device_memory& queue_index,
347                                                     device_memory& use_queues_flag,
348                                                     device_memory& work_pool_wgs
349                                                     )
350         {
351                 cl_int dQueue_size = dim.global_size[0] * dim.global_size[1];
352
353                 /* Set the range of samples to be processed for every ray in
354                  * path-regeneration logic.
355                  */
356                 cl_int start_sample = rtile.start_sample;
357                 cl_int end_sample = rtile.start_sample + rtile.num_samples;
358
359                 cl_kernel kernel_data_init = device->program_split(ustring("path_trace_data_init"));
360
361                 cl_uint start_arg_index =
362                         device->kernel_set_args(kernel_data_init,
363                                                 0,
364                                                 kernel_globals,
365                                                 kernel_data,
366                                                 split_data,
367                                                 num_global_elements,
368                                                 ray_state);
369
370                         device->set_kernel_arg_buffers(kernel_data_init, &start_arg_index);
371
372                 start_arg_index +=
373                         device->kernel_set_args(kernel_data_init,
374                                                 start_arg_index,
375                                                 start_sample,
376                                                 end_sample,
377                                                 rtile.x,
378                                                 rtile.y,
379                                                 rtile.w,
380                                                 rtile.h,
381                                                 rtile.offset,
382                                                 rtile.stride,
383                                                 queue_index,
384                                                 dQueue_size,
385                                                 use_queues_flag,
386                                                 work_pool_wgs,
387                                                 rtile.num_samples,
388                                                 rtile.buffer);
389
390                 /* Enqueue ckPathTraceKernel_data_init kernel. */
391                 device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
392                                                        kernel_data_init,
393                                                        2,
394                                                        NULL,
395                                                        dim.global_size,
396                                                        dim.local_size,
397                                                        0,
398                                                        NULL,
399                                                        NULL);
400
401                 device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
402
403                 if(device->ciErr != CL_SUCCESS) {
404                         string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
405                                                        clewErrorString(device->ciErr));
406                         device->opencl_error(message);
407                         return false;
408                 }
409
410                 cached_memory.split_data = &split_data;
411                 cached_memory.ray_state = &ray_state;
412                 cached_memory.queue_index = &queue_index;
413                 cached_memory.use_queues_flag = &use_queues_flag;
414                 cached_memory.work_pools = &work_pool_wgs;
415                 cached_memory.buffer = &rtile.buffer;
416                 cached_memory.id++;
417
418                 return true;
419         }
420
421         virtual int2 split_kernel_local_size()
422         {
423                 return make_int2(64, 1);
424         }
425
426         virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask * /*task*/)
427         {
428                 cl_device_type type = OpenCLInfo::get_device_type(device->cdDevice);
429                 /* Use small global size on CPU devices as it seems to be much faster. */
430                 if(type == CL_DEVICE_TYPE_CPU) {
431                         VLOG(1) << "Global size: (64, 64).";
432                         return make_int2(64, 64);
433                 }
434
435                 cl_ulong max_buffer_size;
436                 clGetDeviceInfo(device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL);
437
438                 if(DebugFlags().opencl.mem_limit) {
439                         max_buffer_size = min(max_buffer_size,
440                                               cl_ulong(DebugFlags().opencl.mem_limit - device->stats.mem_used));
441                 }
442
443                 VLOG(1) << "Maximum device allocation size: "
444                         << string_human_readable_number(max_buffer_size) << " bytes. ("
445                         << string_human_readable_size(max_buffer_size) << ").";
446
447                 /* Limit to 2gb, as we shouldn't need more than that and some devices may support much more. */
448                 max_buffer_size = min(max_buffer_size / 2, (cl_ulong)2l*1024*1024*1024);
449
450                 size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size);
451                 int2 global_size = make_int2(max(round_down((int)sqrt(num_elements), 64), 64), (int)sqrt(num_elements));
452                 VLOG(1) << "Global size: " << global_size << ".";
453                 return global_size;
454         }
455 };
456
457 bool OpenCLDevice::opencl_error(cl_int err)
458 {
459         if(err != CL_SUCCESS) {
460                 string message = string_printf("OpenCL error (%d): %s", err, clewErrorString(err));
461                 if(error_msg == "")
462                         error_msg = message;
463                 fprintf(stderr, "%s\n", message.c_str());
464                 return true;
465         }
466
467         return false;
468 }
469
470 void OpenCLDevice::opencl_error(const string& message)
471 {
472         if(error_msg == "")
473                 error_msg = message;
474         fprintf(stderr, "%s\n", message.c_str());
475 }
476
477 void OpenCLDevice::opencl_assert_err(cl_int err, const char* where)
478 {
479         if(err != CL_SUCCESS) {
480                 string message = string_printf("OpenCL error (%d): %s in %s", err, clewErrorString(err), where);
481                 if(error_msg == "")
482                         error_msg = message;
483                 fprintf(stderr, "%s\n", message.c_str());
484 #ifndef NDEBUG
485                 abort();
486 #endif
487         }
488 }
489
490 OpenCLDevice::OpenCLDevice(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background)
491 : Device(info, stats, profiler, background),
492   memory_manager(this),
493   texture_info(this, "__texture_info", MEM_TEXTURE)
494 {
495         cpPlatform = NULL;
496         cdDevice = NULL;
497         cxContext = NULL;
498         cqCommandQueue = NULL;
499         null_mem = 0;
500         device_initialized = false;
501         textures_need_update = true;
502
503         vector<OpenCLPlatformDevice> usable_devices;
504         OpenCLInfo::get_usable_devices(&usable_devices);
505         if(usable_devices.size() == 0) {
506                 opencl_error("OpenCL: no devices found.");
507                 return;
508         }
509         assert(info.num < usable_devices.size());
510         OpenCLPlatformDevice& platform_device = usable_devices[info.num];
511         device_num = info.num;
512         cpPlatform = platform_device.platform_id;
513         cdDevice = platform_device.device_id;
514         platform_name = platform_device.platform_name;
515         device_name = platform_device.device_name;
516         VLOG(2) << "Creating new Cycles device for OpenCL platform "
517                 << platform_name << ", device "
518                 << device_name << ".";
519
520         {
521                 /* try to use cached context */
522                 thread_scoped_lock cache_locker;
523                 cxContext = OpenCLCache::get_context(cpPlatform, cdDevice, cache_locker);
524
525                 if(cxContext == NULL) {
526                         /* create context properties array to specify platform */
527                         const cl_context_properties context_props[] = {
528                                 CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform,
529                                 0, 0
530                         };
531
532                         /* create context */
533                         cxContext = clCreateContext(context_props, 1, &cdDevice,
534                                 context_notify_callback, cdDevice, &ciErr);
535
536                         if(opencl_error(ciErr)) {
537                                 opencl_error("OpenCL: clCreateContext failed");
538                                 return;
539                         }
540
541                         /* cache it */
542                         OpenCLCache::store_context(cpPlatform, cdDevice, cxContext, cache_locker);
543                 }
544         }
545
546         cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr);
547         if(opencl_error(ciErr)) {
548                 opencl_error("OpenCL: Error creating command queue");
549                 return;
550         }
551
552         null_mem = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_ONLY, 1, NULL, &ciErr);
553         if(opencl_error(ciErr)) {
554                 opencl_error("OpenCL: Error creating memory buffer for NULL");
555                 return;
556         }
557
558         /* Allocate this right away so that texture_info is placed at offset 0 in the device memory buffers */
559         texture_info.resize(1);
560         memory_manager.alloc("texture_info", texture_info);
561
562         device_initialized = true;
563
564         split_kernel = new OpenCLSplitKernel(this);
565 }
566
567 OpenCLDevice::~OpenCLDevice()
568 {
569         task_pool.stop();
570
571         memory_manager.free();
572
573         if(null_mem)
574                 clReleaseMemObject(CL_MEM_PTR(null_mem));
575
576         ConstMemMap::iterator mt;
577         for(mt = const_mem_map.begin(); mt != const_mem_map.end(); mt++) {
578                 delete mt->second;
579         }
580
581         base_program.release();
582         bake_program.release();
583         displace_program.release();
584         background_program.release();
585         program_split.release();
586
587         if(cqCommandQueue)
588                 clReleaseCommandQueue(cqCommandQueue);
589         if(cxContext)
590                 clReleaseContext(cxContext);
591
592         delete split_kernel;
593 }
594
595 void CL_CALLBACK OpenCLDevice::context_notify_callback(const char *err_info,
596         const void * /*private_info*/, size_t /*cb*/, void *user_data)
597 {
598         string device_name = OpenCLInfo::get_device_name((cl_device_id)user_data);
599         fprintf(stderr, "OpenCL error (%s): %s\n", device_name.c_str(), err_info);
600 }
601
602 bool OpenCLDevice::opencl_version_check()
603 {
604         string error;
605         if(!OpenCLInfo::platform_version_check(cpPlatform, &error)) {
606                 opencl_error(error);
607                 return false;
608         }
609         if(!OpenCLInfo::device_version_check(cdDevice, &error)) {
610                 opencl_error(error);
611                 return false;
612         }
613         return true;
614 }
615
616 string OpenCLDevice::device_md5_hash(string kernel_custom_build_options)
617 {
618         MD5Hash md5;
619         char version[256], driver[256], name[256], vendor[256];
620
621         clGetPlatformInfo(cpPlatform, CL_PLATFORM_VENDOR, sizeof(vendor), &vendor, NULL);
622         clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL);
623         clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(name), &name, NULL);
624         clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(driver), &driver, NULL);
625
626         md5.append((uint8_t*)vendor, strlen(vendor));
627         md5.append((uint8_t*)version, strlen(version));
628         md5.append((uint8_t*)name, strlen(name));
629         md5.append((uint8_t*)driver, strlen(driver));
630
631         string options = kernel_build_options();
632         options += kernel_custom_build_options;
633         md5.append((uint8_t*)options.c_str(), options.size());
634
635         return md5.get_hex();
636 }
637
638 bool OpenCLDevice::load_kernels(const DeviceRequestedFeatures& requested_features)
639 {
640         VLOG(2) << "Loading kernels for platform " << platform_name
641                 << ", device " << device_name << ".";
642         /* Verify if device was initialized. */
643         if(!device_initialized) {
644                 fprintf(stderr, "OpenCL: failed to initialize device.\n");
645                 return false;
646         }
647
648         /* Verify we have right opencl version. */
649         if(!opencl_version_check())
650                 return false;
651
652         vector<OpenCLProgram*> programs;
653
654         if (requested_features.use_true_displacement) {
655                 displace_program = OpenCLProgram(this, "displace", "kernel_displace.cl", get_build_options(requested_features, "displace"));
656                 displace_program.add_kernel(ustring("displace"));
657                 programs.push_back(&displace_program);
658         }
659
660         if (requested_features.use_background_light) {
661                 background_program = OpenCLProgram(this, "background", "kernel_background.cl", get_build_options(requested_features, "background"));
662                 background_program.add_kernel(ustring("background"));
663                 programs.push_back(&background_program);
664         }
665
666         bool single_program = OpenCLInfo::use_single_program();
667
668 #define ADD_SPLIT_KERNEL_SINGLE_PROGRAM(kernel_name) program_split.add_kernel(ustring("path_trace_"#kernel_name));
669 #define ADD_SPLIT_KERNEL_SPLIT_PROGRAM(kernel_name) \
670                 const string program_name_##kernel_name = "split_"#kernel_name; \
671                 program_##kernel_name = \
672                         OpenCLDevice::OpenCLProgram(this, \
673                                                     program_name_##kernel_name, \
674                                                     "kernel_"#kernel_name".cl", \
675                                                     get_build_options(requested_features, program_name_##kernel_name)); \
676                 program_##kernel_name.add_kernel(ustring("path_trace_"#kernel_name)); \
677                 programs.push_back(&program_##kernel_name);
678
679         if (single_program) {
680                 program_split = OpenCLDevice::OpenCLProgram(this,
681                                                             "split" ,
682                                                             "kernel_split.cl",
683                                                             get_build_options(requested_features, "split"));
684
685                 ADD_SPLIT_KERNEL_SINGLE_PROGRAM(state_buffer_size);
686                 ADD_SPLIT_KERNEL_SINGLE_PROGRAM(data_init);
687                 ADD_SPLIT_KERNEL_SINGLE_PROGRAM(path_init);
688                 ADD_SPLIT_KERNEL_SINGLE_PROGRAM(scene_intersect);
689                 ADD_SPLIT_KERNEL_SINGLE_PROGRAM(lamp_emission);
690                 ADD_SPLIT_KERNEL_SINGLE_PROGRAM(do_volume);
691                 ADD_SPLIT_KERNEL_SINGLE_PROGRAM(queue_enqueue);
692                 ADD_SPLIT_KERNEL_SINGLE_PROGRAM(indirect_background);
693                 ADD_SPLIT_KERNEL_SINGLE_PROGRAM(shader_setup);
694                 ADD_SPLIT_KERNEL_SINGLE_PROGRAM(shader_sort);
695                 ADD_SPLIT_KERNEL_SINGLE_PROGRAM(shader_eval);
696                 ADD_SPLIT_KERNEL_SINGLE_PROGRAM(holdout_emission_blurring_pathtermination_ao);
697                 ADD_SPLIT_KERNEL_SINGLE_PROGRAM(subsurface_scatter);
698                 ADD_SPLIT_KERNEL_SINGLE_PROGRAM(direct_lighting);
699                 ADD_SPLIT_KERNEL_SINGLE_PROGRAM(shadow_blocked_ao);
700                 ADD_SPLIT_KERNEL_SINGLE_PROGRAM(shadow_blocked_dl);
701                 ADD_SPLIT_KERNEL_SINGLE_PROGRAM(enqueue_inactive);
702                 ADD_SPLIT_KERNEL_SINGLE_PROGRAM(next_iteration_setup);
703                 ADD_SPLIT_KERNEL_SINGLE_PROGRAM(indirect_subsurface);
704                 ADD_SPLIT_KERNEL_SINGLE_PROGRAM(buffer_update);
705
706                 programs.push_back(&program_split);
707         }
708         else {
709                 /* Ordered with most complex kernels first, to reduce overall compile time. */
710                 ADD_SPLIT_KERNEL_SPLIT_PROGRAM(subsurface_scatter);
711                 if (requested_features.use_volume) {
712                         ADD_SPLIT_KERNEL_SPLIT_PROGRAM(do_volume);
713                 }
714                 ADD_SPLIT_KERNEL_SPLIT_PROGRAM(shadow_blocked_dl);
715                 ADD_SPLIT_KERNEL_SPLIT_PROGRAM(shadow_blocked_ao);
716                 ADD_SPLIT_KERNEL_SPLIT_PROGRAM(holdout_emission_blurring_pathtermination_ao);
717                 ADD_SPLIT_KERNEL_SPLIT_PROGRAM(lamp_emission);
718                 ADD_SPLIT_KERNEL_SPLIT_PROGRAM(direct_lighting);
719                 ADD_SPLIT_KERNEL_SPLIT_PROGRAM(indirect_background);
720                 ADD_SPLIT_KERNEL_SPLIT_PROGRAM(shader_eval);
721
722                 /* Quick kernels bundled in a single program to reduce overhead of starting
723                         * Blender processes. */
724                 program_split = OpenCLDevice::OpenCLProgram(this,
725                                                             "split_bundle" ,
726                                                             "kernel_split_bundle.cl",
727                                                             get_build_options(requested_features, "split_bundle"));
728
729                 ADD_SPLIT_KERNEL_SINGLE_PROGRAM(data_init);
730                 ADD_SPLIT_KERNEL_SINGLE_PROGRAM(state_buffer_size);
731                 ADD_SPLIT_KERNEL_SINGLE_PROGRAM(path_init);
732                 ADD_SPLIT_KERNEL_SINGLE_PROGRAM(scene_intersect);
733                 ADD_SPLIT_KERNEL_SINGLE_PROGRAM(queue_enqueue);
734                 ADD_SPLIT_KERNEL_SINGLE_PROGRAM(shader_setup);
735                 ADD_SPLIT_KERNEL_SINGLE_PROGRAM(shader_sort);
736                 ADD_SPLIT_KERNEL_SINGLE_PROGRAM(enqueue_inactive);
737                 ADD_SPLIT_KERNEL_SINGLE_PROGRAM(next_iteration_setup);
738                 ADD_SPLIT_KERNEL_SINGLE_PROGRAM(indirect_subsurface);
739                 ADD_SPLIT_KERNEL_SINGLE_PROGRAM(buffer_update);
740                 programs.push_back(&program_split);
741         }
742 #undef ADD_SPLIT_KERNEL_SPLIT_PROGRAM
743 #undef ADD_SPLIT_KERNEL_SINGLE_PROGRAM
744
745         base_program = OpenCLProgram(this, "base", "kernel_base.cl", get_build_options(requested_features, "base"));
746         base_program.add_kernel(ustring("convert_to_byte"));
747         base_program.add_kernel(ustring("convert_to_half_float"));
748         base_program.add_kernel(ustring("zero_buffer"));
749         programs.push_back(&base_program);
750
751         if (requested_features.use_baking) {
752                 bake_program = OpenCLProgram(this, "bake", "kernel_bake.cl", get_build_options(requested_features, "bake"));
753                 bake_program.add_kernel(ustring("bake"));
754                 programs.push_back(&bake_program);
755         }
756
757         denoising_program = OpenCLProgram(this, "denoising", "filter.cl", get_build_options(requested_features, "denoising"));
758         denoising_program.add_kernel(ustring("filter_divide_shadow"));
759         denoising_program.add_kernel(ustring("filter_get_feature"));
760         denoising_program.add_kernel(ustring("filter_write_feature"));
761         denoising_program.add_kernel(ustring("filter_detect_outliers"));
762         denoising_program.add_kernel(ustring("filter_combine_halves"));
763         denoising_program.add_kernel(ustring("filter_construct_transform"));
764         denoising_program.add_kernel(ustring("filter_nlm_calc_difference"));
765         denoising_program.add_kernel(ustring("filter_nlm_blur"));
766         denoising_program.add_kernel(ustring("filter_nlm_calc_weight"));
767         denoising_program.add_kernel(ustring("filter_nlm_update_output"));
768         denoising_program.add_kernel(ustring("filter_nlm_normalize"));
769         denoising_program.add_kernel(ustring("filter_nlm_construct_gramian"));
770         denoising_program.add_kernel(ustring("filter_finalize"));
771         programs.push_back(&denoising_program);
772
773         /* Parallel compilation of Cycles kernels, this launches multiple
774          * processes to workaround OpenCL frameworks serializing the calls
775          * internally within a single process. */
776         TaskPool task_pool;
777         foreach(OpenCLProgram *program, programs) {
778                 task_pool.push(function_bind(&OpenCLProgram::load, program));
779         }
780         task_pool.wait_work();
781
782         foreach(OpenCLProgram *program, programs) {
783                 VLOG(2) << program->get_log();
784                 if(!program->is_loaded()) {
785                         program->report_error();
786                         return false;
787                 }
788         }
789
790         return split_kernel->load_kernels(requested_features);
791 }
792
793 void OpenCLDevice::mem_alloc(device_memory& mem)
794 {
795         if(mem.name) {
796                 VLOG(1) << "Buffer allocate: " << mem.name << ", "
797                         << string_human_readable_number(mem.memory_size()) << " bytes. ("
798                         << string_human_readable_size(mem.memory_size()) << ")";
799         }
800
801         size_t size = mem.memory_size();
802
803         /* check there is enough memory available for the allocation */
804         cl_ulong max_alloc_size = 0;
805         clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_alloc_size, NULL);
806
807         if(DebugFlags().opencl.mem_limit) {
808                 max_alloc_size = min(max_alloc_size,
809                                      cl_ulong(DebugFlags().opencl.mem_limit - stats.mem_used));
810         }
811
812         if(size > max_alloc_size) {
813                 string error = "Scene too complex to fit in available memory.";
814                 if(mem.name != NULL) {
815                         error += string_printf(" (allocating buffer %s failed.)", mem.name);
816                 }
817                 set_error(error);
818
819                 return;
820         }
821
822         cl_mem_flags mem_flag;
823         void *mem_ptr = NULL;
824
825         if(mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE)
826                 mem_flag = CL_MEM_READ_ONLY;
827         else
828                 mem_flag = CL_MEM_READ_WRITE;
829
830         /* Zero-size allocation might be invoked by render, but not really
831          * supported by OpenCL. Using NULL as device pointer also doesn't really
832          * work for some reason, so for the time being we'll use special case
833          * will null_mem buffer.
834          */
835         if(size != 0) {
836                 mem.device_pointer = (device_ptr)clCreateBuffer(cxContext,
837                                                                 mem_flag,
838                                                                 size,
839                                                                 mem_ptr,
840                                                                 &ciErr);
841                 opencl_assert_err(ciErr, "clCreateBuffer");
842         }
843         else {
844                 mem.device_pointer = null_mem;
845         }
846
847         stats.mem_alloc(size);
848         mem.device_size = size;
849 }
850
851 void OpenCLDevice::mem_copy_to(device_memory& mem)
852 {
853         if(mem.type == MEM_TEXTURE) {
854                 tex_free(mem);
855                 tex_alloc(mem);
856         }
857         else {
858                 if(!mem.device_pointer) {
859                         mem_alloc(mem);
860                 }
861
862                 /* this is blocking */
863                 size_t size = mem.memory_size();
864                 if(size != 0) {
865                         opencl_assert(clEnqueueWriteBuffer(cqCommandQueue,
866                                                            CL_MEM_PTR(mem.device_pointer),
867                                                            CL_TRUE,
868                                                            0,
869                                                            size,
870                                                            mem.host_pointer,
871                                                            0,
872                                                            NULL, NULL));
873                 }
874         }
875 }
876
877 void OpenCLDevice::mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
878 {
879         size_t offset = elem*y*w;
880         size_t size = elem*w*h;
881         assert(size != 0);
882         opencl_assert(clEnqueueReadBuffer(cqCommandQueue,
883                                           CL_MEM_PTR(mem.device_pointer),
884                                           CL_TRUE,
885                                           offset,
886                                           size,
887                                           (uchar*)mem.host_pointer + offset,
888                                           0,
889                                           NULL, NULL));
890 }
891
892 void OpenCLDevice::mem_zero_kernel(device_ptr mem, size_t size)
893 {
894         cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer"));
895
896         size_t global_size[] = {1024, 1024};
897         size_t num_threads = global_size[0] * global_size[1];
898
899         cl_mem d_buffer = CL_MEM_PTR(mem);
900         cl_ulong d_offset = 0;
901         cl_ulong d_size = 0;
902
903         while(d_offset < size) {
904                 d_size = std::min<cl_ulong>(num_threads*sizeof(float4), size - d_offset);
905
906                 kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset);
907
908                 ciErr = clEnqueueNDRangeKernel(cqCommandQueue,
909                                                ckZeroBuffer,
910                                                2,
911                                                NULL,
912                                                global_size,
913                                                NULL,
914                                                0,
915                                                NULL,
916                                                NULL);
917                 opencl_assert_err(ciErr, "clEnqueueNDRangeKernel");
918
919                 d_offset += d_size;
920         }
921 }
922
923 void OpenCLDevice::mem_zero(device_memory& mem)
924 {
925         if(!mem.device_pointer) {
926                 mem_alloc(mem);
927         }
928
929         if(mem.device_pointer) {
930                 if(base_program.is_loaded()) {
931                         mem_zero_kernel(mem.device_pointer, mem.memory_size());
932                 }
933
934                 if(mem.host_pointer) {
935                         memset(mem.host_pointer, 0, mem.memory_size());
936                 }
937
938                 if(!base_program.is_loaded()) {
939                         void* zero = mem.host_pointer;
940
941                         if(!mem.host_pointer) {
942                                 zero = util_aligned_malloc(mem.memory_size(), 16);
943                                 memset(zero, 0, mem.memory_size());
944                         }
945
946                         opencl_assert(clEnqueueWriteBuffer(cqCommandQueue,
947                                                            CL_MEM_PTR(mem.device_pointer),
948                                                            CL_TRUE,
949                                                            0,
950                                                            mem.memory_size(),
951                                                            zero,
952                                                            0,
953                                                            NULL, NULL));
954
955                         if(!mem.host_pointer) {
956                                 util_aligned_free(zero);
957                         }
958                 }
959         }
960 }
961
962 void OpenCLDevice::mem_free(device_memory& mem)
963 {
964         if(mem.type == MEM_TEXTURE) {
965                 tex_free(mem);
966         }
967         else {
968                 if(mem.device_pointer) {
969                         if(mem.device_pointer != null_mem) {
970                                 opencl_assert(clReleaseMemObject(CL_MEM_PTR(mem.device_pointer)));
971                         }
972                         mem.device_pointer = 0;
973
974                         stats.mem_free(mem.device_size);
975                         mem.device_size = 0;
976                 }
977         }
978 }
979
980 int OpenCLDevice::mem_sub_ptr_alignment()
981 {
982         return OpenCLInfo::mem_sub_ptr_alignment(cdDevice);
983 }
984
985 device_ptr OpenCLDevice::mem_alloc_sub_ptr(device_memory& mem, int offset, int size)
986 {
987         cl_mem_flags mem_flag;
988         if(mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE)
989                 mem_flag = CL_MEM_READ_ONLY;
990         else
991                 mem_flag = CL_MEM_READ_WRITE;
992
993         cl_buffer_region info;
994         info.origin = mem.memory_elements_size(offset);
995         info.size = mem.memory_elements_size(size);
996
997         device_ptr sub_buf = (device_ptr) clCreateSubBuffer(CL_MEM_PTR(mem.device_pointer),
998                                                             mem_flag,
999                                                             CL_BUFFER_CREATE_TYPE_REGION,
1000                                                             &info,
1001                                                             &ciErr);
1002         opencl_assert_err(ciErr, "clCreateSubBuffer");
1003         return sub_buf;
1004 }
1005
1006 void OpenCLDevice::mem_free_sub_ptr(device_ptr device_pointer)
1007 {
1008         if(device_pointer && device_pointer != null_mem) {
1009                 opencl_assert(clReleaseMemObject(CL_MEM_PTR(device_pointer)));
1010         }
1011 }
1012
1013 void OpenCLDevice::const_copy_to(const char *name, void *host, size_t size)
1014 {
1015         ConstMemMap::iterator i = const_mem_map.find(name);
1016         device_vector<uchar> *data;
1017
1018         if(i == const_mem_map.end()) {
1019                 data = new device_vector<uchar>(this, name, MEM_READ_ONLY);
1020                 data->alloc(size);
1021                 const_mem_map.insert(ConstMemMap::value_type(name, data));
1022         }
1023         else {
1024                 data = i->second;
1025         }
1026
1027         memcpy(data->data(), host, size);
1028         data->copy_to_device();
1029 }
1030
1031 void OpenCLDevice::tex_alloc(device_memory& mem)
1032 {
1033         VLOG(1) << "Texture allocate: " << mem.name << ", "
1034                 << string_human_readable_number(mem.memory_size()) << " bytes. ("
1035                 << string_human_readable_size(mem.memory_size()) << ")";
1036
1037         memory_manager.alloc(mem.name, mem);
1038         /* Set the pointer to non-null to keep code that inspects its value from thinking its unallocated. */
1039         mem.device_pointer = 1;
1040         textures[mem.name] = &mem;
1041         textures_need_update = true;
1042 }
1043
1044 void OpenCLDevice::tex_free(device_memory& mem)
1045 {
1046         if(mem.device_pointer) {
1047                 mem.device_pointer = 0;
1048
1049                 if(memory_manager.free(mem)) {
1050                         textures_need_update = true;
1051                 }
1052
1053                 foreach(TexturesMap::value_type& value, textures) {
1054                         if(value.second == &mem) {
1055                                 textures.erase(value.first);
1056                                 break;
1057                         }
1058                 }
1059         }
1060 }
1061
1062 size_t OpenCLDevice::global_size_round_up(int group_size, int global_size)
1063 {
1064         int r = global_size % group_size;
1065         return global_size + ((r == 0)? 0: group_size - r);
1066 }
1067
1068 void OpenCLDevice::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, bool x_workgroups, size_t max_workgroup_size)
1069 {
1070         size_t workgroup_size, max_work_items[3];
1071
1072         clGetKernelWorkGroupInfo(kernel, cdDevice,
1073                 CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL);
1074         clGetDeviceInfo(cdDevice,
1075                 CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, max_work_items, NULL);
1076
1077         if(max_workgroup_size > 0 && workgroup_size > max_workgroup_size) {
1078                 workgroup_size = max_workgroup_size;
1079         }
1080
1081         /* Try to divide evenly over 2 dimensions. */
1082         size_t local_size[2];
1083         if(x_workgroups) {
1084                 local_size[0] = workgroup_size;
1085                 local_size[1] = 1;
1086         }
1087         else {
1088                 size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1);
1089                 local_size[0] = local_size[1] = sqrt_workgroup_size;
1090         }
1091
1092         /* Some implementations have max size 1 on 2nd dimension. */
1093         if(local_size[1] > max_work_items[1]) {
1094                 local_size[0] = workgroup_size/max_work_items[1];
1095                 local_size[1] = max_work_items[1];
1096         }
1097
1098         size_t global_size[2] = {global_size_round_up(local_size[0], w),
1099                                  global_size_round_up(local_size[1], h)};
1100
1101         /* Vertical size of 1 is coming from bake/shade kernels where we should
1102          * not round anything up because otherwise we'll either be doing too
1103          * much work per pixel (if we don't check global ID on Y axis) or will
1104          * be checking for global ID to always have Y of 0.
1105          */
1106         if(h == 1) {
1107                 global_size[h] = 1;
1108         }
1109
1110         /* run kernel */
1111         opencl_assert(clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL));
1112         opencl_assert(clFlush(cqCommandQueue));
1113 }
1114
1115 void OpenCLDevice::set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name)
1116 {
1117         cl_mem ptr;
1118
1119         MemMap::iterator i = mem_map.find(name);
1120         if(i != mem_map.end()) {
1121                 ptr = CL_MEM_PTR(i->second);
1122         }
1123         else {
1124                 /* work around NULL not working, even though the spec says otherwise */
1125                 ptr = CL_MEM_PTR(null_mem);
1126         }
1127
1128         opencl_assert(clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void*)&ptr));
1129 }
1130
1131 void OpenCLDevice::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg)
1132 {
1133         flush_texture_buffers();
1134
1135         memory_manager.set_kernel_arg_buffers(kernel, narg);
1136 }
1137
1138 void OpenCLDevice::flush_texture_buffers()
1139 {
1140         if(!textures_need_update) {
1141                 return;
1142         }
1143         textures_need_update = false;
1144
1145         /* Setup slots for textures. */
1146         int num_slots = 0;
1147
1148         vector<texture_slot_t> texture_slots;
1149
1150 #define KERNEL_TEX(type, name) \
1151         if(textures.find(#name) != textures.end()) { \
1152                 texture_slots.push_back(texture_slot_t(#name, num_slots)); \
1153         } \
1154         num_slots++;
1155 #include "kernel/kernel_textures.h"
1156
1157         int num_data_slots = num_slots;
1158
1159         foreach(TexturesMap::value_type& tex, textures) {
1160                 string name = tex.first;
1161
1162                 if(string_startswith(name, "__tex_image")) {
1163                         int pos = name.rfind("_");
1164                         int id = atoi(name.data() + pos + 1);
1165                         texture_slots.push_back(texture_slot_t(name,
1166                                                                num_data_slots + id));
1167                         num_slots = max(num_slots, num_data_slots + id + 1);
1168                 }
1169         }
1170
1171         /* Realloc texture descriptors buffer. */
1172         memory_manager.free(texture_info);
1173         texture_info.resize(num_slots);
1174         memory_manager.alloc("texture_info", texture_info);
1175
1176         /* Fill in descriptors */
1177         foreach(texture_slot_t& slot, texture_slots) {
1178                 TextureInfo& info = texture_info[slot.slot];
1179
1180                 MemoryManager::BufferDescriptor desc = memory_manager.get_descriptor(slot.name);
1181                 info.data = desc.offset;
1182                 info.cl_buffer = desc.device_buffer;
1183
1184                 if(string_startswith(slot.name, "__tex_image")) {
1185                         device_memory *mem = textures[slot.name];
1186
1187                         info.width = mem->data_width;
1188                         info.height = mem->data_height;
1189                         info.depth = mem->data_depth;
1190
1191                         info.interpolation = mem->interpolation;
1192                         info.extension = mem->extension;
1193                 }
1194         }
1195
1196         /* Force write of descriptors. */
1197         memory_manager.free(texture_info);
1198         memory_manager.alloc("texture_info", texture_info);
1199 }
1200
1201
1202 void OpenCLDevice::thread_run(DeviceTask *task)
1203 {
1204         flush_texture_buffers();
1205
1206         if(task->type == DeviceTask::FILM_CONVERT) {
1207                 film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half);
1208         }
1209         else if(task->type == DeviceTask::SHADER) {
1210                 shader(*task);
1211         }
1212         else if(task->type == DeviceTask::RENDER) {
1213                 RenderTile tile;
1214                 DenoisingTask denoising(this, *task);
1215
1216                 /* Allocate buffer for kernel globals */
1217                 device_only_memory<KernelGlobalsDummy> kgbuffer(this, "kernel_globals");
1218                 kgbuffer.alloc_to_device(1);
1219
1220                 /* Keep rendering tiles until done. */
1221                 while(task->acquire_tile(this, tile)) {
1222                         if(tile.task == RenderTile::PATH_TRACE) {
1223                                 assert(tile.task == RenderTile::PATH_TRACE);
1224                                 scoped_timer timer(&tile.buffers->render_time);
1225
1226                                 split_kernel->path_trace(task,
1227                                                          tile,
1228                                                          kgbuffer,
1229                                                          *const_mem_map["__data"]);
1230
1231                                 /* Complete kernel execution before release tile. */
1232                                 /* This helps in multi-device render;
1233                                         * The device that reaches the critical-section function
1234                                         * release_tile waits (stalling other devices from entering
1235                                         * release_tile) for all kernels to complete. If device1 (a
1236                                         * slow-render device) reaches release_tile first then it would
1237                                         * stall device2 (a fast-render device) from proceeding to render
1238                                         * next tile.
1239                                         */
1240                                 clFinish(cqCommandQueue);
1241                         }
1242                         else if(tile.task == RenderTile::DENOISE) {
1243                                 tile.sample = tile.start_sample + tile.num_samples;
1244                                 denoise(tile, denoising);
1245                                 task->update_progress(&tile, tile.w*tile.h);
1246                         }
1247
1248                         task->release_tile(tile);
1249                 }
1250
1251                 kgbuffer.free();
1252         }
1253 }
1254
1255 void OpenCLDevice::film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half)
1256 {
1257         /* cast arguments to cl types */
1258         cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
1259         cl_mem d_rgba = (rgba_byte)? CL_MEM_PTR(rgba_byte): CL_MEM_PTR(rgba_half);
1260         cl_mem d_buffer = CL_MEM_PTR(buffer);
1261         cl_int d_x = task.x;
1262         cl_int d_y = task.y;
1263         cl_int d_w = task.w;
1264         cl_int d_h = task.h;
1265         cl_float d_sample_scale = 1.0f/(task.sample + 1);
1266         cl_int d_offset = task.offset;
1267         cl_int d_stride = task.stride;
1268
1269
1270         cl_kernel ckFilmConvertKernel = (rgba_byte)? base_program(ustring("convert_to_byte")): base_program(ustring("convert_to_half_float"));
1271
1272         cl_uint start_arg_index =
1273                 kernel_set_args(ckFilmConvertKernel,
1274                                 0,
1275                                 d_data,
1276                                 d_rgba,
1277                                 d_buffer);
1278
1279         set_kernel_arg_buffers(ckFilmConvertKernel, &start_arg_index);
1280
1281         start_arg_index += kernel_set_args(ckFilmConvertKernel,
1282                                            start_arg_index,
1283                                            d_sample_scale,
1284                                            d_x,
1285                                            d_y,
1286                                            d_w,
1287                                            d_h,
1288                                            d_offset,
1289                                            d_stride);
1290
1291         enqueue_kernel(ckFilmConvertKernel, d_w, d_h);
1292 }
1293
1294 bool OpenCLDevice::denoising_non_local_means(device_ptr image_ptr,
1295                                              device_ptr guide_ptr,
1296                                              device_ptr variance_ptr,
1297                                              device_ptr out_ptr,
1298                                              DenoisingTask *task)
1299 {
1300         int stride = task->buffer.stride;
1301         int w = task->buffer.width;
1302         int h = task->buffer.h;
1303         int r = task->nlm_state.r;
1304         int f = task->nlm_state.f;
1305         float a = task->nlm_state.a;
1306         float k_2 = task->nlm_state.k_2;
1307
1308         int pass_stride = task->buffer.pass_stride;
1309         int num_shifts = (2*r+1)*(2*r+1);
1310         int channel_offset = task->nlm_state.is_color? task->buffer.pass_stride : 0;
1311
1312         device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride*num_shifts);
1313         device_sub_ptr blurDifference(task->buffer.temporary_mem, pass_stride*num_shifts, pass_stride*num_shifts);
1314         device_sub_ptr weightAccum(task->buffer.temporary_mem, 2*pass_stride*num_shifts, pass_stride);
1315         cl_mem weightAccum_mem = CL_MEM_PTR(*weightAccum);
1316         cl_mem difference_mem = CL_MEM_PTR(*difference);
1317         cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference);
1318
1319         cl_mem image_mem = CL_MEM_PTR(image_ptr);
1320         cl_mem guide_mem = CL_MEM_PTR(guide_ptr);
1321         cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
1322         cl_mem out_mem = CL_MEM_PTR(out_ptr);
1323         cl_mem scale_mem = NULL;
1324
1325         mem_zero_kernel(*weightAccum, sizeof(float)*pass_stride);
1326         mem_zero_kernel(out_ptr, sizeof(float)*pass_stride);
1327
1328         cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference"));
1329         cl_kernel ckNLMBlur           = denoising_program(ustring("filter_nlm_blur"));
1330         cl_kernel ckNLMCalcWeight     = denoising_program(ustring("filter_nlm_calc_weight"));
1331         cl_kernel ckNLMUpdateOutput   = denoising_program(ustring("filter_nlm_update_output"));
1332         cl_kernel ckNLMNormalize      = denoising_program(ustring("filter_nlm_normalize"));
1333
1334         kernel_set_args(ckNLMCalcDifference, 0,
1335                         guide_mem,
1336                         variance_mem,
1337                         scale_mem,
1338                         difference_mem,
1339                         w, h, stride,
1340                         pass_stride,
1341                         r, channel_offset,
1342                         0, a, k_2);
1343         kernel_set_args(ckNLMBlur, 0,
1344                         difference_mem,
1345                         blurDifference_mem,
1346                         w, h, stride,
1347                         pass_stride,
1348                         r, f);
1349         kernel_set_args(ckNLMCalcWeight, 0,
1350                         blurDifference_mem,
1351                         difference_mem,
1352                         w, h, stride,
1353                         pass_stride,
1354                         r, f);
1355         kernel_set_args(ckNLMUpdateOutput, 0,
1356                         blurDifference_mem,
1357                         image_mem,
1358                         out_mem,
1359                         weightAccum_mem,
1360                         w, h, stride,
1361                         pass_stride,
1362                         channel_offset,
1363                         r, f);
1364
1365         enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true);
1366         enqueue_kernel(ckNLMBlur,           w*h, num_shifts, true);
1367         enqueue_kernel(ckNLMCalcWeight,     w*h, num_shifts, true);
1368         enqueue_kernel(ckNLMBlur,           w*h, num_shifts, true);
1369         enqueue_kernel(ckNLMUpdateOutput,   w*h, num_shifts, true);
1370
1371         kernel_set_args(ckNLMNormalize, 0,
1372                         out_mem, weightAccum_mem, w, h, stride);
1373         enqueue_kernel(ckNLMNormalize, w, h);
1374
1375         return true;
1376 }
1377
1378 bool OpenCLDevice::denoising_construct_transform(DenoisingTask *task)
1379 {
1380         cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
1381         cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
1382         cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
1383         cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer);
1384
1385         char use_time = task->buffer.use_time? 1 : 0;
1386
1387         cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform"));
1388
1389         int arg_ofs = kernel_set_args(ckFilterConstructTransform, 0,
1390                                       buffer_mem,
1391                                       tile_info_mem);
1392         cl_mem buffers[9];
1393         for(int i = 0; i < 9; i++) {
1394                 buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]);
1395                 arg_ofs += kernel_set_args(ckFilterConstructTransform,
1396                                            arg_ofs,
1397                                            buffers[i]);
1398         }
1399         kernel_set_args(ckFilterConstructTransform,
1400                         arg_ofs,
1401                         transform_mem,
1402                         rank_mem,
1403                         task->filter_area,
1404                         task->rect,
1405                         task->buffer.pass_stride,
1406                         task->buffer.frame_stride,
1407                         use_time,
1408                         task->radius,
1409                         task->pca_threshold);
1410
1411         enqueue_kernel(ckFilterConstructTransform,
1412                        task->storage.w,
1413                        task->storage.h,
1414                        256);
1415
1416         return true;
1417 }
1418
1419 bool OpenCLDevice::denoising_accumulate(device_ptr color_ptr,
1420                                         device_ptr color_variance_ptr,
1421                                         device_ptr scale_ptr,
1422                                         int frame,
1423                                         DenoisingTask *task)
1424 {
1425         cl_mem color_mem = CL_MEM_PTR(color_ptr);
1426         cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr);
1427         cl_mem scale_mem = CL_MEM_PTR(scale_ptr);
1428
1429         cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
1430         cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
1431         cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
1432         cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer);
1433         cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer);
1434
1435         cl_kernel ckNLMCalcDifference   = denoising_program(ustring("filter_nlm_calc_difference"));
1436         cl_kernel ckNLMBlur             = denoising_program(ustring("filter_nlm_blur"));
1437         cl_kernel ckNLMCalcWeight       = denoising_program(ustring("filter_nlm_calc_weight"));
1438         cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian"));
1439
1440         int w = task->reconstruction_state.source_w;
1441         int h = task->reconstruction_state.source_h;
1442         int stride = task->buffer.stride;
1443         int frame_offset = frame * task->buffer.frame_stride;
1444         int t = task->tile_info->frames[frame];
1445         char use_time = task->buffer.use_time? 1 : 0;
1446
1447         int r = task->radius;
1448         int pass_stride = task->buffer.pass_stride;
1449         int num_shifts = (2*r+1)*(2*r+1);
1450
1451         device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride*num_shifts);
1452         device_sub_ptr blurDifference(task->buffer.temporary_mem, pass_stride*num_shifts, pass_stride*num_shifts);
1453         cl_mem difference_mem = CL_MEM_PTR(*difference);
1454         cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference);
1455
1456         kernel_set_args(ckNLMCalcDifference, 0,
1457                         color_mem,
1458                         color_variance_mem,
1459                         scale_mem,
1460                         difference_mem,
1461                         w, h, stride,
1462                         pass_stride,
1463                         r,
1464                         pass_stride,
1465                         frame_offset,
1466                         1.0f, task->nlm_k_2);
1467         kernel_set_args(ckNLMBlur, 0,
1468                         difference_mem,
1469                         blurDifference_mem,
1470                         w, h, stride,
1471                         pass_stride,
1472                         r, 4);
1473         kernel_set_args(ckNLMCalcWeight, 0,
1474                         blurDifference_mem,
1475                         difference_mem,
1476                         w, h, stride,
1477                         pass_stride,
1478                         r, 4);
1479         kernel_set_args(ckNLMConstructGramian, 0,
1480                         t,
1481                         blurDifference_mem,
1482                         buffer_mem,
1483                         transform_mem,
1484                         rank_mem,
1485                         XtWX_mem,
1486                         XtWY_mem,
1487                         task->reconstruction_state.filter_window,
1488                         w, h, stride,
1489                         pass_stride,
1490                         r, 4,
1491                         frame_offset,
1492                         use_time);
1493
1494         enqueue_kernel(ckNLMCalcDifference,   w*h, num_shifts, true);
1495         enqueue_kernel(ckNLMBlur,             w*h, num_shifts, true);
1496         enqueue_kernel(ckNLMCalcWeight,       w*h, num_shifts, true);
1497         enqueue_kernel(ckNLMBlur,             w*h, num_shifts, true);
1498         enqueue_kernel(ckNLMConstructGramian, w*h, num_shifts, true, 256);
1499
1500         return true;
1501 }
1502
1503 bool OpenCLDevice::denoising_solve(device_ptr output_ptr,
1504                                    DenoisingTask *task)
1505 {
1506         cl_kernel ckFinalize = denoising_program(ustring("filter_finalize"));
1507
1508         cl_mem output_mem = CL_MEM_PTR(output_ptr);
1509         cl_mem rank_mem   = CL_MEM_PTR(task->storage.rank.device_pointer);
1510         cl_mem XtWX_mem   = CL_MEM_PTR(task->storage.XtWX.device_pointer);
1511         cl_mem XtWY_mem   = CL_MEM_PTR(task->storage.XtWY.device_pointer);
1512
1513         int w = task->reconstruction_state.source_w;
1514         int h = task->reconstruction_state.source_h;
1515
1516         kernel_set_args(ckFinalize, 0,
1517                         output_mem,
1518                         rank_mem,
1519                         XtWX_mem,
1520                         XtWY_mem,
1521                         task->filter_area,
1522                         task->reconstruction_state.buffer_params,
1523                         task->render_buffer.samples);
1524         enqueue_kernel(ckFinalize, w, h);
1525
1526         return true;
1527 }
1528
1529 bool OpenCLDevice::denoising_combine_halves(device_ptr a_ptr,
1530                                             device_ptr b_ptr,
1531                                             device_ptr mean_ptr,
1532                                             device_ptr variance_ptr,
1533                                             int r, int4 rect,
1534                                             DenoisingTask *task)
1535 {
1536         cl_mem a_mem = CL_MEM_PTR(a_ptr);
1537         cl_mem b_mem = CL_MEM_PTR(b_ptr);
1538         cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
1539         cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
1540
1541         cl_kernel ckFilterCombineHalves = denoising_program(ustring("filter_combine_halves"));
1542
1543         kernel_set_args(ckFilterCombineHalves, 0,
1544                         mean_mem,
1545                         variance_mem,
1546                         a_mem,
1547                         b_mem,
1548                         rect,
1549                         r);
1550         enqueue_kernel(ckFilterCombineHalves,
1551                        task->rect.z-task->rect.x,
1552                        task->rect.w-task->rect.y);
1553
1554         return true;
1555 }
1556
1557 bool OpenCLDevice::denoising_divide_shadow(device_ptr a_ptr,
1558                                            device_ptr b_ptr,
1559                                            device_ptr sample_variance_ptr,
1560                                            device_ptr sv_variance_ptr,
1561                                            device_ptr buffer_variance_ptr,
1562                                            DenoisingTask *task)
1563 {
1564         cl_mem a_mem = CL_MEM_PTR(a_ptr);
1565         cl_mem b_mem = CL_MEM_PTR(b_ptr);
1566         cl_mem sample_variance_mem = CL_MEM_PTR(sample_variance_ptr);
1567         cl_mem sv_variance_mem = CL_MEM_PTR(sv_variance_ptr);
1568         cl_mem buffer_variance_mem = CL_MEM_PTR(buffer_variance_ptr);
1569
1570         cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer);
1571
1572         cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow"));
1573
1574         int arg_ofs = kernel_set_args(ckFilterDivideShadow, 0,
1575                                       task->render_buffer.samples,
1576                                       tile_info_mem);
1577         cl_mem buffers[9];
1578         for(int i = 0; i < 9; i++) {
1579                 buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]);
1580                 arg_ofs += kernel_set_args(ckFilterDivideShadow, arg_ofs,
1581                                            buffers[i]);
1582         }
1583         kernel_set_args(ckFilterDivideShadow, arg_ofs,
1584                         a_mem,
1585                         b_mem,
1586                         sample_variance_mem,
1587                         sv_variance_mem,
1588                         buffer_variance_mem,
1589                         task->rect,
1590                         task->render_buffer.pass_stride,
1591                         task->render_buffer.offset);
1592         enqueue_kernel(ckFilterDivideShadow,
1593                        task->rect.z-task->rect.x,
1594                        task->rect.w-task->rect.y);
1595
1596         return true;
1597 }
1598
1599 bool OpenCLDevice::denoising_get_feature(int mean_offset,
1600                                          int variance_offset,
1601                                          device_ptr mean_ptr,
1602                                          device_ptr variance_ptr,
1603                                          float scale,
1604                                          DenoisingTask *task)
1605 {
1606         cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
1607         cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
1608
1609         cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer);
1610
1611         cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature"));
1612
1613         int arg_ofs = kernel_set_args(ckFilterGetFeature, 0,
1614                                       task->render_buffer.samples,
1615                                       tile_info_mem);
1616         cl_mem buffers[9];
1617         for(int i = 0; i < 9; i++) {
1618                 buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]);
1619                 arg_ofs += kernel_set_args(ckFilterGetFeature, arg_ofs,
1620                                            buffers[i]);
1621         }
1622         kernel_set_args(ckFilterGetFeature, arg_ofs,
1623                         mean_offset,
1624                         variance_offset,
1625                         mean_mem,
1626                         variance_mem,
1627                         scale,
1628                         task->rect,
1629                         task->render_buffer.pass_stride,
1630                         task->render_buffer.offset);
1631         enqueue_kernel(ckFilterGetFeature,
1632                        task->rect.z-task->rect.x,
1633                        task->rect.w-task->rect.y);
1634
1635         return true;
1636 }
1637
1638 bool OpenCLDevice::denoising_write_feature(int out_offset,
1639                                            device_ptr from_ptr,
1640                                            device_ptr buffer_ptr,
1641                                            DenoisingTask *task)
1642 {
1643         cl_mem from_mem = CL_MEM_PTR(from_ptr);
1644         cl_mem buffer_mem = CL_MEM_PTR(buffer_ptr);
1645
1646         cl_kernel ckFilterWriteFeature = denoising_program(ustring("filter_write_feature"));
1647
1648         kernel_set_args(ckFilterWriteFeature, 0,
1649                         task->render_buffer.samples,
1650                         task->reconstruction_state.buffer_params,
1651                         task->filter_area,
1652                         from_mem,
1653                         buffer_mem,
1654                         out_offset,
1655                         task->rect);
1656         enqueue_kernel(ckFilterWriteFeature,
1657                        task->filter_area.z,
1658                        task->filter_area.w);
1659
1660         return true;
1661 }
1662
1663 bool OpenCLDevice::denoising_detect_outliers(device_ptr image_ptr,
1664                                              device_ptr variance_ptr,
1665                                              device_ptr depth_ptr,
1666                                              device_ptr output_ptr,
1667                                              DenoisingTask *task)
1668 {
1669         cl_mem image_mem = CL_MEM_PTR(image_ptr);
1670         cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
1671         cl_mem depth_mem = CL_MEM_PTR(depth_ptr);
1672         cl_mem output_mem = CL_MEM_PTR(output_ptr);
1673
1674         cl_kernel ckFilterDetectOutliers = denoising_program(ustring("filter_detect_outliers"));
1675
1676         kernel_set_args(ckFilterDetectOutliers, 0,
1677                         image_mem,
1678                         variance_mem,
1679                         depth_mem,
1680                         output_mem,
1681                         task->rect,
1682                         task->buffer.pass_stride);
1683         enqueue_kernel(ckFilterDetectOutliers,
1684                        task->rect.z-task->rect.x,
1685                        task->rect.w-task->rect.y);
1686
1687         return true;
1688 }
1689
1690 void OpenCLDevice::denoise(RenderTile &rtile, DenoisingTask& denoising)
1691 {
1692         denoising.functions.construct_transform = function_bind(&OpenCLDevice::denoising_construct_transform, this, &denoising);
1693         denoising.functions.accumulate = function_bind(&OpenCLDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
1694         denoising.functions.solve = function_bind(&OpenCLDevice::denoising_solve, this, _1, &denoising);
1695         denoising.functions.divide_shadow = function_bind(&OpenCLDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
1696         denoising.functions.non_local_means = function_bind(&OpenCLDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
1697         denoising.functions.combine_halves = function_bind(&OpenCLDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
1698         denoising.functions.get_feature = function_bind(&OpenCLDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
1699         denoising.functions.write_feature = function_bind(&OpenCLDevice::denoising_write_feature, this, _1, _2, _3, &denoising);
1700         denoising.functions.detect_outliers = function_bind(&OpenCLDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
1701
1702         denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
1703         denoising.render_buffer.samples = rtile.sample;
1704         denoising.buffer.gpu_temporary_mem = true;
1705
1706         denoising.run_denoising(&rtile);
1707 }
1708
1709 void OpenCLDevice::shader(DeviceTask& task)
1710 {
1711         /* cast arguments to cl types */
1712         cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
1713         cl_mem d_input = CL_MEM_PTR(task.shader_input);
1714         cl_mem d_output = CL_MEM_PTR(task.shader_output);
1715         cl_int d_shader_eval_type = task.shader_eval_type;
1716         cl_int d_shader_filter = task.shader_filter;
1717         cl_int d_shader_x = task.shader_x;
1718         cl_int d_shader_w = task.shader_w;
1719         cl_int d_offset = task.offset;
1720
1721         cl_kernel kernel;
1722
1723         if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
1724                 kernel = bake_program(ustring("bake"));
1725         }
1726         else if(task.shader_eval_type == SHADER_EVAL_DISPLACE) {
1727                 kernel = displace_program(ustring("displace"));
1728         }
1729         else {
1730                 kernel = background_program(ustring("background"));
1731         }
1732
1733         cl_uint start_arg_index =
1734                 kernel_set_args(kernel,
1735                                 0,
1736                                 d_data,
1737                                 d_input,
1738                                 d_output);
1739
1740         set_kernel_arg_buffers(kernel, &start_arg_index);
1741
1742         start_arg_index += kernel_set_args(kernel,
1743                                            start_arg_index,
1744                                            d_shader_eval_type);
1745         if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
1746                 start_arg_index += kernel_set_args(kernel,
1747                                                    start_arg_index,
1748                                                    d_shader_filter);
1749         }
1750         start_arg_index += kernel_set_args(kernel,
1751                                            start_arg_index,
1752                                            d_shader_x,
1753                                            d_shader_w,
1754                                            d_offset);
1755
1756         for(int sample = 0; sample < task.num_samples; sample++) {
1757
1758                 if(task.get_cancel())
1759                         break;
1760
1761                 kernel_set_args(kernel, start_arg_index, sample);
1762
1763                 enqueue_kernel(kernel, task.shader_w, 1);
1764
1765                 clFinish(cqCommandQueue);
1766
1767                 task.update_progress(NULL);
1768         }
1769 }
1770
1771 string OpenCLDevice::kernel_build_options(const string *debug_src)
1772 {
1773         string build_options = "-cl-no-signed-zeros -cl-mad-enable ";
1774
1775         if(platform_name == "NVIDIA CUDA") {
1776                 build_options += "-D__KERNEL_OPENCL_NVIDIA__ "
1777                                  "-cl-nv-maxrregcount=32 "
1778                                  "-cl-nv-verbose ";
1779
1780                 uint compute_capability_major, compute_capability_minor;
1781                 clGetDeviceInfo(cdDevice, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,
1782                                 sizeof(cl_uint), &compute_capability_major, NULL);
1783                 clGetDeviceInfo(cdDevice, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,
1784                                 sizeof(cl_uint), &compute_capability_minor, NULL);
1785
1786                 build_options += string_printf("-D__COMPUTE_CAPABILITY__=%u ",
1787                                                compute_capability_major * 100 +
1788                                                compute_capability_minor * 10);
1789         }
1790
1791         else if(platform_name == "Apple")
1792                 build_options += "-D__KERNEL_OPENCL_APPLE__ ";
1793
1794         else if(platform_name == "AMD Accelerated Parallel Processing")
1795                 build_options += "-D__KERNEL_OPENCL_AMD__ ";
1796
1797         else if(platform_name == "Intel(R) OpenCL") {
1798                 build_options += "-D__KERNEL_OPENCL_INTEL_CPU__ ";
1799
1800                 /* Options for gdb source level kernel debugging.
1801                  * this segfaults on linux currently.
1802                  */
1803                 if(OpenCLInfo::use_debug() && debug_src)
1804                         build_options += "-g -s \"" + *debug_src + "\" ";
1805         }
1806
1807         if(info.has_half_images) {
1808                 build_options += "-D__KERNEL_CL_KHR_FP16__ ";
1809         }
1810
1811         if(OpenCLInfo::use_debug()) {
1812                 build_options += "-D__KERNEL_OPENCL_DEBUG__ ";
1813         }
1814
1815 #ifdef WITH_CYCLES_DEBUG
1816         build_options += "-D__KERNEL_DEBUG__ ";
1817 #endif
1818
1819         return build_options;
1820 }
1821
1822 /* TODO(sergey): In the future we can use variadic templates, once
1823  * C++0x is allowed. Should allow to clean this up a bit.
1824  */
1825 int OpenCLDevice::kernel_set_args(cl_kernel kernel,
1826                                   int start_argument_index,
1827                                   const ArgumentWrapper& arg1,
1828                                   const ArgumentWrapper& arg2,
1829                                   const ArgumentWrapper& arg3,
1830                                   const ArgumentWrapper& arg4,
1831                                   const ArgumentWrapper& arg5,
1832                                   const ArgumentWrapper& arg6,
1833                                   const ArgumentWrapper& arg7,
1834                                   const ArgumentWrapper& arg8,
1835                                   const ArgumentWrapper& arg9,
1836                                   const ArgumentWrapper& arg10,
1837                                   const ArgumentWrapper& arg11,
1838                                   const ArgumentWrapper& arg12,
1839                                   const ArgumentWrapper& arg13,
1840                                   const ArgumentWrapper& arg14,
1841                                   const ArgumentWrapper& arg15,
1842                                   const ArgumentWrapper& arg16,
1843                                   const ArgumentWrapper& arg17,
1844                                   const ArgumentWrapper& arg18,
1845                                   const ArgumentWrapper& arg19,
1846                                   const ArgumentWrapper& arg20,
1847                                   const ArgumentWrapper& arg21,
1848                                   const ArgumentWrapper& arg22,
1849                                   const ArgumentWrapper& arg23,
1850                                   const ArgumentWrapper& arg24,
1851                                   const ArgumentWrapper& arg25,
1852                                   const ArgumentWrapper& arg26,
1853                                   const ArgumentWrapper& arg27,
1854                                   const ArgumentWrapper& arg28,
1855                                   const ArgumentWrapper& arg29,
1856                                   const ArgumentWrapper& arg30,
1857                                   const ArgumentWrapper& arg31,
1858                                   const ArgumentWrapper& arg32,
1859                                   const ArgumentWrapper& arg33)
1860 {
1861         int current_arg_index = 0;
1862 #define FAKE_VARARG_HANDLE_ARG(arg) \
1863         do { \
1864                 if(arg.pointer != NULL) { \
1865                         opencl_assert(clSetKernelArg( \
1866                                 kernel, \
1867                                 start_argument_index + current_arg_index, \
1868                                 arg.size, arg.pointer)); \
1869                         ++current_arg_index; \
1870                 } \
1871                 else { \
1872                         return current_arg_index; \
1873                 } \
1874         } while(false)
1875         FAKE_VARARG_HANDLE_ARG(arg1);
1876         FAKE_VARARG_HANDLE_ARG(arg2);
1877         FAKE_VARARG_HANDLE_ARG(arg3);
1878         FAKE_VARARG_HANDLE_ARG(arg4);
1879         FAKE_VARARG_HANDLE_ARG(arg5);
1880         FAKE_VARARG_HANDLE_ARG(arg6);
1881         FAKE_VARARG_HANDLE_ARG(arg7);
1882         FAKE_VARARG_HANDLE_ARG(arg8);
1883         FAKE_VARARG_HANDLE_ARG(arg9);
1884         FAKE_VARARG_HANDLE_ARG(arg10);
1885         FAKE_VARARG_HANDLE_ARG(arg11);
1886         FAKE_VARARG_HANDLE_ARG(arg12);
1887         FAKE_VARARG_HANDLE_ARG(arg13);
1888         FAKE_VARARG_HANDLE_ARG(arg14);
1889         FAKE_VARARG_HANDLE_ARG(arg15);
1890         FAKE_VARARG_HANDLE_ARG(arg16);
1891         FAKE_VARARG_HANDLE_ARG(arg17);
1892         FAKE_VARARG_HANDLE_ARG(arg18);
1893         FAKE_VARARG_HANDLE_ARG(arg19);
1894         FAKE_VARARG_HANDLE_ARG(arg20);
1895         FAKE_VARARG_HANDLE_ARG(arg21);
1896         FAKE_VARARG_HANDLE_ARG(arg22);
1897         FAKE_VARARG_HANDLE_ARG(arg23);
1898         FAKE_VARARG_HANDLE_ARG(arg24);
1899         FAKE_VARARG_HANDLE_ARG(arg25);
1900         FAKE_VARARG_HANDLE_ARG(arg26);
1901         FAKE_VARARG_HANDLE_ARG(arg27);
1902         FAKE_VARARG_HANDLE_ARG(arg28);
1903         FAKE_VARARG_HANDLE_ARG(arg29);
1904         FAKE_VARARG_HANDLE_ARG(arg30);
1905         FAKE_VARARG_HANDLE_ARG(arg31);
1906         FAKE_VARARG_HANDLE_ARG(arg32);
1907         FAKE_VARARG_HANDLE_ARG(arg33);
1908 #undef FAKE_VARARG_HANDLE_ARG
1909         return current_arg_index;
1910 }
1911
1912 void OpenCLDevice::release_kernel_safe(cl_kernel kernel)
1913 {
1914         if(kernel) {
1915                 clReleaseKernel(kernel);
1916         }
1917 }
1918
1919 void OpenCLDevice::release_mem_object_safe(cl_mem mem)
1920 {
1921         if(mem != NULL) {
1922                 clReleaseMemObject(mem);
1923         }
1924 }
1925
1926 void OpenCLDevice::release_program_safe(cl_program program)
1927 {
1928         if(program) {
1929                 clReleaseProgram(program);
1930         }
1931 }
1932
1933 /* ** Those guys are for workign around some compiler-specific bugs ** */
1934
1935 cl_program OpenCLDevice::load_cached_kernel(ustring key,
1936                                             thread_scoped_lock& cache_locker)
1937 {
1938         return OpenCLCache::get_program(cpPlatform,
1939                                         cdDevice,
1940                                         key,
1941                                         cache_locker);
1942 }
1943
1944 void OpenCLDevice::store_cached_kernel(cl_program program,
1945                                        ustring key,
1946                                        thread_scoped_lock& cache_locker)
1947 {
1948         OpenCLCache::store_program(cpPlatform,
1949                                    cdDevice,
1950                                    program,
1951                                    key,
1952                                    cache_locker);
1953 }
1954
1955 Device *opencl_create_split_device(DeviceInfo& info, Stats& stats, Profiler &profiler, bool background)
1956 {
1957         return new OpenCLDevice(info, stats, profiler, background);
1958 }
1959
1960 CCL_NAMESPACE_END
1961
1962 #endif