Merge remote-tracking branch 'origin/blender-v2.93-release'
[blender.git] / intern / cycles / device / opencl / device_opencl_impl.cpp
1 /*
2  * Copyright 2011-2013 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #ifdef WITH_OPENCL
18
19 #  include "device/opencl/device_opencl.h"
20
21 #  include "kernel/kernel_types.h"
22 #  include "kernel/split/kernel_split_data_types.h"
23
24 #  include "util/util_algorithm.h"
25 #  include "util/util_debug.h"
26 #  include "util/util_foreach.h"
27 #  include "util/util_logging.h"
28 #  include "util/util_md5.h"
29 #  include "util/util_path.h"
30 #  include "util/util_time.h"
31
32 CCL_NAMESPACE_BEGIN
33
34 struct texture_slot_t {
35   texture_slot_t(const string &name, int slot) : name(name), slot(slot)
36   {
37   }
38   string name;
39   int slot;
40 };
41
42 static const string NON_SPLIT_KERNELS =
43     "denoising "
44     "base "
45     "background "
46     "displace ";
47
48 static const string SPLIT_BUNDLE_KERNELS =
49     "data_init "
50     "path_init "
51     "state_buffer_size "
52     "scene_intersect "
53     "queue_enqueue "
54     "shader_setup "
55     "shader_sort "
56     "enqueue_inactive "
57     "next_iteration_setup "
58     "indirect_subsurface "
59     "buffer_update "
60     "adaptive_stopping "
61     "adaptive_filter_x "
62     "adaptive_filter_y "
63     "adaptive_adjust_samples";
64
65 const string OpenCLDevice::get_opencl_program_name(const string &kernel_name)
66 {
67   if (NON_SPLIT_KERNELS.find(kernel_name) != std::string::npos) {
68     return kernel_name;
69   }
70   else if (SPLIT_BUNDLE_KERNELS.find(kernel_name) != std::string::npos) {
71     return "split_bundle";
72   }
73   else {
74     return "split_" + kernel_name;
75   }
76 }
77
78 const string OpenCLDevice::get_opencl_program_filename(const string &kernel_name)
79 {
80   if (kernel_name == "denoising") {
81     return "filter.cl";
82   }
83   else if (SPLIT_BUNDLE_KERNELS.find(kernel_name) != std::string::npos) {
84     return "kernel_split_bundle.cl";
85   }
86   else {
87     return "kernel_" + kernel_name + ".cl";
88   }
89 }
90
91 /* Enable features that we always want to compile to reduce recompilation events */
92 void OpenCLDevice::enable_default_features(DeviceRequestedFeatures &features)
93 {
94   features.use_transparent = true;
95   features.use_shadow_tricks = true;
96   features.use_principled = true;
97   features.use_denoising = true;
98
99   if (!background) {
100     features.max_nodes_group = NODE_GROUP_LEVEL_MAX;
101     features.nodes_features = NODE_FEATURE_ALL;
102     features.use_hair = true;
103     features.use_subsurface = true;
104     features.use_camera_motion = false;
105     features.use_object_motion = false;
106   }
107 }
108
109 string OpenCLDevice::get_build_options(const DeviceRequestedFeatures &requested_features,
110                                        const string &opencl_program_name)
111 {
112   /* first check for non-split kernel programs */
113   if (opencl_program_name == "base" || opencl_program_name == "denoising") {
114     return "";
115   }
116   else if (opencl_program_name == "bake") {
117     /* Note: get_build_options for bake is only requested when baking is enabled.
118      * displace and background are always requested.
119      * `__SPLIT_KERNEL__` must not be present in the compile directives for bake */
120     DeviceRequestedFeatures features(requested_features);
121     enable_default_features(features);
122     features.use_denoising = false;
123     features.use_object_motion = false;
124     features.use_camera_motion = false;
125     features.use_hair = true;
126     features.use_subsurface = true;
127     features.max_nodes_group = NODE_GROUP_LEVEL_MAX;
128     features.nodes_features = NODE_FEATURE_ALL;
129     features.use_integrator_branched = false;
130     return features.get_build_options();
131   }
132   else if (opencl_program_name == "displace") {
133     /* As displacement does not use any nodes from the Shading group (eg BSDF).
134      * We disable all features that are related to shading. */
135     DeviceRequestedFeatures features(requested_features);
136     enable_default_features(features);
137     features.use_denoising = false;
138     features.use_object_motion = false;
139     features.use_camera_motion = false;
140     features.use_baking = false;
141     features.use_transparent = false;
142     features.use_shadow_tricks = false;
143     features.use_subsurface = false;
144     features.use_volume = false;
145     features.nodes_features &= ~NODE_FEATURE_VOLUME;
146     features.use_denoising = false;
147     features.use_principled = false;
148     features.use_integrator_branched = false;
149     return features.get_build_options();
150   }
151   else if (opencl_program_name == "background") {
152     /* Background uses Background shading
153      * It is save to disable shadow features, subsurface and volumetric. */
154     DeviceRequestedFeatures features(requested_features);
155     enable_default_features(features);
156     features.use_baking = false;
157     features.use_object_motion = false;
158     features.use_camera_motion = false;
159     features.use_transparent = false;
160     features.use_shadow_tricks = false;
161     features.use_denoising = false;
162     /* NOTE: currently possible to use surface nodes like `Hair Info`, `Bump` node.
163      * Perhaps we should remove them in UI as it does not make any sense when
164      * rendering background. */
165     features.nodes_features &= ~NODE_FEATURE_VOLUME;
166     features.use_subsurface = false;
167     features.use_volume = false;
168     features.use_shader_raytrace = false;
169     features.use_patch_evaluation = false;
170     features.use_integrator_branched = false;
171     return features.get_build_options();
172   }
173
174   string build_options = "-D__SPLIT_KERNEL__ ";
175   /* Set compute device build option. */
176   cl_device_type device_type;
177   OpenCLInfo::get_device_type(this->cdDevice, &device_type, &this->ciErr);
178   assert(this->ciErr == CL_SUCCESS);
179   if (device_type == CL_DEVICE_TYPE_GPU) {
180     build_options += "-D__COMPUTE_DEVICE_GPU__ ";
181   }
182
183   DeviceRequestedFeatures nofeatures;
184   enable_default_features(nofeatures);
185
186   /* Add program specific optimized compile directives */
187   if (opencl_program_name == "split_do_volume" && !requested_features.use_volume) {
188     build_options += nofeatures.get_build_options();
189   }
190   else {
191     DeviceRequestedFeatures features(requested_features);
192     enable_default_features(features);
193
194     /* Always turn off baking at this point. Baking is only useful when building the bake kernel.
195      * this also makes sure that the kernels that are build during baking can be reused
196      * when not doing any baking. */
197     features.use_baking = false;
198
199     /* Do not vary on shaders when program doesn't do any shading.
200      * We have bundled them in a single program. */
201     if (opencl_program_name == "split_bundle") {
202       features.max_nodes_group = 0;
203       features.nodes_features = 0;
204       features.use_shader_raytrace = false;
205     }
206
207     /* No specific settings, just add the regular ones */
208     build_options += features.get_build_options();
209   }
210
211   return build_options;
212 }
213
214 OpenCLDevice::OpenCLSplitPrograms::OpenCLSplitPrograms(OpenCLDevice *device_)
215 {
216   device = device_;
217 }
218
219 OpenCLDevice::OpenCLSplitPrograms::~OpenCLSplitPrograms()
220 {
221   program_split.release();
222   program_lamp_emission.release();
223   program_do_volume.release();
224   program_indirect_background.release();
225   program_shader_eval.release();
226   program_holdout_emission_blurring_pathtermination_ao.release();
227   program_subsurface_scatter.release();
228   program_direct_lighting.release();
229   program_shadow_blocked_ao.release();
230   program_shadow_blocked_dl.release();
231 }
232
233 void OpenCLDevice::OpenCLSplitPrograms::load_kernels(
234     vector<OpenCLProgram *> &programs, const DeviceRequestedFeatures &requested_features)
235 {
236   if (!requested_features.use_baking) {
237 #  define ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(kernel_name) \
238     program_split.add_kernel(ustring("path_trace_" #kernel_name));
239 #  define ADD_SPLIT_KERNEL_PROGRAM(kernel_name) \
240     const string program_name_##kernel_name = "split_" #kernel_name; \
241     program_##kernel_name = OpenCLDevice::OpenCLProgram( \
242         device, \
243         program_name_##kernel_name, \
244         "kernel_" #kernel_name ".cl", \
245         device->get_build_options(requested_features, program_name_##kernel_name)); \
246     program_##kernel_name.add_kernel(ustring("path_trace_" #kernel_name)); \
247     programs.push_back(&program_##kernel_name);
248
249     /* Ordered with most complex kernels first, to reduce overall compile time. */
250     ADD_SPLIT_KERNEL_PROGRAM(subsurface_scatter);
251     ADD_SPLIT_KERNEL_PROGRAM(direct_lighting);
252     ADD_SPLIT_KERNEL_PROGRAM(indirect_background);
253     if (requested_features.use_volume) {
254       ADD_SPLIT_KERNEL_PROGRAM(do_volume);
255     }
256     ADD_SPLIT_KERNEL_PROGRAM(shader_eval);
257     ADD_SPLIT_KERNEL_PROGRAM(lamp_emission);
258     ADD_SPLIT_KERNEL_PROGRAM(holdout_emission_blurring_pathtermination_ao);
259     ADD_SPLIT_KERNEL_PROGRAM(shadow_blocked_dl);
260     ADD_SPLIT_KERNEL_PROGRAM(shadow_blocked_ao);
261
262     /* Quick kernels bundled in a single program to reduce overhead of starting
263      * Blender processes. */
264     program_split = OpenCLDevice::OpenCLProgram(
265         device,
266         "split_bundle",
267         "kernel_split_bundle.cl",
268         device->get_build_options(requested_features, "split_bundle"));
269
270     ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(data_init);
271     ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(state_buffer_size);
272     ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(path_init);
273     ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(scene_intersect);
274     ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(queue_enqueue);
275     ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(shader_setup);
276     ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(shader_sort);
277     ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(enqueue_inactive);
278     ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(next_iteration_setup);
279     ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(indirect_subsurface);
280     ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(buffer_update);
281     ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_stopping);
282     ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_filter_x);
283     ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_filter_y);
284     ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_adjust_samples);
285     programs.push_back(&program_split);
286
287 #  undef ADD_SPLIT_KERNEL_PROGRAM
288 #  undef ADD_SPLIT_KERNEL_BUNDLE_PROGRAM
289   }
290 }
291
292 namespace {
293
294 /* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to
295  * fetch its size.
296  */
297 typedef struct KernelGlobalsDummy {
298   ccl_constant KernelData *data;
299   ccl_global char *buffers[8];
300
301 #  define KERNEL_TEX(type, name) TextureInfo name;
302 #  include "kernel/kernel_textures.h"
303 #  undef KERNEL_TEX
304   SplitData split_data;
305   SplitParams split_param_data;
306 } KernelGlobalsDummy;
307
308 }  // namespace
309
310 struct CachedSplitMemory {
311   int id;
312   device_memory *split_data;
313   device_memory *ray_state;
314   device_memory *queue_index;
315   device_memory *use_queues_flag;
316   device_memory *work_pools;
317   device_ptr *buffer;
318 };
319
320 class OpenCLSplitKernelFunction : public SplitKernelFunction {
321  public:
322   OpenCLDevice *device;
323   OpenCLDevice::OpenCLProgram program;
324   CachedSplitMemory &cached_memory;
325   int cached_id;
326
327   OpenCLSplitKernelFunction(OpenCLDevice *device, CachedSplitMemory &cached_memory)
328       : device(device), cached_memory(cached_memory), cached_id(cached_memory.id - 1)
329   {
330   }
331
332   ~OpenCLSplitKernelFunction()
333   {
334     program.release();
335   }
336
337   virtual bool enqueue(const KernelDimensions &dim, device_memory &kg, device_memory &data)
338   {
339     if (cached_id != cached_memory.id) {
340       cl_uint start_arg_index = device->kernel_set_args(
341           program(), 0, kg, data, *cached_memory.split_data, *cached_memory.ray_state);
342
343       device->set_kernel_arg_buffers(program(), &start_arg_index);
344
345       start_arg_index += device->kernel_set_args(program(),
346                                                  start_arg_index,
347                                                  *cached_memory.queue_index,
348                                                  *cached_memory.use_queues_flag,
349                                                  *cached_memory.work_pools,
350                                                  *cached_memory.buffer);
351
352       cached_id = cached_memory.id;
353     }
354
355     device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
356                                            program(),
357                                            2,
358                                            NULL,
359                                            dim.global_size,
360                                            dim.local_size,
361                                            0,
362                                            NULL,
363                                            NULL);
364
365     device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
366
367     if (device->ciErr != CL_SUCCESS) {
368       string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
369                                      clewErrorString(device->ciErr));
370       device->opencl_error(message);
371       return false;
372     }
373
374     return true;
375   }
376 };
377
378 class OpenCLSplitKernel : public DeviceSplitKernel {
379   OpenCLDevice *device;
380   CachedSplitMemory cached_memory;
381
382  public:
383   explicit OpenCLSplitKernel(OpenCLDevice *device) : DeviceSplitKernel(device), device(device)
384   {
385   }
386
387   virtual SplitKernelFunction *get_split_kernel_function(
388       const string &kernel_name, const DeviceRequestedFeatures &requested_features)
389   {
390     OpenCLSplitKernelFunction *kernel = new OpenCLSplitKernelFunction(device, cached_memory);
391
392     const string program_name = device->get_opencl_program_name(kernel_name);
393     kernel->program = OpenCLDevice::OpenCLProgram(
394         device,
395         program_name,
396         device->get_opencl_program_filename(kernel_name),
397         device->get_build_options(requested_features, program_name));
398
399     kernel->program.add_kernel(ustring("path_trace_" + kernel_name));
400     kernel->program.load();
401
402     if (!kernel->program.is_loaded()) {
403       delete kernel;
404       return NULL;
405     }
406
407     return kernel;
408   }
409
410   virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads)
411   {
412     device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE);
413     size_buffer.alloc(1);
414     size_buffer.zero_to_device();
415
416     uint threads = num_threads;
417     OpenCLDevice::OpenCLSplitPrograms *programs = device->get_split_programs();
418     cl_kernel kernel_state_buffer_size = programs->program_split(
419         ustring("path_trace_state_buffer_size"));
420     device->kernel_set_args(kernel_state_buffer_size, 0, kg, data, threads, size_buffer);
421
422     size_t global_size = 64;
423     device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
424                                            kernel_state_buffer_size,
425                                            1,
426                                            NULL,
427                                            &global_size,
428                                            NULL,
429                                            0,
430                                            NULL,
431                                            NULL);
432
433     device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
434
435     size_buffer.copy_from_device(0, 1, 1);
436     size_t size = size_buffer[0];
437     size_buffer.free();
438
439     if (device->ciErr != CL_SUCCESS) {
440       string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
441                                      clewErrorString(device->ciErr));
442       device->opencl_error(message);
443       return 0;
444     }
445
446     return size;
447   }
448
449   virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim,
450                                               RenderTile &rtile,
451                                               int num_global_elements,
452                                               device_memory &kernel_globals,
453                                               device_memory &kernel_data,
454                                               device_memory &split_data,
455                                               device_memory &ray_state,
456                                               device_memory &queue_index,
457                                               device_memory &use_queues_flag,
458                                               device_memory &work_pool_wgs)
459   {
460     cl_int dQueue_size = dim.global_size[0] * dim.global_size[1];
461
462     /* Set the range of samples to be processed for every ray in
463      * path-regeneration logic.
464      */
465     cl_int start_sample = rtile.start_sample;
466     cl_int end_sample = rtile.start_sample + rtile.num_samples;
467
468     OpenCLDevice::OpenCLSplitPrograms *programs = device->get_split_programs();
469     cl_kernel kernel_data_init = programs->program_split(ustring("path_trace_data_init"));
470
471     cl_uint start_arg_index = device->kernel_set_args(kernel_data_init,
472                                                       0,
473                                                       kernel_globals,
474                                                       kernel_data,
475                                                       split_data,
476                                                       num_global_elements,
477                                                       ray_state);
478
479     device->set_kernel_arg_buffers(kernel_data_init, &start_arg_index);
480
481     start_arg_index += device->kernel_set_args(kernel_data_init,
482                                                start_arg_index,
483                                                start_sample,
484                                                end_sample,
485                                                rtile.x,
486                                                rtile.y,
487                                                rtile.w,
488                                                rtile.h,
489                                                rtile.offset,
490                                                rtile.stride,
491                                                queue_index,
492                                                dQueue_size,
493                                                use_queues_flag,
494                                                work_pool_wgs,
495                                                rtile.num_samples,
496                                                rtile.buffer);
497
498     /* Enqueue ckPathTraceKernel_data_init kernel. */
499     device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
500                                            kernel_data_init,
501                                            2,
502                                            NULL,
503                                            dim.global_size,
504                                            dim.local_size,
505                                            0,
506                                            NULL,
507                                            NULL);
508
509     device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
510
511     if (device->ciErr != CL_SUCCESS) {
512       string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
513                                      clewErrorString(device->ciErr));
514       device->opencl_error(message);
515       return false;
516     }
517
518     cached_memory.split_data = &split_data;
519     cached_memory.ray_state = &ray_state;
520     cached_memory.queue_index = &queue_index;
521     cached_memory.use_queues_flag = &use_queues_flag;
522     cached_memory.work_pools = &work_pool_wgs;
523     cached_memory.buffer = &rtile.buffer;
524     cached_memory.id++;
525
526     return true;
527   }
528
529   virtual int2 split_kernel_local_size()
530   {
531     return make_int2(64, 1);
532   }
533
534   virtual int2 split_kernel_global_size(device_memory &kg,
535                                         device_memory &data,
536                                         DeviceTask & /*task*/)
537   {
538     cl_device_type type = OpenCLInfo::get_device_type(device->cdDevice);
539     /* Use small global size on CPU devices as it seems to be much faster. */
540     if (type == CL_DEVICE_TYPE_CPU) {
541       VLOG(1) << "Global size: (64, 64).";
542       return make_int2(64, 64);
543     }
544
545     cl_ulong max_buffer_size;
546     clGetDeviceInfo(
547         device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL);
548
549     if (DebugFlags().opencl.mem_limit) {
550       max_buffer_size = min(max_buffer_size,
551                             cl_ulong(DebugFlags().opencl.mem_limit - device->stats.mem_used));
552     }
553
554     VLOG(1) << "Maximum device allocation size: " << string_human_readable_number(max_buffer_size)
555             << " bytes. (" << string_human_readable_size(max_buffer_size) << ").";
556
557     /* Limit to 2gb, as we shouldn't need more than that and some devices may support much more. */
558     max_buffer_size = min(max_buffer_size / 2, (cl_ulong)2l * 1024 * 1024 * 1024);
559
560     size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size);
561     int2 global_size = make_int2(max(round_down((int)sqrt(num_elements), 64), 64),
562                                  (int)sqrt(num_elements));
563
564     if (device->info.description.find("Intel") != string::npos) {
565       global_size = make_int2(min(512, global_size.x), min(512, global_size.y));
566     }
567
568     VLOG(1) << "Global size: " << global_size << ".";
569     return global_size;
570   }
571 };
572
573 bool OpenCLDevice::opencl_error(cl_int err)
574 {
575   if (err != CL_SUCCESS) {
576     string message = string_printf("OpenCL error (%d): %s", err, clewErrorString(err));
577     if (error_msg == "")
578       error_msg = message;
579     fprintf(stderr, "%s\n", message.c_str());
580     return true;
581   }
582
583   return false;
584 }
585
586 void OpenCLDevice::opencl_error(const string &message)
587 {
588   if (error_msg == "")
589     error_msg = message;
590   fprintf(stderr, "%s\n", message.c_str());
591 }
592
593 void OpenCLDevice::opencl_assert_err(cl_int err, const char *where)
594 {
595   if (err != CL_SUCCESS) {
596     string message = string_printf(
597         "OpenCL error (%d): %s in %s", err, clewErrorString(err), where);
598     if (error_msg == "")
599       error_msg = message;
600     fprintf(stderr, "%s\n", message.c_str());
601 #  ifndef NDEBUG
602     abort();
603 #  endif
604   }
605 }
606
607 OpenCLDevice::OpenCLDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
608     : Device(info, stats, profiler, background),
609       load_kernel_num_compiling(0),
610       kernel_programs(this),
611       memory_manager(this),
612       texture_info(this, "__texture_info", MEM_GLOBAL)
613 {
614   cpPlatform = NULL;
615   cdDevice = NULL;
616   cxContext = NULL;
617   cqCommandQueue = NULL;
618   device_initialized = false;
619   textures_need_update = true;
620
621   vector<OpenCLPlatformDevice> usable_devices;
622   OpenCLInfo::get_usable_devices(&usable_devices);
623   if (usable_devices.size() == 0) {
624     opencl_error("OpenCL: no devices found.");
625     return;
626   }
627   assert(info.num < usable_devices.size());
628   OpenCLPlatformDevice &platform_device = usable_devices[info.num];
629   device_num = info.num;
630   cpPlatform = platform_device.platform_id;
631   cdDevice = platform_device.device_id;
632   platform_name = platform_device.platform_name;
633   device_name = platform_device.device_name;
634   VLOG(2) << "Creating new Cycles device for OpenCL platform " << platform_name << ", device "
635           << device_name << ".";
636
637   {
638     /* try to use cached context */
639     thread_scoped_lock cache_locker;
640     cxContext = OpenCLCache::get_context(cpPlatform, cdDevice, cache_locker);
641
642     if (cxContext == NULL) {
643       /* create context properties array to specify platform */
644       const cl_context_properties context_props[] = {
645           CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform, 0, 0};
646
647       /* create context */
648       cxContext = clCreateContext(
649           context_props, 1, &cdDevice, context_notify_callback, cdDevice, &ciErr);
650
651       if (opencl_error(ciErr)) {
652         opencl_error("OpenCL: clCreateContext failed");
653         return;
654       }
655
656       /* cache it */
657       OpenCLCache::store_context(cpPlatform, cdDevice, cxContext, cache_locker);
658     }
659   }
660
661   cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr);
662   if (opencl_error(ciErr)) {
663     opencl_error("OpenCL: Error creating command queue");
664     return;
665   }
666
667   /* Allocate this right away so that texture_info
668    * is placed at offset 0 in the device memory buffers. */
669   texture_info.resize(1);
670   memory_manager.alloc("texture_info", texture_info);
671
672   device_initialized = true;
673
674   split_kernel = new OpenCLSplitKernel(this);
675 }
676
677 OpenCLDevice::~OpenCLDevice()
678 {
679   task_pool.cancel();
680   load_required_kernel_task_pool.cancel();
681   load_kernel_task_pool.cancel();
682
683   memory_manager.free();
684
685   ConstMemMap::iterator mt;
686   for (mt = const_mem_map.begin(); mt != const_mem_map.end(); mt++) {
687     delete mt->second;
688   }
689
690   base_program.release();
691   bake_program.release();
692   displace_program.release();
693   background_program.release();
694   denoising_program.release();
695
696   if (cqCommandQueue)
697     clReleaseCommandQueue(cqCommandQueue);
698   if (cxContext)
699     clReleaseContext(cxContext);
700
701   delete split_kernel;
702 }
703
704 void CL_CALLBACK OpenCLDevice::context_notify_callback(const char *err_info,
705                                                        const void * /*private_info*/,
706                                                        size_t /*cb*/,
707                                                        void *user_data)
708 {
709   string device_name = OpenCLInfo::get_device_name((cl_device_id)user_data);
710   fprintf(stderr, "OpenCL error (%s): %s\n", device_name.c_str(), err_info);
711 }
712
713 bool OpenCLDevice::opencl_version_check()
714 {
715   string error;
716   if (!OpenCLInfo::platform_version_check(cpPlatform, &error)) {
717     opencl_error(error);
718     return false;
719   }
720   if (!OpenCLInfo::device_version_check(cdDevice, &error)) {
721     opencl_error(error);
722     return false;
723   }
724   return true;
725 }
726
727 string OpenCLDevice::device_md5_hash(string kernel_custom_build_options)
728 {
729   MD5Hash md5;
730   char version[256], driver[256], name[256], vendor[256];
731
732   clGetPlatformInfo(cpPlatform, CL_PLATFORM_VENDOR, sizeof(vendor), &vendor, NULL);
733   clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL);
734   clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(name), &name, NULL);
735   clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(driver), &driver, NULL);
736
737   md5.append((uint8_t *)vendor, strlen(vendor));
738   md5.append((uint8_t *)version, strlen(version));
739   md5.append((uint8_t *)name, strlen(name));
740   md5.append((uint8_t *)driver, strlen(driver));
741
742   string options = kernel_build_options();
743   options += kernel_custom_build_options;
744   md5.append((uint8_t *)options.c_str(), options.size());
745
746   return md5.get_hex();
747 }
748
749 bool OpenCLDevice::load_kernels(const DeviceRequestedFeatures &requested_features)
750 {
751   VLOG(2) << "Loading kernels for platform " << platform_name << ", device " << device_name << ".";
752   /* Verify if device was initialized. */
753   if (!device_initialized) {
754     fprintf(stderr, "OpenCL: failed to initialize device.\n");
755     return false;
756   }
757
758   /* Verify we have right opencl version. */
759   if (!opencl_version_check())
760     return false;
761
762   load_required_kernels(requested_features);
763
764   vector<OpenCLProgram *> programs;
765   kernel_programs.load_kernels(programs, requested_features);
766
767   if (!requested_features.use_baking && requested_features.use_denoising) {
768     denoising_program = OpenCLProgram(
769         this, "denoising", "filter.cl", get_build_options(requested_features, "denoising"));
770     denoising_program.add_kernel(ustring("filter_divide_shadow"));
771     denoising_program.add_kernel(ustring("filter_get_feature"));
772     denoising_program.add_kernel(ustring("filter_write_feature"));
773     denoising_program.add_kernel(ustring("filter_detect_outliers"));
774     denoising_program.add_kernel(ustring("filter_combine_halves"));
775     denoising_program.add_kernel(ustring("filter_construct_transform"));
776     denoising_program.add_kernel(ustring("filter_nlm_calc_difference"));
777     denoising_program.add_kernel(ustring("filter_nlm_blur"));
778     denoising_program.add_kernel(ustring("filter_nlm_calc_weight"));
779     denoising_program.add_kernel(ustring("filter_nlm_update_output"));
780     denoising_program.add_kernel(ustring("filter_nlm_normalize"));
781     denoising_program.add_kernel(ustring("filter_nlm_construct_gramian"));
782     denoising_program.add_kernel(ustring("filter_finalize"));
783     programs.push_back(&denoising_program);
784   }
785
786   load_required_kernel_task_pool.wait_work();
787
788   /* Parallel compilation of Cycles kernels, this launches multiple
789    * processes to workaround OpenCL frameworks serializing the calls
790    * internally within a single process. */
791   foreach (OpenCLProgram *program, programs) {
792     if (!program->load()) {
793       load_kernel_num_compiling++;
794       load_kernel_task_pool.push([=] {
795         program->compile();
796         load_kernel_num_compiling--;
797       });
798     }
799   }
800   return true;
801 }
802
803 void OpenCLDevice::load_required_kernels(const DeviceRequestedFeatures &requested_features)
804 {
805   vector<OpenCLProgram *> programs;
806   base_program = OpenCLProgram(
807       this, "base", "kernel_base.cl", get_build_options(requested_features, "base"));
808   base_program.add_kernel(ustring("convert_to_byte"));
809   base_program.add_kernel(ustring("convert_to_half_float"));
810   base_program.add_kernel(ustring("zero_buffer"));
811   programs.push_back(&base_program);
812
813   if (requested_features.use_true_displacement) {
814     displace_program = OpenCLProgram(
815         this, "displace", "kernel_displace.cl", get_build_options(requested_features, "displace"));
816     displace_program.add_kernel(ustring("displace"));
817     programs.push_back(&displace_program);
818   }
819
820   if (requested_features.use_background_light) {
821     background_program = OpenCLProgram(this,
822                                        "background",
823                                        "kernel_background.cl",
824                                        get_build_options(requested_features, "background"));
825     background_program.add_kernel(ustring("background"));
826     programs.push_back(&background_program);
827   }
828
829   if (requested_features.use_baking) {
830     bake_program = OpenCLProgram(
831         this, "bake", "kernel_bake.cl", get_build_options(requested_features, "bake"));
832     bake_program.add_kernel(ustring("bake"));
833     programs.push_back(&bake_program);
834   }
835
836   foreach (OpenCLProgram *program, programs) {
837     if (!program->load()) {
838       load_required_kernel_task_pool.push(function_bind(&OpenCLProgram::compile, program));
839     }
840   }
841 }
842
843 bool OpenCLDevice::wait_for_availability(const DeviceRequestedFeatures &requested_features)
844 {
845   if (requested_features.use_baking) {
846     /* For baking, kernels have already been loaded in load_required_kernels(). */
847     return true;
848   }
849
850   load_kernel_task_pool.wait_work();
851   return split_kernel->load_kernels(requested_features);
852 }
853
854 OpenCLDevice::OpenCLSplitPrograms *OpenCLDevice::get_split_programs()
855 {
856   return &kernel_programs;
857 }
858
859 DeviceKernelStatus OpenCLDevice::get_active_kernel_switch_state()
860 {
861   return DEVICE_KERNEL_USING_FEATURE_KERNEL;
862 }
863
864 void OpenCLDevice::mem_alloc(device_memory &mem)
865 {
866   if (mem.name) {
867     VLOG(1) << "Buffer allocate: " << mem.name << ", "
868             << string_human_readable_number(mem.memory_size()) << " bytes. ("
869             << string_human_readable_size(mem.memory_size()) << ")";
870   }
871
872   size_t size = mem.memory_size();
873
874   /* check there is enough memory available for the allocation */
875   cl_ulong max_alloc_size = 0;
876   clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_alloc_size, NULL);
877
878   if (DebugFlags().opencl.mem_limit) {
879     max_alloc_size = min(max_alloc_size, cl_ulong(DebugFlags().opencl.mem_limit - stats.mem_used));
880   }
881
882   if (size > max_alloc_size) {
883     string error = "Scene too complex to fit in available memory.";
884     if (mem.name != NULL) {
885       error += string_printf(" (allocating buffer %s failed.)", mem.name);
886     }
887     set_error(error);
888
889     return;
890   }
891
892   cl_mem_flags mem_flag;
893   void *mem_ptr = NULL;
894
895   if (mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL)
896     mem_flag = CL_MEM_READ_ONLY;
897   else
898     mem_flag = CL_MEM_READ_WRITE;
899
900   /* Zero-size allocation might be invoked by render, but not really
901    * supported by OpenCL. Using NULL as device pointer also doesn't really
902    * work for some reason, so for the time being we'll use special case
903    * will null_mem buffer.
904    */
905   if (size != 0) {
906     mem.device_pointer = (device_ptr)clCreateBuffer(cxContext, mem_flag, size, mem_ptr, &ciErr);
907     opencl_assert_err(ciErr, "clCreateBuffer");
908   }
909   else {
910     mem.device_pointer = 0;
911   }
912
913   stats.mem_alloc(size);
914   mem.device_size = size;
915 }
916
917 void OpenCLDevice::mem_copy_to(device_memory &mem)
918 {
919   if (mem.type == MEM_GLOBAL) {
920     global_free(mem);
921     global_alloc(mem);
922   }
923   else if (mem.type == MEM_TEXTURE) {
924     tex_free((device_texture &)mem);
925     tex_alloc((device_texture &)mem);
926   }
927   else {
928     if (!mem.device_pointer) {
929       mem_alloc(mem);
930     }
931
932     /* this is blocking */
933     size_t size = mem.memory_size();
934     if (size != 0) {
935       opencl_assert(clEnqueueWriteBuffer(cqCommandQueue,
936                                          CL_MEM_PTR(mem.device_pointer),
937                                          CL_TRUE,
938                                          0,
939                                          size,
940                                          mem.host_pointer,
941                                          0,
942                                          NULL,
943                                          NULL));
944     }
945   }
946 }
947
948 void OpenCLDevice::mem_copy_from(device_memory &mem, int y, int w, int h, int elem)
949 {
950   size_t offset = elem * y * w;
951   size_t size = elem * w * h;
952   assert(size != 0);
953   opencl_assert(clEnqueueReadBuffer(cqCommandQueue,
954                                     CL_MEM_PTR(mem.device_pointer),
955                                     CL_TRUE,
956                                     offset,
957                                     size,
958                                     (uchar *)mem.host_pointer + offset,
959                                     0,
960                                     NULL,
961                                     NULL));
962 }
963
964 void OpenCLDevice::mem_zero_kernel(device_ptr mem, size_t size)
965 {
966   base_program.wait_for_availability();
967   cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer"));
968
969   size_t global_size[] = {1024, 1024};
970   size_t num_threads = global_size[0] * global_size[1];
971
972   cl_mem d_buffer = CL_MEM_PTR(mem);
973   cl_ulong d_offset = 0;
974   cl_ulong d_size = 0;
975
976   while (d_offset < size) {
977     d_size = std::min<cl_ulong>(num_threads * sizeof(float4), size - d_offset);
978
979     kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset);
980
981     ciErr = clEnqueueNDRangeKernel(
982         cqCommandQueue, ckZeroBuffer, 2, NULL, global_size, NULL, 0, NULL, NULL);
983     opencl_assert_err(ciErr, "clEnqueueNDRangeKernel");
984
985     d_offset += d_size;
986   }
987 }
988
989 void OpenCLDevice::mem_zero(device_memory &mem)
990 {
991   if (!mem.device_pointer) {
992     mem_alloc(mem);
993   }
994
995   if (mem.device_pointer) {
996     if (base_program.is_loaded()) {
997       mem_zero_kernel(mem.device_pointer, mem.memory_size());
998     }
999
1000     if (mem.host_pointer) {
1001       memset(mem.host_pointer, 0, mem.memory_size());
1002     }
1003
1004     if (!base_program.is_loaded()) {
1005       void *zero = mem.host_pointer;
1006
1007       if (!mem.host_pointer) {
1008         zero = util_aligned_malloc(mem.memory_size(), 16);
1009         memset(zero, 0, mem.memory_size());
1010       }
1011
1012       opencl_assert(clEnqueueWriteBuffer(cqCommandQueue,
1013                                          CL_MEM_PTR(mem.device_pointer),
1014                                          CL_TRUE,
1015                                          0,
1016                                          mem.memory_size(),
1017                                          zero,
1018                                          0,
1019                                          NULL,
1020                                          NULL));
1021
1022       if (!mem.host_pointer) {
1023         util_aligned_free(zero);
1024       }
1025     }
1026   }
1027 }
1028
1029 void OpenCLDevice::mem_free(device_memory &mem)
1030 {
1031   if (mem.type == MEM_GLOBAL) {
1032     global_free(mem);
1033   }
1034   else if (mem.type == MEM_TEXTURE) {
1035     tex_free((device_texture &)mem);
1036   }
1037   else {
1038     if (mem.device_pointer) {
1039       if (mem.device_pointer != 0) {
1040         opencl_assert(clReleaseMemObject(CL_MEM_PTR(mem.device_pointer)));
1041       }
1042       mem.device_pointer = 0;
1043
1044       stats.mem_free(mem.device_size);
1045       mem.device_size = 0;
1046     }
1047   }
1048 }
1049
1050 int OpenCLDevice::mem_sub_ptr_alignment()
1051 {
1052   return OpenCLInfo::mem_sub_ptr_alignment(cdDevice);
1053 }
1054
1055 device_ptr OpenCLDevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int size)
1056 {
1057   cl_mem_flags mem_flag;
1058   if (mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL)
1059     mem_flag = CL_MEM_READ_ONLY;
1060   else
1061     mem_flag = CL_MEM_READ_WRITE;
1062
1063   cl_buffer_region info;
1064   info.origin = mem.memory_elements_size(offset);
1065   info.size = mem.memory_elements_size(size);
1066
1067   device_ptr sub_buf = (device_ptr)clCreateSubBuffer(
1068       CL_MEM_PTR(mem.device_pointer), mem_flag, CL_BUFFER_CREATE_TYPE_REGION, &info, &ciErr);
1069   opencl_assert_err(ciErr, "clCreateSubBuffer");
1070   return sub_buf;
1071 }
1072
1073 void OpenCLDevice::mem_free_sub_ptr(device_ptr device_pointer)
1074 {
1075   if (device_pointer != 0) {
1076     opencl_assert(clReleaseMemObject(CL_MEM_PTR(device_pointer)));
1077   }
1078 }
1079
1080 void OpenCLDevice::const_copy_to(const char *name, void *host, size_t size)
1081 {
1082   ConstMemMap::iterator i = const_mem_map.find(name);
1083   device_vector<uchar> *data;
1084
1085   if (i == const_mem_map.end()) {
1086     data = new device_vector<uchar>(this, name, MEM_READ_ONLY);
1087     data->alloc(size);
1088     const_mem_map.insert(ConstMemMap::value_type(name, data));
1089   }
1090   else {
1091     data = i->second;
1092   }
1093
1094   memcpy(data->data(), host, size);
1095   data->copy_to_device();
1096 }
1097
1098 void OpenCLDevice::global_alloc(device_memory &mem)
1099 {
1100   VLOG(1) << "Global memory allocate: " << mem.name << ", "
1101           << string_human_readable_number(mem.memory_size()) << " bytes. ("
1102           << string_human_readable_size(mem.memory_size()) << ")";
1103
1104   memory_manager.alloc(mem.name, mem);
1105   /* Set the pointer to non-null to keep code that inspects its value from thinking its
1106    * unallocated. */
1107   mem.device_pointer = 1;
1108   textures[mem.name] = &mem;
1109   textures_need_update = true;
1110 }
1111
1112 void OpenCLDevice::global_free(device_memory &mem)
1113 {
1114   if (mem.device_pointer) {
1115     mem.device_pointer = 0;
1116
1117     if (memory_manager.free(mem)) {
1118       textures_need_update = true;
1119     }
1120
1121     foreach (TexturesMap::value_type &value, textures) {
1122       if (value.second == &mem) {
1123         textures.erase(value.first);
1124         break;
1125       }
1126     }
1127   }
1128 }
1129
1130 void OpenCLDevice::tex_alloc(device_texture &mem)
1131 {
1132   VLOG(1) << "Texture allocate: " << mem.name << ", "
1133           << string_human_readable_number(mem.memory_size()) << " bytes. ("
1134           << string_human_readable_size(mem.memory_size()) << ")";
1135
1136   memory_manager.alloc(mem.name, mem);
1137   /* Set the pointer to non-null to keep code that inspects its value from thinking its
1138    * unallocated. */
1139   mem.device_pointer = 1;
1140   textures[mem.name] = &mem;
1141   textures_need_update = true;
1142 }
1143
1144 void OpenCLDevice::tex_free(device_texture &mem)
1145 {
1146   global_free(mem);
1147 }
1148
1149 size_t OpenCLDevice::global_size_round_up(int group_size, int global_size)
1150 {
1151   int r = global_size % group_size;
1152   return global_size + ((r == 0) ? 0 : group_size - r);
1153 }
1154
1155 void OpenCLDevice::enqueue_kernel(
1156     cl_kernel kernel, size_t w, size_t h, bool x_workgroups, size_t max_workgroup_size)
1157 {
1158   size_t workgroup_size, max_work_items[3];
1159
1160   clGetKernelWorkGroupInfo(
1161       kernel, cdDevice, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL);
1162   clGetDeviceInfo(
1163       cdDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, max_work_items, NULL);
1164
1165   if (max_workgroup_size > 0 && workgroup_size > max_workgroup_size) {
1166     workgroup_size = max_workgroup_size;
1167   }
1168
1169   /* Try to divide evenly over 2 dimensions. */
1170   size_t local_size[2];
1171   if (x_workgroups) {
1172     local_size[0] = workgroup_size;
1173     local_size[1] = 1;
1174   }
1175   else {
1176     size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1);
1177     local_size[0] = local_size[1] = sqrt_workgroup_size;
1178   }
1179
1180   /* Some implementations have max size 1 on 2nd dimension. */
1181   if (local_size[1] > max_work_items[1]) {
1182     local_size[0] = workgroup_size / max_work_items[1];
1183     local_size[1] = max_work_items[1];
1184   }
1185
1186   size_t global_size[2] = {global_size_round_up(local_size[0], w),
1187                            global_size_round_up(local_size[1], h)};
1188
1189   /* Vertical size of 1 is coming from bake/shade kernels where we should
1190    * not round anything up because otherwise we'll either be doing too
1191    * much work per pixel (if we don't check global ID on Y axis) or will
1192    * be checking for global ID to always have Y of 0.
1193    */
1194   if (h == 1) {
1195     global_size[h] = 1;
1196   }
1197
1198   /* run kernel */
1199   opencl_assert(
1200       clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL));
1201   opencl_assert(clFlush(cqCommandQueue));
1202 }
1203
1204 void OpenCLDevice::set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name)
1205 {
1206   cl_mem ptr;
1207
1208   MemMap::iterator i = mem_map.find(name);
1209   if (i != mem_map.end()) {
1210     ptr = CL_MEM_PTR(i->second);
1211   }
1212   else {
1213     ptr = 0;
1214   }
1215
1216   opencl_assert(clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void *)&ptr));
1217 }
1218
1219 void OpenCLDevice::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg)
1220 {
1221   flush_texture_buffers();
1222
1223   memory_manager.set_kernel_arg_buffers(kernel, narg);
1224 }
1225
1226 void OpenCLDevice::flush_texture_buffers()
1227 {
1228   if (!textures_need_update) {
1229     return;
1230   }
1231   textures_need_update = false;
1232
1233   /* Setup slots for textures. */
1234   int num_slots = 0;
1235
1236   vector<texture_slot_t> texture_slots;
1237
1238 #  define KERNEL_TEX(type, name) \
1239     if (textures.find(#name) != textures.end()) { \
1240       texture_slots.push_back(texture_slot_t(#name, num_slots)); \
1241     } \
1242     num_slots++;
1243 #  include "kernel/kernel_textures.h"
1244
1245   int num_data_slots = num_slots;
1246
1247   foreach (TexturesMap::value_type &tex, textures) {
1248     string name = tex.first;
1249     device_memory *mem = tex.second;
1250
1251     if (mem->type == MEM_TEXTURE) {
1252       const uint id = ((device_texture *)mem)->slot;
1253       texture_slots.push_back(texture_slot_t(name, num_data_slots + id));
1254       num_slots = max(num_slots, num_data_slots + id + 1);
1255     }
1256   }
1257
1258   /* Realloc texture descriptors buffer. */
1259   memory_manager.free(texture_info);
1260   texture_info.resize(num_slots);
1261   memory_manager.alloc("texture_info", texture_info);
1262
1263   /* Fill in descriptors */
1264   foreach (texture_slot_t &slot, texture_slots) {
1265     device_memory *mem = textures[slot.name];
1266     TextureInfo &info = texture_info[slot.slot];
1267
1268     MemoryManager::BufferDescriptor desc = memory_manager.get_descriptor(slot.name);
1269
1270     if (mem->type == MEM_TEXTURE) {
1271       info = ((device_texture *)mem)->info;
1272     }
1273     else {
1274       memset(&info, 0, sizeof(TextureInfo));
1275     }
1276
1277     info.data = desc.offset;
1278     info.cl_buffer = desc.device_buffer;
1279   }
1280
1281   /* Force write of descriptors. */
1282   memory_manager.free(texture_info);
1283   memory_manager.alloc("texture_info", texture_info);
1284 }
1285
1286 void OpenCLDevice::thread_run(DeviceTask &task)
1287 {
1288   flush_texture_buffers();
1289
1290   if (task.type == DeviceTask::RENDER) {
1291     RenderTile tile;
1292     DenoisingTask denoising(this, task);
1293
1294     /* Allocate buffer for kernel globals */
1295     device_only_memory<KernelGlobalsDummy> kgbuffer(this, "kernel_globals");
1296     kgbuffer.alloc_to_device(1);
1297
1298     /* Keep rendering tiles until done. */
1299     while (task.acquire_tile(this, tile, task.tile_types)) {
1300       if (tile.task == RenderTile::PATH_TRACE) {
1301         assert(tile.task == RenderTile::PATH_TRACE);
1302         scoped_timer timer(&tile.buffers->render_time);
1303
1304         split_kernel->path_trace(task, tile, kgbuffer, *const_mem_map["__data"]);
1305
1306         /* Complete kernel execution before release tile. */
1307         /* This helps in multi-device render;
1308          * The device that reaches the critical-section function
1309          * release_tile waits (stalling other devices from entering
1310          * release_tile) for all kernels to complete. If device1 (a
1311          * slow-render device) reaches release_tile first then it would
1312          * stall device2 (a fast-render device) from proceeding to render
1313          * next tile.
1314          */
1315         clFinish(cqCommandQueue);
1316       }
1317       else if (tile.task == RenderTile::BAKE) {
1318         bake(task, tile);
1319       }
1320       else if (tile.task == RenderTile::DENOISE) {
1321         tile.sample = tile.start_sample + tile.num_samples;
1322         denoise(tile, denoising);
1323         task.update_progress(&tile, tile.w * tile.h);
1324       }
1325
1326       task.release_tile(tile);
1327     }
1328
1329     kgbuffer.free();
1330   }
1331   else if (task.type == DeviceTask::SHADER) {
1332     shader(task);
1333   }
1334   else if (task.type == DeviceTask::FILM_CONVERT) {
1335     film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
1336   }
1337   else if (task.type == DeviceTask::DENOISE_BUFFER) {
1338     RenderTile tile;
1339     tile.x = task.x;
1340     tile.y = task.y;
1341     tile.w = task.w;
1342     tile.h = task.h;
1343     tile.buffer = task.buffer;
1344     tile.sample = task.sample + task.num_samples;
1345     tile.num_samples = task.num_samples;
1346     tile.start_sample = task.sample;
1347     tile.offset = task.offset;
1348     tile.stride = task.stride;
1349     tile.buffers = task.buffers;
1350
1351     DenoisingTask denoising(this, task);
1352     denoise(tile, denoising);
1353     task.update_progress(&tile, tile.w * tile.h);
1354   }
1355 }
1356
1357 void OpenCLDevice::film_convert(DeviceTask &task,
1358                                 device_ptr buffer,
1359                                 device_ptr rgba_byte,
1360                                 device_ptr rgba_half)
1361 {
1362   /* cast arguments to cl types */
1363   cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
1364   cl_mem d_rgba = (rgba_byte) ? CL_MEM_PTR(rgba_byte) : CL_MEM_PTR(rgba_half);
1365   cl_mem d_buffer = CL_MEM_PTR(buffer);
1366   cl_int d_x = task.x;
1367   cl_int d_y = task.y;
1368   cl_int d_w = task.w;
1369   cl_int d_h = task.h;
1370   cl_float d_sample_scale = 1.0f / (task.sample + 1);
1371   cl_int d_offset = task.offset;
1372   cl_int d_stride = task.stride;
1373
1374   cl_kernel ckFilmConvertKernel = (rgba_byte) ? base_program(ustring("convert_to_byte")) :
1375                                                 base_program(ustring("convert_to_half_float"));
1376
1377   cl_uint start_arg_index = kernel_set_args(ckFilmConvertKernel, 0, d_data, d_rgba, d_buffer);
1378
1379   set_kernel_arg_buffers(ckFilmConvertKernel, &start_arg_index);
1380
1381   start_arg_index += kernel_set_args(ckFilmConvertKernel,
1382                                      start_arg_index,
1383                                      d_sample_scale,
1384                                      d_x,
1385                                      d_y,
1386                                      d_w,
1387                                      d_h,
1388                                      d_offset,
1389                                      d_stride);
1390
1391   enqueue_kernel(ckFilmConvertKernel, d_w, d_h);
1392 }
1393
1394 bool OpenCLDevice::denoising_non_local_means(device_ptr image_ptr,
1395                                              device_ptr guide_ptr,
1396                                              device_ptr variance_ptr,
1397                                              device_ptr out_ptr,
1398                                              DenoisingTask *task)
1399 {
1400   int stride = task->buffer.stride;
1401   int w = task->buffer.width;
1402   int h = task->buffer.h;
1403   int r = task->nlm_state.r;
1404   int f = task->nlm_state.f;
1405   float a = task->nlm_state.a;
1406   float k_2 = task->nlm_state.k_2;
1407
1408   int pass_stride = task->buffer.pass_stride;
1409   int num_shifts = (2 * r + 1) * (2 * r + 1);
1410   int channel_offset = task->nlm_state.is_color ? task->buffer.pass_stride : 0;
1411
1412   device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride * num_shifts);
1413   device_sub_ptr blurDifference(
1414       task->buffer.temporary_mem, pass_stride * num_shifts, pass_stride * num_shifts);
1415   device_sub_ptr weightAccum(
1416       task->buffer.temporary_mem, 2 * pass_stride * num_shifts, pass_stride);
1417   cl_mem weightAccum_mem = CL_MEM_PTR(*weightAccum);
1418   cl_mem difference_mem = CL_MEM_PTR(*difference);
1419   cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference);
1420
1421   cl_mem image_mem = CL_MEM_PTR(image_ptr);
1422   cl_mem guide_mem = CL_MEM_PTR(guide_ptr);
1423   cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
1424   cl_mem out_mem = CL_MEM_PTR(out_ptr);
1425   cl_mem scale_mem = NULL;
1426
1427   mem_zero_kernel(*weightAccum, sizeof(float) * pass_stride);
1428   mem_zero_kernel(out_ptr, sizeof(float) * pass_stride);
1429
1430   cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference"));
1431   cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur"));
1432   cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight"));
1433   cl_kernel ckNLMUpdateOutput = denoising_program(ustring("filter_nlm_update_output"));
1434   cl_kernel ckNLMNormalize = denoising_program(ustring("filter_nlm_normalize"));
1435
1436   kernel_set_args(ckNLMCalcDifference,
1437                   0,
1438                   guide_mem,
1439                   variance_mem,
1440                   scale_mem,
1441                   difference_mem,
1442                   w,
1443                   h,
1444                   stride,
1445                   pass_stride,
1446                   r,
1447                   channel_offset,
1448                   0,
1449                   a,
1450                   k_2);
1451   kernel_set_args(
1452       ckNLMBlur, 0, difference_mem, blurDifference_mem, w, h, stride, pass_stride, r, f);
1453   kernel_set_args(
1454       ckNLMCalcWeight, 0, blurDifference_mem, difference_mem, w, h, stride, pass_stride, r, f);
1455   kernel_set_args(ckNLMUpdateOutput,
1456                   0,
1457                   blurDifference_mem,
1458                   image_mem,
1459                   out_mem,
1460                   weightAccum_mem,
1461                   w,
1462                   h,
1463                   stride,
1464                   pass_stride,
1465                   channel_offset,
1466                   r,
1467                   f);
1468
1469   enqueue_kernel(ckNLMCalcDifference, w * h, num_shifts, true);
1470   enqueue_kernel(ckNLMBlur, w * h, num_shifts, true);
1471   enqueue_kernel(ckNLMCalcWeight, w * h, num_shifts, true);
1472   enqueue_kernel(ckNLMBlur, w * h, num_shifts, true);
1473   enqueue_kernel(ckNLMUpdateOutput, w * h, num_shifts, true);
1474
1475   kernel_set_args(ckNLMNormalize, 0, out_mem, weightAccum_mem, w, h, stride);
1476   enqueue_kernel(ckNLMNormalize, w, h);
1477
1478   return true;
1479 }
1480
1481 bool OpenCLDevice::denoising_construct_transform(DenoisingTask *task)
1482 {
1483   cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
1484   cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
1485   cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
1486   cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer);
1487
1488   char use_time = task->buffer.use_time ? 1 : 0;
1489
1490   cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform"));
1491
1492   int arg_ofs = kernel_set_args(ckFilterConstructTransform, 0, buffer_mem, tile_info_mem);
1493   cl_mem buffers[9];
1494   for (int i = 0; i < 9; i++) {
1495     buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]);
1496     arg_ofs += kernel_set_args(ckFilterConstructTransform, arg_ofs, buffers[i]);
1497   }
1498   kernel_set_args(ckFilterConstructTransform,
1499                   arg_ofs,
1500                   transform_mem,
1501                   rank_mem,
1502                   task->filter_area,
1503                   task->rect,
1504                   task->buffer.pass_stride,
1505                   task->buffer.frame_stride,
1506                   use_time,
1507                   task->radius,
1508                   task->pca_threshold);
1509
1510   enqueue_kernel(ckFilterConstructTransform, task->storage.w, task->storage.h, 256);
1511
1512   return true;
1513 }
1514
1515 bool OpenCLDevice::denoising_accumulate(device_ptr color_ptr,
1516                                         device_ptr color_variance_ptr,
1517                                         device_ptr scale_ptr,
1518                                         int frame,
1519                                         DenoisingTask *task)
1520 {
1521   cl_mem color_mem = CL_MEM_PTR(color_ptr);
1522   cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr);
1523   cl_mem scale_mem = CL_MEM_PTR(scale_ptr);
1524
1525   cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
1526   cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
1527   cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
1528   cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer);
1529   cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer);
1530
1531   cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference"));
1532   cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur"));
1533   cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight"));
1534   cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian"));
1535
1536   int w = task->reconstruction_state.source_w;
1537   int h = task->reconstruction_state.source_h;
1538   int stride = task->buffer.stride;
1539   int frame_offset = frame * task->buffer.frame_stride;
1540   int t = task->tile_info->frames[frame];
1541   char use_time = task->buffer.use_time ? 1 : 0;
1542
1543   int r = task->radius;
1544   int pass_stride = task->buffer.pass_stride;
1545   int num_shifts = (2 * r + 1) * (2 * r + 1);
1546
1547   device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride * num_shifts);
1548   device_sub_ptr blurDifference(
1549       task->buffer.temporary_mem, pass_stride * num_shifts, pass_stride * num_shifts);
1550   cl_mem difference_mem = CL_MEM_PTR(*difference);
1551   cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference);
1552
1553   kernel_set_args(ckNLMCalcDifference,
1554                   0,
1555                   color_mem,
1556                   color_variance_mem,
1557                   scale_mem,
1558                   difference_mem,
1559                   w,
1560                   h,
1561                   stride,
1562                   pass_stride,
1563                   r,
1564                   pass_stride,
1565                   frame_offset,
1566                   1.0f,
1567                   task->nlm_k_2);
1568   kernel_set_args(
1569       ckNLMBlur, 0, difference_mem, blurDifference_mem, w, h, stride, pass_stride, r, 4);
1570   kernel_set_args(
1571       ckNLMCalcWeight, 0, blurDifference_mem, difference_mem, w, h, stride, pass_stride, r, 4);
1572   kernel_set_args(ckNLMConstructGramian,
1573                   0,
1574                   t,
1575                   blurDifference_mem,
1576                   buffer_mem,
1577                   transform_mem,
1578                   rank_mem,
1579                   XtWX_mem,
1580                   XtWY_mem,
1581                   task->reconstruction_state.filter_window,
1582                   w,
1583                   h,
1584                   stride,
1585                   pass_stride,
1586                   r,
1587                   4,
1588                   frame_offset,
1589                   use_time);
1590
1591   enqueue_kernel(ckNLMCalcDifference, w * h, num_shifts, true);
1592   enqueue_kernel(ckNLMBlur, w * h, num_shifts, true);
1593   enqueue_kernel(ckNLMCalcWeight, w * h, num_shifts, true);
1594   enqueue_kernel(ckNLMBlur, w * h, num_shifts, true);
1595   enqueue_kernel(ckNLMConstructGramian, w * h, num_shifts, true, 256);
1596
1597   return true;
1598 }
1599
1600 bool OpenCLDevice::denoising_solve(device_ptr output_ptr, DenoisingTask *task)
1601 {
1602   cl_kernel ckFinalize = denoising_program(ustring("filter_finalize"));
1603
1604   cl_mem output_mem = CL_MEM_PTR(output_ptr);
1605   cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
1606   cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer);
1607   cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer);
1608
1609   int w = task->reconstruction_state.source_w;
1610   int h = task->reconstruction_state.source_h;
1611
1612   kernel_set_args(ckFinalize,
1613                   0,
1614                   output_mem,
1615                   rank_mem,
1616                   XtWX_mem,
1617                   XtWY_mem,
1618                   task->filter_area,
1619                   task->reconstruction_state.buffer_params,
1620                   task->render_buffer.samples);
1621   enqueue_kernel(ckFinalize, w, h);
1622
1623   return true;
1624 }
1625
1626 bool OpenCLDevice::denoising_combine_halves(device_ptr a_ptr,
1627                                             device_ptr b_ptr,
1628                                             device_ptr mean_ptr,
1629                                             device_ptr variance_ptr,
1630                                             int r,
1631                                             int4 rect,
1632                                             DenoisingTask *task)
1633 {
1634   cl_mem a_mem = CL_MEM_PTR(a_ptr);
1635   cl_mem b_mem = CL_MEM_PTR(b_ptr);
1636   cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
1637   cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
1638
1639   cl_kernel ckFilterCombineHalves = denoising_program(ustring("filter_combine_halves"));
1640
1641   kernel_set_args(ckFilterCombineHalves, 0, mean_mem, variance_mem, a_mem, b_mem, rect, r);
1642   enqueue_kernel(ckFilterCombineHalves, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
1643
1644   return true;
1645 }
1646
1647 bool OpenCLDevice::denoising_divide_shadow(device_ptr a_ptr,
1648                                            device_ptr b_ptr,
1649                                            device_ptr sample_variance_ptr,
1650                                            device_ptr sv_variance_ptr,
1651                                            device_ptr buffer_variance_ptr,
1652                                            DenoisingTask *task)
1653 {
1654   cl_mem a_mem = CL_MEM_PTR(a_ptr);
1655   cl_mem b_mem = CL_MEM_PTR(b_ptr);
1656   cl_mem sample_variance_mem = CL_MEM_PTR(sample_variance_ptr);
1657   cl_mem sv_variance_mem = CL_MEM_PTR(sv_variance_ptr);
1658   cl_mem buffer_variance_mem = CL_MEM_PTR(buffer_variance_ptr);
1659
1660   cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer);
1661
1662   cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow"));
1663
1664   int arg_ofs = kernel_set_args(
1665       ckFilterDivideShadow, 0, task->render_buffer.samples, tile_info_mem);
1666   cl_mem buffers[9];
1667   for (int i = 0; i < 9; i++) {
1668     buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]);
1669     arg_ofs += kernel_set_args(ckFilterDivideShadow, arg_ofs, buffers[i]);
1670   }
1671   kernel_set_args(ckFilterDivideShadow,
1672                   arg_ofs,
1673                   a_mem,
1674                   b_mem,
1675                   sample_variance_mem,
1676                   sv_variance_mem,
1677                   buffer_variance_mem,
1678                   task->rect,
1679                   task->render_buffer.pass_stride,
1680                   task->render_buffer.offset);
1681   enqueue_kernel(ckFilterDivideShadow, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
1682
1683   return true;
1684 }
1685
1686 bool OpenCLDevice::denoising_get_feature(int mean_offset,
1687                                          int variance_offset,
1688                                          device_ptr mean_ptr,
1689                                          device_ptr variance_ptr,
1690                                          float scale,
1691                                          DenoisingTask *task)
1692 {
1693   cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
1694   cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
1695
1696   cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer);
1697
1698   cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature"));
1699
1700   int arg_ofs = kernel_set_args(ckFilterGetFeature, 0, task->render_buffer.samples, tile_info_mem);
1701   cl_mem buffers[9];
1702   for (int i = 0; i < 9; i++) {
1703     buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]);
1704     arg_ofs += kernel_set_args(ckFilterGetFeature, arg_ofs, buffers[i]);
1705   }
1706   kernel_set_args(ckFilterGetFeature,
1707                   arg_ofs,
1708                   mean_offset,
1709                   variance_offset,
1710                   mean_mem,
1711                   variance_mem,
1712                   scale,
1713                   task->rect,
1714                   task->render_buffer.pass_stride,
1715                   task->render_buffer.offset);
1716   enqueue_kernel(ckFilterGetFeature, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
1717
1718   return true;
1719 }
1720
1721 bool OpenCLDevice::denoising_write_feature(int out_offset,
1722                                            device_ptr from_ptr,
1723                                            device_ptr buffer_ptr,
1724                                            DenoisingTask *task)
1725 {
1726   cl_mem from_mem = CL_MEM_PTR(from_ptr);
1727   cl_mem buffer_mem = CL_MEM_PTR(buffer_ptr);
1728
1729   cl_kernel ckFilterWriteFeature = denoising_program(ustring("filter_write_feature"));
1730
1731   kernel_set_args(ckFilterWriteFeature,
1732                   0,
1733                   task->render_buffer.samples,
1734                   task->reconstruction_state.buffer_params,
1735                   task->filter_area,
1736                   from_mem,
1737                   buffer_mem,
1738                   out_offset,
1739                   task->rect);
1740   enqueue_kernel(ckFilterWriteFeature, task->filter_area.z, task->filter_area.w);
1741
1742   return true;
1743 }
1744
1745 bool OpenCLDevice::denoising_detect_outliers(device_ptr image_ptr,
1746                                              device_ptr variance_ptr,
1747                                              device_ptr depth_ptr,
1748                                              device_ptr output_ptr,
1749                                              DenoisingTask *task)
1750 {
1751   cl_mem image_mem = CL_MEM_PTR(image_ptr);
1752   cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
1753   cl_mem depth_mem = CL_MEM_PTR(depth_ptr);
1754   cl_mem output_mem = CL_MEM_PTR(output_ptr);
1755
1756   cl_kernel ckFilterDetectOutliers = denoising_program(ustring("filter_detect_outliers"));
1757
1758   kernel_set_args(ckFilterDetectOutliers,
1759                   0,
1760                   image_mem,
1761                   variance_mem,
1762                   depth_mem,
1763                   output_mem,
1764                   task->rect,
1765                   task->buffer.pass_stride);
1766   enqueue_kernel(ckFilterDetectOutliers, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
1767
1768   return true;
1769 }
1770
1771 void OpenCLDevice::denoise(RenderTile &rtile, DenoisingTask &denoising)
1772 {
1773   denoising.functions.construct_transform = function_bind(
1774       &OpenCLDevice::denoising_construct_transform, this, &denoising);
1775   denoising.functions.accumulate = function_bind(
1776       &OpenCLDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
1777   denoising.functions.solve = function_bind(&OpenCLDevice::denoising_solve, this, _1, &denoising);
1778   denoising.functions.divide_shadow = function_bind(
1779       &OpenCLDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
1780   denoising.functions.non_local_means = function_bind(
1781       &OpenCLDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
1782   denoising.functions.combine_halves = function_bind(
1783       &OpenCLDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
1784   denoising.functions.get_feature = function_bind(
1785       &OpenCLDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
1786   denoising.functions.write_feature = function_bind(
1787       &OpenCLDevice::denoising_write_feature, this, _1, _2, _3, &denoising);
1788   denoising.functions.detect_outliers = function_bind(
1789       &OpenCLDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
1790
1791   denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
1792   denoising.render_buffer.samples = rtile.sample;
1793   denoising.buffer.gpu_temporary_mem = true;
1794
1795   denoising.run_denoising(rtile);
1796 }
1797
1798 void OpenCLDevice::shader(DeviceTask &task)
1799 {
1800   /* cast arguments to cl types */
1801   cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
1802   cl_mem d_input = CL_MEM_PTR(task.shader_input);
1803   cl_mem d_output = CL_MEM_PTR(task.shader_output);
1804   cl_int d_shader_eval_type = task.shader_eval_type;
1805   cl_int d_shader_filter = task.shader_filter;
1806   cl_int d_shader_x = task.shader_x;
1807   cl_int d_shader_w = task.shader_w;
1808   cl_int d_offset = task.offset;
1809
1810   OpenCLDevice::OpenCLProgram *program = &background_program;
1811   if (task.shader_eval_type == SHADER_EVAL_DISPLACE) {
1812     program = &displace_program;
1813   }
1814   program->wait_for_availability();
1815   cl_kernel kernel = (*program)();
1816
1817   cl_uint start_arg_index = kernel_set_args(kernel, 0, d_data, d_input, d_output);
1818
1819   set_kernel_arg_buffers(kernel, &start_arg_index);
1820
1821   start_arg_index += kernel_set_args(kernel, start_arg_index, d_shader_eval_type);
1822   if (task.shader_eval_type >= SHADER_EVAL_BAKE) {
1823     start_arg_index += kernel_set_args(kernel, start_arg_index, d_shader_filter);
1824   }
1825   start_arg_index += kernel_set_args(kernel, start_arg_index, d_shader_x, d_shader_w, d_offset);
1826
1827   for (int sample = 0; sample < task.num_samples; sample++) {
1828
1829     if (task.get_cancel())
1830       break;
1831
1832     kernel_set_args(kernel, start_arg_index, sample);
1833
1834     enqueue_kernel(kernel, task.shader_w, 1);
1835
1836     clFinish(cqCommandQueue);
1837
1838     task.update_progress(NULL);
1839   }
1840 }
1841
1842 void OpenCLDevice::bake(DeviceTask &task, RenderTile &rtile)
1843 {
1844   scoped_timer timer(&rtile.buffers->render_time);
1845
1846   /* Cast arguments to cl types. */
1847   cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
1848   cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
1849   cl_int d_x = rtile.x;
1850   cl_int d_y = rtile.y;
1851   cl_int d_w = rtile.w;
1852   cl_int d_h = rtile.h;
1853   cl_int d_offset = rtile.offset;
1854   cl_int d_stride = rtile.stride;
1855
1856   bake_program.wait_for_availability();
1857   cl_kernel kernel = bake_program();
1858
1859   cl_uint start_arg_index = kernel_set_args(kernel, 0, d_data, d_buffer);
1860
1861   set_kernel_arg_buffers(kernel, &start_arg_index);
1862
1863   start_arg_index += kernel_set_args(
1864       kernel, start_arg_index, d_x, d_y, d_w, d_h, d_offset, d_stride);
1865
1866   int start_sample = rtile.start_sample;
1867   int end_sample = rtile.start_sample + rtile.num_samples;
1868
1869   for (int sample = start_sample; sample < end_sample; sample++) {
1870     if (task.get_cancel()) {
1871       if (task.need_finish_queue == false)
1872         break;
1873     }
1874
1875     kernel_set_args(kernel, start_arg_index, sample);
1876
1877     enqueue_kernel(kernel, d_w, d_h);
1878     clFinish(cqCommandQueue);
1879
1880     rtile.sample = sample + 1;
1881
1882     task.update_progress(&rtile, rtile.w * rtile.h);
1883   }
1884 }
1885
1886 static bool kernel_build_opencl_2(cl_device_id cdDevice)
1887 {
1888   /* Build with OpenCL 2.0 if available, this improves performance
1889    * with AMD OpenCL drivers on Windows and Linux (legacy drivers).
1890    * Note that OpenCL selects the highest 1.x version by default,
1891    * only for 2.0 do we need the explicit compiler flag. */
1892   int version_major, version_minor;
1893   if (OpenCLInfo::get_device_version(cdDevice, &version_major, &version_minor)) {
1894     if (version_major >= 2) {
1895       /* This appears to trigger a driver bug in Radeon RX cards with certain
1896        * driver version, so don't use OpenCL 2.0 for those. */
1897       string device_name = OpenCLInfo::get_readable_device_name(cdDevice);
1898       if (string_startswith(device_name, "Radeon RX 4") ||
1899           string_startswith(device_name, "Radeon (TM) RX 4") ||
1900           string_startswith(device_name, "Radeon RX 5") ||
1901           string_startswith(device_name, "Radeon (TM) RX 5")) {
1902         char version[256] = "";
1903         int driver_major, driver_minor;
1904         clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL);
1905         if (sscanf(version, "OpenCL 2.0 AMD-APP (%d.%d)", &driver_major, &driver_minor) == 2) {
1906           return !(driver_major == 3075 && driver_minor <= 12);
1907         }
1908       }
1909
1910       return true;
1911     }
1912   }
1913
1914   return false;
1915 }
1916
1917 string OpenCLDevice::kernel_build_options(const string *debug_src)
1918 {
1919   string build_options = "-cl-no-signed-zeros -cl-mad-enable ";
1920
1921   if (kernel_build_opencl_2(cdDevice)) {
1922     build_options += "-cl-std=CL2.0 ";
1923   }
1924
1925   if (platform_name == "NVIDIA CUDA") {
1926     build_options +=
1927         "-D__KERNEL_OPENCL_NVIDIA__ "
1928         "-cl-nv-maxrregcount=32 "
1929         "-cl-nv-verbose ";
1930
1931     uint compute_capability_major, compute_capability_minor;
1932     clGetDeviceInfo(cdDevice,
1933                     CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,
1934                     sizeof(cl_uint),
1935                     &compute_capability_major,
1936                     NULL);
1937     clGetDeviceInfo(cdDevice,
1938                     CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,
1939                     sizeof(cl_uint),
1940                     &compute_capability_minor,
1941                     NULL);
1942
1943     build_options += string_printf("-D__COMPUTE_CAPABILITY__=%u ",
1944                                    compute_capability_major * 100 + compute_capability_minor * 10);
1945   }
1946
1947   else if (platform_name == "Apple")
1948     build_options += "-D__KERNEL_OPENCL_APPLE__ ";
1949
1950   else if (platform_name == "AMD Accelerated Parallel Processing")
1951     build_options += "-D__KERNEL_OPENCL_AMD__ ";
1952
1953   else if (platform_name == "Intel(R) OpenCL") {
1954     build_options += "-D__KERNEL_OPENCL_INTEL_CPU__ ";
1955
1956     /* Options for gdb source level kernel debugging.
1957      * this segfaults on linux currently.
1958      */
1959     if (OpenCLInfo::use_debug() && debug_src)
1960       build_options += "-g -s \"" + *debug_src + "\" ";
1961   }
1962
1963   if (info.has_half_images) {
1964     build_options += "-D__KERNEL_CL_KHR_FP16__ ";
1965   }
1966
1967   if (OpenCLInfo::use_debug()) {
1968     build_options += "-D__KERNEL_OPENCL_DEBUG__ ";
1969   }
1970
1971 #  ifdef WITH_CYCLES_DEBUG
1972   build_options += "-D__KERNEL_DEBUG__ ";
1973 #  endif
1974
1975 #  ifdef WITH_NANOVDB
1976   if (info.has_nanovdb) {
1977     build_options += "-DWITH_NANOVDB ";
1978   }
1979 #  endif
1980
1981   return build_options;
1982 }
1983
1984 /* TODO(sergey): In the future we can use variadic templates, once
1985  * C++0x is allowed. Should allow to clean this up a bit.
1986  */
1987 int OpenCLDevice::kernel_set_args(cl_kernel kernel,
1988                                   int start_argument_index,
1989                                   const ArgumentWrapper &arg1,
1990                                   const ArgumentWrapper &arg2,
1991                                   const ArgumentWrapper &arg3,
1992                                   const ArgumentWrapper &arg4,
1993                                   const ArgumentWrapper &arg5,
1994                                   const ArgumentWrapper &arg6,
1995                                   const ArgumentWrapper &arg7,
1996                                   const ArgumentWrapper &arg8,
1997                                   const ArgumentWrapper &arg9,
1998                                   const ArgumentWrapper &arg10,
1999                                   const ArgumentWrapper &arg11,
2000                                   const ArgumentWrapper &arg12,
2001                                   const ArgumentWrapper &arg13,
2002                                   const ArgumentWrapper &arg14,
2003                                   const ArgumentWrapper &arg15,
2004                                   const ArgumentWrapper &arg16,
2005                                   const ArgumentWrapper &arg17,
2006                                   const ArgumentWrapper &arg18,
2007                                   const ArgumentWrapper &arg19,
2008                                   const ArgumentWrapper &arg20,
2009                                   const ArgumentWrapper &arg21,
2010                                   const ArgumentWrapper &arg22,
2011                                   const ArgumentWrapper &arg23,
2012                                   const ArgumentWrapper &arg24,
2013                                   const ArgumentWrapper &arg25,
2014                                   const ArgumentWrapper &arg26,
2015                                   const ArgumentWrapper &arg27,
2016                                   const ArgumentWrapper &arg28,
2017                                   const ArgumentWrapper &arg29,
2018                                   const ArgumentWrapper &arg30,
2019                                   const ArgumentWrapper &arg31,
2020                                   const ArgumentWrapper &arg32,
2021                                   const ArgumentWrapper &arg33)
2022 {
2023   int current_arg_index = 0;
2024 #  define FAKE_VARARG_HANDLE_ARG(arg) \
2025     do { \
2026       if (arg.pointer != NULL) { \
2027         opencl_assert(clSetKernelArg( \
2028             kernel, start_argument_index + current_arg_index, arg.size, arg.pointer)); \
2029         ++current_arg_index; \
2030       } \
2031       else { \
2032         return current_arg_index; \
2033       } \
2034     } while (false)
2035   FAKE_VARARG_HANDLE_ARG(arg1);
2036   FAKE_VARARG_HANDLE_ARG(arg2);
2037   FAKE_VARARG_HANDLE_ARG(arg3);
2038   FAKE_VARARG_HANDLE_ARG(arg4);
2039   FAKE_VARARG_HANDLE_ARG(arg5);
2040   FAKE_VARARG_HANDLE_ARG(arg6);
2041   FAKE_VARARG_HANDLE_ARG(arg7);
2042   FAKE_VARARG_HANDLE_ARG(arg8);
2043   FAKE_VARARG_HANDLE_ARG(arg9);
2044   FAKE_VARARG_HANDLE_ARG(arg10);
2045   FAKE_VARARG_HANDLE_ARG(arg11);
2046   FAKE_VARARG_HANDLE_ARG(arg12);
2047   FAKE_VARARG_HANDLE_ARG(arg13);
2048   FAKE_VARARG_HANDLE_ARG(arg14);
2049   FAKE_VARARG_HANDLE_ARG(arg15);
2050   FAKE_VARARG_HANDLE_ARG(arg16);
2051   FAKE_VARARG_HANDLE_ARG(arg17);
2052   FAKE_VARARG_HANDLE_ARG(arg18);
2053   FAKE_VARARG_HANDLE_ARG(arg19);
2054   FAKE_VARARG_HANDLE_ARG(arg20);
2055   FAKE_VARARG_HANDLE_ARG(arg21);
2056   FAKE_VARARG_HANDLE_ARG(arg22);
2057   FAKE_VARARG_HANDLE_ARG(arg23);
2058   FAKE_VARARG_HANDLE_ARG(arg24);
2059   FAKE_VARARG_HANDLE_ARG(arg25);
2060   FAKE_VARARG_HANDLE_ARG(arg26);
2061   FAKE_VARARG_HANDLE_ARG(arg27);
2062   FAKE_VARARG_HANDLE_ARG(arg28);
2063   FAKE_VARARG_HANDLE_ARG(arg29);
2064   FAKE_VARARG_HANDLE_ARG(arg30);
2065   FAKE_VARARG_HANDLE_ARG(arg31);
2066   FAKE_VARARG_HANDLE_ARG(arg32);
2067   FAKE_VARARG_HANDLE_ARG(arg33);
2068 #  undef FAKE_VARARG_HANDLE_ARG
2069   return current_arg_index;
2070 }
2071
2072 void OpenCLDevice::release_kernel_safe(cl_kernel kernel)
2073 {
2074   if (kernel) {
2075     clReleaseKernel(kernel);
2076   }
2077 }
2078
2079 void OpenCLDevice::release_mem_object_safe(cl_mem mem)
2080 {
2081   if (mem != NULL) {
2082     clReleaseMemObject(mem);
2083   }
2084 }
2085
2086 void OpenCLDevice::release_program_safe(cl_program program)
2087 {
2088   if (program) {
2089     clReleaseProgram(program);
2090   }
2091 }
2092
2093 /* ** Those guys are for working around some compiler-specific bugs ** */
2094
2095 cl_program OpenCLDevice::load_cached_kernel(ustring key, thread_scoped_lock &cache_locker)
2096 {
2097   return OpenCLCache::get_program(cpPlatform, cdDevice, key, cache_locker);
2098 }
2099
2100 void OpenCLDevice::store_cached_kernel(cl_program program,
2101                                        ustring key,
2102                                        thread_scoped_lock &cache_locker)
2103 {
2104   OpenCLCache::store_program(cpPlatform, cdDevice, program, key, cache_locker);
2105 }
2106
2107 Device *opencl_create_split_device(DeviceInfo &info,
2108                                    Stats &stats,
2109                                    Profiler &profiler,
2110                                    bool background)
2111 {
2112   return new OpenCLDevice(info, stats, profiler, background);
2113 }
2114
2115 CCL_NAMESPACE_END
2116
2117 #endif