Geometry Nodes: new Material input node
[blender.git] / intern / cycles / device / opencl / device_opencl_impl.cpp
1 /*
2  * Copyright 2011-2013 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #ifdef WITH_OPENCL
18
19 #  include "device/opencl/device_opencl.h"
20
21 #  include "kernel/kernel_types.h"
22 #  include "kernel/split/kernel_split_data_types.h"
23
24 #  include "util/util_algorithm.h"
25 #  include "util/util_debug.h"
26 #  include "util/util_foreach.h"
27 #  include "util/util_logging.h"
28 #  include "util/util_md5.h"
29 #  include "util/util_path.h"
30 #  include "util/util_time.h"
31
32 CCL_NAMESPACE_BEGIN
33
34 struct texture_slot_t {
35   texture_slot_t(const string &name, int slot) : name(name), slot(slot)
36   {
37   }
38   string name;
39   int slot;
40 };
41
42 static const string NON_SPLIT_KERNELS =
43     "denoising "
44     "base "
45     "background "
46     "displace ";
47
48 static const string SPLIT_BUNDLE_KERNELS =
49     "data_init "
50     "path_init "
51     "state_buffer_size "
52     "scene_intersect "
53     "queue_enqueue "
54     "shader_setup "
55     "shader_sort "
56     "enqueue_inactive "
57     "next_iteration_setup "
58     "indirect_subsurface "
59     "buffer_update "
60     "adaptive_stopping "
61     "adaptive_filter_x "
62     "adaptive_filter_y "
63     "adaptive_adjust_samples";
64
65 const string OpenCLDevice::get_opencl_program_name(const string &kernel_name)
66 {
67   if (NON_SPLIT_KERNELS.find(kernel_name) != std::string::npos) {
68     return kernel_name;
69   }
70   else if (SPLIT_BUNDLE_KERNELS.find(kernel_name) != std::string::npos) {
71     return "split_bundle";
72   }
73   else {
74     return "split_" + kernel_name;
75   }
76 }
77
78 const string OpenCLDevice::get_opencl_program_filename(const string &kernel_name)
79 {
80   if (kernel_name == "denoising") {
81     return "filter.cl";
82   }
83   else if (SPLIT_BUNDLE_KERNELS.find(kernel_name) != std::string::npos) {
84     return "kernel_split_bundle.cl";
85   }
86   else {
87     return "kernel_" + kernel_name + ".cl";
88   }
89 }
90
91 /* Enable features that we always want to compile to reduce recompilation events */
92 void OpenCLDevice::enable_default_features(DeviceRequestedFeatures &features)
93 {
94   features.use_transparent = true;
95   features.use_shadow_tricks = true;
96   features.use_principled = true;
97   features.use_denoising = true;
98
99   if (!background) {
100     features.max_nodes_group = NODE_GROUP_LEVEL_MAX;
101     features.nodes_features = NODE_FEATURE_ALL;
102     features.use_hair = true;
103     features.use_subsurface = true;
104     features.use_camera_motion = false;
105     features.use_object_motion = false;
106   }
107 }
108
109 string OpenCLDevice::get_build_options(const DeviceRequestedFeatures &requested_features,
110                                        const string &opencl_program_name,
111                                        bool preview_kernel)
112 {
113   /* first check for non-split kernel programs */
114   if (opencl_program_name == "base" || opencl_program_name == "denoising") {
115     return "";
116   }
117   else if (opencl_program_name == "bake") {
118     /* Note: get_build_options for bake is only requested when baking is enabled.
119      * displace and background are always requested.
120      * `__SPLIT_KERNEL__` must not be present in the compile directives for bake */
121     DeviceRequestedFeatures features(requested_features);
122     enable_default_features(features);
123     features.use_denoising = false;
124     features.use_object_motion = false;
125     features.use_camera_motion = false;
126     features.use_hair = true;
127     features.use_subsurface = true;
128     features.max_nodes_group = NODE_GROUP_LEVEL_MAX;
129     features.nodes_features = NODE_FEATURE_ALL;
130     features.use_integrator_branched = false;
131     return features.get_build_options();
132   }
133   else if (opencl_program_name == "displace") {
134     /* As displacement does not use any nodes from the Shading group (eg BSDF).
135      * We disable all features that are related to shading. */
136     DeviceRequestedFeatures features(requested_features);
137     enable_default_features(features);
138     features.use_denoising = false;
139     features.use_object_motion = false;
140     features.use_camera_motion = false;
141     features.use_baking = false;
142     features.use_transparent = false;
143     features.use_shadow_tricks = false;
144     features.use_subsurface = false;
145     features.use_volume = false;
146     features.nodes_features &= ~NODE_FEATURE_VOLUME;
147     features.use_denoising = false;
148     features.use_principled = false;
149     features.use_integrator_branched = false;
150     return features.get_build_options();
151   }
152   else if (opencl_program_name == "background") {
153     /* Background uses Background shading
154      * It is save to disable shadow features, subsurface and volumetric. */
155     DeviceRequestedFeatures features(requested_features);
156     enable_default_features(features);
157     features.use_baking = false;
158     features.use_object_motion = false;
159     features.use_camera_motion = false;
160     features.use_transparent = false;
161     features.use_shadow_tricks = false;
162     features.use_denoising = false;
163     /* NOTE: currently possible to use surface nodes like `Hair Info`, `Bump` node.
164      * Perhaps we should remove them in UI as it does not make any sense when
165      * rendering background. */
166     features.nodes_features &= ~NODE_FEATURE_VOLUME;
167     features.use_subsurface = false;
168     features.use_volume = false;
169     features.use_shader_raytrace = false;
170     features.use_patch_evaluation = false;
171     features.use_integrator_branched = false;
172     return features.get_build_options();
173   }
174
175   string build_options = "-D__SPLIT_KERNEL__ ";
176   /* Set compute device build option. */
177   cl_device_type device_type;
178   OpenCLInfo::get_device_type(this->cdDevice, &device_type, &this->ciErr);
179   assert(this->ciErr == CL_SUCCESS);
180   if (device_type == CL_DEVICE_TYPE_GPU) {
181     build_options += "-D__COMPUTE_DEVICE_GPU__ ";
182   }
183
184   DeviceRequestedFeatures nofeatures;
185   enable_default_features(nofeatures);
186
187   /* Add program specific optimized compile directives */
188   if (preview_kernel) {
189     DeviceRequestedFeatures preview_features;
190     preview_features.use_hair = true;
191     build_options += "-D__KERNEL_AO_PREVIEW__ ";
192     build_options += preview_features.get_build_options();
193   }
194   else if (opencl_program_name == "split_do_volume" && !requested_features.use_volume) {
195     build_options += nofeatures.get_build_options();
196   }
197   else {
198     DeviceRequestedFeatures features(requested_features);
199     enable_default_features(features);
200
201     /* Always turn off baking at this point. Baking is only useful when building the bake kernel.
202      * this also makes sure that the kernels that are build during baking can be reused
203      * when not doing any baking. */
204     features.use_baking = false;
205
206     /* Do not vary on shaders when program doesn't do any shading.
207      * We have bundled them in a single program. */
208     if (opencl_program_name == "split_bundle") {
209       features.max_nodes_group = 0;
210       features.nodes_features = 0;
211       features.use_shader_raytrace = false;
212     }
213
214     /* No specific settings, just add the regular ones */
215     build_options += features.get_build_options();
216   }
217
218   return build_options;
219 }
220
221 OpenCLDevice::OpenCLSplitPrograms::OpenCLSplitPrograms(OpenCLDevice *device_)
222 {
223   device = device_;
224 }
225
226 OpenCLDevice::OpenCLSplitPrograms::~OpenCLSplitPrograms()
227 {
228   program_split.release();
229   program_lamp_emission.release();
230   program_do_volume.release();
231   program_indirect_background.release();
232   program_shader_eval.release();
233   program_holdout_emission_blurring_pathtermination_ao.release();
234   program_subsurface_scatter.release();
235   program_direct_lighting.release();
236   program_shadow_blocked_ao.release();
237   program_shadow_blocked_dl.release();
238 }
239
240 void OpenCLDevice::OpenCLSplitPrograms::load_kernels(
241     vector<OpenCLProgram *> &programs,
242     const DeviceRequestedFeatures &requested_features,
243     bool is_preview)
244 {
245   if (!requested_features.use_baking) {
246 #  define ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(kernel_name) \
247     program_split.add_kernel(ustring("path_trace_" #kernel_name));
248 #  define ADD_SPLIT_KERNEL_PROGRAM(kernel_name) \
249     const string program_name_##kernel_name = "split_" #kernel_name; \
250     program_##kernel_name = OpenCLDevice::OpenCLProgram( \
251         device, \
252         program_name_##kernel_name, \
253         "kernel_" #kernel_name ".cl", \
254         device->get_build_options(requested_features, program_name_##kernel_name, is_preview)); \
255     program_##kernel_name.add_kernel(ustring("path_trace_" #kernel_name)); \
256     programs.push_back(&program_##kernel_name);
257
258     /* Ordered with most complex kernels first, to reduce overall compile time. */
259     ADD_SPLIT_KERNEL_PROGRAM(subsurface_scatter);
260     ADD_SPLIT_KERNEL_PROGRAM(direct_lighting);
261     ADD_SPLIT_KERNEL_PROGRAM(indirect_background);
262     if (requested_features.use_volume || is_preview) {
263       ADD_SPLIT_KERNEL_PROGRAM(do_volume);
264     }
265     ADD_SPLIT_KERNEL_PROGRAM(shader_eval);
266     ADD_SPLIT_KERNEL_PROGRAM(lamp_emission);
267     ADD_SPLIT_KERNEL_PROGRAM(holdout_emission_blurring_pathtermination_ao);
268     ADD_SPLIT_KERNEL_PROGRAM(shadow_blocked_dl);
269     ADD_SPLIT_KERNEL_PROGRAM(shadow_blocked_ao);
270
271     /* Quick kernels bundled in a single program to reduce overhead of starting
272      * Blender processes. */
273     program_split = OpenCLDevice::OpenCLProgram(
274         device,
275         "split_bundle",
276         "kernel_split_bundle.cl",
277         device->get_build_options(requested_features, "split_bundle", is_preview));
278
279     ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(data_init);
280     ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(state_buffer_size);
281     ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(path_init);
282     ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(scene_intersect);
283     ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(queue_enqueue);
284     ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(shader_setup);
285     ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(shader_sort);
286     ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(enqueue_inactive);
287     ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(next_iteration_setup);
288     ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(indirect_subsurface);
289     ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(buffer_update);
290     ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_stopping);
291     ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_filter_x);
292     ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_filter_y);
293     ADD_SPLIT_KERNEL_BUNDLE_PROGRAM(adaptive_adjust_samples);
294     programs.push_back(&program_split);
295
296 #  undef ADD_SPLIT_KERNEL_PROGRAM
297 #  undef ADD_SPLIT_KERNEL_BUNDLE_PROGRAM
298   }
299 }
300
301 namespace {
302
303 /* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to
304  * fetch its size.
305  */
306 typedef struct KernelGlobalsDummy {
307   ccl_constant KernelData *data;
308   ccl_global char *buffers[8];
309
310 #  define KERNEL_TEX(type, name) TextureInfo name;
311 #  include "kernel/kernel_textures.h"
312 #  undef KERNEL_TEX
313   SplitData split_data;
314   SplitParams split_param_data;
315 } KernelGlobalsDummy;
316
317 }  // namespace
318
319 struct CachedSplitMemory {
320   int id;
321   device_memory *split_data;
322   device_memory *ray_state;
323   device_memory *queue_index;
324   device_memory *use_queues_flag;
325   device_memory *work_pools;
326   device_ptr *buffer;
327 };
328
329 class OpenCLSplitKernelFunction : public SplitKernelFunction {
330  public:
331   OpenCLDevice *device;
332   OpenCLDevice::OpenCLProgram program;
333   CachedSplitMemory &cached_memory;
334   int cached_id;
335
336   OpenCLSplitKernelFunction(OpenCLDevice *device, CachedSplitMemory &cached_memory)
337       : device(device), cached_memory(cached_memory), cached_id(cached_memory.id - 1)
338   {
339   }
340
341   ~OpenCLSplitKernelFunction()
342   {
343     program.release();
344   }
345
346   virtual bool enqueue(const KernelDimensions &dim, device_memory &kg, device_memory &data)
347   {
348     if (cached_id != cached_memory.id) {
349       cl_uint start_arg_index = device->kernel_set_args(
350           program(), 0, kg, data, *cached_memory.split_data, *cached_memory.ray_state);
351
352       device->set_kernel_arg_buffers(program(), &start_arg_index);
353
354       start_arg_index += device->kernel_set_args(program(),
355                                                  start_arg_index,
356                                                  *cached_memory.queue_index,
357                                                  *cached_memory.use_queues_flag,
358                                                  *cached_memory.work_pools,
359                                                  *cached_memory.buffer);
360
361       cached_id = cached_memory.id;
362     }
363
364     device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
365                                            program(),
366                                            2,
367                                            NULL,
368                                            dim.global_size,
369                                            dim.local_size,
370                                            0,
371                                            NULL,
372                                            NULL);
373
374     device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
375
376     if (device->ciErr != CL_SUCCESS) {
377       string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
378                                      clewErrorString(device->ciErr));
379       device->opencl_error(message);
380       return false;
381     }
382
383     return true;
384   }
385 };
386
387 class OpenCLSplitKernel : public DeviceSplitKernel {
388   OpenCLDevice *device;
389   CachedSplitMemory cached_memory;
390
391  public:
392   explicit OpenCLSplitKernel(OpenCLDevice *device) : DeviceSplitKernel(device), device(device)
393   {
394   }
395
396   virtual SplitKernelFunction *get_split_kernel_function(
397       const string &kernel_name, const DeviceRequestedFeatures &requested_features)
398   {
399     OpenCLSplitKernelFunction *kernel = new OpenCLSplitKernelFunction(device, cached_memory);
400
401     const string program_name = device->get_opencl_program_name(kernel_name);
402     kernel->program = OpenCLDevice::OpenCLProgram(
403         device,
404         program_name,
405         device->get_opencl_program_filename(kernel_name),
406         device->get_build_options(requested_features, program_name, device->use_preview_kernels));
407
408     kernel->program.add_kernel(ustring("path_trace_" + kernel_name));
409     kernel->program.load();
410
411     if (!kernel->program.is_loaded()) {
412       delete kernel;
413       return NULL;
414     }
415
416     return kernel;
417   }
418
419   virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads)
420   {
421     device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE);
422     size_buffer.alloc(1);
423     size_buffer.zero_to_device();
424
425     uint threads = num_threads;
426     OpenCLDevice::OpenCLSplitPrograms *programs = device->get_split_programs();
427     cl_kernel kernel_state_buffer_size = programs->program_split(
428         ustring("path_trace_state_buffer_size"));
429     device->kernel_set_args(kernel_state_buffer_size, 0, kg, data, threads, size_buffer);
430
431     size_t global_size = 64;
432     device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
433                                            kernel_state_buffer_size,
434                                            1,
435                                            NULL,
436                                            &global_size,
437                                            NULL,
438                                            0,
439                                            NULL,
440                                            NULL);
441
442     device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
443
444     size_buffer.copy_from_device(0, 1, 1);
445     size_t size = size_buffer[0];
446     size_buffer.free();
447
448     if (device->ciErr != CL_SUCCESS) {
449       string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
450                                      clewErrorString(device->ciErr));
451       device->opencl_error(message);
452       return 0;
453     }
454
455     return size;
456   }
457
458   virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim,
459                                               RenderTile &rtile,
460                                               int num_global_elements,
461                                               device_memory &kernel_globals,
462                                               device_memory &kernel_data,
463                                               device_memory &split_data,
464                                               device_memory &ray_state,
465                                               device_memory &queue_index,
466                                               device_memory &use_queues_flag,
467                                               device_memory &work_pool_wgs)
468   {
469     cl_int dQueue_size = dim.global_size[0] * dim.global_size[1];
470
471     /* Set the range of samples to be processed for every ray in
472      * path-regeneration logic.
473      */
474     cl_int start_sample = rtile.start_sample;
475     cl_int end_sample = rtile.start_sample + rtile.num_samples;
476
477     OpenCLDevice::OpenCLSplitPrograms *programs = device->get_split_programs();
478     cl_kernel kernel_data_init = programs->program_split(ustring("path_trace_data_init"));
479
480     cl_uint start_arg_index = device->kernel_set_args(kernel_data_init,
481                                                       0,
482                                                       kernel_globals,
483                                                       kernel_data,
484                                                       split_data,
485                                                       num_global_elements,
486                                                       ray_state);
487
488     device->set_kernel_arg_buffers(kernel_data_init, &start_arg_index);
489
490     start_arg_index += device->kernel_set_args(kernel_data_init,
491                                                start_arg_index,
492                                                start_sample,
493                                                end_sample,
494                                                rtile.x,
495                                                rtile.y,
496                                                rtile.w,
497                                                rtile.h,
498                                                rtile.offset,
499                                                rtile.stride,
500                                                queue_index,
501                                                dQueue_size,
502                                                use_queues_flag,
503                                                work_pool_wgs,
504                                                rtile.num_samples,
505                                                rtile.buffer);
506
507     /* Enqueue ckPathTraceKernel_data_init kernel. */
508     device->ciErr = clEnqueueNDRangeKernel(device->cqCommandQueue,
509                                            kernel_data_init,
510                                            2,
511                                            NULL,
512                                            dim.global_size,
513                                            dim.local_size,
514                                            0,
515                                            NULL,
516                                            NULL);
517
518     device->opencl_assert_err(device->ciErr, "clEnqueueNDRangeKernel");
519
520     if (device->ciErr != CL_SUCCESS) {
521       string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()",
522                                      clewErrorString(device->ciErr));
523       device->opencl_error(message);
524       return false;
525     }
526
527     cached_memory.split_data = &split_data;
528     cached_memory.ray_state = &ray_state;
529     cached_memory.queue_index = &queue_index;
530     cached_memory.use_queues_flag = &use_queues_flag;
531     cached_memory.work_pools = &work_pool_wgs;
532     cached_memory.buffer = &rtile.buffer;
533     cached_memory.id++;
534
535     return true;
536   }
537
538   virtual int2 split_kernel_local_size()
539   {
540     return make_int2(64, 1);
541   }
542
543   virtual int2 split_kernel_global_size(device_memory &kg,
544                                         device_memory &data,
545                                         DeviceTask & /*task*/)
546   {
547     cl_device_type type = OpenCLInfo::get_device_type(device->cdDevice);
548     /* Use small global size on CPU devices as it seems to be much faster. */
549     if (type == CL_DEVICE_TYPE_CPU) {
550       VLOG(1) << "Global size: (64, 64).";
551       return make_int2(64, 64);
552     }
553
554     cl_ulong max_buffer_size;
555     clGetDeviceInfo(
556         device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL);
557
558     if (DebugFlags().opencl.mem_limit) {
559       max_buffer_size = min(max_buffer_size,
560                             cl_ulong(DebugFlags().opencl.mem_limit - device->stats.mem_used));
561     }
562
563     VLOG(1) << "Maximum device allocation size: " << string_human_readable_number(max_buffer_size)
564             << " bytes. (" << string_human_readable_size(max_buffer_size) << ").";
565
566     /* Limit to 2gb, as we shouldn't need more than that and some devices may support much more. */
567     max_buffer_size = min(max_buffer_size / 2, (cl_ulong)2l * 1024 * 1024 * 1024);
568
569     size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size);
570     int2 global_size = make_int2(max(round_down((int)sqrt(num_elements), 64), 64),
571                                  (int)sqrt(num_elements));
572
573     if (device->info.description.find("Intel") != string::npos) {
574       global_size = make_int2(min(512, global_size.x), min(512, global_size.y));
575     }
576
577     VLOG(1) << "Global size: " << global_size << ".";
578     return global_size;
579   }
580 };
581
582 bool OpenCLDevice::opencl_error(cl_int err)
583 {
584   if (err != CL_SUCCESS) {
585     string message = string_printf("OpenCL error (%d): %s", err, clewErrorString(err));
586     if (error_msg == "")
587       error_msg = message;
588     fprintf(stderr, "%s\n", message.c_str());
589     return true;
590   }
591
592   return false;
593 }
594
595 void OpenCLDevice::opencl_error(const string &message)
596 {
597   if (error_msg == "")
598     error_msg = message;
599   fprintf(stderr, "%s\n", message.c_str());
600 }
601
602 void OpenCLDevice::opencl_assert_err(cl_int err, const char *where)
603 {
604   if (err != CL_SUCCESS) {
605     string message = string_printf(
606         "OpenCL error (%d): %s in %s", err, clewErrorString(err), where);
607     if (error_msg == "")
608       error_msg = message;
609     fprintf(stderr, "%s\n", message.c_str());
610 #  ifndef NDEBUG
611     abort();
612 #  endif
613   }
614 }
615
616 OpenCLDevice::OpenCLDevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
617     : Device(info, stats, profiler, background),
618       load_kernel_num_compiling(0),
619       kernel_programs(this),
620       preview_programs(this),
621       memory_manager(this),
622       texture_info(this, "__texture_info", MEM_GLOBAL)
623 {
624   cpPlatform = NULL;
625   cdDevice = NULL;
626   cxContext = NULL;
627   cqCommandQueue = NULL;
628   device_initialized = false;
629   textures_need_update = true;
630   use_preview_kernels = !background;
631
632   vector<OpenCLPlatformDevice> usable_devices;
633   OpenCLInfo::get_usable_devices(&usable_devices);
634   if (usable_devices.size() == 0) {
635     opencl_error("OpenCL: no devices found.");
636     return;
637   }
638   assert(info.num < usable_devices.size());
639   OpenCLPlatformDevice &platform_device = usable_devices[info.num];
640   device_num = info.num;
641   cpPlatform = platform_device.platform_id;
642   cdDevice = platform_device.device_id;
643   platform_name = platform_device.platform_name;
644   device_name = platform_device.device_name;
645   VLOG(2) << "Creating new Cycles device for OpenCL platform " << platform_name << ", device "
646           << device_name << ".";
647
648   {
649     /* try to use cached context */
650     thread_scoped_lock cache_locker;
651     cxContext = OpenCLCache::get_context(cpPlatform, cdDevice, cache_locker);
652
653     if (cxContext == NULL) {
654       /* create context properties array to specify platform */
655       const cl_context_properties context_props[] = {
656           CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform, 0, 0};
657
658       /* create context */
659       cxContext = clCreateContext(
660           context_props, 1, &cdDevice, context_notify_callback, cdDevice, &ciErr);
661
662       if (opencl_error(ciErr)) {
663         opencl_error("OpenCL: clCreateContext failed");
664         return;
665       }
666
667       /* cache it */
668       OpenCLCache::store_context(cpPlatform, cdDevice, cxContext, cache_locker);
669     }
670   }
671
672   cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr);
673   if (opencl_error(ciErr)) {
674     opencl_error("OpenCL: Error creating command queue");
675     return;
676   }
677
678   /* Allocate this right away so that texture_info
679    * is placed at offset 0 in the device memory buffers. */
680   texture_info.resize(1);
681   memory_manager.alloc("texture_info", texture_info);
682
683   device_initialized = true;
684
685   split_kernel = new OpenCLSplitKernel(this);
686   if (use_preview_kernels) {
687     load_preview_kernels();
688   }
689 }
690
691 OpenCLDevice::~OpenCLDevice()
692 {
693   task_pool.cancel();
694   load_required_kernel_task_pool.cancel();
695   load_kernel_task_pool.cancel();
696
697   memory_manager.free();
698
699   ConstMemMap::iterator mt;
700   for (mt = const_mem_map.begin(); mt != const_mem_map.end(); mt++) {
701     delete mt->second;
702   }
703
704   base_program.release();
705   bake_program.release();
706   displace_program.release();
707   background_program.release();
708   denoising_program.release();
709
710   if (cqCommandQueue)
711     clReleaseCommandQueue(cqCommandQueue);
712   if (cxContext)
713     clReleaseContext(cxContext);
714
715   delete split_kernel;
716 }
717
718 void CL_CALLBACK OpenCLDevice::context_notify_callback(const char *err_info,
719                                                        const void * /*private_info*/,
720                                                        size_t /*cb*/,
721                                                        void *user_data)
722 {
723   string device_name = OpenCLInfo::get_device_name((cl_device_id)user_data);
724   fprintf(stderr, "OpenCL error (%s): %s\n", device_name.c_str(), err_info);
725 }
726
727 bool OpenCLDevice::opencl_version_check()
728 {
729   string error;
730   if (!OpenCLInfo::platform_version_check(cpPlatform, &error)) {
731     opencl_error(error);
732     return false;
733   }
734   if (!OpenCLInfo::device_version_check(cdDevice, &error)) {
735     opencl_error(error);
736     return false;
737   }
738   return true;
739 }
740
741 string OpenCLDevice::device_md5_hash(string kernel_custom_build_options)
742 {
743   MD5Hash md5;
744   char version[256], driver[256], name[256], vendor[256];
745
746   clGetPlatformInfo(cpPlatform, CL_PLATFORM_VENDOR, sizeof(vendor), &vendor, NULL);
747   clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL);
748   clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(name), &name, NULL);
749   clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(driver), &driver, NULL);
750
751   md5.append((uint8_t *)vendor, strlen(vendor));
752   md5.append((uint8_t *)version, strlen(version));
753   md5.append((uint8_t *)name, strlen(name));
754   md5.append((uint8_t *)driver, strlen(driver));
755
756   string options = kernel_build_options();
757   options += kernel_custom_build_options;
758   md5.append((uint8_t *)options.c_str(), options.size());
759
760   return md5.get_hex();
761 }
762
763 bool OpenCLDevice::load_kernels(const DeviceRequestedFeatures &requested_features)
764 {
765   VLOG(2) << "Loading kernels for platform " << platform_name << ", device " << device_name << ".";
766   /* Verify if device was initialized. */
767   if (!device_initialized) {
768     fprintf(stderr, "OpenCL: failed to initialize device.\n");
769     return false;
770   }
771
772   /* Verify we have right opencl version. */
773   if (!opencl_version_check())
774     return false;
775
776   load_required_kernels(requested_features);
777
778   vector<OpenCLProgram *> programs;
779   kernel_programs.load_kernels(programs, requested_features, false);
780
781   if (!requested_features.use_baking && requested_features.use_denoising) {
782     denoising_program = OpenCLProgram(
783         this, "denoising", "filter.cl", get_build_options(requested_features, "denoising"));
784     denoising_program.add_kernel(ustring("filter_divide_shadow"));
785     denoising_program.add_kernel(ustring("filter_get_feature"));
786     denoising_program.add_kernel(ustring("filter_write_feature"));
787     denoising_program.add_kernel(ustring("filter_detect_outliers"));
788     denoising_program.add_kernel(ustring("filter_combine_halves"));
789     denoising_program.add_kernel(ustring("filter_construct_transform"));
790     denoising_program.add_kernel(ustring("filter_nlm_calc_difference"));
791     denoising_program.add_kernel(ustring("filter_nlm_blur"));
792     denoising_program.add_kernel(ustring("filter_nlm_calc_weight"));
793     denoising_program.add_kernel(ustring("filter_nlm_update_output"));
794     denoising_program.add_kernel(ustring("filter_nlm_normalize"));
795     denoising_program.add_kernel(ustring("filter_nlm_construct_gramian"));
796     denoising_program.add_kernel(ustring("filter_finalize"));
797     programs.push_back(&denoising_program);
798   }
799
800   load_required_kernel_task_pool.wait_work();
801
802   /* Parallel compilation of Cycles kernels, this launches multiple
803    * processes to workaround OpenCL frameworks serializing the calls
804    * internally within a single process. */
805   foreach (OpenCLProgram *program, programs) {
806     if (!program->load()) {
807       load_kernel_num_compiling++;
808       load_kernel_task_pool.push([=] {
809         program->compile();
810         load_kernel_num_compiling--;
811       });
812     }
813   }
814   return true;
815 }
816
817 void OpenCLDevice::load_required_kernels(const DeviceRequestedFeatures &requested_features)
818 {
819   vector<OpenCLProgram *> programs;
820   base_program = OpenCLProgram(
821       this, "base", "kernel_base.cl", get_build_options(requested_features, "base"));
822   base_program.add_kernel(ustring("convert_to_byte"));
823   base_program.add_kernel(ustring("convert_to_half_float"));
824   base_program.add_kernel(ustring("zero_buffer"));
825   programs.push_back(&base_program);
826
827   if (requested_features.use_true_displacement) {
828     displace_program = OpenCLProgram(
829         this, "displace", "kernel_displace.cl", get_build_options(requested_features, "displace"));
830     displace_program.add_kernel(ustring("displace"));
831     programs.push_back(&displace_program);
832   }
833
834   if (requested_features.use_background_light) {
835     background_program = OpenCLProgram(this,
836                                        "background",
837                                        "kernel_background.cl",
838                                        get_build_options(requested_features, "background"));
839     background_program.add_kernel(ustring("background"));
840     programs.push_back(&background_program);
841   }
842
843   if (requested_features.use_baking) {
844     bake_program = OpenCLProgram(
845         this, "bake", "kernel_bake.cl", get_build_options(requested_features, "bake"));
846     bake_program.add_kernel(ustring("bake"));
847     programs.push_back(&bake_program);
848   }
849
850   foreach (OpenCLProgram *program, programs) {
851     if (!program->load()) {
852       load_required_kernel_task_pool.push(function_bind(&OpenCLProgram::compile, program));
853     }
854   }
855 }
856
857 void OpenCLDevice::load_preview_kernels()
858 {
859   DeviceRequestedFeatures no_features;
860   vector<OpenCLProgram *> programs;
861   preview_programs.load_kernels(programs, no_features, true);
862
863   foreach (OpenCLProgram *program, programs) {
864     if (!program->load()) {
865       load_required_kernel_task_pool.push(function_bind(&OpenCLProgram::compile, program));
866     }
867   }
868 }
869
870 bool OpenCLDevice::wait_for_availability(const DeviceRequestedFeatures &requested_features)
871 {
872   if (requested_features.use_baking) {
873     /* For baking, kernels have already been loaded in load_required_kernels(). */
874     return true;
875   }
876
877   if (background) {
878     load_kernel_task_pool.wait_work();
879     use_preview_kernels = false;
880   }
881   else {
882     /* We use a device setting to determine to load preview kernels or not
883      * Better to check on device level than per kernel as mixing preview and
884      * non-preview kernels does not work due to different data types */
885     if (use_preview_kernels) {
886       use_preview_kernels = load_kernel_num_compiling.load() > 0;
887     }
888   }
889   return split_kernel->load_kernels(requested_features);
890 }
891
892 OpenCLDevice::OpenCLSplitPrograms *OpenCLDevice::get_split_programs()
893 {
894   return use_preview_kernels ? &preview_programs : &kernel_programs;
895 }
896
897 DeviceKernelStatus OpenCLDevice::get_active_kernel_switch_state()
898 {
899   /* Do not switch kernels for background renderings
900    * We do foreground rendering but use the preview kernels
901    * Check for the optimized kernels
902    *
903    * This works also the other way around, where we are using
904    * optimized kernels but new ones are being compiled due
905    * to other features that are needed */
906   if (background) {
907     /* The if-statements below would find the same result,
908      * But as the `finished` method uses a mutex we added
909      * this as an early exit */
910     return DEVICE_KERNEL_USING_FEATURE_KERNEL;
911   }
912
913   bool other_kernels_finished = load_kernel_num_compiling.load() == 0;
914   if (use_preview_kernels) {
915     if (other_kernels_finished) {
916       return DEVICE_KERNEL_FEATURE_KERNEL_AVAILABLE;
917     }
918     else {
919       return DEVICE_KERNEL_WAITING_FOR_FEATURE_KERNEL;
920     }
921   }
922   else {
923     if (other_kernels_finished) {
924       return DEVICE_KERNEL_USING_FEATURE_KERNEL;
925     }
926     else {
927       return DEVICE_KERNEL_FEATURE_KERNEL_INVALID;
928     }
929   }
930 }
931
932 void OpenCLDevice::mem_alloc(device_memory &mem)
933 {
934   if (mem.name) {
935     VLOG(1) << "Buffer allocate: " << mem.name << ", "
936             << string_human_readable_number(mem.memory_size()) << " bytes. ("
937             << string_human_readable_size(mem.memory_size()) << ")";
938   }
939
940   size_t size = mem.memory_size();
941
942   /* check there is enough memory available for the allocation */
943   cl_ulong max_alloc_size = 0;
944   clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_alloc_size, NULL);
945
946   if (DebugFlags().opencl.mem_limit) {
947     max_alloc_size = min(max_alloc_size, cl_ulong(DebugFlags().opencl.mem_limit - stats.mem_used));
948   }
949
950   if (size > max_alloc_size) {
951     string error = "Scene too complex to fit in available memory.";
952     if (mem.name != NULL) {
953       error += string_printf(" (allocating buffer %s failed.)", mem.name);
954     }
955     set_error(error);
956
957     return;
958   }
959
960   cl_mem_flags mem_flag;
961   void *mem_ptr = NULL;
962
963   if (mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL)
964     mem_flag = CL_MEM_READ_ONLY;
965   else
966     mem_flag = CL_MEM_READ_WRITE;
967
968   /* Zero-size allocation might be invoked by render, but not really
969    * supported by OpenCL. Using NULL as device pointer also doesn't really
970    * work for some reason, so for the time being we'll use special case
971    * will null_mem buffer.
972    */
973   if (size != 0) {
974     mem.device_pointer = (device_ptr)clCreateBuffer(cxContext, mem_flag, size, mem_ptr, &ciErr);
975     opencl_assert_err(ciErr, "clCreateBuffer");
976   }
977   else {
978     mem.device_pointer = 0;
979   }
980
981   stats.mem_alloc(size);
982   mem.device_size = size;
983 }
984
985 void OpenCLDevice::mem_copy_to(device_memory &mem)
986 {
987   if (mem.type == MEM_GLOBAL) {
988     global_free(mem);
989     global_alloc(mem);
990   }
991   else if (mem.type == MEM_TEXTURE) {
992     tex_free((device_texture &)mem);
993     tex_alloc((device_texture &)mem);
994   }
995   else {
996     if (!mem.device_pointer) {
997       mem_alloc(mem);
998     }
999
1000     /* this is blocking */
1001     size_t size = mem.memory_size();
1002     if (size != 0) {
1003       opencl_assert(clEnqueueWriteBuffer(cqCommandQueue,
1004                                          CL_MEM_PTR(mem.device_pointer),
1005                                          CL_TRUE,
1006                                          0,
1007                                          size,
1008                                          mem.host_pointer,
1009                                          0,
1010                                          NULL,
1011                                          NULL));
1012     }
1013   }
1014 }
1015
1016 void OpenCLDevice::mem_copy_from(device_memory &mem, int y, int w, int h, int elem)
1017 {
1018   size_t offset = elem * y * w;
1019   size_t size = elem * w * h;
1020   assert(size != 0);
1021   opencl_assert(clEnqueueReadBuffer(cqCommandQueue,
1022                                     CL_MEM_PTR(mem.device_pointer),
1023                                     CL_TRUE,
1024                                     offset,
1025                                     size,
1026                                     (uchar *)mem.host_pointer + offset,
1027                                     0,
1028                                     NULL,
1029                                     NULL));
1030 }
1031
1032 void OpenCLDevice::mem_zero_kernel(device_ptr mem, size_t size)
1033 {
1034   base_program.wait_for_availability();
1035   cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer"));
1036
1037   size_t global_size[] = {1024, 1024};
1038   size_t num_threads = global_size[0] * global_size[1];
1039
1040   cl_mem d_buffer = CL_MEM_PTR(mem);
1041   cl_ulong d_offset = 0;
1042   cl_ulong d_size = 0;
1043
1044   while (d_offset < size) {
1045     d_size = std::min<cl_ulong>(num_threads * sizeof(float4), size - d_offset);
1046
1047     kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset);
1048
1049     ciErr = clEnqueueNDRangeKernel(
1050         cqCommandQueue, ckZeroBuffer, 2, NULL, global_size, NULL, 0, NULL, NULL);
1051     opencl_assert_err(ciErr, "clEnqueueNDRangeKernel");
1052
1053     d_offset += d_size;
1054   }
1055 }
1056
1057 void OpenCLDevice::mem_zero(device_memory &mem)
1058 {
1059   if (!mem.device_pointer) {
1060     mem_alloc(mem);
1061   }
1062
1063   if (mem.device_pointer) {
1064     if (base_program.is_loaded()) {
1065       mem_zero_kernel(mem.device_pointer, mem.memory_size());
1066     }
1067
1068     if (mem.host_pointer) {
1069       memset(mem.host_pointer, 0, mem.memory_size());
1070     }
1071
1072     if (!base_program.is_loaded()) {
1073       void *zero = mem.host_pointer;
1074
1075       if (!mem.host_pointer) {
1076         zero = util_aligned_malloc(mem.memory_size(), 16);
1077         memset(zero, 0, mem.memory_size());
1078       }
1079
1080       opencl_assert(clEnqueueWriteBuffer(cqCommandQueue,
1081                                          CL_MEM_PTR(mem.device_pointer),
1082                                          CL_TRUE,
1083                                          0,
1084                                          mem.memory_size(),
1085                                          zero,
1086                                          0,
1087                                          NULL,
1088                                          NULL));
1089
1090       if (!mem.host_pointer) {
1091         util_aligned_free(zero);
1092       }
1093     }
1094   }
1095 }
1096
1097 void OpenCLDevice::mem_free(device_memory &mem)
1098 {
1099   if (mem.type == MEM_GLOBAL) {
1100     global_free(mem);
1101   }
1102   else if (mem.type == MEM_TEXTURE) {
1103     tex_free((device_texture &)mem);
1104   }
1105   else {
1106     if (mem.device_pointer) {
1107       if (mem.device_pointer != 0) {
1108         opencl_assert(clReleaseMemObject(CL_MEM_PTR(mem.device_pointer)));
1109       }
1110       mem.device_pointer = 0;
1111
1112       stats.mem_free(mem.device_size);
1113       mem.device_size = 0;
1114     }
1115   }
1116 }
1117
1118 int OpenCLDevice::mem_sub_ptr_alignment()
1119 {
1120   return OpenCLInfo::mem_sub_ptr_alignment(cdDevice);
1121 }
1122
1123 device_ptr OpenCLDevice::mem_alloc_sub_ptr(device_memory &mem, int offset, int size)
1124 {
1125   cl_mem_flags mem_flag;
1126   if (mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL)
1127     mem_flag = CL_MEM_READ_ONLY;
1128   else
1129     mem_flag = CL_MEM_READ_WRITE;
1130
1131   cl_buffer_region info;
1132   info.origin = mem.memory_elements_size(offset);
1133   info.size = mem.memory_elements_size(size);
1134
1135   device_ptr sub_buf = (device_ptr)clCreateSubBuffer(
1136       CL_MEM_PTR(mem.device_pointer), mem_flag, CL_BUFFER_CREATE_TYPE_REGION, &info, &ciErr);
1137   opencl_assert_err(ciErr, "clCreateSubBuffer");
1138   return sub_buf;
1139 }
1140
1141 void OpenCLDevice::mem_free_sub_ptr(device_ptr device_pointer)
1142 {
1143   if (device_pointer != 0) {
1144     opencl_assert(clReleaseMemObject(CL_MEM_PTR(device_pointer)));
1145   }
1146 }
1147
1148 void OpenCLDevice::const_copy_to(const char *name, void *host, size_t size)
1149 {
1150   ConstMemMap::iterator i = const_mem_map.find(name);
1151   device_vector<uchar> *data;
1152
1153   if (i == const_mem_map.end()) {
1154     data = new device_vector<uchar>(this, name, MEM_READ_ONLY);
1155     data->alloc(size);
1156     const_mem_map.insert(ConstMemMap::value_type(name, data));
1157   }
1158   else {
1159     data = i->second;
1160   }
1161
1162   memcpy(data->data(), host, size);
1163   data->copy_to_device();
1164 }
1165
1166 void OpenCLDevice::global_alloc(device_memory &mem)
1167 {
1168   VLOG(1) << "Global memory allocate: " << mem.name << ", "
1169           << string_human_readable_number(mem.memory_size()) << " bytes. ("
1170           << string_human_readable_size(mem.memory_size()) << ")";
1171
1172   memory_manager.alloc(mem.name, mem);
1173   /* Set the pointer to non-null to keep code that inspects its value from thinking its
1174    * unallocated. */
1175   mem.device_pointer = 1;
1176   textures[mem.name] = &mem;
1177   textures_need_update = true;
1178 }
1179
1180 void OpenCLDevice::global_free(device_memory &mem)
1181 {
1182   if (mem.device_pointer) {
1183     mem.device_pointer = 0;
1184
1185     if (memory_manager.free(mem)) {
1186       textures_need_update = true;
1187     }
1188
1189     foreach (TexturesMap::value_type &value, textures) {
1190       if (value.second == &mem) {
1191         textures.erase(value.first);
1192         break;
1193       }
1194     }
1195   }
1196 }
1197
1198 void OpenCLDevice::tex_alloc(device_texture &mem)
1199 {
1200   VLOG(1) << "Texture allocate: " << mem.name << ", "
1201           << string_human_readable_number(mem.memory_size()) << " bytes. ("
1202           << string_human_readable_size(mem.memory_size()) << ")";
1203
1204   memory_manager.alloc(mem.name, mem);
1205   /* Set the pointer to non-null to keep code that inspects its value from thinking its
1206    * unallocated. */
1207   mem.device_pointer = 1;
1208   textures[mem.name] = &mem;
1209   textures_need_update = true;
1210 }
1211
1212 void OpenCLDevice::tex_free(device_texture &mem)
1213 {
1214   global_free(mem);
1215 }
1216
1217 size_t OpenCLDevice::global_size_round_up(int group_size, int global_size)
1218 {
1219   int r = global_size % group_size;
1220   return global_size + ((r == 0) ? 0 : group_size - r);
1221 }
1222
1223 void OpenCLDevice::enqueue_kernel(
1224     cl_kernel kernel, size_t w, size_t h, bool x_workgroups, size_t max_workgroup_size)
1225 {
1226   size_t workgroup_size, max_work_items[3];
1227
1228   clGetKernelWorkGroupInfo(
1229       kernel, cdDevice, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL);
1230   clGetDeviceInfo(
1231       cdDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, max_work_items, NULL);
1232
1233   if (max_workgroup_size > 0 && workgroup_size > max_workgroup_size) {
1234     workgroup_size = max_workgroup_size;
1235   }
1236
1237   /* Try to divide evenly over 2 dimensions. */
1238   size_t local_size[2];
1239   if (x_workgroups) {
1240     local_size[0] = workgroup_size;
1241     local_size[1] = 1;
1242   }
1243   else {
1244     size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1);
1245     local_size[0] = local_size[1] = sqrt_workgroup_size;
1246   }
1247
1248   /* Some implementations have max size 1 on 2nd dimension. */
1249   if (local_size[1] > max_work_items[1]) {
1250     local_size[0] = workgroup_size / max_work_items[1];
1251     local_size[1] = max_work_items[1];
1252   }
1253
1254   size_t global_size[2] = {global_size_round_up(local_size[0], w),
1255                            global_size_round_up(local_size[1], h)};
1256
1257   /* Vertical size of 1 is coming from bake/shade kernels where we should
1258    * not round anything up because otherwise we'll either be doing too
1259    * much work per pixel (if we don't check global ID on Y axis) or will
1260    * be checking for global ID to always have Y of 0.
1261    */
1262   if (h == 1) {
1263     global_size[h] = 1;
1264   }
1265
1266   /* run kernel */
1267   opencl_assert(
1268       clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL));
1269   opencl_assert(clFlush(cqCommandQueue));
1270 }
1271
1272 void OpenCLDevice::set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name)
1273 {
1274   cl_mem ptr;
1275
1276   MemMap::iterator i = mem_map.find(name);
1277   if (i != mem_map.end()) {
1278     ptr = CL_MEM_PTR(i->second);
1279   }
1280   else {
1281     ptr = 0;
1282   }
1283
1284   opencl_assert(clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void *)&ptr));
1285 }
1286
1287 void OpenCLDevice::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg)
1288 {
1289   flush_texture_buffers();
1290
1291   memory_manager.set_kernel_arg_buffers(kernel, narg);
1292 }
1293
1294 void OpenCLDevice::flush_texture_buffers()
1295 {
1296   if (!textures_need_update) {
1297     return;
1298   }
1299   textures_need_update = false;
1300
1301   /* Setup slots for textures. */
1302   int num_slots = 0;
1303
1304   vector<texture_slot_t> texture_slots;
1305
1306 #  define KERNEL_TEX(type, name) \
1307     if (textures.find(#name) != textures.end()) { \
1308       texture_slots.push_back(texture_slot_t(#name, num_slots)); \
1309     } \
1310     num_slots++;
1311 #  include "kernel/kernel_textures.h"
1312
1313   int num_data_slots = num_slots;
1314
1315   foreach (TexturesMap::value_type &tex, textures) {
1316     string name = tex.first;
1317     device_memory *mem = tex.second;
1318
1319     if (mem->type == MEM_TEXTURE) {
1320       const uint id = ((device_texture *)mem)->slot;
1321       texture_slots.push_back(texture_slot_t(name, num_data_slots + id));
1322       num_slots = max(num_slots, num_data_slots + id + 1);
1323     }
1324   }
1325
1326   /* Realloc texture descriptors buffer. */
1327   memory_manager.free(texture_info);
1328   texture_info.resize(num_slots);
1329   memory_manager.alloc("texture_info", texture_info);
1330
1331   /* Fill in descriptors */
1332   foreach (texture_slot_t &slot, texture_slots) {
1333     device_memory *mem = textures[slot.name];
1334     TextureInfo &info = texture_info[slot.slot];
1335
1336     MemoryManager::BufferDescriptor desc = memory_manager.get_descriptor(slot.name);
1337
1338     if (mem->type == MEM_TEXTURE) {
1339       info = ((device_texture *)mem)->info;
1340     }
1341     else {
1342       memset(&info, 0, sizeof(TextureInfo));
1343     }
1344
1345     info.data = desc.offset;
1346     info.cl_buffer = desc.device_buffer;
1347   }
1348
1349   /* Force write of descriptors. */
1350   memory_manager.free(texture_info);
1351   memory_manager.alloc("texture_info", texture_info);
1352 }
1353
1354 void OpenCLDevice::thread_run(DeviceTask &task)
1355 {
1356   flush_texture_buffers();
1357
1358   if (task.type == DeviceTask::RENDER) {
1359     RenderTile tile;
1360     DenoisingTask denoising(this, task);
1361
1362     /* Allocate buffer for kernel globals */
1363     device_only_memory<KernelGlobalsDummy> kgbuffer(this, "kernel_globals");
1364     kgbuffer.alloc_to_device(1);
1365
1366     /* Keep rendering tiles until done. */
1367     while (task.acquire_tile(this, tile, task.tile_types)) {
1368       if (tile.task == RenderTile::PATH_TRACE) {
1369         assert(tile.task == RenderTile::PATH_TRACE);
1370         scoped_timer timer(&tile.buffers->render_time);
1371
1372         split_kernel->path_trace(task, tile, kgbuffer, *const_mem_map["__data"]);
1373
1374         /* Complete kernel execution before release tile. */
1375         /* This helps in multi-device render;
1376          * The device that reaches the critical-section function
1377          * release_tile waits (stalling other devices from entering
1378          * release_tile) for all kernels to complete. If device1 (a
1379          * slow-render device) reaches release_tile first then it would
1380          * stall device2 (a fast-render device) from proceeding to render
1381          * next tile.
1382          */
1383         clFinish(cqCommandQueue);
1384       }
1385       else if (tile.task == RenderTile::BAKE) {
1386         bake(task, tile);
1387       }
1388       else if (tile.task == RenderTile::DENOISE) {
1389         tile.sample = tile.start_sample + tile.num_samples;
1390         denoise(tile, denoising);
1391         task.update_progress(&tile, tile.w * tile.h);
1392       }
1393
1394       task.release_tile(tile);
1395     }
1396
1397     kgbuffer.free();
1398   }
1399   else if (task.type == DeviceTask::SHADER) {
1400     shader(task);
1401   }
1402   else if (task.type == DeviceTask::FILM_CONVERT) {
1403     film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
1404   }
1405   else if (task.type == DeviceTask::DENOISE_BUFFER) {
1406     RenderTile tile;
1407     tile.x = task.x;
1408     tile.y = task.y;
1409     tile.w = task.w;
1410     tile.h = task.h;
1411     tile.buffer = task.buffer;
1412     tile.sample = task.sample + task.num_samples;
1413     tile.num_samples = task.num_samples;
1414     tile.start_sample = task.sample;
1415     tile.offset = task.offset;
1416     tile.stride = task.stride;
1417     tile.buffers = task.buffers;
1418
1419     DenoisingTask denoising(this, task);
1420     denoise(tile, denoising);
1421     task.update_progress(&tile, tile.w * tile.h);
1422   }
1423 }
1424
1425 void OpenCLDevice::film_convert(DeviceTask &task,
1426                                 device_ptr buffer,
1427                                 device_ptr rgba_byte,
1428                                 device_ptr rgba_half)
1429 {
1430   /* cast arguments to cl types */
1431   cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
1432   cl_mem d_rgba = (rgba_byte) ? CL_MEM_PTR(rgba_byte) : CL_MEM_PTR(rgba_half);
1433   cl_mem d_buffer = CL_MEM_PTR(buffer);
1434   cl_int d_x = task.x;
1435   cl_int d_y = task.y;
1436   cl_int d_w = task.w;
1437   cl_int d_h = task.h;
1438   cl_float d_sample_scale = 1.0f / (task.sample + 1);
1439   cl_int d_offset = task.offset;
1440   cl_int d_stride = task.stride;
1441
1442   cl_kernel ckFilmConvertKernel = (rgba_byte) ? base_program(ustring("convert_to_byte")) :
1443                                                 base_program(ustring("convert_to_half_float"));
1444
1445   cl_uint start_arg_index = kernel_set_args(ckFilmConvertKernel, 0, d_data, d_rgba, d_buffer);
1446
1447   set_kernel_arg_buffers(ckFilmConvertKernel, &start_arg_index);
1448
1449   start_arg_index += kernel_set_args(ckFilmConvertKernel,
1450                                      start_arg_index,
1451                                      d_sample_scale,
1452                                      d_x,
1453                                      d_y,
1454                                      d_w,
1455                                      d_h,
1456                                      d_offset,
1457                                      d_stride);
1458
1459   enqueue_kernel(ckFilmConvertKernel, d_w, d_h);
1460 }
1461
1462 bool OpenCLDevice::denoising_non_local_means(device_ptr image_ptr,
1463                                              device_ptr guide_ptr,
1464                                              device_ptr variance_ptr,
1465                                              device_ptr out_ptr,
1466                                              DenoisingTask *task)
1467 {
1468   int stride = task->buffer.stride;
1469   int w = task->buffer.width;
1470   int h = task->buffer.h;
1471   int r = task->nlm_state.r;
1472   int f = task->nlm_state.f;
1473   float a = task->nlm_state.a;
1474   float k_2 = task->nlm_state.k_2;
1475
1476   int pass_stride = task->buffer.pass_stride;
1477   int num_shifts = (2 * r + 1) * (2 * r + 1);
1478   int channel_offset = task->nlm_state.is_color ? task->buffer.pass_stride : 0;
1479
1480   device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride * num_shifts);
1481   device_sub_ptr blurDifference(
1482       task->buffer.temporary_mem, pass_stride * num_shifts, pass_stride * num_shifts);
1483   device_sub_ptr weightAccum(
1484       task->buffer.temporary_mem, 2 * pass_stride * num_shifts, pass_stride);
1485   cl_mem weightAccum_mem = CL_MEM_PTR(*weightAccum);
1486   cl_mem difference_mem = CL_MEM_PTR(*difference);
1487   cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference);
1488
1489   cl_mem image_mem = CL_MEM_PTR(image_ptr);
1490   cl_mem guide_mem = CL_MEM_PTR(guide_ptr);
1491   cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
1492   cl_mem out_mem = CL_MEM_PTR(out_ptr);
1493   cl_mem scale_mem = NULL;
1494
1495   mem_zero_kernel(*weightAccum, sizeof(float) * pass_stride);
1496   mem_zero_kernel(out_ptr, sizeof(float) * pass_stride);
1497
1498   cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference"));
1499   cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur"));
1500   cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight"));
1501   cl_kernel ckNLMUpdateOutput = denoising_program(ustring("filter_nlm_update_output"));
1502   cl_kernel ckNLMNormalize = denoising_program(ustring("filter_nlm_normalize"));
1503
1504   kernel_set_args(ckNLMCalcDifference,
1505                   0,
1506                   guide_mem,
1507                   variance_mem,
1508                   scale_mem,
1509                   difference_mem,
1510                   w,
1511                   h,
1512                   stride,
1513                   pass_stride,
1514                   r,
1515                   channel_offset,
1516                   0,
1517                   a,
1518                   k_2);
1519   kernel_set_args(
1520       ckNLMBlur, 0, difference_mem, blurDifference_mem, w, h, stride, pass_stride, r, f);
1521   kernel_set_args(
1522       ckNLMCalcWeight, 0, blurDifference_mem, difference_mem, w, h, stride, pass_stride, r, f);
1523   kernel_set_args(ckNLMUpdateOutput,
1524                   0,
1525                   blurDifference_mem,
1526                   image_mem,
1527                   out_mem,
1528                   weightAccum_mem,
1529                   w,
1530                   h,
1531                   stride,
1532                   pass_stride,
1533                   channel_offset,
1534                   r,
1535                   f);
1536
1537   enqueue_kernel(ckNLMCalcDifference, w * h, num_shifts, true);
1538   enqueue_kernel(ckNLMBlur, w * h, num_shifts, true);
1539   enqueue_kernel(ckNLMCalcWeight, w * h, num_shifts, true);
1540   enqueue_kernel(ckNLMBlur, w * h, num_shifts, true);
1541   enqueue_kernel(ckNLMUpdateOutput, w * h, num_shifts, true);
1542
1543   kernel_set_args(ckNLMNormalize, 0, out_mem, weightAccum_mem, w, h, stride);
1544   enqueue_kernel(ckNLMNormalize, w, h);
1545
1546   return true;
1547 }
1548
1549 bool OpenCLDevice::denoising_construct_transform(DenoisingTask *task)
1550 {
1551   cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
1552   cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
1553   cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
1554   cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer);
1555
1556   char use_time = task->buffer.use_time ? 1 : 0;
1557
1558   cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform"));
1559
1560   int arg_ofs = kernel_set_args(ckFilterConstructTransform, 0, buffer_mem, tile_info_mem);
1561   cl_mem buffers[9];
1562   for (int i = 0; i < 9; i++) {
1563     buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]);
1564     arg_ofs += kernel_set_args(ckFilterConstructTransform, arg_ofs, buffers[i]);
1565   }
1566   kernel_set_args(ckFilterConstructTransform,
1567                   arg_ofs,
1568                   transform_mem,
1569                   rank_mem,
1570                   task->filter_area,
1571                   task->rect,
1572                   task->buffer.pass_stride,
1573                   task->buffer.frame_stride,
1574                   use_time,
1575                   task->radius,
1576                   task->pca_threshold);
1577
1578   enqueue_kernel(ckFilterConstructTransform, task->storage.w, task->storage.h, 256);
1579
1580   return true;
1581 }
1582
1583 bool OpenCLDevice::denoising_accumulate(device_ptr color_ptr,
1584                                         device_ptr color_variance_ptr,
1585                                         device_ptr scale_ptr,
1586                                         int frame,
1587                                         DenoisingTask *task)
1588 {
1589   cl_mem color_mem = CL_MEM_PTR(color_ptr);
1590   cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr);
1591   cl_mem scale_mem = CL_MEM_PTR(scale_ptr);
1592
1593   cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
1594   cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
1595   cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
1596   cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer);
1597   cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer);
1598
1599   cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference"));
1600   cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur"));
1601   cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight"));
1602   cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian"));
1603
1604   int w = task->reconstruction_state.source_w;
1605   int h = task->reconstruction_state.source_h;
1606   int stride = task->buffer.stride;
1607   int frame_offset = frame * task->buffer.frame_stride;
1608   int t = task->tile_info->frames[frame];
1609   char use_time = task->buffer.use_time ? 1 : 0;
1610
1611   int r = task->radius;
1612   int pass_stride = task->buffer.pass_stride;
1613   int num_shifts = (2 * r + 1) * (2 * r + 1);
1614
1615   device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride * num_shifts);
1616   device_sub_ptr blurDifference(
1617       task->buffer.temporary_mem, pass_stride * num_shifts, pass_stride * num_shifts);
1618   cl_mem difference_mem = CL_MEM_PTR(*difference);
1619   cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference);
1620
1621   kernel_set_args(ckNLMCalcDifference,
1622                   0,
1623                   color_mem,
1624                   color_variance_mem,
1625                   scale_mem,
1626                   difference_mem,
1627                   w,
1628                   h,
1629                   stride,
1630                   pass_stride,
1631                   r,
1632                   pass_stride,
1633                   frame_offset,
1634                   1.0f,
1635                   task->nlm_k_2);
1636   kernel_set_args(
1637       ckNLMBlur, 0, difference_mem, blurDifference_mem, w, h, stride, pass_stride, r, 4);
1638   kernel_set_args(
1639       ckNLMCalcWeight, 0, blurDifference_mem, difference_mem, w, h, stride, pass_stride, r, 4);
1640   kernel_set_args(ckNLMConstructGramian,
1641                   0,
1642                   t,
1643                   blurDifference_mem,
1644                   buffer_mem,
1645                   transform_mem,
1646                   rank_mem,
1647                   XtWX_mem,
1648                   XtWY_mem,
1649                   task->reconstruction_state.filter_window,
1650                   w,
1651                   h,
1652                   stride,
1653                   pass_stride,
1654                   r,
1655                   4,
1656                   frame_offset,
1657                   use_time);
1658
1659   enqueue_kernel(ckNLMCalcDifference, w * h, num_shifts, true);
1660   enqueue_kernel(ckNLMBlur, w * h, num_shifts, true);
1661   enqueue_kernel(ckNLMCalcWeight, w * h, num_shifts, true);
1662   enqueue_kernel(ckNLMBlur, w * h, num_shifts, true);
1663   enqueue_kernel(ckNLMConstructGramian, w * h, num_shifts, true, 256);
1664
1665   return true;
1666 }
1667
1668 bool OpenCLDevice::denoising_solve(device_ptr output_ptr, DenoisingTask *task)
1669 {
1670   cl_kernel ckFinalize = denoising_program(ustring("filter_finalize"));
1671
1672   cl_mem output_mem = CL_MEM_PTR(output_ptr);
1673   cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
1674   cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer);
1675   cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer);
1676
1677   int w = task->reconstruction_state.source_w;
1678   int h = task->reconstruction_state.source_h;
1679
1680   kernel_set_args(ckFinalize,
1681                   0,
1682                   output_mem,
1683                   rank_mem,
1684                   XtWX_mem,
1685                   XtWY_mem,
1686                   task->filter_area,
1687                   task->reconstruction_state.buffer_params,
1688                   task->render_buffer.samples);
1689   enqueue_kernel(ckFinalize, w, h);
1690
1691   return true;
1692 }
1693
1694 bool OpenCLDevice::denoising_combine_halves(device_ptr a_ptr,
1695                                             device_ptr b_ptr,
1696                                             device_ptr mean_ptr,
1697                                             device_ptr variance_ptr,
1698                                             int r,
1699                                             int4 rect,
1700                                             DenoisingTask *task)
1701 {
1702   cl_mem a_mem = CL_MEM_PTR(a_ptr);
1703   cl_mem b_mem = CL_MEM_PTR(b_ptr);
1704   cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
1705   cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
1706
1707   cl_kernel ckFilterCombineHalves = denoising_program(ustring("filter_combine_halves"));
1708
1709   kernel_set_args(ckFilterCombineHalves, 0, mean_mem, variance_mem, a_mem, b_mem, rect, r);
1710   enqueue_kernel(ckFilterCombineHalves, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
1711
1712   return true;
1713 }
1714
1715 bool OpenCLDevice::denoising_divide_shadow(device_ptr a_ptr,
1716                                            device_ptr b_ptr,
1717                                            device_ptr sample_variance_ptr,
1718                                            device_ptr sv_variance_ptr,
1719                                            device_ptr buffer_variance_ptr,
1720                                            DenoisingTask *task)
1721 {
1722   cl_mem a_mem = CL_MEM_PTR(a_ptr);
1723   cl_mem b_mem = CL_MEM_PTR(b_ptr);
1724   cl_mem sample_variance_mem = CL_MEM_PTR(sample_variance_ptr);
1725   cl_mem sv_variance_mem = CL_MEM_PTR(sv_variance_ptr);
1726   cl_mem buffer_variance_mem = CL_MEM_PTR(buffer_variance_ptr);
1727
1728   cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer);
1729
1730   cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow"));
1731
1732   int arg_ofs = kernel_set_args(
1733       ckFilterDivideShadow, 0, task->render_buffer.samples, tile_info_mem);
1734   cl_mem buffers[9];
1735   for (int i = 0; i < 9; i++) {
1736     buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]);
1737     arg_ofs += kernel_set_args(ckFilterDivideShadow, arg_ofs, buffers[i]);
1738   }
1739   kernel_set_args(ckFilterDivideShadow,
1740                   arg_ofs,
1741                   a_mem,
1742                   b_mem,
1743                   sample_variance_mem,
1744                   sv_variance_mem,
1745                   buffer_variance_mem,
1746                   task->rect,
1747                   task->render_buffer.pass_stride,
1748                   task->render_buffer.offset);
1749   enqueue_kernel(ckFilterDivideShadow, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
1750
1751   return true;
1752 }
1753
1754 bool OpenCLDevice::denoising_get_feature(int mean_offset,
1755                                          int variance_offset,
1756                                          device_ptr mean_ptr,
1757                                          device_ptr variance_ptr,
1758                                          float scale,
1759                                          DenoisingTask *task)
1760 {
1761   cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
1762   cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
1763
1764   cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer);
1765
1766   cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature"));
1767
1768   int arg_ofs = kernel_set_args(ckFilterGetFeature, 0, task->render_buffer.samples, tile_info_mem);
1769   cl_mem buffers[9];
1770   for (int i = 0; i < 9; i++) {
1771     buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]);
1772     arg_ofs += kernel_set_args(ckFilterGetFeature, arg_ofs, buffers[i]);
1773   }
1774   kernel_set_args(ckFilterGetFeature,
1775                   arg_ofs,
1776                   mean_offset,
1777                   variance_offset,
1778                   mean_mem,
1779                   variance_mem,
1780                   scale,
1781                   task->rect,
1782                   task->render_buffer.pass_stride,
1783                   task->render_buffer.offset);
1784   enqueue_kernel(ckFilterGetFeature, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
1785
1786   return true;
1787 }
1788
1789 bool OpenCLDevice::denoising_write_feature(int out_offset,
1790                                            device_ptr from_ptr,
1791                                            device_ptr buffer_ptr,
1792                                            DenoisingTask *task)
1793 {
1794   cl_mem from_mem = CL_MEM_PTR(from_ptr);
1795   cl_mem buffer_mem = CL_MEM_PTR(buffer_ptr);
1796
1797   cl_kernel ckFilterWriteFeature = denoising_program(ustring("filter_write_feature"));
1798
1799   kernel_set_args(ckFilterWriteFeature,
1800                   0,
1801                   task->render_buffer.samples,
1802                   task->reconstruction_state.buffer_params,
1803                   task->filter_area,
1804                   from_mem,
1805                   buffer_mem,
1806                   out_offset,
1807                   task->rect);
1808   enqueue_kernel(ckFilterWriteFeature, task->filter_area.z, task->filter_area.w);
1809
1810   return true;
1811 }
1812
1813 bool OpenCLDevice::denoising_detect_outliers(device_ptr image_ptr,
1814                                              device_ptr variance_ptr,
1815                                              device_ptr depth_ptr,
1816                                              device_ptr output_ptr,
1817                                              DenoisingTask *task)
1818 {
1819   cl_mem image_mem = CL_MEM_PTR(image_ptr);
1820   cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
1821   cl_mem depth_mem = CL_MEM_PTR(depth_ptr);
1822   cl_mem output_mem = CL_MEM_PTR(output_ptr);
1823
1824   cl_kernel ckFilterDetectOutliers = denoising_program(ustring("filter_detect_outliers"));
1825
1826   kernel_set_args(ckFilterDetectOutliers,
1827                   0,
1828                   image_mem,
1829                   variance_mem,
1830                   depth_mem,
1831                   output_mem,
1832                   task->rect,
1833                   task->buffer.pass_stride);
1834   enqueue_kernel(ckFilterDetectOutliers, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
1835
1836   return true;
1837 }
1838
1839 void OpenCLDevice::denoise(RenderTile &rtile, DenoisingTask &denoising)
1840 {
1841   denoising.functions.construct_transform = function_bind(
1842       &OpenCLDevice::denoising_construct_transform, this, &denoising);
1843   denoising.functions.accumulate = function_bind(
1844       &OpenCLDevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
1845   denoising.functions.solve = function_bind(&OpenCLDevice::denoising_solve, this, _1, &denoising);
1846   denoising.functions.divide_shadow = function_bind(
1847       &OpenCLDevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
1848   denoising.functions.non_local_means = function_bind(
1849       &OpenCLDevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
1850   denoising.functions.combine_halves = function_bind(
1851       &OpenCLDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
1852   denoising.functions.get_feature = function_bind(
1853       &OpenCLDevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
1854   denoising.functions.write_feature = function_bind(
1855       &OpenCLDevice::denoising_write_feature, this, _1, _2, _3, &denoising);
1856   denoising.functions.detect_outliers = function_bind(
1857       &OpenCLDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
1858
1859   denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
1860   denoising.render_buffer.samples = rtile.sample;
1861   denoising.buffer.gpu_temporary_mem = true;
1862
1863   denoising.run_denoising(rtile);
1864 }
1865
1866 void OpenCLDevice::shader(DeviceTask &task)
1867 {
1868   /* cast arguments to cl types */
1869   cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
1870   cl_mem d_input = CL_MEM_PTR(task.shader_input);
1871   cl_mem d_output = CL_MEM_PTR(task.shader_output);
1872   cl_int d_shader_eval_type = task.shader_eval_type;
1873   cl_int d_shader_filter = task.shader_filter;
1874   cl_int d_shader_x = task.shader_x;
1875   cl_int d_shader_w = task.shader_w;
1876   cl_int d_offset = task.offset;
1877
1878   OpenCLDevice::OpenCLProgram *program = &background_program;
1879   if (task.shader_eval_type == SHADER_EVAL_DISPLACE) {
1880     program = &displace_program;
1881   }
1882   program->wait_for_availability();
1883   cl_kernel kernel = (*program)();
1884
1885   cl_uint start_arg_index = kernel_set_args(kernel, 0, d_data, d_input, d_output);
1886
1887   set_kernel_arg_buffers(kernel, &start_arg_index);
1888
1889   start_arg_index += kernel_set_args(kernel, start_arg_index, d_shader_eval_type);
1890   if (task.shader_eval_type >= SHADER_EVAL_BAKE) {
1891     start_arg_index += kernel_set_args(kernel, start_arg_index, d_shader_filter);
1892   }
1893   start_arg_index += kernel_set_args(kernel, start_arg_index, d_shader_x, d_shader_w, d_offset);
1894
1895   for (int sample = 0; sample < task.num_samples; sample++) {
1896
1897     if (task.get_cancel())
1898       break;
1899
1900     kernel_set_args(kernel, start_arg_index, sample);
1901
1902     enqueue_kernel(kernel, task.shader_w, 1);
1903
1904     clFinish(cqCommandQueue);
1905
1906     task.update_progress(NULL);
1907   }
1908 }
1909
1910 void OpenCLDevice::bake(DeviceTask &task, RenderTile &rtile)
1911 {
1912   scoped_timer timer(&rtile.buffers->render_time);
1913
1914   /* Cast arguments to cl types. */
1915   cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
1916   cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
1917   cl_int d_x = rtile.x;
1918   cl_int d_y = rtile.y;
1919   cl_int d_w = rtile.w;
1920   cl_int d_h = rtile.h;
1921   cl_int d_offset = rtile.offset;
1922   cl_int d_stride = rtile.stride;
1923
1924   bake_program.wait_for_availability();
1925   cl_kernel kernel = bake_program();
1926
1927   cl_uint start_arg_index = kernel_set_args(kernel, 0, d_data, d_buffer);
1928
1929   set_kernel_arg_buffers(kernel, &start_arg_index);
1930
1931   start_arg_index += kernel_set_args(
1932       kernel, start_arg_index, d_x, d_y, d_w, d_h, d_offset, d_stride);
1933
1934   int start_sample = rtile.start_sample;
1935   int end_sample = rtile.start_sample + rtile.num_samples;
1936
1937   for (int sample = start_sample; sample < end_sample; sample++) {
1938     if (task.get_cancel()) {
1939       if (task.need_finish_queue == false)
1940         break;
1941     }
1942
1943     kernel_set_args(kernel, start_arg_index, sample);
1944
1945     enqueue_kernel(kernel, d_w, d_h);
1946     clFinish(cqCommandQueue);
1947
1948     rtile.sample = sample + 1;
1949
1950     task.update_progress(&rtile, rtile.w * rtile.h);
1951   }
1952 }
1953
1954 static bool kernel_build_opencl_2(cl_device_id cdDevice)
1955 {
1956   /* Build with OpenCL 2.0 if available, this improves performance
1957    * with AMD OpenCL drivers on Windows and Linux (legacy drivers).
1958    * Note that OpenCL selects the highest 1.x version by default,
1959    * only for 2.0 do we need the explicit compiler flag. */
1960   int version_major, version_minor;
1961   if (OpenCLInfo::get_device_version(cdDevice, &version_major, &version_minor)) {
1962     if (version_major >= 2) {
1963       /* This appears to trigger a driver bug in Radeon RX cards with certain
1964        * driver version, so don't use OpenCL 2.0 for those. */
1965       string device_name = OpenCLInfo::get_readable_device_name(cdDevice);
1966       if (string_startswith(device_name, "Radeon RX 4") ||
1967           string_startswith(device_name, "Radeon (TM) RX 4") ||
1968           string_startswith(device_name, "Radeon RX 5") ||
1969           string_startswith(device_name, "Radeon (TM) RX 5")) {
1970         char version[256] = "";
1971         int driver_major, driver_minor;
1972         clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL);
1973         if (sscanf(version, "OpenCL 2.0 AMD-APP (%d.%d)", &driver_major, &driver_minor) == 2) {
1974           return !(driver_major == 3075 && driver_minor <= 12);
1975         }
1976       }
1977
1978       return true;
1979     }
1980   }
1981
1982   return false;
1983 }
1984
1985 string OpenCLDevice::kernel_build_options(const string *debug_src)
1986 {
1987   string build_options = "-cl-no-signed-zeros -cl-mad-enable ";
1988
1989   if (kernel_build_opencl_2(cdDevice)) {
1990     build_options += "-cl-std=CL2.0 ";
1991   }
1992
1993   if (platform_name == "NVIDIA CUDA") {
1994     build_options +=
1995         "-D__KERNEL_OPENCL_NVIDIA__ "
1996         "-cl-nv-maxrregcount=32 "
1997         "-cl-nv-verbose ";
1998
1999     uint compute_capability_major, compute_capability_minor;
2000     clGetDeviceInfo(cdDevice,
2001                     CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,
2002                     sizeof(cl_uint),
2003                     &compute_capability_major,
2004                     NULL);
2005     clGetDeviceInfo(cdDevice,
2006                     CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,
2007                     sizeof(cl_uint),
2008                     &compute_capability_minor,
2009                     NULL);
2010
2011     build_options += string_printf("-D__COMPUTE_CAPABILITY__=%u ",
2012                                    compute_capability_major * 100 + compute_capability_minor * 10);
2013   }
2014
2015   else if (platform_name == "Apple")
2016     build_options += "-D__KERNEL_OPENCL_APPLE__ ";
2017
2018   else if (platform_name == "AMD Accelerated Parallel Processing")
2019     build_options += "-D__KERNEL_OPENCL_AMD__ ";
2020
2021   else if (platform_name == "Intel(R) OpenCL") {
2022     build_options += "-D__KERNEL_OPENCL_INTEL_CPU__ ";
2023
2024     /* Options for gdb source level kernel debugging.
2025      * this segfaults on linux currently.
2026      */
2027     if (OpenCLInfo::use_debug() && debug_src)
2028       build_options += "-g -s \"" + *debug_src + "\" ";
2029   }
2030
2031   if (info.has_half_images) {
2032     build_options += "-D__KERNEL_CL_KHR_FP16__ ";
2033   }
2034
2035   if (OpenCLInfo::use_debug()) {
2036     build_options += "-D__KERNEL_OPENCL_DEBUG__ ";
2037   }
2038
2039 #  ifdef WITH_CYCLES_DEBUG
2040   build_options += "-D__KERNEL_DEBUG__ ";
2041 #  endif
2042
2043 #  ifdef WITH_NANOVDB
2044   if (info.has_nanovdb) {
2045     build_options += "-DWITH_NANOVDB ";
2046   }
2047 #  endif
2048
2049   return build_options;
2050 }
2051
2052 /* TODO(sergey): In the future we can use variadic templates, once
2053  * C++0x is allowed. Should allow to clean this up a bit.
2054  */
2055 int OpenCLDevice::kernel_set_args(cl_kernel kernel,
2056                                   int start_argument_index,
2057                                   const ArgumentWrapper &arg1,
2058                                   const ArgumentWrapper &arg2,
2059                                   const ArgumentWrapper &arg3,
2060                                   const ArgumentWrapper &arg4,
2061                                   const ArgumentWrapper &arg5,
2062                                   const ArgumentWrapper &arg6,
2063                                   const ArgumentWrapper &arg7,
2064                                   const ArgumentWrapper &arg8,
2065                                   const ArgumentWrapper &arg9,
2066                                   const ArgumentWrapper &arg10,
2067                                   const ArgumentWrapper &arg11,
2068                                   const ArgumentWrapper &arg12,
2069                                   const ArgumentWrapper &arg13,
2070                                   const ArgumentWrapper &arg14,
2071                                   const ArgumentWrapper &arg15,
2072                                   const ArgumentWrapper &arg16,
2073                                   const ArgumentWrapper &arg17,
2074                                   const ArgumentWrapper &arg18,
2075                                   const ArgumentWrapper &arg19,
2076                                   const ArgumentWrapper &arg20,
2077                                   const ArgumentWrapper &arg21,
2078                                   const ArgumentWrapper &arg22,
2079                                   const ArgumentWrapper &arg23,
2080                                   const ArgumentWrapper &arg24,
2081                                   const ArgumentWrapper &arg25,
2082                                   const ArgumentWrapper &arg26,
2083                                   const ArgumentWrapper &arg27,
2084                                   const ArgumentWrapper &arg28,
2085                                   const ArgumentWrapper &arg29,
2086                                   const ArgumentWrapper &arg30,
2087                                   const ArgumentWrapper &arg31,
2088                                   const ArgumentWrapper &arg32,
2089                                   const ArgumentWrapper &arg33)
2090 {
2091   int current_arg_index = 0;
2092 #  define FAKE_VARARG_HANDLE_ARG(arg) \
2093     do { \
2094       if (arg.pointer != NULL) { \
2095         opencl_assert(clSetKernelArg( \
2096             kernel, start_argument_index + current_arg_index, arg.size, arg.pointer)); \
2097         ++current_arg_index; \
2098       } \
2099       else { \
2100         return current_arg_index; \
2101       } \
2102     } while (false)
2103   FAKE_VARARG_HANDLE_ARG(arg1);
2104   FAKE_VARARG_HANDLE_ARG(arg2);
2105   FAKE_VARARG_HANDLE_ARG(arg3);
2106   FAKE_VARARG_HANDLE_ARG(arg4);
2107   FAKE_VARARG_HANDLE_ARG(arg5);
2108   FAKE_VARARG_HANDLE_ARG(arg6);
2109   FAKE_VARARG_HANDLE_ARG(arg7);
2110   FAKE_VARARG_HANDLE_ARG(arg8);
2111   FAKE_VARARG_HANDLE_ARG(arg9);
2112   FAKE_VARARG_HANDLE_ARG(arg10);
2113   FAKE_VARARG_HANDLE_ARG(arg11);
2114   FAKE_VARARG_HANDLE_ARG(arg12);
2115   FAKE_VARARG_HANDLE_ARG(arg13);
2116   FAKE_VARARG_HANDLE_ARG(arg14);
2117   FAKE_VARARG_HANDLE_ARG(arg15);
2118   FAKE_VARARG_HANDLE_ARG(arg16);
2119   FAKE_VARARG_HANDLE_ARG(arg17);
2120   FAKE_VARARG_HANDLE_ARG(arg18);
2121   FAKE_VARARG_HANDLE_ARG(arg19);
2122   FAKE_VARARG_HANDLE_ARG(arg20);
2123   FAKE_VARARG_HANDLE_ARG(arg21);
2124   FAKE_VARARG_HANDLE_ARG(arg22);
2125   FAKE_VARARG_HANDLE_ARG(arg23);
2126   FAKE_VARARG_HANDLE_ARG(arg24);
2127   FAKE_VARARG_HANDLE_ARG(arg25);
2128   FAKE_VARARG_HANDLE_ARG(arg26);
2129   FAKE_VARARG_HANDLE_ARG(arg27);
2130   FAKE_VARARG_HANDLE_ARG(arg28);
2131   FAKE_VARARG_HANDLE_ARG(arg29);
2132   FAKE_VARARG_HANDLE_ARG(arg30);
2133   FAKE_VARARG_HANDLE_ARG(arg31);
2134   FAKE_VARARG_HANDLE_ARG(arg32);
2135   FAKE_VARARG_HANDLE_ARG(arg33);
2136 #  undef FAKE_VARARG_HANDLE_ARG
2137   return current_arg_index;
2138 }
2139
2140 void OpenCLDevice::release_kernel_safe(cl_kernel kernel)
2141 {
2142   if (kernel) {
2143     clReleaseKernel(kernel);
2144   }
2145 }
2146
2147 void OpenCLDevice::release_mem_object_safe(cl_mem mem)
2148 {
2149   if (mem != NULL) {
2150     clReleaseMemObject(mem);
2151   }
2152 }
2153
2154 void OpenCLDevice::release_program_safe(cl_program program)
2155 {
2156   if (program) {
2157     clReleaseProgram(program);
2158   }
2159 }
2160
2161 /* ** Those guys are for working around some compiler-specific bugs ** */
2162
2163 cl_program OpenCLDevice::load_cached_kernel(ustring key, thread_scoped_lock &cache_locker)
2164 {
2165   return OpenCLCache::get_program(cpPlatform, cdDevice, key, cache_locker);
2166 }
2167
2168 void OpenCLDevice::store_cached_kernel(cl_program program,
2169                                        ustring key,
2170                                        thread_scoped_lock &cache_locker)
2171 {
2172   OpenCLCache::store_program(cpPlatform, cdDevice, program, key, cache_locker);
2173 }
2174
2175 Device *opencl_create_split_device(DeviceInfo &info,
2176                                    Stats &stats,
2177                                    Profiler &profiler,
2178                                    bool background)
2179 {
2180   return new OpenCLDevice(info, stats, profiler, background);
2181 }
2182
2183 CCL_NAMESPACE_END
2184
2185 #endif