Cycles: Support multithreaded compilation of kernels
[blender.git] / intern / cycles / device / opencl / opencl_base.cpp
1 /*
2  * Copyright 2011-2013 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #ifdef WITH_OPENCL
18
19 #include "device/opencl/opencl.h"
20
21 #include "kernel/kernel_types.h"
22
23 #include "util/util_algorithm.h"
24 #include "util/util_debug.h"
25 #include "util/util_foreach.h"
26 #include "util/util_logging.h"
27 #include "util/util_md5.h"
28 #include "util/util_path.h"
29 #include "util/util_time.h"
30
31 CCL_NAMESPACE_BEGIN
32
33 struct texture_slot_t {
34         texture_slot_t(const string& name, int slot)
35                 : name(name),
36                   slot(slot) {
37         }
38         string name;
39         int slot;
40 };
41
42 bool OpenCLDeviceBase::opencl_error(cl_int err)
43 {
44         if(err != CL_SUCCESS) {
45                 string message = string_printf("OpenCL error (%d): %s", err, clewErrorString(err));
46                 if(error_msg == "")
47                         error_msg = message;
48                 fprintf(stderr, "%s\n", message.c_str());
49                 return true;
50         }
51
52         return false;
53 }
54
55 void OpenCLDeviceBase::opencl_error(const string& message)
56 {
57         if(error_msg == "")
58                 error_msg = message;
59         fprintf(stderr, "%s\n", message.c_str());
60 }
61
62 void OpenCLDeviceBase::opencl_assert_err(cl_int err, const char* where)
63 {
64         if(err != CL_SUCCESS) {
65                 string message = string_printf("OpenCL error (%d): %s in %s", err, clewErrorString(err), where);
66                 if(error_msg == "")
67                         error_msg = message;
68                 fprintf(stderr, "%s\n", message.c_str());
69 #ifndef NDEBUG
70                 abort();
71 #endif
72         }
73 }
74
75 OpenCLDeviceBase::OpenCLDeviceBase(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background_)
76 : Device(info, stats, profiler, background_),
77   memory_manager(this),
78   texture_info(this, "__texture_info", MEM_TEXTURE)
79 {
80         cpPlatform = NULL;
81         cdDevice = NULL;
82         cxContext = NULL;
83         cqCommandQueue = NULL;
84         null_mem = 0;
85         device_initialized = false;
86         textures_need_update = true;
87
88         vector<OpenCLPlatformDevice> usable_devices;
89         OpenCLInfo::get_usable_devices(&usable_devices);
90         if(usable_devices.size() == 0) {
91                 opencl_error("OpenCL: no devices found.");
92                 return;
93         }
94         assert(info.num < usable_devices.size());
95         OpenCLPlatformDevice& platform_device = usable_devices[info.num];
96         device_num = info.num;
97         cpPlatform = platform_device.platform_id;
98         cdDevice = platform_device.device_id;
99         platform_name = platform_device.platform_name;
100         device_name = platform_device.device_name;
101         VLOG(2) << "Creating new Cycles device for OpenCL platform "
102                 << platform_name << ", device "
103                 << device_name << ".";
104
105         {
106                 /* try to use cached context */
107                 thread_scoped_lock cache_locker;
108                 cxContext = OpenCLCache::get_context(cpPlatform, cdDevice, cache_locker);
109
110                 if(cxContext == NULL) {
111                         /* create context properties array to specify platform */
112                         const cl_context_properties context_props[] = {
113                                 CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform,
114                                 0, 0
115                         };
116
117                         /* create context */
118                         cxContext = clCreateContext(context_props, 1, &cdDevice,
119                                 context_notify_callback, cdDevice, &ciErr);
120
121                         if(opencl_error(ciErr)) {
122                                 opencl_error("OpenCL: clCreateContext failed");
123                                 return;
124                         }
125
126                         /* cache it */
127                         OpenCLCache::store_context(cpPlatform, cdDevice, cxContext, cache_locker);
128                 }
129         }
130
131         cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr);
132         if(opencl_error(ciErr)) {
133                 opencl_error("OpenCL: Error creating command queue");
134                 return;
135         }
136
137         null_mem = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_ONLY, 1, NULL, &ciErr);
138         if(opencl_error(ciErr)) {
139                 opencl_error("OpenCL: Error creating memory buffer for NULL");
140                 return;
141         }
142
143         /* Allocate this right away so that texture_info is placed at offset 0 in the device memory buffers */
144         texture_info.resize(1);
145         memory_manager.alloc("texture_info", texture_info);
146
147         device_initialized = true;
148 }
149
150 OpenCLDeviceBase::~OpenCLDeviceBase()
151 {
152         task_pool.stop();
153
154         memory_manager.free();
155
156         if(null_mem)
157                 clReleaseMemObject(CL_MEM_PTR(null_mem));
158
159         ConstMemMap::iterator mt;
160         for(mt = const_mem_map.begin(); mt != const_mem_map.end(); mt++) {
161                 delete mt->second;
162         }
163
164         base_program.release();
165         if(cqCommandQueue)
166                 clReleaseCommandQueue(cqCommandQueue);
167         if(cxContext)
168                 clReleaseContext(cxContext);
169 }
170
171 void CL_CALLBACK OpenCLDeviceBase::context_notify_callback(const char *err_info,
172         const void * /*private_info*/, size_t /*cb*/, void *user_data)
173 {
174         string device_name = OpenCLInfo::get_device_name((cl_device_id)user_data);
175         fprintf(stderr, "OpenCL error (%s): %s\n", device_name.c_str(), err_info);
176 }
177
178 bool OpenCLDeviceBase::opencl_version_check()
179 {
180         string error;
181         if(!OpenCLInfo::platform_version_check(cpPlatform, &error)) {
182                 opencl_error(error);
183                 return false;
184         }
185         if(!OpenCLInfo::device_version_check(cdDevice, &error)) {
186                 opencl_error(error);
187                 return false;
188         }
189         return true;
190 }
191
192 string OpenCLDeviceBase::device_md5_hash(string kernel_custom_build_options)
193 {
194         MD5Hash md5;
195         char version[256], driver[256], name[256], vendor[256];
196
197         clGetPlatformInfo(cpPlatform, CL_PLATFORM_VENDOR, sizeof(vendor), &vendor, NULL);
198         clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL);
199         clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(name), &name, NULL);
200         clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(driver), &driver, NULL);
201
202         md5.append((uint8_t*)vendor, strlen(vendor));
203         md5.append((uint8_t*)version, strlen(version));
204         md5.append((uint8_t*)name, strlen(name));
205         md5.append((uint8_t*)driver, strlen(driver));
206
207         string options = kernel_build_options();
208         options += kernel_custom_build_options;
209         md5.append((uint8_t*)options.c_str(), options.size());
210
211         return md5.get_hex();
212 }
213
214 bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_features)
215 {
216         VLOG(2) << "Loading kernels for platform " << platform_name
217                 << ", device " << device_name << ".";
218         /* Verify if device was initialized. */
219         if(!device_initialized) {
220                 fprintf(stderr, "OpenCL: failed to initialize device.\n");
221                 return false;
222         }
223
224         /* Verify we have right opencl version. */
225         if(!opencl_version_check())
226                 return false;
227
228         base_program = OpenCLProgram(this, "base", "kernel.cl", build_options_for_base_program(requested_features));
229         base_program.add_kernel(ustring("convert_to_byte"));
230         base_program.add_kernel(ustring("convert_to_half_float"));
231         base_program.add_kernel(ustring("displace"));
232         base_program.add_kernel(ustring("background"));
233         base_program.add_kernel(ustring("bake"));
234         base_program.add_kernel(ustring("zero_buffer"));
235
236         denoising_program = OpenCLProgram(this, "denoising", "filter.cl", "");
237         denoising_program.add_kernel(ustring("filter_divide_shadow"));
238         denoising_program.add_kernel(ustring("filter_get_feature"));
239         denoising_program.add_kernel(ustring("filter_detect_outliers"));
240         denoising_program.add_kernel(ustring("filter_combine_halves"));
241         denoising_program.add_kernel(ustring("filter_construct_transform"));
242         denoising_program.add_kernel(ustring("filter_nlm_calc_difference"));
243         denoising_program.add_kernel(ustring("filter_nlm_blur"));
244         denoising_program.add_kernel(ustring("filter_nlm_calc_weight"));
245         denoising_program.add_kernel(ustring("filter_nlm_update_output"));
246         denoising_program.add_kernel(ustring("filter_nlm_normalize"));
247         denoising_program.add_kernel(ustring("filter_nlm_construct_gramian"));
248         denoising_program.add_kernel(ustring("filter_finalize"));
249
250         vector<OpenCLProgram*> programs;
251         programs.push_back(&base_program);
252         programs.push_back(&denoising_program);
253         /* Call actual class to fill the vector with its programs. */
254         if(!add_kernel_programs(requested_features, programs)) {
255                 return false;
256         }
257
258         /* Parallel compilation of Cycles kernels, this launches multiple
259          * processes to workaround OpenCL frameworks serializing the calls
260          * internally within a single process. */
261         TaskPool task_pool;
262         foreach(OpenCLProgram *program, programs) {
263                 task_pool.push(function_bind(&OpenCLProgram::load, program));
264         }
265         task_pool.wait_work();
266
267         foreach(OpenCLProgram *program, programs) {
268                 VLOG(2) << program->get_log();
269                 if(!program->is_loaded()) {
270                         program->report_error();
271                         return false;
272                 }
273         }
274
275         return true;
276 }
277
278 void OpenCLDeviceBase::mem_alloc(device_memory& mem)
279 {
280         if(mem.name) {
281                 VLOG(1) << "Buffer allocate: " << mem.name << ", "
282                             << string_human_readable_number(mem.memory_size()) << " bytes. ("
283                             << string_human_readable_size(mem.memory_size()) << ")";
284         }
285
286         size_t size = mem.memory_size();
287
288         /* check there is enough memory available for the allocation */
289         cl_ulong max_alloc_size = 0;
290         clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_alloc_size, NULL);
291
292         if(DebugFlags().opencl.mem_limit) {
293                 max_alloc_size = min(max_alloc_size,
294                                      cl_ulong(DebugFlags().opencl.mem_limit - stats.mem_used));
295         }
296
297         if(size > max_alloc_size) {
298                 string error = "Scene too complex to fit in available memory.";
299                 if(mem.name != NULL) {
300                         error += string_printf(" (allocating buffer %s failed.)", mem.name);
301                 }
302                 set_error(error);
303
304                 return;
305         }
306
307         cl_mem_flags mem_flag;
308         void *mem_ptr = NULL;
309
310         if(mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE)
311                 mem_flag = CL_MEM_READ_ONLY;
312         else
313                 mem_flag = CL_MEM_READ_WRITE;
314
315         /* Zero-size allocation might be invoked by render, but not really
316          * supported by OpenCL. Using NULL as device pointer also doesn't really
317          * work for some reason, so for the time being we'll use special case
318          * will null_mem buffer.
319          */
320         if(size != 0) {
321                 mem.device_pointer = (device_ptr)clCreateBuffer(cxContext,
322                                                                 mem_flag,
323                                                                 size,
324                                                                 mem_ptr,
325                                                                 &ciErr);
326                 opencl_assert_err(ciErr, "clCreateBuffer");
327         }
328         else {
329                 mem.device_pointer = null_mem;
330         }
331
332         stats.mem_alloc(size);
333         mem.device_size = size;
334 }
335
336 void OpenCLDeviceBase::mem_copy_to(device_memory& mem)
337 {
338         if(mem.type == MEM_TEXTURE) {
339                 tex_free(mem);
340                 tex_alloc(mem);
341         }
342         else {
343                 if(!mem.device_pointer) {
344                         mem_alloc(mem);
345                 }
346
347                 /* this is blocking */
348                 size_t size = mem.memory_size();
349                 if(size != 0) {
350                         opencl_assert(clEnqueueWriteBuffer(cqCommandQueue,
351                                                            CL_MEM_PTR(mem.device_pointer),
352                                                            CL_TRUE,
353                                                            0,
354                                                            size,
355                                                            mem.host_pointer,
356                                                            0,
357                                                            NULL, NULL));
358                 }
359         }
360 }
361
362 void OpenCLDeviceBase::mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
363 {
364         size_t offset = elem*y*w;
365         size_t size = elem*w*h;
366         assert(size != 0);
367         opencl_assert(clEnqueueReadBuffer(cqCommandQueue,
368                                           CL_MEM_PTR(mem.device_pointer),
369                                           CL_TRUE,
370                                           offset,
371                                           size,
372                                           (uchar*)mem.host_pointer + offset,
373                                           0,
374                                           NULL, NULL));
375 }
376
377 void OpenCLDeviceBase::mem_zero_kernel(device_ptr mem, size_t size)
378 {
379         cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer"));
380
381         size_t global_size[] = {1024, 1024};
382         size_t num_threads = global_size[0] * global_size[1];
383
384         cl_mem d_buffer = CL_MEM_PTR(mem);
385         cl_ulong d_offset = 0;
386         cl_ulong d_size = 0;
387
388         while(d_offset < size) {
389                 d_size = std::min<cl_ulong>(num_threads*sizeof(float4), size - d_offset);
390
391                 kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset);
392
393                 ciErr = clEnqueueNDRangeKernel(cqCommandQueue,
394                                                ckZeroBuffer,
395                                                2,
396                                                NULL,
397                                                global_size,
398                                                NULL,
399                                                0,
400                                                NULL,
401                                                NULL);
402                 opencl_assert_err(ciErr, "clEnqueueNDRangeKernel");
403
404                 d_offset += d_size;
405         }
406 }
407
408 void OpenCLDeviceBase::mem_zero(device_memory& mem)
409 {
410         if(!mem.device_pointer) {
411                 mem_alloc(mem);
412         }
413
414         if(mem.device_pointer) {
415                 if(base_program.is_loaded()) {
416                         mem_zero_kernel(mem.device_pointer, mem.memory_size());
417                 }
418
419                 if(mem.host_pointer) {
420                         memset(mem.host_pointer, 0, mem.memory_size());
421                 }
422
423                 if(!base_program.is_loaded()) {
424                         void* zero = mem.host_pointer;
425
426                         if(!mem.host_pointer) {
427                                 zero = util_aligned_malloc(mem.memory_size(), 16);
428                                 memset(zero, 0, mem.memory_size());
429                         }
430
431                         opencl_assert(clEnqueueWriteBuffer(cqCommandQueue,
432                                                            CL_MEM_PTR(mem.device_pointer),
433                                                            CL_TRUE,
434                                                            0,
435                                                            mem.memory_size(),
436                                                            zero,
437                                                            0,
438                                                            NULL, NULL));
439
440                         if(!mem.host_pointer) {
441                                 util_aligned_free(zero);
442                         }
443                 }
444         }
445 }
446
447 void OpenCLDeviceBase::mem_free(device_memory& mem)
448 {
449         if(mem.type == MEM_TEXTURE) {
450                 tex_free(mem);
451         }
452         else {
453                 if(mem.device_pointer) {
454                         if(mem.device_pointer != null_mem) {
455                                 opencl_assert(clReleaseMemObject(CL_MEM_PTR(mem.device_pointer)));
456                         }
457                         mem.device_pointer = 0;
458
459                         stats.mem_free(mem.device_size);
460                         mem.device_size = 0;
461                 }
462         }
463 }
464
465 int OpenCLDeviceBase::mem_sub_ptr_alignment()
466 {
467         return OpenCLInfo::mem_sub_ptr_alignment(cdDevice);
468 }
469
470 device_ptr OpenCLDeviceBase::mem_alloc_sub_ptr(device_memory& mem, int offset, int size)
471 {
472         cl_mem_flags mem_flag;
473         if(mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE)
474                 mem_flag = CL_MEM_READ_ONLY;
475         else
476                 mem_flag = CL_MEM_READ_WRITE;
477
478         cl_buffer_region info;
479         info.origin = mem.memory_elements_size(offset);
480         info.size = mem.memory_elements_size(size);
481
482         device_ptr sub_buf = (device_ptr) clCreateSubBuffer(CL_MEM_PTR(mem.device_pointer),
483                                                             mem_flag,
484                                                             CL_BUFFER_CREATE_TYPE_REGION,
485                                                             &info,
486                                                             &ciErr);
487         opencl_assert_err(ciErr, "clCreateSubBuffer");
488         return sub_buf;
489 }
490
491 void OpenCLDeviceBase::mem_free_sub_ptr(device_ptr device_pointer)
492 {
493         if(device_pointer && device_pointer != null_mem) {
494                 opencl_assert(clReleaseMemObject(CL_MEM_PTR(device_pointer)));
495         }
496 }
497
498 void OpenCLDeviceBase::const_copy_to(const char *name, void *host, size_t size)
499 {
500         ConstMemMap::iterator i = const_mem_map.find(name);
501         device_vector<uchar> *data;
502
503         if(i == const_mem_map.end()) {
504                 data = new device_vector<uchar>(this, name, MEM_READ_ONLY);
505                 data->alloc(size);
506                 const_mem_map.insert(ConstMemMap::value_type(name, data));
507         }
508         else {
509                 data = i->second;
510         }
511
512         memcpy(data->data(), host, size);
513         data->copy_to_device();
514 }
515
516 void OpenCLDeviceBase::tex_alloc(device_memory& mem)
517 {
518         VLOG(1) << "Texture allocate: " << mem.name << ", "
519                 << string_human_readable_number(mem.memory_size()) << " bytes. ("
520                 << string_human_readable_size(mem.memory_size()) << ")";
521
522         memory_manager.alloc(mem.name, mem);
523         /* Set the pointer to non-null to keep code that inspects its value from thinking its unallocated. */
524         mem.device_pointer = 1;
525         textures[mem.name] = &mem;
526         textures_need_update = true;
527 }
528
529 void OpenCLDeviceBase::tex_free(device_memory& mem)
530 {
531         if(mem.device_pointer) {
532                 mem.device_pointer = 0;
533
534                 if(memory_manager.free(mem)) {
535                         textures_need_update = true;
536                 }
537
538                 foreach(TexturesMap::value_type& value, textures) {
539                         if(value.second == &mem) {
540                                 textures.erase(value.first);
541                                 break;
542                         }
543                 }
544         }
545 }
546
547 size_t OpenCLDeviceBase::global_size_round_up(int group_size, int global_size)
548 {
549         int r = global_size % group_size;
550         return global_size + ((r == 0)? 0: group_size - r);
551 }
552
553 void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, bool x_workgroups, size_t max_workgroup_size)
554 {
555         size_t workgroup_size, max_work_items[3];
556
557         clGetKernelWorkGroupInfo(kernel, cdDevice,
558                 CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL);
559         clGetDeviceInfo(cdDevice,
560                 CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, max_work_items, NULL);
561
562         if(max_workgroup_size > 0 && workgroup_size > max_workgroup_size) {
563                 workgroup_size = max_workgroup_size;
564         }
565
566         /* Try to divide evenly over 2 dimensions. */
567         size_t local_size[2];
568         if(x_workgroups) {
569                 local_size[0] = workgroup_size;
570                 local_size[1] = 1;
571         }
572         else {
573                 size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1);
574                 local_size[0] = local_size[1] = sqrt_workgroup_size;
575         }
576
577         /* Some implementations have max size 1 on 2nd dimension. */
578         if(local_size[1] > max_work_items[1]) {
579                 local_size[0] = workgroup_size/max_work_items[1];
580                 local_size[1] = max_work_items[1];
581         }
582
583         size_t global_size[2] = {global_size_round_up(local_size[0], w),
584                                  global_size_round_up(local_size[1], h)};
585
586         /* Vertical size of 1 is coming from bake/shade kernels where we should
587          * not round anything up because otherwise we'll either be doing too
588          * much work per pixel (if we don't check global ID on Y axis) or will
589          * be checking for global ID to always have Y of 0.
590          */
591         if(h == 1) {
592                 global_size[h] = 1;
593         }
594
595         /* run kernel */
596         opencl_assert(clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL));
597         opencl_assert(clFlush(cqCommandQueue));
598 }
599
600 void OpenCLDeviceBase::set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name)
601 {
602         cl_mem ptr;
603
604         MemMap::iterator i = mem_map.find(name);
605         if(i != mem_map.end()) {
606                 ptr = CL_MEM_PTR(i->second);
607         }
608         else {
609                 /* work around NULL not working, even though the spec says otherwise */
610                 ptr = CL_MEM_PTR(null_mem);
611         }
612
613         opencl_assert(clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void*)&ptr));
614 }
615
616 void OpenCLDeviceBase::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg)
617 {
618         flush_texture_buffers();
619
620         memory_manager.set_kernel_arg_buffers(kernel, narg);
621 }
622
623 void OpenCLDeviceBase::flush_texture_buffers()
624 {
625         if(!textures_need_update) {
626                 return;
627         }
628         textures_need_update = false;
629
630         /* Setup slots for textures. */
631         int num_slots = 0;
632
633         vector<texture_slot_t> texture_slots;
634
635 #define KERNEL_TEX(type, name) \
636         if(textures.find(#name) != textures.end()) { \
637                 texture_slots.push_back(texture_slot_t(#name, num_slots)); \
638         } \
639         num_slots++;
640 #include "kernel/kernel_textures.h"
641
642         int num_data_slots = num_slots;
643
644         foreach(TexturesMap::value_type& tex, textures) {
645                 string name = tex.first;
646
647                 if(string_startswith(name, "__tex_image")) {
648                         int pos = name.rfind("_");
649                         int id = atoi(name.data() + pos + 1);
650                         texture_slots.push_back(texture_slot_t(name,
651                                                                    num_data_slots + id));
652                         num_slots = max(num_slots, num_data_slots + id + 1);
653                 }
654         }
655
656         /* Realloc texture descriptors buffer. */
657         memory_manager.free(texture_info);
658         texture_info.resize(num_slots);
659         memory_manager.alloc("texture_info", texture_info);
660
661         /* Fill in descriptors */
662         foreach(texture_slot_t& slot, texture_slots) {
663                 TextureInfo& info = texture_info[slot.slot];
664
665                 MemoryManager::BufferDescriptor desc = memory_manager.get_descriptor(slot.name);
666                 info.data = desc.offset;
667                 info.cl_buffer = desc.device_buffer;
668
669                 if(string_startswith(slot.name, "__tex_image")) {
670                         device_memory *mem = textures[slot.name];
671
672                         info.width = mem->data_width;
673                         info.height = mem->data_height;
674                         info.depth = mem->data_depth;
675
676                         info.interpolation = mem->interpolation;
677                         info.extension = mem->extension;
678                 }
679         }
680
681         /* Force write of descriptors. */
682         memory_manager.free(texture_info);
683         memory_manager.alloc("texture_info", texture_info);
684 }
685
686 void OpenCLDeviceBase::film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half)
687 {
688         /* cast arguments to cl types */
689         cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
690         cl_mem d_rgba = (rgba_byte)? CL_MEM_PTR(rgba_byte): CL_MEM_PTR(rgba_half);
691         cl_mem d_buffer = CL_MEM_PTR(buffer);
692         cl_int d_x = task.x;
693         cl_int d_y = task.y;
694         cl_int d_w = task.w;
695         cl_int d_h = task.h;
696         cl_float d_sample_scale = 1.0f/(task.sample + 1);
697         cl_int d_offset = task.offset;
698         cl_int d_stride = task.stride;
699
700
701         cl_kernel ckFilmConvertKernel = (rgba_byte)? base_program(ustring("convert_to_byte")): base_program(ustring("convert_to_half_float"));
702
703         cl_uint start_arg_index =
704                 kernel_set_args(ckFilmConvertKernel,
705                                 0,
706                                 d_data,
707                                 d_rgba,
708                                 d_buffer);
709
710         set_kernel_arg_buffers(ckFilmConvertKernel, &start_arg_index);
711
712         start_arg_index += kernel_set_args(ckFilmConvertKernel,
713                                            start_arg_index,
714                                            d_sample_scale,
715                                            d_x,
716                                            d_y,
717                                            d_w,
718                                            d_h,
719                                            d_offset,
720                                            d_stride);
721
722         enqueue_kernel(ckFilmConvertKernel, d_w, d_h);
723 }
724
725 bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr,
726                                                  device_ptr guide_ptr,
727                                                  device_ptr variance_ptr,
728                                                  device_ptr out_ptr,
729                                                  DenoisingTask *task)
730 {
731         int stride = task->buffer.stride;
732         int w = task->buffer.width;
733         int h = task->buffer.h;
734         int r = task->nlm_state.r;
735         int f = task->nlm_state.f;
736         float a = task->nlm_state.a;
737         float k_2 = task->nlm_state.k_2;
738
739         int pass_stride = task->buffer.pass_stride;
740         int num_shifts = (2*r+1)*(2*r+1);
741         int channel_offset = task->nlm_state.is_color? task->buffer.pass_stride : 0;
742
743         device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride*num_shifts);
744         device_sub_ptr blurDifference(task->buffer.temporary_mem, pass_stride*num_shifts, pass_stride*num_shifts);
745         device_sub_ptr weightAccum(task->buffer.temporary_mem, 2*pass_stride*num_shifts, pass_stride);
746         cl_mem weightAccum_mem = CL_MEM_PTR(*weightAccum);
747         cl_mem difference_mem = CL_MEM_PTR(*difference);
748         cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference);
749
750         cl_mem image_mem = CL_MEM_PTR(image_ptr);
751         cl_mem guide_mem = CL_MEM_PTR(guide_ptr);
752         cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
753         cl_mem out_mem = CL_MEM_PTR(out_ptr);
754         cl_mem scale_mem = NULL;
755
756         mem_zero_kernel(*weightAccum, sizeof(float)*pass_stride);
757         mem_zero_kernel(out_ptr, sizeof(float)*pass_stride);
758
759         cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference"));
760         cl_kernel ckNLMBlur           = denoising_program(ustring("filter_nlm_blur"));
761         cl_kernel ckNLMCalcWeight     = denoising_program(ustring("filter_nlm_calc_weight"));
762         cl_kernel ckNLMUpdateOutput   = denoising_program(ustring("filter_nlm_update_output"));
763         cl_kernel ckNLMNormalize      = denoising_program(ustring("filter_nlm_normalize"));
764
765         kernel_set_args(ckNLMCalcDifference, 0,
766                         guide_mem,
767                         variance_mem,
768                         scale_mem,
769                         difference_mem,
770                         w, h, stride,
771                         pass_stride,
772                         r, channel_offset,
773                         0, a, k_2);
774         kernel_set_args(ckNLMBlur, 0,
775                         difference_mem,
776                         blurDifference_mem,
777                         w, h, stride,
778                         pass_stride,
779                         r, f);
780         kernel_set_args(ckNLMCalcWeight, 0,
781                         blurDifference_mem,
782                         difference_mem,
783                         w, h, stride,
784                         pass_stride,
785                         r, f);
786         kernel_set_args(ckNLMUpdateOutput, 0,
787                         blurDifference_mem,
788                         image_mem,
789                         out_mem,
790                         weightAccum_mem,
791                         w, h, stride,
792                         pass_stride,
793                         channel_offset,
794                         r, f);
795
796         enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true);
797         enqueue_kernel(ckNLMBlur,           w*h, num_shifts, true);
798         enqueue_kernel(ckNLMCalcWeight,     w*h, num_shifts, true);
799         enqueue_kernel(ckNLMBlur,           w*h, num_shifts, true);
800         enqueue_kernel(ckNLMUpdateOutput,   w*h, num_shifts, true);
801
802         kernel_set_args(ckNLMNormalize, 0,
803                         out_mem, weightAccum_mem, w, h, stride);
804         enqueue_kernel(ckNLMNormalize, w, h);
805
806         return true;
807 }
808
809 bool OpenCLDeviceBase::denoising_construct_transform(DenoisingTask *task)
810 {
811         cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
812         cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
813         cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
814         cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer);
815
816         char use_time = task->buffer.use_time? 1 : 0;
817
818         cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform"));
819
820         int arg_ofs = kernel_set_args(ckFilterConstructTransform, 0,
821                                       buffer_mem,
822                                       tile_info_mem);
823         cl_mem buffers[9];
824         for(int i = 0; i < 9; i++) {
825                 buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]);
826                 arg_ofs += kernel_set_args(ckFilterConstructTransform,
827                                            arg_ofs,
828                                            buffers[i]);
829         }
830         kernel_set_args(ckFilterConstructTransform,
831                         arg_ofs,
832                         transform_mem,
833                         rank_mem,
834                         task->filter_area,
835                         task->rect,
836                         task->buffer.pass_stride,
837                         task->buffer.frame_stride,
838                         use_time,
839                         task->radius,
840                         task->pca_threshold);
841
842         enqueue_kernel(ckFilterConstructTransform,
843                        task->storage.w,
844                        task->storage.h,
845                        256);
846
847         return true;
848 }
849
850 bool OpenCLDeviceBase::denoising_accumulate(device_ptr color_ptr,
851                                             device_ptr color_variance_ptr,
852                                             device_ptr scale_ptr,
853                                             int frame,
854                                             DenoisingTask *task)
855 {
856         cl_mem color_mem = CL_MEM_PTR(color_ptr);
857         cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr);
858         cl_mem scale_mem = CL_MEM_PTR(scale_ptr);
859
860         cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
861         cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
862         cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
863         cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer);
864         cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer);
865
866         cl_kernel ckNLMCalcDifference   = denoising_program(ustring("filter_nlm_calc_difference"));
867         cl_kernel ckNLMBlur             = denoising_program(ustring("filter_nlm_blur"));
868         cl_kernel ckNLMCalcWeight       = denoising_program(ustring("filter_nlm_calc_weight"));
869         cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian"));
870
871         int w = task->reconstruction_state.source_w;
872         int h = task->reconstruction_state.source_h;
873         int stride = task->buffer.stride;
874         int frame_offset = frame * task->buffer.frame_stride;
875         int t = task->tile_info->frames[frame];
876         char use_time = task->buffer.use_time? 1 : 0;
877
878         int r = task->radius;
879         int pass_stride = task->buffer.pass_stride;
880         int num_shifts = (2*r+1)*(2*r+1);
881
882         device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride*num_shifts);
883         device_sub_ptr blurDifference(task->buffer.temporary_mem, pass_stride*num_shifts, pass_stride*num_shifts);
884         cl_mem difference_mem = CL_MEM_PTR(*difference);
885         cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference);
886
887         kernel_set_args(ckNLMCalcDifference, 0,
888                         color_mem,
889                         color_variance_mem,
890                         scale_mem,
891                         difference_mem,
892                         w, h, stride,
893                         pass_stride,
894                         r,
895                         pass_stride,
896                         frame_offset,
897                         1.0f, task->nlm_k_2);
898         kernel_set_args(ckNLMBlur, 0,
899                         difference_mem,
900                         blurDifference_mem,
901                         w, h, stride,
902                         pass_stride,
903                         r, 4);
904         kernel_set_args(ckNLMCalcWeight, 0,
905                         blurDifference_mem,
906                         difference_mem,
907                         w, h, stride,
908                         pass_stride,
909                         r, 4);
910         kernel_set_args(ckNLMConstructGramian, 0,
911                         t,
912                         blurDifference_mem,
913                         buffer_mem,
914                         transform_mem,
915                         rank_mem,
916                         XtWX_mem,
917                         XtWY_mem,
918                         task->reconstruction_state.filter_window,
919                         w, h, stride,
920                         pass_stride,
921                         r, 4,
922                         frame_offset,
923                         use_time);
924
925         enqueue_kernel(ckNLMCalcDifference,   w*h, num_shifts, true);
926         enqueue_kernel(ckNLMBlur,             w*h, num_shifts, true);
927         enqueue_kernel(ckNLMCalcWeight,       w*h, num_shifts, true);
928         enqueue_kernel(ckNLMBlur,             w*h, num_shifts, true);
929         enqueue_kernel(ckNLMConstructGramian, w*h, num_shifts, true, 256);
930
931         return true;
932 }
933
934 bool OpenCLDeviceBase::denoising_solve(device_ptr output_ptr,
935                                        DenoisingTask *task)
936 {
937         cl_kernel ckFinalize = denoising_program(ustring("filter_finalize"));
938
939         cl_mem output_mem = CL_MEM_PTR(output_ptr);
940         cl_mem rank_mem   = CL_MEM_PTR(task->storage.rank.device_pointer);
941         cl_mem XtWX_mem   = CL_MEM_PTR(task->storage.XtWX.device_pointer);
942         cl_mem XtWY_mem   = CL_MEM_PTR(task->storage.XtWY.device_pointer);
943
944         int w = task->reconstruction_state.source_w;
945         int h = task->reconstruction_state.source_h;
946
947         kernel_set_args(ckFinalize, 0,
948                         output_mem,
949                         rank_mem,
950                         XtWX_mem,
951                         XtWY_mem,
952                         task->filter_area,
953                         task->reconstruction_state.buffer_params,
954                         task->render_buffer.samples);
955         enqueue_kernel(ckFinalize, w, h);
956
957         return true;
958 }
959
960 bool OpenCLDeviceBase::denoising_combine_halves(device_ptr a_ptr,
961                                                 device_ptr b_ptr,
962                                                 device_ptr mean_ptr,
963                                                 device_ptr variance_ptr,
964                                                 int r, int4 rect,
965                                                 DenoisingTask *task)
966 {
967         cl_mem a_mem = CL_MEM_PTR(a_ptr);
968         cl_mem b_mem = CL_MEM_PTR(b_ptr);
969         cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
970         cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
971
972         cl_kernel ckFilterCombineHalves = denoising_program(ustring("filter_combine_halves"));
973
974         kernel_set_args(ckFilterCombineHalves, 0,
975                         mean_mem,
976                         variance_mem,
977                         a_mem,
978                         b_mem,
979                         rect,
980                         r);
981         enqueue_kernel(ckFilterCombineHalves,
982                        task->rect.z-task->rect.x,
983                        task->rect.w-task->rect.y);
984
985         return true;
986 }
987
988 bool OpenCLDeviceBase::denoising_divide_shadow(device_ptr a_ptr,
989                                                device_ptr b_ptr,
990                                                device_ptr sample_variance_ptr,
991                                                device_ptr sv_variance_ptr,
992                                                device_ptr buffer_variance_ptr,
993                                                DenoisingTask *task)
994 {
995         cl_mem a_mem = CL_MEM_PTR(a_ptr);
996         cl_mem b_mem = CL_MEM_PTR(b_ptr);
997         cl_mem sample_variance_mem = CL_MEM_PTR(sample_variance_ptr);
998         cl_mem sv_variance_mem = CL_MEM_PTR(sv_variance_ptr);
999         cl_mem buffer_variance_mem = CL_MEM_PTR(buffer_variance_ptr);
1000
1001         cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer);
1002
1003         cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow"));
1004
1005         int arg_ofs = kernel_set_args(ckFilterDivideShadow, 0,
1006                                       task->render_buffer.samples,
1007                                       tile_info_mem);
1008         cl_mem buffers[9];
1009         for(int i = 0; i < 9; i++) {
1010                 buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]);
1011                 arg_ofs += kernel_set_args(ckFilterDivideShadow, arg_ofs,
1012                                            buffers[i]);
1013         }
1014         kernel_set_args(ckFilterDivideShadow, arg_ofs,
1015                         a_mem,
1016                         b_mem,
1017                         sample_variance_mem,
1018                         sv_variance_mem,
1019                         buffer_variance_mem,
1020                         task->rect,
1021                         task->render_buffer.pass_stride,
1022                         task->render_buffer.offset);
1023         enqueue_kernel(ckFilterDivideShadow,
1024                        task->rect.z-task->rect.x,
1025                        task->rect.w-task->rect.y);
1026
1027         return true;
1028 }
1029
1030 bool OpenCLDeviceBase::denoising_get_feature(int mean_offset,
1031                                              int variance_offset,
1032                                              device_ptr mean_ptr,
1033                                              device_ptr variance_ptr,
1034                                              float scale,
1035                                              DenoisingTask *task)
1036 {
1037         cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
1038         cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
1039
1040         cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer);
1041
1042         cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature"));
1043
1044         int arg_ofs = kernel_set_args(ckFilterGetFeature, 0,
1045                                       task->render_buffer.samples,
1046                                       tile_info_mem);
1047         cl_mem buffers[9];
1048         for(int i = 0; i < 9; i++) {
1049                 buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]);
1050                 arg_ofs += kernel_set_args(ckFilterGetFeature, arg_ofs,
1051                                            buffers[i]);
1052         }
1053         kernel_set_args(ckFilterGetFeature, arg_ofs,
1054                         mean_offset,
1055                         variance_offset,
1056                         mean_mem,
1057                         variance_mem,
1058                         scale,
1059                         task->rect,
1060                         task->render_buffer.pass_stride,
1061                         task->render_buffer.offset);
1062         enqueue_kernel(ckFilterGetFeature,
1063                        task->rect.z-task->rect.x,
1064                        task->rect.w-task->rect.y);
1065
1066         return true;
1067 }
1068
1069 bool OpenCLDeviceBase::denoising_write_feature(int out_offset,
1070                                                device_ptr from_ptr,
1071                                                device_ptr buffer_ptr,
1072                                                DenoisingTask *task)
1073 {
1074         cl_mem from_mem = CL_MEM_PTR(from_ptr);
1075         cl_mem buffer_mem = CL_MEM_PTR(buffer_ptr);
1076
1077         cl_kernel ckFilterWriteFeature = denoising_program(ustring("filter_write_feature"));
1078
1079         kernel_set_args(ckFilterWriteFeature, 0,
1080                         task->render_buffer.samples,
1081                         task->reconstruction_state.buffer_params,
1082                         task->filter_area,
1083                         from_mem,
1084                         buffer_mem,
1085                         out_offset,
1086                         task->rect);
1087         enqueue_kernel(ckFilterWriteFeature,
1088                        task->filter_area.z,
1089                        task->filter_area.w);
1090
1091         return true;
1092 }
1093
1094 bool OpenCLDeviceBase::denoising_detect_outliers(device_ptr image_ptr,
1095                                                  device_ptr variance_ptr,
1096                                                  device_ptr depth_ptr,
1097                                                  device_ptr output_ptr,
1098                                                  DenoisingTask *task)
1099 {
1100         cl_mem image_mem = CL_MEM_PTR(image_ptr);
1101         cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
1102         cl_mem depth_mem = CL_MEM_PTR(depth_ptr);
1103         cl_mem output_mem = CL_MEM_PTR(output_ptr);
1104
1105         cl_kernel ckFilterDetectOutliers = denoising_program(ustring("filter_detect_outliers"));
1106
1107         kernel_set_args(ckFilterDetectOutliers, 0,
1108                         image_mem,
1109                         variance_mem,
1110                         depth_mem,
1111                         output_mem,
1112                         task->rect,
1113                         task->buffer.pass_stride);
1114         enqueue_kernel(ckFilterDetectOutliers,
1115                        task->rect.z-task->rect.x,
1116                        task->rect.w-task->rect.y);
1117
1118         return true;
1119 }
1120
1121 void OpenCLDeviceBase::denoise(RenderTile &rtile, DenoisingTask& denoising)
1122 {
1123         denoising.functions.construct_transform = function_bind(&OpenCLDeviceBase::denoising_construct_transform, this, &denoising);
1124         denoising.functions.accumulate = function_bind(&OpenCLDeviceBase::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
1125         denoising.functions.solve = function_bind(&OpenCLDeviceBase::denoising_solve, this, _1, &denoising);
1126         denoising.functions.divide_shadow = function_bind(&OpenCLDeviceBase::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
1127         denoising.functions.non_local_means = function_bind(&OpenCLDeviceBase::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
1128         denoising.functions.combine_halves = function_bind(&OpenCLDeviceBase::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
1129         denoising.functions.get_feature = function_bind(&OpenCLDeviceBase::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
1130         denoising.functions.write_feature = function_bind(&OpenCLDeviceBase::denoising_write_feature, this, _1, _2, _3, &denoising);
1131         denoising.functions.detect_outliers = function_bind(&OpenCLDeviceBase::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
1132
1133         denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
1134         denoising.render_buffer.samples = rtile.sample;
1135         denoising.buffer.gpu_temporary_mem = true;
1136
1137         denoising.run_denoising(&rtile);
1138 }
1139
1140 void OpenCLDeviceBase::shader(DeviceTask& task)
1141 {
1142         /* cast arguments to cl types */
1143         cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
1144         cl_mem d_input = CL_MEM_PTR(task.shader_input);
1145         cl_mem d_output = CL_MEM_PTR(task.shader_output);
1146         cl_int d_shader_eval_type = task.shader_eval_type;
1147         cl_int d_shader_filter = task.shader_filter;
1148         cl_int d_shader_x = task.shader_x;
1149         cl_int d_shader_w = task.shader_w;
1150         cl_int d_offset = task.offset;
1151
1152         cl_kernel kernel;
1153
1154         if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
1155                 kernel = base_program(ustring("bake"));
1156         }
1157         else if(task.shader_eval_type == SHADER_EVAL_DISPLACE) {
1158                 kernel = base_program(ustring("displace"));
1159         }
1160         else {
1161                 kernel = base_program(ustring("background"));
1162         }
1163
1164         cl_uint start_arg_index =
1165                 kernel_set_args(kernel,
1166                                 0,
1167                                 d_data,
1168                                 d_input,
1169                                 d_output);
1170
1171         set_kernel_arg_buffers(kernel, &start_arg_index);
1172
1173         start_arg_index += kernel_set_args(kernel,
1174                                            start_arg_index,
1175                                            d_shader_eval_type);
1176         if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
1177                 start_arg_index += kernel_set_args(kernel,
1178                                                    start_arg_index,
1179                                                    d_shader_filter);
1180         }
1181         start_arg_index += kernel_set_args(kernel,
1182                                            start_arg_index,
1183                                            d_shader_x,
1184                                            d_shader_w,
1185                                            d_offset);
1186
1187         for(int sample = 0; sample < task.num_samples; sample++) {
1188
1189                 if(task.get_cancel())
1190                         break;
1191
1192                 kernel_set_args(kernel, start_arg_index, sample);
1193
1194                 enqueue_kernel(kernel, task.shader_w, 1);
1195
1196                 clFinish(cqCommandQueue);
1197
1198                 task.update_progress(NULL);
1199         }
1200 }
1201
1202 string OpenCLDeviceBase::kernel_build_options(const string *debug_src)
1203 {
1204         string build_options = "-cl-no-signed-zeros -cl-mad-enable ";
1205
1206         if(platform_name == "NVIDIA CUDA") {
1207                 build_options += "-D__KERNEL_OPENCL_NVIDIA__ "
1208                                  "-cl-nv-maxrregcount=32 "
1209                                  "-cl-nv-verbose ";
1210
1211                 uint compute_capability_major, compute_capability_minor;
1212                 clGetDeviceInfo(cdDevice, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,
1213                                 sizeof(cl_uint), &compute_capability_major, NULL);
1214                 clGetDeviceInfo(cdDevice, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,
1215                                 sizeof(cl_uint), &compute_capability_minor, NULL);
1216
1217                 build_options += string_printf("-D__COMPUTE_CAPABILITY__=%u ",
1218                                                compute_capability_major * 100 +
1219                                                compute_capability_minor * 10);
1220         }
1221
1222         else if(platform_name == "Apple")
1223                 build_options += "-D__KERNEL_OPENCL_APPLE__ ";
1224
1225         else if(platform_name == "AMD Accelerated Parallel Processing")
1226                 build_options += "-D__KERNEL_OPENCL_AMD__ ";
1227
1228         else if(platform_name == "Intel(R) OpenCL") {
1229                 build_options += "-D__KERNEL_OPENCL_INTEL_CPU__ ";
1230
1231                 /* Options for gdb source level kernel debugging.
1232                  * this segfaults on linux currently.
1233                  */
1234                 if(OpenCLInfo::use_debug() && debug_src)
1235                         build_options += "-g -s \"" + *debug_src + "\" ";
1236         }
1237
1238         if(info.has_half_images) {
1239                 build_options += "-D__KERNEL_CL_KHR_FP16__ ";
1240         }
1241
1242         if(OpenCLInfo::use_debug()) {
1243                 build_options += "-D__KERNEL_OPENCL_DEBUG__ ";
1244         }
1245
1246 #ifdef WITH_CYCLES_DEBUG
1247         build_options += "-D__KERNEL_DEBUG__ ";
1248 #endif
1249
1250         return build_options;
1251 }
1252
1253 /* TODO(sergey): In the future we can use variadic templates, once
1254  * C++0x is allowed. Should allow to clean this up a bit.
1255  */
1256 int OpenCLDeviceBase::kernel_set_args(cl_kernel kernel,
1257                     int start_argument_index,
1258                     const ArgumentWrapper& arg1,
1259                     const ArgumentWrapper& arg2,
1260                     const ArgumentWrapper& arg3,
1261                     const ArgumentWrapper& arg4,
1262                     const ArgumentWrapper& arg5,
1263                     const ArgumentWrapper& arg6,
1264                     const ArgumentWrapper& arg7,
1265                     const ArgumentWrapper& arg8,
1266                     const ArgumentWrapper& arg9,
1267                     const ArgumentWrapper& arg10,
1268                     const ArgumentWrapper& arg11,
1269                     const ArgumentWrapper& arg12,
1270                     const ArgumentWrapper& arg13,
1271                     const ArgumentWrapper& arg14,
1272                     const ArgumentWrapper& arg15,
1273                     const ArgumentWrapper& arg16,
1274                     const ArgumentWrapper& arg17,
1275                     const ArgumentWrapper& arg18,
1276                     const ArgumentWrapper& arg19,
1277                     const ArgumentWrapper& arg20,
1278                     const ArgumentWrapper& arg21,
1279                     const ArgumentWrapper& arg22,
1280                     const ArgumentWrapper& arg23,
1281                     const ArgumentWrapper& arg24,
1282                     const ArgumentWrapper& arg25,
1283                     const ArgumentWrapper& arg26,
1284                     const ArgumentWrapper& arg27,
1285                     const ArgumentWrapper& arg28,
1286                     const ArgumentWrapper& arg29,
1287                     const ArgumentWrapper& arg30,
1288                     const ArgumentWrapper& arg31,
1289                     const ArgumentWrapper& arg32,
1290                     const ArgumentWrapper& arg33)
1291 {
1292         int current_arg_index = 0;
1293 #define FAKE_VARARG_HANDLE_ARG(arg) \
1294         do { \
1295                 if(arg.pointer != NULL) { \
1296                         opencl_assert(clSetKernelArg( \
1297                                 kernel, \
1298                                 start_argument_index + current_arg_index, \
1299                                 arg.size, arg.pointer)); \
1300                         ++current_arg_index; \
1301                 } \
1302                 else { \
1303                         return current_arg_index; \
1304                 } \
1305         } while(false)
1306         FAKE_VARARG_HANDLE_ARG(arg1);
1307         FAKE_VARARG_HANDLE_ARG(arg2);
1308         FAKE_VARARG_HANDLE_ARG(arg3);
1309         FAKE_VARARG_HANDLE_ARG(arg4);
1310         FAKE_VARARG_HANDLE_ARG(arg5);
1311         FAKE_VARARG_HANDLE_ARG(arg6);
1312         FAKE_VARARG_HANDLE_ARG(arg7);
1313         FAKE_VARARG_HANDLE_ARG(arg8);
1314         FAKE_VARARG_HANDLE_ARG(arg9);
1315         FAKE_VARARG_HANDLE_ARG(arg10);
1316         FAKE_VARARG_HANDLE_ARG(arg11);
1317         FAKE_VARARG_HANDLE_ARG(arg12);
1318         FAKE_VARARG_HANDLE_ARG(arg13);
1319         FAKE_VARARG_HANDLE_ARG(arg14);
1320         FAKE_VARARG_HANDLE_ARG(arg15);
1321         FAKE_VARARG_HANDLE_ARG(arg16);
1322         FAKE_VARARG_HANDLE_ARG(arg17);
1323         FAKE_VARARG_HANDLE_ARG(arg18);
1324         FAKE_VARARG_HANDLE_ARG(arg19);
1325         FAKE_VARARG_HANDLE_ARG(arg20);
1326         FAKE_VARARG_HANDLE_ARG(arg21);
1327         FAKE_VARARG_HANDLE_ARG(arg22);
1328         FAKE_VARARG_HANDLE_ARG(arg23);
1329         FAKE_VARARG_HANDLE_ARG(arg24);
1330         FAKE_VARARG_HANDLE_ARG(arg25);
1331         FAKE_VARARG_HANDLE_ARG(arg26);
1332         FAKE_VARARG_HANDLE_ARG(arg27);
1333         FAKE_VARARG_HANDLE_ARG(arg28);
1334         FAKE_VARARG_HANDLE_ARG(arg29);
1335         FAKE_VARARG_HANDLE_ARG(arg30);
1336         FAKE_VARARG_HANDLE_ARG(arg31);
1337         FAKE_VARARG_HANDLE_ARG(arg32);
1338         FAKE_VARARG_HANDLE_ARG(arg33);
1339 #undef FAKE_VARARG_HANDLE_ARG
1340         return current_arg_index;
1341 }
1342
1343 void OpenCLDeviceBase::release_kernel_safe(cl_kernel kernel)
1344 {
1345         if(kernel) {
1346                 clReleaseKernel(kernel);
1347         }
1348 }
1349
1350 void OpenCLDeviceBase::release_mem_object_safe(cl_mem mem)
1351 {
1352         if(mem != NULL) {
1353                 clReleaseMemObject(mem);
1354         }
1355 }
1356
1357 void OpenCLDeviceBase::release_program_safe(cl_program program)
1358 {
1359         if(program) {
1360                 clReleaseProgram(program);
1361         }
1362 }
1363
1364 /* ** Those guys are for workign around some compiler-specific bugs ** */
1365
1366 cl_program OpenCLDeviceBase::load_cached_kernel(
1367         ustring key,
1368         thread_scoped_lock& cache_locker)
1369 {
1370         return OpenCLCache::get_program(cpPlatform,
1371                                         cdDevice,
1372                                         key,
1373                                         cache_locker);
1374 }
1375
1376 void OpenCLDeviceBase::store_cached_kernel(
1377         cl_program program,
1378         ustring key,
1379         thread_scoped_lock& cache_locker)
1380 {
1381         OpenCLCache::store_program(cpPlatform,
1382                                    cdDevice,
1383                                    program,
1384                                    key,
1385                                    cache_locker);
1386 }
1387
1388 string OpenCLDeviceBase::build_options_for_base_program(
1389         const DeviceRequestedFeatures& requested_features)
1390 {
1391         /* TODO(sergey): By default we compile all features, meaning
1392          * mega kernel is not getting feature-based optimizations.
1393          *
1394          * Ideally we need always compile kernel with as less features
1395          * enabled as possible to keep performance at it's max.
1396          */
1397
1398         /* For now disable baking when not in use as this has major
1399          * impact on kernel build times.
1400          */
1401         if(!requested_features.use_baking) {
1402                 return "-D__NO_BAKING__";
1403         }
1404
1405         return "";
1406 }
1407
1408 CCL_NAMESPACE_END
1409
1410 #endif