fe084edc90ec46140e3ec308549c9b9a846ec9d6
[blender-staging.git] / intern / cycles / device / opencl / opencl_base.cpp
1 /*
2  * Copyright 2011-2013 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #ifdef WITH_OPENCL
18
19 #include "device/opencl/opencl.h"
20
21 #include "kernel/kernel_types.h"
22
23 #include "util/util_algorithm.h"
24 #include "util/util_foreach.h"
25 #include "util/util_logging.h"
26 #include "util/util_md5.h"
27 #include "util/util_path.h"
28 #include "util/util_time.h"
29
30 CCL_NAMESPACE_BEGIN
31
32 struct texture_slot_t {
33         texture_slot_t(const string& name, int slot)
34                 : name(name),
35                   slot(slot) {
36         }
37         string name;
38         int slot;
39 };
40
41 bool OpenCLDeviceBase::opencl_error(cl_int err)
42 {
43         if(err != CL_SUCCESS) {
44                 string message = string_printf("OpenCL error (%d): %s", err, clewErrorString(err));
45                 if(error_msg == "")
46                         error_msg = message;
47                 fprintf(stderr, "%s\n", message.c_str());
48                 return true;
49         }
50
51         return false;
52 }
53
54 void OpenCLDeviceBase::opencl_error(const string& message)
55 {
56         if(error_msg == "")
57                 error_msg = message;
58         fprintf(stderr, "%s\n", message.c_str());
59 }
60
61 void OpenCLDeviceBase::opencl_assert_err(cl_int err, const char* where)
62 {
63         if(err != CL_SUCCESS) {
64                 string message = string_printf("OpenCL error (%d): %s in %s", err, clewErrorString(err), where);
65                 if(error_msg == "")
66                         error_msg = message;
67                 fprintf(stderr, "%s\n", message.c_str());
68 #ifndef NDEBUG
69                 abort();
70 #endif
71         }
72 }
73
74 OpenCLDeviceBase::OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool background_)
75 : Device(info, stats, background_),
76   memory_manager(this),
77   texture_info(this, "__texture_info", MEM_TEXTURE)
78 {
79         cpPlatform = NULL;
80         cdDevice = NULL;
81         cxContext = NULL;
82         cqCommandQueue = NULL;
83         null_mem = 0;
84         device_initialized = false;
85         textures_need_update = true;
86
87         vector<OpenCLPlatformDevice> usable_devices;
88         OpenCLInfo::get_usable_devices(&usable_devices);
89         if(usable_devices.size() == 0) {
90                 opencl_error("OpenCL: no devices found.");
91                 return;
92         }
93         assert(info.num < usable_devices.size());
94         OpenCLPlatformDevice& platform_device = usable_devices[info.num];
95         cpPlatform = platform_device.platform_id;
96         cdDevice = platform_device.device_id;
97         platform_name = platform_device.platform_name;
98         device_name = platform_device.device_name;
99         VLOG(2) << "Creating new Cycles device for OpenCL platform "
100                 << platform_name << ", device "
101                 << device_name << ".";
102
103         {
104                 /* try to use cached context */
105                 thread_scoped_lock cache_locker;
106                 cxContext = OpenCLCache::get_context(cpPlatform, cdDevice, cache_locker);
107
108                 if(cxContext == NULL) {
109                         /* create context properties array to specify platform */
110                         const cl_context_properties context_props[] = {
111                                 CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform,
112                                 0, 0
113                         };
114
115                         /* create context */
116                         cxContext = clCreateContext(context_props, 1, &cdDevice,
117                                 context_notify_callback, cdDevice, &ciErr);
118
119                         if(opencl_error(ciErr)) {
120                                 opencl_error("OpenCL: clCreateContext failed");
121                                 return;
122                         }
123
124                         /* cache it */
125                         OpenCLCache::store_context(cpPlatform, cdDevice, cxContext, cache_locker);
126                 }
127         }
128
129         cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr);
130         if(opencl_error(ciErr)) {
131                 opencl_error("OpenCL: Error creating command queue");
132                 return;
133         }
134
135         null_mem = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_ONLY, 1, NULL, &ciErr);
136         if(opencl_error(ciErr)) {
137                 opencl_error("OpenCL: Error creating memory buffer for NULL");
138                 return;
139         }
140
141         /* Allocate this right away so that texture_info is placed at offset 0 in the device memory buffers */
142         texture_info.resize(1);
143         memory_manager.alloc("texture_info", texture_info);
144
145         fprintf(stderr, "Device init success\n");
146         device_initialized = true;
147 }
148
149 OpenCLDeviceBase::~OpenCLDeviceBase()
150 {
151         task_pool.stop();
152
153         memory_manager.free();
154
155         if(null_mem)
156                 clReleaseMemObject(CL_MEM_PTR(null_mem));
157
158         ConstMemMap::iterator mt;
159         for(mt = const_mem_map.begin(); mt != const_mem_map.end(); mt++) {
160                 delete mt->second;
161         }
162
163         base_program.release();
164         if(cqCommandQueue)
165                 clReleaseCommandQueue(cqCommandQueue);
166         if(cxContext)
167                 clReleaseContext(cxContext);
168 }
169
170 void CL_CALLBACK OpenCLDeviceBase::context_notify_callback(const char *err_info,
171         const void * /*private_info*/, size_t /*cb*/, void *user_data)
172 {
173         string device_name = OpenCLInfo::get_device_name((cl_device_id)user_data);
174         fprintf(stderr, "OpenCL error (%s): %s\n", device_name.c_str(), err_info);
175 }
176
177 bool OpenCLDeviceBase::opencl_version_check()
178 {
179         string error;
180         if(!OpenCLInfo::platform_version_check(cpPlatform, &error)) {
181                 opencl_error(error);
182                 return false;
183         }
184         if(!OpenCLInfo::device_version_check(cdDevice, &error)) {
185                 opencl_error(error);
186                 return false;
187         }
188         return true;
189 }
190
191 string OpenCLDeviceBase::device_md5_hash(string kernel_custom_build_options)
192 {
193         MD5Hash md5;
194         char version[256], driver[256], name[256], vendor[256];
195
196         clGetPlatformInfo(cpPlatform, CL_PLATFORM_VENDOR, sizeof(vendor), &vendor, NULL);
197         clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL);
198         clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(name), &name, NULL);
199         clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(driver), &driver, NULL);
200
201         md5.append((uint8_t*)vendor, strlen(vendor));
202         md5.append((uint8_t*)version, strlen(version));
203         md5.append((uint8_t*)name, strlen(name));
204         md5.append((uint8_t*)driver, strlen(driver));
205
206         string options = kernel_build_options();
207         options += kernel_custom_build_options;
208         md5.append((uint8_t*)options.c_str(), options.size());
209
210         return md5.get_hex();
211 }
212
213 bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_features)
214 {
215         VLOG(2) << "Loading kernels for platform " << platform_name
216                 << ", device " << device_name << ".";
217         /* Verify if device was initialized. */
218         if(!device_initialized) {
219                 fprintf(stderr, "OpenCL: failed to initialize device.\n");
220                 return false;
221         }
222
223         /* Verify we have right opencl version. */
224         if(!opencl_version_check())
225                 return false;
226
227         base_program = OpenCLProgram(this, "base", "kernel.cl", build_options_for_base_program(requested_features));
228         base_program.add_kernel(ustring("convert_to_byte"));
229         base_program.add_kernel(ustring("convert_to_half_float"));
230         base_program.add_kernel(ustring("displace"));
231         base_program.add_kernel(ustring("background"));
232         base_program.add_kernel(ustring("bake"));
233         base_program.add_kernel(ustring("zero_buffer"));
234
235         denoising_program = OpenCLProgram(this, "denoising", "filter.cl", "");
236         denoising_program.add_kernel(ustring("filter_divide_shadow"));
237         denoising_program.add_kernel(ustring("filter_get_feature"));
238         denoising_program.add_kernel(ustring("filter_detect_outliers"));
239         denoising_program.add_kernel(ustring("filter_combine_halves"));
240         denoising_program.add_kernel(ustring("filter_construct_transform"));
241         denoising_program.add_kernel(ustring("filter_nlm_calc_difference"));
242         denoising_program.add_kernel(ustring("filter_nlm_blur"));
243         denoising_program.add_kernel(ustring("filter_nlm_calc_weight"));
244         denoising_program.add_kernel(ustring("filter_nlm_update_output"));
245         denoising_program.add_kernel(ustring("filter_nlm_normalize"));
246         denoising_program.add_kernel(ustring("filter_nlm_construct_gramian"));
247         denoising_program.add_kernel(ustring("filter_finalize"));
248         denoising_program.add_kernel(ustring("filter_set_tiles"));
249
250         vector<OpenCLProgram*> programs;
251         programs.push_back(&base_program);
252         programs.push_back(&denoising_program);
253         /* Call actual class to fill the vector with its programs. */
254         if(!load_kernels(requested_features, programs)) {
255                 return false;
256         }
257
258         /* Parallel compilation is supported by Cycles, but currently all OpenCL frameworks
259          * serialize the calls internally, so it's not much use right now.
260          * Note: When enabling parallel compilation, use_stdout in the OpenCLProgram constructor
261          * should be set to false as well. */
262 #if 0
263         TaskPool task_pool;
264         foreach(OpenCLProgram *program, programs) {
265                 task_pool.push(function_bind(&OpenCLProgram::load, program));
266         }
267         task_pool.wait_work();
268
269         foreach(OpenCLProgram *program, programs) {
270                 VLOG(2) << program->get_log();
271                 if(!program->is_loaded()) {
272                         program->report_error();
273                         return false;
274                 }
275         }
276 #else
277         foreach(OpenCLProgram *program, programs) {
278                 program->load();
279                 if(!program->is_loaded()) {
280                         return false;
281                 }
282         }
283 #endif
284
285         return true;
286 }
287
288 void OpenCLDeviceBase::mem_alloc(device_memory& mem)
289 {
290         if(mem.name) {
291                 VLOG(1) << "Buffer allocate: " << mem.name << ", "
292                             << string_human_readable_number(mem.memory_size()) << " bytes. ("
293                             << string_human_readable_size(mem.memory_size()) << ")";
294         }
295
296         size_t size = mem.memory_size();
297
298         /* check there is enough memory available for the allocation */
299         cl_ulong max_alloc_size = 0;
300         clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_alloc_size, NULL);
301
302         if(DebugFlags().opencl.mem_limit) {
303                 max_alloc_size = min(max_alloc_size,
304                                      cl_ulong(DebugFlags().opencl.mem_limit - stats.mem_used));
305         }
306
307         if(size > max_alloc_size) {
308                 string error = "Scene too complex to fit in available memory.";
309                 if(mem.name != NULL) {
310                         error += string_printf(" (allocating buffer %s failed.)", mem.name);
311                 }
312                 set_error(error);
313
314                 return;
315         }
316
317         cl_mem_flags mem_flag;
318         void *mem_ptr = NULL;
319
320         if(mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE)
321                 mem_flag = CL_MEM_READ_ONLY;
322         else
323                 mem_flag = CL_MEM_READ_WRITE;
324
325         /* Zero-size allocation might be invoked by render, but not really
326          * supported by OpenCL. Using NULL as device pointer also doesn't really
327          * work for some reason, so for the time being we'll use special case
328          * will null_mem buffer.
329          */
330         if(size != 0) {
331                 mem.device_pointer = (device_ptr)clCreateBuffer(cxContext,
332                                                                 mem_flag,
333                                                                 size,
334                                                                 mem_ptr,
335                                                                 &ciErr);
336                 opencl_assert_err(ciErr, "clCreateBuffer");
337         }
338         else {
339                 mem.device_pointer = null_mem;
340         }
341
342         stats.mem_alloc(size);
343         mem.device_size = size;
344 }
345
346 void OpenCLDeviceBase::mem_copy_to(device_memory& mem)
347 {
348         if(mem.type == MEM_TEXTURE) {
349                 tex_free(mem);
350                 tex_alloc(mem);
351         }
352         else {
353                 if(!mem.device_pointer) {
354                         mem_alloc(mem);
355                 }
356
357                 /* this is blocking */
358                 size_t size = mem.memory_size();
359                 if(size != 0) {
360                         opencl_assert(clEnqueueWriteBuffer(cqCommandQueue,
361                                                            CL_MEM_PTR(mem.device_pointer),
362                                                            CL_TRUE,
363                                                            0,
364                                                            size,
365                                                            mem.host_pointer,
366                                                            0,
367                                                            NULL, NULL));
368                 }
369         }
370 }
371
372 void OpenCLDeviceBase::mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
373 {
374         size_t offset = elem*y*w;
375         size_t size = elem*w*h;
376         assert(size != 0);
377         opencl_assert(clEnqueueReadBuffer(cqCommandQueue,
378                                           CL_MEM_PTR(mem.device_pointer),
379                                           CL_TRUE,
380                                           offset,
381                                           size,
382                                           (uchar*)mem.host_pointer + offset,
383                                           0,
384                                           NULL, NULL));
385 }
386
387 void OpenCLDeviceBase::mem_zero_kernel(device_ptr mem, size_t size)
388 {
389         cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer"));
390
391         size_t global_size[] = {1024, 1024};
392         size_t num_threads = global_size[0] * global_size[1];
393
394         cl_mem d_buffer = CL_MEM_PTR(mem);
395         cl_ulong d_offset = 0;
396         cl_ulong d_size = 0;
397
398         while(d_offset < size) {
399                 d_size = std::min<cl_ulong>(num_threads*sizeof(float4), size - d_offset);
400
401                 kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset);
402
403                 ciErr = clEnqueueNDRangeKernel(cqCommandQueue,
404                                                ckZeroBuffer,
405                                                2,
406                                                NULL,
407                                                global_size,
408                                                NULL,
409                                                0,
410                                                NULL,
411                                                NULL);
412                 opencl_assert_err(ciErr, "clEnqueueNDRangeKernel");
413
414                 d_offset += d_size;
415         }
416 }
417
418 void OpenCLDeviceBase::mem_zero(device_memory& mem)
419 {
420         if(!mem.device_pointer) {
421                 mem_alloc(mem);
422         }
423
424         if(mem.device_pointer) {
425                 if(base_program.is_loaded()) {
426                         mem_zero_kernel(mem.device_pointer, mem.memory_size());
427                 }
428
429                 if(mem.host_pointer) {
430                         memset(mem.host_pointer, 0, mem.memory_size());
431                 }
432
433                 if(!base_program.is_loaded()) {
434                         void* zero = mem.host_pointer;
435
436                         if(!mem.host_pointer) {
437                                 zero = util_aligned_malloc(mem.memory_size(), 16);
438                                 memset(zero, 0, mem.memory_size());
439                         }
440
441                         opencl_assert(clEnqueueWriteBuffer(cqCommandQueue,
442                                                            CL_MEM_PTR(mem.device_pointer),
443                                                            CL_TRUE,
444                                                            0,
445                                                            mem.memory_size(),
446                                                            zero,
447                                                            0,
448                                                            NULL, NULL));
449
450                         if(!mem.host_pointer) {
451                                 util_aligned_free(zero);
452                         }
453                 }
454         }
455 }
456
457 void OpenCLDeviceBase::mem_free(device_memory& mem)
458 {
459         if(mem.type == MEM_TEXTURE) {
460                 tex_free(mem);
461         }
462         else {
463                 if(mem.device_pointer) {
464                         if(mem.device_pointer != null_mem) {
465                                 opencl_assert(clReleaseMemObject(CL_MEM_PTR(mem.device_pointer)));
466                         }
467                         mem.device_pointer = 0;
468
469                         stats.mem_free(mem.device_size);
470                         mem.device_size = 0;
471                 }
472         }
473 }
474
475 int OpenCLDeviceBase::mem_address_alignment()
476 {
477         return OpenCLInfo::mem_address_alignment(cdDevice);
478 }
479
480 device_ptr OpenCLDeviceBase::mem_alloc_sub_ptr(device_memory& mem, int offset, int size)
481 {
482         cl_mem_flags mem_flag;
483         if(mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE)
484                 mem_flag = CL_MEM_READ_ONLY;
485         else
486                 mem_flag = CL_MEM_READ_WRITE;
487
488         cl_buffer_region info;
489         info.origin = mem.memory_elements_size(offset);
490         info.size = mem.memory_elements_size(size);
491
492         device_ptr sub_buf = (device_ptr) clCreateSubBuffer(CL_MEM_PTR(mem.device_pointer),
493                                                             mem_flag,
494                                                             CL_BUFFER_CREATE_TYPE_REGION,
495                                                             &info,
496                                                             &ciErr);
497         opencl_assert_err(ciErr, "clCreateSubBuffer");
498         return sub_buf;
499 }
500
501 void OpenCLDeviceBase::mem_free_sub_ptr(device_ptr device_pointer)
502 {
503         if(device_pointer && device_pointer != null_mem) {
504                 opencl_assert(clReleaseMemObject(CL_MEM_PTR(device_pointer)));
505         }
506 }
507
508 void OpenCLDeviceBase::const_copy_to(const char *name, void *host, size_t size)
509 {
510         ConstMemMap::iterator i = const_mem_map.find(name);
511         device_vector<uchar> *data;
512
513         if(i == const_mem_map.end()) {
514                 data = new device_vector<uchar>(this, name, MEM_READ_ONLY);
515                 data->alloc(size);
516                 const_mem_map.insert(ConstMemMap::value_type(name, data));
517         }
518         else {
519                 data = i->second;
520         }
521
522         memcpy(data->data(), host, size);
523         data->copy_to_device();
524 }
525
526 void OpenCLDeviceBase::tex_alloc(device_memory& mem)
527 {
528         VLOG(1) << "Texture allocate: " << mem.name << ", "
529                 << string_human_readable_number(mem.memory_size()) << " bytes. ("
530                 << string_human_readable_size(mem.memory_size()) << ")";
531
532         memory_manager.alloc(mem.name, mem);
533         /* Set the pointer to non-null to keep code that inspects its value from thinking its unallocated. */
534         mem.device_pointer = 1;
535         textures[mem.name] = &mem;
536         textures_need_update = true;
537 }
538
539 void OpenCLDeviceBase::tex_free(device_memory& mem)
540 {
541         if(mem.device_pointer) {
542                 mem.device_pointer = 0;
543
544                 if(memory_manager.free(mem)) {
545                         textures_need_update = true;
546                 }
547
548                 foreach(TexturesMap::value_type& value, textures) {
549                         if(value.second == &mem) {
550                                 textures.erase(value.first);
551                                 break;
552                         }
553                 }
554         }
555 }
556
557 size_t OpenCLDeviceBase::global_size_round_up(int group_size, int global_size)
558 {
559         int r = global_size % group_size;
560         return global_size + ((r == 0)? 0: group_size - r);
561 }
562
563 void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, bool x_workgroups, size_t max_workgroup_size)
564 {
565         size_t workgroup_size, max_work_items[3];
566
567         clGetKernelWorkGroupInfo(kernel, cdDevice,
568                 CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL);
569         clGetDeviceInfo(cdDevice,
570                 CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, max_work_items, NULL);
571
572         if(max_workgroup_size > 0 && workgroup_size > max_workgroup_size) {
573                 workgroup_size = max_workgroup_size;
574         }
575
576         /* Try to divide evenly over 2 dimensions. */
577         size_t local_size[2];
578         if(x_workgroups) {
579                 local_size[0] = workgroup_size;
580                 local_size[1] = 1;
581         }
582         else {
583                 size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1);
584                 local_size[0] = local_size[1] = sqrt_workgroup_size;
585         }
586
587         /* Some implementations have max size 1 on 2nd dimension. */
588         if(local_size[1] > max_work_items[1]) {
589                 local_size[0] = workgroup_size/max_work_items[1];
590                 local_size[1] = max_work_items[1];
591         }
592
593         size_t global_size[2] = {global_size_round_up(local_size[0], w),
594                                  global_size_round_up(local_size[1], h)};
595
596         /* Vertical size of 1 is coming from bake/shade kernels where we should
597          * not round anything up because otherwise we'll either be doing too
598          * much work per pixel (if we don't check global ID on Y axis) or will
599          * be checking for global ID to always have Y of 0.
600          */
601         if(h == 1) {
602                 global_size[h] = 1;
603         }
604
605         /* run kernel */
606         opencl_assert(clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL));
607         opencl_assert(clFlush(cqCommandQueue));
608 }
609
610 void OpenCLDeviceBase::set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name)
611 {
612         cl_mem ptr;
613
614         MemMap::iterator i = mem_map.find(name);
615         if(i != mem_map.end()) {
616                 ptr = CL_MEM_PTR(i->second);
617         }
618         else {
619                 /* work around NULL not working, even though the spec says otherwise */
620                 ptr = CL_MEM_PTR(null_mem);
621         }
622
623         opencl_assert(clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void*)&ptr));
624 }
625
626 void OpenCLDeviceBase::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg)
627 {
628         flush_texture_buffers();
629
630         memory_manager.set_kernel_arg_buffers(kernel, narg);
631 }
632
633 void OpenCLDeviceBase::flush_texture_buffers()
634 {
635         if(!textures_need_update) {
636                 return;
637         }
638         textures_need_update = false;
639
640         /* Setup slots for textures. */
641         int num_slots = 0;
642
643         vector<texture_slot_t> texture_slots;
644
645 #define KERNEL_TEX(type, name) \
646         if(textures.find(#name) != textures.end()) { \
647                 texture_slots.push_back(texture_slot_t(#name, num_slots)); \
648         } \
649         num_slots++;
650 #include "kernel/kernel_textures.h"
651
652         int num_data_slots = num_slots;
653
654         foreach(TexturesMap::value_type& tex, textures) {
655                 string name = tex.first;
656
657                 if(string_startswith(name, "__tex_image")) {
658                         int pos = name.rfind("_");
659                         int id = atoi(name.data() + pos + 1);
660                         texture_slots.push_back(texture_slot_t(name,
661                                                                    num_data_slots + id));
662                         num_slots = max(num_slots, num_data_slots + id + 1);
663                 }
664         }
665
666         /* Realloc texture descriptors buffer. */
667         memory_manager.free(texture_info);
668         texture_info.resize(num_slots);
669         memory_manager.alloc("texture_info", texture_info);
670
671         /* Fill in descriptors */
672         foreach(texture_slot_t& slot, texture_slots) {
673                 TextureInfo& info = texture_info[slot.slot];
674
675                 MemoryManager::BufferDescriptor desc = memory_manager.get_descriptor(slot.name);
676                 info.data = desc.offset;
677                 info.cl_buffer = desc.device_buffer;
678
679                 if(string_startswith(slot.name, "__tex_image")) {
680                         device_memory *mem = textures[slot.name];
681
682                         info.width = mem->data_width;
683                         info.height = mem->data_height;
684                         info.depth = mem->data_depth;
685
686                         info.interpolation = mem->interpolation;
687                         info.extension = mem->extension;
688                 }
689         }
690
691         /* Force write of descriptors. */
692         memory_manager.free(texture_info);
693         memory_manager.alloc("texture_info", texture_info);
694 }
695
696 void OpenCLDeviceBase::film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half)
697 {
698         /* cast arguments to cl types */
699         cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
700         cl_mem d_rgba = (rgba_byte)? CL_MEM_PTR(rgba_byte): CL_MEM_PTR(rgba_half);
701         cl_mem d_buffer = CL_MEM_PTR(buffer);
702         cl_int d_x = task.x;
703         cl_int d_y = task.y;
704         cl_int d_w = task.w;
705         cl_int d_h = task.h;
706         cl_float d_sample_scale = 1.0f/(task.sample + 1);
707         cl_int d_offset = task.offset;
708         cl_int d_stride = task.stride;
709
710
711         cl_kernel ckFilmConvertKernel = (rgba_byte)? base_program(ustring("convert_to_byte")): base_program(ustring("convert_to_half_float"));
712
713         cl_uint start_arg_index =
714                 kernel_set_args(ckFilmConvertKernel,
715                                 0,
716                                 d_data,
717                                 d_rgba,
718                                 d_buffer);
719
720         set_kernel_arg_buffers(ckFilmConvertKernel, &start_arg_index);
721
722         start_arg_index += kernel_set_args(ckFilmConvertKernel,
723                                            start_arg_index,
724                                            d_sample_scale,
725                                            d_x,
726                                            d_y,
727                                            d_w,
728                                            d_h,
729                                            d_offset,
730                                            d_stride);
731
732         enqueue_kernel(ckFilmConvertKernel, d_w, d_h);
733 }
734
735 bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr,
736                                                  device_ptr guide_ptr,
737                                                  device_ptr variance_ptr,
738                                                  device_ptr out_ptr,
739                                                  DenoisingTask *task)
740 {
741
742         int stride = task->buffer.stride;
743         int w = task->buffer.width;
744         int h = task->buffer.h;
745         int r = task->nlm_state.r;
746         int f = task->nlm_state.f;
747         float a = task->nlm_state.a;
748         float k_2 = task->nlm_state.k_2;
749
750         int shift_stride = stride*h;
751         int num_shifts = (2*r+1)*(2*r+1);
752         int mem_size = sizeof(float)*shift_stride*num_shifts;
753
754         cl_mem weightAccum = CL_MEM_PTR(task->nlm_state.temporary_3_ptr);
755
756         cl_mem difference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr);
757         opencl_assert_err(ciErr, "clCreateBuffer denoising_non_local_means");
758         cl_mem blurDifference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr);
759         opencl_assert_err(ciErr, "clCreateBuffer denoising_non_local_means");
760
761         cl_mem image_mem = CL_MEM_PTR(image_ptr);
762         cl_mem guide_mem = CL_MEM_PTR(guide_ptr);
763         cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
764         cl_mem out_mem = CL_MEM_PTR(out_ptr);
765
766         mem_zero_kernel(task->nlm_state.temporary_3_ptr, sizeof(float)*w*h);
767         mem_zero_kernel(out_ptr, sizeof(float)*w*h);
768
769         cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference"));
770         cl_kernel ckNLMBlur           = denoising_program(ustring("filter_nlm_blur"));
771         cl_kernel ckNLMCalcWeight     = denoising_program(ustring("filter_nlm_calc_weight"));
772         cl_kernel ckNLMUpdateOutput   = denoising_program(ustring("filter_nlm_update_output"));
773         cl_kernel ckNLMNormalize      = denoising_program(ustring("filter_nlm_normalize"));
774
775         kernel_set_args(ckNLMCalcDifference, 0,
776                         guide_mem,
777                         variance_mem,
778                         difference,
779                         w, h, stride,
780                         shift_stride,
781                         r, 0, a, k_2);
782         kernel_set_args(ckNLMBlur, 0,
783                         difference,
784                         blurDifference,
785                         w, h, stride,
786                         shift_stride,
787                         r, f);
788         kernel_set_args(ckNLMCalcWeight, 0,
789                         blurDifference,
790                         difference,
791                         w, h, stride,
792                         shift_stride,
793                         r, f);
794         kernel_set_args(ckNLMUpdateOutput, 0,
795                         blurDifference,
796                         image_mem,
797                         out_mem,
798                         weightAccum,
799                         w, h, stride,
800                         shift_stride,
801                         r, f);
802
803         enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true);
804         enqueue_kernel(ckNLMBlur,           w*h, num_shifts, true);
805         enqueue_kernel(ckNLMCalcWeight,     w*h, num_shifts, true);
806         enqueue_kernel(ckNLMBlur,           w*h, num_shifts, true);
807         enqueue_kernel(ckNLMUpdateOutput,   w*h, num_shifts, true);
808
809         opencl_assert(clReleaseMemObject(difference));
810         opencl_assert(clReleaseMemObject(blurDifference));
811
812         kernel_set_args(ckNLMNormalize, 0,
813                         out_mem, weightAccum, w, h, stride);
814         enqueue_kernel(ckNLMNormalize, w, h);
815
816         return true;
817 }
818
819 bool OpenCLDeviceBase::denoising_construct_transform(DenoisingTask *task)
820 {
821         cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
822         cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
823         cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
824
825         cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform"));
826
827         kernel_set_args(ckFilterConstructTransform, 0,
828                         buffer_mem,
829                         transform_mem,
830                         rank_mem,
831                         task->filter_area,
832                         task->rect,
833                         task->buffer.pass_stride,
834                         task->radius,
835                         task->pca_threshold);
836
837         enqueue_kernel(ckFilterConstructTransform,
838                        task->storage.w,
839                        task->storage.h,
840                        256);
841
842         return true;
843 }
844
845 bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
846                                              device_ptr color_variance_ptr,
847                                              device_ptr output_ptr,
848                                              DenoisingTask *task)
849 {
850         mem_zero(task->storage.XtWX);
851         mem_zero(task->storage.XtWY);
852
853         cl_mem color_mem = CL_MEM_PTR(color_ptr);
854         cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr);
855         cl_mem output_mem = CL_MEM_PTR(output_ptr);
856
857         cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
858         cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
859         cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
860         cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer);
861         cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer);
862
863         cl_kernel ckNLMCalcDifference   = denoising_program(ustring("filter_nlm_calc_difference"));
864         cl_kernel ckNLMBlur             = denoising_program(ustring("filter_nlm_blur"));
865         cl_kernel ckNLMCalcWeight       = denoising_program(ustring("filter_nlm_calc_weight"));
866         cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian"));
867         cl_kernel ckFinalize            = denoising_program(ustring("filter_finalize"));
868
869         int w = task->reconstruction_state.source_w;
870         int h = task->reconstruction_state.source_h;
871         int stride = task->buffer.stride;
872
873         int shift_stride = stride*h;
874         int num_shifts = (2*task->radius + 1)*(2*task->radius + 1);
875         int mem_size = sizeof(float)*shift_stride*num_shifts;
876
877         cl_mem difference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr);
878         opencl_assert_err(ciErr, "clCreateBuffer denoising_reconstruct");
879         cl_mem blurDifference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr);
880         opencl_assert_err(ciErr, "clCreateBuffer denoising_reconstruct");
881
882         kernel_set_args(ckNLMCalcDifference, 0,
883                         color_mem,
884                         color_variance_mem,
885                         difference,
886                         w, h, stride,
887                         shift_stride,
888                         task->radius,
889                         task->buffer.pass_stride,
890                         1.0f, task->nlm_k_2);
891         kernel_set_args(ckNLMBlur, 0,
892                         difference,
893                         blurDifference,
894                         w, h, stride,
895                         shift_stride,
896                         task->radius, 4);
897         kernel_set_args(ckNLMCalcWeight, 0,
898                         blurDifference,
899                         difference,
900                         w, h, stride,
901                         shift_stride,
902                         task->radius, 4);
903         kernel_set_args(ckNLMConstructGramian, 0,
904                         blurDifference,
905                         buffer_mem,
906                         transform_mem,
907                         rank_mem,
908                         XtWX_mem,
909                         XtWY_mem,
910                         task->reconstruction_state.filter_window,
911                         w, h, stride,
912                         shift_stride,
913                         task->radius, 4,
914                         task->buffer.pass_stride);
915
916         enqueue_kernel(ckNLMCalcDifference,   w*h, num_shifts, true);
917         enqueue_kernel(ckNLMBlur,             w*h, num_shifts, true);
918         enqueue_kernel(ckNLMCalcWeight,       w*h, num_shifts, true);
919         enqueue_kernel(ckNLMBlur,             w*h, num_shifts, true);
920         enqueue_kernel(ckNLMConstructGramian, w*h, num_shifts, true, 256);
921
922         opencl_assert(clReleaseMemObject(difference));
923         opencl_assert(clReleaseMemObject(blurDifference));
924
925         kernel_set_args(ckFinalize, 0,
926                         output_mem,
927                         rank_mem,
928                         XtWX_mem,
929                         XtWY_mem,
930                         task->filter_area,
931                         task->reconstruction_state.buffer_params,
932                         task->render_buffer.samples);
933         enqueue_kernel(ckFinalize, w, h);
934
935         return true;
936 }
937
938 bool OpenCLDeviceBase::denoising_combine_halves(device_ptr a_ptr,
939                                                 device_ptr b_ptr,
940                                                 device_ptr mean_ptr,
941                                                 device_ptr variance_ptr,
942                                                 int r, int4 rect,
943                                                 DenoisingTask *task)
944 {
945         cl_mem a_mem = CL_MEM_PTR(a_ptr);
946         cl_mem b_mem = CL_MEM_PTR(b_ptr);
947         cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
948         cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
949
950         cl_kernel ckFilterCombineHalves = denoising_program(ustring("filter_combine_halves"));
951
952         kernel_set_args(ckFilterCombineHalves, 0,
953                         mean_mem,
954                         variance_mem,
955                         a_mem,
956                         b_mem,
957                         rect,
958                         r);
959         enqueue_kernel(ckFilterCombineHalves,
960                        task->rect.z-task->rect.x,
961                        task->rect.w-task->rect.y);
962
963         return true;
964 }
965
966 bool OpenCLDeviceBase::denoising_divide_shadow(device_ptr a_ptr,
967                                                device_ptr b_ptr,
968                                                device_ptr sample_variance_ptr,
969                                                device_ptr sv_variance_ptr,
970                                                device_ptr buffer_variance_ptr,
971                                                DenoisingTask *task)
972 {
973         cl_mem a_mem = CL_MEM_PTR(a_ptr);
974         cl_mem b_mem = CL_MEM_PTR(b_ptr);
975         cl_mem sample_variance_mem = CL_MEM_PTR(sample_variance_ptr);
976         cl_mem sv_variance_mem = CL_MEM_PTR(sv_variance_ptr);
977         cl_mem buffer_variance_mem = CL_MEM_PTR(buffer_variance_ptr);
978
979         cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer);
980
981         cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow"));
982
983         kernel_set_args(ckFilterDivideShadow, 0,
984                         task->render_buffer.samples,
985                         tiles_mem,
986                         a_mem,
987                         b_mem,
988                         sample_variance_mem,
989                         sv_variance_mem,
990                         buffer_variance_mem,
991                         task->rect,
992                         task->render_buffer.pass_stride,
993                         task->render_buffer.denoising_data_offset);
994         enqueue_kernel(ckFilterDivideShadow,
995                        task->rect.z-task->rect.x,
996                        task->rect.w-task->rect.y);
997
998         return true;
999 }
1000
1001 bool OpenCLDeviceBase::denoising_get_feature(int mean_offset,
1002                                              int variance_offset,
1003                                              device_ptr mean_ptr,
1004                                              device_ptr variance_ptr,
1005                                              DenoisingTask *task)
1006 {
1007         cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
1008         cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
1009
1010         cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer);
1011
1012         cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature"));
1013
1014         kernel_set_args(ckFilterGetFeature, 0,
1015                         task->render_buffer.samples,
1016                         tiles_mem,
1017                         mean_offset,
1018                         variance_offset,
1019                         mean_mem,
1020                         variance_mem,
1021                         task->rect,
1022                         task->render_buffer.pass_stride,
1023                         task->render_buffer.denoising_data_offset);
1024         enqueue_kernel(ckFilterGetFeature,
1025                        task->rect.z-task->rect.x,
1026                        task->rect.w-task->rect.y);
1027
1028         return true;
1029 }
1030
1031 bool OpenCLDeviceBase::denoising_detect_outliers(device_ptr image_ptr,
1032                                                  device_ptr variance_ptr,
1033                                                  device_ptr depth_ptr,
1034                                                  device_ptr output_ptr,
1035                                                  DenoisingTask *task)
1036 {
1037         cl_mem image_mem = CL_MEM_PTR(image_ptr);
1038         cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
1039         cl_mem depth_mem = CL_MEM_PTR(depth_ptr);
1040         cl_mem output_mem = CL_MEM_PTR(output_ptr);
1041
1042         cl_kernel ckFilterDetectOutliers = denoising_program(ustring("filter_detect_outliers"));
1043
1044         kernel_set_args(ckFilterDetectOutliers, 0,
1045                         image_mem,
1046                         variance_mem,
1047                         depth_mem,
1048                         output_mem,
1049                         task->rect,
1050                         task->buffer.pass_stride);
1051         enqueue_kernel(ckFilterDetectOutliers,
1052                        task->rect.z-task->rect.x,
1053                        task->rect.w-task->rect.y);
1054
1055         return true;
1056 }
1057
1058 bool OpenCLDeviceBase::denoising_set_tiles(device_ptr *buffers,
1059                                            DenoisingTask *task)
1060 {
1061         task->tiles_mem.copy_to_device();
1062
1063         cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer);
1064
1065         cl_kernel ckFilterSetTiles = denoising_program(ustring("filter_set_tiles"));
1066
1067         kernel_set_args(ckFilterSetTiles, 0, tiles_mem);
1068         for(int i = 0; i < 9; i++) {
1069                 cl_mem buffer_mem = CL_MEM_PTR(buffers[i]);
1070                 kernel_set_args(ckFilterSetTiles, i+1, buffer_mem);
1071         }
1072
1073         enqueue_kernel(ckFilterSetTiles, 1, 1);
1074
1075         return true;
1076 }
1077
1078 void OpenCLDeviceBase::denoise(RenderTile &rtile, DenoisingTask& denoising, const DeviceTask &task)
1079 {
1080         denoising.functions.set_tiles = function_bind(&OpenCLDeviceBase::denoising_set_tiles, this, _1, &denoising);
1081         denoising.functions.construct_transform = function_bind(&OpenCLDeviceBase::denoising_construct_transform, this, &denoising);
1082         denoising.functions.reconstruct = function_bind(&OpenCLDeviceBase::denoising_reconstruct, this, _1, _2, _3, &denoising);
1083         denoising.functions.divide_shadow = function_bind(&OpenCLDeviceBase::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
1084         denoising.functions.non_local_means = function_bind(&OpenCLDeviceBase::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
1085         denoising.functions.combine_halves = function_bind(&OpenCLDeviceBase::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
1086         denoising.functions.get_feature = function_bind(&OpenCLDeviceBase::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
1087         denoising.functions.detect_outliers = function_bind(&OpenCLDeviceBase::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
1088
1089         denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
1090         denoising.render_buffer.samples = rtile.sample;
1091
1092         RenderTile rtiles[9];
1093         rtiles[4] = rtile;
1094         task.map_neighbor_tiles(rtiles, this);
1095         denoising.tiles_from_rendertiles(rtiles);
1096
1097         denoising.init_from_devicetask(task);
1098
1099         denoising.run_denoising();
1100
1101         task.unmap_neighbor_tiles(rtiles, this);
1102 }
1103
1104 void OpenCLDeviceBase::shader(DeviceTask& task)
1105 {
1106         /* cast arguments to cl types */
1107         cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
1108         cl_mem d_input = CL_MEM_PTR(task.shader_input);
1109         cl_mem d_output = CL_MEM_PTR(task.shader_output);
1110         cl_int d_shader_eval_type = task.shader_eval_type;
1111         cl_int d_shader_filter = task.shader_filter;
1112         cl_int d_shader_x = task.shader_x;
1113         cl_int d_shader_w = task.shader_w;
1114         cl_int d_offset = task.offset;
1115
1116         cl_kernel kernel;
1117
1118         if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
1119                 kernel = base_program(ustring("bake"));
1120         }
1121         else if(task.shader_eval_type == SHADER_EVAL_DISPLACE) {
1122                 kernel = base_program(ustring("displace"));
1123         }
1124         else {
1125                 kernel = base_program(ustring("background"));
1126         }
1127
1128         cl_uint start_arg_index =
1129                 kernel_set_args(kernel,
1130                                 0,
1131                                 d_data,
1132                                 d_input,
1133                                 d_output);
1134
1135         set_kernel_arg_buffers(kernel, &start_arg_index);
1136
1137         start_arg_index += kernel_set_args(kernel,
1138                                            start_arg_index,
1139                                            d_shader_eval_type);
1140         if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
1141                 start_arg_index += kernel_set_args(kernel,
1142                                                    start_arg_index,
1143                                                    d_shader_filter);
1144         }
1145         start_arg_index += kernel_set_args(kernel,
1146                                            start_arg_index,
1147                                            d_shader_x,
1148                                            d_shader_w,
1149                                            d_offset);
1150
1151         for(int sample = 0; sample < task.num_samples; sample++) {
1152
1153                 if(task.get_cancel())
1154                         break;
1155
1156                 kernel_set_args(kernel, start_arg_index, sample);
1157
1158                 enqueue_kernel(kernel, task.shader_w, 1);
1159
1160                 clFinish(cqCommandQueue);
1161
1162                 task.update_progress(NULL);
1163         }
1164 }
1165
1166 string OpenCLDeviceBase::kernel_build_options(const string *debug_src)
1167 {
1168         string build_options = "-cl-no-signed-zeros -cl-mad-enable ";
1169
1170         if(platform_name == "NVIDIA CUDA") {
1171                 build_options += "-D__KERNEL_OPENCL_NVIDIA__ "
1172                                  "-cl-nv-maxrregcount=32 "
1173                                  "-cl-nv-verbose ";
1174
1175                 uint compute_capability_major, compute_capability_minor;
1176                 clGetDeviceInfo(cdDevice, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,
1177                                 sizeof(cl_uint), &compute_capability_major, NULL);
1178                 clGetDeviceInfo(cdDevice, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,
1179                                 sizeof(cl_uint), &compute_capability_minor, NULL);
1180
1181                 build_options += string_printf("-D__COMPUTE_CAPABILITY__=%u ",
1182                                                compute_capability_major * 100 +
1183                                                compute_capability_minor * 10);
1184         }
1185
1186         else if(platform_name == "Apple")
1187                 build_options += "-D__KERNEL_OPENCL_APPLE__ ";
1188
1189         else if(platform_name == "AMD Accelerated Parallel Processing")
1190                 build_options += "-D__KERNEL_OPENCL_AMD__ ";
1191
1192         else if(platform_name == "Intel(R) OpenCL") {
1193                 build_options += "-D__KERNEL_OPENCL_INTEL_CPU__ ";
1194
1195                 /* Options for gdb source level kernel debugging.
1196                  * this segfaults on linux currently.
1197                  */
1198                 if(OpenCLInfo::use_debug() && debug_src)
1199                         build_options += "-g -s \"" + *debug_src + "\" ";
1200         }
1201
1202         if(OpenCLInfo::use_debug())
1203                 build_options += "-D__KERNEL_OPENCL_DEBUG__ ";
1204
1205 #ifdef WITH_CYCLES_DEBUG
1206         build_options += "-D__KERNEL_DEBUG__ ";
1207 #endif
1208
1209         return build_options;
1210 }
1211
1212 /* TODO(sergey): In the future we can use variadic templates, once
1213  * C++0x is allowed. Should allow to clean this up a bit.
1214  */
1215 int OpenCLDeviceBase::kernel_set_args(cl_kernel kernel,
1216                     int start_argument_index,
1217                     const ArgumentWrapper& arg1,
1218                     const ArgumentWrapper& arg2,
1219                     const ArgumentWrapper& arg3,
1220                     const ArgumentWrapper& arg4,
1221                     const ArgumentWrapper& arg5,
1222                     const ArgumentWrapper& arg6,
1223                     const ArgumentWrapper& arg7,
1224                     const ArgumentWrapper& arg8,
1225                     const ArgumentWrapper& arg9,
1226                     const ArgumentWrapper& arg10,
1227                     const ArgumentWrapper& arg11,
1228                     const ArgumentWrapper& arg12,
1229                     const ArgumentWrapper& arg13,
1230                     const ArgumentWrapper& arg14,
1231                     const ArgumentWrapper& arg15,
1232                     const ArgumentWrapper& arg16,
1233                     const ArgumentWrapper& arg17,
1234                     const ArgumentWrapper& arg18,
1235                     const ArgumentWrapper& arg19,
1236                     const ArgumentWrapper& arg20,
1237                     const ArgumentWrapper& arg21,
1238                     const ArgumentWrapper& arg22,
1239                     const ArgumentWrapper& arg23,
1240                     const ArgumentWrapper& arg24,
1241                     const ArgumentWrapper& arg25,
1242                     const ArgumentWrapper& arg26,
1243                     const ArgumentWrapper& arg27,
1244                     const ArgumentWrapper& arg28,
1245                     const ArgumentWrapper& arg29,
1246                     const ArgumentWrapper& arg30,
1247                     const ArgumentWrapper& arg31,
1248                     const ArgumentWrapper& arg32,
1249                     const ArgumentWrapper& arg33)
1250 {
1251         int current_arg_index = 0;
1252 #define FAKE_VARARG_HANDLE_ARG(arg) \
1253         do { \
1254                 if(arg.pointer != NULL) { \
1255                         opencl_assert(clSetKernelArg( \
1256                                 kernel, \
1257                                 start_argument_index + current_arg_index, \
1258                                 arg.size, arg.pointer)); \
1259                         ++current_arg_index; \
1260                 } \
1261                 else { \
1262                         return current_arg_index; \
1263                 } \
1264         } while(false)
1265         FAKE_VARARG_HANDLE_ARG(arg1);
1266         FAKE_VARARG_HANDLE_ARG(arg2);
1267         FAKE_VARARG_HANDLE_ARG(arg3);
1268         FAKE_VARARG_HANDLE_ARG(arg4);
1269         FAKE_VARARG_HANDLE_ARG(arg5);
1270         FAKE_VARARG_HANDLE_ARG(arg6);
1271         FAKE_VARARG_HANDLE_ARG(arg7);
1272         FAKE_VARARG_HANDLE_ARG(arg8);
1273         FAKE_VARARG_HANDLE_ARG(arg9);
1274         FAKE_VARARG_HANDLE_ARG(arg10);
1275         FAKE_VARARG_HANDLE_ARG(arg11);
1276         FAKE_VARARG_HANDLE_ARG(arg12);
1277         FAKE_VARARG_HANDLE_ARG(arg13);
1278         FAKE_VARARG_HANDLE_ARG(arg14);
1279         FAKE_VARARG_HANDLE_ARG(arg15);
1280         FAKE_VARARG_HANDLE_ARG(arg16);
1281         FAKE_VARARG_HANDLE_ARG(arg17);
1282         FAKE_VARARG_HANDLE_ARG(arg18);
1283         FAKE_VARARG_HANDLE_ARG(arg19);
1284         FAKE_VARARG_HANDLE_ARG(arg20);
1285         FAKE_VARARG_HANDLE_ARG(arg21);
1286         FAKE_VARARG_HANDLE_ARG(arg22);
1287         FAKE_VARARG_HANDLE_ARG(arg23);
1288         FAKE_VARARG_HANDLE_ARG(arg24);
1289         FAKE_VARARG_HANDLE_ARG(arg25);
1290         FAKE_VARARG_HANDLE_ARG(arg26);
1291         FAKE_VARARG_HANDLE_ARG(arg27);
1292         FAKE_VARARG_HANDLE_ARG(arg28);
1293         FAKE_VARARG_HANDLE_ARG(arg29);
1294         FAKE_VARARG_HANDLE_ARG(arg30);
1295         FAKE_VARARG_HANDLE_ARG(arg31);
1296         FAKE_VARARG_HANDLE_ARG(arg32);
1297         FAKE_VARARG_HANDLE_ARG(arg33);
1298 #undef FAKE_VARARG_HANDLE_ARG
1299         return current_arg_index;
1300 }
1301
1302 void OpenCLDeviceBase::release_kernel_safe(cl_kernel kernel)
1303 {
1304         if(kernel) {
1305                 clReleaseKernel(kernel);
1306         }
1307 }
1308
1309 void OpenCLDeviceBase::release_mem_object_safe(cl_mem mem)
1310 {
1311         if(mem != NULL) {
1312                 clReleaseMemObject(mem);
1313         }
1314 }
1315
1316 void OpenCLDeviceBase::release_program_safe(cl_program program)
1317 {
1318         if(program) {
1319                 clReleaseProgram(program);
1320         }
1321 }
1322
1323 /* ** Those guys are for workign around some compiler-specific bugs ** */
1324
1325 cl_program OpenCLDeviceBase::load_cached_kernel(
1326         ustring key,
1327         thread_scoped_lock& cache_locker)
1328 {
1329         return OpenCLCache::get_program(cpPlatform,
1330                                         cdDevice,
1331                                         key,
1332                                         cache_locker);
1333 }
1334
1335 void OpenCLDeviceBase::store_cached_kernel(
1336         cl_program program,
1337         ustring key,
1338         thread_scoped_lock& cache_locker)
1339 {
1340         OpenCLCache::store_program(cpPlatform,
1341                                    cdDevice,
1342                                    program,
1343                                    key,
1344                                    cache_locker);
1345 }
1346
1347 string OpenCLDeviceBase::build_options_for_base_program(
1348         const DeviceRequestedFeatures& requested_features)
1349 {
1350         /* TODO(sergey): By default we compile all features, meaning
1351          * mega kernel is not getting feature-based optimizations.
1352          *
1353          * Ideally we need always compile kernel with as less features
1354          * enabled as possible to keep performance at it's max.
1355          */
1356
1357         /* For now disable baking when not in use as this has major
1358          * impact on kernel build times.
1359          */
1360         if(!requested_features.use_baking) {
1361                 return "-D__NO_BAKING__";
1362         }
1363
1364         return "";
1365 }
1366
1367 CCL_NAMESPACE_END
1368
1369 #endif