Cycles: Remove unneeded include statements
[blender-staging.git] / intern / cycles / device / opencl / opencl_base.cpp
1 /*
2  * Copyright 2011-2013 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #ifdef WITH_OPENCL
18
19 #include "device/opencl/opencl.h"
20
21 #include "kernel/kernel_types.h"
22
23 #include "util/util_algorithm.h"
24 #include "util/util_debug.h"
25 #include "util/util_foreach.h"
26 #include "util/util_logging.h"
27 #include "util/util_md5.h"
28 #include "util/util_path.h"
29 #include "util/util_time.h"
30
31 CCL_NAMESPACE_BEGIN
32
33 struct texture_slot_t {
34         texture_slot_t(const string& name, int slot)
35                 : name(name),
36                   slot(slot) {
37         }
38         string name;
39         int slot;
40 };
41
42 bool OpenCLDeviceBase::opencl_error(cl_int err)
43 {
44         if(err != CL_SUCCESS) {
45                 string message = string_printf("OpenCL error (%d): %s", err, clewErrorString(err));
46                 if(error_msg == "")
47                         error_msg = message;
48                 fprintf(stderr, "%s\n", message.c_str());
49                 return true;
50         }
51
52         return false;
53 }
54
55 void OpenCLDeviceBase::opencl_error(const string& message)
56 {
57         if(error_msg == "")
58                 error_msg = message;
59         fprintf(stderr, "%s\n", message.c_str());
60 }
61
62 void OpenCLDeviceBase::opencl_assert_err(cl_int err, const char* where)
63 {
64         if(err != CL_SUCCESS) {
65                 string message = string_printf("OpenCL error (%d): %s in %s", err, clewErrorString(err), where);
66                 if(error_msg == "")
67                         error_msg = message;
68                 fprintf(stderr, "%s\n", message.c_str());
69 #ifndef NDEBUG
70                 abort();
71 #endif
72         }
73 }
74
75 OpenCLDeviceBase::OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool background_)
76 : Device(info, stats, background_),
77   memory_manager(this),
78   texture_info(this, "__texture_info", MEM_TEXTURE)
79 {
80         cpPlatform = NULL;
81         cdDevice = NULL;
82         cxContext = NULL;
83         cqCommandQueue = NULL;
84         null_mem = 0;
85         device_initialized = false;
86         textures_need_update = true;
87
88         vector<OpenCLPlatformDevice> usable_devices;
89         OpenCLInfo::get_usable_devices(&usable_devices);
90         if(usable_devices.size() == 0) {
91                 opencl_error("OpenCL: no devices found.");
92                 return;
93         }
94         assert(info.num < usable_devices.size());
95         OpenCLPlatformDevice& platform_device = usable_devices[info.num];
96         cpPlatform = platform_device.platform_id;
97         cdDevice = platform_device.device_id;
98         platform_name = platform_device.platform_name;
99         device_name = platform_device.device_name;
100         VLOG(2) << "Creating new Cycles device for OpenCL platform "
101                 << platform_name << ", device "
102                 << device_name << ".";
103
104         {
105                 /* try to use cached context */
106                 thread_scoped_lock cache_locker;
107                 cxContext = OpenCLCache::get_context(cpPlatform, cdDevice, cache_locker);
108
109                 if(cxContext == NULL) {
110                         /* create context properties array to specify platform */
111                         const cl_context_properties context_props[] = {
112                                 CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform,
113                                 0, 0
114                         };
115
116                         /* create context */
117                         cxContext = clCreateContext(context_props, 1, &cdDevice,
118                                 context_notify_callback, cdDevice, &ciErr);
119
120                         if(opencl_error(ciErr)) {
121                                 opencl_error("OpenCL: clCreateContext failed");
122                                 return;
123                         }
124
125                         /* cache it */
126                         OpenCLCache::store_context(cpPlatform, cdDevice, cxContext, cache_locker);
127                 }
128         }
129
130         cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr);
131         if(opencl_error(ciErr)) {
132                 opencl_error("OpenCL: Error creating command queue");
133                 return;
134         }
135
136         null_mem = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_ONLY, 1, NULL, &ciErr);
137         if(opencl_error(ciErr)) {
138                 opencl_error("OpenCL: Error creating memory buffer for NULL");
139                 return;
140         }
141
142         /* Allocate this right away so that texture_info is placed at offset 0 in the device memory buffers */
143         texture_info.resize(1);
144         memory_manager.alloc("texture_info", texture_info);
145
146         fprintf(stderr, "Device init success\n");
147         device_initialized = true;
148 }
149
150 OpenCLDeviceBase::~OpenCLDeviceBase()
151 {
152         task_pool.stop();
153
154         memory_manager.free();
155
156         if(null_mem)
157                 clReleaseMemObject(CL_MEM_PTR(null_mem));
158
159         ConstMemMap::iterator mt;
160         for(mt = const_mem_map.begin(); mt != const_mem_map.end(); mt++) {
161                 delete mt->second;
162         }
163
164         base_program.release();
165         if(cqCommandQueue)
166                 clReleaseCommandQueue(cqCommandQueue);
167         if(cxContext)
168                 clReleaseContext(cxContext);
169 }
170
171 void CL_CALLBACK OpenCLDeviceBase::context_notify_callback(const char *err_info,
172         const void * /*private_info*/, size_t /*cb*/, void *user_data)
173 {
174         string device_name = OpenCLInfo::get_device_name((cl_device_id)user_data);
175         fprintf(stderr, "OpenCL error (%s): %s\n", device_name.c_str(), err_info);
176 }
177
178 bool OpenCLDeviceBase::opencl_version_check()
179 {
180         string error;
181         if(!OpenCLInfo::platform_version_check(cpPlatform, &error)) {
182                 opencl_error(error);
183                 return false;
184         }
185         if(!OpenCLInfo::device_version_check(cdDevice, &error)) {
186                 opencl_error(error);
187                 return false;
188         }
189         return true;
190 }
191
192 string OpenCLDeviceBase::device_md5_hash(string kernel_custom_build_options)
193 {
194         MD5Hash md5;
195         char version[256], driver[256], name[256], vendor[256];
196
197         clGetPlatformInfo(cpPlatform, CL_PLATFORM_VENDOR, sizeof(vendor), &vendor, NULL);
198         clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL);
199         clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(name), &name, NULL);
200         clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(driver), &driver, NULL);
201
202         md5.append((uint8_t*)vendor, strlen(vendor));
203         md5.append((uint8_t*)version, strlen(version));
204         md5.append((uint8_t*)name, strlen(name));
205         md5.append((uint8_t*)driver, strlen(driver));
206
207         string options = kernel_build_options();
208         options += kernel_custom_build_options;
209         md5.append((uint8_t*)options.c_str(), options.size());
210
211         return md5.get_hex();
212 }
213
214 bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_features)
215 {
216         VLOG(2) << "Loading kernels for platform " << platform_name
217                 << ", device " << device_name << ".";
218         /* Verify if device was initialized. */
219         if(!device_initialized) {
220                 fprintf(stderr, "OpenCL: failed to initialize device.\n");
221                 return false;
222         }
223
224         /* Verify we have right opencl version. */
225         if(!opencl_version_check())
226                 return false;
227
228         base_program = OpenCLProgram(this, "base", "kernel.cl", build_options_for_base_program(requested_features));
229         base_program.add_kernel(ustring("convert_to_byte"));
230         base_program.add_kernel(ustring("convert_to_half_float"));
231         base_program.add_kernel(ustring("displace"));
232         base_program.add_kernel(ustring("background"));
233         base_program.add_kernel(ustring("bake"));
234         base_program.add_kernel(ustring("zero_buffer"));
235
236         denoising_program = OpenCLProgram(this, "denoising", "filter.cl", "");
237         denoising_program.add_kernel(ustring("filter_divide_shadow"));
238         denoising_program.add_kernel(ustring("filter_get_feature"));
239         denoising_program.add_kernel(ustring("filter_detect_outliers"));
240         denoising_program.add_kernel(ustring("filter_combine_halves"));
241         denoising_program.add_kernel(ustring("filter_construct_transform"));
242         denoising_program.add_kernel(ustring("filter_nlm_calc_difference"));
243         denoising_program.add_kernel(ustring("filter_nlm_blur"));
244         denoising_program.add_kernel(ustring("filter_nlm_calc_weight"));
245         denoising_program.add_kernel(ustring("filter_nlm_update_output"));
246         denoising_program.add_kernel(ustring("filter_nlm_normalize"));
247         denoising_program.add_kernel(ustring("filter_nlm_construct_gramian"));
248         denoising_program.add_kernel(ustring("filter_finalize"));
249         denoising_program.add_kernel(ustring("filter_set_tiles"));
250
251         vector<OpenCLProgram*> programs;
252         programs.push_back(&base_program);
253         programs.push_back(&denoising_program);
254         /* Call actual class to fill the vector with its programs. */
255         if(!load_kernels(requested_features, programs)) {
256                 return false;
257         }
258
259         /* Parallel compilation is supported by Cycles, but currently all OpenCL frameworks
260          * serialize the calls internally, so it's not much use right now.
261          * Note: When enabling parallel compilation, use_stdout in the OpenCLProgram constructor
262          * should be set to false as well. */
263 #if 0
264         TaskPool task_pool;
265         foreach(OpenCLProgram *program, programs) {
266                 task_pool.push(function_bind(&OpenCLProgram::load, program));
267         }
268         task_pool.wait_work();
269
270         foreach(OpenCLProgram *program, programs) {
271                 VLOG(2) << program->get_log();
272                 if(!program->is_loaded()) {
273                         program->report_error();
274                         return false;
275                 }
276         }
277 #else
278         foreach(OpenCLProgram *program, programs) {
279                 program->load();
280                 if(!program->is_loaded()) {
281                         return false;
282                 }
283         }
284 #endif
285
286         return true;
287 }
288
289 void OpenCLDeviceBase::mem_alloc(device_memory& mem)
290 {
291         if(mem.name) {
292                 VLOG(1) << "Buffer allocate: " << mem.name << ", "
293                             << string_human_readable_number(mem.memory_size()) << " bytes. ("
294                             << string_human_readable_size(mem.memory_size()) << ")";
295         }
296
297         size_t size = mem.memory_size();
298
299         /* check there is enough memory available for the allocation */
300         cl_ulong max_alloc_size = 0;
301         clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_alloc_size, NULL);
302
303         if(DebugFlags().opencl.mem_limit) {
304                 max_alloc_size = min(max_alloc_size,
305                                      cl_ulong(DebugFlags().opencl.mem_limit - stats.mem_used));
306         }
307
308         if(size > max_alloc_size) {
309                 string error = "Scene too complex to fit in available memory.";
310                 if(mem.name != NULL) {
311                         error += string_printf(" (allocating buffer %s failed.)", mem.name);
312                 }
313                 set_error(error);
314
315                 return;
316         }
317
318         cl_mem_flags mem_flag;
319         void *mem_ptr = NULL;
320
321         if(mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE)
322                 mem_flag = CL_MEM_READ_ONLY;
323         else
324                 mem_flag = CL_MEM_READ_WRITE;
325
326         /* Zero-size allocation might be invoked by render, but not really
327          * supported by OpenCL. Using NULL as device pointer also doesn't really
328          * work for some reason, so for the time being we'll use special case
329          * will null_mem buffer.
330          */
331         if(size != 0) {
332                 mem.device_pointer = (device_ptr)clCreateBuffer(cxContext,
333                                                                 mem_flag,
334                                                                 size,
335                                                                 mem_ptr,
336                                                                 &ciErr);
337                 opencl_assert_err(ciErr, "clCreateBuffer");
338         }
339         else {
340                 mem.device_pointer = null_mem;
341         }
342
343         stats.mem_alloc(size);
344         mem.device_size = size;
345 }
346
347 void OpenCLDeviceBase::mem_copy_to(device_memory& mem)
348 {
349         if(mem.type == MEM_TEXTURE) {
350                 tex_free(mem);
351                 tex_alloc(mem);
352         }
353         else {
354                 if(!mem.device_pointer) {
355                         mem_alloc(mem);
356                 }
357
358                 /* this is blocking */
359                 size_t size = mem.memory_size();
360                 if(size != 0) {
361                         opencl_assert(clEnqueueWriteBuffer(cqCommandQueue,
362                                                            CL_MEM_PTR(mem.device_pointer),
363                                                            CL_TRUE,
364                                                            0,
365                                                            size,
366                                                            mem.host_pointer,
367                                                            0,
368                                                            NULL, NULL));
369                 }
370         }
371 }
372
373 void OpenCLDeviceBase::mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
374 {
375         size_t offset = elem*y*w;
376         size_t size = elem*w*h;
377         assert(size != 0);
378         opencl_assert(clEnqueueReadBuffer(cqCommandQueue,
379                                           CL_MEM_PTR(mem.device_pointer),
380                                           CL_TRUE,
381                                           offset,
382                                           size,
383                                           (uchar*)mem.host_pointer + offset,
384                                           0,
385                                           NULL, NULL));
386 }
387
388 void OpenCLDeviceBase::mem_zero_kernel(device_ptr mem, size_t size)
389 {
390         cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer"));
391
392         size_t global_size[] = {1024, 1024};
393         size_t num_threads = global_size[0] * global_size[1];
394
395         cl_mem d_buffer = CL_MEM_PTR(mem);
396         cl_ulong d_offset = 0;
397         cl_ulong d_size = 0;
398
399         while(d_offset < size) {
400                 d_size = std::min<cl_ulong>(num_threads*sizeof(float4), size - d_offset);
401
402                 kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset);
403
404                 ciErr = clEnqueueNDRangeKernel(cqCommandQueue,
405                                                ckZeroBuffer,
406                                                2,
407                                                NULL,
408                                                global_size,
409                                                NULL,
410                                                0,
411                                                NULL,
412                                                NULL);
413                 opencl_assert_err(ciErr, "clEnqueueNDRangeKernel");
414
415                 d_offset += d_size;
416         }
417 }
418
419 void OpenCLDeviceBase::mem_zero(device_memory& mem)
420 {
421         if(!mem.device_pointer) {
422                 mem_alloc(mem);
423         }
424
425         if(mem.device_pointer) {
426                 if(base_program.is_loaded()) {
427                         mem_zero_kernel(mem.device_pointer, mem.memory_size());
428                 }
429
430                 if(mem.host_pointer) {
431                         memset(mem.host_pointer, 0, mem.memory_size());
432                 }
433
434                 if(!base_program.is_loaded()) {
435                         void* zero = mem.host_pointer;
436
437                         if(!mem.host_pointer) {
438                                 zero = util_aligned_malloc(mem.memory_size(), 16);
439                                 memset(zero, 0, mem.memory_size());
440                         }
441
442                         opencl_assert(clEnqueueWriteBuffer(cqCommandQueue,
443                                                            CL_MEM_PTR(mem.device_pointer),
444                                                            CL_TRUE,
445                                                            0,
446                                                            mem.memory_size(),
447                                                            zero,
448                                                            0,
449                                                            NULL, NULL));
450
451                         if(!mem.host_pointer) {
452                                 util_aligned_free(zero);
453                         }
454                 }
455         }
456 }
457
458 void OpenCLDeviceBase::mem_free(device_memory& mem)
459 {
460         if(mem.type == MEM_TEXTURE) {
461                 tex_free(mem);
462         }
463         else {
464                 if(mem.device_pointer) {
465                         if(mem.device_pointer != null_mem) {
466                                 opencl_assert(clReleaseMemObject(CL_MEM_PTR(mem.device_pointer)));
467                         }
468                         mem.device_pointer = 0;
469
470                         stats.mem_free(mem.device_size);
471                         mem.device_size = 0;
472                 }
473         }
474 }
475
476 int OpenCLDeviceBase::mem_address_alignment()
477 {
478         return OpenCLInfo::mem_address_alignment(cdDevice);
479 }
480
481 device_ptr OpenCLDeviceBase::mem_alloc_sub_ptr(device_memory& mem, int offset, int size)
482 {
483         cl_mem_flags mem_flag;
484         if(mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE)
485                 mem_flag = CL_MEM_READ_ONLY;
486         else
487                 mem_flag = CL_MEM_READ_WRITE;
488
489         cl_buffer_region info;
490         info.origin = mem.memory_elements_size(offset);
491         info.size = mem.memory_elements_size(size);
492
493         device_ptr sub_buf = (device_ptr) clCreateSubBuffer(CL_MEM_PTR(mem.device_pointer),
494                                                             mem_flag,
495                                                             CL_BUFFER_CREATE_TYPE_REGION,
496                                                             &info,
497                                                             &ciErr);
498         opencl_assert_err(ciErr, "clCreateSubBuffer");
499         return sub_buf;
500 }
501
502 void OpenCLDeviceBase::mem_free_sub_ptr(device_ptr device_pointer)
503 {
504         if(device_pointer && device_pointer != null_mem) {
505                 opencl_assert(clReleaseMemObject(CL_MEM_PTR(device_pointer)));
506         }
507 }
508
509 void OpenCLDeviceBase::const_copy_to(const char *name, void *host, size_t size)
510 {
511         ConstMemMap::iterator i = const_mem_map.find(name);
512         device_vector<uchar> *data;
513
514         if(i == const_mem_map.end()) {
515                 data = new device_vector<uchar>(this, name, MEM_READ_ONLY);
516                 data->alloc(size);
517                 const_mem_map.insert(ConstMemMap::value_type(name, data));
518         }
519         else {
520                 data = i->second;
521         }
522
523         memcpy(data->data(), host, size);
524         data->copy_to_device();
525 }
526
527 void OpenCLDeviceBase::tex_alloc(device_memory& mem)
528 {
529         VLOG(1) << "Texture allocate: " << mem.name << ", "
530                 << string_human_readable_number(mem.memory_size()) << " bytes. ("
531                 << string_human_readable_size(mem.memory_size()) << ")";
532
533         memory_manager.alloc(mem.name, mem);
534         /* Set the pointer to non-null to keep code that inspects its value from thinking its unallocated. */
535         mem.device_pointer = 1;
536         textures[mem.name] = &mem;
537         textures_need_update = true;
538 }
539
540 void OpenCLDeviceBase::tex_free(device_memory& mem)
541 {
542         if(mem.device_pointer) {
543                 mem.device_pointer = 0;
544
545                 if(memory_manager.free(mem)) {
546                         textures_need_update = true;
547                 }
548
549                 foreach(TexturesMap::value_type& value, textures) {
550                         if(value.second == &mem) {
551                                 textures.erase(value.first);
552                                 break;
553                         }
554                 }
555         }
556 }
557
558 size_t OpenCLDeviceBase::global_size_round_up(int group_size, int global_size)
559 {
560         int r = global_size % group_size;
561         return global_size + ((r == 0)? 0: group_size - r);
562 }
563
564 void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, bool x_workgroups, size_t max_workgroup_size)
565 {
566         size_t workgroup_size, max_work_items[3];
567
568         clGetKernelWorkGroupInfo(kernel, cdDevice,
569                 CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL);
570         clGetDeviceInfo(cdDevice,
571                 CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, max_work_items, NULL);
572
573         if(max_workgroup_size > 0 && workgroup_size > max_workgroup_size) {
574                 workgroup_size = max_workgroup_size;
575         }
576
577         /* Try to divide evenly over 2 dimensions. */
578         size_t local_size[2];
579         if(x_workgroups) {
580                 local_size[0] = workgroup_size;
581                 local_size[1] = 1;
582         }
583         else {
584                 size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1);
585                 local_size[0] = local_size[1] = sqrt_workgroup_size;
586         }
587
588         /* Some implementations have max size 1 on 2nd dimension. */
589         if(local_size[1] > max_work_items[1]) {
590                 local_size[0] = workgroup_size/max_work_items[1];
591                 local_size[1] = max_work_items[1];
592         }
593
594         size_t global_size[2] = {global_size_round_up(local_size[0], w),
595                                  global_size_round_up(local_size[1], h)};
596
597         /* Vertical size of 1 is coming from bake/shade kernels where we should
598          * not round anything up because otherwise we'll either be doing too
599          * much work per pixel (if we don't check global ID on Y axis) or will
600          * be checking for global ID to always have Y of 0.
601          */
602         if(h == 1) {
603                 global_size[h] = 1;
604         }
605
606         /* run kernel */
607         opencl_assert(clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL));
608         opencl_assert(clFlush(cqCommandQueue));
609 }
610
611 void OpenCLDeviceBase::set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name)
612 {
613         cl_mem ptr;
614
615         MemMap::iterator i = mem_map.find(name);
616         if(i != mem_map.end()) {
617                 ptr = CL_MEM_PTR(i->second);
618         }
619         else {
620                 /* work around NULL not working, even though the spec says otherwise */
621                 ptr = CL_MEM_PTR(null_mem);
622         }
623
624         opencl_assert(clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void*)&ptr));
625 }
626
627 void OpenCLDeviceBase::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg)
628 {
629         flush_texture_buffers();
630
631         memory_manager.set_kernel_arg_buffers(kernel, narg);
632 }
633
634 void OpenCLDeviceBase::flush_texture_buffers()
635 {
636         if(!textures_need_update) {
637                 return;
638         }
639         textures_need_update = false;
640
641         /* Setup slots for textures. */
642         int num_slots = 0;
643
644         vector<texture_slot_t> texture_slots;
645
646 #define KERNEL_TEX(type, name) \
647         if(textures.find(#name) != textures.end()) { \
648                 texture_slots.push_back(texture_slot_t(#name, num_slots)); \
649         } \
650         num_slots++;
651 #include "kernel/kernel_textures.h"
652
653         int num_data_slots = num_slots;
654
655         foreach(TexturesMap::value_type& tex, textures) {
656                 string name = tex.first;
657
658                 if(string_startswith(name, "__tex_image")) {
659                         int pos = name.rfind("_");
660                         int id = atoi(name.data() + pos + 1);
661                         texture_slots.push_back(texture_slot_t(name,
662                                                                    num_data_slots + id));
663                         num_slots = max(num_slots, num_data_slots + id + 1);
664                 }
665         }
666
667         /* Realloc texture descriptors buffer. */
668         memory_manager.free(texture_info);
669         texture_info.resize(num_slots);
670         memory_manager.alloc("texture_info", texture_info);
671
672         /* Fill in descriptors */
673         foreach(texture_slot_t& slot, texture_slots) {
674                 TextureInfo& info = texture_info[slot.slot];
675
676                 MemoryManager::BufferDescriptor desc = memory_manager.get_descriptor(slot.name);
677                 info.data = desc.offset;
678                 info.cl_buffer = desc.device_buffer;
679
680                 if(string_startswith(slot.name, "__tex_image")) {
681                         device_memory *mem = textures[slot.name];
682
683                         info.width = mem->data_width;
684                         info.height = mem->data_height;
685                         info.depth = mem->data_depth;
686
687                         info.interpolation = mem->interpolation;
688                         info.extension = mem->extension;
689                 }
690         }
691
692         /* Force write of descriptors. */
693         memory_manager.free(texture_info);
694         memory_manager.alloc("texture_info", texture_info);
695 }
696
697 void OpenCLDeviceBase::film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half)
698 {
699         /* cast arguments to cl types */
700         cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
701         cl_mem d_rgba = (rgba_byte)? CL_MEM_PTR(rgba_byte): CL_MEM_PTR(rgba_half);
702         cl_mem d_buffer = CL_MEM_PTR(buffer);
703         cl_int d_x = task.x;
704         cl_int d_y = task.y;
705         cl_int d_w = task.w;
706         cl_int d_h = task.h;
707         cl_float d_sample_scale = 1.0f/(task.sample + 1);
708         cl_int d_offset = task.offset;
709         cl_int d_stride = task.stride;
710
711
712         cl_kernel ckFilmConvertKernel = (rgba_byte)? base_program(ustring("convert_to_byte")): base_program(ustring("convert_to_half_float"));
713
714         cl_uint start_arg_index =
715                 kernel_set_args(ckFilmConvertKernel,
716                                 0,
717                                 d_data,
718                                 d_rgba,
719                                 d_buffer);
720
721         set_kernel_arg_buffers(ckFilmConvertKernel, &start_arg_index);
722
723         start_arg_index += kernel_set_args(ckFilmConvertKernel,
724                                            start_arg_index,
725                                            d_sample_scale,
726                                            d_x,
727                                            d_y,
728                                            d_w,
729                                            d_h,
730                                            d_offset,
731                                            d_stride);
732
733         enqueue_kernel(ckFilmConvertKernel, d_w, d_h);
734 }
735
736 bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr,
737                                                  device_ptr guide_ptr,
738                                                  device_ptr variance_ptr,
739                                                  device_ptr out_ptr,
740                                                  DenoisingTask *task)
741 {
742
743         int stride = task->buffer.stride;
744         int w = task->buffer.width;
745         int h = task->buffer.h;
746         int r = task->nlm_state.r;
747         int f = task->nlm_state.f;
748         float a = task->nlm_state.a;
749         float k_2 = task->nlm_state.k_2;
750
751         int shift_stride = stride*h;
752         int num_shifts = (2*r+1)*(2*r+1);
753         int mem_size = sizeof(float)*shift_stride*num_shifts;
754
755         cl_mem weightAccum = CL_MEM_PTR(task->nlm_state.temporary_3_ptr);
756
757         cl_mem difference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr);
758         opencl_assert_err(ciErr, "clCreateBuffer denoising_non_local_means");
759         cl_mem blurDifference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr);
760         opencl_assert_err(ciErr, "clCreateBuffer denoising_non_local_means");
761
762         cl_mem image_mem = CL_MEM_PTR(image_ptr);
763         cl_mem guide_mem = CL_MEM_PTR(guide_ptr);
764         cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
765         cl_mem out_mem = CL_MEM_PTR(out_ptr);
766
767         mem_zero_kernel(task->nlm_state.temporary_3_ptr, sizeof(float)*w*h);
768         mem_zero_kernel(out_ptr, sizeof(float)*w*h);
769
770         cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference"));
771         cl_kernel ckNLMBlur           = denoising_program(ustring("filter_nlm_blur"));
772         cl_kernel ckNLMCalcWeight     = denoising_program(ustring("filter_nlm_calc_weight"));
773         cl_kernel ckNLMUpdateOutput   = denoising_program(ustring("filter_nlm_update_output"));
774         cl_kernel ckNLMNormalize      = denoising_program(ustring("filter_nlm_normalize"));
775
776         kernel_set_args(ckNLMCalcDifference, 0,
777                         guide_mem,
778                         variance_mem,
779                         difference,
780                         w, h, stride,
781                         shift_stride,
782                         r, 0, a, k_2);
783         kernel_set_args(ckNLMBlur, 0,
784                         difference,
785                         blurDifference,
786                         w, h, stride,
787                         shift_stride,
788                         r, f);
789         kernel_set_args(ckNLMCalcWeight, 0,
790                         blurDifference,
791                         difference,
792                         w, h, stride,
793                         shift_stride,
794                         r, f);
795         kernel_set_args(ckNLMUpdateOutput, 0,
796                         blurDifference,
797                         image_mem,
798                         out_mem,
799                         weightAccum,
800                         w, h, stride,
801                         shift_stride,
802                         r, f);
803
804         enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true);
805         enqueue_kernel(ckNLMBlur,           w*h, num_shifts, true);
806         enqueue_kernel(ckNLMCalcWeight,     w*h, num_shifts, true);
807         enqueue_kernel(ckNLMBlur,           w*h, num_shifts, true);
808         enqueue_kernel(ckNLMUpdateOutput,   w*h, num_shifts, true);
809
810         opencl_assert(clReleaseMemObject(difference));
811         opencl_assert(clReleaseMemObject(blurDifference));
812
813         kernel_set_args(ckNLMNormalize, 0,
814                         out_mem, weightAccum, w, h, stride);
815         enqueue_kernel(ckNLMNormalize, w, h);
816
817         return true;
818 }
819
820 bool OpenCLDeviceBase::denoising_construct_transform(DenoisingTask *task)
821 {
822         cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
823         cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
824         cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
825
826         cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform"));
827
828         kernel_set_args(ckFilterConstructTransform, 0,
829                         buffer_mem,
830                         transform_mem,
831                         rank_mem,
832                         task->filter_area,
833                         task->rect,
834                         task->buffer.pass_stride,
835                         task->radius,
836                         task->pca_threshold);
837
838         enqueue_kernel(ckFilterConstructTransform,
839                        task->storage.w,
840                        task->storage.h,
841                        256);
842
843         return true;
844 }
845
846 bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
847                                              device_ptr color_variance_ptr,
848                                              device_ptr output_ptr,
849                                              DenoisingTask *task)
850 {
851         mem_zero(task->storage.XtWX);
852         mem_zero(task->storage.XtWY);
853
854         cl_mem color_mem = CL_MEM_PTR(color_ptr);
855         cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr);
856         cl_mem output_mem = CL_MEM_PTR(output_ptr);
857
858         cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
859         cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
860         cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
861         cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer);
862         cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer);
863
864         cl_kernel ckNLMCalcDifference   = denoising_program(ustring("filter_nlm_calc_difference"));
865         cl_kernel ckNLMBlur             = denoising_program(ustring("filter_nlm_blur"));
866         cl_kernel ckNLMCalcWeight       = denoising_program(ustring("filter_nlm_calc_weight"));
867         cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian"));
868         cl_kernel ckFinalize            = denoising_program(ustring("filter_finalize"));
869
870         int w = task->reconstruction_state.source_w;
871         int h = task->reconstruction_state.source_h;
872         int stride = task->buffer.stride;
873
874         int shift_stride = stride*h;
875         int num_shifts = (2*task->radius + 1)*(2*task->radius + 1);
876         int mem_size = sizeof(float)*shift_stride*num_shifts;
877
878         cl_mem difference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr);
879         opencl_assert_err(ciErr, "clCreateBuffer denoising_reconstruct");
880         cl_mem blurDifference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr);
881         opencl_assert_err(ciErr, "clCreateBuffer denoising_reconstruct");
882
883         kernel_set_args(ckNLMCalcDifference, 0,
884                         color_mem,
885                         color_variance_mem,
886                         difference,
887                         w, h, stride,
888                         shift_stride,
889                         task->radius,
890                         task->buffer.pass_stride,
891                         1.0f, task->nlm_k_2);
892         kernel_set_args(ckNLMBlur, 0,
893                         difference,
894                         blurDifference,
895                         w, h, stride,
896                         shift_stride,
897                         task->radius, 4);
898         kernel_set_args(ckNLMCalcWeight, 0,
899                         blurDifference,
900                         difference,
901                         w, h, stride,
902                         shift_stride,
903                         task->radius, 4);
904         kernel_set_args(ckNLMConstructGramian, 0,
905                         blurDifference,
906                         buffer_mem,
907                         transform_mem,
908                         rank_mem,
909                         XtWX_mem,
910                         XtWY_mem,
911                         task->reconstruction_state.filter_window,
912                         w, h, stride,
913                         shift_stride,
914                         task->radius, 4,
915                         task->buffer.pass_stride);
916
917         enqueue_kernel(ckNLMCalcDifference,   w*h, num_shifts, true);
918         enqueue_kernel(ckNLMBlur,             w*h, num_shifts, true);
919         enqueue_kernel(ckNLMCalcWeight,       w*h, num_shifts, true);
920         enqueue_kernel(ckNLMBlur,             w*h, num_shifts, true);
921         enqueue_kernel(ckNLMConstructGramian, w*h, num_shifts, true, 256);
922
923         opencl_assert(clReleaseMemObject(difference));
924         opencl_assert(clReleaseMemObject(blurDifference));
925
926         kernel_set_args(ckFinalize, 0,
927                         output_mem,
928                         rank_mem,
929                         XtWX_mem,
930                         XtWY_mem,
931                         task->filter_area,
932                         task->reconstruction_state.buffer_params,
933                         task->render_buffer.samples);
934         enqueue_kernel(ckFinalize, w, h);
935
936         return true;
937 }
938
939 bool OpenCLDeviceBase::denoising_combine_halves(device_ptr a_ptr,
940                                                 device_ptr b_ptr,
941                                                 device_ptr mean_ptr,
942                                                 device_ptr variance_ptr,
943                                                 int r, int4 rect,
944                                                 DenoisingTask *task)
945 {
946         cl_mem a_mem = CL_MEM_PTR(a_ptr);
947         cl_mem b_mem = CL_MEM_PTR(b_ptr);
948         cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
949         cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
950
951         cl_kernel ckFilterCombineHalves = denoising_program(ustring("filter_combine_halves"));
952
953         kernel_set_args(ckFilterCombineHalves, 0,
954                         mean_mem,
955                         variance_mem,
956                         a_mem,
957                         b_mem,
958                         rect,
959                         r);
960         enqueue_kernel(ckFilterCombineHalves,
961                        task->rect.z-task->rect.x,
962                        task->rect.w-task->rect.y);
963
964         return true;
965 }
966
967 bool OpenCLDeviceBase::denoising_divide_shadow(device_ptr a_ptr,
968                                                device_ptr b_ptr,
969                                                device_ptr sample_variance_ptr,
970                                                device_ptr sv_variance_ptr,
971                                                device_ptr buffer_variance_ptr,
972                                                DenoisingTask *task)
973 {
974         cl_mem a_mem = CL_MEM_PTR(a_ptr);
975         cl_mem b_mem = CL_MEM_PTR(b_ptr);
976         cl_mem sample_variance_mem = CL_MEM_PTR(sample_variance_ptr);
977         cl_mem sv_variance_mem = CL_MEM_PTR(sv_variance_ptr);
978         cl_mem buffer_variance_mem = CL_MEM_PTR(buffer_variance_ptr);
979
980         cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer);
981
982         cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow"));
983
984         kernel_set_args(ckFilterDivideShadow, 0,
985                         task->render_buffer.samples,
986                         tiles_mem,
987                         a_mem,
988                         b_mem,
989                         sample_variance_mem,
990                         sv_variance_mem,
991                         buffer_variance_mem,
992                         task->rect,
993                         task->render_buffer.pass_stride,
994                         task->render_buffer.denoising_data_offset);
995         enqueue_kernel(ckFilterDivideShadow,
996                        task->rect.z-task->rect.x,
997                        task->rect.w-task->rect.y);
998
999         return true;
1000 }
1001
1002 bool OpenCLDeviceBase::denoising_get_feature(int mean_offset,
1003                                              int variance_offset,
1004                                              device_ptr mean_ptr,
1005                                              device_ptr variance_ptr,
1006                                              DenoisingTask *task)
1007 {
1008         cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
1009         cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
1010
1011         cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer);
1012
1013         cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature"));
1014
1015         kernel_set_args(ckFilterGetFeature, 0,
1016                         task->render_buffer.samples,
1017                         tiles_mem,
1018                         mean_offset,
1019                         variance_offset,
1020                         mean_mem,
1021                         variance_mem,
1022                         task->rect,
1023                         task->render_buffer.pass_stride,
1024                         task->render_buffer.denoising_data_offset);
1025         enqueue_kernel(ckFilterGetFeature,
1026                        task->rect.z-task->rect.x,
1027                        task->rect.w-task->rect.y);
1028
1029         return true;
1030 }
1031
1032 bool OpenCLDeviceBase::denoising_detect_outliers(device_ptr image_ptr,
1033                                                  device_ptr variance_ptr,
1034                                                  device_ptr depth_ptr,
1035                                                  device_ptr output_ptr,
1036                                                  DenoisingTask *task)
1037 {
1038         cl_mem image_mem = CL_MEM_PTR(image_ptr);
1039         cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
1040         cl_mem depth_mem = CL_MEM_PTR(depth_ptr);
1041         cl_mem output_mem = CL_MEM_PTR(output_ptr);
1042
1043         cl_kernel ckFilterDetectOutliers = denoising_program(ustring("filter_detect_outliers"));
1044
1045         kernel_set_args(ckFilterDetectOutliers, 0,
1046                         image_mem,
1047                         variance_mem,
1048                         depth_mem,
1049                         output_mem,
1050                         task->rect,
1051                         task->buffer.pass_stride);
1052         enqueue_kernel(ckFilterDetectOutliers,
1053                        task->rect.z-task->rect.x,
1054                        task->rect.w-task->rect.y);
1055
1056         return true;
1057 }
1058
1059 bool OpenCLDeviceBase::denoising_set_tiles(device_ptr *buffers,
1060                                            DenoisingTask *task)
1061 {
1062         task->tiles_mem.copy_to_device();
1063
1064         cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer);
1065
1066         cl_kernel ckFilterSetTiles = denoising_program(ustring("filter_set_tiles"));
1067
1068         kernel_set_args(ckFilterSetTiles, 0, tiles_mem);
1069         for(int i = 0; i < 9; i++) {
1070                 cl_mem buffer_mem = CL_MEM_PTR(buffers[i]);
1071                 kernel_set_args(ckFilterSetTiles, i+1, buffer_mem);
1072         }
1073
1074         enqueue_kernel(ckFilterSetTiles, 1, 1);
1075
1076         return true;
1077 }
1078
1079 void OpenCLDeviceBase::denoise(RenderTile &rtile, DenoisingTask& denoising, const DeviceTask &task)
1080 {
1081         denoising.functions.set_tiles = function_bind(&OpenCLDeviceBase::denoising_set_tiles, this, _1, &denoising);
1082         denoising.functions.construct_transform = function_bind(&OpenCLDeviceBase::denoising_construct_transform, this, &denoising);
1083         denoising.functions.reconstruct = function_bind(&OpenCLDeviceBase::denoising_reconstruct, this, _1, _2, _3, &denoising);
1084         denoising.functions.divide_shadow = function_bind(&OpenCLDeviceBase::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
1085         denoising.functions.non_local_means = function_bind(&OpenCLDeviceBase::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
1086         denoising.functions.combine_halves = function_bind(&OpenCLDeviceBase::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
1087         denoising.functions.get_feature = function_bind(&OpenCLDeviceBase::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
1088         denoising.functions.detect_outliers = function_bind(&OpenCLDeviceBase::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
1089
1090         denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
1091         denoising.render_buffer.samples = rtile.sample;
1092
1093         RenderTile rtiles[9];
1094         rtiles[4] = rtile;
1095         task.map_neighbor_tiles(rtiles, this);
1096         denoising.tiles_from_rendertiles(rtiles);
1097
1098         denoising.init_from_devicetask(task);
1099
1100         denoising.run_denoising();
1101
1102         task.unmap_neighbor_tiles(rtiles, this);
1103 }
1104
1105 void OpenCLDeviceBase::shader(DeviceTask& task)
1106 {
1107         /* cast arguments to cl types */
1108         cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
1109         cl_mem d_input = CL_MEM_PTR(task.shader_input);
1110         cl_mem d_output = CL_MEM_PTR(task.shader_output);
1111         cl_int d_shader_eval_type = task.shader_eval_type;
1112         cl_int d_shader_filter = task.shader_filter;
1113         cl_int d_shader_x = task.shader_x;
1114         cl_int d_shader_w = task.shader_w;
1115         cl_int d_offset = task.offset;
1116
1117         cl_kernel kernel;
1118
1119         if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
1120                 kernel = base_program(ustring("bake"));
1121         }
1122         else if(task.shader_eval_type == SHADER_EVAL_DISPLACE) {
1123                 kernel = base_program(ustring("displace"));
1124         }
1125         else {
1126                 kernel = base_program(ustring("background"));
1127         }
1128
1129         cl_uint start_arg_index =
1130                 kernel_set_args(kernel,
1131                                 0,
1132                                 d_data,
1133                                 d_input,
1134                                 d_output);
1135
1136         set_kernel_arg_buffers(kernel, &start_arg_index);
1137
1138         start_arg_index += kernel_set_args(kernel,
1139                                            start_arg_index,
1140                                            d_shader_eval_type);
1141         if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
1142                 start_arg_index += kernel_set_args(kernel,
1143                                                    start_arg_index,
1144                                                    d_shader_filter);
1145         }
1146         start_arg_index += kernel_set_args(kernel,
1147                                            start_arg_index,
1148                                            d_shader_x,
1149                                            d_shader_w,
1150                                            d_offset);
1151
1152         for(int sample = 0; sample < task.num_samples; sample++) {
1153
1154                 if(task.get_cancel())
1155                         break;
1156
1157                 kernel_set_args(kernel, start_arg_index, sample);
1158
1159                 enqueue_kernel(kernel, task.shader_w, 1);
1160
1161                 clFinish(cqCommandQueue);
1162
1163                 task.update_progress(NULL);
1164         }
1165 }
1166
1167 string OpenCLDeviceBase::kernel_build_options(const string *debug_src)
1168 {
1169         string build_options = "-cl-no-signed-zeros -cl-mad-enable ";
1170
1171         if(platform_name == "NVIDIA CUDA") {
1172                 build_options += "-D__KERNEL_OPENCL_NVIDIA__ "
1173                                  "-cl-nv-maxrregcount=32 "
1174                                  "-cl-nv-verbose ";
1175
1176                 uint compute_capability_major, compute_capability_minor;
1177                 clGetDeviceInfo(cdDevice, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,
1178                                 sizeof(cl_uint), &compute_capability_major, NULL);
1179                 clGetDeviceInfo(cdDevice, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,
1180                                 sizeof(cl_uint), &compute_capability_minor, NULL);
1181
1182                 build_options += string_printf("-D__COMPUTE_CAPABILITY__=%u ",
1183                                                compute_capability_major * 100 +
1184                                                compute_capability_minor * 10);
1185         }
1186
1187         else if(platform_name == "Apple")
1188                 build_options += "-D__KERNEL_OPENCL_APPLE__ ";
1189
1190         else if(platform_name == "AMD Accelerated Parallel Processing")
1191                 build_options += "-D__KERNEL_OPENCL_AMD__ ";
1192
1193         else if(platform_name == "Intel(R) OpenCL") {
1194                 build_options += "-D__KERNEL_OPENCL_INTEL_CPU__ ";
1195
1196                 /* Options for gdb source level kernel debugging.
1197                  * this segfaults on linux currently.
1198                  */
1199                 if(OpenCLInfo::use_debug() && debug_src)
1200                         build_options += "-g -s \"" + *debug_src + "\" ";
1201         }
1202
1203         if(OpenCLInfo::use_debug())
1204                 build_options += "-D__KERNEL_OPENCL_DEBUG__ ";
1205
1206 #ifdef WITH_CYCLES_DEBUG
1207         build_options += "-D__KERNEL_DEBUG__ ";
1208 #endif
1209
1210         return build_options;
1211 }
1212
1213 /* TODO(sergey): In the future we can use variadic templates, once
1214  * C++0x is allowed. Should allow to clean this up a bit.
1215  */
1216 int OpenCLDeviceBase::kernel_set_args(cl_kernel kernel,
1217                     int start_argument_index,
1218                     const ArgumentWrapper& arg1,
1219                     const ArgumentWrapper& arg2,
1220                     const ArgumentWrapper& arg3,
1221                     const ArgumentWrapper& arg4,
1222                     const ArgumentWrapper& arg5,
1223                     const ArgumentWrapper& arg6,
1224                     const ArgumentWrapper& arg7,
1225                     const ArgumentWrapper& arg8,
1226                     const ArgumentWrapper& arg9,
1227                     const ArgumentWrapper& arg10,
1228                     const ArgumentWrapper& arg11,
1229                     const ArgumentWrapper& arg12,
1230                     const ArgumentWrapper& arg13,
1231                     const ArgumentWrapper& arg14,
1232                     const ArgumentWrapper& arg15,
1233                     const ArgumentWrapper& arg16,
1234                     const ArgumentWrapper& arg17,
1235                     const ArgumentWrapper& arg18,
1236                     const ArgumentWrapper& arg19,
1237                     const ArgumentWrapper& arg20,
1238                     const ArgumentWrapper& arg21,
1239                     const ArgumentWrapper& arg22,
1240                     const ArgumentWrapper& arg23,
1241                     const ArgumentWrapper& arg24,
1242                     const ArgumentWrapper& arg25,
1243                     const ArgumentWrapper& arg26,
1244                     const ArgumentWrapper& arg27,
1245                     const ArgumentWrapper& arg28,
1246                     const ArgumentWrapper& arg29,
1247                     const ArgumentWrapper& arg30,
1248                     const ArgumentWrapper& arg31,
1249                     const ArgumentWrapper& arg32,
1250                     const ArgumentWrapper& arg33)
1251 {
1252         int current_arg_index = 0;
1253 #define FAKE_VARARG_HANDLE_ARG(arg) \
1254         do { \
1255                 if(arg.pointer != NULL) { \
1256                         opencl_assert(clSetKernelArg( \
1257                                 kernel, \
1258                                 start_argument_index + current_arg_index, \
1259                                 arg.size, arg.pointer)); \
1260                         ++current_arg_index; \
1261                 } \
1262                 else { \
1263                         return current_arg_index; \
1264                 } \
1265         } while(false)
1266         FAKE_VARARG_HANDLE_ARG(arg1);
1267         FAKE_VARARG_HANDLE_ARG(arg2);
1268         FAKE_VARARG_HANDLE_ARG(arg3);
1269         FAKE_VARARG_HANDLE_ARG(arg4);
1270         FAKE_VARARG_HANDLE_ARG(arg5);
1271         FAKE_VARARG_HANDLE_ARG(arg6);
1272         FAKE_VARARG_HANDLE_ARG(arg7);
1273         FAKE_VARARG_HANDLE_ARG(arg8);
1274         FAKE_VARARG_HANDLE_ARG(arg9);
1275         FAKE_VARARG_HANDLE_ARG(arg10);
1276         FAKE_VARARG_HANDLE_ARG(arg11);
1277         FAKE_VARARG_HANDLE_ARG(arg12);
1278         FAKE_VARARG_HANDLE_ARG(arg13);
1279         FAKE_VARARG_HANDLE_ARG(arg14);
1280         FAKE_VARARG_HANDLE_ARG(arg15);
1281         FAKE_VARARG_HANDLE_ARG(arg16);
1282         FAKE_VARARG_HANDLE_ARG(arg17);
1283         FAKE_VARARG_HANDLE_ARG(arg18);
1284         FAKE_VARARG_HANDLE_ARG(arg19);
1285         FAKE_VARARG_HANDLE_ARG(arg20);
1286         FAKE_VARARG_HANDLE_ARG(arg21);
1287         FAKE_VARARG_HANDLE_ARG(arg22);
1288         FAKE_VARARG_HANDLE_ARG(arg23);
1289         FAKE_VARARG_HANDLE_ARG(arg24);
1290         FAKE_VARARG_HANDLE_ARG(arg25);
1291         FAKE_VARARG_HANDLE_ARG(arg26);
1292         FAKE_VARARG_HANDLE_ARG(arg27);
1293         FAKE_VARARG_HANDLE_ARG(arg28);
1294         FAKE_VARARG_HANDLE_ARG(arg29);
1295         FAKE_VARARG_HANDLE_ARG(arg30);
1296         FAKE_VARARG_HANDLE_ARG(arg31);
1297         FAKE_VARARG_HANDLE_ARG(arg32);
1298         FAKE_VARARG_HANDLE_ARG(arg33);
1299 #undef FAKE_VARARG_HANDLE_ARG
1300         return current_arg_index;
1301 }
1302
1303 void OpenCLDeviceBase::release_kernel_safe(cl_kernel kernel)
1304 {
1305         if(kernel) {
1306                 clReleaseKernel(kernel);
1307         }
1308 }
1309
1310 void OpenCLDeviceBase::release_mem_object_safe(cl_mem mem)
1311 {
1312         if(mem != NULL) {
1313                 clReleaseMemObject(mem);
1314         }
1315 }
1316
1317 void OpenCLDeviceBase::release_program_safe(cl_program program)
1318 {
1319         if(program) {
1320                 clReleaseProgram(program);
1321         }
1322 }
1323
1324 /* ** Those guys are for workign around some compiler-specific bugs ** */
1325
1326 cl_program OpenCLDeviceBase::load_cached_kernel(
1327         ustring key,
1328         thread_scoped_lock& cache_locker)
1329 {
1330         return OpenCLCache::get_program(cpPlatform,
1331                                         cdDevice,
1332                                         key,
1333                                         cache_locker);
1334 }
1335
1336 void OpenCLDeviceBase::store_cached_kernel(
1337         cl_program program,
1338         ustring key,
1339         thread_scoped_lock& cache_locker)
1340 {
1341         OpenCLCache::store_program(cpPlatform,
1342                                    cdDevice,
1343                                    program,
1344                                    key,
1345                                    cache_locker);
1346 }
1347
1348 string OpenCLDeviceBase::build_options_for_base_program(
1349         const DeviceRequestedFeatures& requested_features)
1350 {
1351         /* TODO(sergey): By default we compile all features, meaning
1352          * mega kernel is not getting feature-based optimizations.
1353          *
1354          * Ideally we need always compile kernel with as less features
1355          * enabled as possible to keep performance at it's max.
1356          */
1357
1358         /* For now disable baking when not in use as this has major
1359          * impact on kernel build times.
1360          */
1361         if(!requested_features.use_baking) {
1362                 return "-D__NO_BAKING__";
1363         }
1364
1365         return "";
1366 }
1367
1368 CCL_NAMESPACE_END
1369
1370 #endif