89ab1a43e68e72ed80f4e7161abe0931573d91af
[blender-staging.git] / intern / cycles / device / opencl / opencl_base.cpp
1 /*
2  * Copyright 2011-2013 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #ifdef WITH_OPENCL
18
19 #include "device/opencl/opencl.h"
20
21 #include "kernel/kernel_types.h"
22
23 #include "util/util_algorithm.h"
24 #include "util/util_foreach.h"
25 #include "util/util_logging.h"
26 #include "util/util_md5.h"
27 #include "util/util_path.h"
28 #include "util/util_time.h"
29
30 CCL_NAMESPACE_BEGIN
31
32 struct texture_slot_t {
33         texture_slot_t(const string& name, int slot)
34                 : name(name),
35                   slot(slot) {
36         }
37         string name;
38         int slot;
39 };
40
41 bool OpenCLDeviceBase::opencl_error(cl_int err)
42 {
43         if(err != CL_SUCCESS) {
44                 string message = string_printf("OpenCL error (%d): %s", err, clewErrorString(err));
45                 if(error_msg == "")
46                         error_msg = message;
47                 fprintf(stderr, "%s\n", message.c_str());
48                 return true;
49         }
50
51         return false;
52 }
53
54 void OpenCLDeviceBase::opencl_error(const string& message)
55 {
56         if(error_msg == "")
57                 error_msg = message;
58         fprintf(stderr, "%s\n", message.c_str());
59 }
60
61 void OpenCLDeviceBase::opencl_assert_err(cl_int err, const char* where)
62 {
63         if(err != CL_SUCCESS) {
64                 string message = string_printf("OpenCL error (%d): %s in %s", err, clewErrorString(err), where);
65                 if(error_msg == "")
66                         error_msg = message;
67                 fprintf(stderr, "%s\n", message.c_str());
68 #ifndef NDEBUG
69                 abort();
70 #endif
71         }
72 }
73
74 OpenCLDeviceBase::OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool background_)
75 : Device(info, stats, background_),
76   memory_manager(this),
77   texture_info_buffer(this, "__texture_info", MEM_READ_ONLY)
78 {
79         cpPlatform = NULL;
80         cdDevice = NULL;
81         cxContext = NULL;
82         cqCommandQueue = NULL;
83         null_mem = 0;
84         device_initialized = false;
85         textures_need_update = true;
86
87         vector<OpenCLPlatformDevice> usable_devices;
88         OpenCLInfo::get_usable_devices(&usable_devices);
89         if(usable_devices.size() == 0) {
90                 opencl_error("OpenCL: no devices found.");
91                 return;
92         }
93         assert(info.num < usable_devices.size());
94         OpenCLPlatformDevice& platform_device = usable_devices[info.num];
95         cpPlatform = platform_device.platform_id;
96         cdDevice = platform_device.device_id;
97         platform_name = platform_device.platform_name;
98         device_name = platform_device.device_name;
99         VLOG(2) << "Creating new Cycles device for OpenCL platform "
100                 << platform_name << ", device "
101                 << device_name << ".";
102
103         {
104                 /* try to use cached context */
105                 thread_scoped_lock cache_locker;
106                 cxContext = OpenCLCache::get_context(cpPlatform, cdDevice, cache_locker);
107
108                 if(cxContext == NULL) {
109                         /* create context properties array to specify platform */
110                         const cl_context_properties context_props[] = {
111                                 CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform,
112                                 0, 0
113                         };
114
115                         /* create context */
116                         cxContext = clCreateContext(context_props, 1, &cdDevice,
117                                 context_notify_callback, cdDevice, &ciErr);
118
119                         if(opencl_error(ciErr)) {
120                                 opencl_error("OpenCL: clCreateContext failed");
121                                 return;
122                         }
123
124                         /* cache it */
125                         OpenCLCache::store_context(cpPlatform, cdDevice, cxContext, cache_locker);
126                 }
127         }
128
129         cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr);
130         if(opencl_error(ciErr)) {
131                 opencl_error("OpenCL: Error creating command queue");
132                 return;
133         }
134
135         null_mem = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_ONLY, 1, NULL, &ciErr);
136         if(opencl_error(ciErr)) {
137                 opencl_error("OpenCL: Error creating memory buffer for NULL");
138                 return;
139         }
140
141         /* Allocate this right away so that texture_info_buffer is placed at offset 0 in the device memory buffers */
142         texture_info.resize(1);
143         texture_info_buffer.resize(1);
144         texture_info_buffer.data_pointer = (device_ptr)&texture_info[0];
145         memory_manager.alloc("texture_info", texture_info_buffer);
146
147         fprintf(stderr, "Device init success\n");
148         device_initialized = true;
149 }
150
151 OpenCLDeviceBase::~OpenCLDeviceBase()
152 {
153         task_pool.stop();
154
155         memory_manager.free();
156
157         if(null_mem)
158                 clReleaseMemObject(CL_MEM_PTR(null_mem));
159
160         ConstMemMap::iterator mt;
161         for(mt = const_mem_map.begin(); mt != const_mem_map.end(); mt++) {
162                 mem_free(*(mt->second));
163                 delete mt->second;
164         }
165
166         base_program.release();
167         if(cqCommandQueue)
168                 clReleaseCommandQueue(cqCommandQueue);
169         if(cxContext)
170                 clReleaseContext(cxContext);
171 }
172
173 void CL_CALLBACK OpenCLDeviceBase::context_notify_callback(const char *err_info,
174         const void * /*private_info*/, size_t /*cb*/, void *user_data)
175 {
176         string device_name = OpenCLInfo::get_device_name((cl_device_id)user_data);
177         fprintf(stderr, "OpenCL error (%s): %s\n", device_name.c_str(), err_info);
178 }
179
180 bool OpenCLDeviceBase::opencl_version_check()
181 {
182         string error;
183         if(!OpenCLInfo::platform_version_check(cpPlatform, &error)) {
184                 opencl_error(error);
185                 return false;
186         }
187         if(!OpenCLInfo::device_version_check(cdDevice, &error)) {
188                 opencl_error(error);
189                 return false;
190         }
191         return true;
192 }
193
194 string OpenCLDeviceBase::device_md5_hash(string kernel_custom_build_options)
195 {
196         MD5Hash md5;
197         char version[256], driver[256], name[256], vendor[256];
198
199         clGetPlatformInfo(cpPlatform, CL_PLATFORM_VENDOR, sizeof(vendor), &vendor, NULL);
200         clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL);
201         clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(name), &name, NULL);
202         clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(driver), &driver, NULL);
203
204         md5.append((uint8_t*)vendor, strlen(vendor));
205         md5.append((uint8_t*)version, strlen(version));
206         md5.append((uint8_t*)name, strlen(name));
207         md5.append((uint8_t*)driver, strlen(driver));
208
209         string options = kernel_build_options();
210         options += kernel_custom_build_options;
211         md5.append((uint8_t*)options.c_str(), options.size());
212
213         return md5.get_hex();
214 }
215
216 bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_features)
217 {
218         VLOG(2) << "Loading kernels for platform " << platform_name
219                 << ", device " << device_name << ".";
220         /* Verify if device was initialized. */
221         if(!device_initialized) {
222                 fprintf(stderr, "OpenCL: failed to initialize device.\n");
223                 return false;
224         }
225
226         /* Verify we have right opencl version. */
227         if(!opencl_version_check())
228                 return false;
229
230         base_program = OpenCLProgram(this, "base", "kernel.cl", build_options_for_base_program(requested_features));
231         base_program.add_kernel(ustring("convert_to_byte"));
232         base_program.add_kernel(ustring("convert_to_half_float"));
233         base_program.add_kernel(ustring("displace"));
234         base_program.add_kernel(ustring("background"));
235         base_program.add_kernel(ustring("bake"));
236         base_program.add_kernel(ustring("zero_buffer"));
237
238         denoising_program = OpenCLProgram(this, "denoising", "filter.cl", "");
239         denoising_program.add_kernel(ustring("filter_divide_shadow"));
240         denoising_program.add_kernel(ustring("filter_get_feature"));
241         denoising_program.add_kernel(ustring("filter_detect_outliers"));
242         denoising_program.add_kernel(ustring("filter_combine_halves"));
243         denoising_program.add_kernel(ustring("filter_construct_transform"));
244         denoising_program.add_kernel(ustring("filter_nlm_calc_difference"));
245         denoising_program.add_kernel(ustring("filter_nlm_blur"));
246         denoising_program.add_kernel(ustring("filter_nlm_calc_weight"));
247         denoising_program.add_kernel(ustring("filter_nlm_update_output"));
248         denoising_program.add_kernel(ustring("filter_nlm_normalize"));
249         denoising_program.add_kernel(ustring("filter_nlm_construct_gramian"));
250         denoising_program.add_kernel(ustring("filter_finalize"));
251         denoising_program.add_kernel(ustring("filter_set_tiles"));
252
253         vector<OpenCLProgram*> programs;
254         programs.push_back(&base_program);
255         programs.push_back(&denoising_program);
256         /* Call actual class to fill the vector with its programs. */
257         if(!load_kernels(requested_features, programs)) {
258                 return false;
259         }
260
261         /* Parallel compilation is supported by Cycles, but currently all OpenCL frameworks
262          * serialize the calls internally, so it's not much use right now.
263          * Note: When enabling parallel compilation, use_stdout in the OpenCLProgram constructor
264          * should be set to false as well. */
265 #if 0
266         TaskPool task_pool;
267         foreach(OpenCLProgram *program, programs) {
268                 task_pool.push(function_bind(&OpenCLProgram::load, program));
269         }
270         task_pool.wait_work();
271
272         foreach(OpenCLProgram *program, programs) {
273                 VLOG(2) << program->get_log();
274                 if(!program->is_loaded()) {
275                         program->report_error();
276                         return false;
277                 }
278         }
279 #else
280         foreach(OpenCLProgram *program, programs) {
281                 program->load();
282                 if(!program->is_loaded()) {
283                         return false;
284                 }
285         }
286 #endif
287
288         return true;
289 }
290
291 void OpenCLDeviceBase::mem_alloc(device_memory& mem)
292 {
293         if(mem.name) {
294                 VLOG(1) << "Buffer allocate: " << mem.name << ", "
295                             << string_human_readable_number(mem.memory_size()) << " bytes. ("
296                             << string_human_readable_size(mem.memory_size()) << ")";
297         }
298
299         size_t size = mem.memory_size();
300
301         /* check there is enough memory available for the allocation */
302         cl_ulong max_alloc_size = 0;
303         clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_alloc_size, NULL);
304
305         if(DebugFlags().opencl.mem_limit) {
306                 max_alloc_size = min(max_alloc_size,
307                                      cl_ulong(DebugFlags().opencl.mem_limit - stats.mem_used));
308         }
309
310         if(size > max_alloc_size) {
311                 string error = "Scene too complex to fit in available memory.";
312                 if(mem.name != NULL) {
313                         error += string_printf(" (allocating buffer %s failed.)", mem.name);
314                 }
315                 set_error(error);
316
317                 return;
318         }
319
320         cl_mem_flags mem_flag;
321         void *mem_ptr = NULL;
322
323         if(mem.type == MEM_READ_ONLY)
324                 mem_flag = CL_MEM_READ_ONLY;
325         else if(mem.type == MEM_WRITE_ONLY)
326                 mem_flag = CL_MEM_WRITE_ONLY;
327         else
328                 mem_flag = CL_MEM_READ_WRITE;
329
330         /* Zero-size allocation might be invoked by render, but not really
331          * supported by OpenCL. Using NULL as device pointer also doesn't really
332          * work for some reason, so for the time being we'll use special case
333          * will null_mem buffer.
334          */
335         if(size != 0) {
336                 mem.device_pointer = (device_ptr)clCreateBuffer(cxContext,
337                                                                 mem_flag,
338                                                                 size,
339                                                                 mem_ptr,
340                                                                 &ciErr);
341                 opencl_assert_err(ciErr, "clCreateBuffer");
342         }
343         else {
344                 mem.device_pointer = null_mem;
345         }
346
347         stats.mem_alloc(size);
348         mem.device_size = size;
349 }
350
351 void OpenCLDeviceBase::mem_copy_to(device_memory& mem)
352 {
353         /* this is blocking */
354         size_t size = mem.memory_size();
355         if(size != 0) {
356                 opencl_assert(clEnqueueWriteBuffer(cqCommandQueue,
357                                                    CL_MEM_PTR(mem.device_pointer),
358                                                    CL_TRUE,
359                                                    0,
360                                                    size,
361                                                    (void*)mem.data_pointer,
362                                                    0,
363                                                    NULL, NULL));
364         }
365 }
366
367 void OpenCLDeviceBase::mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
368 {
369         size_t offset = elem*y*w;
370         size_t size = elem*w*h;
371         assert(size != 0);
372         opencl_assert(clEnqueueReadBuffer(cqCommandQueue,
373                                           CL_MEM_PTR(mem.device_pointer),
374                                           CL_TRUE,
375                                           offset,
376                                           size,
377                                           (uchar*)mem.data_pointer + offset,
378                                           0,
379                                           NULL, NULL));
380 }
381
382 void OpenCLDeviceBase::mem_zero_kernel(device_ptr mem, size_t size)
383 {
384         cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer"));
385
386         size_t global_size[] = {1024, 1024};
387         size_t num_threads = global_size[0] * global_size[1];
388
389         cl_mem d_buffer = CL_MEM_PTR(mem);
390         cl_ulong d_offset = 0;
391         cl_ulong d_size = 0;
392
393         while(d_offset < size) {
394                 d_size = std::min<cl_ulong>(num_threads*sizeof(float4), size - d_offset);
395
396                 kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset);
397
398                 ciErr = clEnqueueNDRangeKernel(cqCommandQueue,
399                                                ckZeroBuffer,
400                                                2,
401                                                NULL,
402                                                global_size,
403                                                NULL,
404                                                0,
405                                                NULL,
406                                                NULL);
407                 opencl_assert_err(ciErr, "clEnqueueNDRangeKernel");
408
409                 d_offset += d_size;
410         }
411 }
412
413 void OpenCLDeviceBase::mem_zero(device_memory& mem)
414 {
415         if(mem.device_pointer) {
416                 if(base_program.is_loaded()) {
417                         mem_zero_kernel(mem.device_pointer, mem.memory_size());
418                 }
419
420                 if(mem.data_pointer) {
421                         memset((void*)mem.data_pointer, 0, mem.memory_size());
422                 }
423
424                 if(!base_program.is_loaded()) {
425                         void* zero = (void*)mem.data_pointer;
426
427                         if(!mem.data_pointer) {
428                                 zero = util_aligned_malloc(mem.memory_size(), 16);
429                                 memset(zero, 0, mem.memory_size());
430                         }
431
432                         opencl_assert(clEnqueueWriteBuffer(cqCommandQueue,
433                                                            CL_MEM_PTR(mem.device_pointer),
434                                                            CL_TRUE,
435                                                            0,
436                                                            mem.memory_size(),
437                                                            zero,
438                                                            0,
439                                                            NULL, NULL));
440
441                         if(!mem.data_pointer) {
442                                 util_aligned_free(zero);
443                         }
444                 }
445         }
446 }
447
448 void OpenCLDeviceBase::mem_free(device_memory& mem)
449 {
450         if(mem.device_pointer) {
451                 if(mem.device_pointer != null_mem) {
452                         opencl_assert(clReleaseMemObject(CL_MEM_PTR(mem.device_pointer)));
453                 }
454                 mem.device_pointer = 0;
455
456                 stats.mem_free(mem.device_size);
457                 mem.device_size = 0;
458         }
459 }
460
461 int OpenCLDeviceBase::mem_address_alignment()
462 {
463         return OpenCLInfo::mem_address_alignment(cdDevice);
464 }
465
466 device_ptr OpenCLDeviceBase::mem_alloc_sub_ptr(device_memory& mem, int offset, int size)
467 {
468         cl_mem_flags mem_flag;
469         if(mem.type == MEM_READ_ONLY)
470                 mem_flag = CL_MEM_READ_ONLY;
471         else if(mem.type == MEM_WRITE_ONLY)
472                 mem_flag = CL_MEM_WRITE_ONLY;
473         else
474                 mem_flag = CL_MEM_READ_WRITE;
475
476         cl_buffer_region info;
477         info.origin = mem.memory_elements_size(offset);
478         info.size = mem.memory_elements_size(size);
479
480         device_ptr sub_buf = (device_ptr) clCreateSubBuffer(CL_MEM_PTR(mem.device_pointer),
481                                                             mem_flag,
482                                                             CL_BUFFER_CREATE_TYPE_REGION,
483                                                             &info,
484                                                             &ciErr);
485         opencl_assert_err(ciErr, "clCreateSubBuffer");
486         return sub_buf;
487 }
488
489 void OpenCLDeviceBase::mem_free_sub_ptr(device_ptr device_pointer)
490 {
491         if(device_pointer && device_pointer != null_mem) {
492                 opencl_assert(clReleaseMemObject(CL_MEM_PTR(device_pointer)));
493         }
494 }
495
496 void OpenCLDeviceBase::const_copy_to(const char *name, void *host, size_t size)
497 {
498         ConstMemMap::iterator i = const_mem_map.find(name);
499         device_vector<uchar> *data;
500
501         if(i == const_mem_map.end()) {
502                 data = new device_vector<uchar>(this, name, MEM_READ_ONLY);
503                 data->resize(size);
504
505                 mem_alloc(*data);
506                 const_mem_map.insert(ConstMemMap::value_type(name, data));
507         }
508         else {
509                 data = i->second;
510         }
511
512         memcpy(data->get_data(), host, size);
513         mem_copy_to(*data);
514 }
515
516 void OpenCLDeviceBase::tex_alloc(device_memory& mem)
517 {
518         VLOG(1) << "Texture allocate: " << mem.name << ", "
519                 << string_human_readable_number(mem.memory_size()) << " bytes. ("
520                 << string_human_readable_size(mem.memory_size()) << ")";
521
522         memory_manager.alloc(mem.name, mem);
523         /* Set the pointer to non-null to keep code that inspects its value from thinking its unallocated. */
524         mem.device_pointer = 1;
525         textures[mem.name] = &mem;
526         textures_need_update = true;
527 }
528
529 void OpenCLDeviceBase::tex_free(device_memory& mem)
530 {
531         if(mem.device_pointer) {
532                 mem.device_pointer = 0;
533
534                 if(memory_manager.free(mem)) {
535                         textures_need_update = true;
536                 }
537
538                 foreach(TexturesMap::value_type& value, textures) {
539                         if(value.second == &mem) {
540                                 textures.erase(value.first);
541                                 break;
542                         }
543                 }
544         }
545 }
546
547 size_t OpenCLDeviceBase::global_size_round_up(int group_size, int global_size)
548 {
549         int r = global_size % group_size;
550         return global_size + ((r == 0)? 0: group_size - r);
551 }
552
553 void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size_t max_workgroup_size)
554 {
555         size_t workgroup_size, max_work_items[3];
556
557         clGetKernelWorkGroupInfo(kernel, cdDevice,
558                 CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL);
559         clGetDeviceInfo(cdDevice,
560                 CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, max_work_items, NULL);
561
562         if(max_workgroup_size > 0 && workgroup_size > max_workgroup_size) {
563                 workgroup_size = max_workgroup_size;
564         }
565
566         /* Try to divide evenly over 2 dimensions. */
567         size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1);
568         size_t local_size[2] = {sqrt_workgroup_size, sqrt_workgroup_size};
569
570         /* Some implementations have max size 1 on 2nd dimension. */
571         if(local_size[1] > max_work_items[1]) {
572                 local_size[0] = workgroup_size/max_work_items[1];
573                 local_size[1] = max_work_items[1];
574         }
575
576         size_t global_size[2] = {global_size_round_up(local_size[0], w),
577                                  global_size_round_up(local_size[1], h)};
578
579         /* Vertical size of 1 is coming from bake/shade kernels where we should
580          * not round anything up because otherwise we'll either be doing too
581          * much work per pixel (if we don't check global ID on Y axis) or will
582          * be checking for global ID to always have Y of 0.
583          */
584         if(h == 1) {
585                 global_size[h] = 1;
586         }
587
588         /* run kernel */
589         opencl_assert(clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL));
590         opencl_assert(clFlush(cqCommandQueue));
591 }
592
593 void OpenCLDeviceBase::set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name)
594 {
595         cl_mem ptr;
596
597         MemMap::iterator i = mem_map.find(name);
598         if(i != mem_map.end()) {
599                 ptr = CL_MEM_PTR(i->second);
600         }
601         else {
602                 /* work around NULL not working, even though the spec says otherwise */
603                 ptr = CL_MEM_PTR(null_mem);
604         }
605
606         opencl_assert(clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void*)&ptr));
607 }
608
609 void OpenCLDeviceBase::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg)
610 {
611         flush_texture_buffers();
612
613         memory_manager.set_kernel_arg_buffers(kernel, narg);
614 }
615
616 void OpenCLDeviceBase::flush_texture_buffers()
617 {
618         if(!textures_need_update) {
619                 return;
620         }
621         textures_need_update = false;
622
623         /* Setup slots for textures. */
624         int num_slots = 0;
625
626         vector<texture_slot_t> texture_slots;
627
628 #define KERNEL_TEX(type, name) \
629         if(textures.find(#name) != textures.end()) { \
630                 texture_slots.push_back(texture_slot_t(#name, num_slots)); \
631         } \
632         num_slots++;
633 #include "kernel/kernel_textures.h"
634
635         int num_data_slots = num_slots;
636
637         foreach(TexturesMap::value_type& tex, textures) {
638                 string name = tex.first;
639
640                 if(string_startswith(name, "__tex_image")) {
641                         int pos = name.rfind("_");
642                         int id = atoi(name.data() + pos + 1);
643                         texture_slots.push_back(texture_slot_t(name,
644                                                                    num_data_slots + id));
645                         num_slots = max(num_slots, num_data_slots + id + 1);
646                 }
647         }
648
649         /* Realloc texture descriptors buffer. */
650         memory_manager.free(texture_info_buffer);
651
652         texture_info.resize(num_slots);
653         texture_info_buffer.resize(num_slots * sizeof(TextureInfo));
654         texture_info_buffer.data_pointer = (device_ptr)&texture_info[0];
655
656         memory_manager.alloc("texture_info", texture_info_buffer);
657
658         /* Fill in descriptors */
659         foreach(texture_slot_t& slot, texture_slots) {
660                 TextureInfo& info = texture_info[slot.slot];
661
662                 MemoryManager::BufferDescriptor desc = memory_manager.get_descriptor(slot.name);
663                 info.data = desc.offset;
664                 info.cl_buffer = desc.device_buffer;
665
666                 if(string_startswith(slot.name, "__tex_image")) {
667                         device_memory *mem = textures[slot.name];
668
669                         info.width = mem->data_width;
670                         info.height = mem->data_height;
671                         info.depth = mem->data_depth;
672
673                         info.interpolation = mem->interpolation;
674                         info.extension = mem->extension;
675                 }
676         }
677
678         /* Force write of descriptors. */
679         memory_manager.free(texture_info_buffer);
680         memory_manager.alloc("texture_info", texture_info_buffer);
681 }
682
683 void OpenCLDeviceBase::film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half)
684 {
685         /* cast arguments to cl types */
686         cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
687         cl_mem d_rgba = (rgba_byte)? CL_MEM_PTR(rgba_byte): CL_MEM_PTR(rgba_half);
688         cl_mem d_buffer = CL_MEM_PTR(buffer);
689         cl_int d_x = task.x;
690         cl_int d_y = task.y;
691         cl_int d_w = task.w;
692         cl_int d_h = task.h;
693         cl_float d_sample_scale = 1.0f/(task.sample + 1);
694         cl_int d_offset = task.offset;
695         cl_int d_stride = task.stride;
696
697
698         cl_kernel ckFilmConvertKernel = (rgba_byte)? base_program(ustring("convert_to_byte")): base_program(ustring("convert_to_half_float"));
699
700         cl_uint start_arg_index =
701                 kernel_set_args(ckFilmConvertKernel,
702                                 0,
703                                 d_data,
704                                 d_rgba,
705                                 d_buffer);
706
707         set_kernel_arg_buffers(ckFilmConvertKernel, &start_arg_index);
708
709         start_arg_index += kernel_set_args(ckFilmConvertKernel,
710                                            start_arg_index,
711                                            d_sample_scale,
712                                            d_x,
713                                            d_y,
714                                            d_w,
715                                            d_h,
716                                            d_offset,
717                                            d_stride);
718
719         enqueue_kernel(ckFilmConvertKernel, d_w, d_h);
720 }
721
722 bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr,
723                                                  device_ptr guide_ptr,
724                                                  device_ptr variance_ptr,
725                                                  device_ptr out_ptr,
726                                                  DenoisingTask *task)
727 {
728         int4 rect = task->rect;
729         int w = rect.z-rect.x;
730         int h = rect.w-rect.y;
731         int r = task->nlm_state.r;
732         int f = task->nlm_state.f;
733         float a = task->nlm_state.a;
734         float k_2 = task->nlm_state.k_2;
735
736         cl_mem difference     = CL_MEM_PTR(task->nlm_state.temporary_1_ptr);
737         cl_mem blurDifference = CL_MEM_PTR(task->nlm_state.temporary_2_ptr);
738         cl_mem weightAccum    = CL_MEM_PTR(task->nlm_state.temporary_3_ptr);
739
740         cl_mem image_mem = CL_MEM_PTR(image_ptr);
741         cl_mem guide_mem = CL_MEM_PTR(guide_ptr);
742         cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
743         cl_mem out_mem = CL_MEM_PTR(out_ptr);
744
745         mem_zero_kernel(task->nlm_state.temporary_3_ptr, sizeof(float)*w*h);
746         mem_zero_kernel(out_ptr, sizeof(float)*w*h);
747
748         cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference"));
749         cl_kernel ckNLMBlur           = denoising_program(ustring("filter_nlm_blur"));
750         cl_kernel ckNLMCalcWeight     = denoising_program(ustring("filter_nlm_calc_weight"));
751         cl_kernel ckNLMUpdateOutput   = denoising_program(ustring("filter_nlm_update_output"));
752         cl_kernel ckNLMNormalize      = denoising_program(ustring("filter_nlm_normalize"));
753
754         for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
755                 int dy = i / (2*r+1) - r;
756                 int dx = i % (2*r+1) - r;
757                 int4 local_rect = make_int4(max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy));
758                 kernel_set_args(ckNLMCalcDifference, 0,
759                                 dx, dy, guide_mem, variance_mem,
760                                 difference, local_rect, w, 0, a, k_2);
761                 kernel_set_args(ckNLMBlur, 0,
762                                 difference, blurDifference, local_rect, w, f);
763                 kernel_set_args(ckNLMCalcWeight, 0,
764                                 blurDifference, difference, local_rect, w, f);
765                 kernel_set_args(ckNLMUpdateOutput, 0,
766                                 dx, dy, blurDifference, image_mem,
767                                 out_mem, weightAccum, local_rect, w, f);
768
769                 enqueue_kernel(ckNLMCalcDifference, w, h);
770                 enqueue_kernel(ckNLMBlur,           w, h);
771                 enqueue_kernel(ckNLMCalcWeight,     w, h);
772                 enqueue_kernel(ckNLMBlur,           w, h);
773                 enqueue_kernel(ckNLMUpdateOutput,   w, h);
774         }
775
776         int4 local_rect = make_int4(0, 0, w, h);
777         kernel_set_args(ckNLMNormalize, 0,
778                         out_mem, weightAccum, local_rect, w);
779         enqueue_kernel(ckNLMNormalize, w, h);
780
781         return true;
782 }
783
784 bool OpenCLDeviceBase::denoising_construct_transform(DenoisingTask *task)
785 {
786         cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
787         cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
788         cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
789
790         cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform"));
791
792         kernel_set_args(ckFilterConstructTransform, 0,
793                         buffer_mem,
794                         transform_mem,
795                         rank_mem,
796                         task->filter_area,
797                         task->rect,
798                         task->buffer.pass_stride,
799                         task->radius,
800                         task->pca_threshold);
801
802         enqueue_kernel(ckFilterConstructTransform,
803                        task->storage.w,
804                        task->storage.h,
805                        256);
806
807         return true;
808 }
809
810 bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
811                                              device_ptr color_variance_ptr,
812                                              device_ptr output_ptr,
813                                              DenoisingTask *task)
814 {
815         mem_zero(task->storage.XtWX);
816         mem_zero(task->storage.XtWY);
817
818         cl_mem color_mem = CL_MEM_PTR(color_ptr);
819         cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr);
820         cl_mem output_mem = CL_MEM_PTR(output_ptr);
821
822         cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
823         cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
824         cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
825         cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer);
826         cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer);
827
828         cl_kernel ckNLMCalcDifference   = denoising_program(ustring("filter_nlm_calc_difference"));
829         cl_kernel ckNLMBlur             = denoising_program(ustring("filter_nlm_blur"));
830         cl_kernel ckNLMCalcWeight       = denoising_program(ustring("filter_nlm_calc_weight"));
831         cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian"));
832         cl_kernel ckFinalize            = denoising_program(ustring("filter_finalize"));
833
834         cl_mem difference     = CL_MEM_PTR(task->reconstruction_state.temporary_1_ptr);
835         cl_mem blurDifference = CL_MEM_PTR(task->reconstruction_state.temporary_2_ptr);
836
837         int r = task->radius;
838         int f = 4;
839         float a = 1.0f;
840         for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
841                 int dy = i / (2*r+1) - r;
842                 int dx = i % (2*r+1) - r;
843
844                 int local_rect[4] = {max(0, -dx), max(0, -dy),
845                                      task->reconstruction_state.source_w - max(0, dx),
846                                      task->reconstruction_state.source_h - max(0, dy)};
847
848                 kernel_set_args(ckNLMCalcDifference, 0,
849                                 dx, dy,
850                                 color_mem,
851                                 color_variance_mem,
852                                 difference,
853                                 local_rect,
854                                 task->buffer.w,
855                                 task->buffer.pass_stride,
856                                 a, task->nlm_k_2);
857                 enqueue_kernel(ckNLMCalcDifference,
858                                task->reconstruction_state.source_w,
859                                task->reconstruction_state.source_h);
860
861                 kernel_set_args(ckNLMBlur, 0,
862                                 difference,
863                                 blurDifference,
864                                 local_rect,
865                                 task->buffer.w,
866                                 f);
867                 enqueue_kernel(ckNLMBlur,
868                                task->reconstruction_state.source_w,
869                                task->reconstruction_state.source_h);
870
871                 kernel_set_args(ckNLMCalcWeight, 0,
872                                 blurDifference,
873                                 difference,
874                                 local_rect,
875                                 task->buffer.w,
876                                 f);
877                 enqueue_kernel(ckNLMCalcWeight,
878                                task->reconstruction_state.source_w,
879                                task->reconstruction_state.source_h);
880
881                 /* Reuse previous arguments. */
882                 enqueue_kernel(ckNLMBlur,
883                                task->reconstruction_state.source_w,
884                                task->reconstruction_state.source_h);
885
886                 kernel_set_args(ckNLMConstructGramian, 0,
887                                 dx, dy,
888                                 blurDifference,
889                                 buffer_mem,
890                                 transform_mem,
891                                 rank_mem,
892                                 XtWX_mem,
893                                 XtWY_mem,
894                                 local_rect,
895                                 task->reconstruction_state.filter_rect,
896                                 task->buffer.w,
897                                 task->buffer.h,
898                                 f,
899                             task->buffer.pass_stride);
900                 enqueue_kernel(ckNLMConstructGramian,
901                                task->reconstruction_state.source_w,
902                                task->reconstruction_state.source_h,
903                                256);
904         }
905
906         kernel_set_args(ckFinalize, 0,
907                         task->buffer.w,
908                         task->buffer.h,
909                         output_mem,
910                         rank_mem,
911                         XtWX_mem,
912                         XtWY_mem,
913                         task->filter_area,
914                         task->reconstruction_state.buffer_params,
915                         task->render_buffer.samples);
916         enqueue_kernel(ckFinalize,
917                        task->reconstruction_state.source_w,
918                        task->reconstruction_state.source_h);
919
920         return true;
921 }
922
923 bool OpenCLDeviceBase::denoising_combine_halves(device_ptr a_ptr,
924                                                 device_ptr b_ptr,
925                                                 device_ptr mean_ptr,
926                                                 device_ptr variance_ptr,
927                                                 int r, int4 rect,
928                                                 DenoisingTask *task)
929 {
930         cl_mem a_mem = CL_MEM_PTR(a_ptr);
931         cl_mem b_mem = CL_MEM_PTR(b_ptr);
932         cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
933         cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
934
935         cl_kernel ckFilterCombineHalves = denoising_program(ustring("filter_combine_halves"));
936
937         kernel_set_args(ckFilterCombineHalves, 0,
938                         mean_mem,
939                         variance_mem,
940                         a_mem,
941                         b_mem,
942                         rect,
943                         r);
944         enqueue_kernel(ckFilterCombineHalves,
945                        task->rect.z-task->rect.x,
946                        task->rect.w-task->rect.y);
947
948         return true;
949 }
950
951 bool OpenCLDeviceBase::denoising_divide_shadow(device_ptr a_ptr,
952                                                device_ptr b_ptr,
953                                                device_ptr sample_variance_ptr,
954                                                device_ptr sv_variance_ptr,
955                                                device_ptr buffer_variance_ptr,
956                                                DenoisingTask *task)
957 {
958         cl_mem a_mem = CL_MEM_PTR(a_ptr);
959         cl_mem b_mem = CL_MEM_PTR(b_ptr);
960         cl_mem sample_variance_mem = CL_MEM_PTR(sample_variance_ptr);
961         cl_mem sv_variance_mem = CL_MEM_PTR(sv_variance_ptr);
962         cl_mem buffer_variance_mem = CL_MEM_PTR(buffer_variance_ptr);
963
964         cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer);
965
966         cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow"));
967
968         kernel_set_args(ckFilterDivideShadow, 0,
969                         task->render_buffer.samples,
970                         tiles_mem,
971                         a_mem,
972                         b_mem,
973                         sample_variance_mem,
974                         sv_variance_mem,
975                         buffer_variance_mem,
976                         task->rect,
977                         task->render_buffer.pass_stride,
978                         task->render_buffer.denoising_data_offset);
979         enqueue_kernel(ckFilterDivideShadow,
980                        task->rect.z-task->rect.x,
981                        task->rect.w-task->rect.y);
982
983         return true;
984 }
985
986 bool OpenCLDeviceBase::denoising_get_feature(int mean_offset,
987                                              int variance_offset,
988                                              device_ptr mean_ptr,
989                                              device_ptr variance_ptr,
990                                              DenoisingTask *task)
991 {
992         cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
993         cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
994
995         cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer);
996
997         cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature"));
998
999         kernel_set_args(ckFilterGetFeature, 0,
1000                         task->render_buffer.samples,
1001                         tiles_mem,
1002                         mean_offset,
1003                         variance_offset,
1004                         mean_mem,
1005                         variance_mem,
1006                         task->rect,
1007                         task->render_buffer.pass_stride,
1008                         task->render_buffer.denoising_data_offset);
1009         enqueue_kernel(ckFilterGetFeature,
1010                        task->rect.z-task->rect.x,
1011                        task->rect.w-task->rect.y);
1012
1013         return true;
1014 }
1015
1016 bool OpenCLDeviceBase::denoising_detect_outliers(device_ptr image_ptr,
1017                                                  device_ptr variance_ptr,
1018                                                  device_ptr depth_ptr,
1019                                                  device_ptr output_ptr,
1020                                                  DenoisingTask *task)
1021 {
1022         cl_mem image_mem = CL_MEM_PTR(image_ptr);
1023         cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
1024         cl_mem depth_mem = CL_MEM_PTR(depth_ptr);
1025         cl_mem output_mem = CL_MEM_PTR(output_ptr);
1026
1027         cl_kernel ckFilterDetectOutliers = denoising_program(ustring("filter_detect_outliers"));
1028
1029         kernel_set_args(ckFilterDetectOutliers, 0,
1030                         image_mem,
1031                         variance_mem,
1032                         depth_mem,
1033                         output_mem,
1034                         task->rect,
1035                         task->buffer.pass_stride);
1036         enqueue_kernel(ckFilterDetectOutliers,
1037                        task->rect.z-task->rect.x,
1038                        task->rect.w-task->rect.y);
1039
1040         return true;
1041 }
1042
1043 bool OpenCLDeviceBase::denoising_set_tiles(device_ptr *buffers,
1044                                            DenoisingTask *task)
1045 {
1046         mem_alloc(task->tiles_mem);
1047         mem_copy_to(task->tiles_mem);
1048
1049         cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer);
1050
1051         cl_kernel ckFilterSetTiles = denoising_program(ustring("filter_set_tiles"));
1052
1053         kernel_set_args(ckFilterSetTiles, 0, tiles_mem);
1054         for(int i = 0; i < 9; i++) {
1055                 cl_mem buffer_mem = CL_MEM_PTR(buffers[i]);
1056                 kernel_set_args(ckFilterSetTiles, i+1, buffer_mem);
1057         }
1058
1059         enqueue_kernel(ckFilterSetTiles, 1, 1);
1060
1061         return true;
1062 }
1063
1064 void OpenCLDeviceBase::denoise(RenderTile &rtile, const DeviceTask &task)
1065 {
1066         DenoisingTask denoising(this);
1067
1068         denoising.functions.set_tiles = function_bind(&OpenCLDeviceBase::denoising_set_tiles, this, _1, &denoising);
1069         denoising.functions.construct_transform = function_bind(&OpenCLDeviceBase::denoising_construct_transform, this, &denoising);
1070         denoising.functions.reconstruct = function_bind(&OpenCLDeviceBase::denoising_reconstruct, this, _1, _2, _3, &denoising);
1071         denoising.functions.divide_shadow = function_bind(&OpenCLDeviceBase::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
1072         denoising.functions.non_local_means = function_bind(&OpenCLDeviceBase::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
1073         denoising.functions.combine_halves = function_bind(&OpenCLDeviceBase::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
1074         denoising.functions.get_feature = function_bind(&OpenCLDeviceBase::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
1075         denoising.functions.detect_outliers = function_bind(&OpenCLDeviceBase::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
1076
1077         denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
1078         denoising.render_buffer.samples = rtile.sample;
1079
1080         RenderTile rtiles[9];
1081         rtiles[4] = rtile;
1082         task.map_neighbor_tiles(rtiles, this);
1083         denoising.tiles_from_rendertiles(rtiles);
1084
1085         denoising.init_from_devicetask(task);
1086
1087         denoising.run_denoising();
1088
1089         task.unmap_neighbor_tiles(rtiles, this);
1090 }
1091
1092 void OpenCLDeviceBase::shader(DeviceTask& task)
1093 {
1094         /* cast arguments to cl types */
1095         cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
1096         cl_mem d_input = CL_MEM_PTR(task.shader_input);
1097         cl_mem d_output = CL_MEM_PTR(task.shader_output);
1098         cl_int d_shader_eval_type = task.shader_eval_type;
1099         cl_int d_shader_filter = task.shader_filter;
1100         cl_int d_shader_x = task.shader_x;
1101         cl_int d_shader_w = task.shader_w;
1102         cl_int d_offset = task.offset;
1103
1104         cl_kernel kernel;
1105
1106         if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
1107                 kernel = base_program(ustring("bake"));
1108         }
1109         else if(task.shader_eval_type == SHADER_EVAL_DISPLACE) {
1110                 kernel = base_program(ustring("displace"));
1111         }
1112         else {
1113                 kernel = base_program(ustring("background"));
1114         }
1115
1116         cl_uint start_arg_index =
1117                 kernel_set_args(kernel,
1118                                 0,
1119                                 d_data,
1120                                 d_input,
1121                                 d_output);
1122
1123         set_kernel_arg_buffers(kernel, &start_arg_index);
1124
1125         start_arg_index += kernel_set_args(kernel,
1126                                            start_arg_index,
1127                                            d_shader_eval_type);
1128         if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
1129                 start_arg_index += kernel_set_args(kernel,
1130                                                    start_arg_index,
1131                                                    d_shader_filter);
1132         }
1133         start_arg_index += kernel_set_args(kernel,
1134                                            start_arg_index,
1135                                            d_shader_x,
1136                                            d_shader_w,
1137                                            d_offset);
1138
1139         for(int sample = 0; sample < task.num_samples; sample++) {
1140
1141                 if(task.get_cancel())
1142                         break;
1143
1144                 kernel_set_args(kernel, start_arg_index, sample);
1145
1146                 enqueue_kernel(kernel, task.shader_w, 1);
1147
1148                 clFinish(cqCommandQueue);
1149
1150                 task.update_progress(NULL);
1151         }
1152 }
1153
1154 string OpenCLDeviceBase::kernel_build_options(const string *debug_src)
1155 {
1156         string build_options = "-cl-no-signed-zeros -cl-mad-enable ";
1157
1158         if(platform_name == "NVIDIA CUDA") {
1159                 build_options += "-D__KERNEL_OPENCL_NVIDIA__ "
1160                                  "-cl-nv-maxrregcount=32 "
1161                                  "-cl-nv-verbose ";
1162
1163                 uint compute_capability_major, compute_capability_minor;
1164                 clGetDeviceInfo(cdDevice, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,
1165                                 sizeof(cl_uint), &compute_capability_major, NULL);
1166                 clGetDeviceInfo(cdDevice, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,
1167                                 sizeof(cl_uint), &compute_capability_minor, NULL);
1168
1169                 build_options += string_printf("-D__COMPUTE_CAPABILITY__=%u ",
1170                                                compute_capability_major * 100 +
1171                                                compute_capability_minor * 10);
1172         }
1173
1174         else if(platform_name == "Apple")
1175                 build_options += "-D__KERNEL_OPENCL_APPLE__ ";
1176
1177         else if(platform_name == "AMD Accelerated Parallel Processing")
1178                 build_options += "-D__KERNEL_OPENCL_AMD__ ";
1179
1180         else if(platform_name == "Intel(R) OpenCL") {
1181                 build_options += "-D__KERNEL_OPENCL_INTEL_CPU__ ";
1182
1183                 /* Options for gdb source level kernel debugging.
1184                  * this segfaults on linux currently.
1185                  */
1186                 if(OpenCLInfo::use_debug() && debug_src)
1187                         build_options += "-g -s \"" + *debug_src + "\" ";
1188         }
1189
1190         if(OpenCLInfo::use_debug())
1191                 build_options += "-D__KERNEL_OPENCL_DEBUG__ ";
1192
1193 #ifdef WITH_CYCLES_DEBUG
1194         build_options += "-D__KERNEL_DEBUG__ ";
1195 #endif
1196
1197         return build_options;
1198 }
1199
1200 /* TODO(sergey): In the future we can use variadic templates, once
1201  * C++0x is allowed. Should allow to clean this up a bit.
1202  */
1203 int OpenCLDeviceBase::kernel_set_args(cl_kernel kernel,
1204                     int start_argument_index,
1205                     const ArgumentWrapper& arg1,
1206                     const ArgumentWrapper& arg2,
1207                     const ArgumentWrapper& arg3,
1208                     const ArgumentWrapper& arg4,
1209                     const ArgumentWrapper& arg5,
1210                     const ArgumentWrapper& arg6,
1211                     const ArgumentWrapper& arg7,
1212                     const ArgumentWrapper& arg8,
1213                     const ArgumentWrapper& arg9,
1214                     const ArgumentWrapper& arg10,
1215                     const ArgumentWrapper& arg11,
1216                     const ArgumentWrapper& arg12,
1217                     const ArgumentWrapper& arg13,
1218                     const ArgumentWrapper& arg14,
1219                     const ArgumentWrapper& arg15,
1220                     const ArgumentWrapper& arg16,
1221                     const ArgumentWrapper& arg17,
1222                     const ArgumentWrapper& arg18,
1223                     const ArgumentWrapper& arg19,
1224                     const ArgumentWrapper& arg20,
1225                     const ArgumentWrapper& arg21,
1226                     const ArgumentWrapper& arg22,
1227                     const ArgumentWrapper& arg23,
1228                     const ArgumentWrapper& arg24,
1229                     const ArgumentWrapper& arg25,
1230                     const ArgumentWrapper& arg26,
1231                     const ArgumentWrapper& arg27,
1232                     const ArgumentWrapper& arg28,
1233                     const ArgumentWrapper& arg29,
1234                     const ArgumentWrapper& arg30,
1235                     const ArgumentWrapper& arg31,
1236                     const ArgumentWrapper& arg32,
1237                     const ArgumentWrapper& arg33)
1238 {
1239         int current_arg_index = 0;
1240 #define FAKE_VARARG_HANDLE_ARG(arg) \
1241         do { \
1242                 if(arg.pointer != NULL) { \
1243                         opencl_assert(clSetKernelArg( \
1244                                 kernel, \
1245                                 start_argument_index + current_arg_index, \
1246                                 arg.size, arg.pointer)); \
1247                         ++current_arg_index; \
1248                 } \
1249                 else { \
1250                         return current_arg_index; \
1251                 } \
1252         } while(false)
1253         FAKE_VARARG_HANDLE_ARG(arg1);
1254         FAKE_VARARG_HANDLE_ARG(arg2);
1255         FAKE_VARARG_HANDLE_ARG(arg3);
1256         FAKE_VARARG_HANDLE_ARG(arg4);
1257         FAKE_VARARG_HANDLE_ARG(arg5);
1258         FAKE_VARARG_HANDLE_ARG(arg6);
1259         FAKE_VARARG_HANDLE_ARG(arg7);
1260         FAKE_VARARG_HANDLE_ARG(arg8);
1261         FAKE_VARARG_HANDLE_ARG(arg9);
1262         FAKE_VARARG_HANDLE_ARG(arg10);
1263         FAKE_VARARG_HANDLE_ARG(arg11);
1264         FAKE_VARARG_HANDLE_ARG(arg12);
1265         FAKE_VARARG_HANDLE_ARG(arg13);
1266         FAKE_VARARG_HANDLE_ARG(arg14);
1267         FAKE_VARARG_HANDLE_ARG(arg15);
1268         FAKE_VARARG_HANDLE_ARG(arg16);
1269         FAKE_VARARG_HANDLE_ARG(arg17);
1270         FAKE_VARARG_HANDLE_ARG(arg18);
1271         FAKE_VARARG_HANDLE_ARG(arg19);
1272         FAKE_VARARG_HANDLE_ARG(arg20);
1273         FAKE_VARARG_HANDLE_ARG(arg21);
1274         FAKE_VARARG_HANDLE_ARG(arg22);
1275         FAKE_VARARG_HANDLE_ARG(arg23);
1276         FAKE_VARARG_HANDLE_ARG(arg24);
1277         FAKE_VARARG_HANDLE_ARG(arg25);
1278         FAKE_VARARG_HANDLE_ARG(arg26);
1279         FAKE_VARARG_HANDLE_ARG(arg27);
1280         FAKE_VARARG_HANDLE_ARG(arg28);
1281         FAKE_VARARG_HANDLE_ARG(arg29);
1282         FAKE_VARARG_HANDLE_ARG(arg30);
1283         FAKE_VARARG_HANDLE_ARG(arg31);
1284         FAKE_VARARG_HANDLE_ARG(arg32);
1285         FAKE_VARARG_HANDLE_ARG(arg33);
1286 #undef FAKE_VARARG_HANDLE_ARG
1287         return current_arg_index;
1288 }
1289
1290 void OpenCLDeviceBase::release_kernel_safe(cl_kernel kernel)
1291 {
1292         if(kernel) {
1293                 clReleaseKernel(kernel);
1294         }
1295 }
1296
1297 void OpenCLDeviceBase::release_mem_object_safe(cl_mem mem)
1298 {
1299         if(mem != NULL) {
1300                 clReleaseMemObject(mem);
1301         }
1302 }
1303
1304 void OpenCLDeviceBase::release_program_safe(cl_program program)
1305 {
1306         if(program) {
1307                 clReleaseProgram(program);
1308         }
1309 }
1310
1311 /* ** Those guys are for workign around some compiler-specific bugs ** */
1312
1313 cl_program OpenCLDeviceBase::load_cached_kernel(
1314         ustring key,
1315         thread_scoped_lock& cache_locker)
1316 {
1317         return OpenCLCache::get_program(cpPlatform,
1318                                         cdDevice,
1319                                         key,
1320                                         cache_locker);
1321 }
1322
1323 void OpenCLDeviceBase::store_cached_kernel(
1324         cl_program program,
1325         ustring key,
1326         thread_scoped_lock& cache_locker)
1327 {
1328         OpenCLCache::store_program(cpPlatform,
1329                                    cdDevice,
1330                                    program,
1331                                    key,
1332                                    cache_locker);
1333 }
1334
1335 string OpenCLDeviceBase::build_options_for_base_program(
1336         const DeviceRequestedFeatures& requested_features)
1337 {
1338         /* TODO(sergey): By default we compile all features, meaning
1339          * mega kernel is not getting feature-based optimizations.
1340          *
1341          * Ideally we need always compile kernel with as less features
1342          * enabled as possible to keep performance at it's max.
1343          */
1344
1345         /* For now disable baking when not in use as this has major
1346          * impact on kernel build times.
1347          */
1348         if(!requested_features.use_baking) {
1349                 return "-D__NO_BAKING__";
1350         }
1351
1352         return "";
1353 }
1354
1355 CCL_NAMESPACE_END
1356
1357 #endif