90f461b4c988269e26b14fe9ec41bc89a94c296e
[blender-staging.git] / intern / cycles / device / opencl / opencl_base.cpp
1 /*
2  * Copyright 2011-2013 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #ifdef WITH_OPENCL
18
19 #include "device/opencl/opencl.h"
20
21 #include "kernel/kernel_types.h"
22
23 #include "util/util_algorithm.h"
24 #include "util/util_foreach.h"
25 #include "util/util_logging.h"
26 #include "util/util_md5.h"
27 #include "util/util_path.h"
28 #include "util/util_time.h"
29
30 CCL_NAMESPACE_BEGIN
31
32 struct texture_slot_t {
33         texture_slot_t(const string& name, int slot)
34                 : name(name),
35                   slot(slot) {
36         }
37         string name;
38         int slot;
39 };
40
41 bool OpenCLDeviceBase::opencl_error(cl_int err)
42 {
43         if(err != CL_SUCCESS) {
44                 string message = string_printf("OpenCL error (%d): %s", err, clewErrorString(err));
45                 if(error_msg == "")
46                         error_msg = message;
47                 fprintf(stderr, "%s\n", message.c_str());
48                 return true;
49         }
50
51         return false;
52 }
53
54 void OpenCLDeviceBase::opencl_error(const string& message)
55 {
56         if(error_msg == "")
57                 error_msg = message;
58         fprintf(stderr, "%s\n", message.c_str());
59 }
60
61 void OpenCLDeviceBase::opencl_assert_err(cl_int err, const char* where)
62 {
63         if(err != CL_SUCCESS) {
64                 string message = string_printf("OpenCL error (%d): %s in %s", err, clewErrorString(err), where);
65                 if(error_msg == "")
66                         error_msg = message;
67                 fprintf(stderr, "%s\n", message.c_str());
68 #ifndef NDEBUG
69                 abort();
70 #endif
71         }
72 }
73
74 OpenCLDeviceBase::OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool background_)
75 : Device(info, stats, background_),
76   memory_manager(this),
77   texture_info_buffer(this, "__texture_info", MEM_READ_ONLY)
78 {
79         cpPlatform = NULL;
80         cdDevice = NULL;
81         cxContext = NULL;
82         cqCommandQueue = NULL;
83         null_mem = 0;
84         device_initialized = false;
85         textures_need_update = true;
86
87         vector<OpenCLPlatformDevice> usable_devices;
88         OpenCLInfo::get_usable_devices(&usable_devices);
89         if(usable_devices.size() == 0) {
90                 opencl_error("OpenCL: no devices found.");
91                 return;
92         }
93         assert(info.num < usable_devices.size());
94         OpenCLPlatformDevice& platform_device = usable_devices[info.num];
95         cpPlatform = platform_device.platform_id;
96         cdDevice = platform_device.device_id;
97         platform_name = platform_device.platform_name;
98         device_name = platform_device.device_name;
99         VLOG(2) << "Creating new Cycles device for OpenCL platform "
100                 << platform_name << ", device "
101                 << device_name << ".";
102
103         {
104                 /* try to use cached context */
105                 thread_scoped_lock cache_locker;
106                 cxContext = OpenCLCache::get_context(cpPlatform, cdDevice, cache_locker);
107
108                 if(cxContext == NULL) {
109                         /* create context properties array to specify platform */
110                         const cl_context_properties context_props[] = {
111                                 CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform,
112                                 0, 0
113                         };
114
115                         /* create context */
116                         cxContext = clCreateContext(context_props, 1, &cdDevice,
117                                 context_notify_callback, cdDevice, &ciErr);
118
119                         if(opencl_error(ciErr)) {
120                                 opencl_error("OpenCL: clCreateContext failed");
121                                 return;
122                         }
123
124                         /* cache it */
125                         OpenCLCache::store_context(cpPlatform, cdDevice, cxContext, cache_locker);
126                 }
127         }
128
129         cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr);
130         if(opencl_error(ciErr)) {
131                 opencl_error("OpenCL: Error creating command queue");
132                 return;
133         }
134
135         null_mem = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_ONLY, 1, NULL, &ciErr);
136         if(opencl_error(ciErr)) {
137                 opencl_error("OpenCL: Error creating memory buffer for NULL");
138                 return;
139         }
140
141         /* Allocate this right away so that texture_info is placed at offset 0 in the device memory buffers */
142         texture_info.resize(1);
143         memory_manager.alloc("texture_info", texture_info);
144
145         fprintf(stderr, "Device init success\n");
146         device_initialized = true;
147 }
148
149 OpenCLDeviceBase::~OpenCLDeviceBase()
150 {
151         task_pool.stop();
152
153         memory_manager.free();
154
155         if(null_mem)
156                 clReleaseMemObject(CL_MEM_PTR(null_mem));
157
158         ConstMemMap::iterator mt;
159         for(mt = const_mem_map.begin(); mt != const_mem_map.end(); mt++) {
160                 mem_free(*(mt->second));
161                 delete mt->second;
162         }
163
164         base_program.release();
165         if(cqCommandQueue)
166                 clReleaseCommandQueue(cqCommandQueue);
167         if(cxContext)
168                 clReleaseContext(cxContext);
169 }
170
171 void CL_CALLBACK OpenCLDeviceBase::context_notify_callback(const char *err_info,
172         const void * /*private_info*/, size_t /*cb*/, void *user_data)
173 {
174         string device_name = OpenCLInfo::get_device_name((cl_device_id)user_data);
175         fprintf(stderr, "OpenCL error (%s): %s\n", device_name.c_str(), err_info);
176 }
177
178 bool OpenCLDeviceBase::opencl_version_check()
179 {
180         string error;
181         if(!OpenCLInfo::platform_version_check(cpPlatform, &error)) {
182                 opencl_error(error);
183                 return false;
184         }
185         if(!OpenCLInfo::device_version_check(cdDevice, &error)) {
186                 opencl_error(error);
187                 return false;
188         }
189         return true;
190 }
191
192 string OpenCLDeviceBase::device_md5_hash(string kernel_custom_build_options)
193 {
194         MD5Hash md5;
195         char version[256], driver[256], name[256], vendor[256];
196
197         clGetPlatformInfo(cpPlatform, CL_PLATFORM_VENDOR, sizeof(vendor), &vendor, NULL);
198         clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL);
199         clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(name), &name, NULL);
200         clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(driver), &driver, NULL);
201
202         md5.append((uint8_t*)vendor, strlen(vendor));
203         md5.append((uint8_t*)version, strlen(version));
204         md5.append((uint8_t*)name, strlen(name));
205         md5.append((uint8_t*)driver, strlen(driver));
206
207         string options = kernel_build_options();
208         options += kernel_custom_build_options;
209         md5.append((uint8_t*)options.c_str(), options.size());
210
211         return md5.get_hex();
212 }
213
214 bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_features)
215 {
216         VLOG(2) << "Loading kernels for platform " << platform_name
217                 << ", device " << device_name << ".";
218         /* Verify if device was initialized. */
219         if(!device_initialized) {
220                 fprintf(stderr, "OpenCL: failed to initialize device.\n");
221                 return false;
222         }
223
224         /* Verify we have right opencl version. */
225         if(!opencl_version_check())
226                 return false;
227
228         base_program = OpenCLProgram(this, "base", "kernel.cl", build_options_for_base_program(requested_features));
229         base_program.add_kernel(ustring("convert_to_byte"));
230         base_program.add_kernel(ustring("convert_to_half_float"));
231         base_program.add_kernel(ustring("displace"));
232         base_program.add_kernel(ustring("background"));
233         base_program.add_kernel(ustring("bake"));
234         base_program.add_kernel(ustring("zero_buffer"));
235
236         denoising_program = OpenCLProgram(this, "denoising", "filter.cl", "");
237         denoising_program.add_kernel(ustring("filter_divide_shadow"));
238         denoising_program.add_kernel(ustring("filter_get_feature"));
239         denoising_program.add_kernel(ustring("filter_detect_outliers"));
240         denoising_program.add_kernel(ustring("filter_combine_halves"));
241         denoising_program.add_kernel(ustring("filter_construct_transform"));
242         denoising_program.add_kernel(ustring("filter_nlm_calc_difference"));
243         denoising_program.add_kernel(ustring("filter_nlm_blur"));
244         denoising_program.add_kernel(ustring("filter_nlm_calc_weight"));
245         denoising_program.add_kernel(ustring("filter_nlm_update_output"));
246         denoising_program.add_kernel(ustring("filter_nlm_normalize"));
247         denoising_program.add_kernel(ustring("filter_nlm_construct_gramian"));
248         denoising_program.add_kernel(ustring("filter_finalize"));
249         denoising_program.add_kernel(ustring("filter_set_tiles"));
250
251         vector<OpenCLProgram*> programs;
252         programs.push_back(&base_program);
253         programs.push_back(&denoising_program);
254         /* Call actual class to fill the vector with its programs. */
255         if(!load_kernels(requested_features, programs)) {
256                 return false;
257         }
258
259         /* Parallel compilation is supported by Cycles, but currently all OpenCL frameworks
260          * serialize the calls internally, so it's not much use right now.
261          * Note: When enabling parallel compilation, use_stdout in the OpenCLProgram constructor
262          * should be set to false as well. */
263 #if 0
264         TaskPool task_pool;
265         foreach(OpenCLProgram *program, programs) {
266                 task_pool.push(function_bind(&OpenCLProgram::load, program));
267         }
268         task_pool.wait_work();
269
270         foreach(OpenCLProgram *program, programs) {
271                 VLOG(2) << program->get_log();
272                 if(!program->is_loaded()) {
273                         program->report_error();
274                         return false;
275                 }
276         }
277 #else
278         foreach(OpenCLProgram *program, programs) {
279                 program->load();
280                 if(!program->is_loaded()) {
281                         return false;
282                 }
283         }
284 #endif
285
286         return true;
287 }
288
289 void OpenCLDeviceBase::mem_alloc(device_memory& mem)
290 {
291         if(mem.name) {
292                 VLOG(1) << "Buffer allocate: " << mem.name << ", "
293                             << string_human_readable_number(mem.memory_size()) << " bytes. ("
294                             << string_human_readable_size(mem.memory_size()) << ")";
295         }
296
297         size_t size = mem.memory_size();
298
299         /* check there is enough memory available for the allocation */
300         cl_ulong max_alloc_size = 0;
301         clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_alloc_size, NULL);
302
303         if(DebugFlags().opencl.mem_limit) {
304                 max_alloc_size = min(max_alloc_size,
305                                      cl_ulong(DebugFlags().opencl.mem_limit - stats.mem_used));
306         }
307
308         if(size > max_alloc_size) {
309                 string error = "Scene too complex to fit in available memory.";
310                 if(mem.name != NULL) {
311                         error += string_printf(" (allocating buffer %s failed.)", mem.name);
312                 }
313                 set_error(error);
314
315                 return;
316         }
317
318         cl_mem_flags mem_flag;
319         void *mem_ptr = NULL;
320
321         if(mem.type == MEM_READ_ONLY)
322                 mem_flag = CL_MEM_READ_ONLY;
323         else if(mem.type == MEM_WRITE_ONLY)
324                 mem_flag = CL_MEM_WRITE_ONLY;
325         else
326                 mem_flag = CL_MEM_READ_WRITE;
327
328         /* Zero-size allocation might be invoked by render, but not really
329          * supported by OpenCL. Using NULL as device pointer also doesn't really
330          * work for some reason, so for the time being we'll use special case
331          * will null_mem buffer.
332          */
333         if(size != 0) {
334                 mem.device_pointer = (device_ptr)clCreateBuffer(cxContext,
335                                                                 mem_flag,
336                                                                 size,
337                                                                 mem_ptr,
338                                                                 &ciErr);
339                 opencl_assert_err(ciErr, "clCreateBuffer");
340         }
341         else {
342                 mem.device_pointer = null_mem;
343         }
344
345         stats.mem_alloc(size);
346         mem.device_size = size;
347 }
348
349 void OpenCLDeviceBase::mem_copy_to(device_memory& mem)
350 {
351         /* this is blocking */
352         size_t size = mem.memory_size();
353         if(size != 0) {
354                 opencl_assert(clEnqueueWriteBuffer(cqCommandQueue,
355                                                    CL_MEM_PTR(mem.device_pointer),
356                                                    CL_TRUE,
357                                                    0,
358                                                    size,
359                                                    (void*)mem.data_pointer,
360                                                    0,
361                                                    NULL, NULL));
362         }
363 }
364
365 void OpenCLDeviceBase::mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
366 {
367         size_t offset = elem*y*w;
368         size_t size = elem*w*h;
369         assert(size != 0);
370         opencl_assert(clEnqueueReadBuffer(cqCommandQueue,
371                                           CL_MEM_PTR(mem.device_pointer),
372                                           CL_TRUE,
373                                           offset,
374                                           size,
375                                           (uchar*)mem.data_pointer + offset,
376                                           0,
377                                           NULL, NULL));
378 }
379
380 void OpenCLDeviceBase::mem_zero_kernel(device_ptr mem, size_t size)
381 {
382         cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer"));
383
384         size_t global_size[] = {1024, 1024};
385         size_t num_threads = global_size[0] * global_size[1];
386
387         cl_mem d_buffer = CL_MEM_PTR(mem);
388         cl_ulong d_offset = 0;
389         cl_ulong d_size = 0;
390
391         while(d_offset < size) {
392                 d_size = std::min<cl_ulong>(num_threads*sizeof(float4), size - d_offset);
393
394                 kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset);
395
396                 ciErr = clEnqueueNDRangeKernel(cqCommandQueue,
397                                                ckZeroBuffer,
398                                                2,
399                                                NULL,
400                                                global_size,
401                                                NULL,
402                                                0,
403                                                NULL,
404                                                NULL);
405                 opencl_assert_err(ciErr, "clEnqueueNDRangeKernel");
406
407                 d_offset += d_size;
408         }
409 }
410
411 void OpenCLDeviceBase::mem_zero(device_memory& mem)
412 {
413         if(mem.device_pointer) {
414                 if(base_program.is_loaded()) {
415                         mem_zero_kernel(mem.device_pointer, mem.memory_size());
416                 }
417
418                 if(mem.data_pointer) {
419                         memset((void*)mem.data_pointer, 0, mem.memory_size());
420                 }
421
422                 if(!base_program.is_loaded()) {
423                         void* zero = (void*)mem.data_pointer;
424
425                         if(!mem.data_pointer) {
426                                 zero = util_aligned_malloc(mem.memory_size(), 16);
427                                 memset(zero, 0, mem.memory_size());
428                         }
429
430                         opencl_assert(clEnqueueWriteBuffer(cqCommandQueue,
431                                                            CL_MEM_PTR(mem.device_pointer),
432                                                            CL_TRUE,
433                                                            0,
434                                                            mem.memory_size(),
435                                                            zero,
436                                                            0,
437                                                            NULL, NULL));
438
439                         if(!mem.data_pointer) {
440                                 util_aligned_free(zero);
441                         }
442                 }
443         }
444 }
445
446 void OpenCLDeviceBase::mem_free(device_memory& mem)
447 {
448         if(mem.device_pointer) {
449                 if(mem.device_pointer != null_mem) {
450                         opencl_assert(clReleaseMemObject(CL_MEM_PTR(mem.device_pointer)));
451                 }
452                 mem.device_pointer = 0;
453
454                 stats.mem_free(mem.device_size);
455                 mem.device_size = 0;
456         }
457 }
458
459 int OpenCLDeviceBase::mem_address_alignment()
460 {
461         return OpenCLInfo::mem_address_alignment(cdDevice);
462 }
463
464 device_ptr OpenCLDeviceBase::mem_alloc_sub_ptr(device_memory& mem, int offset, int size)
465 {
466         cl_mem_flags mem_flag;
467         if(mem.type == MEM_READ_ONLY)
468                 mem_flag = CL_MEM_READ_ONLY;
469         else if(mem.type == MEM_WRITE_ONLY)
470                 mem_flag = CL_MEM_WRITE_ONLY;
471         else
472                 mem_flag = CL_MEM_READ_WRITE;
473
474         cl_buffer_region info;
475         info.origin = mem.memory_elements_size(offset);
476         info.size = mem.memory_elements_size(size);
477
478         device_ptr sub_buf = (device_ptr) clCreateSubBuffer(CL_MEM_PTR(mem.device_pointer),
479                                                             mem_flag,
480                                                             CL_BUFFER_CREATE_TYPE_REGION,
481                                                             &info,
482                                                             &ciErr);
483         opencl_assert_err(ciErr, "clCreateSubBuffer");
484         return sub_buf;
485 }
486
487 void OpenCLDeviceBase::mem_free_sub_ptr(device_ptr device_pointer)
488 {
489         if(device_pointer && device_pointer != null_mem) {
490                 opencl_assert(clReleaseMemObject(CL_MEM_PTR(device_pointer)));
491         }
492 }
493
494 void OpenCLDeviceBase::const_copy_to(const char *name, void *host, size_t size)
495 {
496         ConstMemMap::iterator i = const_mem_map.find(name);
497         device_vector<uchar> *data;
498
499         if(i == const_mem_map.end()) {
500                 data = new device_vector<uchar>(this, name, MEM_READ_ONLY);
501                 data->resize(size);
502
503                 mem_alloc(*data);
504                 const_mem_map.insert(ConstMemMap::value_type(name, data));
505         }
506         else {
507                 data = i->second;
508         }
509
510         memcpy(data->get_data(), host, size);
511         mem_copy_to(*data);
512 }
513
514 void OpenCLDeviceBase::tex_alloc(device_memory& mem)
515 {
516         VLOG(1) << "Texture allocate: " << mem.name << ", "
517                 << string_human_readable_number(mem.memory_size()) << " bytes. ("
518                 << string_human_readable_size(mem.memory_size()) << ")";
519
520         memory_manager.alloc(mem.name, mem);
521         /* Set the pointer to non-null to keep code that inspects its value from thinking its unallocated. */
522         mem.device_pointer = 1;
523         textures[mem.name] = &mem;
524         textures_need_update = true;
525 }
526
527 void OpenCLDeviceBase::tex_free(device_memory& mem)
528 {
529         if(mem.device_pointer) {
530                 mem.device_pointer = 0;
531
532                 if(memory_manager.free(mem)) {
533                         textures_need_update = true;
534                 }
535
536                 foreach(TexturesMap::value_type& value, textures) {
537                         if(value.second == &mem) {
538                                 textures.erase(value.first);
539                                 break;
540                         }
541                 }
542         }
543 }
544
545 size_t OpenCLDeviceBase::global_size_round_up(int group_size, int global_size)
546 {
547         int r = global_size % group_size;
548         return global_size + ((r == 0)? 0: group_size - r);
549 }
550
551 void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size_t max_workgroup_size)
552 {
553         size_t workgroup_size, max_work_items[3];
554
555         clGetKernelWorkGroupInfo(kernel, cdDevice,
556                 CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL);
557         clGetDeviceInfo(cdDevice,
558                 CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, max_work_items, NULL);
559
560         if(max_workgroup_size > 0 && workgroup_size > max_workgroup_size) {
561                 workgroup_size = max_workgroup_size;
562         }
563
564         /* Try to divide evenly over 2 dimensions. */
565         size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1);
566         size_t local_size[2] = {sqrt_workgroup_size, sqrt_workgroup_size};
567
568         /* Some implementations have max size 1 on 2nd dimension. */
569         if(local_size[1] > max_work_items[1]) {
570                 local_size[0] = workgroup_size/max_work_items[1];
571                 local_size[1] = max_work_items[1];
572         }
573
574         size_t global_size[2] = {global_size_round_up(local_size[0], w),
575                                  global_size_round_up(local_size[1], h)};
576
577         /* Vertical size of 1 is coming from bake/shade kernels where we should
578          * not round anything up because otherwise we'll either be doing too
579          * much work per pixel (if we don't check global ID on Y axis) or will
580          * be checking for global ID to always have Y of 0.
581          */
582         if(h == 1) {
583                 global_size[h] = 1;
584         }
585
586         /* run kernel */
587         opencl_assert(clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL));
588         opencl_assert(clFlush(cqCommandQueue));
589 }
590
591 void OpenCLDeviceBase::set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name)
592 {
593         cl_mem ptr;
594
595         MemMap::iterator i = mem_map.find(name);
596         if(i != mem_map.end()) {
597                 ptr = CL_MEM_PTR(i->second);
598         }
599         else {
600                 /* work around NULL not working, even though the spec says otherwise */
601                 ptr = CL_MEM_PTR(null_mem);
602         }
603
604         opencl_assert(clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void*)&ptr));
605 }
606
607 void OpenCLDeviceBase::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg)
608 {
609         flush_texture_buffers();
610
611         memory_manager.set_kernel_arg_buffers(kernel, narg);
612 }
613
614 void OpenCLDeviceBase::flush_texture_buffers()
615 {
616         if(!textures_need_update) {
617                 return;
618         }
619         textures_need_update = false;
620
621         /* Setup slots for textures. */
622         int num_slots = 0;
623
624         vector<texture_slot_t> texture_slots;
625
626 #define KERNEL_TEX(type, name) \
627         if(textures.find(#name) != textures.end()) { \
628                 texture_slots.push_back(texture_slot_t(#name, num_slots)); \
629         } \
630         num_slots++;
631 #include "kernel/kernel_textures.h"
632
633         int num_data_slots = num_slots;
634
635         foreach(TexturesMap::value_type& tex, textures) {
636                 string name = tex.first;
637
638                 if(string_startswith(name, "__tex_image")) {
639                         int pos = name.rfind("_");
640                         int id = atoi(name.data() + pos + 1);
641                         texture_slots.push_back(texture_slot_t(name,
642                                                                    num_data_slots + id));
643                         num_slots = max(num_slots, num_data_slots + id + 1);
644                 }
645         }
646
647         /* Realloc texture descriptors buffer. */
648         memory_manager.free(texture_info);
649         texture_info.resize(num_slots);
650         memory_manager.alloc("texture_info", texture_info);
651
652         /* Fill in descriptors */
653         foreach(texture_slot_t& slot, texture_slots) {
654                 TextureInfo& info = texture_info[slot.slot];
655
656                 MemoryManager::BufferDescriptor desc = memory_manager.get_descriptor(slot.name);
657                 info.data = desc.offset;
658                 info.cl_buffer = desc.device_buffer;
659
660                 if(string_startswith(slot.name, "__tex_image")) {
661                         device_memory *mem = textures[slot.name];
662
663                         info.width = mem->data_width;
664                         info.height = mem->data_height;
665                         info.depth = mem->data_depth;
666
667                         info.interpolation = mem->interpolation;
668                         info.extension = mem->extension;
669                 }
670         }
671
672         /* Force write of descriptors. */
673         memory_manager.free(texture_info);
674         memory_manager.alloc("texture_info", texture_info);
675 }
676
677 void OpenCLDeviceBase::film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half)
678 {
679         /* cast arguments to cl types */
680         cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
681         cl_mem d_rgba = (rgba_byte)? CL_MEM_PTR(rgba_byte): CL_MEM_PTR(rgba_half);
682         cl_mem d_buffer = CL_MEM_PTR(buffer);
683         cl_int d_x = task.x;
684         cl_int d_y = task.y;
685         cl_int d_w = task.w;
686         cl_int d_h = task.h;
687         cl_float d_sample_scale = 1.0f/(task.sample + 1);
688         cl_int d_offset = task.offset;
689         cl_int d_stride = task.stride;
690
691
692         cl_kernel ckFilmConvertKernel = (rgba_byte)? base_program(ustring("convert_to_byte")): base_program(ustring("convert_to_half_float"));
693
694         cl_uint start_arg_index =
695                 kernel_set_args(ckFilmConvertKernel,
696                                 0,
697                                 d_data,
698                                 d_rgba,
699                                 d_buffer);
700
701         set_kernel_arg_buffers(ckFilmConvertKernel, &start_arg_index);
702
703         start_arg_index += kernel_set_args(ckFilmConvertKernel,
704                                            start_arg_index,
705                                            d_sample_scale,
706                                            d_x,
707                                            d_y,
708                                            d_w,
709                                            d_h,
710                                            d_offset,
711                                            d_stride);
712
713         enqueue_kernel(ckFilmConvertKernel, d_w, d_h);
714 }
715
716 bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr,
717                                                  device_ptr guide_ptr,
718                                                  device_ptr variance_ptr,
719                                                  device_ptr out_ptr,
720                                                  DenoisingTask *task)
721 {
722         int4 rect = task->rect;
723         int w = rect.z-rect.x;
724         int h = rect.w-rect.y;
725         int r = task->nlm_state.r;
726         int f = task->nlm_state.f;
727         float a = task->nlm_state.a;
728         float k_2 = task->nlm_state.k_2;
729
730         cl_mem difference     = CL_MEM_PTR(task->nlm_state.temporary_1_ptr);
731         cl_mem blurDifference = CL_MEM_PTR(task->nlm_state.temporary_2_ptr);
732         cl_mem weightAccum    = CL_MEM_PTR(task->nlm_state.temporary_3_ptr);
733
734         cl_mem image_mem = CL_MEM_PTR(image_ptr);
735         cl_mem guide_mem = CL_MEM_PTR(guide_ptr);
736         cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
737         cl_mem out_mem = CL_MEM_PTR(out_ptr);
738
739         mem_zero_kernel(task->nlm_state.temporary_3_ptr, sizeof(float)*w*h);
740         mem_zero_kernel(out_ptr, sizeof(float)*w*h);
741
742         cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference"));
743         cl_kernel ckNLMBlur           = denoising_program(ustring("filter_nlm_blur"));
744         cl_kernel ckNLMCalcWeight     = denoising_program(ustring("filter_nlm_calc_weight"));
745         cl_kernel ckNLMUpdateOutput   = denoising_program(ustring("filter_nlm_update_output"));
746         cl_kernel ckNLMNormalize      = denoising_program(ustring("filter_nlm_normalize"));
747
748         for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
749                 int dy = i / (2*r+1) - r;
750                 int dx = i % (2*r+1) - r;
751                 int4 local_rect = make_int4(max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy));
752                 kernel_set_args(ckNLMCalcDifference, 0,
753                                 dx, dy, guide_mem, variance_mem,
754                                 difference, local_rect, w, 0, a, k_2);
755                 kernel_set_args(ckNLMBlur, 0,
756                                 difference, blurDifference, local_rect, w, f);
757                 kernel_set_args(ckNLMCalcWeight, 0,
758                                 blurDifference, difference, local_rect, w, f);
759                 kernel_set_args(ckNLMUpdateOutput, 0,
760                                 dx, dy, blurDifference, image_mem,
761                                 out_mem, weightAccum, local_rect, w, f);
762
763                 enqueue_kernel(ckNLMCalcDifference, w, h);
764                 enqueue_kernel(ckNLMBlur,           w, h);
765                 enqueue_kernel(ckNLMCalcWeight,     w, h);
766                 enqueue_kernel(ckNLMBlur,           w, h);
767                 enqueue_kernel(ckNLMUpdateOutput,   w, h);
768         }
769
770         int4 local_rect = make_int4(0, 0, w, h);
771         kernel_set_args(ckNLMNormalize, 0,
772                         out_mem, weightAccum, local_rect, w);
773         enqueue_kernel(ckNLMNormalize, w, h);
774
775         return true;
776 }
777
778 bool OpenCLDeviceBase::denoising_construct_transform(DenoisingTask *task)
779 {
780         cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
781         cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
782         cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
783
784         cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform"));
785
786         kernel_set_args(ckFilterConstructTransform, 0,
787                         buffer_mem,
788                         transform_mem,
789                         rank_mem,
790                         task->filter_area,
791                         task->rect,
792                         task->buffer.pass_stride,
793                         task->radius,
794                         task->pca_threshold);
795
796         enqueue_kernel(ckFilterConstructTransform,
797                        task->storage.w,
798                        task->storage.h,
799                        256);
800
801         return true;
802 }
803
804 bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
805                                              device_ptr color_variance_ptr,
806                                              device_ptr output_ptr,
807                                              DenoisingTask *task)
808 {
809         mem_zero(task->storage.XtWX);
810         mem_zero(task->storage.XtWY);
811
812         cl_mem color_mem = CL_MEM_PTR(color_ptr);
813         cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr);
814         cl_mem output_mem = CL_MEM_PTR(output_ptr);
815
816         cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
817         cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
818         cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
819         cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer);
820         cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer);
821
822         cl_kernel ckNLMCalcDifference   = denoising_program(ustring("filter_nlm_calc_difference"));
823         cl_kernel ckNLMBlur             = denoising_program(ustring("filter_nlm_blur"));
824         cl_kernel ckNLMCalcWeight       = denoising_program(ustring("filter_nlm_calc_weight"));
825         cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian"));
826         cl_kernel ckFinalize            = denoising_program(ustring("filter_finalize"));
827
828         cl_mem difference     = CL_MEM_PTR(task->reconstruction_state.temporary_1_ptr);
829         cl_mem blurDifference = CL_MEM_PTR(task->reconstruction_state.temporary_2_ptr);
830
831         int r = task->radius;
832         int f = 4;
833         float a = 1.0f;
834         for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
835                 int dy = i / (2*r+1) - r;
836                 int dx = i % (2*r+1) - r;
837
838                 int local_rect[4] = {max(0, -dx), max(0, -dy),
839                                      task->reconstruction_state.source_w - max(0, dx),
840                                      task->reconstruction_state.source_h - max(0, dy)};
841
842                 kernel_set_args(ckNLMCalcDifference, 0,
843                                 dx, dy,
844                                 color_mem,
845                                 color_variance_mem,
846                                 difference,
847                                 local_rect,
848                                 task->buffer.w,
849                                 task->buffer.pass_stride,
850                                 a, task->nlm_k_2);
851                 enqueue_kernel(ckNLMCalcDifference,
852                                task->reconstruction_state.source_w,
853                                task->reconstruction_state.source_h);
854
855                 kernel_set_args(ckNLMBlur, 0,
856                                 difference,
857                                 blurDifference,
858                                 local_rect,
859                                 task->buffer.w,
860                                 f);
861                 enqueue_kernel(ckNLMBlur,
862                                task->reconstruction_state.source_w,
863                                task->reconstruction_state.source_h);
864
865                 kernel_set_args(ckNLMCalcWeight, 0,
866                                 blurDifference,
867                                 difference,
868                                 local_rect,
869                                 task->buffer.w,
870                                 f);
871                 enqueue_kernel(ckNLMCalcWeight,
872                                task->reconstruction_state.source_w,
873                                task->reconstruction_state.source_h);
874
875                 /* Reuse previous arguments. */
876                 enqueue_kernel(ckNLMBlur,
877                                task->reconstruction_state.source_w,
878                                task->reconstruction_state.source_h);
879
880                 kernel_set_args(ckNLMConstructGramian, 0,
881                                 dx, dy,
882                                 blurDifference,
883                                 buffer_mem,
884                                 transform_mem,
885                                 rank_mem,
886                                 XtWX_mem,
887                                 XtWY_mem,
888                                 local_rect,
889                                 task->reconstruction_state.filter_rect,
890                                 task->buffer.w,
891                                 task->buffer.h,
892                                 f,
893                             task->buffer.pass_stride);
894                 enqueue_kernel(ckNLMConstructGramian,
895                                task->reconstruction_state.source_w,
896                                task->reconstruction_state.source_h,
897                                256);
898         }
899
900         kernel_set_args(ckFinalize, 0,
901                         task->buffer.w,
902                         task->buffer.h,
903                         output_mem,
904                         rank_mem,
905                         XtWX_mem,
906                         XtWY_mem,
907                         task->filter_area,
908                         task->reconstruction_state.buffer_params,
909                         task->render_buffer.samples);
910         enqueue_kernel(ckFinalize,
911                        task->reconstruction_state.source_w,
912                        task->reconstruction_state.source_h);
913
914         return true;
915 }
916
917 bool OpenCLDeviceBase::denoising_combine_halves(device_ptr a_ptr,
918                                                 device_ptr b_ptr,
919                                                 device_ptr mean_ptr,
920                                                 device_ptr variance_ptr,
921                                                 int r, int4 rect,
922                                                 DenoisingTask *task)
923 {
924         cl_mem a_mem = CL_MEM_PTR(a_ptr);
925         cl_mem b_mem = CL_MEM_PTR(b_ptr);
926         cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
927         cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
928
929         cl_kernel ckFilterCombineHalves = denoising_program(ustring("filter_combine_halves"));
930
931         kernel_set_args(ckFilterCombineHalves, 0,
932                         mean_mem,
933                         variance_mem,
934                         a_mem,
935                         b_mem,
936                         rect,
937                         r);
938         enqueue_kernel(ckFilterCombineHalves,
939                        task->rect.z-task->rect.x,
940                        task->rect.w-task->rect.y);
941
942         return true;
943 }
944
945 bool OpenCLDeviceBase::denoising_divide_shadow(device_ptr a_ptr,
946                                                device_ptr b_ptr,
947                                                device_ptr sample_variance_ptr,
948                                                device_ptr sv_variance_ptr,
949                                                device_ptr buffer_variance_ptr,
950                                                DenoisingTask *task)
951 {
952         cl_mem a_mem = CL_MEM_PTR(a_ptr);
953         cl_mem b_mem = CL_MEM_PTR(b_ptr);
954         cl_mem sample_variance_mem = CL_MEM_PTR(sample_variance_ptr);
955         cl_mem sv_variance_mem = CL_MEM_PTR(sv_variance_ptr);
956         cl_mem buffer_variance_mem = CL_MEM_PTR(buffer_variance_ptr);
957
958         cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer);
959
960         cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow"));
961
962         kernel_set_args(ckFilterDivideShadow, 0,
963                         task->render_buffer.samples,
964                         tiles_mem,
965                         a_mem,
966                         b_mem,
967                         sample_variance_mem,
968                         sv_variance_mem,
969                         buffer_variance_mem,
970                         task->rect,
971                         task->render_buffer.pass_stride,
972                         task->render_buffer.denoising_data_offset);
973         enqueue_kernel(ckFilterDivideShadow,
974                        task->rect.z-task->rect.x,
975                        task->rect.w-task->rect.y);
976
977         return true;
978 }
979
980 bool OpenCLDeviceBase::denoising_get_feature(int mean_offset,
981                                              int variance_offset,
982                                              device_ptr mean_ptr,
983                                              device_ptr variance_ptr,
984                                              DenoisingTask *task)
985 {
986         cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
987         cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
988
989         cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer);
990
991         cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature"));
992
993         kernel_set_args(ckFilterGetFeature, 0,
994                         task->render_buffer.samples,
995                         tiles_mem,
996                         mean_offset,
997                         variance_offset,
998                         mean_mem,
999                         variance_mem,
1000                         task->rect,
1001                         task->render_buffer.pass_stride,
1002                         task->render_buffer.denoising_data_offset);
1003         enqueue_kernel(ckFilterGetFeature,
1004                        task->rect.z-task->rect.x,
1005                        task->rect.w-task->rect.y);
1006
1007         return true;
1008 }
1009
1010 bool OpenCLDeviceBase::denoising_detect_outliers(device_ptr image_ptr,
1011                                                  device_ptr variance_ptr,
1012                                                  device_ptr depth_ptr,
1013                                                  device_ptr output_ptr,
1014                                                  DenoisingTask *task)
1015 {
1016         cl_mem image_mem = CL_MEM_PTR(image_ptr);
1017         cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
1018         cl_mem depth_mem = CL_MEM_PTR(depth_ptr);
1019         cl_mem output_mem = CL_MEM_PTR(output_ptr);
1020
1021         cl_kernel ckFilterDetectOutliers = denoising_program(ustring("filter_detect_outliers"));
1022
1023         kernel_set_args(ckFilterDetectOutliers, 0,
1024                         image_mem,
1025                         variance_mem,
1026                         depth_mem,
1027                         output_mem,
1028                         task->rect,
1029                         task->buffer.pass_stride);
1030         enqueue_kernel(ckFilterDetectOutliers,
1031                        task->rect.z-task->rect.x,
1032                        task->rect.w-task->rect.y);
1033
1034         return true;
1035 }
1036
1037 bool OpenCLDeviceBase::denoising_set_tiles(device_ptr *buffers,
1038                                            DenoisingTask *task)
1039 {
1040         mem_alloc(task->tiles_mem);
1041         mem_copy_to(task->tiles_mem);
1042
1043         cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer);
1044
1045         cl_kernel ckFilterSetTiles = denoising_program(ustring("filter_set_tiles"));
1046
1047         kernel_set_args(ckFilterSetTiles, 0, tiles_mem);
1048         for(int i = 0; i < 9; i++) {
1049                 cl_mem buffer_mem = CL_MEM_PTR(buffers[i]);
1050                 kernel_set_args(ckFilterSetTiles, i+1, buffer_mem);
1051         }
1052
1053         enqueue_kernel(ckFilterSetTiles, 1, 1);
1054
1055         return true;
1056 }
1057
1058 void OpenCLDeviceBase::denoise(RenderTile &rtile, const DeviceTask &task)
1059 {
1060         DenoisingTask denoising(this);
1061
1062         denoising.functions.set_tiles = function_bind(&OpenCLDeviceBase::denoising_set_tiles, this, _1, &denoising);
1063         denoising.functions.construct_transform = function_bind(&OpenCLDeviceBase::denoising_construct_transform, this, &denoising);
1064         denoising.functions.reconstruct = function_bind(&OpenCLDeviceBase::denoising_reconstruct, this, _1, _2, _3, &denoising);
1065         denoising.functions.divide_shadow = function_bind(&OpenCLDeviceBase::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
1066         denoising.functions.non_local_means = function_bind(&OpenCLDeviceBase::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
1067         denoising.functions.combine_halves = function_bind(&OpenCLDeviceBase::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
1068         denoising.functions.get_feature = function_bind(&OpenCLDeviceBase::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
1069         denoising.functions.detect_outliers = function_bind(&OpenCLDeviceBase::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
1070
1071         denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
1072         denoising.render_buffer.samples = rtile.sample;
1073
1074         RenderTile rtiles[9];
1075         rtiles[4] = rtile;
1076         task.map_neighbor_tiles(rtiles, this);
1077         denoising.tiles_from_rendertiles(rtiles);
1078
1079         denoising.init_from_devicetask(task);
1080
1081         denoising.run_denoising();
1082
1083         task.unmap_neighbor_tiles(rtiles, this);
1084 }
1085
1086 void OpenCLDeviceBase::shader(DeviceTask& task)
1087 {
1088         /* cast arguments to cl types */
1089         cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
1090         cl_mem d_input = CL_MEM_PTR(task.shader_input);
1091         cl_mem d_output = CL_MEM_PTR(task.shader_output);
1092         cl_int d_shader_eval_type = task.shader_eval_type;
1093         cl_int d_shader_filter = task.shader_filter;
1094         cl_int d_shader_x = task.shader_x;
1095         cl_int d_shader_w = task.shader_w;
1096         cl_int d_offset = task.offset;
1097
1098         cl_kernel kernel;
1099
1100         if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
1101                 kernel = base_program(ustring("bake"));
1102         }
1103         else if(task.shader_eval_type == SHADER_EVAL_DISPLACE) {
1104                 kernel = base_program(ustring("displace"));
1105         }
1106         else {
1107                 kernel = base_program(ustring("background"));
1108         }
1109
1110         cl_uint start_arg_index =
1111                 kernel_set_args(kernel,
1112                                 0,
1113                                 d_data,
1114                                 d_input,
1115                                 d_output);
1116
1117         set_kernel_arg_buffers(kernel, &start_arg_index);
1118
1119         start_arg_index += kernel_set_args(kernel,
1120                                            start_arg_index,
1121                                            d_shader_eval_type);
1122         if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
1123                 start_arg_index += kernel_set_args(kernel,
1124                                                    start_arg_index,
1125                                                    d_shader_filter);
1126         }
1127         start_arg_index += kernel_set_args(kernel,
1128                                            start_arg_index,
1129                                            d_shader_x,
1130                                            d_shader_w,
1131                                            d_offset);
1132
1133         for(int sample = 0; sample < task.num_samples; sample++) {
1134
1135                 if(task.get_cancel())
1136                         break;
1137
1138                 kernel_set_args(kernel, start_arg_index, sample);
1139
1140                 enqueue_kernel(kernel, task.shader_w, 1);
1141
1142                 clFinish(cqCommandQueue);
1143
1144                 task.update_progress(NULL);
1145         }
1146 }
1147
1148 string OpenCLDeviceBase::kernel_build_options(const string *debug_src)
1149 {
1150         string build_options = "-cl-no-signed-zeros -cl-mad-enable ";
1151
1152         if(platform_name == "NVIDIA CUDA") {
1153                 build_options += "-D__KERNEL_OPENCL_NVIDIA__ "
1154                                  "-cl-nv-maxrregcount=32 "
1155                                  "-cl-nv-verbose ";
1156
1157                 uint compute_capability_major, compute_capability_minor;
1158                 clGetDeviceInfo(cdDevice, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,
1159                                 sizeof(cl_uint), &compute_capability_major, NULL);
1160                 clGetDeviceInfo(cdDevice, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,
1161                                 sizeof(cl_uint), &compute_capability_minor, NULL);
1162
1163                 build_options += string_printf("-D__COMPUTE_CAPABILITY__=%u ",
1164                                                compute_capability_major * 100 +
1165                                                compute_capability_minor * 10);
1166         }
1167
1168         else if(platform_name == "Apple")
1169                 build_options += "-D__KERNEL_OPENCL_APPLE__ ";
1170
1171         else if(platform_name == "AMD Accelerated Parallel Processing")
1172                 build_options += "-D__KERNEL_OPENCL_AMD__ ";
1173
1174         else if(platform_name == "Intel(R) OpenCL") {
1175                 build_options += "-D__KERNEL_OPENCL_INTEL_CPU__ ";
1176
1177                 /* Options for gdb source level kernel debugging.
1178                  * this segfaults on linux currently.
1179                  */
1180                 if(OpenCLInfo::use_debug() && debug_src)
1181                         build_options += "-g -s \"" + *debug_src + "\" ";
1182         }
1183
1184         if(OpenCLInfo::use_debug())
1185                 build_options += "-D__KERNEL_OPENCL_DEBUG__ ";
1186
1187 #ifdef WITH_CYCLES_DEBUG
1188         build_options += "-D__KERNEL_DEBUG__ ";
1189 #endif
1190
1191         return build_options;
1192 }
1193
1194 /* TODO(sergey): In the future we can use variadic templates, once
1195  * C++0x is allowed. Should allow to clean this up a bit.
1196  */
1197 int OpenCLDeviceBase::kernel_set_args(cl_kernel kernel,
1198                     int start_argument_index,
1199                     const ArgumentWrapper& arg1,
1200                     const ArgumentWrapper& arg2,
1201                     const ArgumentWrapper& arg3,
1202                     const ArgumentWrapper& arg4,
1203                     const ArgumentWrapper& arg5,
1204                     const ArgumentWrapper& arg6,
1205                     const ArgumentWrapper& arg7,
1206                     const ArgumentWrapper& arg8,
1207                     const ArgumentWrapper& arg9,
1208                     const ArgumentWrapper& arg10,
1209                     const ArgumentWrapper& arg11,
1210                     const ArgumentWrapper& arg12,
1211                     const ArgumentWrapper& arg13,
1212                     const ArgumentWrapper& arg14,
1213                     const ArgumentWrapper& arg15,
1214                     const ArgumentWrapper& arg16,
1215                     const ArgumentWrapper& arg17,
1216                     const ArgumentWrapper& arg18,
1217                     const ArgumentWrapper& arg19,
1218                     const ArgumentWrapper& arg20,
1219                     const ArgumentWrapper& arg21,
1220                     const ArgumentWrapper& arg22,
1221                     const ArgumentWrapper& arg23,
1222                     const ArgumentWrapper& arg24,
1223                     const ArgumentWrapper& arg25,
1224                     const ArgumentWrapper& arg26,
1225                     const ArgumentWrapper& arg27,
1226                     const ArgumentWrapper& arg28,
1227                     const ArgumentWrapper& arg29,
1228                     const ArgumentWrapper& arg30,
1229                     const ArgumentWrapper& arg31,
1230                     const ArgumentWrapper& arg32,
1231                     const ArgumentWrapper& arg33)
1232 {
1233         int current_arg_index = 0;
1234 #define FAKE_VARARG_HANDLE_ARG(arg) \
1235         do { \
1236                 if(arg.pointer != NULL) { \
1237                         opencl_assert(clSetKernelArg( \
1238                                 kernel, \
1239                                 start_argument_index + current_arg_index, \
1240                                 arg.size, arg.pointer)); \
1241                         ++current_arg_index; \
1242                 } \
1243                 else { \
1244                         return current_arg_index; \
1245                 } \
1246         } while(false)
1247         FAKE_VARARG_HANDLE_ARG(arg1);
1248         FAKE_VARARG_HANDLE_ARG(arg2);
1249         FAKE_VARARG_HANDLE_ARG(arg3);
1250         FAKE_VARARG_HANDLE_ARG(arg4);
1251         FAKE_VARARG_HANDLE_ARG(arg5);
1252         FAKE_VARARG_HANDLE_ARG(arg6);
1253         FAKE_VARARG_HANDLE_ARG(arg7);
1254         FAKE_VARARG_HANDLE_ARG(arg8);
1255         FAKE_VARARG_HANDLE_ARG(arg9);
1256         FAKE_VARARG_HANDLE_ARG(arg10);
1257         FAKE_VARARG_HANDLE_ARG(arg11);
1258         FAKE_VARARG_HANDLE_ARG(arg12);
1259         FAKE_VARARG_HANDLE_ARG(arg13);
1260         FAKE_VARARG_HANDLE_ARG(arg14);
1261         FAKE_VARARG_HANDLE_ARG(arg15);
1262         FAKE_VARARG_HANDLE_ARG(arg16);
1263         FAKE_VARARG_HANDLE_ARG(arg17);
1264         FAKE_VARARG_HANDLE_ARG(arg18);
1265         FAKE_VARARG_HANDLE_ARG(arg19);
1266         FAKE_VARARG_HANDLE_ARG(arg20);
1267         FAKE_VARARG_HANDLE_ARG(arg21);
1268         FAKE_VARARG_HANDLE_ARG(arg22);
1269         FAKE_VARARG_HANDLE_ARG(arg23);
1270         FAKE_VARARG_HANDLE_ARG(arg24);
1271         FAKE_VARARG_HANDLE_ARG(arg25);
1272         FAKE_VARARG_HANDLE_ARG(arg26);
1273         FAKE_VARARG_HANDLE_ARG(arg27);
1274         FAKE_VARARG_HANDLE_ARG(arg28);
1275         FAKE_VARARG_HANDLE_ARG(arg29);
1276         FAKE_VARARG_HANDLE_ARG(arg30);
1277         FAKE_VARARG_HANDLE_ARG(arg31);
1278         FAKE_VARARG_HANDLE_ARG(arg32);
1279         FAKE_VARARG_HANDLE_ARG(arg33);
1280 #undef FAKE_VARARG_HANDLE_ARG
1281         return current_arg_index;
1282 }
1283
1284 void OpenCLDeviceBase::release_kernel_safe(cl_kernel kernel)
1285 {
1286         if(kernel) {
1287                 clReleaseKernel(kernel);
1288         }
1289 }
1290
1291 void OpenCLDeviceBase::release_mem_object_safe(cl_mem mem)
1292 {
1293         if(mem != NULL) {
1294                 clReleaseMemObject(mem);
1295         }
1296 }
1297
1298 void OpenCLDeviceBase::release_program_safe(cl_program program)
1299 {
1300         if(program) {
1301                 clReleaseProgram(program);
1302         }
1303 }
1304
1305 /* ** Those guys are for workign around some compiler-specific bugs ** */
1306
1307 cl_program OpenCLDeviceBase::load_cached_kernel(
1308         ustring key,
1309         thread_scoped_lock& cache_locker)
1310 {
1311         return OpenCLCache::get_program(cpPlatform,
1312                                         cdDevice,
1313                                         key,
1314                                         cache_locker);
1315 }
1316
1317 void OpenCLDeviceBase::store_cached_kernel(
1318         cl_program program,
1319         ustring key,
1320         thread_scoped_lock& cache_locker)
1321 {
1322         OpenCLCache::store_program(cpPlatform,
1323                                    cdDevice,
1324                                    program,
1325                                    key,
1326                                    cache_locker);
1327 }
1328
1329 string OpenCLDeviceBase::build_options_for_base_program(
1330         const DeviceRequestedFeatures& requested_features)
1331 {
1332         /* TODO(sergey): By default we compile all features, meaning
1333          * mega kernel is not getting feature-based optimizations.
1334          *
1335          * Ideally we need always compile kernel with as less features
1336          * enabled as possible to keep performance at it's max.
1337          */
1338
1339         /* For now disable baking when not in use as this has major
1340          * impact on kernel build times.
1341          */
1342         if(!requested_features.use_baking) {
1343                 return "-D__NO_BAKING__";
1344         }
1345
1346         return "";
1347 }
1348
1349 CCL_NAMESPACE_END
1350
1351 #endif