2 * Copyright 2011-2013 Blender Foundation
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
19 #include "device/opencl/opencl.h"
21 #include "kernel/kernel_types.h"
23 #include "util/util_algorithm.h"
24 #include "util/util_foreach.h"
25 #include "util/util_logging.h"
26 #include "util/util_md5.h"
27 #include "util/util_path.h"
28 #include "util/util_time.h"
32 struct texture_slot_t {
33 texture_slot_t(const string& name, int slot)
41 bool OpenCLDeviceBase::opencl_error(cl_int err)
43 if(err != CL_SUCCESS) {
44 string message = string_printf("OpenCL error (%d): %s", err, clewErrorString(err));
47 fprintf(stderr, "%s\n", message.c_str());
54 void OpenCLDeviceBase::opencl_error(const string& message)
58 fprintf(stderr, "%s\n", message.c_str());
61 void OpenCLDeviceBase::opencl_assert_err(cl_int err, const char* where)
63 if(err != CL_SUCCESS) {
64 string message = string_printf("OpenCL error (%d): %s in %s", err, clewErrorString(err), where);
67 fprintf(stderr, "%s\n", message.c_str());
74 OpenCLDeviceBase::OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool background_)
75 : Device(info, stats, background_),
77 texture_info(this, "__texture_info", MEM_TEXTURE)
82 cqCommandQueue = NULL;
84 device_initialized = false;
85 textures_need_update = true;
87 vector<OpenCLPlatformDevice> usable_devices;
88 OpenCLInfo::get_usable_devices(&usable_devices);
89 if(usable_devices.size() == 0) {
90 opencl_error("OpenCL: no devices found.");
93 assert(info.num < usable_devices.size());
94 OpenCLPlatformDevice& platform_device = usable_devices[info.num];
95 cpPlatform = platform_device.platform_id;
96 cdDevice = platform_device.device_id;
97 platform_name = platform_device.platform_name;
98 device_name = platform_device.device_name;
99 VLOG(2) << "Creating new Cycles device for OpenCL platform "
100 << platform_name << ", device "
101 << device_name << ".";
104 /* try to use cached context */
105 thread_scoped_lock cache_locker;
106 cxContext = OpenCLCache::get_context(cpPlatform, cdDevice, cache_locker);
108 if(cxContext == NULL) {
109 /* create context properties array to specify platform */
110 const cl_context_properties context_props[] = {
111 CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform,
116 cxContext = clCreateContext(context_props, 1, &cdDevice,
117 context_notify_callback, cdDevice, &ciErr);
119 if(opencl_error(ciErr)) {
120 opencl_error("OpenCL: clCreateContext failed");
125 OpenCLCache::store_context(cpPlatform, cdDevice, cxContext, cache_locker);
129 cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr);
130 if(opencl_error(ciErr)) {
131 opencl_error("OpenCL: Error creating command queue");
135 null_mem = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_ONLY, 1, NULL, &ciErr);
136 if(opencl_error(ciErr)) {
137 opencl_error("OpenCL: Error creating memory buffer for NULL");
141 /* Allocate this right away so that texture_info is placed at offset 0 in the device memory buffers */
142 texture_info.resize(1);
143 memory_manager.alloc("texture_info", texture_info);
145 fprintf(stderr, "Device init success\n");
146 device_initialized = true;
149 OpenCLDeviceBase::~OpenCLDeviceBase()
153 memory_manager.free();
156 clReleaseMemObject(CL_MEM_PTR(null_mem));
158 ConstMemMap::iterator mt;
159 for(mt = const_mem_map.begin(); mt != const_mem_map.end(); mt++) {
163 base_program.release();
165 clReleaseCommandQueue(cqCommandQueue);
167 clReleaseContext(cxContext);
170 void CL_CALLBACK OpenCLDeviceBase::context_notify_callback(const char *err_info,
171 const void * /*private_info*/, size_t /*cb*/, void *user_data)
173 string device_name = OpenCLInfo::get_device_name((cl_device_id)user_data);
174 fprintf(stderr, "OpenCL error (%s): %s\n", device_name.c_str(), err_info);
177 bool OpenCLDeviceBase::opencl_version_check()
180 if(!OpenCLInfo::platform_version_check(cpPlatform, &error)) {
184 if(!OpenCLInfo::device_version_check(cdDevice, &error)) {
191 string OpenCLDeviceBase::device_md5_hash(string kernel_custom_build_options)
194 char version[256], driver[256], name[256], vendor[256];
196 clGetPlatformInfo(cpPlatform, CL_PLATFORM_VENDOR, sizeof(vendor), &vendor, NULL);
197 clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL);
198 clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(name), &name, NULL);
199 clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(driver), &driver, NULL);
201 md5.append((uint8_t*)vendor, strlen(vendor));
202 md5.append((uint8_t*)version, strlen(version));
203 md5.append((uint8_t*)name, strlen(name));
204 md5.append((uint8_t*)driver, strlen(driver));
206 string options = kernel_build_options();
207 options += kernel_custom_build_options;
208 md5.append((uint8_t*)options.c_str(), options.size());
210 return md5.get_hex();
213 bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_features)
215 VLOG(2) << "Loading kernels for platform " << platform_name
216 << ", device " << device_name << ".";
217 /* Verify if device was initialized. */
218 if(!device_initialized) {
219 fprintf(stderr, "OpenCL: failed to initialize device.\n");
223 /* Verify we have right opencl version. */
224 if(!opencl_version_check())
227 base_program = OpenCLProgram(this, "base", "kernel.cl", build_options_for_base_program(requested_features));
228 base_program.add_kernel(ustring("convert_to_byte"));
229 base_program.add_kernel(ustring("convert_to_half_float"));
230 base_program.add_kernel(ustring("displace"));
231 base_program.add_kernel(ustring("background"));
232 base_program.add_kernel(ustring("bake"));
233 base_program.add_kernel(ustring("zero_buffer"));
235 denoising_program = OpenCLProgram(this, "denoising", "filter.cl", "");
236 denoising_program.add_kernel(ustring("filter_divide_shadow"));
237 denoising_program.add_kernel(ustring("filter_get_feature"));
238 denoising_program.add_kernel(ustring("filter_detect_outliers"));
239 denoising_program.add_kernel(ustring("filter_combine_halves"));
240 denoising_program.add_kernel(ustring("filter_construct_transform"));
241 denoising_program.add_kernel(ustring("filter_nlm_calc_difference"));
242 denoising_program.add_kernel(ustring("filter_nlm_blur"));
243 denoising_program.add_kernel(ustring("filter_nlm_calc_weight"));
244 denoising_program.add_kernel(ustring("filter_nlm_update_output"));
245 denoising_program.add_kernel(ustring("filter_nlm_normalize"));
246 denoising_program.add_kernel(ustring("filter_nlm_construct_gramian"));
247 denoising_program.add_kernel(ustring("filter_finalize"));
248 denoising_program.add_kernel(ustring("filter_set_tiles"));
250 vector<OpenCLProgram*> programs;
251 programs.push_back(&base_program);
252 programs.push_back(&denoising_program);
253 /* Call actual class to fill the vector with its programs. */
254 if(!load_kernels(requested_features, programs)) {
258 /* Parallel compilation is supported by Cycles, but currently all OpenCL frameworks
259 * serialize the calls internally, so it's not much use right now.
260 * Note: When enabling parallel compilation, use_stdout in the OpenCLProgram constructor
261 * should be set to false as well. */
264 foreach(OpenCLProgram *program, programs) {
265 task_pool.push(function_bind(&OpenCLProgram::load, program));
267 task_pool.wait_work();
269 foreach(OpenCLProgram *program, programs) {
270 VLOG(2) << program->get_log();
271 if(!program->is_loaded()) {
272 program->report_error();
277 foreach(OpenCLProgram *program, programs) {
279 if(!program->is_loaded()) {
288 void OpenCLDeviceBase::mem_alloc(device_memory& mem)
291 VLOG(1) << "Buffer allocate: " << mem.name << ", "
292 << string_human_readable_number(mem.memory_size()) << " bytes. ("
293 << string_human_readable_size(mem.memory_size()) << ")";
296 size_t size = mem.memory_size();
298 /* check there is enough memory available for the allocation */
299 cl_ulong max_alloc_size = 0;
300 clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_alloc_size, NULL);
302 if(DebugFlags().opencl.mem_limit) {
303 max_alloc_size = min(max_alloc_size,
304 cl_ulong(DebugFlags().opencl.mem_limit - stats.mem_used));
307 if(size > max_alloc_size) {
308 string error = "Scene too complex to fit in available memory.";
309 if(mem.name != NULL) {
310 error += string_printf(" (allocating buffer %s failed.)", mem.name);
317 cl_mem_flags mem_flag;
318 void *mem_ptr = NULL;
320 if(mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE)
321 mem_flag = CL_MEM_READ_ONLY;
323 mem_flag = CL_MEM_READ_WRITE;
325 /* Zero-size allocation might be invoked by render, but not really
326 * supported by OpenCL. Using NULL as device pointer also doesn't really
327 * work for some reason, so for the time being we'll use special case
328 * will null_mem buffer.
331 mem.device_pointer = (device_ptr)clCreateBuffer(cxContext,
336 opencl_assert_err(ciErr, "clCreateBuffer");
339 mem.device_pointer = null_mem;
342 stats.mem_alloc(size);
343 mem.device_size = size;
346 void OpenCLDeviceBase::mem_copy_to(device_memory& mem)
348 if(mem.type == MEM_TEXTURE) {
353 if(!mem.device_pointer) {
357 /* this is blocking */
358 size_t size = mem.memory_size();
360 opencl_assert(clEnqueueWriteBuffer(cqCommandQueue,
361 CL_MEM_PTR(mem.device_pointer),
372 void OpenCLDeviceBase::mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
374 size_t offset = elem*y*w;
375 size_t size = elem*w*h;
377 opencl_assert(clEnqueueReadBuffer(cqCommandQueue,
378 CL_MEM_PTR(mem.device_pointer),
382 (uchar*)mem.host_pointer + offset,
387 void OpenCLDeviceBase::mem_zero_kernel(device_ptr mem, size_t size)
389 cl_kernel ckZeroBuffer = base_program(ustring("zero_buffer"));
391 size_t global_size[] = {1024, 1024};
392 size_t num_threads = global_size[0] * global_size[1];
394 cl_mem d_buffer = CL_MEM_PTR(mem);
395 cl_ulong d_offset = 0;
398 while(d_offset < size) {
399 d_size = std::min<cl_ulong>(num_threads*sizeof(float4), size - d_offset);
401 kernel_set_args(ckZeroBuffer, 0, d_buffer, d_size, d_offset);
403 ciErr = clEnqueueNDRangeKernel(cqCommandQueue,
412 opencl_assert_err(ciErr, "clEnqueueNDRangeKernel");
418 void OpenCLDeviceBase::mem_zero(device_memory& mem)
420 if(!mem.device_pointer) {
424 if(mem.device_pointer) {
425 if(base_program.is_loaded()) {
426 mem_zero_kernel(mem.device_pointer, mem.memory_size());
429 if(mem.host_pointer) {
430 memset(mem.host_pointer, 0, mem.memory_size());
433 if(!base_program.is_loaded()) {
434 void* zero = mem.host_pointer;
436 if(!mem.host_pointer) {
437 zero = util_aligned_malloc(mem.memory_size(), 16);
438 memset(zero, 0, mem.memory_size());
441 opencl_assert(clEnqueueWriteBuffer(cqCommandQueue,
442 CL_MEM_PTR(mem.device_pointer),
450 if(!mem.host_pointer) {
451 util_aligned_free(zero);
457 void OpenCLDeviceBase::mem_free(device_memory& mem)
459 if(mem.type == MEM_TEXTURE) {
463 if(mem.device_pointer) {
464 if(mem.device_pointer != null_mem) {
465 opencl_assert(clReleaseMemObject(CL_MEM_PTR(mem.device_pointer)));
467 mem.device_pointer = 0;
469 stats.mem_free(mem.device_size);
475 int OpenCLDeviceBase::mem_address_alignment()
477 return OpenCLInfo::mem_address_alignment(cdDevice);
480 device_ptr OpenCLDeviceBase::mem_alloc_sub_ptr(device_memory& mem, int offset, int size)
482 cl_mem_flags mem_flag;
483 if(mem.type == MEM_READ_ONLY || mem.type == MEM_TEXTURE)
484 mem_flag = CL_MEM_READ_ONLY;
486 mem_flag = CL_MEM_READ_WRITE;
488 cl_buffer_region info;
489 info.origin = mem.memory_elements_size(offset);
490 info.size = mem.memory_elements_size(size);
492 device_ptr sub_buf = (device_ptr) clCreateSubBuffer(CL_MEM_PTR(mem.device_pointer),
494 CL_BUFFER_CREATE_TYPE_REGION,
497 opencl_assert_err(ciErr, "clCreateSubBuffer");
501 void OpenCLDeviceBase::mem_free_sub_ptr(device_ptr device_pointer)
503 if(device_pointer && device_pointer != null_mem) {
504 opencl_assert(clReleaseMemObject(CL_MEM_PTR(device_pointer)));
508 void OpenCLDeviceBase::const_copy_to(const char *name, void *host, size_t size)
510 ConstMemMap::iterator i = const_mem_map.find(name);
511 device_vector<uchar> *data;
513 if(i == const_mem_map.end()) {
514 data = new device_vector<uchar>(this, name, MEM_READ_ONLY);
516 const_mem_map.insert(ConstMemMap::value_type(name, data));
522 memcpy(data->data(), host, size);
523 data->copy_to_device();
526 void OpenCLDeviceBase::tex_alloc(device_memory& mem)
528 VLOG(1) << "Texture allocate: " << mem.name << ", "
529 << string_human_readable_number(mem.memory_size()) << " bytes. ("
530 << string_human_readable_size(mem.memory_size()) << ")";
532 memory_manager.alloc(mem.name, mem);
533 /* Set the pointer to non-null to keep code that inspects its value from thinking its unallocated. */
534 mem.device_pointer = 1;
535 textures[mem.name] = &mem;
536 textures_need_update = true;
539 void OpenCLDeviceBase::tex_free(device_memory& mem)
541 if(mem.device_pointer) {
542 mem.device_pointer = 0;
544 if(memory_manager.free(mem)) {
545 textures_need_update = true;
548 foreach(TexturesMap::value_type& value, textures) {
549 if(value.second == &mem) {
550 textures.erase(value.first);
557 size_t OpenCLDeviceBase::global_size_round_up(int group_size, int global_size)
559 int r = global_size % group_size;
560 return global_size + ((r == 0)? 0: group_size - r);
563 void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size_t max_workgroup_size)
565 size_t workgroup_size, max_work_items[3];
567 clGetKernelWorkGroupInfo(kernel, cdDevice,
568 CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL);
569 clGetDeviceInfo(cdDevice,
570 CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, max_work_items, NULL);
572 if(max_workgroup_size > 0 && workgroup_size > max_workgroup_size) {
573 workgroup_size = max_workgroup_size;
576 /* Try to divide evenly over 2 dimensions. */
577 size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1);
578 size_t local_size[2] = {sqrt_workgroup_size, sqrt_workgroup_size};
580 /* Some implementations have max size 1 on 2nd dimension. */
581 if(local_size[1] > max_work_items[1]) {
582 local_size[0] = workgroup_size/max_work_items[1];
583 local_size[1] = max_work_items[1];
586 size_t global_size[2] = {global_size_round_up(local_size[0], w),
587 global_size_round_up(local_size[1], h)};
589 /* Vertical size of 1 is coming from bake/shade kernels where we should
590 * not round anything up because otherwise we'll either be doing too
591 * much work per pixel (if we don't check global ID on Y axis) or will
592 * be checking for global ID to always have Y of 0.
599 opencl_assert(clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL));
600 opencl_assert(clFlush(cqCommandQueue));
603 void OpenCLDeviceBase::set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name)
607 MemMap::iterator i = mem_map.find(name);
608 if(i != mem_map.end()) {
609 ptr = CL_MEM_PTR(i->second);
612 /* work around NULL not working, even though the spec says otherwise */
613 ptr = CL_MEM_PTR(null_mem);
616 opencl_assert(clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void*)&ptr));
619 void OpenCLDeviceBase::set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg)
621 flush_texture_buffers();
623 memory_manager.set_kernel_arg_buffers(kernel, narg);
626 void OpenCLDeviceBase::flush_texture_buffers()
628 if(!textures_need_update) {
631 textures_need_update = false;
633 /* Setup slots for textures. */
636 vector<texture_slot_t> texture_slots;
638 #define KERNEL_TEX(type, name) \
639 if(textures.find(#name) != textures.end()) { \
640 texture_slots.push_back(texture_slot_t(#name, num_slots)); \
643 #include "kernel/kernel_textures.h"
645 int num_data_slots = num_slots;
647 foreach(TexturesMap::value_type& tex, textures) {
648 string name = tex.first;
650 if(string_startswith(name, "__tex_image")) {
651 int pos = name.rfind("_");
652 int id = atoi(name.data() + pos + 1);
653 texture_slots.push_back(texture_slot_t(name,
654 num_data_slots + id));
655 num_slots = max(num_slots, num_data_slots + id + 1);
659 /* Realloc texture descriptors buffer. */
660 memory_manager.free(texture_info);
661 texture_info.resize(num_slots);
662 memory_manager.alloc("texture_info", texture_info);
664 /* Fill in descriptors */
665 foreach(texture_slot_t& slot, texture_slots) {
666 TextureInfo& info = texture_info[slot.slot];
668 MemoryManager::BufferDescriptor desc = memory_manager.get_descriptor(slot.name);
669 info.data = desc.offset;
670 info.cl_buffer = desc.device_buffer;
672 if(string_startswith(slot.name, "__tex_image")) {
673 device_memory *mem = textures[slot.name];
675 info.width = mem->data_width;
676 info.height = mem->data_height;
677 info.depth = mem->data_depth;
679 info.interpolation = mem->interpolation;
680 info.extension = mem->extension;
684 /* Force write of descriptors. */
685 memory_manager.free(texture_info);
686 memory_manager.alloc("texture_info", texture_info);
689 void OpenCLDeviceBase::film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half)
691 /* cast arguments to cl types */
692 cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
693 cl_mem d_rgba = (rgba_byte)? CL_MEM_PTR(rgba_byte): CL_MEM_PTR(rgba_half);
694 cl_mem d_buffer = CL_MEM_PTR(buffer);
699 cl_float d_sample_scale = 1.0f/(task.sample + 1);
700 cl_int d_offset = task.offset;
701 cl_int d_stride = task.stride;
704 cl_kernel ckFilmConvertKernel = (rgba_byte)? base_program(ustring("convert_to_byte")): base_program(ustring("convert_to_half_float"));
706 cl_uint start_arg_index =
707 kernel_set_args(ckFilmConvertKernel,
713 set_kernel_arg_buffers(ckFilmConvertKernel, &start_arg_index);
715 start_arg_index += kernel_set_args(ckFilmConvertKernel,
725 enqueue_kernel(ckFilmConvertKernel, d_w, d_h);
728 bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr,
729 device_ptr guide_ptr,
730 device_ptr variance_ptr,
734 int4 rect = task->rect;
735 int w = rect.z-rect.x;
736 int h = rect.w-rect.y;
737 int r = task->nlm_state.r;
738 int f = task->nlm_state.f;
739 float a = task->nlm_state.a;
740 float k_2 = task->nlm_state.k_2;
742 cl_mem difference = CL_MEM_PTR(task->nlm_state.temporary_1_ptr);
743 cl_mem blurDifference = CL_MEM_PTR(task->nlm_state.temporary_2_ptr);
744 cl_mem weightAccum = CL_MEM_PTR(task->nlm_state.temporary_3_ptr);
746 cl_mem image_mem = CL_MEM_PTR(image_ptr);
747 cl_mem guide_mem = CL_MEM_PTR(guide_ptr);
748 cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
749 cl_mem out_mem = CL_MEM_PTR(out_ptr);
751 mem_zero_kernel(task->nlm_state.temporary_3_ptr, sizeof(float)*w*h);
752 mem_zero_kernel(out_ptr, sizeof(float)*w*h);
754 cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference"));
755 cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur"));
756 cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight"));
757 cl_kernel ckNLMUpdateOutput = denoising_program(ustring("filter_nlm_update_output"));
758 cl_kernel ckNLMNormalize = denoising_program(ustring("filter_nlm_normalize"));
760 for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
761 int dy = i / (2*r+1) - r;
762 int dx = i % (2*r+1) - r;
763 int4 local_rect = make_int4(max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy));
764 kernel_set_args(ckNLMCalcDifference, 0,
765 dx, dy, guide_mem, variance_mem,
766 difference, local_rect, w, 0, a, k_2);
767 kernel_set_args(ckNLMBlur, 0,
768 difference, blurDifference, local_rect, w, f);
769 kernel_set_args(ckNLMCalcWeight, 0,
770 blurDifference, difference, local_rect, w, f);
771 kernel_set_args(ckNLMUpdateOutput, 0,
772 dx, dy, blurDifference, image_mem,
773 out_mem, weightAccum, local_rect, w, f);
775 enqueue_kernel(ckNLMCalcDifference, w, h);
776 enqueue_kernel(ckNLMBlur, w, h);
777 enqueue_kernel(ckNLMCalcWeight, w, h);
778 enqueue_kernel(ckNLMBlur, w, h);
779 enqueue_kernel(ckNLMUpdateOutput, w, h);
782 int4 local_rect = make_int4(0, 0, w, h);
783 kernel_set_args(ckNLMNormalize, 0,
784 out_mem, weightAccum, local_rect, w);
785 enqueue_kernel(ckNLMNormalize, w, h);
790 bool OpenCLDeviceBase::denoising_construct_transform(DenoisingTask *task)
792 cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
793 cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
794 cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
796 cl_kernel ckFilterConstructTransform = denoising_program(ustring("filter_construct_transform"));
798 kernel_set_args(ckFilterConstructTransform, 0,
804 task->buffer.pass_stride,
806 task->pca_threshold);
808 enqueue_kernel(ckFilterConstructTransform,
816 bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
817 device_ptr color_variance_ptr,
818 device_ptr output_ptr,
821 mem_zero(task->storage.XtWX);
822 mem_zero(task->storage.XtWY);
824 cl_mem color_mem = CL_MEM_PTR(color_ptr);
825 cl_mem color_variance_mem = CL_MEM_PTR(color_variance_ptr);
826 cl_mem output_mem = CL_MEM_PTR(output_ptr);
828 cl_mem buffer_mem = CL_MEM_PTR(task->buffer.mem.device_pointer);
829 cl_mem transform_mem = CL_MEM_PTR(task->storage.transform.device_pointer);
830 cl_mem rank_mem = CL_MEM_PTR(task->storage.rank.device_pointer);
831 cl_mem XtWX_mem = CL_MEM_PTR(task->storage.XtWX.device_pointer);
832 cl_mem XtWY_mem = CL_MEM_PTR(task->storage.XtWY.device_pointer);
834 cl_kernel ckNLMCalcDifference = denoising_program(ustring("filter_nlm_calc_difference"));
835 cl_kernel ckNLMBlur = denoising_program(ustring("filter_nlm_blur"));
836 cl_kernel ckNLMCalcWeight = denoising_program(ustring("filter_nlm_calc_weight"));
837 cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian"));
838 cl_kernel ckFinalize = denoising_program(ustring("filter_finalize"));
840 cl_mem difference = CL_MEM_PTR(task->reconstruction_state.temporary_1_ptr);
841 cl_mem blurDifference = CL_MEM_PTR(task->reconstruction_state.temporary_2_ptr);
843 int r = task->radius;
846 for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
847 int dy = i / (2*r+1) - r;
848 int dx = i % (2*r+1) - r;
850 int local_rect[4] = {max(0, -dx), max(0, -dy),
851 task->reconstruction_state.source_w - max(0, dx),
852 task->reconstruction_state.source_h - max(0, dy)};
854 kernel_set_args(ckNLMCalcDifference, 0,
861 task->buffer.pass_stride,
863 enqueue_kernel(ckNLMCalcDifference,
864 task->reconstruction_state.source_w,
865 task->reconstruction_state.source_h);
867 kernel_set_args(ckNLMBlur, 0,
873 enqueue_kernel(ckNLMBlur,
874 task->reconstruction_state.source_w,
875 task->reconstruction_state.source_h);
877 kernel_set_args(ckNLMCalcWeight, 0,
883 enqueue_kernel(ckNLMCalcWeight,
884 task->reconstruction_state.source_w,
885 task->reconstruction_state.source_h);
887 /* Reuse previous arguments. */
888 enqueue_kernel(ckNLMBlur,
889 task->reconstruction_state.source_w,
890 task->reconstruction_state.source_h);
892 kernel_set_args(ckNLMConstructGramian, 0,
901 task->reconstruction_state.filter_rect,
905 task->buffer.pass_stride);
906 enqueue_kernel(ckNLMConstructGramian,
907 task->reconstruction_state.source_w,
908 task->reconstruction_state.source_h,
912 kernel_set_args(ckFinalize, 0,
920 task->reconstruction_state.buffer_params,
921 task->render_buffer.samples);
922 enqueue_kernel(ckFinalize,
923 task->reconstruction_state.source_w,
924 task->reconstruction_state.source_h);
929 bool OpenCLDeviceBase::denoising_combine_halves(device_ptr a_ptr,
932 device_ptr variance_ptr,
936 cl_mem a_mem = CL_MEM_PTR(a_ptr);
937 cl_mem b_mem = CL_MEM_PTR(b_ptr);
938 cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
939 cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
941 cl_kernel ckFilterCombineHalves = denoising_program(ustring("filter_combine_halves"));
943 kernel_set_args(ckFilterCombineHalves, 0,
950 enqueue_kernel(ckFilterCombineHalves,
951 task->rect.z-task->rect.x,
952 task->rect.w-task->rect.y);
957 bool OpenCLDeviceBase::denoising_divide_shadow(device_ptr a_ptr,
959 device_ptr sample_variance_ptr,
960 device_ptr sv_variance_ptr,
961 device_ptr buffer_variance_ptr,
964 cl_mem a_mem = CL_MEM_PTR(a_ptr);
965 cl_mem b_mem = CL_MEM_PTR(b_ptr);
966 cl_mem sample_variance_mem = CL_MEM_PTR(sample_variance_ptr);
967 cl_mem sv_variance_mem = CL_MEM_PTR(sv_variance_ptr);
968 cl_mem buffer_variance_mem = CL_MEM_PTR(buffer_variance_ptr);
970 cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer);
972 cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow"));
974 kernel_set_args(ckFilterDivideShadow, 0,
975 task->render_buffer.samples,
983 task->render_buffer.pass_stride,
984 task->render_buffer.denoising_data_offset);
985 enqueue_kernel(ckFilterDivideShadow,
986 task->rect.z-task->rect.x,
987 task->rect.w-task->rect.y);
992 bool OpenCLDeviceBase::denoising_get_feature(int mean_offset,
995 device_ptr variance_ptr,
998 cl_mem mean_mem = CL_MEM_PTR(mean_ptr);
999 cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
1001 cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer);
1003 cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature"));
1005 kernel_set_args(ckFilterGetFeature, 0,
1006 task->render_buffer.samples,
1013 task->render_buffer.pass_stride,
1014 task->render_buffer.denoising_data_offset);
1015 enqueue_kernel(ckFilterGetFeature,
1016 task->rect.z-task->rect.x,
1017 task->rect.w-task->rect.y);
1022 bool OpenCLDeviceBase::denoising_detect_outliers(device_ptr image_ptr,
1023 device_ptr variance_ptr,
1024 device_ptr depth_ptr,
1025 device_ptr output_ptr,
1026 DenoisingTask *task)
1028 cl_mem image_mem = CL_MEM_PTR(image_ptr);
1029 cl_mem variance_mem = CL_MEM_PTR(variance_ptr);
1030 cl_mem depth_mem = CL_MEM_PTR(depth_ptr);
1031 cl_mem output_mem = CL_MEM_PTR(output_ptr);
1033 cl_kernel ckFilterDetectOutliers = denoising_program(ustring("filter_detect_outliers"));
1035 kernel_set_args(ckFilterDetectOutliers, 0,
1041 task->buffer.pass_stride);
1042 enqueue_kernel(ckFilterDetectOutliers,
1043 task->rect.z-task->rect.x,
1044 task->rect.w-task->rect.y);
1049 bool OpenCLDeviceBase::denoising_set_tiles(device_ptr *buffers,
1050 DenoisingTask *task)
1052 task->tiles_mem.copy_to_device();
1054 cl_mem tiles_mem = CL_MEM_PTR(task->tiles_mem.device_pointer);
1056 cl_kernel ckFilterSetTiles = denoising_program(ustring("filter_set_tiles"));
1058 kernel_set_args(ckFilterSetTiles, 0, tiles_mem);
1059 for(int i = 0; i < 9; i++) {
1060 cl_mem buffer_mem = CL_MEM_PTR(buffers[i]);
1061 kernel_set_args(ckFilterSetTiles, i+1, buffer_mem);
1064 enqueue_kernel(ckFilterSetTiles, 1, 1);
1069 void OpenCLDeviceBase::denoise(RenderTile &rtile, DenoisingTask& denoising, const DeviceTask &task)
1071 denoising.functions.set_tiles = function_bind(&OpenCLDeviceBase::denoising_set_tiles, this, _1, &denoising);
1072 denoising.functions.construct_transform = function_bind(&OpenCLDeviceBase::denoising_construct_transform, this, &denoising);
1073 denoising.functions.reconstruct = function_bind(&OpenCLDeviceBase::denoising_reconstruct, this, _1, _2, _3, &denoising);
1074 denoising.functions.divide_shadow = function_bind(&OpenCLDeviceBase::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
1075 denoising.functions.non_local_means = function_bind(&OpenCLDeviceBase::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
1076 denoising.functions.combine_halves = function_bind(&OpenCLDeviceBase::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
1077 denoising.functions.get_feature = function_bind(&OpenCLDeviceBase::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
1078 denoising.functions.detect_outliers = function_bind(&OpenCLDeviceBase::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
1080 denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
1081 denoising.render_buffer.samples = rtile.sample;
1083 RenderTile rtiles[9];
1085 task.map_neighbor_tiles(rtiles, this);
1086 denoising.tiles_from_rendertiles(rtiles);
1088 denoising.init_from_devicetask(task);
1090 denoising.run_denoising();
1092 task.unmap_neighbor_tiles(rtiles, this);
1095 void OpenCLDeviceBase::shader(DeviceTask& task)
1097 /* cast arguments to cl types */
1098 cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
1099 cl_mem d_input = CL_MEM_PTR(task.shader_input);
1100 cl_mem d_output = CL_MEM_PTR(task.shader_output);
1101 cl_int d_shader_eval_type = task.shader_eval_type;
1102 cl_int d_shader_filter = task.shader_filter;
1103 cl_int d_shader_x = task.shader_x;
1104 cl_int d_shader_w = task.shader_w;
1105 cl_int d_offset = task.offset;
1109 if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
1110 kernel = base_program(ustring("bake"));
1112 else if(task.shader_eval_type == SHADER_EVAL_DISPLACE) {
1113 kernel = base_program(ustring("displace"));
1116 kernel = base_program(ustring("background"));
1119 cl_uint start_arg_index =
1120 kernel_set_args(kernel,
1126 set_kernel_arg_buffers(kernel, &start_arg_index);
1128 start_arg_index += kernel_set_args(kernel,
1130 d_shader_eval_type);
1131 if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
1132 start_arg_index += kernel_set_args(kernel,
1136 start_arg_index += kernel_set_args(kernel,
1142 for(int sample = 0; sample < task.num_samples; sample++) {
1144 if(task.get_cancel())
1147 kernel_set_args(kernel, start_arg_index, sample);
1149 enqueue_kernel(kernel, task.shader_w, 1);
1151 clFinish(cqCommandQueue);
1153 task.update_progress(NULL);
1157 string OpenCLDeviceBase::kernel_build_options(const string *debug_src)
1159 string build_options = "-cl-no-signed-zeros -cl-mad-enable ";
1161 if(platform_name == "NVIDIA CUDA") {
1162 build_options += "-D__KERNEL_OPENCL_NVIDIA__ "
1163 "-cl-nv-maxrregcount=32 "
1166 uint compute_capability_major, compute_capability_minor;
1167 clGetDeviceInfo(cdDevice, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,
1168 sizeof(cl_uint), &compute_capability_major, NULL);
1169 clGetDeviceInfo(cdDevice, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,
1170 sizeof(cl_uint), &compute_capability_minor, NULL);
1172 build_options += string_printf("-D__COMPUTE_CAPABILITY__=%u ",
1173 compute_capability_major * 100 +
1174 compute_capability_minor * 10);
1177 else if(platform_name == "Apple")
1178 build_options += "-D__KERNEL_OPENCL_APPLE__ ";
1180 else if(platform_name == "AMD Accelerated Parallel Processing")
1181 build_options += "-D__KERNEL_OPENCL_AMD__ ";
1183 else if(platform_name == "Intel(R) OpenCL") {
1184 build_options += "-D__KERNEL_OPENCL_INTEL_CPU__ ";
1186 /* Options for gdb source level kernel debugging.
1187 * this segfaults on linux currently.
1189 if(OpenCLInfo::use_debug() && debug_src)
1190 build_options += "-g -s \"" + *debug_src + "\" ";
1193 if(OpenCLInfo::use_debug())
1194 build_options += "-D__KERNEL_OPENCL_DEBUG__ ";
1196 #ifdef WITH_CYCLES_DEBUG
1197 build_options += "-D__KERNEL_DEBUG__ ";
1200 return build_options;
1203 /* TODO(sergey): In the future we can use variadic templates, once
1204 * C++0x is allowed. Should allow to clean this up a bit.
1206 int OpenCLDeviceBase::kernel_set_args(cl_kernel kernel,
1207 int start_argument_index,
1208 const ArgumentWrapper& arg1,
1209 const ArgumentWrapper& arg2,
1210 const ArgumentWrapper& arg3,
1211 const ArgumentWrapper& arg4,
1212 const ArgumentWrapper& arg5,
1213 const ArgumentWrapper& arg6,
1214 const ArgumentWrapper& arg7,
1215 const ArgumentWrapper& arg8,
1216 const ArgumentWrapper& arg9,
1217 const ArgumentWrapper& arg10,
1218 const ArgumentWrapper& arg11,
1219 const ArgumentWrapper& arg12,
1220 const ArgumentWrapper& arg13,
1221 const ArgumentWrapper& arg14,
1222 const ArgumentWrapper& arg15,
1223 const ArgumentWrapper& arg16,
1224 const ArgumentWrapper& arg17,
1225 const ArgumentWrapper& arg18,
1226 const ArgumentWrapper& arg19,
1227 const ArgumentWrapper& arg20,
1228 const ArgumentWrapper& arg21,
1229 const ArgumentWrapper& arg22,
1230 const ArgumentWrapper& arg23,
1231 const ArgumentWrapper& arg24,
1232 const ArgumentWrapper& arg25,
1233 const ArgumentWrapper& arg26,
1234 const ArgumentWrapper& arg27,
1235 const ArgumentWrapper& arg28,
1236 const ArgumentWrapper& arg29,
1237 const ArgumentWrapper& arg30,
1238 const ArgumentWrapper& arg31,
1239 const ArgumentWrapper& arg32,
1240 const ArgumentWrapper& arg33)
1242 int current_arg_index = 0;
1243 #define FAKE_VARARG_HANDLE_ARG(arg) \
1245 if(arg.pointer != NULL) { \
1246 opencl_assert(clSetKernelArg( \
1248 start_argument_index + current_arg_index, \
1249 arg.size, arg.pointer)); \
1250 ++current_arg_index; \
1253 return current_arg_index; \
1256 FAKE_VARARG_HANDLE_ARG(arg1);
1257 FAKE_VARARG_HANDLE_ARG(arg2);
1258 FAKE_VARARG_HANDLE_ARG(arg3);
1259 FAKE_VARARG_HANDLE_ARG(arg4);
1260 FAKE_VARARG_HANDLE_ARG(arg5);
1261 FAKE_VARARG_HANDLE_ARG(arg6);
1262 FAKE_VARARG_HANDLE_ARG(arg7);
1263 FAKE_VARARG_HANDLE_ARG(arg8);
1264 FAKE_VARARG_HANDLE_ARG(arg9);
1265 FAKE_VARARG_HANDLE_ARG(arg10);
1266 FAKE_VARARG_HANDLE_ARG(arg11);
1267 FAKE_VARARG_HANDLE_ARG(arg12);
1268 FAKE_VARARG_HANDLE_ARG(arg13);
1269 FAKE_VARARG_HANDLE_ARG(arg14);
1270 FAKE_VARARG_HANDLE_ARG(arg15);
1271 FAKE_VARARG_HANDLE_ARG(arg16);
1272 FAKE_VARARG_HANDLE_ARG(arg17);
1273 FAKE_VARARG_HANDLE_ARG(arg18);
1274 FAKE_VARARG_HANDLE_ARG(arg19);
1275 FAKE_VARARG_HANDLE_ARG(arg20);
1276 FAKE_VARARG_HANDLE_ARG(arg21);
1277 FAKE_VARARG_HANDLE_ARG(arg22);
1278 FAKE_VARARG_HANDLE_ARG(arg23);
1279 FAKE_VARARG_HANDLE_ARG(arg24);
1280 FAKE_VARARG_HANDLE_ARG(arg25);
1281 FAKE_VARARG_HANDLE_ARG(arg26);
1282 FAKE_VARARG_HANDLE_ARG(arg27);
1283 FAKE_VARARG_HANDLE_ARG(arg28);
1284 FAKE_VARARG_HANDLE_ARG(arg29);
1285 FAKE_VARARG_HANDLE_ARG(arg30);
1286 FAKE_VARARG_HANDLE_ARG(arg31);
1287 FAKE_VARARG_HANDLE_ARG(arg32);
1288 FAKE_VARARG_HANDLE_ARG(arg33);
1289 #undef FAKE_VARARG_HANDLE_ARG
1290 return current_arg_index;
1293 void OpenCLDeviceBase::release_kernel_safe(cl_kernel kernel)
1296 clReleaseKernel(kernel);
1300 void OpenCLDeviceBase::release_mem_object_safe(cl_mem mem)
1303 clReleaseMemObject(mem);
1307 void OpenCLDeviceBase::release_program_safe(cl_program program)
1310 clReleaseProgram(program);
1314 /* ** Those guys are for workign around some compiler-specific bugs ** */
1316 cl_program OpenCLDeviceBase::load_cached_kernel(
1318 thread_scoped_lock& cache_locker)
1320 return OpenCLCache::get_program(cpPlatform,
1326 void OpenCLDeviceBase::store_cached_kernel(
1329 thread_scoped_lock& cache_locker)
1331 OpenCLCache::store_program(cpPlatform,
1338 string OpenCLDeviceBase::build_options_for_base_program(
1339 const DeviceRequestedFeatures& requested_features)
1341 /* TODO(sergey): By default we compile all features, meaning
1342 * mega kernel is not getting feature-based optimizations.
1344 * Ideally we need always compile kernel with as less features
1345 * enabled as possible to keep performance at it's max.
1348 /* For now disable baking when not in use as this has major
1349 * impact on kernel build times.
1351 if(!requested_features.use_baking) {
1352 return "-D__NO_BAKING__";