2 * Copyright 2011-2013 Blender Foundation
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
26 #include "device_intern.h"
30 #include "util_debug.h"
31 #include "util_foreach.h"
32 #include "util_logging.h"
34 #include "util_math.h"
36 #include "util_opengl.h"
37 #include "util_path.h"
38 #include "util_time.h"
/* Convert an opaque device_ptr back to the cl_mem handle it stores. */
#define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p))

/* Macro declarations used with split kernel */

/* Macro to enable/disable work-stealing */
#define __WORK_STEALING__

/* Work-group dimensions used when launching split kernels. */
#define SPLIT_KERNEL_LOCAL_SIZE_X 64
#define SPLIT_KERNEL_LOCAL_SIZE_Y 1

/* This value may be tuned according to the scene we are rendering.
 *
 * Modifying PATH_ITER_INC_FACTOR value proportional to number of expected
 * ray-bounces will improve performance.
 */
#define PATH_ITER_INC_FACTOR 8

/* When allocate global memory in chunks. We may not be able to
 * allocate exactly "CL_DEVICE_MAX_MEM_ALLOC_SIZE" bytes in chunks;
 * Since some bytes may be needed for aligning chunks of memory;
 * This is the amount of memory that we dedicate for that purpose.
 */
#define DATA_ALLOCATION_MEM_FACTOR 5000000 //5MB
66 struct OpenCLPlatformDevice {
67 OpenCLPlatformDevice(cl_platform_id platform_id,
68 const string& platform_name,
69 cl_device_id device_id,
70 cl_device_type device_type,
71 const string& device_name)
72 : platform_id(platform_id),
73 platform_name(platform_name),
75 device_type(device_type),
76 device_name(device_name) {}
77 cl_platform_id platform_id;
79 cl_device_id device_id;
80 cl_device_type device_type;
86 cl_device_type opencl_device_type()
88 switch(DebugFlags().opencl.device_type)
90 case DebugFlags::OpenCL::DEVICE_NONE:
92 case DebugFlags::OpenCL::DEVICE_ALL:
93 return CL_DEVICE_TYPE_ALL;
94 case DebugFlags::OpenCL::DEVICE_DEFAULT:
95 return CL_DEVICE_TYPE_DEFAULT;
96 case DebugFlags::OpenCL::DEVICE_CPU:
97 return CL_DEVICE_TYPE_CPU;
98 case DebugFlags::OpenCL::DEVICE_GPU:
99 return CL_DEVICE_TYPE_GPU;
100 case DebugFlags::OpenCL::DEVICE_ACCELERATOR:
101 return CL_DEVICE_TYPE_ACCELERATOR;
103 return CL_DEVICE_TYPE_ALL;
107 inline bool opencl_kernel_use_debug()
109 return DebugFlags().opencl.debug;
/* Decide per OpenCL platform whether the kernel may be built with
 * advanced shading features enabled.
 *
 * NOTE(review): return values for each vendor reconstructed from the
 * whitelist structure; keep in sync with kernel_types.h as the comment says. */
bool opencl_kernel_use_advanced_shading(const string& platform)
{
	/* keep this in sync with kernel_types.h! */
	if(platform == "NVIDIA CUDA")
		return true;
	else if(platform == "Apple")
		return false;
	else if(platform == "AMD Accelerated Parallel Processing")
		return true;
	else if(platform == "Intel(R) OpenCL")
		return true;
	/* Make sure officially unsupported OpenCL platforms
	 * does not set up to use advanced shading.
	 */
	return false;
}
129 bool opencl_kernel_use_split(const string& platform_name,
130 const cl_device_type device_type)
132 if(DebugFlags().opencl.kernel_type == DebugFlags::OpenCL::KERNEL_SPLIT) {
133 VLOG(1) << "Forcing split kernel to use.";
136 if(DebugFlags().opencl.kernel_type == DebugFlags::OpenCL::KERNEL_MEGA) {
137 VLOG(1) << "Forcing mega kernel to use.";
140 /* TODO(sergey): Replace string lookups with more enum-like API,
141 * similar to device/vendor checks blender's gpu.
143 if(platform_name == "AMD Accelerated Parallel Processing" &&
144 device_type == CL_DEVICE_TYPE_GPU)
151 bool opencl_device_supported(const string& platform_name,
152 const cl_device_id device_id)
154 cl_device_type device_type;
155 clGetDeviceInfo(device_id,
157 sizeof(cl_device_type),
160 if(platform_name == "AMD Accelerated Parallel Processing" &&
161 device_type == CL_DEVICE_TYPE_GPU)
165 if(platform_name == "Apple" && device_type == CL_DEVICE_TYPE_GPU) {
171 bool opencl_platform_version_check(cl_platform_id platform,
172 string *error = NULL)
174 const int req_major = 1, req_minor = 1;
177 clGetPlatformInfo(platform,
182 if(sscanf(version, "OpenCL %d.%d", &major, &minor) < 2) {
184 *error = string_printf("OpenCL: failed to parse platform version string (%s).", version);
188 if(!((major == req_major && minor >= req_minor) || (major > req_major))) {
190 *error = string_printf("OpenCL: platform version 1.1 or later required, found %d.%d", major, minor);
200 bool opencl_device_version_check(cl_device_id device,
201 string *error = NULL)
203 const int req_major = 1, req_minor = 1;
206 clGetDeviceInfo(device,
207 CL_DEVICE_OPENCL_C_VERSION,
211 if(sscanf(version, "OpenCL C %d.%d", &major, &minor) < 2) {
213 *error = string_printf("OpenCL: failed to parse OpenCL C version string (%s).", version);
217 if(!((major == req_major && minor >= req_minor) || (major > req_major))) {
219 *error = string_printf("OpenCL: C version 1.1 or later required, found %d.%d", major, minor);
229 void opencl_get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices)
231 const bool force_all_platforms =
232 (DebugFlags().opencl.kernel_type != DebugFlags::OpenCL::KERNEL_DEFAULT);
233 const cl_device_type device_type = opencl_device_type();
234 static bool first_time = true;
235 #define FIRST_VLOG(severity) if(first_time) VLOG(severity)
237 usable_devices->clear();
239 if(device_type == 0) {
240 FIRST_VLOG(2) << "OpenCL devices are forced to be disabled.";
245 vector<cl_device_id> device_ids;
246 cl_uint num_devices = 0;
247 vector<cl_platform_id> platform_ids;
248 cl_uint num_platforms = 0;
251 if(clGetPlatformIDs(0, NULL, &num_platforms) != CL_SUCCESS ||
254 FIRST_VLOG(2) << "No OpenCL platforms were found.";
258 platform_ids.resize(num_platforms);
259 if(clGetPlatformIDs(num_platforms, &platform_ids[0], NULL) != CL_SUCCESS) {
260 FIRST_VLOG(2) << "Failed to fetch platform IDs from the driver..";
264 /* Devices are numbered consecutively across platforms. */
265 for(int platform = 0; platform < num_platforms; platform++) {
266 cl_platform_id platform_id = platform_ids[platform];
268 if(clGetPlatformInfo(platform_id,
274 FIRST_VLOG(2) << "Failed to get platform name, ignoring.";
277 string platform_name = pname;
278 FIRST_VLOG(2) << "Enumerating devices for platform "
279 << platform_name << ".";
280 if(!opencl_platform_version_check(platform_id)) {
281 FIRST_VLOG(2) << "Ignoring platform " << platform_name
282 << " due to too old compiler version.";
286 if(clGetDeviceIDs(platform_id,
290 &num_devices) != CL_SUCCESS || num_devices == 0)
292 FIRST_VLOG(2) << "Ignoring platform " << platform_name
293 << ", failed to fetch number of devices.";
296 device_ids.resize(num_devices);
297 if(clGetDeviceIDs(platform_id,
303 FIRST_VLOG(2) << "Ignoring platform " << platform_name
304 << ", failed to fetch devices list.";
307 for(int num = 0; num < num_devices; num++) {
308 cl_device_id device_id = device_ids[num];
309 char device_name[1024] = "\0";
310 if(clGetDeviceInfo(device_id,
316 FIRST_VLOG(2) << "Failed to fetch device name, ignoring.";
319 if(!opencl_device_version_check(device_id)) {
320 FIRST_VLOG(2) << "Ignoring device " << device_name
321 << " due to old compiler version.";
324 if(force_all_platforms ||
325 opencl_device_supported(platform_name, device_id))
327 cl_device_type device_type;
328 if(clGetDeviceInfo(device_id,
330 sizeof(cl_device_type),
334 FIRST_VLOG(2) << "Ignoring device " << device_name
335 << ", failed to fetch device type.";
338 FIRST_VLOG(2) << "Adding new device " << device_name << ".";
339 usable_devices->push_back(OpenCLPlatformDevice(platform_id,
346 FIRST_VLOG(2) << "Ignoring device " << device_name
347 << ", not officially supported yet.";
356 /* Thread safe cache for contexts and programs.
358 * TODO(sergey): Make it more generous, so it can contain any type of program
359 * without hardcoding possible program types in the slot.
367 /* cl_program for shader, bake, film_convert kernels (used in OpenCLDeviceBase) */
368 cl_program ocl_dev_base_program;
369 /* cl_program for megakernel (used in OpenCLDeviceMegaKernel) */
370 cl_program ocl_dev_megakernel_program;
372 Slot() : mutex(NULL),
374 ocl_dev_base_program(NULL),
375 ocl_dev_megakernel_program(NULL) {}
377 Slot(const Slot& rhs)
379 context(rhs.context),
380 ocl_dev_base_program(rhs.ocl_dev_base_program),
381 ocl_dev_megakernel_program(rhs.ocl_dev_megakernel_program)
383 /* copy can only happen in map insert, assert that */
384 assert(mutex == NULL);
394 /* key is combination of platform ID and device ID */
395 typedef pair<cl_platform_id, cl_device_id> PlatformDevicePair;
397 /* map of Slot objects */
398 typedef map<PlatformDevicePair, Slot> CacheMap;
401 thread_mutex cache_lock;
403 /* lazy instantiate */
404 static OpenCLCache &global_instance()
406 static OpenCLCache instance;
416 /* Intel OpenCL bug raises SIGABRT due to pure virtual call
417 * so this is disabled. It's not necessary to free objects
418 * at process exit anyway.
419 * http://software.intel.com/en-us/forums/topic/370083#comments */
424 /* lookup something in the cache. If this returns NULL, slot_locker
425 * will be holding a lock for the cache. slot_locker should refer to a
426 * default constructed thread_scoped_lock */
428 static T get_something(cl_platform_id platform,
431 thread_scoped_lock& slot_locker)
433 assert(platform != NULL);
435 OpenCLCache& self = global_instance();
437 thread_scoped_lock cache_lock(self.cache_lock);
439 pair<CacheMap::iterator,bool> ins = self.cache.insert(
440 CacheMap::value_type(PlatformDevicePair(platform, device), Slot()));
442 Slot &slot = ins.first->second;
444 /* create slot lock only while holding cache lock */
446 slot.mutex = new thread_mutex;
448 /* need to unlock cache before locking slot, to allow store to complete */
452 slot_locker = thread_scoped_lock(*slot.mutex);
454 /* If the thing isn't cached */
455 if(slot.*member == NULL) {
456 /* return with the caller's lock holder holding the slot lock */
460 /* the item was already cached, release the slot lock */
461 slot_locker.unlock();
466 /* store something in the cache. you MUST have tried to get the item before storing to it */
468 static void store_something(cl_platform_id platform,
472 thread_scoped_lock& slot_locker)
474 assert(platform != NULL);
475 assert(device != NULL);
476 assert(thing != NULL);
478 OpenCLCache &self = global_instance();
480 thread_scoped_lock cache_lock(self.cache_lock);
481 CacheMap::iterator i = self.cache.find(PlatformDevicePair(platform, device));
484 Slot &slot = i->second;
487 assert(i != self.cache.end());
488 assert(slot.*member == NULL);
490 slot.*member = thing;
492 /* unlock the slot */
493 slot_locker.unlock();
499 OCL_DEV_BASE_PROGRAM,
500 OCL_DEV_MEGAKERNEL_PROGRAM,
503 /* see get_something comment */
504 static cl_context get_context(cl_platform_id platform,
506 thread_scoped_lock& slot_locker)
508 cl_context context = get_something<cl_context>(platform,
516 /* caller is going to release it when done with it, so retain it */
517 cl_int ciErr = clRetainContext(context);
518 assert(ciErr == CL_SUCCESS);
524 /* see get_something comment */
525 static cl_program get_program(cl_platform_id platform,
527 ProgramName program_name,
528 thread_scoped_lock& slot_locker)
530 cl_program program = NULL;
532 switch(program_name) {
533 case OCL_DEV_BASE_PROGRAM:
534 /* Get program related to OpenCLDeviceBase */
535 program = get_something<cl_program>(platform,
537 &Slot::ocl_dev_base_program,
540 case OCL_DEV_MEGAKERNEL_PROGRAM:
541 /* Get program related to megakernel */
542 program = get_something<cl_program>(platform,
544 &Slot::ocl_dev_megakernel_program,
548 assert(!"Invalid program name");
554 /* caller is going to release it when done with it, so retain it */
555 cl_int ciErr = clRetainProgram(program);
556 assert(ciErr == CL_SUCCESS);
562 /* see store_something comment */
563 static void store_context(cl_platform_id platform,
566 thread_scoped_lock& slot_locker)
568 store_something<cl_context>(platform,
574 /* increment reference count in OpenCL.
575 * The caller is going to release the object when done with it. */
576 cl_int ciErr = clRetainContext(context);
577 assert(ciErr == CL_SUCCESS);
581 /* see store_something comment */
582 static void store_program(cl_platform_id platform,
585 ProgramName program_name,
586 thread_scoped_lock& slot_locker)
588 switch(program_name) {
589 case OCL_DEV_BASE_PROGRAM:
590 store_something<cl_program>(platform,
593 &Slot::ocl_dev_base_program,
596 case OCL_DEV_MEGAKERNEL_PROGRAM:
597 store_something<cl_program>(platform,
600 &Slot::ocl_dev_megakernel_program,
604 assert(!"Invalid program name\n");
608 /* Increment reference count in OpenCL.
609 * The caller is going to release the object when done with it.
611 cl_int ciErr = clRetainProgram(program);
612 assert(ciErr == CL_SUCCESS);
616 /* Discard all cached contexts and programs. */
619 OpenCLCache &self = global_instance();
620 thread_scoped_lock cache_lock(self.cache_lock);
622 foreach(CacheMap::value_type &item, self.cache) {
623 if(item.second.ocl_dev_base_program != NULL)
624 clReleaseProgram(item.second.ocl_dev_base_program);
625 if(item.second.ocl_dev_megakernel_program != NULL)
626 clReleaseProgram(item.second.ocl_dev_megakernel_program);
627 if(item.second.context != NULL)
628 clReleaseContext(item.second.context);
635 class OpenCLDeviceBase : public Device
638 DedicatedTaskPool task_pool;
639 cl_context cxContext;
640 cl_command_queue cqCommandQueue;
641 cl_platform_id cpPlatform;
642 cl_device_id cdDevice;
643 cl_program cpProgram;
644 cl_kernel ckFilmConvertByteKernel;
645 cl_kernel ckFilmConvertHalfFloatKernel;
646 cl_kernel ckShaderKernel;
647 cl_kernel ckBakeKernel;
650 typedef map<string, device_vector<uchar>*> ConstMemMap;
651 typedef map<string, device_ptr> MemMap;
653 ConstMemMap const_mem_map;
657 bool device_initialized;
658 string platform_name;
660 bool opencl_error(cl_int err)
662 if(err != CL_SUCCESS) {
663 string message = string_printf("OpenCL error (%d): %s", err, clewErrorString(err));
666 fprintf(stderr, "%s\n", message.c_str());
673 void opencl_error(const string& message)
677 fprintf(stderr, "%s\n", message.c_str());
680 #define opencl_assert(stmt) \
684 if(err != CL_SUCCESS) { \
685 string message = string_printf("OpenCL error: %s in %s", clewErrorString(err), #stmt); \
686 if(error_msg == "") \
687 error_msg = message; \
688 fprintf(stderr, "%s\n", message.c_str()); \
692 void opencl_assert_err(cl_int err, const char* where)
694 if(err != CL_SUCCESS) {
695 string message = string_printf("OpenCL error (%d): %s in %s", err, clewErrorString(err), where);
698 fprintf(stderr, "%s\n", message.c_str());
705 OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool background_)
706 : Device(info, stats, background_)
711 cqCommandQueue = NULL;
713 ckFilmConvertByteKernel = NULL;
714 ckFilmConvertHalfFloatKernel = NULL;
715 ckShaderKernel = NULL;
718 device_initialized = false;
720 vector<OpenCLPlatformDevice> usable_devices;
721 opencl_get_usable_devices(&usable_devices);
722 if(usable_devices.size() == 0) {
723 opencl_error("OpenCL: no devices found.");
726 assert(info.num < usable_devices.size());
727 OpenCLPlatformDevice& platform_device = usable_devices[info.num];
728 cpPlatform = platform_device.platform_id;
729 cdDevice = platform_device.device_id;
730 platform_name = platform_device.platform_name;
731 VLOG(2) << "Creating new Cycles device for OpenCL platform "
732 << platform_name << ", device "
733 << platform_device.device_name << ".";
736 /* try to use cached context */
737 thread_scoped_lock cache_locker;
738 cxContext = OpenCLCache::get_context(cpPlatform, cdDevice, cache_locker);
740 if(cxContext == NULL) {
741 /* create context properties array to specify platform */
742 const cl_context_properties context_props[] = {
743 CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform,
748 cxContext = clCreateContext(context_props, 1, &cdDevice,
749 context_notify_callback, cdDevice, &ciErr);
751 if(opencl_error(ciErr)) {
752 opencl_error("OpenCL: clCreateContext failed");
757 OpenCLCache::store_context(cpPlatform, cdDevice, cxContext, cache_locker);
761 cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr);
762 if(opencl_error(ciErr))
765 null_mem = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_ONLY, 1, NULL, &ciErr);
766 if(opencl_error(ciErr))
769 fprintf(stderr, "Device init success\n");
770 device_initialized = true;
773 static void CL_CALLBACK context_notify_callback(const char *err_info,
774 const void * /*private_info*/, size_t /*cb*/, void *user_data)
777 clGetDeviceInfo((cl_device_id)user_data, CL_DEVICE_NAME, sizeof(name), &name, NULL);
779 fprintf(stderr, "OpenCL error (%s): %s\n", name, err_info);
782 bool opencl_version_check()
785 if(!opencl_platform_version_check(cpPlatform, &error)) {
789 if(!opencl_device_version_check(cdDevice, &error)) {
796 bool load_binary(const string& /*kernel_path*/,
798 string custom_kernel_build_options,
800 const string *debug_src = NULL)
802 /* read binary into memory */
803 vector<uint8_t> binary;
805 if(!path_read_binary(clbin, binary)) {
806 opencl_error(string_printf("OpenCL failed to read cached binary %s.", clbin.c_str()));
812 size_t size = binary.size();
813 const uint8_t *bytes = &binary[0];
815 *program = clCreateProgramWithBinary(cxContext, 1, &cdDevice,
816 &size, &bytes, &status, &ciErr);
818 if(opencl_error(status) || opencl_error(ciErr)) {
819 opencl_error(string_printf("OpenCL failed create program from cached binary %s.", clbin.c_str()));
823 if(!build_kernel(program, custom_kernel_build_options, debug_src))
829 bool save_binary(cl_program *program, const string& clbin)
832 clGetProgramInfo(*program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL);
837 vector<uint8_t> binary(size);
838 uint8_t *bytes = &binary[0];
840 clGetProgramInfo(*program, CL_PROGRAM_BINARIES, sizeof(uint8_t*), &bytes, NULL);
842 if(!path_write_binary(clbin, binary)) {
843 opencl_error(string_printf("OpenCL failed to write cached binary %s.", clbin.c_str()));
850 bool build_kernel(cl_program *kernel_program,
851 string custom_kernel_build_options,
852 const string *debug_src = NULL)
854 string build_options;
855 build_options = kernel_build_options(debug_src) + custom_kernel_build_options;
857 ciErr = clBuildProgram(*kernel_program, 0, NULL, build_options.c_str(), NULL, NULL);
859 /* show warnings even if build is successful */
860 size_t ret_val_size = 0;
862 clGetProgramBuildInfo(*kernel_program, cdDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
864 if(ret_val_size > 1) {
865 vector<char> build_log(ret_val_size + 1);
866 clGetProgramBuildInfo(*kernel_program, cdDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, &build_log[0], NULL);
868 build_log[ret_val_size] = '\0';
869 /* Skip meaningless empty output from the NVidia compiler. */
870 if(!(ret_val_size == 2 && build_log[0] == '\n')) {
871 fprintf(stderr, "OpenCL kernel build output:\n");
872 fprintf(stderr, "%s\n", &build_log[0]);
876 if(ciErr != CL_SUCCESS) {
877 opencl_error("OpenCL build failed: errors in console");
884 bool compile_kernel(const string& kernel_path,
886 string custom_kernel_build_options,
887 cl_program *kernel_program,
888 const string *debug_src = NULL)
890 /* we compile kernels consisting of many files. unfortunately opencl
891 * kernel caches do not seem to recognize changes in included files.
892 * so we force recompile on changes by adding the md5 hash of all files */
893 source = path_source_replace_includes(source, kernel_path);
896 path_write_text(*debug_src, source);
898 size_t source_len = source.size();
899 const char *source_str = source.c_str();
901 *kernel_program = clCreateProgramWithSource(cxContext, 1, &source_str, &source_len, &ciErr);
903 if(opencl_error(ciErr))
906 double starttime = time_dt();
907 printf("Compiling OpenCL kernel ...\n");
908 /* TODO(sergey): Report which kernel is being compiled
909 * as well (megakernel or which of split kernels etc..).
911 printf("Build flags: %s\n", custom_kernel_build_options.c_str());
913 if(!build_kernel(kernel_program, custom_kernel_build_options, debug_src))
916 printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);
921 string device_md5_hash(string kernel_custom_build_options = "")
924 char version[256], driver[256], name[256], vendor[256];
926 clGetPlatformInfo(cpPlatform, CL_PLATFORM_VENDOR, sizeof(vendor), &vendor, NULL);
927 clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL);
928 clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(name), &name, NULL);
929 clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(driver), &driver, NULL);
931 md5.append((uint8_t*)vendor, strlen(vendor));
932 md5.append((uint8_t*)version, strlen(version));
933 md5.append((uint8_t*)name, strlen(name));
934 md5.append((uint8_t*)driver, strlen(driver));
936 string options = kernel_build_options();
937 options += kernel_custom_build_options;
938 md5.append((uint8_t*)options.c_str(), options.size());
940 return md5.get_hex();
943 bool load_kernels(const DeviceRequestedFeatures& requested_features)
945 /* Verify if device was initialized. */
946 if(!device_initialized) {
947 fprintf(stderr, "OpenCL: failed to initialize device.\n");
951 /* Try to use cached kernel. */
952 thread_scoped_lock cache_locker;
953 cpProgram = load_cached_kernel(requested_features,
954 OpenCLCache::OCL_DEV_BASE_PROGRAM,
958 VLOG(2) << "No cached OpenCL kernel.";
960 /* Verify we have right opencl version. */
961 if(!opencl_version_check())
964 string build_flags = build_options_for_base_program(requested_features);
966 /* Calculate md5 hashes to detect changes. */
967 string kernel_path = path_get("kernel");
968 string kernel_md5 = path_files_md5_hash(kernel_path);
969 string device_md5 = device_md5_hash(build_flags);
971 /* Path to cached binary.
973 * TODO(sergey): Seems we could de-duplicate all this string_printf()
974 * calls with some utility function which will give file name for a
977 string clbin = string_printf("cycles_kernel_%s_%s.clbin",
980 clbin = path_user_get(path_join("cache", clbin));
982 /* path to preprocessed source for debugging */
983 string clsrc, *debug_src = NULL;
985 if(opencl_kernel_use_debug()) {
986 clsrc = string_printf("cycles_kernel_%s_%s.cl",
989 clsrc = path_user_get(path_join("cache", clsrc));
993 /* If binary kernel exists already, try use it. */
994 if(path_exists(clbin) && load_binary(kernel_path,
998 /* Kernel loaded from binary, nothing to do. */
999 VLOG(2) << "Loaded kernel from " << clbin << ".";
1002 VLOG(2) << "Kernel file " << clbin << " either doesn't exist or failed to be loaded by driver.";
1003 string init_kernel_source = "#include \"kernels/opencl/kernel.cl\" // " + kernel_md5 + "\n";
1005 /* If does not exist or loading binary failed, compile kernel. */
1006 if(!compile_kernel(kernel_path,
1015 /* Save binary for reuse. */
1016 if(!save_binary(&cpProgram, clbin)) {
1021 /* Cache the program. */
1022 store_cached_kernel(cpPlatform,
1025 OpenCLCache::OCL_DEV_BASE_PROGRAM,
1029 VLOG(2) << "Found cached OpenCL kernel.";
1033 #define FIND_KERNEL(kernel_var, kernel_name) \
1035 kernel_var = clCreateKernel(cpProgram, "kernel_ocl_" kernel_name, &ciErr); \
1036 if(opencl_error(ciErr)) \
1040 FIND_KERNEL(ckFilmConvertByteKernel, "convert_to_byte");
1041 FIND_KERNEL(ckFilmConvertHalfFloatKernel, "convert_to_half_float");
1042 FIND_KERNEL(ckShaderKernel, "shader");
1043 FIND_KERNEL(ckBakeKernel, "bake");
1054 clReleaseMemObject(CL_MEM_PTR(null_mem));
1056 ConstMemMap::iterator mt;
1057 for(mt = const_mem_map.begin(); mt != const_mem_map.end(); mt++) {
1058 mem_free(*(mt->second));
1062 if(ckFilmConvertByteKernel)
1063 clReleaseKernel(ckFilmConvertByteKernel);
1064 if(ckFilmConvertHalfFloatKernel)
1065 clReleaseKernel(ckFilmConvertHalfFloatKernel);
1067 clReleaseKernel(ckShaderKernel);
1069 clReleaseKernel(ckBakeKernel);
1071 clReleaseProgram(cpProgram);
1073 clReleaseCommandQueue(cqCommandQueue);
1075 clReleaseContext(cxContext);
1078 void mem_alloc(device_memory& mem, MemoryType type)
1080 size_t size = mem.memory_size();
1082 cl_mem_flags mem_flag;
1083 void *mem_ptr = NULL;
1085 if(type == MEM_READ_ONLY)
1086 mem_flag = CL_MEM_READ_ONLY;
1087 else if(type == MEM_WRITE_ONLY)
1088 mem_flag = CL_MEM_WRITE_ONLY;
1090 mem_flag = CL_MEM_READ_WRITE;
1092 /* Zero-size allocation might be invoked by render, but not really
1093 * supported by OpenCL. Using NULL as device pointer also doesn't really
1094 * work for some reason, so for the time being we'll use special case
1095 * will null_mem buffer.
1098 mem.device_pointer = (device_ptr)clCreateBuffer(cxContext,
1103 opencl_assert_err(ciErr, "clCreateBuffer");
1106 mem.device_pointer = null_mem;
1109 stats.mem_alloc(size);
1110 mem.device_size = size;
1113 void mem_copy_to(device_memory& mem)
1115 /* this is blocking */
1116 size_t size = mem.memory_size();
1118 opencl_assert(clEnqueueWriteBuffer(cqCommandQueue,
1119 CL_MEM_PTR(mem.device_pointer),
1123 (void*)mem.data_pointer,
1129 void mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
1131 size_t offset = elem*y*w;
1132 size_t size = elem*w*h;
1134 opencl_assert(clEnqueueReadBuffer(cqCommandQueue,
1135 CL_MEM_PTR(mem.device_pointer),
1139 (uchar*)mem.data_pointer + offset,
1144 void mem_zero(device_memory& mem)
1146 if(mem.device_pointer) {
1147 memset((void*)mem.data_pointer, 0, mem.memory_size());
1152 void mem_free(device_memory& mem)
1154 if(mem.device_pointer) {
1155 if(mem.device_pointer != null_mem) {
1156 opencl_assert(clReleaseMemObject(CL_MEM_PTR(mem.device_pointer)));
1158 mem.device_pointer = 0;
1160 stats.mem_free(mem.device_size);
1161 mem.device_size = 0;
1165 void const_copy_to(const char *name, void *host, size_t size)
1167 ConstMemMap::iterator i = const_mem_map.find(name);
1169 if(i == const_mem_map.end()) {
1170 device_vector<uchar> *data = new device_vector<uchar>();
1171 data->copy((uchar*)host, size);
1173 mem_alloc(*data, MEM_READ_ONLY);
1174 i = const_mem_map.insert(ConstMemMap::value_type(name, data)).first;
1177 device_vector<uchar> *data = i->second;
1178 data->copy((uchar*)host, size);
1181 mem_copy_to(*i->second);
1184 void tex_alloc(const char *name,
1186 InterpolationType /*interpolation*/,
1187 ExtensionType /*extension*/)
1189 VLOG(1) << "Texture allocate: " << name << ", " << mem.memory_size() << " bytes.";
1190 mem_alloc(mem, MEM_READ_ONLY);
1192 assert(mem_map.find(name) == mem_map.end());
1193 mem_map.insert(MemMap::value_type(name, mem.device_pointer));
1196 void tex_free(device_memory& mem)
1198 if(mem.device_pointer) {
1199 foreach(const MemMap::value_type& value, mem_map) {
1200 if(value.second == mem.device_pointer) {
1201 mem_map.erase(value.first);
1210 size_t global_size_round_up(int group_size, int global_size)
1212 int r = global_size % group_size;
1213 return global_size + ((r == 0)? 0: group_size - r);
1216 void enqueue_kernel(cl_kernel kernel, size_t w, size_t h)
1218 size_t workgroup_size, max_work_items[3];
1220 clGetKernelWorkGroupInfo(kernel, cdDevice,
1221 CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL);
1222 clGetDeviceInfo(cdDevice,
1223 CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, max_work_items, NULL);
1225 /* try to divide evenly over 2 dimensions */
1226 size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1);
1227 size_t local_size[2] = {sqrt_workgroup_size, sqrt_workgroup_size};
1229 /* some implementations have max size 1 on 2nd dimension */
1230 if(local_size[1] > max_work_items[1]) {
1231 local_size[0] = workgroup_size/max_work_items[1];
1232 local_size[1] = max_work_items[1];
1235 size_t global_size[2] = {global_size_round_up(local_size[0], w), global_size_round_up(local_size[1], h)};
1238 opencl_assert(clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL));
1239 opencl_assert(clFlush(cqCommandQueue));
1242 void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name)
1246 MemMap::iterator i = mem_map.find(name);
1247 if(i != mem_map.end()) {
1248 ptr = CL_MEM_PTR(i->second);
1251 /* work around NULL not working, even though the spec says otherwise */
1252 ptr = CL_MEM_PTR(null_mem);
1255 opencl_assert(clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void*)&ptr));
1258 void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half)
1260 /* cast arguments to cl types */
1261 cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
1262 cl_mem d_rgba = (rgba_byte)? CL_MEM_PTR(rgba_byte): CL_MEM_PTR(rgba_half);
1263 cl_mem d_buffer = CL_MEM_PTR(buffer);
1264 cl_int d_x = task.x;
1265 cl_int d_y = task.y;
1266 cl_int d_w = task.w;
1267 cl_int d_h = task.h;
1268 cl_float d_sample_scale = 1.0f/(task.sample + 1);
1269 cl_int d_offset = task.offset;
1270 cl_int d_stride = task.stride;
1273 cl_kernel ckFilmConvertKernel = (rgba_byte)? ckFilmConvertByteKernel: ckFilmConvertHalfFloatKernel;
1275 cl_uint start_arg_index =
1276 kernel_set_args(ckFilmConvertKernel,
1282 #define KERNEL_TEX(type, ttype, name) \
1283 set_kernel_arg_mem(ckFilmConvertKernel, &start_arg_index, #name);
1284 #include "kernel_textures.h"
1287 start_arg_index += kernel_set_args(ckFilmConvertKernel,
1297 enqueue_kernel(ckFilmConvertKernel, d_w, d_h);
1300 void shader(DeviceTask& task)
1302 /* cast arguments to cl types */
1303 cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
1304 cl_mem d_input = CL_MEM_PTR(task.shader_input);
1305 cl_mem d_output = CL_MEM_PTR(task.shader_output);
1306 cl_mem d_output_luma = CL_MEM_PTR(task.shader_output_luma);
1307 cl_int d_shader_eval_type = task.shader_eval_type;
1308 cl_int d_shader_filter = task.shader_filter;
1309 cl_int d_shader_x = task.shader_x;
1310 cl_int d_shader_w = task.shader_w;
1311 cl_int d_offset = task.offset;
1315 if(task.shader_eval_type >= SHADER_EVAL_BAKE)
1316 kernel = ckBakeKernel;
1318 kernel = ckShaderKernel;
1320 for(int sample = 0; sample < task.num_samples; sample++) {
1322 if(task.get_cancel())
1325 cl_int d_sample = sample;
1327 cl_uint start_arg_index =
1328 kernel_set_args(kernel,
1334 if(task.shader_eval_type < SHADER_EVAL_BAKE) {
1335 start_arg_index += kernel_set_args(kernel,
1340 #define KERNEL_TEX(type, ttype, name) \
1341 set_kernel_arg_mem(kernel, &start_arg_index, #name);
1342 #include "kernel_textures.h"
1345 start_arg_index += kernel_set_args(kernel,
1347 d_shader_eval_type);
1348 if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
1349 start_arg_index += kernel_set_args(kernel,
1353 start_arg_index += kernel_set_args(kernel,
1360 enqueue_kernel(kernel, task.shader_w, 1);
1362 task.update_progress(NULL);
1366 class OpenCLDeviceTask : public DeviceTask {
1368 OpenCLDeviceTask(OpenCLDeviceBase *device, DeviceTask& task)
1371 run = function_bind(&OpenCLDeviceBase::thread_run,
1377 int get_split_task_count(DeviceTask& /*task*/)
1382 void task_add(DeviceTask& task)
1384 task_pool.push(new OpenCLDeviceTask(this, task));
1397 virtual void thread_run(DeviceTask * /*task*/) = 0;
/* Assemble the OpenCL program build options string for the current platform.
 * debug_src: optional path to preprocessed kernel source, used for gdb
 * source-level debugging on the Intel CPU platform. */
1400 string kernel_build_options(const string *debug_src = NULL)
1402 string build_options = "-cl-fast-relaxed-math ";
/* Per-platform defines, selected by matching the platform name string. */
1404 if(platform_name == "NVIDIA CUDA") {
1405 build_options += "-D__KERNEL_OPENCL_NVIDIA__ "
1406 "-cl-nv-maxrregcount=32 "
/* Query CUDA compute capability via NVIDIA's OpenCL device attribute
 * extension (cl_nv_device_attribute_query). */
1409 uint compute_capability_major, compute_capability_minor;
1410 clGetDeviceInfo(cdDevice, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,
1411 sizeof(cl_uint), &compute_capability_major, NULL);
1412 clGetDeviceInfo(cdDevice, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,
1413 sizeof(cl_uint), &compute_capability_minor, NULL);
/* Encode capability as a single number, e.g. 5.2 -> 520. */
1415 build_options += string_printf("-D__COMPUTE_CAPABILITY__=%u ",
1416 compute_capability_major * 100 +
1417 compute_capability_minor * 10);
1420 else if(platform_name == "Apple")
1421 build_options += "-D__KERNEL_OPENCL_APPLE__ ";
1423 else if(platform_name == "AMD Accelerated Parallel Processing")
1424 build_options += "-D__KERNEL_OPENCL_AMD__ ";
1426 else if(platform_name == "Intel(R) OpenCL") {
1427 build_options += "-D__KERNEL_OPENCL_INTEL_CPU__ ";
1429 /* Options for gdb source level kernel debugging.
1430 * this segfaults on linux currently.
1432 if(opencl_kernel_use_debug() && debug_src)
1433 build_options += "-g -s \"" + *debug_src + "\" ";
/* Debug define is independent of the per-platform options above. */
1436 if(opencl_kernel_use_debug())
1437 build_options += "-D__KERNEL_OPENCL_DEBUG__ ";
1439 #ifdef WITH_CYCLES_DEBUG
1440 build_options += "-D__KERNEL_DEBUG__ ";
1443 return build_options;
/* Type-erased kernel-argument holder used by kernel_set_args() below.
 * Stores the argument's size and a pointer to its storage; int and float
 * values are copied into the wrapper so the pointer remains valid. */
1446 class ArgumentWrapper {
/* Default-constructed wrapper means "no argument" (NULL pointer sentinel
 * checked by kernel_set_args). */
1448 ArgumentWrapper() : size(0), pointer(NULL) {}
/* Generic by-reference wrapper; caller keeps the argument alive. */
1449 template <typename T>
1450 ArgumentWrapper(T& argument) : size(sizeof(argument)),
1451 pointer(&argument) { }
/* Scalar overloads keep a private copy of the value. */
1452 ArgumentWrapper(int argument) : size(sizeof(int)),
1453 int_value(argument),
1454 pointer(&int_value) { }
1455 ArgumentWrapper(float argument) : size(sizeof(float)),
1456 float_value(argument),
1457 pointer(&float_value) { }
1464 /* TODO(sergey): In the future we can use variadic templates, once
1465 * C++0x is allowed. Should allow to clean this up a bit.
/* Set up to 33 kernel arguments, starting at start_argument_index.
 * Processing stops at the first default-constructed (empty) wrapper;
 * returns the number of arguments actually set, so calls can be chained
 * by adding the return value to the running argument index. */
1467 int kernel_set_args(cl_kernel kernel,
1468 int start_argument_index,
1469 const ArgumentWrapper& arg1 = ArgumentWrapper(),
1470 const ArgumentWrapper& arg2 = ArgumentWrapper(),
1471 const ArgumentWrapper& arg3 = ArgumentWrapper(),
1472 const ArgumentWrapper& arg4 = ArgumentWrapper(),
1473 const ArgumentWrapper& arg5 = ArgumentWrapper(),
1474 const ArgumentWrapper& arg6 = ArgumentWrapper(),
1475 const ArgumentWrapper& arg7 = ArgumentWrapper(),
1476 const ArgumentWrapper& arg8 = ArgumentWrapper(),
1477 const ArgumentWrapper& arg9 = ArgumentWrapper(),
1478 const ArgumentWrapper& arg10 = ArgumentWrapper(),
1479 const ArgumentWrapper& arg11 = ArgumentWrapper(),
1480 const ArgumentWrapper& arg12 = ArgumentWrapper(),
1481 const ArgumentWrapper& arg13 = ArgumentWrapper(),
1482 const ArgumentWrapper& arg14 = ArgumentWrapper(),
1483 const ArgumentWrapper& arg15 = ArgumentWrapper(),
1484 const ArgumentWrapper& arg16 = ArgumentWrapper(),
1485 const ArgumentWrapper& arg17 = ArgumentWrapper(),
1486 const ArgumentWrapper& arg18 = ArgumentWrapper(),
1487 const ArgumentWrapper& arg19 = ArgumentWrapper(),
1488 const ArgumentWrapper& arg20 = ArgumentWrapper(),
1489 const ArgumentWrapper& arg21 = ArgumentWrapper(),
1490 const ArgumentWrapper& arg22 = ArgumentWrapper(),
1491 const ArgumentWrapper& arg23 = ArgumentWrapper(),
1492 const ArgumentWrapper& arg24 = ArgumentWrapper(),
1493 const ArgumentWrapper& arg25 = ArgumentWrapper(),
1494 const ArgumentWrapper& arg26 = ArgumentWrapper(),
1495 const ArgumentWrapper& arg27 = ArgumentWrapper(),
1496 const ArgumentWrapper& arg28 = ArgumentWrapper(),
1497 const ArgumentWrapper& arg29 = ArgumentWrapper(),
1498 const ArgumentWrapper& arg30 = ArgumentWrapper(),
1499 const ArgumentWrapper& arg31 = ArgumentWrapper(),
1500 const ArgumentWrapper& arg32 = ArgumentWrapper(),
1501 const ArgumentWrapper& arg33 = ArgumentWrapper())
1503 int current_arg_index = 0;
/* Sets one argument via clSetKernelArg(), or returns early at the first
 * empty wrapper (NULL pointer). No comments may be inserted inside the
 * backslash-continued macro body below. */
1504 #define FAKE_VARARG_HANDLE_ARG(arg) \
1506 if(arg.pointer != NULL) { \
1507 opencl_assert(clSetKernelArg( \
1509 start_argument_index + current_arg_index, \
1510 arg.size, arg.pointer)); \
1511 ++current_arg_index; \
1514 return current_arg_index; \
1517 FAKE_VARARG_HANDLE_ARG(arg1);
1518 FAKE_VARARG_HANDLE_ARG(arg2);
1519 FAKE_VARARG_HANDLE_ARG(arg3);
1520 FAKE_VARARG_HANDLE_ARG(arg4);
1521 FAKE_VARARG_HANDLE_ARG(arg5);
1522 FAKE_VARARG_HANDLE_ARG(arg6);
1523 FAKE_VARARG_HANDLE_ARG(arg7);
1524 FAKE_VARARG_HANDLE_ARG(arg8);
1525 FAKE_VARARG_HANDLE_ARG(arg9);
1526 FAKE_VARARG_HANDLE_ARG(arg10);
1527 FAKE_VARARG_HANDLE_ARG(arg11);
1528 FAKE_VARARG_HANDLE_ARG(arg12);
1529 FAKE_VARARG_HANDLE_ARG(arg13);
1530 FAKE_VARARG_HANDLE_ARG(arg14);
1531 FAKE_VARARG_HANDLE_ARG(arg15);
1532 FAKE_VARARG_HANDLE_ARG(arg16);
1533 FAKE_VARARG_HANDLE_ARG(arg17);
1534 FAKE_VARARG_HANDLE_ARG(arg18);
1535 FAKE_VARARG_HANDLE_ARG(arg19);
1536 FAKE_VARARG_HANDLE_ARG(arg20);
1537 FAKE_VARARG_HANDLE_ARG(arg21);
1538 FAKE_VARARG_HANDLE_ARG(arg22);
1539 FAKE_VARARG_HANDLE_ARG(arg23);
1540 FAKE_VARARG_HANDLE_ARG(arg24);
1541 FAKE_VARARG_HANDLE_ARG(arg25);
1542 FAKE_VARARG_HANDLE_ARG(arg26);
1543 FAKE_VARARG_HANDLE_ARG(arg27);
1544 FAKE_VARARG_HANDLE_ARG(arg28);
1545 FAKE_VARARG_HANDLE_ARG(arg29);
1546 FAKE_VARARG_HANDLE_ARG(arg30);
1547 FAKE_VARARG_HANDLE_ARG(arg31);
1548 FAKE_VARARG_HANDLE_ARG(arg32);
1549 FAKE_VARARG_HANDLE_ARG(arg33);
1550 #undef FAKE_VARARG_HANDLE_ARG
1551 return current_arg_index;
/* Release an OpenCL kernel handle (guarded against invalid handles). */
1554 inline void release_kernel_safe(cl_kernel kernel)
1557 clReleaseKernel(kernel);
/* Release an OpenCL memory object handle (guarded against invalid handles). */
1561 inline void release_mem_object_safe(cl_mem mem)
1564 clReleaseMemObject(mem);
/* Release an OpenCL program handle (guarded against invalid handles). */
1568 inline void release_program_safe(cl_program program)
1571 clReleaseProgram(program);
1575 /* ** These helpers exist for working around some compiler-specific bugs ** */
/* Look up a previously-compiled program in the global OpenCLCache for this
 * platform/device; virtual so device implementations can override caching. */
1577 virtual cl_program load_cached_kernel(
1578 const DeviceRequestedFeatures& /*requested_features*/,
1579 OpenCLCache::ProgramName program_name,
1580 thread_scoped_lock& cache_locker)
1582 return OpenCLCache::get_program(cpPlatform,
/* Store a freshly-built program into the global OpenCLCache; cache_locker
 * is the lock obtained from the matching load_cached_kernel() call. */
1588 virtual void store_cached_kernel(cl_platform_id platform,
1589 cl_device_id device,
1591 OpenCLCache::ProgramName program_name,
1592 thread_scoped_lock& cache_locker)
1594 OpenCLCache::store_program(platform,
/* Extra build options for the base kernel program; virtual so device
 * implementations can supply feature-specific defines. */
1601 virtual string build_options_for_base_program(
1602 const DeviceRequestedFeatures& /*requested_features*/)
1604 /* TODO(sergey): By default we compile all features, meaning
1605 * mega kernel is not getting feature-based optimizations.
1607 * Ideally we would always compile the kernel with as few features
1608 * enabled as possible to keep performance at its max.
/* OpenCL device implementation using a single monolithic ("mega") path
 * tracing kernel, as opposed to the split-kernel implementation below. */
1614 class OpenCLDeviceMegaKernel : public OpenCLDeviceBase
1617 cl_kernel ckPathTraceKernel;
1618 cl_program path_trace_program;
1620 OpenCLDeviceMegaKernel(DeviceInfo& info, Stats &stats, bool background_)
1621 : OpenCLDeviceBase(info, stats, background_)
1623 ckPathTraceKernel = NULL;
1624 path_trace_program = NULL;
/* Compile or load-from-cache the megakernel program and create its kernel.
 * Returns false when base kernels or the megakernel fail to load. */
1627 bool load_kernels(const DeviceRequestedFeatures& requested_features)
1629 /* Get Shader, bake and film convert kernels.
1630 * It'll also do verification of OpenCL actually initialized.
1632 if(!OpenCLDeviceBase::load_kernels(requested_features)) {
1636 /* Try to use cached kernel. */
1637 thread_scoped_lock cache_locker;
1638 path_trace_program = OpenCLCache::get_program(cpPlatform,
1640 OpenCLCache::OCL_DEV_MEGAKERNEL_PROGRAM,
1643 if(!path_trace_program) {
1644 /* Verify we have right opencl version. */
1645 if(!opencl_version_check())
1648 /* Calculate md5 hash to detect changes. */
1649 string kernel_path = path_get("kernel");
1650 string kernel_md5 = path_files_md5_hash(kernel_path);
1651 string custom_kernel_build_options = "-D__COMPILE_ONLY_MEGAKERNEL__ ";
1652 string device_md5 = device_md5_hash(custom_kernel_build_options);
1654 /* Path to cached binary. */
1655 string clbin = string_printf("cycles_kernel_%s_%s.clbin",
1657 kernel_md5.c_str());
1658 clbin = path_user_get(path_join("cache", clbin));
1660 /* Path to preprocessed source for debugging. */
1661 string clsrc, *debug_src = NULL;
1662 if(opencl_kernel_use_debug()) {
1663 clsrc = string_printf("cycles_kernel_%s_%s.cl",
1665 kernel_md5.c_str());
1666 clsrc = path_user_get(path_join("cache", clsrc));
1670 /* If exists already, try use it. */
1671 if(path_exists(clbin) && load_binary(kernel_path,
1673 custom_kernel_build_options,
1674 &path_trace_program,
1676 /* Kernel loaded from binary, nothing to do. */
1679 string init_kernel_source = "#include \"kernels/opencl/kernel.cl\" // " +
1681 /* If does not exist or loading binary failed, compile kernel. */
1682 if(!compile_kernel(kernel_path,
1684 custom_kernel_build_options,
1685 &path_trace_program,
1690 /* Save binary for reuse. */
1691 if(!save_binary(&path_trace_program, clbin)) {
1695 /* Cache the program. */
1696 OpenCLCache::store_program(cpPlatform,
1699 OpenCLCache::OCL_DEV_MEGAKERNEL_PROGRAM,
/* Create the single path trace kernel from the (cached or new) program. */
1704 ckPathTraceKernel = clCreateKernel(path_trace_program,
1705 "kernel_ocl_path_trace",
1707 if(opencl_error(ciErr))
1712 ~OpenCLDeviceMegaKernel()
1715 release_kernel_safe(ckPathTraceKernel);
1716 release_program_safe(path_trace_program);
/* Launch one sample of the megakernel over the whole tile. */
1719 void path_trace(RenderTile& rtile, int sample)
1721 /* Cast arguments to cl types. */
1722 cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
1723 cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
1724 cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state);
1725 cl_int d_x = rtile.x;
1726 cl_int d_y = rtile.y;
1727 cl_int d_w = rtile.w;
1728 cl_int d_h = rtile.h;
1729 cl_int d_offset = rtile.offset;
1730 cl_int d_stride = rtile.stride;
1732 /* Sample arguments. */
1733 cl_int d_sample = sample;
/* Bind kernel arguments: data, buffers, then one argument per texture
 * generated from kernel_textures.h, then the tile parameters. */
1735 cl_uint start_arg_index =
1736 kernel_set_args(ckPathTraceKernel,
1742 #define KERNEL_TEX(type, ttype, name) \
1743 set_kernel_arg_mem(ckPathTraceKernel, &start_arg_index, #name);
1744 #include "kernel_textures.h"
1747 start_arg_index += kernel_set_args(ckPathTraceKernel,
1757 enqueue_kernel(ckPathTraceKernel, d_w, d_h);
/* Task dispatch entry point: film convert, shader eval, or path trace. */
1760 void thread_run(DeviceTask *task)
1762 if(task->type == DeviceTask::FILM_CONVERT) {
1763 film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half);
1765 else if(task->type == DeviceTask::SHADER) {
1768 else if(task->type == DeviceTask::PATH_TRACE) {
1770 /* Keep rendering tiles until done. */
1771 while(task->acquire_tile(this, tile)) {
1772 int start_sample = tile.start_sample;
1773 int end_sample = tile.start_sample + tile.num_samples;
1775 for(int sample = start_sample; sample < end_sample; sample++) {
1776 if(task->get_cancel()) {
1777 if(task->need_finish_queue == false)
1781 path_trace(tile, sample);
/* Progress reports the next sample to be rendered. */
1783 tile.sample = sample + 1;
1785 task->update_progress(&tile);
1788 /* Complete kernel execution before release tile */
1789 /* This helps in multi-device render;
1790 * The device that reaches the critical-section function
1791 * release_tile waits (stalling other devices from entering
1792 * release_tile) for all kernels to complete. If device1 (a
1793 * slow-render device) reaches release_tile first then it would
1794 * stall device2 (a fast-render device) from proceeding to render
1797 clFinish(cqCommandQueue);
1799 task->release_tile(tile);
1805 /* TODO(sergey): This is to keep tile split on OpenCL level working
1806 * for now, since without this view-port render does not work as it
1809 * Ideally it'll be done on the higher level, but we need to get ready
1810 * for merge rather soon, so let's keep split logic private here in
/* RenderTile extended with sub-tile offsets used when the split kernel
 * has to divide a large tile into smaller device-feasible pieces. */
1813 class SplitRenderTile : public RenderTile {
1819 rng_state_offset_x(0),
1820 rng_state_offset_y(0),
1821 buffer_rng_state_stride(0) {}
/* Copy all tile parameters from a regular RenderTile; the split-specific
 * offsets start at zero and are filled in by the split logic. */
1823 explicit SplitRenderTile(RenderTile& tile)
1827 rng_state_offset_x(0),
1828 rng_state_offset_y(0),
1829 buffer_rng_state_stride(0)
1835 start_sample = tile.start_sample;
1836 num_samples = tile.num_samples;
1837 sample = tile.sample;
1838 resolution = tile.resolution;
1839 offset = tile.offset;
1840 stride = tile.stride;
1841 buffer = tile.buffer;
1842 rng_state = tile.rng_state;
1843 buffers = tile.buffers;
1846 /* The split kernel is device global memory constrained;
1847 * hence the split kernel can't render big tile sizes in
1848 * one go. If the user sets a big tile size (big tile size
1849 * is a term relative to the available device global memory),
1850 * we split the tile further and then call path_trace on
1851 * each of those split tiles. The following variables
1852 * assist in achieving that purpose.
1854 int buffer_offset_x;
1855 int buffer_offset_y;
1856 int rng_state_offset_x;
1857 int rng_state_offset_y;
1858 int buffer_rng_state_stride;
1861 /* OpenCLDeviceSplitKernel's declaration/definition. */
/* Split-kernel OpenCL device: the path tracer is broken into many small
 * kernels which communicate through the global "co-op" buffers declared
 * below. (Class definition continues past this declaration section.) */
1862 class OpenCLDeviceSplitKernel : public OpenCLDeviceBase
1865 /* Kernel declaration. */
1866 cl_kernel ckPathTraceKernel_data_init;
1867 cl_kernel ckPathTraceKernel_scene_intersect;
1868 cl_kernel ckPathTraceKernel_lamp_emission;
1869 cl_kernel ckPathTraceKernel_queue_enqueue;
1870 cl_kernel ckPathTraceKernel_background_buffer_update;
1871 cl_kernel ckPathTraceKernel_shader_eval;
1872 cl_kernel ckPathTraceKernel_holdout_emission_blurring_pathtermination_ao;
1873 cl_kernel ckPathTraceKernel_direct_lighting;
1874 cl_kernel ckPathTraceKernel_shadow_blocked;
1875 cl_kernel ckPathTraceKernel_next_iteration_setup;
1876 cl_kernel ckPathTraceKernel_sum_all_radiance;
/* One program per kernel above; each is compiled/cached separately. */
1878 /* cl_program declaration. */
1879 cl_program data_init_program;
1880 cl_program scene_intersect_program;
1881 cl_program lamp_emission_program;
1882 cl_program queue_enqueue_program;
1883 cl_program background_buffer_update_program;
1884 cl_program shader_eval_program;
1885 cl_program holdout_emission_blurring_pathtermination_ao_program;
1886 cl_program direct_lighting_program;
1887 cl_program shadow_blocked_program;
1888 cl_program next_iteration_setup_program;
1889 cl_program sum_all_radiance_program;
1891 /* Global memory variables [porting]; These memory is used for
1892 * co-operation between different kernels; Data written by one
1893 * kernel will be available to another kernel via this global
1897 cl_mem throughput_coop;
1898 cl_mem L_transparent_coop;
1899 cl_mem PathRadiance_coop;
1901 cl_mem PathState_coop;
1902 cl_mem Intersection_coop;
1903 cl_mem kgbuffer; /* KernelGlobals buffer. */
1905 /* Global buffers for ShaderData. */
1906 cl_mem sd; /* ShaderData used in the main path-iteration loop. */
1907 cl_mem sd_DL_shadow; /* ShaderData used in Direct Lighting and
1908 * shadow_blocked kernel.
/* The _DL_shadow variants below are sized 2x (see path_trace allocation)
 * since direct lighting and shadow rays are processed per path ray. */
1911 /* Global buffers of each member of ShaderData. */
1913 cl_mem P_sd_DL_shadow;
1915 cl_mem N_sd_DL_shadow;
1917 cl_mem Ng_sd_DL_shadow;
1919 cl_mem I_sd_DL_shadow;
1921 cl_mem shader_sd_DL_shadow;
1923 cl_mem flag_sd_DL_shadow;
1925 cl_mem prim_sd_DL_shadow;
1927 cl_mem type_sd_DL_shadow;
1929 cl_mem u_sd_DL_shadow;
1931 cl_mem v_sd_DL_shadow;
1933 cl_mem object_sd_DL_shadow;
1935 cl_mem time_sd_DL_shadow;
1936 cl_mem ray_length_sd;
1937 cl_mem ray_length_sd_DL_shadow;
1939 /* Ray differentials. */
1940 cl_mem dP_sd, dI_sd;
1941 cl_mem dP_sd_DL_shadow, dI_sd_DL_shadow;
1942 cl_mem du_sd, dv_sd;
1943 cl_mem du_sd_DL_shadow, dv_sd_DL_shadow;
1946 cl_mem dPdu_sd, dPdv_sd;
1947 cl_mem dPdu_sd_DL_shadow, dPdv_sd_DL_shadow;
1949 /* Object motion. */
1950 cl_mem ob_tfm_sd, ob_itfm_sd;
1951 cl_mem ob_tfm_sd_DL_shadow, ob_itfm_sd_DL_shadow;
1954 cl_mem closure_sd_DL_shadow;
1955 cl_mem num_closure_sd;
1956 cl_mem num_closure_sd_DL_shadow;
1957 cl_mem randb_closure_sd;
1958 cl_mem randb_closure_sd_DL_shadow;
1960 cl_mem ray_P_sd_DL_shadow;
1962 cl_mem ray_dP_sd_DL_shadow;
1964 /* Global memory required for shadow blocked and accum_radiance. */
1965 cl_mem BSDFEval_coop;
1967 cl_mem LightRay_coop;
1968 cl_mem AOAlpha_coop;
1970 cl_mem AOLightRay_coop;
1971 cl_mem Intersection_coop_AO;
1972 cl_mem Intersection_coop_DL;
1974 #ifdef WITH_CYCLES_DEBUG
1975 /* DebugData memory */
1976 cl_mem debugdata_coop;
1979 /* Global state array that tracks ray state. */
1982 /* Per sample buffers. */
1983 cl_mem per_sample_output_buffers;
1985 /* Denotes which sample each ray is being processed for. */
1989 cl_mem Queue_data; /* Array of size queuesize * num_queues * sizeof(int). */
1990 cl_mem Queue_index; /* Array of size num_queues * sizeof(int);
1991 * Tracks the size of each queue.
1994 /* Flag to make sceneintersect and lampemission kernel use queues. */
1995 cl_mem use_queues_flag;
1997 /* Amount of memory in output buffer associated with one pixel/thread. */
1998 size_t per_thread_output_buffer_size;
2000 /* Total allocatable available device memory. */
2001 size_t total_allocatable_memory;
2003 /* host version of ray_state; Used in checking host path-iteration
2006 char *hostRayStateArray;
2008 /* Number of path-iterations to be done in one shot. */
2009 unsigned int PathIteration_times;
2011 #ifdef __WORK_STEALING__
2012 /* Work pool with respect to each work group. */
2013 cl_mem work_pool_wgs;
2015 /* Denotes the maximum work groups possible w.r.t. current tile size. */
2016 unsigned int max_work_groups;
2019 /* clos_max value for which the kernels have been loaded currently. */
2020 int current_max_closure;
2022 /* Marked True in constructor and marked false at the end of path_trace(). */
/* Initialize every kernel, program and cl_mem handle to NULL so the
 * destructor can release unconditionally, then query the device's
 * maximum single-allocation size. */
2025 OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_)
2026 : OpenCLDeviceBase(info, stats, background_)
2028 background = background_;
2030 /* Initialize kernels. */
2031 ckPathTraceKernel_data_init = NULL;
2032 ckPathTraceKernel_scene_intersect = NULL;
2033 ckPathTraceKernel_lamp_emission = NULL;
2034 ckPathTraceKernel_background_buffer_update = NULL;
2035 ckPathTraceKernel_shader_eval = NULL;
2036 ckPathTraceKernel_holdout_emission_blurring_pathtermination_ao = NULL;
2037 ckPathTraceKernel_direct_lighting = NULL;
2038 ckPathTraceKernel_shadow_blocked = NULL;
2039 ckPathTraceKernel_next_iteration_setup = NULL;
2040 ckPathTraceKernel_sum_all_radiance = NULL;
2041 ckPathTraceKernel_queue_enqueue = NULL;
2043 /* Initialize program. */
2044 data_init_program = NULL;
2045 scene_intersect_program = NULL;
2046 lamp_emission_program = NULL;
2047 queue_enqueue_program = NULL;
2048 background_buffer_update_program = NULL;
2049 shader_eval_program = NULL;
2050 holdout_emission_blurring_pathtermination_ao_program = NULL;
2051 direct_lighting_program = NULL;
2052 shadow_blocked_program = NULL;
2053 next_iteration_setup_program = NULL;
2054 sum_all_radiance_program = NULL;
2056 /* Initialize cl_mem variables. */
2059 sd_DL_shadow = NULL;
2062 P_sd_DL_shadow = NULL;
2064 N_sd_DL_shadow = NULL;
2066 Ng_sd_DL_shadow = NULL;
2068 I_sd_DL_shadow = NULL;
2070 shader_sd_DL_shadow = NULL;
2072 flag_sd_DL_shadow = NULL;
2074 prim_sd_DL_shadow = NULL;
2076 type_sd_DL_shadow = NULL;
2078 u_sd_DL_shadow = NULL;
2080 v_sd_DL_shadow = NULL;
2082 object_sd_DL_shadow = NULL;
2084 time_sd_DL_shadow = NULL;
2085 ray_length_sd = NULL;
2086 ray_length_sd_DL_shadow = NULL;
2088 /* Ray differentials. */
2091 dP_sd_DL_shadow = NULL;
2092 dI_sd_DL_shadow = NULL;
2095 du_sd_DL_shadow = NULL;
2096 dv_sd_DL_shadow = NULL;
2101 dPdu_sd_DL_shadow = NULL;
2102 dPdv_sd_DL_shadow = NULL;
2104 /* Object motion. */
2107 ob_tfm_sd_DL_shadow = NULL;
2108 ob_itfm_sd_DL_shadow = NULL;
2111 closure_sd_DL_shadow = NULL;
2112 num_closure_sd = NULL;
2113 num_closure_sd_DL_shadow = NULL;
2114 randb_closure_sd = NULL;
2115 randb_closure_sd_DL_shadow = NULL;
2117 ray_P_sd_DL_shadow = NULL;
2119 ray_dP_sd_DL_shadow = NULL;
2122 throughput_coop = NULL;
2123 L_transparent_coop = NULL;
2124 PathRadiance_coop = NULL;
2126 PathState_coop = NULL;
2127 Intersection_coop = NULL;
2130 AOAlpha_coop = NULL;
2132 AOLightRay_coop = NULL;
2133 BSDFEval_coop = NULL;
2135 LightRay_coop = NULL;
2136 Intersection_coop_AO = NULL;
2137 Intersection_coop_DL = NULL;
2139 #ifdef WITH_CYCLES_DEBUG
2140 debugdata_coop = NULL;
2148 use_queues_flag = NULL;
2150 per_sample_output_buffers = NULL;
2152 per_thread_output_buffer_size = 0;
2153 hostRayStateArray = NULL;
2154 PathIteration_times = PATH_ITER_INC_FACTOR;
2155 #ifdef __WORK_STEALING__
2156 work_pool_wgs = NULL;
2157 max_work_groups = 0;
/* -1 marks "kernels not loaded yet"; set by load_kernels(). */
2159 current_max_closure = -1;
2162 /* Get device's maximum memory that can be allocated. */
2163 ciErr = clGetDeviceInfo(cdDevice,
2164 CL_DEVICE_MAX_MEM_ALLOC_SIZE,
2166 &total_allocatable_memory,
2168 assert(ciErr == CL_SUCCESS);
2169 if(platform_name == "AMD Accelerated Parallel Processing") {
2170 /* This value is tweak-able; AMD platform does not seem to
2171 * give maximum performance when all of CL_DEVICE_MAX_MEM_ALLOC_SIZE
2172 * is considered for further computation.
2174 total_allocatable_memory /= 2;
2178 /* TODO(sergey): Seems really close to load_kernel(),
2179 * could it be de-duplicated?
/* Load one split-kernel program: try the cached binary at clbin first,
 * otherwise compile from source and save the binary for reuse.
 * Returns false on version-check or compile failure. */
2181 bool load_split_kernel(string kernel_path,
2182 string kernel_init_source,
2184 string custom_kernel_build_options,
2185 cl_program *program,
2186 const string *debug_src = NULL)
2188 if(!opencl_version_check())
2191 clbin = path_user_get(path_join("cache", clbin));
2193 /* If exists already, try use it. */
2194 if(path_exists(clbin) && load_binary(kernel_path,
2196 custom_kernel_build_options,
2199 /* Kernel loaded from binary. */
2202 /* If does not exist or loading binary failed, compile kernel. */
2203 if(!compile_kernel(kernel_path,
2205 custom_kernel_build_options,
2211 /* Save binary for reuse. */
2212 if(!save_binary(program, clbin)) {
2219 /* Split kernel utility functions. */
/* Return the size in bytes of a texture previously stored in mem_map,
 * queried via clGetMemObjectInfo; 0 when the texture is not present. */
2220 size_t get_tex_size(const char *tex_name)
2223 size_t ret_size = 0;
2224 MemMap::iterator i = mem_map.find(tex_name);
2225 if(i != mem_map.end()) {
2226 ptr = CL_MEM_PTR(i->second);
2227 ciErr = clGetMemObjectInfo(ptr,
2232 assert(ciErr == CL_SUCCESS);
/* Bytes needed for a ShaderClosure array holding max_closure entries. */
2237 size_t get_shader_closure_size(int max_closure)
2239 return (sizeof(ShaderClosure) * max_closure);
/* Size of ShaderData with its closure array replaced by one of
 * shader_closure_size bytes instead of the compile-time MAX_CLOSURE. */
2242 size_t get_shader_data_size(size_t shader_closure_size)
2244 /* ShaderData size without accounting for ShaderClosure array. */
2245 size_t shader_data_size =
2246 sizeof(ShaderData) - (sizeof(ShaderClosure) * MAX_CLOSURE);
2247 return (shader_data_size + shader_closure_size);
2250 /* Returns size of KernelGlobals structure associated with OpenCL. */
/* Size of the OpenCL KernelGlobals struct, computed from a local mirror
 * of the layout in kernel_globals.h (must be kept in sync with it). */
2251 size_t get_KernelGlobals_size()
2253 /* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to
2256 typedef struct KernelGlobals {
2257 ccl_constant KernelData *data;
2258 #define KERNEL_TEX(type, ttype, name) \
2259 ccl_global type *name;
2260 #include "kernel_textures.h"
2264 return sizeof(KernelGlobals);
2267 /* Returns size of the structure-of-arrays implementation of ShaderData. */
/* Count one pointer per ShaderData member listed in
 * kernel_shaderdata_vars.h; this is the size of the SOA header. */
2268 size_t get_shaderdata_soa_size()
2270 size_t shader_soa_size = 0;
2272 #define SD_VAR(type, what) shader_soa_size += sizeof(void *);
2273 #define SD_CLOSURE_VAR(type, what, max_closure) shader_soa_size += sizeof(void *);
2274 #include "kernel_shaderdata_vars.h"
2276 #undef SD_CLOSURE_VAR
2278 return shader_soa_size;
/* Compile or load-from-cache every split-kernel program and create the
 * corresponding kernel objects. Returns false when the base kernels or
 * any split kernel fails to load.
 * Fix: error message typo "Faled" -> "Failed". */
2281 bool load_kernels(const DeviceRequestedFeatures& requested_features)
2283 /* Get Shader, bake and film_convert kernels.
2284 * It'll also do verification of OpenCL actually initialized.
2286 if(!OpenCLDeviceBase::load_kernels(requested_features)) {
2290 string kernel_path = path_get("kernel");
2291 string kernel_md5 = path_files_md5_hash(kernel_path);
2293 string kernel_init_source;
2295 string clsrc, *debug_src = NULL;
/* Build options shared by every split kernel. */
2297 string build_options = "-D__SPLIT_KERNEL__ ";
2298 #ifdef __WORK_STEALING__
2299 build_options += "-D__WORK_STEALING__ ";
2301 build_options += requested_features.get_build_options();
2303 /* Set compute device build option. */
2304 cl_device_type device_type;
2305 ciErr = clGetDeviceInfo(cdDevice,
2307 sizeof(cl_device_type),
2310 assert(ciErr == CL_SUCCESS);
2311 if(device_type == CL_DEVICE_TYPE_GPU) {
2312 build_options += " -D__COMPUTE_DEVICE_GPU__";
/* LOAD_KERNEL builds the per-kernel cache-binary / debug-source paths
 * and loads or compiles the program named <name>_program. */
2315 #define GLUE(a, b) a ## b
2316 #define LOAD_KERNEL(name) \
2318 kernel_init_source = "#include \"kernels/opencl/kernel_" #name ".cl\" // " + \
2319 kernel_md5 + "\n"; \
2320 device_md5 = device_md5_hash(build_options); \
2321 clbin = string_printf("cycles_kernel_%s_%s_" #name ".clbin", \
2322 device_md5.c_str(), kernel_md5.c_str()); \
2323 if(opencl_kernel_use_debug()) { \
2324 clsrc = string_printf("cycles_kernel_%s_%s_" #name ".cl", \
2325 device_md5.c_str(), kernel_md5.c_str()); \
2326 clsrc = path_user_get(path_join("cache", clsrc)); \
2327 debug_src = &clsrc; \
2329 if(!load_split_kernel(kernel_path, kernel_init_source, clbin, \
2331 &GLUE(name, _program), \
2334 fprintf(stderr, "Failed to compile %s\n", #name); \
2339 LOAD_KERNEL(data_init);
2340 LOAD_KERNEL(scene_intersect);
2341 LOAD_KERNEL(lamp_emission);
2342 LOAD_KERNEL(queue_enqueue);
2343 LOAD_KERNEL(background_buffer_update);
2344 LOAD_KERNEL(shader_eval);
2345 LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao);
2346 LOAD_KERNEL(direct_lighting);
2347 LOAD_KERNEL(shadow_blocked);
2348 LOAD_KERNEL(next_iteration_setup);
2349 LOAD_KERNEL(sum_all_radiance);
/* FIND_KERNEL creates ckPathTraceKernel_<name> from <name>_program. */
2353 #define FIND_KERNEL(name) \
2355 GLUE(ckPathTraceKernel_, name) = \
2356 clCreateKernel(GLUE(name, _program), \
2357 "kernel_ocl_path_trace_" #name, &ciErr); \
2358 if(opencl_error(ciErr)) { \
2359 fprintf(stderr,"Missing kernel kernel_ocl_path_trace_%s\n", #name); \
2364 FIND_KERNEL(data_init);
2365 FIND_KERNEL(scene_intersect);
2366 FIND_KERNEL(lamp_emission);
2367 FIND_KERNEL(queue_enqueue);
2368 FIND_KERNEL(background_buffer_update);
2369 FIND_KERNEL(shader_eval);
2370 FIND_KERNEL(holdout_emission_blurring_pathtermination_ao);
2371 FIND_KERNEL(direct_lighting);
2372 FIND_KERNEL(shadow_blocked);
2373 FIND_KERNEL(next_iteration_setup);
2374 FIND_KERNEL(sum_all_radiance);
/* Remember the closure count these kernels were compiled for. */
2378 current_max_closure = requested_features.max_closure;
/* Release all split-kernel objects, device memory buffers and programs,
 * and free the host-side ray state array. All handles were initialized
 * to NULL in the constructor, so unconditional release is safe here. */
2383 ~OpenCLDeviceSplitKernel()
2387 /* Release kernels */
2388 release_kernel_safe(ckPathTraceKernel_data_init);
2389 release_kernel_safe(ckPathTraceKernel_scene_intersect);
2390 release_kernel_safe(ckPathTraceKernel_lamp_emission);
2391 release_kernel_safe(ckPathTraceKernel_queue_enqueue);
2392 release_kernel_safe(ckPathTraceKernel_background_buffer_update);
2393 release_kernel_safe(ckPathTraceKernel_shader_eval);
2394 release_kernel_safe(ckPathTraceKernel_holdout_emission_blurring_pathtermination_ao);
2395 release_kernel_safe(ckPathTraceKernel_direct_lighting);
2396 release_kernel_safe(ckPathTraceKernel_shadow_blocked);
2397 release_kernel_safe(ckPathTraceKernel_next_iteration_setup);
2398 release_kernel_safe(ckPathTraceKernel_sum_all_radiance);
2400 /* Release global memory */
2401 release_mem_object_safe(P_sd);
2402 release_mem_object_safe(P_sd_DL_shadow);
2403 release_mem_object_safe(N_sd);
2404 release_mem_object_safe(N_sd_DL_shadow);
2405 release_mem_object_safe(Ng_sd);
2406 release_mem_object_safe(Ng_sd_DL_shadow);
2407 release_mem_object_safe(I_sd);
2408 release_mem_object_safe(I_sd_DL_shadow);
2409 release_mem_object_safe(shader_sd);
2410 release_mem_object_safe(shader_sd_DL_shadow);
2411 release_mem_object_safe(flag_sd);
2412 release_mem_object_safe(flag_sd_DL_shadow);
2413 release_mem_object_safe(prim_sd);
2414 release_mem_object_safe(prim_sd_DL_shadow);
2415 release_mem_object_safe(type_sd);
2416 release_mem_object_safe(type_sd_DL_shadow);
2417 release_mem_object_safe(u_sd);
2418 release_mem_object_safe(u_sd_DL_shadow);
2419 release_mem_object_safe(v_sd);
2420 release_mem_object_safe(v_sd_DL_shadow);
2421 release_mem_object_safe(object_sd);
2422 release_mem_object_safe(object_sd_DL_shadow);
2423 release_mem_object_safe(time_sd);
2424 release_mem_object_safe(time_sd_DL_shadow);
2425 release_mem_object_safe(ray_length_sd);
2426 release_mem_object_safe(ray_length_sd_DL_shadow);
2428 /* Ray differentials. */
2429 release_mem_object_safe(dP_sd);
2430 release_mem_object_safe(dP_sd_DL_shadow);
2431 release_mem_object_safe(dI_sd);
2432 release_mem_object_safe(dI_sd_DL_shadow);
2433 release_mem_object_safe(du_sd);
2434 release_mem_object_safe(du_sd_DL_shadow);
2435 release_mem_object_safe(dv_sd);
2436 release_mem_object_safe(dv_sd_DL_shadow);
2439 release_mem_object_safe(dPdu_sd);
2440 release_mem_object_safe(dPdu_sd_DL_shadow);
2441 release_mem_object_safe(dPdv_sd);
2442 release_mem_object_safe(dPdv_sd_DL_shadow);
2444 /* Object motion. */
2445 release_mem_object_safe(ob_tfm_sd);
2446 release_mem_object_safe(ob_itfm_sd);
2448 release_mem_object_safe(ob_tfm_sd_DL_shadow);
2449 release_mem_object_safe(ob_itfm_sd_DL_shadow);
2451 release_mem_object_safe(closure_sd);
2452 release_mem_object_safe(closure_sd_DL_shadow);
2453 release_mem_object_safe(num_closure_sd);
2454 release_mem_object_safe(num_closure_sd_DL_shadow);
2455 release_mem_object_safe(randb_closure_sd);
2456 release_mem_object_safe(randb_closure_sd_DL_shadow);
2457 release_mem_object_safe(ray_P_sd);
2458 release_mem_object_safe(ray_P_sd_DL_shadow);
2459 release_mem_object_safe(ray_dP_sd);
2460 release_mem_object_safe(ray_dP_sd_DL_shadow);
/* Per-ray cooperative buffers shared between split kernels. */
2461 release_mem_object_safe(rng_coop);
2462 release_mem_object_safe(throughput_coop);
2463 release_mem_object_safe(L_transparent_coop);
2464 release_mem_object_safe(PathRadiance_coop);
2465 release_mem_object_safe(Ray_coop);
2466 release_mem_object_safe(PathState_coop);
2467 release_mem_object_safe(Intersection_coop);
2468 release_mem_object_safe(kgbuffer);
2469 release_mem_object_safe(sd);
2470 release_mem_object_safe(sd_DL_shadow);
2471 release_mem_object_safe(ray_state);
2472 release_mem_object_safe(AOAlpha_coop);
2473 release_mem_object_safe(AOBSDF_coop);
2474 release_mem_object_safe(AOLightRay_coop);
2475 release_mem_object_safe(BSDFEval_coop);
2476 release_mem_object_safe(ISLamp_coop);
2477 release_mem_object_safe(LightRay_coop);
2478 release_mem_object_safe(Intersection_coop_AO);
2479 release_mem_object_safe(Intersection_coop_DL);
2480 #ifdef WITH_CYCLES_DEBUG
2481 release_mem_object_safe(debugdata_coop);
2483 release_mem_object_safe(use_queues_flag);
2484 release_mem_object_safe(Queue_data);
2485 release_mem_object_safe(Queue_index);
2486 release_mem_object_safe(work_array);
2487 #ifdef __WORK_STEALING__
2488 release_mem_object_safe(work_pool_wgs);
2490 release_mem_object_safe(per_sample_output_buffers);
2492 /* Release programs */
2493 release_program_safe(data_init_program);
2494 release_program_safe(scene_intersect_program);
2495 release_program_safe(lamp_emission_program);
2496 release_program_safe(queue_enqueue_program);
2497 release_program_safe(background_buffer_update_program);
2498 release_program_safe(shader_eval_program);
2499 release_program_safe(holdout_emission_blurring_pathtermination_ao_program);
2500 release_program_safe(direct_lighting_program);
2501 release_program_safe(shadow_blocked_program);
2502 release_program_safe(next_iteration_setup_program);
2503 release_program_safe(sum_all_radiance_program);
/* hostRayStateArray was allocated with malloc/calloc, hence free(). */
2505 if(hostRayStateArray != NULL) {
2506 free(hostRayStateArray);
2510 void path_trace(SplitRenderTile& rtile, int2 max_render_feasible_tile_size)
2512 /* cast arguments to cl types */
2513 cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
2514 cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
2515 cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state);
2516 cl_int d_x = rtile.x;
2517 cl_int d_y = rtile.y;
2518 cl_int d_w = rtile.w;
2519 cl_int d_h = rtile.h;
2520 cl_int d_offset = rtile.offset;
2521 cl_int d_stride = rtile.stride;
2523 /* Make sure that set render feasible tile size is a multiple of local
2524 * work size dimensions.
2526 assert(max_render_feasible_tile_size.x % SPLIT_KERNEL_LOCAL_SIZE_X == 0);
2527 assert(max_render_feasible_tile_size.y % SPLIT_KERNEL_LOCAL_SIZE_Y == 0);
2529 size_t global_size[2];
2530 size_t local_size[2] = {SPLIT_KERNEL_LOCAL_SIZE_X,
2531 SPLIT_KERNEL_LOCAL_SIZE_Y};
2533 /* Set the range of samples to be processed for every ray in
2534 * path-regeneration logic.
2536 cl_int start_sample = rtile.start_sample;
2537 cl_int end_sample = rtile.start_sample + rtile.num_samples;
2538 cl_int num_samples = rtile.num_samples;
2540 #ifdef __WORK_STEALING__
2541 global_size[0] = (((d_w - 1) / local_size[0]) + 1) * local_size[0];
2542 global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1];
2543 unsigned int num_parallel_samples = 1;
2545 global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1];
2546 unsigned int num_threads = max_render_feasible_tile_size.x *
2547 max_render_feasible_tile_size.y;
2548 unsigned int num_tile_columns_possible = num_threads / global_size[1];
2549 /* Estimate number of parallel samples that can be
2550 * processed in parallel.
2552 unsigned int num_parallel_samples = min(num_tile_columns_possible / d_w,
2554 /* Wavefront size in AMD is 64.
2555 * TODO(sergey): What about other platforms?
2557 if(num_parallel_samples >= 64) {
2558 /* TODO(sergey): Could use generic round-up here. */
2559 num_parallel_samples = (num_parallel_samples / 64) * 64;
2561 assert(num_parallel_samples != 0);
2563 global_size[0] = d_w * num_parallel_samples;
2564 #endif /* __WORK_STEALING__ */
2566 assert(global_size[0] * global_size[1] <=
2567 max_render_feasible_tile_size.x * max_render_feasible_tile_size.y);
2569 /* Allocate all required global memory once. */
2571 size_t num_global_elements = max_render_feasible_tile_size.x *
2572 max_render_feasible_tile_size.y;
2573  /* TODO(sergey): This will actually over-allocate if
2574  * a particular kernel does not support multiclosure.
2576 size_t ShaderClosure_size = get_shader_closure_size(current_max_closure);
2578 #ifdef __WORK_STEALING__
2579 /* Calculate max groups */
2580 size_t max_global_size[2];
2581 size_t tile_x = max_render_feasible_tile_size.x;
2582 size_t tile_y = max_render_feasible_tile_size.y;
2583 max_global_size[0] = (((tile_x - 1) / local_size[0]) + 1) * local_size[0];
2584 max_global_size[1] = (((tile_y - 1) / local_size[1]) + 1) * local_size[1];
2585 max_work_groups = (max_global_size[0] * max_global_size[1]) /
2586 (local_size[0] * local_size[1]);
2587 /* Allocate work_pool_wgs memory. */
2588 work_pool_wgs = mem_alloc(max_work_groups * sizeof(unsigned int));
2589 #endif /* __WORK_STEALING__ */
2591 /* Allocate queue_index memory only once. */
2592 Queue_index = mem_alloc(NUM_QUEUES * sizeof(int));
2593 use_queues_flag = mem_alloc(sizeof(char));
2594 kgbuffer = mem_alloc(get_KernelGlobals_size());
2596 /* Create global buffers for ShaderData. */
2597 sd = mem_alloc(get_shaderdata_soa_size());
2598 sd_DL_shadow = mem_alloc(get_shaderdata_soa_size());
2599 P_sd = mem_alloc(num_global_elements * sizeof(float3));
2600 P_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
2601 N_sd = mem_alloc(num_global_elements * sizeof(float3));
2602 N_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
2603 Ng_sd = mem_alloc(num_global_elements * sizeof(float3));
2604 Ng_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
2605 I_sd = mem_alloc(num_global_elements * sizeof(float3));
2606 I_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
2607 shader_sd = mem_alloc(num_global_elements * sizeof(int));
2608 shader_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
2609 flag_sd = mem_alloc(num_global_elements * sizeof(int));
2610 flag_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
2611 prim_sd = mem_alloc(num_global_elements * sizeof(int));
2612 prim_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
2613 type_sd = mem_alloc(num_global_elements * sizeof(int));
2614 type_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
2615 u_sd = mem_alloc(num_global_elements * sizeof(float));
2616 u_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float));
2617 v_sd = mem_alloc(num_global_elements * sizeof(float));
2618 v_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float));
2619 object_sd = mem_alloc(num_global_elements * sizeof(int));
2620 object_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
2621 time_sd = mem_alloc(num_global_elements * sizeof(float));
2622 time_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float));
2623 ray_length_sd = mem_alloc(num_global_elements * sizeof(float));
2624 ray_length_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float));
2626 /* Ray differentials. */
2627 dP_sd = mem_alloc(num_global_elements * sizeof(differential3));
2628 dP_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential3));
2629 dI_sd = mem_alloc(num_global_elements * sizeof(differential3));
2630 dI_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential3));
2631 du_sd = mem_alloc(num_global_elements * sizeof(differential));
2632 du_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential));
2633 dv_sd = mem_alloc(num_global_elements * sizeof(differential));
2634 dv_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential));
2637 dPdu_sd = mem_alloc(num_global_elements * sizeof(float3));
2638 dPdu_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
2639 dPdv_sd = mem_alloc(num_global_elements * sizeof(float3));
2640 dPdv_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
2642 /* Object motion. */
2643 ob_tfm_sd = mem_alloc(num_global_elements * sizeof(Transform));
2644 ob_tfm_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(Transform));
2645 ob_itfm_sd = mem_alloc(num_global_elements * sizeof(Transform));
2646 ob_itfm_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(Transform));
2648 closure_sd = mem_alloc(num_global_elements * ShaderClosure_size);
2649 closure_sd_DL_shadow = mem_alloc(num_global_elements * 2 * ShaderClosure_size);
2650 num_closure_sd = mem_alloc(num_global_elements * sizeof(int));
2651 num_closure_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
2652 randb_closure_sd = mem_alloc(num_global_elements * sizeof(float));
2653 randb_closure_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float));
2654 ray_P_sd = mem_alloc(num_global_elements * sizeof(float3));
2655 ray_P_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
2656 ray_dP_sd = mem_alloc(num_global_elements * sizeof(differential3));
2657 ray_dP_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential3));
2659 /* Creation of global memory buffers which are shared among
2662 rng_coop = mem_alloc(num_global_elements * sizeof(RNG));
2663 throughput_coop = mem_alloc(num_global_elements * sizeof(float3));
2664 L_transparent_coop = mem_alloc(num_global_elements * sizeof(float));
2665 PathRadiance_coop = mem_alloc(num_global_elements * sizeof(PathRadiance));
2666 Ray_coop = mem_alloc(num_global_elements * sizeof(Ray));
2667 PathState_coop = mem_alloc(num_global_elements * sizeof(PathState));
2668 Intersection_coop = mem_alloc(num_global_elements * sizeof(Intersection));
2669 AOAlpha_coop = mem_alloc(num_global_elements * sizeof(float3));
2670 AOBSDF_coop = mem_alloc(num_global_elements * sizeof(float3));
2671 AOLightRay_coop = mem_alloc(num_global_elements * sizeof(Ray));
2672 BSDFEval_coop = mem_alloc(num_global_elements * sizeof(BsdfEval));
2673 ISLamp_coop = mem_alloc(num_global_elements * sizeof(int));
2674 LightRay_coop = mem_alloc(num_global_elements * sizeof(Ray));
2675 Intersection_coop_AO = mem_alloc(num_global_elements * sizeof(Intersection));
2676 Intersection_coop_DL = mem_alloc(num_global_elements * sizeof(Intersection));
2678 #ifdef WITH_CYCLES_DEBUG
2679 debugdata_coop = mem_alloc(num_global_elements * sizeof(DebugData));
2682 ray_state = mem_alloc(num_global_elements * sizeof(char));
2684 hostRayStateArray = (char *)calloc(num_global_elements, sizeof(char));
2685 assert(hostRayStateArray != NULL && "Can't create hostRayStateArray memory");
2687 Queue_data = mem_alloc(num_global_elements * (NUM_QUEUES * sizeof(int)+sizeof(int)));
2688 work_array = mem_alloc(num_global_elements * sizeof(unsigned int));
2689 per_sample_output_buffers = mem_alloc(num_global_elements *
2690 per_thread_output_buffer_size);
2693 cl_int dQueue_size = global_size[0] * global_size[1];
2694 cl_int total_num_rays = global_size[0] * global_size[1];
2696 cl_uint start_arg_index =
2697 kernel_set_args(ckPathTraceKernel_data_init,
2711 shader_sd_DL_shadow,
2723 object_sd_DL_shadow,
2727 ray_length_sd_DL_shadow);
2729 /* Ray differentials. */
2731 kernel_set_args(ckPathTraceKernel_data_init,
2744 kernel_set_args(ckPathTraceKernel_data_init,
2751 /* Object motion. */
2753 kernel_set_args(ckPathTraceKernel_data_init,
2756 ob_tfm_sd_DL_shadow,
2758 ob_itfm_sd_DL_shadow);
2761 kernel_set_args(ckPathTraceKernel_data_init,
2764 closure_sd_DL_shadow,
2766 num_closure_sd_DL_shadow,
2768 randb_closure_sd_DL_shadow,
2772 ray_dP_sd_DL_shadow,
2774 per_sample_output_buffers,
2784 /* TODO(sergey): Avoid map lookup here. */
2785 #define KERNEL_TEX(type, ttype, name) \
2786 set_kernel_arg_mem(ckPathTraceKernel_data_init, &start_arg_index, #name);
2787 #include "kernel_textures.h"
2791 kernel_set_args(ckPathTraceKernel_data_init,
2800 rtile.rng_state_offset_x,
2801 rtile.rng_state_offset_y,
2802 rtile.buffer_rng_state_stride,
2808 #ifdef __WORK_STEALING__
2812 #ifdef WITH_CYCLES_DEBUG
2815 num_parallel_samples);
2817 kernel_set_args(ckPathTraceKernel_scene_intersect,
2832 #ifdef WITH_CYCLES_DEBUG
2835 num_parallel_samples);
2837 kernel_set_args(ckPathTraceKernel_lamp_emission,
2854 num_parallel_samples);
2856 kernel_set_args(ckPathTraceKernel_queue_enqueue,
2863 kernel_set_args(ckPathTraceKernel_background_buffer_update,
2868 per_sample_output_buffers,
2882 rtile.rng_state_offset_x,
2883 rtile.rng_state_offset_y,
2884 rtile.buffer_rng_state_stride,
2891 #ifdef __WORK_STEALING__
2895 #ifdef WITH_CYCLES_DEBUG
2898 num_parallel_samples);
2900 kernel_set_args(ckPathTraceKernel_shader_eval,
2914 kernel_set_args(ckPathTraceKernel_holdout_emission_blurring_pathtermination_ao,
2919 per_sample_output_buffers,