Cycles: OpenCL kernel split
authorGeorge Kyriazis <George.Kyriazis@amd.com>
Sat, 9 May 2015 14:34:30 +0000 (19:34 +0500)
committerSergey Sharybin <sergey.vfx@gmail.com>
Sat, 9 May 2015 14:52:40 +0000 (19:52 +0500)
This commit contains all the work related on the AMD megakernel split work
which was mainly done by Varun Sundar, George Kyriazis and Lenny Wang, plus
some help from Sergey Sharybin, Martijn Berger, Thomas Dinges and likely
someone else which we're forgetting to mention.

Currently only AMD cards are enabled for the new split kernel, but it is
possible to force split opencl kernel to be used by setting the following
environment variable: CYCLES_OPENCL_SPLIT_KERNEL_TEST=1.

Not all the features are supported yet, and that being said no motion blur,
camera blur, SSS and volumetrics for now. Also transparent shadows are
disabled on AMD device because of some compiler bug.

This kernel is also only implements regular path tracing and supporting
branched one will take a bit. Branched path tracing is exposed to the
interface still, which is a bit misleading and will be hidden there soon.

More feature will be enabled once they're ported to the split kernel and
tested.

Neither regular CPU nor CUDA has any difference, they're generating the
same exact code, which means no regressions/improvements there.

Based on the research paper:

  https://research.nvidia.com/sites/default/files/publications/laine2013hpg_paper.pdf

Here's the documentation:

  https://docs.google.com/document/d/1LuXW-CV-sVJkQaEGZlMJ86jZ8FmoPfecaMdR-oiWbUY/edit

Design discussion of the patch:

  https://developer.blender.org/T44197

Differential Revision: https://developer.blender.org/D1200

57 files changed:
intern/cycles/device/device.h
intern/cycles/device/device_opencl.cpp
intern/cycles/kernel/CMakeLists.txt
intern/cycles/kernel/closure/bsdf.h
intern/cycles/kernel/geom/geom_attribute.h
intern/cycles/kernel/geom/geom_bvh.h
intern/cycles/kernel/geom/geom_motion_triangle.h
intern/cycles/kernel/geom/geom_object.h
intern/cycles/kernel/geom/geom_primitive.h
intern/cycles/kernel/geom/geom_triangle.h
intern/cycles/kernel/kernel.cl
intern/cycles/kernel/kernel_accumulate.h
intern/cycles/kernel/kernel_background_buffer_update.cl [new file with mode: 0644]
intern/cycles/kernel/kernel_camera.h
intern/cycles/kernel/kernel_compat_cpu.h
intern/cycles/kernel/kernel_compat_cuda.h
intern/cycles/kernel/kernel_compat_opencl.h
intern/cycles/kernel/kernel_data_init.cl [new file with mode: 0644]
intern/cycles/kernel/kernel_debug.h
intern/cycles/kernel/kernel_differential.h
intern/cycles/kernel/kernel_direct_lighting.cl [new file with mode: 0644]
intern/cycles/kernel/kernel_emission.h
intern/cycles/kernel/kernel_globals.h
intern/cycles/kernel/kernel_holdout_emission_blurring_pathtermination_ao.cl [new file with mode: 0644]
intern/cycles/kernel/kernel_lamp_emission.cl [new file with mode: 0644]
intern/cycles/kernel/kernel_next_iteration_setup.cl [new file with mode: 0644]
intern/cycles/kernel/kernel_passes.h
intern/cycles/kernel/kernel_path.h
intern/cycles/kernel/kernel_path_common.h [new file with mode: 0644]
intern/cycles/kernel/kernel_path_state.h
intern/cycles/kernel/kernel_path_surface.h
intern/cycles/kernel/kernel_queue_enqueue.cl [new file with mode: 0644]
intern/cycles/kernel/kernel_queues.h [new file with mode: 0644]
intern/cycles/kernel/kernel_random.h
intern/cycles/kernel/kernel_scene_intersect.cl [new file with mode: 0644]
intern/cycles/kernel/kernel_shader.h
intern/cycles/kernel/kernel_shader_eval.cl [new file with mode: 0644]
intern/cycles/kernel/kernel_shaderdata_vars.h [new file with mode: 0644]
intern/cycles/kernel/kernel_shadow.h
intern/cycles/kernel/kernel_shadow_blocked.cl [new file with mode: 0644]
intern/cycles/kernel/kernel_split.h [new file with mode: 0644]
intern/cycles/kernel/kernel_sum_all_radiance.cl [new file with mode: 0644]
intern/cycles/kernel/kernel_types.h
intern/cycles/kernel/kernel_work_stealing.h [new file with mode: 0644]
intern/cycles/kernel/svm/svm.h
intern/cycles/kernel/svm/svm_attribute.h
intern/cycles/kernel/svm/svm_camera.h
intern/cycles/kernel/svm/svm_closure.h
intern/cycles/kernel/svm/svm_displace.h
intern/cycles/kernel/svm/svm_fresnel.h
intern/cycles/kernel/svm/svm_geometry.h
intern/cycles/kernel/svm/svm_image.h
intern/cycles/kernel/svm/svm_light_path.h
intern/cycles/kernel/svm/svm_tex_coord.h
intern/cycles/kernel/svm/svm_vector_transform.h
intern/cycles/kernel/svm/svm_wireframe.h
intern/cycles/render/session.cpp

index 4d40518644e91005d73523f8d8072694faa4cfa3..162f51252b02c1cc1951284627ec64ed09710ea5 100644 (file)
@@ -55,6 +55,7 @@ public:
        bool advanced_shading;
        bool pack_images;
        bool extended_images; /* flag for GPU and Multi device */
+       bool use_split_kernel; /* Denotes if the device is going to run cycles using split-kernel */
        vector<DeviceInfo> multi_devices;
 
        DeviceInfo()
@@ -66,6 +67,7 @@ public:
                advanced_shading = true;
                pack_images = false;
                extended_images = false;
+               use_split_kernel = false;
        }
 };
 
index 1147cbd69b43dd254b7e139c6a20f4b5724d8c2c..25eb160d71bb46232773d43a1de0bc0d4c1b913e 100644 (file)
 CCL_NAMESPACE_BEGIN
 
 #define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p))
+#define KERNEL_APPEND_ARG(kernel_name, arg) \
+               opencl_assert(clSetKernelArg(kernel_name, narg++, sizeof(arg), (void*)&arg))
+
+/* Macro declarations used with split kernel */
+
+/* Macro to enable/disable work-stealing */
+#define __WORK_STEALING__
+
+#define SPLIT_KERNEL_LOCAL_SIZE_X 64
+#define SPLIT_KERNEL_LOCAL_SIZE_Y 1
+
+/* This value may be tuned according to the scene we are rendering.
+ *
+ * Modifying PATH_ITER_INC_FACTOR value proportional to number of expected
+ * ray-bounces will improve performance.
+ */
+#define PATH_ITER_INC_FACTOR 8
+
+/* When allocate global memory in chunks. We may not be able to
+ * allocate exactly "CL_DEVICE_MAX_MEM_ALLOC_SIZE" bytes in chunks;
+ * Since some bytes may be needed for aligning chunks of memory;
+ * This is the amount of memory that we dedicate for that purpose.
+ */
+#define DATA_ALLOCATION_MEM_FACTOR 5000000 //5MB
 
 static cl_device_type opencl_device_type()
 {
@@ -94,11 +118,11 @@ static string opencl_kernel_build_options(const string& platform, const string *
                build_options += "-D__KERNEL_OPENCL_AMD__ ";
 
        else if(platform == "Intel(R) OpenCL") {
-               build_options += "-D__KERNEL_OPENCL_INTEL_CPU__";
+               build_options += "-D__KERNEL_OPENCL_INTEL_CPU__ ";
 
                /* options for gdb source level kernel debugging. this segfaults on linux currently */
                if(opencl_kernel_use_debug() && debug_src)
-                       build_options += "-g -s \"" + *debug_src + "\"";
+                       build_options += "-g -s \"" + *debug_src + "\" ";
        }
 
        if(opencl_kernel_use_debug())
@@ -118,14 +142,18 @@ class OpenCLCache
        {
                thread_mutex *mutex;
                cl_context context;
-               cl_program program;
+               /* cl_program for shader, bake, film_convert kernels (used in OpenCLDeviceBase) */
+               cl_program ocl_dev_base_program;
+               /* cl_program for megakernel (used in OpenCLDeviceMegaKernel) */
+               cl_program ocl_dev_megakernel_program;
 
-               Slot() : mutex(NULL), context(NULL), program(NULL) {}
+               Slot() : mutex(NULL), context(NULL), ocl_dev_base_program(NULL), ocl_dev_megakernel_program(NULL) {}
 
                Slot(const Slot &rhs)
                        : mutex(rhs.mutex)
                        , context(rhs.context)
-                       , program(rhs.program)
+                       , ocl_dev_base_program(rhs.ocl_dev_base_program)
+                       , ocl_dev_megakernel_program(rhs.ocl_dev_megakernel_program)
                {
                        /* copy can only happen in map insert, assert that */
                        assert(mutex == NULL);
@@ -236,6 +264,12 @@ class OpenCLCache
        }
 
 public:
+
+       enum ProgramName {
+               OCL_DEV_BASE_PROGRAM,
+               OCL_DEV_MEGAKERNEL_PROGRAM,
+       };
+
        /* see get_something comment */
        static cl_context get_context(cl_platform_id platform, cl_device_id device,
                thread_scoped_lock &slot_locker)
@@ -254,10 +288,21 @@ public:
        }
 
        /* see get_something comment */
-       static cl_program get_program(cl_platform_id platform, cl_device_id device,
+       static cl_program get_program(cl_platform_id platform, cl_device_id device, ProgramName program_name,
                thread_scoped_lock &slot_locker)
        {
-               cl_program program = get_something<cl_program>(platform, device, &Slot::program, slot_locker);
+               cl_program program = NULL;
+
+               if(program_name == OCL_DEV_BASE_PROGRAM) {
+                       /* Get program related to OpenCLDeviceBase */
+                       program = get_something<cl_program>(platform, device, &Slot::ocl_dev_base_program, slot_locker);
+               }
+               else if(program_name == OCL_DEV_MEGAKERNEL_PROGRAM) {
+                       /* Get program related to megakernel */
+                       program = get_something<cl_program>(platform, device, &Slot::ocl_dev_megakernel_program, slot_locker);
+               } else {
+                       assert(!"Invalid program name");
+               }
 
                if(!program)
                        return NULL;
@@ -284,10 +329,18 @@ public:
        }
 
        /* see store_something comment */
-       static void store_program(cl_platform_id platform, cl_device_id device, cl_program program,
+       static void store_program(cl_platform_id platform, cl_device_id device, cl_program program, ProgramName program_name,
                thread_scoped_lock &slot_locker)
        {
-               store_something<cl_program>(platform, device, program, &Slot::program, slot_locker);
+               if(program_name == OCL_DEV_BASE_PROGRAM) {
+                       store_something<cl_program>(platform, device, program, &Slot::ocl_dev_base_program, slot_locker);
+               }
+               else if(program_name == OCL_DEV_MEGAKERNEL_PROGRAM) {
+                       store_something<cl_program>(platform, device, program, &Slot::ocl_dev_megakernel_program, slot_locker);
+               } else {
+                       assert(!"Invalid program name\n");
+                       return;
+               }
 
                /* increment reference count in OpenCL.
                 * The caller is going to release the object when done with it. */
@@ -304,8 +357,10 @@ public:
                thread_scoped_lock cache_lock(self.cache_lock);
 
                foreach(CacheMap::value_type &item, self.cache) {
-                       if(item.second.program != NULL)
-                               clReleaseProgram(item.second.program);
+                       if(item.second.ocl_dev_base_program != NULL)
+                               clReleaseProgram(item.second.ocl_dev_base_program);
+                       if(item.second.ocl_dev_megakernel_program != NULL)
+                               clReleaseProgram(item.second.ocl_dev_megakernel_program);
                        if(item.second.context != NULL)
                                clReleaseContext(item.second.context);
                }
@@ -314,7 +369,7 @@ public:
        }
 };
 
-class OpenCLDevice : public Device
+class OpenCLDeviceBase : public Device
 {
 public:
        DedicatedTaskPool task_pool;
@@ -323,7 +378,6 @@ public:
        cl_platform_id cpPlatform;
        cl_device_id cdDevice;
        cl_program cpProgram;
-       cl_kernel ckPathTraceKernel;
        cl_kernel ckFilmConvertByteKernel;
        cl_kernel ckFilmConvertHalfFloatKernel;
        cl_kernel ckShaderKernel;
@@ -385,7 +439,7 @@ public:
                }
        }
 
-       OpenCLDevice(DeviceInfo& info, Stats &stats, bool background_)
+       OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool background_)
        : Device(info, stats, background_)
        {
                cpPlatform = NULL;
@@ -393,7 +447,6 @@ public:
                cxContext = NULL;
                cqCommandQueue = NULL;
                cpProgram = NULL;
-               ckPathTraceKernel = NULL;
                ckFilmConvertByteKernel = NULL;
                ckFilmConvertHalfFloatKernel = NULL;
                ckShaderKernel = NULL;
@@ -501,7 +554,7 @@ public:
                if(opencl_error(ciErr))
                        return;
 
-               fprintf(stderr,"Device init succes\n");
+               fprintf(stderr, "Device init success\n");
                device_initialized = true;
        }
 
@@ -547,7 +600,11 @@ public:
                return true;
        }
 
-       bool load_binary(const string& kernel_path, const string& clbin, const string *debug_src = NULL)
+       bool load_binary(const string& /*kernel_path*/,
+                        const string& clbin,
+                        string custom_kernel_build_options,
+                        cl_program *program,
+                        const string *debug_src = NULL)
        {
                /* read binary into memory */
                vector<uint8_t> binary;
@@ -562,7 +619,7 @@ public:
                size_t size = binary.size();
                const uint8_t *bytes = &binary[0];
 
-               cpProgram = clCreateProgramWithBinary(cxContext, 1, &cdDevice,
+               *program = clCreateProgramWithBinary(cxContext, 1, &cdDevice,
                        &size, &bytes, &status, &ciErr);
 
                if(opencl_error(status) || opencl_error(ciErr)) {
@@ -570,16 +627,16 @@ public:
                        return false;
                }
 
-               if(!build_kernel(kernel_path, debug_src))
+               if(!build_kernel(program, custom_kernel_build_options, debug_src))
                        return false;
 
                return true;
        }
 
-       bool save_binary(const string& clbin)
+       bool save_binary(cl_program *program, const string& clbin)
        {
                size_t size = 0;
-               clGetProgramInfo(cpProgram, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL);
+               clGetProgramInfo(*program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL);
 
                if(!size)
                        return false;
@@ -587,7 +644,7 @@ public:
                vector<uint8_t> binary(size);
                uint8_t *bytes = &binary[0];
 
-               clGetProgramInfo(cpProgram, CL_PROGRAM_BINARIES, sizeof(uint8_t*), &bytes, NULL);
+               clGetProgramInfo(*program, CL_PROGRAM_BINARIES, sizeof(uint8_t*), &bytes, NULL);
 
                if(!path_write_binary(clbin, binary)) {
                        opencl_error(string_printf("OpenCL failed to write cached binary %s.", clbin.c_str()));
@@ -597,20 +654,23 @@ public:
                return true;
        }
 
-       bool build_kernel(const string& /*kernel_path*/, const string *debug_src = NULL)
+       bool build_kernel(cl_program *kernel_program,
+                         string custom_kernel_build_options,
+                         const string *debug_src = NULL)
        {
-               string build_options = opencl_kernel_build_options(platform_name, debug_src);
-       
-               ciErr = clBuildProgram(cpProgram, 0, NULL, build_options.c_str(), NULL, NULL);
+               string build_options;
+               build_options = opencl_kernel_build_options(platform_name, debug_src) + custom_kernel_build_options;
+
+               ciErr = clBuildProgram(*kernel_program, 0, NULL, build_options.c_str(), NULL, NULL);
 
                /* show warnings even if build is successful */
                size_t ret_val_size = 0;
 
-               clGetProgramBuildInfo(cpProgram, cdDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
+               clGetProgramBuildInfo(*kernel_program, cdDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
 
                if(ret_val_size > 1) {
-                       vector<char> build_log(ret_val_size+1);
-                       clGetProgramBuildInfo(cpProgram, cdDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, &build_log[0], NULL);
+                       vector<char> build_log(ret_val_size + 1);
+                       clGetProgramBuildInfo(*kernel_program, cdDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, &build_log[0], NULL);
 
                        build_log[ret_val_size] = '\0';
                        fprintf(stderr, "OpenCL kernel build output:\n");
@@ -625,12 +685,15 @@ public:
                return true;
        }
 
-       bool compile_kernel(const string& kernel_path, const string& kernel_md5, const string *debug_src = NULL)
+       bool compile_kernel(const string& kernel_path,
+                           string source,
+                           string custom_kernel_build_options,
+                           cl_program *kernel_program,
+                           const string *debug_src = NULL)
        {
                /* we compile kernels consisting of many files. unfortunately opencl
                 * kernel caches do not seem to recognize changes in included files.
                 * so we force recompile on changes by adding the md5 hash of all files */
-               string source = "#include \"kernel.cl\" // " + kernel_md5 + "\n";
                source = path_source_replace_includes(source, kernel_path);
 
                if(debug_src)
@@ -639,7 +702,7 @@ public:
                size_t source_len = source.size();
                const char *source_str = source.c_str();
 
-               cpProgram = clCreateProgramWithSource(cxContext, 1, &source_str, &source_len, &ciErr);
+               *kernel_program = clCreateProgramWithSource(cxContext, 1, &source_str, &source_len, &ciErr);
 
                if(opencl_error(ciErr))
                        return false;
@@ -647,7 +710,7 @@ public:
                double starttime = time_dt();
                printf("Compiling OpenCL kernel ...\n");
 
-               if(!build_kernel(kernel_path, debug_src))
+               if(!build_kernel(kernel_program, custom_kernel_build_options, debug_src))
                        return false;
 
                printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);
@@ -655,7 +718,7 @@ public:
                return true;
        }
 
-       string device_md5_hash()
+       string device_md5_hash(string kernel_custom_build_options = "")
        {
                MD5Hash md5;
                char version[256], driver[256], name[256], vendor[256];
@@ -671,12 +734,13 @@ public:
                md5.append((uint8_t*)driver, strlen(driver));
 
                string options = opencl_kernel_build_options(platform_name);
+               options += kernel_custom_build_options;
                md5.append((uint8_t*)options.c_str(), options.size());
 
                return md5.get_hex();
        }
 
-       bool load_kernels(bool /*experimental*/)
+       bool load_kernels(const DeviceRequestedFeatures& /*requested_features*/)
        {
                /* verify if device was initialized */
                if(!device_initialized) {
@@ -686,7 +750,7 @@ public:
 
                /* try to use cached kernel */
                thread_scoped_lock cache_locker;
-               cpProgram = OpenCLCache::get_program(cpPlatform, cdDevice, cache_locker);
+               cpProgram = OpenCLCache::get_program(cpPlatform, cdDevice, OpenCLCache::OCL_DEV_BASE_PROGRAM, cache_locker);
 
                if(!cpProgram) {
                        /* verify we have right opencl version */
@@ -712,28 +776,27 @@ public:
                        }
 
                        /* if exists already, try use it */
-                       if(path_exists(clbin) && load_binary(kernel_path, clbin, debug_src)) {
+                       if(path_exists(clbin) && load_binary(kernel_path, clbin, "", &cpProgram)) {
                                /* kernel loaded from binary */
                        }
                        else {
+
+                               string init_kernel_source = "#include \"kernel.cl\" // " + kernel_md5 + "\n";
+
                                /* if does not exist or loading binary failed, compile kernel */
-                               if(!compile_kernel(kernel_path, kernel_md5, debug_src))
+                               if(!compile_kernel(kernel_path, init_kernel_source, "", &cpProgram, debug_src))
                                        return false;
 
                                /* save binary for reuse */
-                               if(!save_binary(clbin))
+                               if(!save_binary(&cpProgram, clbin))
                                        return false;
                        }
 
                        /* cache the program */
-                       OpenCLCache::store_program(cpPlatform, cdDevice, cpProgram, cache_locker);
+                       OpenCLCache::store_program(cpPlatform, cdDevice, cpProgram, OpenCLCache::OCL_DEV_BASE_PROGRAM, cache_locker);
                }
 
                /* find kernels */
-               ckPathTraceKernel = clCreateKernel(cpProgram, "kernel_ocl_path_trace", &ciErr);
-               if(opencl_error(ciErr))
-                       return false;
-
                ckFilmConvertByteKernel = clCreateKernel(cpProgram, "kernel_ocl_convert_to_byte", &ciErr);
                if(opencl_error(ciErr))
                        return false;
@@ -753,7 +816,7 @@ public:
                return true;
        }
 
-       ~OpenCLDevice()
+       ~OpenCLDeviceBase()
        {
                task_pool.stop();
 
@@ -766,12 +829,14 @@ public:
                        delete mt->second;
                }
 
-               if(ckPathTraceKernel)
-                       clReleaseKernel(ckPathTraceKernel);  
                if(ckFilmConvertByteKernel)
                        clReleaseKernel(ckFilmConvertByteKernel);  
                if(ckFilmConvertHalfFloatKernel)
                        clReleaseKernel(ckFilmConvertHalfFloatKernel);  
+               if(ckShaderKernel)
+                       clReleaseKernel(ckShaderKernel);
+               if(ckBakeKernel)
+                       clReleaseKernel(ckBakeKernel);
                if(cpProgram)
                        clReleaseProgram(cpProgram);
                if(cqCommandQueue)
@@ -913,42 +978,6 @@ public:
                opencl_assert(clFlush(cqCommandQueue));
        }
 
-       void path_trace(RenderTile& rtile, int sample)
-       {
-               /* cast arguments to cl types */
-               cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
-               cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
-               cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state);
-               cl_int d_x = rtile.x;
-               cl_int d_y = rtile.y;
-               cl_int d_w = rtile.w;
-               cl_int d_h = rtile.h;
-               cl_int d_sample = sample;
-               cl_int d_offset = rtile.offset;
-               cl_int d_stride = rtile.stride;
-
-               /* sample arguments */
-               cl_uint narg = 0;
-
-               opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_data), (void*)&d_data));
-               opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_buffer), (void*)&d_buffer));
-               opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_rng_state), (void*)&d_rng_state));
-
-#define KERNEL_TEX(type, ttype, name) \
-       set_kernel_arg_mem(ckPathTraceKernel, &narg, #name);
-#include "kernel_textures.h"
-
-               opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_sample), (void*)&d_sample));
-               opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_x), (void*)&d_x));
-               opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_y), (void*)&d_y));
-               opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_w), (void*)&d_w));
-               opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_h), (void*)&d_h));
-               opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_offset), (void*)&d_offset));
-               opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_stride), (void*)&d_stride));
-
-               enqueue_kernel(ckPathTraceKernel, d_w, d_h);
-       }
-
        void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name)
        {
                cl_mem ptr;
@@ -985,23 +1014,23 @@ public:
 
                cl_kernel ckFilmConvertKernel = (rgba_byte)? ckFilmConvertByteKernel: ckFilmConvertHalfFloatKernel;
 
-               opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_data), (void*)&d_data));
-               opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_rgba), (void*)&d_rgba));
-               opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_buffer), (void*)&d_buffer));
+               /* TODO : Make the kernel launch similar to Cuda */
+               KERNEL_APPEND_ARG(ckFilmConvertKernel, d_data);
+               KERNEL_APPEND_ARG(ckFilmConvertKernel, d_rgba);
+               KERNEL_APPEND_ARG(ckFilmConvertKernel, d_buffer);
 
 #define KERNEL_TEX(type, ttype, name) \
        set_kernel_arg_mem(ckFilmConvertKernel, &narg, #name);
 #include "kernel_textures.h"
+#undef KERNEL_TEX
 
-               opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_sample_scale), (void*)&d_sample_scale));
-               opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_x), (void*)&d_x));
-               opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_y), (void*)&d_y));
-               opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_w), (void*)&d_w));
-               opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_h), (void*)&d_h));
-               opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_offset), (void*)&d_offset));
-               opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_stride), (void*)&d_stride));
-
-
+               KERNEL_APPEND_ARG(ckFilmConvertKernel, d_sample_scale);
+               KERNEL_APPEND_ARG(ckFilmConvertKernel, d_x);
+               KERNEL_APPEND_ARG(ckFilmConvertKernel, d_y);
+               KERNEL_APPEND_ARG(ckFilmConvertKernel, d_w);
+               KERNEL_APPEND_ARG(ckFilmConvertKernel, d_h);
+               KERNEL_APPEND_ARG(ckFilmConvertKernel, d_offset);
+               KERNEL_APPEND_ARG(ckFilmConvertKernel, d_stride);
 
                enqueue_kernel(ckFilmConvertKernel, d_w, d_h);
        }
@@ -1034,19 +1063,21 @@ public:
 
                        cl_int d_sample = sample;
 
-                       opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_data), (void*)&d_data));
-                       opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_input), (void*)&d_input));
-                       opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_output), (void*)&d_output));
+                       /* TODO : Make the kernel launch similar to Cuda */
+                       KERNEL_APPEND_ARG(kernel, d_data);
+                       KERNEL_APPEND_ARG(kernel, d_input);
+                       KERNEL_APPEND_ARG(kernel, d_output);
 
 #define KERNEL_TEX(type, ttype, name) \
                set_kernel_arg_mem(kernel, &narg, #name);
 #include "kernel_textures.h"
+#undef KERNEL_TEX
 
-                       opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_eval_type), (void*)&d_shader_eval_type));
-                       opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_x), (void*)&d_shader_x));
-                       opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_w), (void*)&d_shader_w));
-                       opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_offset), (void*)&d_offset));
-                       opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_sample), (void*)&d_sample));
+                       KERNEL_APPEND_ARG(kernel, d_shader_eval_type);
+                       KERNEL_APPEND_ARG(kernel, d_shader_x);
+                       KERNEL_APPEND_ARG(kernel, d_shader_w);
+                       KERNEL_APPEND_ARG(kernel, d_offset);
+                       KERNEL_APPEND_ARG(kernel, d_sample);
 
                        enqueue_kernel(kernel, task.shader_w, 1);
 
@@ -1054,6 +1085,305 @@ public:
                }
        }
 
+       class OpenCLDeviceTask : public DeviceTask {
+       public:
+               OpenCLDeviceTask(OpenCLDeviceBase *device, DeviceTask& task)
+               : DeviceTask(task)
+               {
+                       run = function_bind(&OpenCLDeviceBase::thread_run,
+                                           device,
+                                           this);
+               }
+       };
+
+       int get_split_task_count(DeviceTask& /*task*/)
+       {
+               return 1;
+       }
+
+       void task_add(DeviceTask& task)
+       {
+               task_pool.push(new OpenCLDeviceTask(this, task));
+       }
+
+       void task_wait()
+       {
+               task_pool.wait();
+       }
+
+       void task_cancel()
+       {
+               task_pool.cancel();
+       }
+
+       virtual void thread_run(DeviceTask * /*task*/) = 0;
+
+protected:
+       class ArgumentWrapper {
+       public:
+               ArgumentWrapper() : size(0), pointer(NULL) {}
+               template <typename T>
+               ArgumentWrapper(T& argument) : size(sizeof(argument)),
+                                              pointer(&argument) { }
+               size_t size;
+               void *pointer;
+       };
+
+       /* TODO(sergey): In the future we can use variadic templates, once
+        * C++0x is allowed. Should allow to clean this up a bit.
+        */
+       int kernel_set_args(cl_kernel kernel,
+                           int start_argument_index,
+                           const ArgumentWrapper& arg1 = ArgumentWrapper(),
+                           const ArgumentWrapper& arg2 = ArgumentWrapper(),
+                           const ArgumentWrapper& arg3 = ArgumentWrapper(),
+                           const ArgumentWrapper& arg4 = ArgumentWrapper(),
+                           const ArgumentWrapper& arg5 = ArgumentWrapper(),
+                           const ArgumentWrapper& arg6 = ArgumentWrapper(),
+                           const ArgumentWrapper& arg7 = ArgumentWrapper(),
+                           const ArgumentWrapper& arg8 = ArgumentWrapper(),
+                           const ArgumentWrapper& arg9 = ArgumentWrapper(),
+                           const ArgumentWrapper& arg10 = ArgumentWrapper(),
+                           const ArgumentWrapper& arg11 = ArgumentWrapper(),
+                           const ArgumentWrapper& arg12 = ArgumentWrapper(),
+                           const ArgumentWrapper& arg13 = ArgumentWrapper(),
+                           const ArgumentWrapper& arg14 = ArgumentWrapper(),
+                           const ArgumentWrapper& arg15 = ArgumentWrapper(),
+                           const ArgumentWrapper& arg16 = ArgumentWrapper(),
+                           const ArgumentWrapper& arg17 = ArgumentWrapper(),
+                           const ArgumentWrapper& arg18 = ArgumentWrapper(),
+                           const ArgumentWrapper& arg19 = ArgumentWrapper(),
+                           const ArgumentWrapper& arg20 = ArgumentWrapper(),
+                           const ArgumentWrapper& arg21 = ArgumentWrapper(),
+                           const ArgumentWrapper& arg22 = ArgumentWrapper(),
+                           const ArgumentWrapper& arg23 = ArgumentWrapper(),
+                           const ArgumentWrapper& arg24 = ArgumentWrapper(),
+                           const ArgumentWrapper& arg25 = ArgumentWrapper(),
+                           const ArgumentWrapper& arg26 = ArgumentWrapper(),
+                           const ArgumentWrapper& arg27 = ArgumentWrapper(),
+                           const ArgumentWrapper& arg28 = ArgumentWrapper(),
+                           const ArgumentWrapper& arg29 = ArgumentWrapper(),
+                           const ArgumentWrapper& arg30 = ArgumentWrapper(),
+                           const ArgumentWrapper& arg31 = ArgumentWrapper(),
+                           const ArgumentWrapper& arg32 = ArgumentWrapper(),
+                           const ArgumentWrapper& arg33 = ArgumentWrapper())
+       {
+               int current_arg_index = 0;
+#define FAKE_VARARG_HANDLE_ARG(arg) \
+               do { \
+                       if(arg.pointer != NULL) { \
+                               opencl_assert(clSetKernelArg( \
+                                       kernel, \
+                                       start_argument_index + current_arg_index, \
+                                       arg.size, arg.pointer)); \
+                               ++current_arg_index; \
+                       } \
+                       else { \
+                               return current_arg_index; \
+                       } \
+               } while(false)
+               FAKE_VARARG_HANDLE_ARG(arg1);
+               FAKE_VARARG_HANDLE_ARG(arg2);
+               FAKE_VARARG_HANDLE_ARG(arg3);
+               FAKE_VARARG_HANDLE_ARG(arg4);
+               FAKE_VARARG_HANDLE_ARG(arg5);
+               FAKE_VARARG_HANDLE_ARG(arg6);
+               FAKE_VARARG_HANDLE_ARG(arg7);
+               FAKE_VARARG_HANDLE_ARG(arg8);
+               FAKE_VARARG_HANDLE_ARG(arg9);
+               FAKE_VARARG_HANDLE_ARG(arg10);
+               FAKE_VARARG_HANDLE_ARG(arg11);
+               FAKE_VARARG_HANDLE_ARG(arg12);
+               FAKE_VARARG_HANDLE_ARG(arg13);
+               FAKE_VARARG_HANDLE_ARG(arg14);
+               FAKE_VARARG_HANDLE_ARG(arg15);
+               FAKE_VARARG_HANDLE_ARG(arg16);
+               FAKE_VARARG_HANDLE_ARG(arg17);
+               FAKE_VARARG_HANDLE_ARG(arg18);
+               FAKE_VARARG_HANDLE_ARG(arg19);
+               FAKE_VARARG_HANDLE_ARG(arg20);
+               FAKE_VARARG_HANDLE_ARG(arg21);
+               FAKE_VARARG_HANDLE_ARG(arg22);
+               FAKE_VARARG_HANDLE_ARG(arg23);
+               FAKE_VARARG_HANDLE_ARG(arg24);
+               FAKE_VARARG_HANDLE_ARG(arg25);
+               FAKE_VARARG_HANDLE_ARG(arg26);
+               FAKE_VARARG_HANDLE_ARG(arg27);
+               FAKE_VARARG_HANDLE_ARG(arg28);
+               FAKE_VARARG_HANDLE_ARG(arg29);
+               FAKE_VARARG_HANDLE_ARG(arg30);
+               FAKE_VARARG_HANDLE_ARG(arg31);
+               FAKE_VARARG_HANDLE_ARG(arg32);
+               FAKE_VARARG_HANDLE_ARG(arg33);
+#undef FAKE_VARARG_HANDLE_ARG
+               return current_arg_index;
+       }
+
+       inline void release_kernel_safe(cl_kernel kernel)
+       {
+               if(kernel) {
+                       clReleaseKernel(kernel);
+               }
+       }
+
+       inline void release_mem_object_safe(cl_mem mem)
+       {
+               if(mem != NULL) {
+                       clReleaseMemObject(mem);
+               }
+       }
+
+       inline void release_program_safe(cl_program program)
+       {
+               if(program) {
+                       clReleaseProgram(program);
+               }
+       }
+};
+
+class OpenCLDeviceMegaKernel : public OpenCLDeviceBase
+{
+public:
+       cl_kernel ckPathTraceKernel;
+       cl_program path_trace_program;
+
+       OpenCLDeviceMegaKernel(DeviceInfo& info, Stats &stats, bool background_)
+       : OpenCLDeviceBase(info, stats, background_)
+       {
+               ckPathTraceKernel = NULL;
+               path_trace_program = NULL;
+       }
+
+       bool load_kernels(const DeviceRequestedFeatures& requested_features)
+       {
+               /* Get Shader, bake and film convert kernels.
+                * It'll also do verification of OpenCL actually initialized.
+                */
+               if(!OpenCLDeviceBase::load_kernels(requested_features)) {
+                       return false;
+               }
+
+               /* Try to use cached kernel. */
+               thread_scoped_lock cache_locker;
+               path_trace_program = OpenCLCache::get_program(cpPlatform,
+                                                             cdDevice,
+                                                             OpenCLCache::OCL_DEV_MEGAKERNEL_PROGRAM,
+                                                             cache_locker);
+
+               if(!path_trace_program) {
+                       /* Verify we have right opencl version. */
+                       if(!opencl_version_check())
+                               return false;
+
+                       /* Calculate md5 hash to detect changes. */
+                       string kernel_path = path_get("kernel");
+                       string kernel_md5 = path_files_md5_hash(kernel_path);
+                       string custom_kernel_build_options = "-D__COMPILE_ONLY_MEGAKERNEL__ ";
+                       string device_md5 = device_md5_hash(custom_kernel_build_options);
+
+                       /* Path to cached binary. */
+                       string clbin = string_printf("cycles_kernel_%s_%s.clbin",
+                                                    device_md5.c_str(),
+                                                    kernel_md5.c_str());
+                       clbin = path_user_get(path_join("cache", clbin));
+
+                       /* Path to preprocessed source for debugging. */
+                       string clsrc, *debug_src = NULL;
+                       if(opencl_kernel_use_debug()) {
+                               clsrc = string_printf("cycles_kernel_%s_%s.cl",
+                                                     device_md5.c_str(),
+                                                     kernel_md5.c_str());
+                               clsrc = path_user_get(path_join("cache", clsrc));
+                               debug_src = &clsrc;
+                       }
+
+                       /* If exists already, try use it. */
+                       if(path_exists(clbin) && load_binary(kernel_path,
+                                                            clbin,
+                                                            custom_kernel_build_options,
+                                                            &path_trace_program,
+                                                            debug_src)) {
+                               /* Kernel loaded from binary, nothing to do. */
+                       }
+                       else {
+                               string init_kernel_source = "#include \"kernel.cl\" // " +
+                                                           kernel_md5 + "\n";
+                               /* If does not exist or loading binary failed, compile kernel. */
+                               if(!compile_kernel(kernel_path,
+                                                  init_kernel_source,
+                                                  custom_kernel_build_options,
+                                                  &path_trace_program,
+                                                  debug_src))
+                               {
+                                       return false;
+                               }
+                               /* Save binary for reuse. */
+                               if(!save_binary(&path_trace_program, clbin)) {
+                                       return false;
+                               }
+                       }
+                       /* Cache the program. */
+                       OpenCLCache::store_program(cpPlatform,
+                                                  cdDevice,
+                                                  path_trace_program,
+                                                  OpenCLCache::OCL_DEV_MEGAKERNEL_PROGRAM,
+                                                  cache_locker);
+               }
+
+               /* Find kernels. */
+               ckPathTraceKernel = clCreateKernel(path_trace_program,
+                                                  "kernel_ocl_path_trace",
+                                                  &ciErr);
+               if(opencl_error(ciErr))
+                       return false;
+               return true;
+       }
+
+       ~OpenCLDeviceMegaKernel()
+       {
+               task_pool.stop();
+               release_kernel_safe(ckPathTraceKernel);
+               release_program_safe(path_trace_program);
+       }
+
+       void path_trace(RenderTile& rtile, int sample)
+       {
+               /* Cast arguments to cl types. */
+               cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
+               cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
+               cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state);
+               cl_int d_x = rtile.x;
+               cl_int d_y = rtile.y;
+               cl_int d_w = rtile.w;
+               cl_int d_h = rtile.h;
+               cl_int d_offset = rtile.offset;
+               cl_int d_stride = rtile.stride;
+
+               /* Sample arguments. */
+               cl_int d_sample = sample;
+               cl_uint narg = 0;
+
+               /* TODO : Make the kernel launch similar to Cuda. */
+               KERNEL_APPEND_ARG(ckPathTraceKernel, d_data);
+               KERNEL_APPEND_ARG(ckPathTraceKernel, d_buffer);
+               KERNEL_APPEND_ARG(ckPathTraceKernel, d_rng_state);
+
+#define KERNEL_TEX(type, ttype, name) \
+               set_kernel_arg_mem(ckPathTraceKernel, &narg, #name);
+#include "kernel_textures.h"
+#undef KERNEL_TEX
+
+               KERNEL_APPEND_ARG(ckPathTraceKernel, d_sample);
+               KERNEL_APPEND_ARG(ckPathTraceKernel, d_x);
+               KERNEL_APPEND_ARG(ckPathTraceKernel, d_y);
+               KERNEL_APPEND_ARG(ckPathTraceKernel, d_w);
+               KERNEL_APPEND_ARG(ckPathTraceKernel, d_h);
+               KERNEL_APPEND_ARG(ckPathTraceKernel, d_offset);
+               KERNEL_APPEND_ARG(ckPathTraceKernel, d_stride);
+
+               enqueue_kernel(ckPathTraceKernel, d_w, d_h);
+       }
+
        void thread_run(DeviceTask *task)
        {
                if(task->type == DeviceTask::FILM_CONVERT) {
@@ -1064,8 +1394,7 @@ public:
                }
                else if(task->type == DeviceTask::PATH_TRACE) {
                        RenderTile tile;
-                       
-                       /* keep rendering tiles until done */
+                       /* Keep rendering tiles until done. */
                        while(task->acquire_tile(this, tile)) {
                                int start_sample = tile.start_sample;
                                int end_sample = tile.start_sample + tile.num_samples;
@@ -1083,47 +1412,1908 @@ public:
                                        task->update_progress(&tile);
                                }
 
+                               /* Complete kernel execution before release tile */
+                               /* This helps in multi-device render;
+                                * The device that reaches the critical-section function
+                                * release_tile waits (stalling other devices from entering
+                                * release_tile) for all kernels to complete. If device1 (a
+                                * slow-render device) reaches release_tile first then it would
+                                * stall device2 (a fast-render device) from proceeding to render
+                                * next tile.
+                                */
+                               clFinish(cqCommandQueue);
+
                                task->release_tile(tile);
                        }
                }
        }
+};
 
-       class OpenCLDeviceTask : public DeviceTask {
-       public:
-               OpenCLDeviceTask(OpenCLDevice *device, DeviceTask& task)
-               : DeviceTask(task)
+/* TODO(sergey): This is to keep tile split on OpenCL level working
+ * for now, since withotu this viewport render does not work as it
+ * should.
+ *
+ * Ideally it'll be done on the higher level, but we need to get ready
+ * for merge rather soon, so let's keep split logic private here in
+ * the file.
+ */
+class SplitRenderTile : public RenderTile {
+public:
+       SplitRenderTile()
+               : RenderTile(),
+                 buffer_offset_x(0),
+                 buffer_offset_y(0),
+                 rng_state_offset_x(0),
+                 rng_state_offset_y(0),
+                 buffer_rng_state_stride(0) {}
+
+       explicit SplitRenderTile(RenderTile& tile)
+               : RenderTile(),
+                 buffer_offset_x(0),
+                 buffer_offset_y(0),
+                 rng_state_offset_x(0),
+                 rng_state_offset_y(0),
+                 buffer_rng_state_stride(0)
+       {
+               x = tile.x;
+               y = tile.y;
+               w = tile.w;
+               h = tile.h;
+               start_sample = tile.start_sample;
+               num_samples = tile.num_samples;
+               sample = tile.sample;
+               resolution = tile.resolution;
+               offset = tile.offset;
+               stride = tile.stride;
+               buffer = tile.buffer;
+               rng_state = tile.rng_state;
+               buffers = tile.buffers;
+       }
+
+       /* Split kernel is device global memory constained;
+        * hence split kernel cant render big tile size's in
+        * one go. If the user sets a big tile size (big tile size
+        * is a term relative to the available device global memory),
+        * we split the tile further and then call path_trace on
+        * each of those split tiles. The following variables declared,
+        * assist in achieving that purpose
+        */
+       int buffer_offset_x;
+       int buffer_offset_y;
+       int rng_state_offset_x;
+       int rng_state_offset_y;
+       int buffer_rng_state_stride;
+};
+
+/* OpenCLDeviceSplitKernel's declaration/definition. */
+class OpenCLDeviceSplitKernel : public OpenCLDeviceBase
+{
+public:
+       /* Kernel declaration. */
+       cl_kernel ckPathTraceKernel_data_init;
+       cl_kernel ckPathTraceKernel_scene_intersect;
+       cl_kernel ckPathTraceKernel_lamp_emission;
+       cl_kernel ckPathTraceKernel_queue_enqueue;
+       cl_kernel ckPathTraceKernel_background_buffer_update;
+       cl_kernel ckPathTraceKernel_shader_lighting;
+       cl_kernel ckPathTraceKernel_holdout_emission_blurring_pathtermination_ao;
+       cl_kernel ckPathTraceKernel_direct_lighting;
+       cl_kernel ckPathTraceKernel_shadow_blocked_direct_lighting;
+       cl_kernel ckPathTraceKernel_setup_next_iteration;
+       cl_kernel ckPathTraceKernel_sum_all_radiance;
+
+       /* cl_program declaration. */
+       cl_program data_init_program;
+       cl_program scene_intersect_program;
+       cl_program lamp_emission_program;
+       cl_program queue_enqueue_program;
+       cl_program background_buffer_update_program;
+       cl_program shader_eval_program;
+       cl_program holdout_emission_blurring_termination_ao_program;
+       cl_program direct_lighting_program;
+       cl_program shadow_blocked_program;
+       cl_program next_iteration_setup_program;
+       cl_program sum_all_radiance_program;
+
+       /* Global memory variables [porting]; These memory is used for
+        * co-operation between different kernels; Data written by one
+        * kernel will be avaible to another kernel via this global
+        * memory.
+        */
+       cl_mem rng_coop;
+       cl_mem throughput_coop;
+       cl_mem L_transparent_coop;
+       cl_mem PathRadiance_coop;
+       cl_mem Ray_coop;
+       cl_mem PathState_coop;
+       cl_mem Intersection_coop;
+       cl_mem kgbuffer;  /* KernelGlobals buffer. */
+
+       /* Global buffers for ShaderData. */
+       cl_mem sd;             /* ShaderData used in the main path-iteration loop. */
+       cl_mem sd_DL_shadow;   /* ShaderData used in Direct Lighting and
+                               * shadow_blocked kernel.
+                               */
+
+       /* Global buffers of each member of ShaderData. */
+       cl_mem P_sd;
+       cl_mem P_sd_DL_shadow;
+       cl_mem N_sd;
+       cl_mem N_sd_DL_shadow;
+       cl_mem Ng_sd;
+       cl_mem Ng_sd_DL_shadow;
+       cl_mem I_sd;
+       cl_mem I_sd_DL_shadow;
+       cl_mem shader_sd;
+       cl_mem shader_sd_DL_shadow;
+       cl_mem flag_sd;
+       cl_mem flag_sd_DL_shadow;
+       cl_mem prim_sd;
+       cl_mem prim_sd_DL_shadow;
+       cl_mem type_sd;
+       cl_mem type_sd_DL_shadow;
+       cl_mem u_sd;
+       cl_mem u_sd_DL_shadow;
+       cl_mem v_sd;
+       cl_mem v_sd_DL_shadow;
+       cl_mem object_sd;
+       cl_mem object_sd_DL_shadow;
+       cl_mem time_sd;
+       cl_mem time_sd_DL_shadow;
+       cl_mem ray_length_sd;
+       cl_mem ray_length_sd_DL_shadow;
+       cl_mem ray_depth_sd;
+       cl_mem ray_depth_sd_DL_shadow;
+       cl_mem transparent_depth_sd;
+       cl_mem transparent_depth_sd_DL_shadow;
+#ifdef __RAY_DIFFERENTIALS__
+       cl_mem dP_sd, dI_sd;
+       cl_mem dP_sd_DL_shadow, dI_sd_DL_shadow;
+       cl_mem du_sd, dv_sd;
+       cl_mem du_sd_DL_shadow, dv_sd_DL_shadow;
+#endif
+#ifdef __DPDU__
+       cl_mem dPdu_sd, dPdv_sd;
+       cl_mem dPdu_sd_DL_shadow, dPdv_sd_DL_shadow;
+#endif
+       cl_mem closure_sd;
+       cl_mem closure_sd_DL_shadow;
+       cl_mem num_closure_sd;
+       cl_mem num_closure_sd_DL_shadow;
+       cl_mem randb_closure_sd;
+       cl_mem randb_closure_sd_DL_shadow;
+       cl_mem ray_P_sd;
+       cl_mem ray_P_sd_DL_shadow;
+       cl_mem ray_dP_sd;
+       cl_mem ray_dP_sd_DL_shadow;
+
+       /* Global memory required for shadow blocked and accum_radiance. */
+       cl_mem BSDFEval_coop;
+       cl_mem ISLamp_coop;
+       cl_mem LightRay_coop;
+       cl_mem AOAlpha_coop;
+       cl_mem AOBSDF_coop;
+       cl_mem AOLightRay_coop;
+       cl_mem Intersection_coop_AO;
+       cl_mem Intersection_coop_DL;
+
+#ifdef WITH_CYCLES_DEBUG
+       /* DebugData memory */
+       cl_mem debugdata_coop;
+#endif
+
+       /* Global state array that tracks ray state. */
+       cl_mem ray_state;
+
+       /* Per sample buffers. */
+       cl_mem per_sample_output_buffers;
+
+       /* Denotes which sample each ray is being processed for. */
+       cl_mem work_array;
+
+       /* Queue */
+       cl_mem Queue_data;  /* Array of size queuesize * num_queues * sizeof(int). */
+       cl_mem Queue_index; /* Array of size num_queues * sizeof(int);
+                            * Tracks the size of each queue.
+                            */
+
+       /* Flag to make sceneintersect and lampemission kernel use queues. */
+       cl_mem use_queues_flag;
+
+       /* Required-memory size. */
+       size_t throughput_size;
+       size_t L_transparent_size;
+       size_t rayState_size;
+       size_t hostRayState_size;
+       size_t work_element_size;
+       size_t ISLamp_size;
+
+       /* Sizes of memory required for shadow blocked function. */
+       size_t AOAlpha_size;
+       size_t AOBSDF_size;
+
+       /* Amount of memory in output buffer associated with one pixel/thread. */
+       size_t per_thread_output_buffer_size;
+
+       /* Total allocatable available device memory. */
+       size_t total_allocatable_memory;
+
+       /* host version of ray_state; Used in checking host path-iteration
+        * termination.
+        */
+       char *hostRayStateArray;
+
+       /* Number of path-iterations to be done in one shot. */
+       unsigned int PathIteration_times;
+
+#ifdef __WORK_STEALING__
+       /* Work pool with respect to each work group. */
+       cl_mem work_pool_wgs;
+
+       /* Denotes the maximum work groups possible w.r.t. current tile size. */
+       unsigned int max_work_groups;
+#endif
+
+       /* clos_max value for which the kernels have been loaded currently. */
+       int current_clos_max;
+
+       /* Marked True in constructor and marked false at the end of path_trace(). */
+       bool first_tile;
+
+       OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_)
+       : OpenCLDeviceBase(info, stats, background_)
+       {
+
+               info.use_split_kernel = true;
+               background = background_;
+
+               /* Initialize kernels. */
+               ckPathTraceKernel_data_init = NULL;
+               ckPathTraceKernel_scene_intersect = NULL;
+               ckPathTraceKernel_lamp_emission = NULL;
+               ckPathTraceKernel_background_buffer_update = NULL;
+               ckPathTraceKernel_shader_lighting = NULL;
+               ckPathTraceKernel_holdout_emission_blurring_pathtermination_ao = NULL;
+               ckPathTraceKernel_direct_lighting = NULL;
+               ckPathTraceKernel_shadow_blocked_direct_lighting = NULL;
+               ckPathTraceKernel_setup_next_iteration = NULL;
+               ckPathTraceKernel_sum_all_radiance = NULL;
+               ckPathTraceKernel_queue_enqueue = NULL;
+
+               /* Initialize program. */
+               data_init_program = NULL;
+               scene_intersect_program = NULL;
+               lamp_emission_program = NULL;
+               queue_enqueue_program = NULL;
+               background_buffer_update_program = NULL;
+               shader_eval_program = NULL;
+               holdout_emission_blurring_termination_ao_program = NULL;
+               direct_lighting_program = NULL;
+               shadow_blocked_program = NULL;
+               next_iteration_setup_program = NULL;
+               sum_all_radiance_program = NULL;
+
+               /* Initialize cl_mem variables. */
+               kgbuffer = NULL;
+               sd = NULL;
+               sd_DL_shadow = NULL;
+
+               P_sd = NULL;
+               P_sd_DL_shadow = NULL;
+               N_sd = NULL;
+               N_sd_DL_shadow = NULL;
+               Ng_sd = NULL;
+               Ng_sd_DL_shadow = NULL;
+               I_sd = NULL;
+               I_sd_DL_shadow = NULL;
+               shader_sd = NULL;
+               shader_sd_DL_shadow = NULL;
+               flag_sd = NULL;
+               flag_sd_DL_shadow = NULL;
+               prim_sd = NULL;
+               prim_sd_DL_shadow = NULL;
+               type_sd = NULL;
+               type_sd_DL_shadow = NULL;
+               u_sd = NULL;
+               u_sd_DL_shadow = NULL;
+               v_sd = NULL;
+               v_sd_DL_shadow = NULL;
+               object_sd = NULL;
+               object_sd_DL_shadow = NULL;
+               time_sd = NULL;
+               time_sd_DL_shadow = NULL;
+               ray_length_sd = NULL;
+               ray_length_sd_DL_shadow = NULL;
+               ray_depth_sd = NULL;
+               ray_depth_sd_DL_shadow = NULL;
+               transparent_depth_sd = NULL;
+               transparent_depth_sd_DL_shadow = NULL;
+#ifdef __RAY_DIFFERENTIALS__
+               dP_sd = NULL;
+               dI_sd = NULL;
+               dP_sd_DL_shadow = NULL;
+               dI_sd_DL_shadow = NULL;
+               du_sd = NULL;
+               dv_sd = NULL;
+               du_sd_DL_shadow = NULL;
+               dv_sd_DL_shadow = NULL;
+#endif
+#ifdef __DPDU__
+               dPdu_sd = NULL;
+               dPdv_sd = NULL;
+               dPdu_sd_DL_shadow = NULL;
+               dPdv_sd_DL_shadow = NULL;
+#endif
+               closure_sd = NULL;
+               closure_sd_DL_shadow = NULL;
+               num_closure_sd = NULL;
+               num_closure_sd_DL_shadow = NULL;
+               randb_closure_sd = NULL;
+               randb_closure_sd_DL_shadow = NULL;
+               ray_P_sd = NULL;
+               ray_P_sd_DL_shadow = NULL;
+               ray_dP_sd = NULL;
+               ray_dP_sd_DL_shadow = NULL;
+
+               rng_coop = NULL;
+               throughput_coop = NULL;
+               L_transparent_coop = NULL;
+               PathRadiance_coop = NULL;
+               Ray_coop = NULL;
+               PathState_coop = NULL;
+               Intersection_coop = NULL;
+               ray_state = NULL;
+
+               AOAlpha_coop = NULL;
+               AOBSDF_coop = NULL;
+               AOLightRay_coop = NULL;
+               BSDFEval_coop = NULL;
+               ISLamp_coop = NULL;
+               LightRay_coop = NULL;
+               Intersection_coop_AO = NULL;
+               Intersection_coop_DL = NULL;
+
+#ifdef WITH_CYCLES_DEBUG
+               debugdata_coop = NULL;
+#endif
+
+               work_array = NULL;
+
+               /* Queue. */
+               Queue_data = NULL;
+               Queue_index = NULL;
+               use_queues_flag = NULL;
+
+               per_sample_output_buffers = NULL;
+
+               /* Initialize required memory size. */
+               throughput_size = sizeof(float3);
+               L_transparent_size = sizeof(float);
+               rayState_size = sizeof(char);
+               hostRayState_size = sizeof(char);
+               work_element_size = sizeof(unsigned int);
+               ISLamp_size = sizeof(int);
+
+               /* Initialize sizes of memory required for shadow blocked function. */
+               AOAlpha_size = sizeof(float3);
+               AOBSDF_size = sizeof(float3);
+
+               per_thread_output_buffer_size = 0;
+               hostRayStateArray = NULL;
+               PathIteration_times = PATH_ITER_INC_FACTOR;
+#ifdef __WORK_STEALING__
+               work_pool_wgs = NULL;
+               max_work_groups = 0;
+#endif
+               current_clos_max = -1;
+               first_tile = true;
+
+               /* Get device's maximum memory that can be allocated. */
+               ciErr = clGetDeviceInfo(cdDevice,
+                                       CL_DEVICE_MAX_MEM_ALLOC_SIZE,
+                                       sizeof(size_t),
+                                       &total_allocatable_memory,
+                                       NULL);
+               assert(ciErr == CL_SUCCESS);
+               if(platform_name == "AMD Accelerated Parallel Processing") {
+                       /* This value is tweak-able; AMD platform does not seem to
+                        * give maximum performance when all of CL_DEVICE_MAX_MEM_ALLOC_SIZE
+                        * is considered for further computation.
+                        */
+                       total_allocatable_memory /= 2;
+               }
+       }
+
+       /* TODO(sergey): Seems really close to load_kernel(),
+        * could it be de-duplicated?
+        */
+       bool load_split_kernel(string kernel_path,
+                              string kernel_init_source,
+                              string clbin,
+                              string custom_kernel_build_options,
+                              cl_program *program)
+       {
+               if(!opencl_version_check())
+                       return false;
+
+               clbin = path_user_get(path_join("cache", clbin));
+
+               /* Path to preprocessed source for debugging. */
+               string *debug_src = NULL;
+
+               /* If exists already, try use it. */
+               if(path_exists(clbin) && load_binary(kernel_path,
+                                                    clbin,
+                                                    custom_kernel_build_options,
+                                                    program,
+                                                    debug_src)) {
+                       /* Kernel loaded from binary. */
+               }
+               else {
+                       /* If does not exist or loading binary failed, compile kernel. */
+                       if(!compile_kernel(kernel_path,
+                                          kernel_init_source,
+                                          custom_kernel_build_options,
+                                          program))
+                       {
+                               return false;
+                       }
+                       /* Save binary for reuse. */
+                       if(!save_binary(program, clbin)) {
+                               return false;
+                       }
+               }
+               return true;
+       }
+
+       /* Split kernel utility functions. */
+       size_t get_tex_size(const char *tex_name)
+       {
+               cl_mem ptr;
+               size_t ret_size = 0;
+               MemMap::iterator i = mem_map.find(tex_name);
+               if(i != mem_map.end()) {
+                       ptr = CL_MEM_PTR(i->second);
+                       ciErr = clGetMemObjectInfo(ptr,
+                                                  CL_MEM_SIZE,
+                                                  sizeof(ret_size),
+                                                  &ret_size,
+                                                  NULL);
+                       assert(ciErr == CL_SUCCESS);
+               }
+               return ret_size;
+       }
+
+       size_t get_shader_closure_size(int max_closure)
+       {
+               return (sizeof(ShaderClosure)* max_closure);
+       }
+
+       size_t get_shader_data_size(size_t shader_closure_size)
+       {
+               /* ShaderData size without accounting for ShaderClosure array. */
+               size_t shader_data_size =
+                       sizeof(ShaderData) - (sizeof(ShaderClosure) * MAX_CLOSURE);
+               return (shader_data_size + shader_closure_size);
+       }
+
+       /* Returns size of KernelGlobals structure associated with OpenCL. */
+       size_t get_KernelGlobals_size()
+       {
+               /* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to
+                * fetch its size.
+                */
+               typedef struct KernelGlobals {
+                       ccl_constant KernelData *data;
+#define KERNEL_TEX(type, ttype, name) \
+       ccl_global type *name;
+#include "kernel_textures.h"
+#undef KERNEL_TEX
+               } KernelGlobals;
+
+               return sizeof(KernelGlobals);
+       }
+
+       /* Returns size of Structure of arrays implementation of. */
+       size_t get_shaderdata_soa_size()
+       {
+               size_t shader_soa_size = 0;
+
+#define SD_VAR(type, what) \
+               shader_soa_size += sizeof(void *);
+#define SD_CLOSURE_VAR(type, what, max_closure)
+               shader_soa_size += sizeof(void *);
+               #include "kernel_shaderdata_vars.h"
+#undef SD_VAR
+#undef SD_CLOSURE_VAR
+
+               return shader_soa_size;
+       }
+
+       bool load_kernels(const DeviceRequestedFeatures& requested_features)
+       {
+               /* If it is an interactive render; we ceil clos_max value to a multiple
+                * of 5 in order to limit re-compilations.
+                */
+               /* TODO(sergey): Decision about this should be done on higher levels. */
+               int max_closure = requested_features.max_closure;
+               if(!background) {
+                       assert((max_closure != 0) && "clos_max value is 0" );
+                       max_closure = (((max_closure - 1) / 5) + 1) * 5;
+                       /* clos_max value shouldn't be greater than MAX_CLOSURE. */
+                       max_closure = (max_closure > MAX_CLOSURE) ? MAX_CLOSURE : max_closure;
+                       if(current_clos_max == max_closure) {
+                               /* Present kernels have been created with the same closure count
+                                * build option.
+                                */
+                               return true;
+                       }
+               }
+               /* Get Shader, bake and film_convert kernels.
+                * It'll also do verification of OpenCL actually initialized.
+                */
+               if(!OpenCLDeviceBase::load_kernels(requested_features)) {
+                       return false;
+               }
+
+               string svm_build_options = "";
+               string max_closure_build_option = "";
+               string compute_device_type_build_option = "";
+
+               /* Set svm_build_options. */
+               svm_build_options += " -D__NODES_MAX_GROUP__=" +
+                       string_printf("%d", requested_features.max_nodes_group);
+               svm_build_options += " -D__NODES_FEATURES__=" +
+                       string_printf("%d", requested_features.nodes_features);
+
+               /* Set max closure build option. */
+               max_closure_build_option += string_printf("-D__MAX_CLOSURE__=%d ",
+                                                         max_closure);
+
+               /* Set compute device build option. */
+               cl_device_type device_type;
+               ciErr = clGetDeviceInfo(cdDevice,
+                                       CL_DEVICE_TYPE,
+                                       sizeof(cl_device_type),
+                                       &device_type,
+                                       NULL);
+               assert(ciErr == CL_SUCCESS);
+               if(device_type == CL_DEVICE_TYPE_GPU) {
+                       compute_device_type_build_option = "-D__COMPUTE_DEVICE_GPU__ ";
+               }
+
+               string kernel_path = path_get("kernel");
+               string kernel_md5 = path_files_md5_hash(kernel_path);
+               string device_md5;
+               string custom_kernel_build_options;
+               string kernel_init_source;
+               string clbin;
+
+               string common_custom_build_options = "";
+               common_custom_build_options += "-D__SPLIT_KERNEL__ ";
+               common_custom_build_options += max_closure_build_option;;
+#ifdef __WORK_STEALING__
+               common_custom_build_options += "-D__WORK_STEALING__ ";
+#endif
+
+#define LOAD_KERNEL(program, name) \
+       do { \
+               kernel_init_source = "#include \"kernel_" name ".cl\" // " + \
+                                    kernel_md5 + "\n"; \
+               custom_kernel_build_options = common_custom_build_options; \
+               device_md5 = device_md5_hash(custom_kernel_build_options); \
+               clbin = string_printf("cycles_kernel_%s_%s_" name ".clbin", \
+                                     device_md5.c_str(), kernel_md5.c_str()); \
+               if(!load_split_kernel(kernel_path, kernel_init_source, clbin, \
+                                     custom_kernel_build_options, &program)) \
+               { \
+                       return false; \
+               } \
+       } while(false)
+
+               /* TODO(sergey): If names are unified we can save some more bits of
+                * code here.
+                */
+               LOAD_KERNEL(data_init_program, "data_init");
+               LOAD_KERNEL(scene_intersect_program, "scene_intersect");
+               LOAD_KERNEL(lamp_emission_program, "lamp_emission");
+               LOAD_KERNEL(queue_enqueue_program, "queue_enqueue");
+               LOAD_KERNEL(background_buffer_update_program, "background_buffer_update");
+               LOAD_KERNEL(shader_eval_program, "shader_eval");
+               LOAD_KERNEL(holdout_emission_blurring_termination_ao_program,
+                           "holdout_emission_blurring_pathtermination_ao");
+               LOAD_KERNEL(direct_lighting_program, "direct_lighting");
+               LOAD_KERNEL(shadow_blocked_program, "shadow_blocked");
+               LOAD_KERNEL(next_iteration_setup_program, "next_iteration_setup");
+               LOAD_KERNEL(sum_all_radiance_program, "sum_all_radiance");
+
+#undef LOAD_KERNEL
+
+#define GLUE(a, b) a ## b
+#define FIND_KERNEL(kernel, program, function) \
+       do { \
+               GLUE(ckPathTraceKernel_, kernel) = \
+                       clCreateKernel(GLUE(program, _program), \
+                                      "kernel_ocl_path_trace_"  function, &ciErr); \
+               if(opencl_error(ciErr)) { \
+                       return false; \
+               } \
+       } while(false)
+
+               FIND_KERNEL(data_init, data_init, "data_initialization");
+               FIND_KERNEL(scene_intersect, scene_intersect, "scene_intersect");
+               FIND_KERNEL(lamp_emission, lamp_emission, "lamp_emission");
+               FIND_KERNEL(queue_enqueue, queue_enqueue, "queue_enqueue");
+               FIND_KERNEL(background_buffer_update, background_buffer_update, "background_buffer_update");
+               FIND_KERNEL(shader_lighting, shader_eval, "shader_evaluation");
+               FIND_KERNEL(holdout_emission_blurring_pathtermination_ao,
+                           holdout_emission_blurring_termination_ao,
+                           "holdout_emission_blurring_pathtermination_ao");
+               FIND_KERNEL(direct_lighting, direct_lighting, "direct_lighting");
+               FIND_KERNEL(shadow_blocked_direct_lighting, shadow_blocked, "shadow_blocked_direct_lighting");
+               FIND_KERNEL(setup_next_iteration, next_iteration_setup, "setup_next_iteration");
+               FIND_KERNEL(sum_all_radiance, sum_all_radiance, "sum_all_radiance");
+#undef FIND_KERNEL
+#undef GLUE
+
+               current_clos_max = max_closure;
+
+               return true;
+       }
+
+       ~OpenCLDeviceSplitKernel()
+       {
+               task_pool.stop();
+
+               /* Release kernels */
+               release_kernel_safe(ckPathTraceKernel_data_init);
+               release_kernel_safe(ckPathTraceKernel_scene_intersect);
+               release_kernel_safe(ckPathTraceKernel_lamp_emission);
+               release_kernel_safe(ckPathTraceKernel_queue_enqueue);
+               release_kernel_safe(ckPathTraceKernel_background_buffer_update);
+               release_kernel_safe(ckPathTraceKernel_shader_lighting);
+               release_kernel_safe(ckPathTraceKernel_holdout_emission_blurring_pathtermination_ao);
+               release_kernel_safe(ckPathTraceKernel_direct_lighting);
+               release_kernel_safe(ckPathTraceKernel_shadow_blocked_direct_lighting);
+               release_kernel_safe(ckPathTraceKernel_setup_next_iteration);
+               release_kernel_safe(ckPathTraceKernel_sum_all_radiance);
+
+               /* Release global memory */
+               release_mem_object_safe(P_sd);
+               release_mem_object_safe(P_sd_DL_shadow);
+               release_mem_object_safe(N_sd);
+               release_mem_object_safe(N_sd_DL_shadow);
+               release_mem_object_safe(Ng_sd);
+               release_mem_object_safe(Ng_sd_DL_shadow);
+               release_mem_object_safe(I_sd);
+               release_mem_object_safe(I_sd_DL_shadow);
+               release_mem_object_safe(shader_sd);
+               release_mem_object_safe(shader_sd_DL_shadow);
+               release_mem_object_safe(flag_sd);
+               release_mem_object_safe(flag_sd_DL_shadow);
+               release_mem_object_safe(prim_sd);
+               release_mem_object_safe(prim_sd_DL_shadow);
+               release_mem_object_safe(type_sd);
+               release_mem_object_safe(type_sd_DL_shadow);
+               release_mem_object_safe(u_sd);
+               release_mem_object_safe(u_sd_DL_shadow);
+               release_mem_object_safe(v_sd);
+               release_mem_object_safe(v_sd_DL_shadow);
+               release_mem_object_safe(object_sd);
+               release_mem_object_safe(object_sd_DL_shadow);
+               release_mem_object_safe(time_sd);
+               release_mem_object_safe(time_sd_DL_shadow);
+               release_mem_object_safe(ray_length_sd);
+               release_mem_object_safe(ray_length_sd_DL_shadow);
+               release_mem_object_safe(ray_depth_sd);
+               release_mem_object_safe(ray_depth_sd_DL_shadow);
+               release_mem_object_safe(transparent_depth_sd);
+               release_mem_object_safe(transparent_depth_sd_DL_shadow);
+#ifdef __RAY_DIFFERENTIALS__
+               release_mem_object_safe(dP_sd);
+               release_mem_object_safe(dP_sd_DL_shadow);
+               release_mem_object_safe(dI_sd);
+               release_mem_object_safe(dI_sd_DL_shadow);
+               release_mem_object_safe(du_sd);
+               release_mem_object_safe(du_sd_DL_shadow);
+               release_mem_object_safe(dv_sd);
+               release_mem_object_safe(dv_sd_DL_shadow);
+#endif
+#ifdef __DPDU__
+               release_mem_object_safe(dPdu_sd);
+               release_mem_object_safe(dPdu_sd_DL_shadow);
+               release_mem_object_safe(dPdv_sd);
+               release_mem_object_safe(dPdv_sd_DL_shadow);
+#endif
+               release_mem_object_safe(closure_sd);
+               release_mem_object_safe(closure_sd_DL_shadow);
+               release_mem_object_safe(num_closure_sd);
+               release_mem_object_safe(num_closure_sd_DL_shadow);
+               release_mem_object_safe(randb_closure_sd);
+               release_mem_object_safe(randb_closure_sd_DL_shadow);
+               release_mem_object_safe(ray_P_sd);
+               release_mem_object_safe(ray_P_sd_DL_shadow);
+               release_mem_object_safe(ray_dP_sd);
+               release_mem_object_safe(ray_dP_sd_DL_shadow);
+               release_mem_object_safe(rng_coop);
+               release_mem_object_safe(throughput_coop);
+               release_mem_object_safe(L_transparent_coop);
+               release_mem_object_safe(PathRadiance_coop);
+               release_mem_object_safe(Ray_coop);
+               release_mem_object_safe(PathState_coop);
+               release_mem_object_safe(Intersection_coop);
+               release_mem_object_safe(kgbuffer);
+               release_mem_object_safe(sd);
+               release_mem_object_safe(sd_DL_shadow);
+               release_mem_object_safe(ray_state);
+               release_mem_object_safe(AOAlpha_coop);
+               release_mem_object_safe(AOBSDF_coop);
+               release_mem_object_safe(AOLightRay_coop);
+               release_mem_object_safe(BSDFEval_coop);
+               release_mem_object_safe(ISLamp_coop);
+               release_mem_object_safe(LightRay_coop);
+               release_mem_object_safe(Intersection_coop_AO);
+               release_mem_object_safe(Intersection_coop_DL);
+#ifdef WITH_CYCLES_DEBUG
+               release_mem_object_safe(debugdata_coop);
+#endif
+               release_mem_object_safe(use_queues_flag);
+               release_mem_object_safe(Queue_data);
+               release_mem_object_safe(Queue_index);
+               release_mem_object_safe(work_array);
+#ifdef __WORK_STEALING__
+               release_mem_object_safe(work_pool_wgs);
+#endif
+               release_mem_object_safe(per_sample_output_buffers);
+
+               /* Release programs */
+               release_program_safe(data_init_program);
+               release_program_safe(scene_intersect_program);
+               release_program_safe(lamp_emission_program);
+               release_program_safe(queue_enqueue_program);
+               release_program_safe(background_buffer_update_program);
+               release_program_safe(shader_eval_program);
+               release_program_safe(holdout_emission_blurring_termination_ao_program);
+               release_program_safe(direct_lighting_program);
+               release_program_safe(shadow_blocked_program);
+               release_program_safe(next_iteration_setup_program);
+               release_program_safe(sum_all_radiance_program);
+
+               if(hostRayStateArray != NULL) {
+                       free(hostRayStateArray);
+               }
+       }
+
+       void path_trace(SplitRenderTile& rtile, int2 max_render_feasible_tile_size)
+       {
+               /* cast arguments to cl types */
+               cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
+               cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
+               cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state);
+               cl_int d_x = rtile.x;
+               cl_int d_y = rtile.y;
+               cl_int d_w = rtile.w;
+               cl_int d_h = rtile.h;
+               cl_int d_offset = rtile.offset;
+               cl_int d_stride = rtile.stride;
+
+               /* Make sure that set render feasible tile size is a multiple of local
+                * work size dimensions.
+                */
+               assert(max_render_feasible_tile_size.x % SPLIT_KERNEL_LOCAL_SIZE_X == 0);
+               assert(max_render_feasible_tile_size.y % SPLIT_KERNEL_LOCAL_SIZE_Y == 0);
+
+               /* ray_state and hostRayStateArray should be of same size. */
+               assert(hostRayState_size == rayState_size);
+               assert(rayState_size == 1);
+
+               size_t global_size[2];
+               size_t local_size[2] = {SPLIT_KERNEL_LOCAL_SIZE_X,
+                                       SPLIT_KERNEL_LOCAL_SIZE_Y};
+
+               /* Set the range of samples to be processed for every ray in
+                * path-regeneration logic.
+                */
+               cl_int start_sample = rtile.start_sample;
+               cl_int end_sample = rtile.start_sample + rtile.num_samples;
+               cl_int num_samples = rtile.num_samples;
+
+#ifdef __WORK_STEALING__
+               global_size[0] = (((d_w - 1) / local_size[0]) + 1) * local_size[0];
+               global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1];
+               unsigned int num_parallel_samples = 1;
+#else
+               global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1];
+               unsigned int num_threads = max_render_feasible_tile_size.x *
+                                          max_render_feasible_tile_size.y;
+               unsigned int num_tile_columns_possible = num_threads / global_size[1];
+               /* Estimate number of parallel samples that can be
+                * processed in parallel.
+                */
+               unsigned int num_parallel_samples = min(num_tile_columns_possible / d_w,
+                                                       rtile.num_samples);
+               /* Wavefront size in AMD is 64.
+                * TODO(sergey): What about other platforms?
+                */
+               if(num_parallel_samples >= 64) {
+                       /* TODO(sergey): Could use generic round-up here. */
+                       num_parallel_samples = (num_parallel_samples / 64) * 64
+               }
+               assert(num_parallel_samples != 0);
+
+               global_size[0] = d_w * num_parallel_samples;
+#endif  /* __WORK_STEALING__ */
+
+               assert(global_size[0] * global_size[1] <=
+                      max_render_feasible_tile_size.x * max_render_feasible_tile_size.y);
+
+               /* Allocate all required global memory once. */
+               if(first_tile) {
+                       size_t num_global_elements = max_render_feasible_tile_size.x *
+                                                    max_render_feasible_tile_size.y;
+                       /* TODO(sergey): This will actually over-allocate if
+                        * particular kernel does not support multiclosure.
+                        */
+                       size_t ShaderClosure_size = get_shader_closure_size(current_clos_max);
+
+#ifdef __WORK_STEALING__
+                       /* Calculate max groups */
+                       size_t max_global_size[2];
+                       size_t tile_x = max_render_feasible_tile_size.x;
+                       size_t tile_y = max_render_feasible_tile_size.y;
+                       max_global_size[0] = (((tile_x - 1) / local_size[0]) + 1) * local_size[0];
+                       max_global_size[1] = (((tile_y - 1) / local_size[1]) + 1) * local_size[1];
+                       max_work_groups = (max_global_size[0] * max_global_size[1]) /
+                                         (local_size[0] * local_size[1]);
+                       /* Allocate work_pool_wgs memory. */
+                       work_pool_wgs = mem_alloc(max_work_groups * sizeof(unsigned int));
+#endif  /* __WORK_STEALING__ */
+
+                       /* Allocate queue_index memory only once. */
+                       Queue_index = mem_alloc(NUM_QUEUES * sizeof(int));
+                       use_queues_flag = mem_alloc(sizeof(char));
+                       kgbuffer = mem_alloc(get_KernelGlobals_size());
+
+                       /* Create global buffers for ShaderData. */
+                       sd = mem_alloc(get_shaderdata_soa_size());
+                       sd_DL_shadow = mem_alloc(get_shaderdata_soa_size());
+                       P_sd = mem_alloc(num_global_elements * sizeof(float3));
+                       P_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
+                       N_sd = mem_alloc(num_global_elements * sizeof(float3));
+                       N_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
+                       Ng_sd = mem_alloc(num_global_elements * sizeof(float3));
+                       Ng_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
+                       I_sd = mem_alloc(num_global_elements * sizeof(float3));
+                       I_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
+                       shader_sd = mem_alloc(num_global_elements * sizeof(int));
+                       shader_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
+                       flag_sd = mem_alloc(num_global_elements * sizeof(int));
+                       flag_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
+                       prim_sd = mem_alloc(num_global_elements * sizeof(int));
+                       prim_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
+                       type_sd = mem_alloc(num_global_elements * sizeof(int));
+                       type_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
+                       u_sd = mem_alloc(num_global_elements * sizeof(float));
+                       u_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float));
+                       v_sd = mem_alloc(num_global_elements * sizeof(float));
+                       v_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float));
+                       object_sd = mem_alloc(num_global_elements * sizeof(int));
+                       object_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
+                       time_sd = mem_alloc(num_global_elements * sizeof(float));
+                       time_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float));
+                       ray_length_sd = mem_alloc(num_global_elements * sizeof(float));
+                       ray_length_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float));
+                       ray_depth_sd = mem_alloc(num_global_elements * sizeof(int));
+                       ray_depth_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
+                       transparent_depth_sd = mem_alloc(num_global_elements * sizeof(int));
+                       transparent_depth_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
+
+#ifdef __RAY_DIFFERENTIALS__
+                       dP_sd = mem_alloc(num_global_elements * sizeof(differential3));
+                       dP_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential3));
+                       dI_sd = mem_alloc(num_global_elements * sizeof(differential3));
+                       dI_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential3));
+                       du_sd = mem_alloc(num_global_elements * sizeof(differential));
+                       du_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential));
+                       dv_sd = mem_alloc(num_global_elements * sizeof(differential));
+                       dv_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential));
+#endif
+
+#ifdef __DPDU__
+                       dPdu_sd = mem_alloc(num_global_elements * sizeof(float3));
+                       dPdu_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
+                       dPdv_sd = mem_alloc(num_global_elements * sizeof(float3));
+                       dPdv_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
+#endif
+                       closure_sd = mem_alloc(num_global_elements * ShaderClosure_size);
+                       closure_sd_DL_shadow = mem_alloc(num_global_elements * 2 * ShaderClosure_size);
+                       num_closure_sd = mem_alloc(num_global_elements * sizeof(int));
+                       num_closure_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
+                       randb_closure_sd = mem_alloc(num_global_elements * sizeof(float));
+                       randb_closure_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float));
+                       ray_P_sd = mem_alloc(num_global_elements * sizeof(float3));
+                       ray_P_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
+                       ray_dP_sd = mem_alloc(num_global_elements * sizeof(differential3));
+                       ray_dP_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential3));
+
+                       /* Creation of global memory buffers which are shared among
+                        * the kernels.
+                        */
+                       rng_coop = mem_alloc(num_global_elements * sizeof(RNG));
+                       throughput_coop = mem_alloc(num_global_elements * throughput_size);
+                       L_transparent_coop = mem_alloc(num_global_elements * L_transparent_size);
+                       PathRadiance_coop = mem_alloc(num_global_elements * sizeof(PathRadiance));
+                       Ray_coop = mem_alloc(num_global_elements * sizeof(Ray));
+                       PathState_coop = mem_alloc(num_global_elements * sizeof(PathState));
+                       Intersection_coop = mem_alloc(num_global_elements * sizeof(Intersection));
+                       AOAlpha_coop = mem_alloc(num_global_elements * AOAlpha_size);
+                       AOBSDF_coop = mem_alloc(num_global_elements * AOBSDF_size);
+                       AOLightRay_coop = mem_alloc(num_global_elements * sizeof(Ray));
+                       BSDFEval_coop = mem_alloc(num_global_elements * sizeof(BsdfEval));
+                       ISLamp_coop = mem_alloc(num_global_elements * ISLamp_size);
+                       LightRay_coop = mem_alloc(num_global_elements * sizeof(Ray));
+                       Intersection_coop_AO = mem_alloc(num_global_elements * sizeof(Intersection));
+                       Intersection_coop_DL = mem_alloc(num_global_elements * sizeof(Intersection));
+
+#ifdef WITH_CYCLES_DEBUG
+                       debugdata_coop = mem_alloc(num_global_elements * sizeof(DebugData));
+#endif
+
+                       ray_state = mem_alloc(num_global_elements * rayState_size);
+
+                       hostRayStateArray = (char *)calloc(num_global_elements, hostRayState_size);
+                       assert(hostRayStateArray != NULL && "Can't create hostRayStateArray memory");
+
+                       Queue_data = mem_alloc(num_global_elements * (NUM_QUEUES * sizeof(int)+sizeof(int)));
+                       work_array = mem_alloc(num_global_elements * work_element_size);
+                       per_sample_output_buffers = mem_alloc(num_global_elements *
+                                                             per_thread_output_buffer_size);
+               }
+
+               cl_int dQueue_size = global_size[0] * global_size[1];
+               cl_int total_num_rays = global_size[0] * global_size[1];
+
+               cl_uint start_arg_index =
+                       kernel_set_args(ckPathTraceKernel_data_init,
+                                       0,
+                                       kgbuffer,
+                                       sd,
+                                       sd_DL_shadow,
+                                       P_sd,
+                                       P_sd_DL_shadow,
+                                       N_sd,
+                                       N_sd_DL_shadow,
+                                       Ng_sd,
+                                       Ng_sd_DL_shadow,
+                                       I_sd,
+                                       I_sd_DL_shadow,
+                                       shader_sd,
+                                       shader_sd_DL_shadow,
+                                       flag_sd,
+                                       flag_sd_DL_shadow,
+                                       prim_sd,
+                                       prim_sd_DL_shadow,
+                                       type_sd,
+                                       type_sd_DL_shadow,
+                                       u_sd,
+                                       u_sd_DL_shadow,
+                                       v_sd,
+                                       v_sd_DL_shadow,
+                                       object_sd,
+                                       object_sd_DL_shadow,
+                                       time_sd,
+                                       time_sd_DL_shadow,
+                                       ray_length_sd,
+                                       ray_length_sd_DL_shadow,
+                                       ray_depth_sd,
+                                       ray_depth_sd_DL_shadow,
+                                       transparent_depth_sd,
+                                       transparent_depth_sd_DL_shadow);
+
+               start_arg_index +=
+                       kernel_set_args(ckPathTraceKernel_data_init,
+#ifdef __RAY_DIFFERENTIALS__
+                                       start_arg_index,
+                                       dP_sd,
+                                       dP_sd_DL_shadow,
+                                       dI_sd,
+                                       dI_sd_DL_shadow,
+                                       du_sd,
+                                       du_sd_DL_shadow,
+                                       dv_sd,
+                                       dv_sd_DL_shadow,
+#endif
+#ifdef __DPDU__
+                                       dPdu_sd,
+                                       dPdu_sd_DL_shadow,
+                                       dPdv_sd,
+                                       dPdv_sd_DL_shadow,
+#endif
+                                       closure_sd,
+                                       closure_sd_DL_shadow,
+                                       num_closure_sd,
+                                       num_closure_sd_DL_shadow,
+                                       randb_closure_sd,
+                                       randb_closure_sd_DL_shadow,
+                                       ray_P_sd,
+                                       ray_P_sd_DL_shadow,
+                                       ray_dP_sd,
+                                       ray_dP_sd_DL_shadow,
+                                       d_data,
+                                       per_sample_output_buffers,
+                                       d_rng_state,
+                                       rng_coop,
+                                       throughput_coop,
+                                       L_transparent_coop,
+                                       PathRadiance_coop,
+                                       Ray_coop,
+                                       PathState_coop,
+                                       ray_state);
+
+/* TODO(segrey): Avoid map lookup here. */
+#define KERNEL_TEX(type, ttype, name) \
+       set_kernel_arg_mem(ckPathTraceKernel_data_init, &start_arg_index, #name);
+#include "kernel_textures.h"
+#undef KERNEL_TEX
+
+               start_arg_index +=
+                       kernel_set_args(ckPathTraceKernel_data_init,
+                                       start_arg_index,
+                                       start_sample,
+                                       d_x,
+                                       d_y,
+                                       d_w,
+                                       d_h,
+                                       d_offset,
+                                       d_stride,
+                                       rtile.rng_state_offset_x,
+                                       rtile.rng_state_offset_y,
+                                       rtile.buffer_rng_state_stride,
+                                       Queue_data,
+                                       Queue_index,
+                                       dQueue_size,
+                                       use_queues_flag,
+                                       work_array,
+#ifdef __WORK_STEALING__
+                                       work_pool_wgs,
+                                       num_samples,
+#endif
+#ifdef WITH_CYCLES_DEBUG
+                                       debugdata_coop,
+#endif
+                                       num_parallel_samples);
+
+               kernel_set_args(ckPathTraceKernel_scene_intersect,
+                               0,
+                               kgbuffer,
+                               d_data,
+                               rng_coop,
+                               Ray_coop,
+                               PathState_coop,
+                               Intersection_coop,
+                               ray_state,
+                               d_w,
+                               d_h,
+                               Queue_data,
+                               Queue_index,
+                               dQueue_size,
+                               use_queues_flag,
+#ifdef WITH_CYCLES_DEBUG
+                               debugdata_coop,
+#endif
+                               num_parallel_samples);
+
+               kernel_set_args(ckPathTraceKernel_lamp_emission,
+                               0,
+                               kgbuffer,
+                               d_data,
+                               sd,
+                               throughput_coop,
+                               PathRadiance_coop,
+                               Ray_coop,
+                               PathState_coop,
+                               Intersection_coop,
+                               ray_state,
+                               d_w,
+                               d_h,
+                               Queue_data,
+                               Queue_index,
+                               dQueue_size,
+                               use_queues_flag,
+                               num_parallel_samples);
+
+               kernel_set_args(ckPathTraceKernel_queue_enqueue,
+                               0,
+                               Queue_data,
+                               Queue_index,
+                               ray_state,
+                               dQueue_size);
+
+               kernel_set_args(ckPathTraceKernel_background_buffer_update,
+                                0,
+                                kgbuffer,
+                                d_data,
+                                sd,
+                                per_sample_output_buffers,
+                                d_rng_state,
+                                rng_coop,
+                                throughput_coop,
+                                PathRadiance_coop,
+                                Ray_coop,
+                                PathState_coop,
+                                L_transparent_coop,
+                                ray_state,
+                                d_w,
+                                d_h,
+                                d_x,
+                                d_y,
+                                d_stride,
+                                rtile.rng_state_offset_x,
+                                rtile.rng_state_offset_y,
+                                rtile.buffer_rng_state_stride,
+                                work_array,
+                                Queue_data,
+                                Queue_index,
+                                dQueue_size,
+                                end_sample,
+                                start_sample,
+#ifdef __WORK_STEALING__
+                                work_pool_wgs,
+                                num_samples,
+#endif
+#ifdef WITH_CYCLES_DEBUG
+                                debugdata_coop,
+#endif
+                                num_parallel_samples);
+
+               kernel_set_args(ckPathTraceKernel_shader_lighting,
+                               0,
+                               kgbuffer,
+                               d_data,
+                               sd,
+                               rng_coop,
+                               Ray_coop,
+                               PathState_coop,
+                               Intersection_coop,
+                               ray_state,
+                               Queue_data,
+                               Queue_index,
+                               dQueue_size);
+
+               kernel_set_args(ckPathTraceKernel_holdout_emission_blurring_pathtermination_ao,
+                               0,
+                               kgbuffer,
+                               d_data,
+                               sd,
+                               per_sample_output_buffers,
+                               rng_coop,
+                               throughput_coop,
+                               L_transparent_coop,
+                               PathRadiance_coop,
+                               PathState_coop,
+                               Intersection_coop,
+                               AOAlpha_coop,
+                               AOBSDF_coop,
+                               AOLightRay_coop,
+                               d_w,
+                               d_h,
+                               d_x,
+                               d_y,
+                               d_stride,
+                               ray_state,
+                               work_array,
+                               Queue_data,
+                               Queue_index,
+                               dQueue_size,
+#ifdef __WORK_STEALING__
+                               start_sample,
+#endif
+                               num_parallel_samples);
+
+               kernel_set_args(ckPathTraceKernel_direct_lighting,
+                               0,
+                               kgbuffer,
+                               d_data,
+                               sd,
+                               sd_DL_shadow,
+                               rng_coop,
+                               PathState_coop,
+                               ISLamp_coop,
+                               LightRay_coop,
+                               BSDFEval_coop,
+                               ray_state,
+                               Queue_data,
+                               Queue_index,
+                               dQueue_size);
+
+               kernel_set_args(ckPathTraceKernel_shadow_blocked_direct_lighting,
+                               0,
+                               kgbuffer,
+                               d_data,
+                               sd_DL_shadow,
+                               PathState_coop,
+                               LightRay_coop,
+                               AOLightRay_coop,
+                               Intersection_coop_AO,
+                               Intersection_coop_DL,
+                               ray_state,
+                               Queue_data,
+                               Queue_index,
+                               dQueue_size,
+                               total_num_rays);
+
+               kernel_set_args(ckPathTraceKernel_setup_next_iteration,
+                               0,
+                               kgbuffer,
+                               d_data,
+                               sd,
+                               rng_coop,
+                               throughput_coop,
+                               PathRadiance_coop,
+                               Ray_coop,
+                               PathState_coop,
+                               LightRay_coop,
+                               ISLamp_coop,
+                               BSDFEval_coop,
+                               AOLightRay_coop,
+                               AOBSDF_coop,
+                               AOAlpha_coop,
+                               ray_state,
+                               Queue_data,
+                               Queue_index,
+                               dQueue_size,
+                               use_queues_flag);
+
+               kernel_set_args(ckPathTraceKernel_sum_all_radiance,
+                               0,
+                               d_data,
+                               d_buffer,
+                               per_sample_output_buffers,
+                               num_parallel_samples,
+                               d_w,
+                               d_h,
+                               d_stride,
+                               rtile.buffer_offset_x,
+                               rtile.buffer_offset_y,
+                               rtile.buffer_rng_state_stride,
+                               start_sample);
+
+               /* Macro for Enqueuing split kernels. */
+#define GLUE(a, b) a ## b
+#define ENQUEUE_SPLIT_KERNEL(kernelName, globalSize, localSize) \
+               opencl_assert(clEnqueueNDRangeKernel(cqCommandQueue, \
+                                                    GLUE(ckPathTraceKernel_, \
+                                                         kernelName), \
+                                                    2, \
+                                                    NULL, \
+                                                    globalSize, \
+                                                    localSize, \
+                                                    0, \
+                                                    NULL, \
+                                                    NULL))
+
+               /* Enqueue ckPathTraceKernel_data_init kernel. */
+               ENQUEUE_SPLIT_KERNEL(data_init, global_size, local_size);
+               bool activeRaysAvailable = true;
+
+               /* Record number of time host intervention has been made */
+               unsigned int numHostIntervention = 0;
+               unsigned int numNextPathIterTimes = PathIteration_times;
+               while(activeRaysAvailable) {
+                       /* Twice the global work size of other kernels for
+                        * ckPathTraceKernel_shadow_blocked_direct_lighting. */
+                       size_t global_size_shadow_blocked[2];
+                       global_size_shadow_blocked[0] = global_size[0] * 2;
+                       global_size_shadow_blocked[1] = global_size[1];
+
+                       /* Do path-iteration in host [Enqueue Path-iteration kernels. */
+                       for(int PathIter = 0; PathIter < PathIteration_times; PathIter++) {
+                               ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size, local_size);
+                               ENQUEUE_SPLIT_KERNEL(lamp_emission, global_size, local_size);
+                               ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
+                               ENQUEUE_SPLIT_KERNEL(background_buffer_update, global_size, local_size);
+                               ENQUEUE_SPLIT_KERNEL(shader_lighting, global_size, local_size);
+                               ENQUEUE_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao, global_size, local_size);
+                               ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size);
+                               ENQUEUE_SPLIT_KERNEL(shadow_blocked_direct_lighting, global_size_shadow_blocked, local_size);
+                               ENQUEUE_SPLIT_KERNEL(setup_next_iteration, global_size, local_size);
+                       }
+
+                       /* Read ray-state into Host memory to decide if we should exit
+                        * path-iteration in host.
+                        */
+                       ciErr = clEnqueueReadBuffer(cqCommandQueue,
+                                                   ray_state,
+                                                   CL_TRUE,
+                                                   0,
+                                                   global_size[0] * global_size[1] * sizeof(char),
+                                                   hostRayStateArray,
+                                                   0,
+                                                   NULL,
+                                                   NULL);
+                       assert(ciErr == CL_SUCCESS);
+
+                       activeRaysAvailable = false;
+
+                       for(int rayStateIter = 0;
+                           rayStateIter < global_size[0] * global_size[1];
+                           ++rayStateIter)
+                       {
+                               if(int8_t(hostRayStateArray[rayStateIter]) != RAY_INACTIVE) {
+                                       /* Not all rays are RAY_INACTIVE. */
+                                       activeRaysAvailable = true;
+                                       break;
+                               }
+                       }
+
+                       if(activeRaysAvailable) {
+                               numHostIntervention++;
+                               PathIteration_times = PATH_ITER_INC_FACTOR;
+                               /* Host intervention done before all rays become RAY_INACTIVE;
+                                * Set do more initial iterations for the next tile.
+                                */
+                               numNextPathIterTimes += PATH_ITER_INC_FACTOR;
+                       }
+               }
+
+               /* Execute SumALLRadiance kernel to accumulate radiance calculated in
+                * per_sample_output_buffers into RenderTile's output buffer.
+                */
+               size_t sum_all_radiance_local_size[2] = {16, 16};
+               size_t sum_all_radiance_global_size[2];
+               sum_all_radiance_global_size[0] =
+                       (((d_w - 1) / sum_all_radiance_local_size[0]) + 1) *
+                       sum_all_radiance_local_size[0];
+               sum_all_radiance_global_size[1] =
+                       (((d_h - 1) / sum_all_radiance_local_size[1]) + 1) *
+                       sum_all_radiance_local_size[1];
+               ENQUEUE_SPLIT_KERNEL(sum_all_radiance,
+                                    sum_all_radiance_global_size,
+                                    sum_all_radiance_local_size);
+
+#undef ENQUEUE_SPLIT_KERNEL
+#undef GLUE
+
+               if(numHostIntervention == 0) {
+                       /* This means that we are executing kernel more than required
+                        * Must avoid this for the next sample/tile.
+                        */
+                       PathIteration_times = ((numNextPathIterTimes - PATH_ITER_INC_FACTOR) <= 0) ?
+                       PATH_ITER_INC_FACTOR : numNextPathIterTimes - PATH_ITER_INC_FACTOR;
+               }
+               else {
+                       /* Number of path-iterations done for this tile is set as
+                        * Initial path-iteration times for the next tile
+                        */
+                       PathIteration_times = numNextPathIterTimes;
+               }
+
+               first_tile = false;
+       }
+
+       /* Calculates the amount of memory that has to be always
+        * allocated in order for the split kernel to function.
+        * This memory is tile/scene-property invariant (meaning,
+        * the value returned by this function does not depend
+        * on the user set tile size or scene properties.
+        */
+       size_t get_invariable_mem_allocated()
+       {
+               size_t total_invariable_mem_allocated = 0;
+               size_t KernelGlobals_size = 0;
+               size_t ShaderData_SOA_size = 0;
+
+               KernelGlobals_size = get_KernelGlobals_size();
+               ShaderData_SOA_size = get_shaderdata_soa_size();
+
+               total_invariable_mem_allocated += KernelGlobals_size; /* KernelGlobals size */
+               total_invariable_mem_allocated += NUM_QUEUES * sizeof(unsigned int); /* Queue index size */
+               total_invariable_mem_allocated += sizeof(char); /* use_queues_flag size */
+               total_invariable_mem_allocated += ShaderData_SOA_size; /* sd size */
+               total_invariable_mem_allocated += ShaderData_SOA_size; /* sd_DL_shadow size */
+
+               return total_invariable_mem_allocated;
+       }
+
+       /* Calculate the memory that has-to-be/has-been allocated for
+        * the split kernel to function.
+        */
+       size_t get_tile_specific_mem_allocated(const int2 tile_size)
+       {
+               size_t tile_specific_mem_allocated = 0;
+
+               /* Get required tile info */
+               unsigned int user_set_tile_w = tile_size.x;
+               unsigned int user_set_tile_h = tile_size.y;
+
+#ifdef __WORK_STEALING__
+               /* Calculate memory to be allocated for work_pools in
+                * case of work_stealing.
+                */
+               size_t max_global_size[2];
+               size_t max_num_work_pools = 0;
+               max_global_size[0] =
+                       (((user_set_tile_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
+                       SPLIT_KERNEL_LOCAL_SIZE_X;
+               max_global_size[1] =
+                       (((user_set_tile_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
+                       SPLIT_KERNEL_LOCAL_SIZE_Y;
+               max_num_work_pools =
+                       (max_global_size[0] * max_global_size[1]) /
+                       (SPLIT_KERNEL_LOCAL_SIZE_X * SPLIT_KERNEL_LOCAL_SIZE_Y);
+               tile_specific_mem_allocated += max_num_work_pools * sizeof(unsigned int);
+#endif
+
+               tile_specific_mem_allocated +=
+                       user_set_tile_w * user_set_tile_h * per_thread_output_buffer_size;
+               tile_specific_mem_allocated +=
+                       user_set_tile_w * user_set_tile_h * sizeof(RNG);
+
+               return tile_specific_mem_allocated;
+       }
+
+       /* Calculates the texture memories and KernelData (d_data) memory
+        * that has been allocated.
+        */
+       size_t get_scene_specific_mem_allocated(cl_mem d_data)
+       {
+               size_t scene_specific_mem_allocated = 0;
+               /* Calculate texture memories. */
+#define KERNEL_TEX(type, ttype, name) \
+       scene_specific_mem_allocated += get_tex_size(#name);
+#include "kernel_textures.h"
+#undef KERNEL_TEX
+               size_t d_data_size;
+               ciErr = clGetMemObjectInfo(d_data,
+                                          CL_MEM_SIZE,
+                                          sizeof(d_data_size),
+                                          &d_data_size,
+                                          NULL);
+               assert(ciErr == CL_SUCCESS && "Can't get d_data mem object info");
+               scene_specific_mem_allocated += d_data_size;
+               return scene_specific_mem_allocated;
+       }
+
+       /* Calculate the memory required for one thread in split kernel. */
+       size_t get_per_thread_memory()
+       {
+               size_t shader_closure_size = 0;
+               size_t shaderdata_volume = 0;
+               shader_closure_size = get_shader_closure_size(current_clos_max);
+               /* TODO(sergey): This will actually over-allocate if
+                * particular kernel does not support multiclosure.
+                */
+               shaderdata_volume = get_shader_data_size(shader_closure_size);
+               size_t retval = sizeof(RNG)
+                       + throughput_size + L_transparent_size
+                       + rayState_size + work_element_size
+                       + ISLamp_size + sizeof(PathRadiance) + sizeof(Ray) + sizeof(PathState)
+                       + sizeof(Intersection)    /* Overall isect */
+                       + sizeof(Intersection)    /* Instersection_coop_AO */
+                       + sizeof(Intersection)    /* Intersection coop DL */
+                       + shaderdata_volume       /* Overall ShaderData */
+                       + (shaderdata_volume * 2) /* ShaderData : DL and shadow */
+                       + sizeof(Ray) + sizeof(BsdfEval) + AOAlpha_size + AOBSDF_size + sizeof(Ray)
+                       + (sizeof(int)* NUM_QUEUES)
+                       + per_thread_output_buffer_size;
+               return retval;
+       }
+
+       /* Considers the total memory available in the device and
+        * and returns the maximum global work size possible.
+        */
+       size_t get_feasible_global_work_size(int2 tile_size, cl_mem d_data)
+       {
+               /* Calculate invariably allocated memory. */
+               size_t invariable_mem_allocated = get_invariable_mem_allocated();
+               /* Calculate tile specific allocated memory. */
+               size_t tile_specific_mem_allocated =
+                       get_tile_specific_mem_allocated(tile_size);
+               /* Calculate scene specific allocated memory. */
+               size_t scene_specific_mem_allocated =
+                       get_scene_specific_mem_allocated(d_data);
+               /* Calculate total memory available for the threads in global work size. */
+               size_t available_memory = total_allocatable_memory
+                       - invariable_mem_allocated
+                       - tile_specific_mem_allocated
+                       - scene_specific_mem_allocated
+                       - DATA_ALLOCATION_MEM_FACTOR;
+               size_t per_thread_memory_required = get_per_thread_memory();
+               return (available_memory / per_thread_memory_required);
+       }
+
+       /* Checks if the device has enough memory to render the whole tile;
+        * If not, we should split single tile into multiple tiles of small size
+        * and process them all.
+        */
+       bool need_to_split_tile(unsigned int d_w,
+                               unsigned int d_h,
+                               int2 max_render_feasible_tile_size)
+       {
+               size_t global_size_estimate[2];
+               /* TODO(sergey): Such round-ups are in quite few places, need to replace
+                * them with an utility macro.
+                */
+               global_size_estimate[0] =
+                       (((d_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
+                       SPLIT_KERNEL_LOCAL_SIZE_X;
+               global_size_estimate[1] =
+                       (((d_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
+                       SPLIT_KERNEL_LOCAL_SIZE_Y;
+               if((global_size_estimate[0] * global_size_estimate[1]) >
+                  (max_render_feasible_tile_size.x * max_render_feasible_tile_size.y))
                {
-                       run = function_bind(&OpenCLDevice::thread_run, device, this);
+                       return true;
                }
-       };
+               else {
+                       return false;
+               }
+       }
 
-       int get_split_task_count(DeviceTask& /*task*/)
+       /* Considers the scene properties, global memory available in the device
+        * and returns a rectanglular tile dimension (approx the maximum)
+        * that should render on split kernel.
+        */
+       int2 get_max_render_feasible_tile_size(size_t feasible_global_work_size)
        {
-               return 1;
+               int2 max_render_feasible_tile_size;
+               int square_root_val = (int)sqrt(feasible_global_work_size);
+               max_render_feasible_tile_size.x = square_root_val;
+               max_render_feasible_tile_size.y = square_root_val;
+               /* Ciel round-off max_render_feasible_tile_size. */
+               int2 ceil_render_feasible_tile_size;
+               ceil_render_feasible_tile_size.x =
+                       (((max_render_feasible_tile_size.x - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
+                       SPLIT_KERNEL_LOCAL_SIZE_X;
+               ceil_render_feasible_tile_size.y =
+                       (((max_render_feasible_tile_size.y - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
+                       SPLIT_KERNEL_LOCAL_SIZE_Y;
+               if(ceil_render_feasible_tile_size.x * ceil_render_feasible_tile_size.y <=
+                  feasible_global_work_size)
+               {
+                       return ceil_render_feasible_tile_size;
+               }
+               /* Floor round-off max_render_feasible_tile_size. */
+               int2 floor_render_feasible_tile_size;
+               floor_render_feasible_tile_size.x =
+                       (max_render_feasible_tile_size.x / SPLIT_KERNEL_LOCAL_SIZE_X) *
+                       SPLIT_KERNEL_LOCAL_SIZE_X;
+               floor_render_feasible_tile_size.y =
+                       (max_render_feasible_tile_size.y / SPLIT_KERNEL_LOCAL_SIZE_Y) *
+                       SPLIT_KERNEL_LOCAL_SIZE_Y;
+               return floor_render_feasible_tile_size;
        }
 
-       void task_add(DeviceTask& task)
+       /* Try splitting the current tile into multiple smaller
+        * almost-square-tiles.
+        */
+       int2 get_split_tile_size(RenderTile rtile,
+                                int2 max_render_feasible_tile_size)
        {
-               task_pool.push(new OpenCLDeviceTask(this, task));
+               int2 split_tile_size;
+               int num_global_threads = max_render_feasible_tile_size.x *
+                                        max_render_feasible_tile_size.y;
+               int d_w = rtile.w;
+               int d_h = rtile.h;
+               /* Ceil round off d_w and d_h */
+               d_w = (((d_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
+                       SPLIT_KERNEL_LOCAL_SIZE_X;
+               d_h = (((d_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
+                       SPLIT_KERNEL_LOCAL_SIZE_Y;
+               while(d_w * d_h > num_global_threads) {
+                       /* Halve the longer dimension. */
+                       if(d_w >= d_h) {
+                               d_w = d_w / 2;
+                               d_w = (((d_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
+                                       SPLIT_KERNEL_LOCAL_SIZE_X;
+                       }
+                       else {
+                               d_h = d_h / 2;
+                               d_h = (((d_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
+                                       SPLIT_KERNEL_LOCAL_SIZE_Y;
+                       }
+               }
+               split_tile_size.x = d_w;
+               split_tile_size.y = d_h;
+               return split_tile_size;
        }
 
-       void task_wait()
+       /* Splits existing tile into multiple tiles of tile size split_tile_size. */
+       vector<SplitRenderTile> split_tiles(RenderTile rtile, int2 split_tile_size)
        {
-               task_pool.wait();
+               vector<SplitRenderTile> to_path_trace_rtile;
+               int d_w = rtile.w;
+               int d_h = rtile.h;
+               int num_tiles_x = (((d_w - 1) / split_tile_size.x) + 1);
+               int num_tiles_y = (((d_h - 1) / split_tile_size.y) + 1);
+               /* Buffer and rng_state offset calc. */
+               size_t offset_index = rtile.offset + (rtile.x + rtile.y * rtile.stride);
+               size_t offset_x = offset_index % rtile.stride;
+               size_t offset_y = offset_index / rtile.stride;
+               /* Resize to_path_trace_rtile. */
+               to_path_trace_rtile.resize(num_tiles_x * num_tiles_y);
+               for(int tile_iter_y = 0; tile_iter_y < num_tiles_y; tile_iter_y++) {
+                       for(int tile_iter_x = 0; tile_iter_x < num_tiles_x; tile_iter_x++) {
+                               int rtile_index = tile_iter_y * num_tiles_x + tile_iter_x;
+                               to_path_trace_rtile[rtile_index].rng_state_offset_x = offset_x + tile_iter_x * split_tile_size.x;
+                               to_path_trace_rtile[rtile_index].rng_state_offset_y = offset_y + tile_iter_y * split_tile_size.y;
+                               to_path_trace_rtile[rtile_index].buffer_offset_x = offset_x + tile_iter_x * split_tile_size.x;
+                               to_path_trace_rtile[rtile_index].buffer_offset_y = offset_y + tile_iter_y * split_tile_size.y;
+                               to_path_trace_rtile[rtile_index].start_sample = rtile.start_sample;
+                               to_path_trace_rtile[rtile_index].num_samples = rtile.num_samples;
+                               to_path_trace_rtile[rtile_index].sample = rtile.sample;
+                               to_path_trace_rtile[rtile_index].resolution = rtile.resolution;
+                               to_path_trace_rtile[rtile_index].offset = rtile.offset;
+                               to_path_trace_rtile[rtile_index].buffers = rtile.buffers;
+                               to_path_trace_rtile[rtile_index].buffer = rtile.buffer;
+                               to_path_trace_rtile[rtile_index].rng_state = rtile.rng_state;
+                               to_path_trace_rtile[rtile_index].x = rtile.x + (tile_iter_x * split_tile_size.x);
+                               to_path_trace_rtile[rtile_index].y = rtile.y + (tile_iter_y * split_tile_size.y);
+                               to_path_trace_rtile[rtile_index].buffer_rng_state_stride = rtile.stride;
+                               /* Fill width and height of the new render tile. */
+                               to_path_trace_rtile[rtile_index].w = (tile_iter_x == (num_tiles_x - 1)) ?
+                                       (d_w - (tile_iter_x * split_tile_size.x)) /* Border tile */
+                                       : split_tile_size.x;
+                               to_path_trace_rtile[rtile_index].h = (tile_iter_y == (num_tiles_y - 1)) ?
+                                       (d_h - (tile_iter_y * split_tile_size.y)) /* Border tile */
+                                       : split_tile_size.y;
+                               to_path_trace_rtile[rtile_index].stride = to_path_trace_rtile[rtile_index].w;
+                       }
+               }
+               return to_path_trace_rtile;
        }
 
-       void task_cancel()
+       void thread_run(DeviceTask *task)
        {
-               task_pool.cancel();
+               if(task->type == DeviceTask::FILM_CONVERT) {
+                       film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half);
+               }
+               else if(task->type == DeviceTask::SHADER) {
+                       shader(*task);
+               }
+               else if(task->type == DeviceTask::PATH_TRACE) {
+                       RenderTile tile;
+                       bool initialize_data_and_check_render_feasibility = false;
+                       bool need_to_split_tiles_further = false;
+                       int2 max_render_feasible_tile_size;
+                       size_t feasible_global_work_size;
+                       const int2 tile_size = task->requested_tile_size;
+                       /* Keep rendering tiles until done. */
+                       while(task->acquire_tile(this, tile)) {
+                               if(!initialize_data_and_check_render_feasibility) {
+                                       /* Initialize data. */
+                                       /* Calculate per_thread_output_buffer_size. */
+                                       size_t output_buffer_size = 0;
+                                       ciErr = clGetMemObjectInfo((cl_mem)tile.buffer,
+                                                                  CL_MEM_SIZE,
+                                                                  sizeof(output_buffer_size),
+                                                                  &output_buffer_size,
+                                                                  NULL);
+                                       assert(ciErr == CL_SUCCESS && "Can't get tile.buffer mem object info");
+                                       /* This value is different when running on AMD and NV. */
+                                       if(background) {
+                                               /* In offline render the number of buffer elements
+                                                * associated with tile.buffer is the current tile size.
+                                                */
+                                               per_thread_output_buffer_size =
+                                                       output_buffer_size / (tile.w * tile.h);
+                                       }
+                                       else {
+                                               /* interactive rendering, unlike offline render, the number of buffer elements
+                                                * associated with tile.buffer is the entire viewport size.
+                                                */
+                                               per_thread_output_buffer_size =
+                                                       output_buffer_size / (tile.buffers->params.width *
+                                                                             tile.buffers->params.height);
+                                       }
+                                       /* Check render feasibility. */
+                                       feasible_global_work_size = get_feasible_global_work_size(
+                                               tile_size,
+                                               CL_MEM_PTR(const_mem_map["__data"]->device_pointer));
+                                       max_render_feasible_tile_size =
+                                               get_max_render_feasible_tile_size(
+                                                       feasible_global_work_size);
+                                       need_to_split_tiles_further =
+                                               need_to_split_tile(tile_size.x,
+                                                                  tile_size.y,
+                                                                  max_render_feasible_tile_size);
+                                       initialize_data_and_check_render_feasibility = true;
+                               }
+                               if(need_to_split_tiles_further) {
+                                       int2 split_tile_size =
+                                               get_split_tile_size(tile,
+                                                                   max_render_feasible_tile_size);
+                                       vector<SplitRenderTile> to_path_trace_render_tiles =
+                                               split_tiles(tile, split_tile_size);
+                                       /* Print message to console */
+                                       if(background && (to_path_trace_render_tiles.size() > 1)) {
+                                               fprintf(stderr, "Message : Tiles need to be split "
+                                                       "further inside path trace (due to insufficient "
+                                                       "device-global-memory for split kernel to "
+                                                       "function) \n"
+                                                       "The current tile of dimensions %dx%d is split "
+                                                       "into tiles of dimension %dx%d for render \n",
+                                                       tile.w, tile.h,
+                                                       split_tile_size.x,
+                                                       split_tile_size.y);
+                                       }
+                                       /* Process all split tiles. */
+                                       for(int tile_iter = 0;
+                                           tile_iter < to_path_trace_render_tiles.size();
+                                           ++tile_iter)
+                                       {
+                                               path_trace(to_path_trace_render_tiles[tile_iter],
+                                                          max_render_feasible_tile_size);
+                                       }
+                               }
+                               else {
+                                       /* No splitting required; process the entire tile at once. */
+                                       /* Render feasible tile size is user-set-tile-size itself. */
+                                       max_render_feasible_tile_size.x =
+                                               (((tile_size.x - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
+                                               SPLIT_KERNEL_LOCAL_SIZE_X;
+                                       max_render_feasible_tile_size.y =
+                                               (((tile_size.y - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
+                                               SPLIT_KERNEL_LOCAL_SIZE_Y;
+                                       /* buffer_rng_state_stride is stride itself. */
+                                       SplitRenderTile split_tile(tile);
+                                       split_tile.buffer_rng_state_stride = tile.stride;
+                                       path_trace(split_tile, max_render_feasible_tile_size);
+                               }
+                               tile.sample = tile.start_sample + tile.num_samples;
+
+                               /* Complete kernel execution before release tile. */
+                               /* This helps in multi-device render;
+                                * The device that reaches the critical-section function
+                                * release_tile waits (stalling other devices from entering
+                                * release_tile) for all kernels to complete. If device1 (a
+                                * slow-render device) reaches release_tile first then it would
+                                * stall device2 (a fast-render device) from proceeding to render
+                                * next tile.
+                                */
+                               clFinish(cqCommandQueue);
+
+                               task->release_tile(tile);
+                       }
+               }
+       }
+
+protected:
+       cl_mem mem_alloc(size_t bufsize, cl_mem_flags mem_flag = CL_MEM_READ_WRITE)
+       {
+               cl_mem ptr;
+               ptr = clCreateBuffer(cxContext, mem_flag, bufsize, NULL, &ciErr);
+               if(opencl_error(ciErr)) {
+                       assert(0);
+               }
+               return ptr;
        }
 };
 
+/* Returns true in case of successful detection of platform and device type,
+ * else returns false.
+ */
+static bool get_platform_and_devicetype(const DeviceInfo info,
+                                        string &platform_name,
+                                        cl_device_type &device_type)
+{
+       cl_platform_id platform_id;
+       cl_device_id device_id;
+       cl_uint num_platforms;
+       cl_int ciErr;
+
+       /* TODO(sergey): Use some generic error print helper function/ */
+       ciErr = clGetPlatformIDs(0, NULL, &num_platforms);
+       if(ciErr != CL_SUCCESS) {
+               fprintf(stderr, "Can't getPlatformIds. file - %s, line - %d\n", __FILE__, __LINE__);
+               return false;
+       }
+
+       if(num_platforms == 0) {
+               fprintf(stderr, "No OpenCL platforms found. file - %s, line - %d\n", __FILE__, __LINE__);
+               return false;
+       }
+
+       vector<cl_platform_id> platforms(num_platforms, NULL);
+
+       ciErr = clGetPlatformIDs(num_platforms, &platforms[0], NULL);
+       if(ciErr != CL_SUCCESS) {
+               fprintf(stderr, "Can't getPlatformIds. file - %s, line - %d\n", __FILE__, __LINE__);
+               return false;
+       }
+
+       int num_base = 0;
+       int total_devices = 0;
+
+       for(int platform = 0; platform < num_platforms; platform++) {
+               cl_uint num_devices;
+
+               ciErr = clGetDeviceIDs(platforms[platform], opencl_device_type(), 0, NULL, &num_devices);
+               if(ciErr != CL_SUCCESS) {
+                       fprintf(stderr, "Can't getDeviceIDs. file - %s, line - %d\n", __FILE__, __LINE__);
+                       return false;
+               }
+
+               total_devices += num_devices;
+
+               if(info.num - num_base >= num_devices) {
+                       /* num doesn't refer to a device in this platform */
+                       num_base += num_devices;
+                       continue;
+               }
+
+               /* device is in this platform */
+               platform_id = platforms[platform];
+
+               /* get devices */
+               vector<cl_device_id> device_ids(num_devices, NULL);
+
+               ciErr = clGetDeviceIDs(platform_id, opencl_device_type(), num_devices, &device_ids[0], NULL);
+               if(ciErr != CL_SUCCESS) {
+                       fprintf(stderr, "Can't getDeviceIDs. file - %s, line - %d\n", __FILE__, __LINE__);
+                       return false;
+               }
+
+               device_id = device_ids[info.num - num_base];
+
+               char name[256];
+               ciErr = clGetPlatformInfo(platform_id, CL_PLATFORM_NAME, sizeof(name), &name, NULL);
+               if(ciErr != CL_SUCCESS) {
+                       fprintf(stderr, "Can't getPlatformIDs. file - %s, line - %d \n", __FILE__, __LINE__);
+                       return false;
+               }
+               platform_name = name;
+
+               ciErr = clGetDeviceInfo(device_id, CL_DEVICE_TYPE, sizeof(cl_device_type), &device_type, NULL);
+               if(ciErr != CL_SUCCESS) {
+                       fprintf(stderr, "Can't getDeviceInfo. file - %s, line - %d \n", __FILE__, __LINE__);
+                       return false;
+               }
+
+               break;
+       }
+
+       if(total_devices == 0) {
+               fprintf(stderr, "No devices found. file - %s, line - %d \n", __FILE__, __LINE__);
+               return false;
+       }
+
+       return true;
+}
+
 Device *device_opencl_create(DeviceInfo& info, Stats &stats, bool background)
 {
-       return new OpenCLDevice(info, stats, background);
+       string platform_name;
+       cl_device_type device_type;
+       if(get_platform_and_devicetype(info, platform_name, device_type)) {
+               const bool force_split_kernel =
+                       getenv("CYCLES_OPENCL_SPLIT_KERNEL_TEST") != NULL;
+               /* TODO(sergey): Replace string lookups with more enum-like API,
+                * similar to device/venfdor checks blender's gpu.
+                */
+               if(force_split_kernel ||
+                  (platform_name == "AMD Accelerated Parallel Processing" &&
+                   device_type == CL_DEVICE_TYPE_GPU))
+               {
+                       /* If the device is an AMD GPU, take split kernel path. */
+                       VLOG(1) << "Using split kernel";
+                       return new OpenCLDeviceSplitKernel(info, stats, background);
+               } else {
+                       /* For any other device, take megakernel path. */
+                       VLOG(1) << "Using megekernel";
+                       return new OpenCLDeviceMegaKernel(info, stats, background);
+               }
+       } else {
+               /* If we can't retrieve platform and device type information for some
+                * reason, we default to megakernel path.
+                */
+               VLOG(1) << "Failed to rertieve platform or device, using megakernel";
+               return new OpenCLDeviceMegaKernel(info, stats, background);
+       }
 }
 
-bool device_opencl_init(void) {
+bool device_opencl_init(void)
+{
        static bool initialized = false;
        static bool result = false;
 
@@ -1132,13 +3322,7 @@ bool device_opencl_init(void) {
 
        initialized = true;
 
-       // OpenCL disabled for now, only works with this environment variable set
-       if(!getenv("CYCLES_OPENCL_TEST")) {
-               result = false;
-       }
-       else {
-               result = clewInit() == CLEW_SUCCESS;
-       }
+       result = clewInit() == CLEW_SUCCESS;
 
        return result;
 }
index 83b3450fc1cf22c576408d267ac553c69e248e7e..85b2760073b69dace19e151a5bc6e9f3b0c189cb 100644 (file)
@@ -14,6 +14,17 @@ set(INC_SYS
 set(SRC
        kernel.cpp
        kernel.cl
+       kernel_data_init.cl
+       kernel_queue_enqueue.cl
+       kernel_scene_intersect.cl
+       kernel_lamp_emission.cl
+       kernel_background_buffer_update.cl
+       kernel_shader_eval.cl
+       kernel_holdout_emission_blurring_pathtermination_ao.cl
+       kernel_direct_lighting.cl
+       kernel_shadow_blocked.cl
+       kernel_next_iteration_setup.cl
+       kernel_sum_all_radiance.cl
        kernel.cu
 )
 
@@ -36,17 +47,22 @@ set(SRC_HEADERS
        kernel_montecarlo.h
        kernel_passes.h
        kernel_path.h
+       kernel_path_common.h
        kernel_path_state.h
        kernel_path_surface.h
        kernel_path_volume.h
        kernel_projection.h
+       kernel_queues.h
        kernel_random.h
        kernel_shader.h
+       kernel_shaderdata_vars.h
        kernel_shadow.h
+       kernel_split.h
        kernel_subsurface.h
        kernel_textures.h
        kernel_types.h
        kernel_volume.h
+       kernel_work_stealing.h
 )
 
 set(SRC_CLOSURE_HEADERS
@@ -68,6 +84,7 @@ set(SRC_CLOSURE_HEADERS
        closure/emissive.h
        closure/volume.h
 )
+
 set(SRC_SVM_HEADERS
        svm/svm.h
        svm/svm_attribute.h
@@ -284,6 +301,17 @@ endif()
 #delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${KERNEL_PREPROCESSED}" ${CYCLES_INSTALL_PATH}/kernel)
 
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel.cl" ${CYCLES_INSTALL_PATH}/kernel)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_data_init.cl" ${CYCLES_INSTALL_PATH}/kernel)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_queue_enqueue.cl" ${CYCLES_INSTALL_PATH}/kernel)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_scene_intersect.cl" ${CYCLES_INSTALL_PATH}/kernel)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_lamp_emission.cl" ${CYCLES_INSTALL_PATH}/kernel)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_background_buffer_update.cl" ${CYCLES_INSTALL_PATH}/kernel)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_shader_eval.cl" ${CYCLES_INSTALL_PATH}/kernel)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_holdout_emission_blurring_pathtermination_ao.cl" ${CYCLES_INSTALL_PATH}/kernel)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_direct_lighting.cl" ${CYCLES_INSTALL_PATH}/kernel)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_shadow_blocked.cl" ${CYCLES_INSTALL_PATH}/kernel)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_next_iteration_setup.cl" ${CYCLES_INSTALL_PATH}/kernel)
+delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel_sum_all_radiance.cl" ${CYCLES_INSTALL_PATH}/kernel)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "kernel.cu" ${CYCLES_INSTALL_PATH}/kernel)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel)
 delayed_install(${CMAKE_CURRENT_SOURCE_DIR} "${SRC_CLOSURE_HEADERS}" ${CYCLES_INSTALL_PATH}/kernel/closure)
index 2b9e2a4e44d7a80288a67cbf886f0a355e806fe7..558aa0dc6a9bef62ed18ce074a6238ecea16f4ab 100644 (file)
@@ -47,79 +47,79 @@ ccl_device int bsdf_sample(KernelGlobals *kg, const ShaderData *sd, const Shader
        switch(sc->type) {
                case CLOSURE_BSDF_DIFFUSE_ID:
                case CLOSURE_BSDF_BSSRDF_ID:
-                       label = bsdf_diffuse_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+                       label = bsdf_diffuse_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
                                eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
                        break;
 #ifdef __SVM__
                case CLOSURE_BSDF_OREN_NAYAR_ID:
-                       label = bsdf_oren_nayar_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+                       label = bsdf_oren_nayar_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
                                eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
                        break;
                /*case CLOSURE_BSDF_PHONG_RAMP_ID:
-                       label = bsdf_phong_ramp_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+                       label = bsdf_phong_ramp_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
                                eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
                        break;
                case CLOSURE_BSDF_DIFFUSE_RAMP_ID:
-                       label = bsdf_diffuse_ramp_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+                       label = bsdf_diffuse_ramp_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
                                eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
                        break;*/
                case CLOSURE_BSDF_TRANSLUCENT_ID:
-                       label = bsdf_translucent_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+                       label = bsdf_translucent_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
                                eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
                        break;
                case CLOSURE_BSDF_REFLECTION_ID:
-                       label = bsdf_reflection_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+                       label = bsdf_reflection_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
                                eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
                        break;
                case CLOSURE_BSDF_REFRACTION_ID:
-                       label = bsdf_refraction_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+                       label = bsdf_refraction_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
                                eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
                        break;
                case CLOSURE_BSDF_TRANSPARENT_ID:
-                       label = bsdf_transparent_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+                       label = bsdf_transparent_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
                                eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
                        break;
                case CLOSURE_BSDF_MICROFACET_GGX_ID:
                case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
                case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
-                       label = bsdf_microfacet_ggx_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+                       label = bsdf_microfacet_ggx_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
                                eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
                        break;
                case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
                case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
                case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
-                       label = bsdf_microfacet_beckmann_sample(kg, sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+                       label = bsdf_microfacet_beckmann_sample(kg, sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
                                eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
                        break;
                case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
                case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
-                       label = bsdf_ashikhmin_shirley_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+                       label = bsdf_ashikhmin_shirley_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
                                eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
                        break;
                case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
-                       label = bsdf_ashikhmin_velvet_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+                       label = bsdf_ashikhmin_velvet_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
                                eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
                        break;
                case CLOSURE_BSDF_DIFFUSE_TOON_ID:
-                       label = bsdf_diffuse_toon_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+                       label = bsdf_diffuse_toon_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
                                eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
                        break;
                case CLOSURE_BSDF_GLOSSY_TOON_ID:
-                       label = bsdf_glossy_toon_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+                       label = bsdf_glossy_toon_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
                                eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
                        break;
                case CLOSURE_BSDF_HAIR_REFLECTION_ID:
-                       label = bsdf_hair_reflection_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+                       label = bsdf_hair_reflection_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
                                eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
                        break;
                case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
-                       label = bsdf_hair_transmission_sample(sc, sd->Ng, sd->I, sd->dI.dx, sd->dI.dy, randu, randv,
+                       label = bsdf_hair_transmission_sample(sc, ccl_fetch(sd, Ng), ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv,
                                eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
                        break;
 #endif
 #ifdef __VOLUME__
                case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
-                       label = volume_henyey_greenstein_sample(sc, sd->I, sd->dI.dx, sd->dI.dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
+                       label = volume_henyey_greenstein_sample(sc, ccl_fetch(sd, I), ccl_fetch(sd, dI).dx, ccl_fetch(sd, dI).dy, randu, randv, eval, omega_in, &domega_in->dx, &domega_in->dy, pdf);
                        break;
 #endif
                default:
@@ -139,67 +139,67 @@ ccl_device float3 bsdf_eval(KernelGlobals *kg, const ShaderData *sd, const Shade
                return OSLShader::bsdf_eval(sd, sc, omega_in, *pdf);
 #endif
 
-       if(dot(sd->Ng, omega_in) >= 0.0f) {
+       if(dot(ccl_fetch(sd, Ng), omega_in) >= 0.0f) {
                switch(sc->type) {
                        case CLOSURE_BSDF_DIFFUSE_ID:
                        case CLOSURE_BSDF_BSSRDF_ID:
-                               eval = bsdf_diffuse_eval_reflect(sc, sd->I, omega_in, pdf);
+                               eval = bsdf_diffuse_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
                                break;
 #ifdef __SVM__
                        case CLOSURE_BSDF_OREN_NAYAR_ID:
-                               eval = bsdf_oren_nayar_eval_reflect(sc, sd->I, omega_in, pdf);
+                               eval = bsdf_oren_nayar_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
                                break;
                        /*case CLOSURE_BSDF_PHONG_RAMP_ID:
-                               eval = bsdf_phong_ramp_eval_reflect(sc, sd->I, omega_in, pdf);
+                               eval = bsdf_phong_ramp_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
                                break;
                        case CLOSURE_BSDF_DIFFUSE_RAMP_ID:
-                               eval = bsdf_diffuse_ramp_eval_reflect(sc, sd->I, omega_in, pdf);
+                               eval = bsdf_diffuse_ramp_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
                                break;*/
                        case CLOSURE_BSDF_TRANSLUCENT_ID:
-                               eval = bsdf_translucent_eval_reflect(sc, sd->I, omega_in, pdf);
+                               eval = bsdf_translucent_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
                                break;
                        case CLOSURE_BSDF_REFLECTION_ID:
-                               eval = bsdf_reflection_eval_reflect(sc, sd->I, omega_in, pdf);
+                               eval = bsdf_reflection_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
                                break;
                        case CLOSURE_BSDF_REFRACTION_ID:
-                               eval = bsdf_refraction_eval_reflect(sc, sd->I, omega_in, pdf);
+                               eval = bsdf_refraction_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
                                break;
                        case CLOSURE_BSDF_TRANSPARENT_ID:
-                               eval = bsdf_transparent_eval_reflect(sc, sd->I, omega_in, pdf);
+                               eval = bsdf_transparent_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
                                break;
                        case CLOSURE_BSDF_MICROFACET_GGX_ID:
                        case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
                        case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
-                               eval = bsdf_microfacet_ggx_eval_reflect(sc, sd->I, omega_in, pdf);
+                               eval = bsdf_microfacet_ggx_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
                                break;
                        case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
                        case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
                        case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
-                               eval = bsdf_microfacet_beckmann_eval_reflect(sc, sd->I, omega_in, pdf);
+                               eval = bsdf_microfacet_beckmann_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
                                break;
                        case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
                        case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
-                               eval = bsdf_ashikhmin_shirley_eval_reflect(sc, sd->I, omega_in, pdf);
+                               eval = bsdf_ashikhmin_shirley_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
                                break;
                        case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
-                               eval = bsdf_ashikhmin_velvet_eval_reflect(sc, sd->I, omega_in, pdf);
+                               eval = bsdf_ashikhmin_velvet_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
                                break;
                        case CLOSURE_BSDF_DIFFUSE_TOON_ID:
-                               eval = bsdf_diffuse_toon_eval_reflect(sc, sd->I, omega_in, pdf);
+                               eval = bsdf_diffuse_toon_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
                                break;
                        case CLOSURE_BSDF_GLOSSY_TOON_ID:
-                               eval = bsdf_glossy_toon_eval_reflect(sc, sd->I, omega_in, pdf);
+                               eval = bsdf_glossy_toon_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
                                break;
                        case CLOSURE_BSDF_HAIR_REFLECTION_ID:
-                               eval = bsdf_hair_reflection_eval_reflect(sc, sd->I, omega_in, pdf);
+                               eval = bsdf_hair_reflection_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
                                break;
                        case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
-                               eval = bsdf_hair_transmission_eval_reflect(sc, sd->I, omega_in, pdf);
+                               eval = bsdf_hair_transmission_eval_reflect(sc, ccl_fetch(sd, I), omega_in, pdf);
                                break;
 #endif
 #ifdef __VOLUME__
                        case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
-                               eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf);
+                               eval = volume_henyey_greenstein_eval_phase(sc, ccl_fetch(sd, I), omega_in, pdf);
                                break;
 #endif
                        default:
@@ -211,57 +211,57 @@ ccl_device float3 bsdf_eval(KernelGlobals *kg, const ShaderData *sd, const Shade
                switch(sc->type) {
                        case CLOSURE_BSDF_DIFFUSE_ID:
                        case CLOSURE_BSDF_BSSRDF_ID:
-                               eval = bsdf_diffuse_eval_transmit(sc, sd->I, omega_in, pdf);
+                               eval = bsdf_diffuse_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
                                break;
 #ifdef __SVM__
                        case CLOSURE_BSDF_OREN_NAYAR_ID:
-                               eval = bsdf_oren_nayar_eval_transmit(sc, sd->I, omega_in, pdf);
+                               eval = bsdf_oren_nayar_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
                                break;
                        case CLOSURE_BSDF_TRANSLUCENT_ID:
-                               eval = bsdf_translucent_eval_transmit(sc, sd->I, omega_in, pdf);
+                               eval = bsdf_translucent_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
                                break;
                        case CLOSURE_BSDF_REFLECTION_ID:
-                               eval = bsdf_reflection_eval_transmit(sc, sd->I, omega_in, pdf);
+                               eval = bsdf_reflection_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
                                break;
                        case CLOSURE_BSDF_REFRACTION_ID:
-                               eval = bsdf_refraction_eval_transmit(sc, sd->I, omega_in, pdf);
+                               eval = bsdf_refraction_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
                                break;
                        case CLOSURE_BSDF_TRANSPARENT_ID:
-                               eval = bsdf_transparent_eval_transmit(sc, sd->I, omega_in, pdf);
+                               eval = bsdf_transparent_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
                                break;
                        case CLOSURE_BSDF_MICROFACET_GGX_ID:
                        case CLOSURE_BSDF_MICROFACET_GGX_ANISO_ID:
                        case CLOSURE_BSDF_MICROFACET_GGX_REFRACTION_ID:
-                               eval = bsdf_microfacet_ggx_eval_transmit(sc, sd->I, omega_in, pdf);
+                               eval = bsdf_microfacet_ggx_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
                                break;
                        case CLOSURE_BSDF_MICROFACET_BECKMANN_ID:
                        case CLOSURE_BSDF_MICROFACET_BECKMANN_ANISO_ID:
                        case CLOSURE_BSDF_MICROFACET_BECKMANN_REFRACTION_ID:
-                               eval = bsdf_microfacet_beckmann_eval_transmit(sc, sd->I, omega_in, pdf);
+                               eval = bsdf_microfacet_beckmann_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
                                break;
                        case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ID:
                        case CLOSURE_BSDF_ASHIKHMIN_SHIRLEY_ANISO_ID:
-                               eval = bsdf_ashikhmin_shirley_eval_transmit(sc, sd->I, omega_in, pdf);
+                               eval = bsdf_ashikhmin_shirley_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
                                break;
                        case CLOSURE_BSDF_ASHIKHMIN_VELVET_ID:
-                               eval = bsdf_ashikhmin_velvet_eval_transmit(sc, sd->I, omega_in, pdf);
+                               eval = bsdf_ashikhmin_velvet_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
                                break;
                        case CLOSURE_BSDF_DIFFUSE_TOON_ID:
-                               eval = bsdf_diffuse_toon_eval_transmit(sc, sd->I, omega_in, pdf);
+                               eval = bsdf_diffuse_toon_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
                                break;
                        case CLOSURE_BSDF_GLOSSY_TOON_ID:
-                               eval = bsdf_glossy_toon_eval_transmit(sc, sd->I, omega_in, pdf);
+                               eval = bsdf_glossy_toon_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
                                break;
                        case CLOSURE_BSDF_HAIR_REFLECTION_ID:
-                               eval = bsdf_hair_reflection_eval_transmit(sc, sd->I, omega_in, pdf);
+                               eval = bsdf_hair_reflection_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
                                break;
                        case CLOSURE_BSDF_HAIR_TRANSMISSION_ID:
-                               eval = bsdf_hair_transmission_eval_transmit(sc, sd->I, omega_in, pdf);
+                               eval = bsdf_hair_transmission_eval_transmit(sc, ccl_fetch(sd, I), omega_in, pdf);
                                break;
 #endif
 #ifdef __VOLUME__
                        case CLOSURE_VOLUME_HENYEY_GREENSTEIN_ID:
-                               eval = volume_henyey_greenstein_eval_phase(sc, sd->I, omega_in, pdf);
+                               eval = volume_henyey_greenstein_eval_phase(sc, ccl_fetch(sd, I), omega_in, pdf);
                                break;
 #endif
                        default:
index 9ac16e860851a1f36feee0def000645e9a2ae8e7..c7364e9edac716d6e8972503a914ab1e69013538 100644 (file)
@@ -29,13 +29,13 @@ CCL_NAMESPACE_BEGIN
 
 ccl_device_inline int find_attribute(KernelGlobals *kg, const ShaderData *sd, uint id, AttributeElement *elem)
 {
-       if(sd->object == PRIM_NONE)
+       if(ccl_fetch(sd, object) == PRIM_NONE)
                return (int)ATTR_STD_NOT_FOUND;
 
        /* for SVM, find attribute by unique id */
-       uint attr_offset = sd->object*kernel_data.bvh.attributes_map_stride;
+       uint attr_offset = ccl_fetch(sd, object)*kernel_data.bvh.attributes_map_stride;
 #ifdef __HAIR__
-       attr_offset = (sd->type & PRIMITIVE_ALL_CURVE)? attr_offset + ATTR_PRIM_CURVE: attr_offset;
+       attr_offset = (ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE)? attr_offset + ATTR_PRIM_CURVE: attr_offset;
 #endif
        uint4 attr_map = kernel_tex_fetch(__attributes_map, attr_offset);
        
@@ -49,7 +49,7 @@ ccl_device_inline int find_attribute(KernelGlobals *kg, const ShaderData *sd, ui
 
        *elem = (AttributeElement)attr_map.y;
        
-       if(sd->prim == PRIM_NONE && (AttributeElement)attr_map.y != ATTR_ELEMENT_MESH)
+       if(ccl_fetch(sd, prim) == PRIM_NONE && (AttributeElement)attr_map.y != ATTR_ELEMENT_MESH)
                return ATTR_STD_NOT_FOUND;
 
        /* return result */
index 2e8e27c709d64dfc1f783a23716f9a3685c25dc2..3d0d406dd0bf7ab12a020fcd1c5751d346e69010 100644 (file)
@@ -447,6 +447,7 @@ ccl_device_inline float3 ray_offset(float3 P, float3 Ng)
 #endif
 }
 
+#if defined(__SHADOW_RECORD_ALL__) || defined (__VOLUME_RECORD_ALL__)
 /* ToDo: Move to another file? */
 ccl_device int intersections_compare(const void *a, const void *b)
 {
@@ -460,6 +461,7 @@ ccl_device int intersections_compare(const void *a, const void *b)
        else
                return 0;
 }
+#endif
 
 CCL_NAMESPACE_END
 
index a5a25f4a9ae951f9f59de913b44f6d577a62d8b1..4ea9e4714c47df60175917056d87628f2609a358 100644 (file)
@@ -236,25 +236,25 @@ ccl_device_inline float3 motion_triangle_refine_subsurface(KernelGlobals *kg, Sh
 ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, ShaderData *sd, const Intersection *isect, const Ray *ray, bool subsurface)
 {
        /* get shader */
-       sd->shader = kernel_tex_fetch(__tri_shader, sd->prim);
+       ccl_fetch(sd, shader) = kernel_tex_fetch(__tri_shader, ccl_fetch(sd, prim));
 
        /* get motion info */
        int numsteps, numverts;
-       object_motion_info(kg, sd->object, &numsteps, &numverts, NULL);
+       object_motion_info(kg, ccl_fetch(sd, object), &numsteps, &numverts, NULL);
 
        /* figure out which steps we need to fetch and their interpolation factor */
        int maxstep = numsteps*2;
-       int step = min((int)(sd->time*maxstep), maxstep-1);
-       float t = sd->time*maxstep - step;
+       int step = min((int)(ccl_fetch(sd, time)*maxstep), maxstep-1);
+       float t = ccl_fetch(sd, time)*maxstep - step;
 
        /* find attribute */
        AttributeElement elem;
-       int offset = find_attribute_motion(kg, sd->object, ATTR_STD_MOTION_VERTEX_POSITION, &elem);
+       int offset = find_attribute_motion(kg, ccl_fetch(sd, object), ATTR_STD_MOTION_VERTEX_POSITION, &elem);
        kernel_assert(offset != ATTR_STD_NOT_FOUND);
 
        /* fetch vertex coordinates */
        float3 verts[3], next_verts[3];
-       float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, sd->prim));
+       float3 tri_vindex = float4_to_float3(kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim)));
 
        motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step, verts);
        motion_triangle_verts_for_step(kg, tri_vindex, offset, numverts, numsteps, step+1, next_verts);
@@ -268,33 +268,33 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, ShaderD
 #ifdef __SUBSURFACE__
        if(!subsurface)
 #endif
-               sd->P = motion_triangle_refine(kg, sd, isect, ray, verts);
+               ccl_fetch(sd, P) = motion_triangle_refine(kg, sd, isect, ray, verts);
 #ifdef __SUBSURFACE__
        else
-               sd->P = motion_triangle_refine_subsurface(kg, sd, isect, ray, verts);
+               ccl_fetch(sd, P) = motion_triangle_refine_subsurface(kg, sd, isect, ray, verts);
 #endif
 
        /* compute face normal */
        float3 Ng;
-       if(sd->flag & SD_NEGATIVE_SCALE_APPLIED)
+       if(ccl_fetch(sd, flag) & SD_NEGATIVE_SCALE_APPLIED)
                Ng = normalize(cross(verts[2] - verts[0], verts[1] - verts[0]));
        else
                Ng = normalize(cross(verts[1] - verts[0], verts[2] - verts[0]));
 
-       sd->Ng = Ng;
-       sd->N = Ng;
+       ccl_fetch(sd, Ng) = Ng;
+       ccl_fetch(sd, N) = Ng;
 
        /* compute derivatives of P w.r.t. uv */
 #ifdef __DPDU__
-       sd->dPdu = (verts[0] - verts[2]);
-       sd->dPdv = (verts[1] - verts[2]);
+       ccl_fetch(sd, dPdu) = (verts[0] - verts[2]);
+       ccl_fetch(sd, dPdv) = (verts[1] - verts[2]);
 #endif
 
        /* compute smooth normal */
-       if(sd->shader & SHADER_SMOOTH_NORMAL) {
+       if(ccl_fetch(sd, shader) & SHADER_SMOOTH_NORMAL) {
                /* find attribute */
                AttributeElement elem;
-               int offset = find_attribute_motion(kg, sd->object, ATTR_STD_MOTION_VERTEX_NORMAL, &elem);
+               int offset = find_attribute_motion(kg, ccl_fetch(sd, object), ATTR_STD_MOTION_VERTEX_NORMAL, &elem);
                kernel_assert(offset != ATTR_STD_NOT_FOUND);
 
                /* fetch vertex coordinates */
@@ -308,10 +308,10 @@ ccl_device_noinline void motion_triangle_shader_setup(KernelGlobals *kg, ShaderD
                normals[2] = (1.0f - t)*normals[2] + t*next_normals[2];
 
                /* interpolate between vertices */
-               float u = sd->u;
-               float v = sd->v;
+               float u = ccl_fetch(sd, u);
+               float v = ccl_fetch(sd, v);
                float w = 1.0f - u - v;
-               sd->N = (u*normals[0] + v*normals[1] + w*normals[2]);
+               ccl_fetch(sd, N) = (u*normals[0] + v*normals[1] + w*normals[2]);
        }
 }
 
index 7df710102328e4c474b11d9ccb4726fbf6dec995..40cbca243a7d3a7e97b4d73be1f0922335c6fcf3 100644 (file)
@@ -123,9 +123,9 @@ ccl_device_inline Transform object_fetch_transform_motion_test(KernelGlobals *kg
 ccl_device_inline void object_position_transform(KernelGlobals *kg, const ShaderData *sd, float3 *P)
 {
 #ifdef __OBJECT_MOTION__
-       *P = transform_point(&sd->ob_tfm, *P);
+       *P = transform_point(&ccl_fetch(sd, ob_tfm), *P);
 #else
-       Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
+       Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM);
        *P = transform_point(&tfm, *P);
 #endif
 }
@@ -135,9 +135,9 @@ ccl_device_inline void object_position_transform(KernelGlobals *kg, const Shader
 ccl_device_inline void object_inverse_position_transform(KernelGlobals *kg, const ShaderData *sd, float3 *P)
 {
 #ifdef __OBJECT_MOTION__
-       *P = transform_point(&sd->ob_itfm, *P);
+       *P = transform_point(&ccl_fetch(sd, ob_itfm), *P);
 #else
-       Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
+       Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM);
        *P = transform_point(&tfm, *P);
 #endif
 }
@@ -147,9 +147,9 @@ ccl_device_inline void object_inverse_position_transform(KernelGlobals *kg, cons
 ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg, const ShaderData *sd, float3 *N)
 {
 #ifdef __OBJECT_MOTION__
-       *N = normalize(transform_direction_transposed(&sd->ob_tfm, *N));
+       *N = normalize(transform_direction_transposed(&ccl_fetch(sd, ob_tfm), *N));
 #else
-       Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
+       Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM);
        *N = normalize(transform_direction_transposed(&tfm, *N));
 #endif
 }
@@ -159,9 +159,9 @@ ccl_device_inline void object_inverse_normal_transform(KernelGlobals *kg, const
 ccl_device_inline void object_normal_transform(KernelGlobals *kg, const ShaderData *sd, float3 *N)
 {
 #ifdef __OBJECT_MOTION__
-       *N = normalize(transform_direction_transposed(&sd->ob_itfm, *N));
+       *N = normalize(transform_direction_transposed(&ccl_fetch(sd, ob_itfm), *N));
 #else
-       Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
+       Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM);
        *N = normalize(transform_direction_transposed(&tfm, *N));
 #endif
 }
@@ -171,9 +171,9 @@ ccl_device_inline void object_normal_transform(KernelGlobals *kg, const ShaderDa
 ccl_device_inline void object_dir_transform(KernelGlobals *kg, const ShaderData *sd, float3 *D)
 {
 #ifdef __OBJECT_MOTION__
-       *D = transform_direction(&sd->ob_tfm, *D);
+       *D = transform_direction(&ccl_fetch(sd, ob_tfm), *D);
 #else
-       Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
+       Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM);
        *D = transform_direction(&tfm, *D);
 #endif
 }
@@ -183,9 +183,9 @@ ccl_device_inline void object_dir_transform(KernelGlobals *kg, const ShaderData
 ccl_device_inline void object_inverse_dir_transform(KernelGlobals *kg, const ShaderData *sd, float3 *D)
 {
 #ifdef __OBJECT_MOTION__
-       *D = transform_direction(&sd->ob_itfm, *D);
+       *D = transform_direction(&ccl_fetch(sd, ob_itfm), *D);
 #else
-       Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_INVERSE_TRANSFORM);
+       Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_INVERSE_TRANSFORM);
        *D = transform_direction(&tfm, *D);
 #endif
 }
@@ -194,13 +194,13 @@ ccl_device_inline void object_inverse_dir_transform(KernelGlobals *kg, const Sha
 
 ccl_device_inline float3 object_location(KernelGlobals *kg, const ShaderData *sd)
 {
-       if(sd->object == OBJECT_NONE)
+       if(ccl_fetch(sd, object) == OBJECT_NONE)
                return make_float3(0.0f, 0.0f, 0.0f);
 
 #ifdef __OBJECT_MOTION__
-       return make_float3(sd->ob_tfm.x.w, sd->ob_tfm.y.w, sd->ob_tfm.z.w);
+       return make_float3(ccl_fetch(sd, ob_tfm).x.w, ccl_fetch(sd, ob_tfm).y.w, ccl_fetch(sd, ob_tfm).z.w);
 #else
-       Transform tfm = object_fetch_transform(kg, sd->object, OBJECT_TRANSFORM);
+       Transform tfm = object_fetch_transform(kg, ccl_fetch(sd, object), OBJECT_TRANSFORM);
        return make_float3(tfm.x.w, tfm.y.w, tfm.z.w);
 #endif
 }
@@ -296,7 +296,7 @@ ccl_device_inline void object_motion_info(KernelGlobals *kg, int object, int *nu
 
 ccl_device int shader_pass_id(KernelGlobals *kg, const ShaderData *sd)
 {
-       return kernel_tex_fetch(__shader_flag, (sd->shader & SHADER_MASK)*2 + 1);
+       return kernel_tex_fetch(__shader_flag, (ccl_fetch(sd, shader) & SHADER_MASK)*2 + 1);
 }
 
 /* Particle data from which object was instanced */
@@ -377,7 +377,7 @@ ccl_device_inline float3 bvh_inverse_direction(float3 dir)
 
 /* Transform ray into object space to enter static object in BVH */
 
-ccl_device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t)
+ccl_device_inline void bvh_instance_push(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, ccl_addr_space float *t)
 {
        Transform tfm = object_fetch_transform(kg, object, OBJECT_INVERSE_TRANSFORM);
 
@@ -425,7 +425,7 @@ ccl_device_inline void qbvh_instance_push(KernelGlobals *kg,
 
 /* Transorm ray to exit static object in BVH */
 
-ccl_device_inline void bvh_instance_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, float *t)
+ccl_device_inline void bvh_instance_pop(KernelGlobals *kg, int object, const Ray *ray, float3 *P, float3 *dir, float3 *idir, ccl_addr_space float *t)
 {
        if(*t != FLT_MAX) {
                Transform tfm = object_fetch_transform(kg, object, OBJECT_TRANSFORM);
@@ -520,5 +520,38 @@ ccl_device_inline void bvh_instance_motion_pop_factor(KernelGlobals *kg, int obj
 
 #endif
 
+/* TODO(sergey): This is only for until we've got OpenCL 2.0
+ * on all devices we consider supported. It'll be replaced with
+ * generic address space.
+ */
+
+#ifdef __KERNEL_OPENCL__
+ccl_device_inline void object_dir_transform_addrspace(KernelGlobals *kg,
+                                                      const ShaderData *sd,
+                                                      ccl_addr_space float3 *D)
+{
+       float3 private_D = *D;
+       object_dir_transform(kg, sd, &private_D);
+       *D = private_D;
+}
+
+ccl_device_inline void object_normal_transform_addrspace(KernelGlobals *kg,
+                                                         const ShaderData *sd,
+                                                         ccl_addr_space float3 *N)
+{
+       float3 private_N = *N;
+       object_dir_transform(kg, sd, &private_N);
+       *N = private_N;
+}
+#endif
+
+#ifndef __KERNEL_OPENCL__
+#  define object_dir_transform_auto object_dir_transform
+#  define object_normal_transform_auto object_normal_transform
+#else
+#  define object_dir_transform_auto object_dir_transform_addrspace
+#  define object_normal_transform_auto object_normal_transform_addrspace
+#endif
+
 CCL_NAMESPACE_END
 
index d2543c5943e214320755951319eb4fbde2c6376a..30f12d32355eda5f374d368f62afa8839d560b64 100644 (file)
@@ -25,16 +25,16 @@ CCL_NAMESPACE_BEGIN
 
 ccl_device float primitive_attribute_float(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float *dx, float *dy)
 {
-       if(sd->type & PRIMITIVE_ALL_TRIANGLE) {
+       if(ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE) {
                return triangle_attribute_float(kg, sd, elem, offset, dx, dy);
        }
 #ifdef __HAIR__
-       else if(sd->type & PRIMITIVE_ALL_CURVE) {
+       else if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
                return curve_attribute_float(kg, sd, elem, offset, dx, dy);
        }
 #endif
 #ifdef __VOLUME__
-       else if(sd->object != OBJECT_NONE && elem == ATTR_ELEMENT_VOXEL) {
+       else if(ccl_fetch(sd, object) != OBJECT_NONE && elem == ATTR_ELEMENT_VOXEL) {
                return volume_attribute_float(kg, sd, elem, offset, dx, dy);
        }
 #endif
@@ -47,16 +47,16 @@ ccl_device float primitive_attribute_float(KernelGlobals *kg, const ShaderData *
 
 ccl_device float3 primitive_attribute_float3(KernelGlobals *kg, const ShaderData *sd, AttributeElement elem, int offset, float3 *dx, float3 *dy)
 {
-       if(sd->type & PRIMITIVE_ALL_TRIANGLE) {
+       if(ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE) {
                return triangle_attribute_float3(kg, sd, elem, offset, dx, dy);
        }
 #ifdef __HAIR__
-       else if(sd->type & PRIMITIVE_ALL_CURVE) {
+       else if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE) {
                return curve_attribute_float3(kg, sd, elem, offset, dx, dy);
        }
 #endif
 #ifdef __VOLUME__
-       else if(sd->object != OBJECT_NONE && elem == ATTR_ELEMENT_VOXEL) {
+       else if(ccl_fetch(sd, object) != OBJECT_NONE && elem == ATTR_ELEMENT_VOXEL) {
                return volume_attribute_float3(kg, sd, elem, offset, dx, dy);
        }
 #endif
@@ -108,9 +108,9 @@ ccl_device bool primitive_ptex(KernelGlobals *kg, ShaderData *sd, float2 *uv, in
 ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd)
 {
 #ifdef __HAIR__
-       if(sd->type & PRIMITIVE_ALL_CURVE)
+       if(ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE)
 #ifdef __DPDU__
-               return normalize(sd->dPdu);
+               return normalize(ccl_fetch(sd, dPdu));
 #else
                return make_float3(0.0f, 0.0f, 0.0f);
 #endif
@@ -124,12 +124,12 @@ ccl_device float3 primitive_tangent(KernelGlobals *kg, ShaderData *sd)
                float3 data = primitive_attribute_float3(kg, sd, attr_elem, attr_offset, NULL, NULL);
                data = make_float3(-(data.y - 0.5f), (data.x - 0.5f), 0.0f);
                object_normal_transform(kg, sd, &data);
-               return cross(sd->N, normalize(cross(data, sd->N)));
+               return cross(ccl_fetch(sd, N), normalize(cross(data, ccl_fetch(sd, N))));
        }
        else {
                /* otherwise use surface derivatives */
 #ifdef __DPDU__
-               return normalize(sd->dPdu);
+               return normalize(ccl_fetch(sd, dPdu));
 #else
                return make_float3(0.0f, 0.0f, 0.0f);
 #endif
@@ -144,16 +144,16 @@ ccl_device float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *sd)
        float3 center;
 
 #ifdef __HAIR__
-       bool is_curve_primitive = sd->type & PRIMITIVE_ALL_CURVE;
+       bool is_curve_primitive = ccl_fetch(sd, type) & PRIMITIVE_ALL_CURVE;
        if(is_curve_primitive) {
                center = curve_motion_center_location(kg, sd);
 
-               if(!(sd->flag & SD_TRANSFORM_APPLIED))
+               if(!(ccl_fetch(sd, flag) & SD_TRANSFORM_APPLIED))
                        object_position_transform(kg, sd, &center);
        }
        else
 #endif
-               center = sd->P;
+               center = ccl_fetch(sd, P);
 
        float3 motion_pre = center, motion_post = center;
 
@@ -164,16 +164,16 @@ ccl_device float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *sd)
        if(offset != ATTR_STD_NOT_FOUND) {
                /* get motion info */
                int numverts, numkeys;
-               object_motion_info(kg, sd->object, NULL, &numverts, &numkeys);
+               object_motion_info(kg, ccl_fetch(sd, object), NULL, &numverts, &numkeys);
 
                /* lookup attributes */
-               int offset_next = (sd->type & PRIMITIVE_ALL_TRIANGLE)? offset + numverts: offset + numkeys;
+               int offset_next = (ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE)? offset + numverts: offset + numkeys;
 
                motion_pre = primitive_attribute_float3(kg, sd, elem, offset, NULL, NULL);
                motion_post = primitive_attribute_float3(kg, sd, elem, offset_next, NULL, NULL);
 
 #ifdef __HAIR__
-               if(is_curve_primitive && (sd->flag & SD_OBJECT_HAS_VERTEX_MOTION) == 0) {
+               if(is_curve_primitive && (ccl_fetch(sd, flag) & SD_OBJECT_HAS_VERTEX_MOTION) == 0) {
                        object_position_transform(kg, sd, &motion_pre);
                        object_position_transform(kg, sd, &motion_post);
                }
@@ -184,10 +184,10 @@ ccl_device float4 primitive_motion_vector(KernelGlobals *kg, ShaderData *sd)
         * transformation was set match the world/object space of motion_pre/post */
        Transform tfm;
        
-       tfm = object_fetch_vector_transform(kg, sd->object, OBJECT_VECTOR_MOTION_PRE);
+       tfm = object_fetch_vector_transform(kg, ccl_fetch(sd, object), OBJECT_VECTOR_MOTION_PRE);
        motion_pre = transform_point(&tfm, motion_pre);
 
-       tfm = object_fetch_vector_transform(kg, sd->object, OBJECT_VECTOR_MOTION_POST);
+       tfm = object_fetch_vector_transform(kg, ccl_fetch(sd, object), OBJECT_VECTOR_MOTION_POST);
        motion_post = transform_point(&tfm, motion_post);
 
        float3 motion_center;
index dd3928682e3d155daebd1c94259a81e74def2255..995dfac5b09e94f0046805c760ece40ba789ceb0 100644 (file)
@@ -27,14 +27,14 @@ CCL_NAMESPACE_BEGIN
 ccl_device_inline float3 triangle_normal(KernelGlobals *kg, ShaderData *sd)
 {
        /* load triangle vertices */
-       float4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
+       float4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim));
 
        float3 v0 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.x)));
        float3 v1 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.y)));
        float3 v2 = float4_to_float3(kernel_tex_fetch(__tri_verts, __float_as_int(tri_vindex.z)));
        
        /* return normal */
-       if(sd->flag & SD_NEGATIVE_SCALE_APPLIED)
+       if(ccl_fetch(sd, flag) & SD_NEGATIVE_SCALE_APPLIED)
                return normalize(cross(v2 - v0, v1 - v0));
        else
                return normalize(cross(v1 - v0, v2 - v0));
@@ -94,7 +94,7 @@ ccl_device_inline float3 triangle_smooth_normal(KernelGlobals *kg, int prim, flo
 
 /* Ray differentials on triangle */
 
-ccl_device_inline void triangle_dPdudv(KernelGlobals *kg, int prim, float3 *dPdu, float3 *dPdv)
+ccl_device_inline void triangle_dPdudv(KernelGlobals *kg, int prim, ccl_addr_space float3 *dPdu, ccl_addr_space float3 *dPdv)
 {
        /* fetch triangle vertex coordinates */
        float4 tri_vindex = kernel_tex_fetch(__tri_vindex, prim);
@@ -116,34 +116,34 @@ ccl_device float triangle_attribute_float(KernelGlobals *kg, const ShaderData *s
                if(dx) *dx = 0.0f;
                if(dy) *dy = 0.0f;
 
-               return kernel_tex_fetch(__attributes_float, offset + sd->prim);
+               return kernel_tex_fetch(__attributes_float, offset + ccl_fetch(sd, prim));
        }
        else if(elem == ATTR_ELEMENT_VERTEX || elem == ATTR_ELEMENT_VERTEX_MOTION) {
-               float4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
+               float4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim));
 
                float f0 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.x));
                float f1 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.y));
                float f2 = kernel_tex_fetch(__attributes_float, offset + __float_as_int(tri_vindex.z));
 
 #ifdef __RAY_DIFFERENTIALS__
-               if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
-               if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
+               if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2;
+               if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2;
 #endif
 
-               return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
+               return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2;
        }
        else if(elem == ATTR_ELEMENT_CORNER) {
-               int tri = offset + sd->prim*3;
+               int tri = offset + ccl_fetch(sd, prim)*3;
                float f0 = kernel_tex_fetch(__attributes_float, tri + 0);
                float f1 = kernel_tex_fetch(__attributes_float, tri + 1);
                float f2 = kernel_tex_fetch(__attributes_float, tri + 2);
 
 #ifdef __RAY_DIFFERENTIALS__
-               if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
-               if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
+               if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2;
+               if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2;
 #endif
 
-               return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
+               return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2;
        }
        else {
                if(dx) *dx = 0.0f;
@@ -159,24 +159,24 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData
                if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
                if(dy) *dy = make_float3(0.0f, 0.0f, 0.0f);
 
-               return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + sd->prim));
+               return float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + ccl_fetch(sd, prim)));
        }
        else if(elem == ATTR_ELEMENT_VERTEX || elem == ATTR_ELEMENT_VERTEX_MOTION) {
-               float4 tri_vindex = kernel_tex_fetch(__tri_vindex, sd->prim);
+               float4 tri_vindex = kernel_tex_fetch(__tri_vindex, ccl_fetch(sd, prim));
 
                float3 f0 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.x)));
                float3 f1 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.y)));
                float3 f2 = float4_to_float3(kernel_tex_fetch(__attributes_float3, offset + __float_as_int(tri_vindex.z)));
 
 #ifdef __RAY_DIFFERENTIALS__
-               if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
-               if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
+               if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2;
+               if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2;
 #endif
 
-               return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
+               return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2;
        }
        else if(elem == ATTR_ELEMENT_CORNER || elem == ATTR_ELEMENT_CORNER_BYTE) {
-               int tri = offset + sd->prim*3;
+               int tri = offset + ccl_fetch(sd, prim)*3;
                float3 f0, f1, f2;
 
                if(elem == ATTR_ELEMENT_CORNER) {
@@ -191,11 +191,11 @@ ccl_device float3 triangle_attribute_float3(KernelGlobals *kg, const ShaderData
                }
 
 #ifdef __RAY_DIFFERENTIALS__
-               if(dx) *dx = sd->du.dx*f0 + sd->dv.dx*f1 - (sd->du.dx + sd->dv.dx)*f2;
-               if(dy) *dy = sd->du.dy*f0 + sd->dv.dy*f1 - (sd->du.dy + sd->dv.dy)*f2;
+               if(dx) *dx = ccl_fetch(sd, du).dx*f0 + ccl_fetch(sd, dv).dx*f1 - (ccl_fetch(sd, du).dx + ccl_fetch(sd, dv).dx)*f2;
+               if(dy) *dy = ccl_fetch(sd, du).dy*f0 + ccl_fetch(sd, dv).dy*f1 - (ccl_fetch(sd, du).dy + ccl_fetch(sd, dv).dy)*f2;
 #endif
 
-               return sd->u*f0 + sd->v*f1 + (1.0f - sd->u - sd->v)*f2;
+               return ccl_fetch(sd, u)*f0 + ccl_fetch(sd, v)*f1 + (1.0f - ccl_fetch(sd, u) - ccl_fetch(sd, v))*f2;
        }
        else {
                if(dx) *dx = make_float3(0.0f, 0.0f, 0.0f);
index 5a47260a4ee03fa4d2f84758bb8b0cb473a1ecf0..19e394936ee6a963ce021e07479ec8c565a74f4c 100644 (file)
@@ -25,6 +25,8 @@
 #include "kernel_path.h"
 #include "kernel_bake.h"
 
+#ifdef __COMPILE_ONLY_MEGAKERNEL__
+
 __kernel void kernel_ocl_path_trace(
        ccl_constant KernelData *data,
        ccl_global float *buffer,
@@ -52,17 +54,18 @@ __kernel void kernel_ocl_path_trace(
                kernel_path_trace(kg, buffer, rng_state, sample, x, y, offset, stride);
 }
 
-__kernel void kernel_ocl_convert_to_byte(
+#else // __COMPILE_ONLY_MEGAKERNEL__
+
+__kernel void kernel_ocl_shader(
        ccl_constant KernelData *data,
-       ccl_global uchar4 *rgba,
-       ccl_global float *buffer,
+       ccl_global uint4 *input,
+       ccl_global float4 *output,
 
 #define KERNEL_TEX(type, ttype, name) \
        ccl_global type *name,
 #include "kernel_textures.h"
 
-       float sample_scale,
-       int sx, int sy, int sw, int sh, int offset, int stride)
+       int type, int sx, int sw, int offset, int sample)
 {
        KernelGlobals kglobals, *kg = &kglobals;
 
@@ -73,23 +76,21 @@ __kernel void kernel_ocl_convert_to_byte(
 #include "kernel_textures.h"
 
        int x = sx + get_global_id(0);
-       int y = sy + get_global_id(1);
 
-       if(x < sx + sw && y < sy + sh)
-               kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride);
+       if(x < sx + sw)
+               kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, x, sample);
 }
 
-__kernel void kernel_ocl_convert_to_half_float(
+__kernel void kernel_ocl_bake(
        ccl_constant KernelData *data,
-       ccl_global uchar4 *rgba,
-       ccl_global float *buffer,
+       ccl_global uint4 *input,
+       ccl_global float4 *output,
 
 #define KERNEL_TEX(type, ttype, name) \
        ccl_global type *name,
 #include "kernel_textures.h"
 
-       float sample_scale,
-       int sx, int sy, int sw, int sh, int offset, int stride)
+       int type, int sx, int sw, int offset, int sample)
 {
        KernelGlobals kglobals, *kg = &kglobals;
 
@@ -100,22 +101,22 @@ __kernel void kernel_ocl_convert_to_half_float(
 #include "kernel_textures.h"
 
        int x = sx + get_global_id(0);
-       int y = sy + get_global_id(1);
 
-       if(x < sx + sw && y < sy + sh)
-               kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride);
+       if(x < sx + sw)
+               kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, x, offset, sample);
 }
 
-__kernel void kernel_ocl_shader(
+__kernel void kernel_ocl_convert_to_byte(
        ccl_constant KernelData *data,
-       ccl_global uint4 *input,
-       ccl_global float4 *output,
+       ccl_global uchar4 *rgba,
+       ccl_global float *buffer,
 
 #define KERNEL_TEX(type, ttype, name) \
        ccl_global type *name,
 #include "kernel_textures.h"
 
-       int type, int sx, int sw, int offset, int sample)
+       float sample_scale,
+       int sx, int sy, int sw, int sh, int offset, int stride)
 {
        KernelGlobals kglobals, *kg = &kglobals;
 
@@ -126,21 +127,23 @@ __kernel void kernel_ocl_shader(
 #include "kernel_textures.h"
 
        int x = sx + get_global_id(0);
+       int y = sy + get_global_id(1);
 
-       if(x < sx + sw)
-               kernel_shader_evaluate(kg, input, output, (ShaderEvalType)type, x, sample);
+       if(x < sx + sw && y < sy + sh)
+               kernel_film_convert_to_byte(kg, rgba, buffer, sample_scale, x, y, offset, stride);
 }
 
-__kernel void kernel_ocl_bake(
+__kernel void kernel_ocl_convert_to_half_float(
        ccl_constant KernelData *data,
-       ccl_global uint4 *input,
-       ccl_global float4 *output,
+       ccl_global uchar4 *rgba,
+       ccl_global float *buffer,
 
 #define KERNEL_TEX(type, ttype, name) \
        ccl_global type *name,
 #include "kernel_textures.h"
 
-       int type, int sx, int sw, int offset, int sample)
+       float sample_scale,
+       int sx, int sy, int sw, int sh, int offset, int stride)
 {
        KernelGlobals kglobals, *kg = &kglobals;
 
@@ -151,8 +154,10 @@ __kernel void kernel_ocl_bake(
 #include "kernel_textures.h"
 
        int x = sx + get_global_id(0);
+       int y = sy + get_global_id(1);
 
-       if(x < sx + sw)
-               kernel_bake_evaluate(kg, input, output, (ShaderEvalType)type, x, offset, sample);
+       if(x < sx + sw && y < sy + sh)
+               kernel_film_convert_to_half_float(kg, rgba, buffer, sample_scale, x, y, offset, stride);
 }
 
+#endif // __COMPILE_ONLY_MEGAKERNEL__
\ No newline at end of file
index 369c615eadedb8cc1a1b00f49aa09890dc3431b1..257728b624476ca1e395432ce5bef539bb98107d 100644 (file)
@@ -176,7 +176,7 @@ ccl_device_inline void path_radiance_init(PathRadiance *L, int use_light_pass)
 #endif
 }
 
-ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, float3 *throughput,
+ccl_device_inline void path_radiance_bsdf_bounce(PathRadiance *L, ccl_addr_space float3 *throughput,
        BsdfEval *bsdf_eval, float bsdf_pdf, int bounce, int bsdf_label)
 {
        float inverse_pdf = 1.0f/bsdf_pdf;
diff --git a/intern/cycles/kernel/kernel_background_buffer_update.cl b/intern/cycles/kernel/kernel_background_buffer_update.cl
new file mode 100644 (file)
index 0000000..bf08477
--- /dev/null
@@ -0,0 +1,282 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split.h"
+
+/*
+ * Note on kernel_ocl_path_trace_background_buffer_update kernel.
+ * This is the fourth kernel in the ray tracing logic, and the third
+ * of the path iteration kernels. This kernel takes care of rays that hit
+ * the background (sceneintersect kernel), and for the rays of
+ * state RAY_UPDATE_BUFFER it updates the ray's accumulated radiance in
+ * the output buffer. This kernel also takes care of rays that have been determined
+ * to-be-regenerated.
+ *
+ * We will empty QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue in this kernel
+ *
+ * Typically all rays that are in state RAY_HIT_BACKGROUND, RAY_UPDATE_BUFFER
+ * will be eventually set to RAY_TO_REGENERATE state in this kernel. Finally all rays of ray_state
+ * RAY_TO_REGENERATE will be regenerated and put in queue QUEUE_ACTIVE_AND_REGENERATED_RAYS.
+ *
+ * The input and output are as follows,
+ *
+ * rng_coop ---------------------------------------------|--- kernel_ocl_path_trace_background_buffer_update --|--- PathRadiance_coop
+ * throughput_coop --------------------------------------|                                                     |--- L_transparent_coop
+ * per_sample_output_buffers ----------------------------|                                                     |--- per_sample_output_buffers
+ * Ray_coop ---------------------------------------------|                                                     |--- ray_state
+ * PathState_coop ---------------------------------------|                                                     |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * L_transparent_coop -----------------------------------|                                                     |--- Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS)
+ * ray_state --------------------------------------------|                                                     |--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ----|                                                     |--- Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS)
+ * Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ------|                                                     |--- work_array
+ * parallel_samples -------------------------------------|                                                     |--- PathState_coop
+ * end_sample -------------------------------------------|                                                     |--- throughput_coop
+ * kg (globals + data) ----------------------------------|                                                     |--- rng_coop
+ * rng_state --------------------------------------------|                                                     |--- Ray
+ * PathRadiance_coop ------------------------------------|                                                     |
+ * sw ---------------------------------------------------|                                                     |
+ * sh ---------------------------------------------------|                                                     |
+ * sx ---------------------------------------------------|                                                     |
+ * sy ---------------------------------------------------|                                                     |
+ * stride -----------------------------------------------|                                                     |
+ * work_array -------------------------------------------|                                                     |--- work_array
+ * queuesize --------------------------------------------|                                                     |
+ * start_sample -----------------------------------------|                                                     |--- work_pool_wgs
+ * work_pool_wgs ----------------------------------------|                                                     |
+ * num_samples ------------------------------------------|                                                     |
+ *
+ * note on shader_data : shader_data argument is neither an input nor an output for this kernel. It is just filled and consumed here itself.
+ * Note on Queues :
+ * This kernel fetches rays from QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue.
+ *
+ * State of queues when this kernel is called :
+ * At entry,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE rays
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_UPDATE_BUFFER, RAY_HIT_BACKGROUND, RAY_TO_REGENERATE rays
+ * At exit,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be empty
+ */
+__kernel void kernel_ocl_path_trace_background_buffer_update(
+       ccl_global char *globals,
+       ccl_constant KernelData *data,
+       ccl_global char *shader_data,
+       ccl_global float *per_sample_output_buffers,
+       ccl_global uint *rng_state,
+       ccl_global uint *rng_coop,                   /* Required for buffer Update */
+       ccl_global float3 *throughput_coop,          /* Required for background hit processing */
+       PathRadiance *PathRadiance_coop,  /* Required for background hit processing and buffer Update */
+       ccl_global Ray *Ray_coop,                    /* Required for background hit processing */
+       ccl_global PathState *PathState_coop,        /* Required for background hit processing */
+       ccl_global float *L_transparent_coop,        /* Required for background hit processing and buffer Update */
+       ccl_global char *ray_state,                  /* Stores information on the current state of a ray */
+       int sw, int sh, int sx, int sy, int stride,
+       int rng_state_offset_x,
+       int rng_state_offset_y,
+       int rng_state_stride,
+       ccl_global unsigned int *work_array,         /* Denotes work of each ray */
+       ccl_global int *Queue_data,                  /* Queues memory */
+       ccl_global int *Queue_index,                 /* Tracks the number of elements in each queue */
+       int queuesize,                               /* Size (capacity) of each queue */
+       int end_sample,
+       int start_sample,
+#ifdef __WORK_STEALING__
+       ccl_global unsigned int *work_pool_wgs,
+       unsigned int num_samples,
+#endif
+#ifdef __KERNEL_DEBUG__
+       DebugData *debugdata_coop,
+#endif
+       int parallel_samples                         /* Number of samples to be processed in parallel */
+       )
+{
+       ccl_local unsigned int local_queue_atomics;
+       if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+               local_queue_atomics = 0;
+       }
+       barrier(CLK_LOCAL_MEM_FENCE);
+
+       int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+       if(ray_index == 0) {
+               /* We will empty this queue in this kernel */
+               Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
+       }
+       char enqueue_flag = 0;
+       ray_index = get_ray_index(ray_index, QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, Queue_data, queuesize, 1);
+
+#ifdef __COMPUTE_DEVICE_GPU__
+       /* If we are executing on a GPU device, we exit all threads that are not required.
+        * If we are executing on a CPU device, then we need to keep all threads active
+        * since we have barrier() calls later in the kernel. CPU devices
+        * expect all threads to execute barrier statement.
+        */
+       if(ray_index == QUEUE_EMPTY_SLOT)
+               return;
+#endif
+
+#ifndef __COMPUTE_DEVICE_GPU__
+       if(ray_index != QUEUE_EMPTY_SLOT) {
+#endif
+               /* Load kernel globals structure and ShaderData strucuture */
+               KernelGlobals *kg = (KernelGlobals *)globals;
+               ShaderData *sd = (ShaderData *)shader_data;
+
+#ifdef __KERNEL_DEBUG__
+               DebugData *debug_data = &debugdata_coop[ray_index];
+#endif
+               ccl_global PathState *state = &PathState_coop[ray_index];
+               PathRadiance *L = L = &PathRadiance_coop[ray_index];
+               ccl_global Ray *ray = &Ray_coop[ray_index];
+               ccl_global float3 *throughput = &throughput_coop[ray_index];
+               ccl_global float *L_transparent = &L_transparent_coop[ray_index];
+               ccl_global uint *rng = &rng_coop[ray_index];
+
+#ifdef __WORK_STEALING__
+               unsigned int my_work;
+               ccl_global float *initial_per_sample_output_buffers;
+               ccl_global uint *initial_rng;
+#endif
+               unsigned int sample;
+               unsigned int tile_x;
+               unsigned int tile_y;
+               unsigned int pixel_x;
+               unsigned int pixel_y;
+               unsigned int my_sample_tile;
+
+#ifdef __WORK_STEALING__
+               my_work = work_array[ray_index];
+               sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
+               get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index);
+               my_sample_tile = 0;
+               initial_per_sample_output_buffers = per_sample_output_buffers;
+               initial_rng = rng_state;
+#else // __WORK_STEALING__
+               sample = work_array[ray_index];
+               int tile_index = ray_index / parallel_samples;
+               /* buffer and rng_state's stride is "stride". Find x and y using ray_index */
+               tile_x = tile_index % sw;
+               tile_y = tile_index / sw;
+               my_sample_tile = ray_index - (tile_index * parallel_samples);
+#endif
+               rng_state += (rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride;
+               per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride;
+
+               if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
+                       /* eval background shader if nothing hit */
+                       if(kernel_data.background.transparent && (state->flag & PATH_RAY_CAMERA)) {
+                               *L_transparent = (*L_transparent) + average((*throughput));
+#ifdef __PASSES__
+                       if(!(kernel_data.film.pass_flag & PASS_BACKGROUND))
+#endif
+                               ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+                       }
+
+                       if(IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND))
+                       {
+#ifdef __BACKGROUND__
+                               /* sample background shader */
+                               float3 L_background = indirect_background(kg, state, ray, sd);
+                               path_radiance_accum_background(L, (*throughput), L_background, state->bounce);
+#endif
+                               ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+                       }
+               }
+
+               if(IS_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER)) {
+                       float3 L_sum = path_radiance_clamp_and_sum(kg, L);
+                       kernel_write_light_passes(kg, per_sample_output_buffers, L, sample);
+#ifdef __KERNEL_DEBUG__
+                       kernel_write_debug_passes(kg, per_sample_output_buffers, state, debug_data, sample);
+#endif
+                       float4 L_rad = make_float4(L_sum.x, L_sum.y, L_sum.z, 1.0f - (*L_transparent));
+
+                       /* accumulate result in output buffer */
+                       kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad);
+                       path_rng_end(kg, rng_state, *rng);
+
+                       ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
+               }
+
+               if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
+#ifdef __WORK_STEALING__
+                       /* We have completed current work; So get next work */
+                       int valid_work = get_next_work(work_pool_wgs, &my_work, sw, sh, num_samples, parallel_samples, ray_index);
+                       if(!valid_work) {
+                               /* If work is invalid, this means no more work is available and the thread may exit */
+                               ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
+                       }
+#else
+                       if((sample + parallel_samples) >= end_sample) {
+                               ASSIGN_RAY_STATE(ray_state, ray_index, RAY_INACTIVE);
+                       }
+#endif
+                       if(IS_STATE(ray_state, ray_index, RAY_TO_REGENERATE)) {
+#ifdef __WORK_STEALING__
+                               work_array[ray_index] = my_work;
+                               /* Get the sample associated with the current work */
+                               sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
+                               /* Get pixel and tile position associated with current work */
+                               get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index);
+                               my_sample_tile = 0;
+
+                               /* Remap rng_state according to the current work */
+                               rng_state = initial_rng + ((rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride);
+                               /* Remap per_sample_output_buffers according to the current work */
+                               per_sample_output_buffers = initial_per_sample_output_buffers
+                                                                                       + (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride;
+#else
+                               work_array[ray_index] = sample + parallel_samples;
+                               sample = work_array[ray_index];
+
+                               /* Get ray position from ray index */
+                               pixel_x = sx + ((ray_index / parallel_samples) % sw);
+                               pixel_y = sy + ((ray_index / parallel_samples) / sw);
+#endif
+
+                               /* initialize random numbers and ray */
+                               kernel_path_trace_setup(kg, rng_state, sample, pixel_x, pixel_y, rng, ray);
+
+                               if(ray->t != 0.0f) {
+                                       /* Initialize throughput, L_transparent, Ray, PathState; These rays proceed with path-iteration*/
+                                       *throughput = make_float3(1.0f, 1.0f, 1.0f);
+                                       *L_transparent = 0.0f;
+                                       path_radiance_init(L, kernel_data.film.use_light_pass);
+                                       path_state_init(kg, state, rng, sample, ray);
+#ifdef __KERNEL_DEBUG__
+                                       debug_data_init(debug_data);
+#endif
+                                       ASSIGN_RAY_STATE(ray_state, ray_index, RAY_REGENERATED);
+                                       enqueue_flag = 1;
+                               } else {
+                                       /*These rays do not participate in path-iteration */
+                                       float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+                                       /* accumulate result in output buffer */
+                                       kernel_write_pass_float4(per_sample_output_buffers, sample, L_rad);
+                                       path_rng_end(kg, rng_state, *rng);
+
+                                       ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
+                               }
+                       }
+               }
+#ifndef __COMPUTE_DEVICE_GPU__
+       }
+#endif
+
+       /* Enqueue RAY_REGENERATED rays into QUEUE_ACTIVE_AND_REGENERATED_RAYS; These rays
+        * will be made active during next SceneIntersectkernel
+        */
+       enqueue_ray_index_local(ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, enqueue_flag, queuesize, &local_queue_atomics, Queue_data, Queue_index);
+}
index 1e81210007ceba5076d72265312a84f338579e41..3ce5134181aff19bd03fbded0ffc3858b259d211 100644 (file)
@@ -39,7 +39,7 @@ ccl_device float2 camera_sample_aperture(KernelGlobals *kg, float u, float v)
        return bokeh;
 }
 
-ccl_device void camera_sample_perspective(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, Ray *ray)
+ccl_device void camera_sample_perspective(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, ccl_addr_space Ray *ray)
 {
        /* create ray form raster position */
        Transform rastertocamera = kernel_data.cam.rastertocamera;
@@ -108,8 +108,7 @@ ccl_device void camera_sample_perspective(KernelGlobals *kg, float raster_x, flo
 }
 
 /* Orthographic Camera */
-
-ccl_device void camera_sample_orthographic(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, Ray *ray)
+ccl_device void camera_sample_orthographic(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, ccl_addr_space Ray *ray)
 {
        /* create ray form raster position */
        Transform rastertocamera = kernel_data.cam.rastertocamera;
@@ -175,7 +174,7 @@ ccl_device void camera_sample_orthographic(KernelGlobals *kg, float raster_x, fl
 
 /* Panorama Camera */
 
-ccl_device void camera_sample_panorama(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, Ray *ray)
+ccl_device void camera_sample_panorama(KernelGlobals *kg, float raster_x, float raster_y, float lens_u, float lens_v, ccl_addr_space Ray *ray)
 {
        Transform rastertocamera = kernel_data.cam.rastertocamera;
        float3 Pcamera = transform_perspective(&rastertocamera, make_float3(raster_x, raster_y, 0.0f));
@@ -256,7 +255,7 @@ ccl_device void camera_sample_panorama(KernelGlobals *kg, float raster_x, float
 /* Common */
 
 ccl_device void camera_sample(KernelGlobals *kg, int x, int y, float filter_u, float filter_v,
-       float lens_u, float lens_v, float time, Ray *ray)
+       float lens_u, float lens_v, float time, ccl_addr_space Ray *ray)
 {
        /* pixel filter */
        int filter_table_offset = kernel_data.film.filter_table_offset;
@@ -319,7 +318,7 @@ ccl_device_inline float3 camera_world_to_ndc(KernelGlobals *kg, ShaderData *sd,
 {
        if(kernel_data.cam.type != CAMERA_PANORAMA) {
                /* perspective / ortho */
-               if(sd->object == PRIM_NONE && kernel_data.cam.type == CAMERA_PERSPECTIVE)
+               if(ccl_fetch(sd, object) == PRIM_NONE && kernel_data.cam.type == CAMERA_PERSPECTIVE)
                        P += camera_position(kg);
 
                Transform tfm = kernel_data.cam.worldtondc;
@@ -329,7 +328,7 @@ ccl_device_inline float3 camera_world_to_ndc(KernelGlobals *kg, ShaderData *sd,
                /* panorama */
                Transform tfm = kernel_data.cam.worldtocamera;
 
-               if(sd->object != OBJECT_NONE)
+               if(ccl_fetch(sd, object) != OBJECT_NONE)
                        P = normalize(transform_point(&tfm, P));
                else
                        P = normalize(transform_direction(&tfm, P));
index b209fff88e669d003d4ea8b4176d9490530fa942..7a5f70ff3da83645e9d84c4748c6519ca4705769 100644 (file)
@@ -40,6 +40,8 @@
 #include "util_half.h"
 #include "util_types.h"
 
+#define ccl_addr_space
+
 /* On x86_64, versions of glibc < 2.16 have an issue where expf is
  * much slower than the double version.  This was fixed in glibc 2.16.
  */
index 61e208fcab335d61fd5dc079836b9a22bd39af92..9fdd3abfec35bd10ae08c4cf4e10879777ab10ed 100644 (file)
@@ -41,6 +41,7 @@
 #define ccl_global
 #define ccl_constant
 #define ccl_may_alias
+#define ccl_addr_space
 
 /* No assert supported for CUDA */
 
index 12b0f11760051a7611a3fe21754249db41c92c15..e8b36d2605d835b0008e26678d61e1d7089016db 100644 (file)
 #define ccl_local __local
 #define ccl_private __private
 
+#ifdef __SPLIT_KERNEL__
+#define ccl_addr_space __global
+#else
+#define ccl_addr_space
+#endif
+
 /* Selective nodes compilation. */
 #ifndef __NODES_MAX_GROUP__
 #  define __NODES_MAX_GROUP__ NODE_GROUP_LEVEL_MAX
diff --git a/intern/cycles/kernel/kernel_data_init.cl b/intern/cycles/kernel/kernel_data_init.cl
new file mode 100644 (file)
index 0000000..dbf9e62
--- /dev/null
@@ -0,0 +1,384 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split.h"
+
+/*
+ * Note on kernel_ocl_path_trace_data_initialization kernel
+ * This kernel Initializes structures needed in path-iteration kernels.
+ * This is the first kernel in ray-tracing logic.
+ *
+ * Ray state of rays outside the tile-boundary will be marked RAY_INACTIVE
+ *
+ * Its input and output are as follows,
+ *
+ * Un-initialized rng---------------|--- kernel_ocl_path_trace_data_initialization ---|--- Initialized rng
+ * Un-initialized throughput -------|                                                 |--- Initialized throughput
+ * Un-initialized L_transparent ----|                                                 |--- Initialized L_transparent
+ * Un-initialized PathRadiance -----|                                                 |--- Initialized PathRadiance
+ * Un-initialized Ray --------------|                                                 |--- Initialized Ray
+ * Un-initialized PathState --------|                                                 |--- Initialized PathState
+ * Un-initialized QueueData --------|                                                 |--- Initialized QueueData (to QUEUE_EMPTY_SLOT)
+ * Un-initilaized QueueIndex -------|                                                 |--- Initialized QueueIndex (to 0)
+ * Un-initialized use_queues_flag---|                                                 |--- Initialized use_queues_flag (to false)
+ * Un-initialized ray_state --------|                                                 |--- Initialized ray_state
+ * parallel_samples --------------- |                                                 |--- Initialized per_sample_output_buffers
+ * rng_state -----------------------|                                                 |--- Initialized work_array
+ * data ----------------------------|                                                 |--- Initialized work_pool_wgs
+ * start_sample --------------------|                                                 |
+ * sx ------------------------------|                                                 |
+ * sy ------------------------------|                                                 |
+ * sw ------------------------------|                                                 |
+ * sh ------------------------------|                                                 |
+ * stride --------------------------|                                                 |
+ * queuesize -----------------------|                                                 |
+ * num_samples ---------------------|                                                 |
+ *
+ * Note on Queues :
+ * All slots in queues are initialized to queue empty slot;
+ * The number of elements in the queues is initialized to 0;
+ */
+__kernel void kernel_ocl_path_trace_data_initialization(
+       ccl_global char *globals,
+       ccl_global char *shader_data_sd,                  /* Arguments related to ShaderData */
+       ccl_global char *shader_data_sd_DL_shadow,     /* Arguments related to ShaderData */
+
+       ccl_global float3 *P_sd,
+       ccl_global float3 *P_sd_DL_shadow,
+
+       ccl_global float3 *N_sd,
+       ccl_global float3 *N_sd_DL_shadow,
+
+       ccl_global float3 *Ng_sd,
+       ccl_global float3 *Ng_sd_DL_shadow,
+
+       ccl_global float3 *I_sd,
+       ccl_global float3 *I_sd_DL_shadow,
+
+       ccl_global int *shader_sd,
+       ccl_global int *shader_sd_DL_shadow,
+
+       ccl_global int *flag_sd,
+       ccl_global int *flag_sd_DL_shadow,
+
+       ccl_global int *prim_sd,
+       ccl_global int *prim_sd_DL_shadow,
+
+       ccl_global int *type_sd,
+       ccl_global int *type_sd_DL_shadow,
+
+       ccl_global float *u_sd,
+       ccl_global float *u_sd_DL_shadow,
+
+       ccl_global float *v_sd,
+       ccl_global float *v_sd_DL_shadow,
+
+       ccl_global int *object_sd,
+       ccl_global int *object_sd_DL_shadow,
+
+       ccl_global float *time_sd,
+       ccl_global float *time_sd_DL_shadow,
+
+       ccl_global float *ray_length_sd,
+       ccl_global float *ray_length_sd_DL_shadow,
+
+       ccl_global int *ray_depth_sd,
+       ccl_global int *ray_depth_sd_DL_shadow,
+
+       ccl_global int *transparent_depth_sd,
+       ccl_global int *transparent_depth_sd_DL_shadow,
+       #ifdef __RAY_DIFFERENTIALS__
+       ccl_global differential3 *dP_sd,
+       ccl_global differential3 *dP_sd_DL_shadow,
+
+       ccl_global differential3 *dI_sd,
+       ccl_global differential3 *dI_sd_DL_shadow,
+
+       ccl_global differential *du_sd,
+       ccl_global differential *du_sd_DL_shadow,
+
+       ccl_global differential *dv_sd,
+       ccl_global differential *dv_sd_DL_shadow,
+       #endif
+       #ifdef __DPDU__
+       ccl_global float3 *dPdu_sd,
+       ccl_global float3 *dPdu_sd_DL_shadow,
+
+       ccl_global float3 *dPdv_sd,
+       ccl_global float3 *dPdv_sd_DL_shadow,
+       #endif
+       ShaderClosure *closure_sd,
+       ShaderClosure *closure_sd_DL_shadow,
+
+       ccl_global int *num_closure_sd,
+       ccl_global int *num_closure_sd_DL_shadow,
+
+       ccl_global float *randb_closure_sd,
+       ccl_global float *randb_closure_sd_DL_shadow,
+
+       ccl_global float3 *ray_P_sd,
+       ccl_global float3 *ray_P_sd_DL_shadow,
+
+       ccl_global differential3 *ray_dP_sd,
+       ccl_global differential3 *ray_dP_sd_DL_shadow,
+
+       ccl_constant KernelData *data,
+       ccl_global float *per_sample_output_buffers,
+       ccl_global uint *rng_state,
+       ccl_global uint *rng_coop,                   /* rng array to store rng values for all rays */
+       ccl_global float3 *throughput_coop,          /* throughput array to store throughput values for all rays */
+       ccl_global float *L_transparent_coop,        /* L_transparent array to store L_transparent values for all rays */
+       PathRadiance *PathRadiance_coop,  /* PathRadiance array to store PathRadiance values for all rays */
+       ccl_global Ray *Ray_coop,                    /* Ray array to store Ray information for all rays */
+       ccl_global PathState *PathState_coop,        /* PathState array to store PathState information for all rays */
+       ccl_global char *ray_state,                  /* Stores information on current state of a ray */
+
+#define KERNEL_TEX(type, ttype, name) \
+       ccl_global type *name,
+#include "kernel_textures.h"
+
+       int start_sample, int sx, int sy, int sw, int sh, int offset, int stride,
+       int rng_state_offset_x,
+       int rng_state_offset_y,
+       int rng_state_stride,
+       ccl_global int *Queue_data,                  /* Memory for queues */
+       ccl_global int *Queue_index,                 /* Tracks the number of elements in queues */
+       int queuesize,                               /* size (capacity) of the queue */
+       ccl_global char *use_queues_flag,            /* flag to decide if scene-intersect kernel should use queues to fetch ray index */
+       ccl_global unsigned int *work_array,         /* work array to store which work each ray belongs to */
+#ifdef __WORK_STEALING__
+       ccl_global unsigned int *work_pool_wgs,      /* Work pool for each work group */
+       unsigned int num_samples,                    /* Total number of samples per pixel */
+#endif
+#ifdef __KERNEL_DEBUG__
+       DebugData *debugdata_coop,
+#endif
+       int parallel_samples                         /* Number of samples to be processed in parallel */
+       )
+{
+
+       /* Load kernel globals structure */
+       KernelGlobals *kg = (KernelGlobals *)globals;
+
+       kg->data = data;
+#define KERNEL_TEX(type, ttype, name) \
+       kg->name = name;
+#include "kernel_textures.h"
+
+       /* Load ShaderData structure */
+       ShaderData *sd = (ShaderData *)shader_data_sd;
+       ShaderData *sd_DL_shadow = (ShaderData *)shader_data_sd_DL_shadow;
+
+       sd->P = P_sd;
+       sd_DL_shadow->P = P_sd_DL_shadow;
+
+       sd->N = N_sd;
+       sd_DL_shadow->N = N_sd_DL_shadow;
+
+       sd->Ng = Ng_sd;
+       sd_DL_shadow->Ng = Ng_sd_DL_shadow;
+
+       sd->I = I_sd;
+       sd_DL_shadow->I = I_sd_DL_shadow;
+
+       sd->shader = shader_sd;
+       sd_DL_shadow->shader = shader_sd_DL_shadow;
+
+       sd->flag = flag_sd;
+       sd_DL_shadow->flag = flag_sd_DL_shadow;
+
+       sd->prim = prim_sd;
+       sd_DL_shadow->prim = prim_sd_DL_shadow;
+
+       sd->type = type_sd;
+       sd_DL_shadow->type = type_sd_DL_shadow;
+
+       sd->u = u_sd;
+       sd_DL_shadow->u = u_sd_DL_shadow;
+
+       sd->v = v_sd;
+       sd_DL_shadow->v = v_sd_DL_shadow;
+
+       sd->object = object_sd;
+       sd_DL_shadow->object = object_sd_DL_shadow;
+
+       sd->time = time_sd;
+       sd_DL_shadow->time = time_sd_DL_shadow;
+
+       sd->ray_length = ray_length_sd;
+       sd_DL_shadow->ray_length = ray_length_sd_DL_shadow;
+
+       sd->ray_depth = ray_depth_sd;
+       sd_DL_shadow->ray_depth = ray_depth_sd_DL_shadow;
+
+       sd->transparent_depth = transparent_depth_sd;
+       sd_DL_shadow->transparent_depth = transparent_depth_sd_DL_shadow;
+
+#ifdef __RAY_DIFFERENTIALS__
+       sd->dP = dP_sd;
+       sd_DL_shadow->dP = dP_sd_DL_shadow;
+
+       sd->dI = dI_sd;
+       sd_DL_shadow->dI = dI_sd_DL_shadow;
+
+       sd->du = du_sd;
+       sd_DL_shadow->du = du_sd_DL_shadow;
+
+       sd->dv = dv_sd;
+       sd_DL_shadow->dv = dv_sd_DL_shadow;
+#ifdef __DPDU__
+       sd->dPdu = dPdu_sd;
+       sd_DL_shadow->dPdu = dPdu_sd_DL_shadow;
+
+       sd->dPdv = dPdv_sd;
+       sd_DL_shadow->dPdv = dPdv_sd_DL_shadow;
+#endif
+#endif
+
+       sd->closure = closure_sd;
+       sd_DL_shadow->closure = closure_sd_DL_shadow;
+
+       sd->num_closure = num_closure_sd;
+       sd_DL_shadow->num_closure = num_closure_sd_DL_shadow;
+
+       sd->randb_closure = randb_closure_sd;
+       sd_DL_shadow->randb_closure = randb_closure_sd_DL_shadow;
+
+       sd->ray_P = ray_P_sd;
+       sd_DL_shadow->ray_P = ray_P_sd_DL_shadow;
+
+       sd->ray_dP = ray_dP_sd;
+       sd_DL_shadow->ray_dP = ray_dP_sd_DL_shadow;
+
+       int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+
+#ifdef __WORK_STEALING__
+       int lid = get_local_id(1) * get_local_size(0) + get_local_id(0);
+       /* Initialize work_pool_wgs */
+       if(lid == 0) {
+               int group_index = get_group_id(1) * get_num_groups(0) + get_group_id(0);
+               work_pool_wgs[group_index] = 0;
+       }
+       barrier(CLK_LOCAL_MEM_FENCE);
+#endif // __WORK_STEALING__
+
+       /* Initialize queue data and queue index */
+       if(thread_index < queuesize) {
+               /* Initialize active ray queue */
+               Queue_data[QUEUE_ACTIVE_AND_REGENERATED_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
+               /* Initialize background and buffer update queue */
+               Queue_data[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
+               /* Initialize shadow ray cast of AO queue */
+               Queue_data[QUEUE_SHADOW_RAY_CAST_AO_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
+               /* Initialize shadow ray cast of direct lighting queue */
+               Queue_data[QUEUE_SHADOW_RAY_CAST_DL_RAYS * queuesize + thread_index] = QUEUE_EMPTY_SLOT;
+       }
+
+       if(thread_index == 0) {
+               Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
+               Queue_index[QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS] = 0;
+               Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
+               Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
+               /* The scene-intersect kernel should not use the queues very first time.
+                * since the queue would be empty.
+                */
+               use_queues_flag[0] = 0;
+       }
+
+       int x = get_global_id(0);
+       int y = get_global_id(1);
+
+       if(x < (sw * parallel_samples) && y < sh) {
+
+               int ray_index = x + y * (sw * parallel_samples);
+
+               /* This is the first assignment to ray_state; So we dont use ASSIGN_RAY_STATE macro */
+               ray_state[ray_index] = RAY_ACTIVE;
+
+               unsigned int my_sample;
+               unsigned int pixel_x;
+               unsigned int pixel_y;
+               unsigned int tile_x;
+               unsigned int tile_y;
+               unsigned int my_sample_tile;
+
+#ifdef __WORK_STEALING__
+               unsigned int my_work = 0;
+               /* get work */
+               get_next_work(work_pool_wgs, &my_work, sw, sh, num_samples, parallel_samples, ray_index);
+               /* Get the sample associated with the work */
+               my_sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
+
+               my_sample_tile = 0;
+
+               /* Get pixel and tile position associated with the work */
+               get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index);
+               work_array[ray_index] = my_work;
+#else // __WORK_STEALING__
+
+               unsigned int tile_index = ray_index / parallel_samples;
+               tile_x = tile_index % sw;
+               tile_y = tile_index / sw;
+               my_sample_tile = ray_index - (tile_index * parallel_samples);
+               my_sample = my_sample_tile + start_sample;
+
+               /* Initialize work array */
+               work_array[ray_index] = my_sample ;
+
+               /* Calculate pixel position of this ray */
+               pixel_x = sx + tile_x;
+               pixel_y = sy + tile_y;
+#endif // __WORK_STEALING__
+
+               rng_state += (rng_state_offset_x + tile_x) + (rng_state_offset_y + tile_y) * rng_state_stride;
+
+               /* Initialise per_sample_output_buffers to all zeros */
+               per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + (my_sample_tile)) * kernel_data.film.pass_stride;
+               int per_sample_output_buffers_iterator = 0;
+               for(per_sample_output_buffers_iterator = 0; per_sample_output_buffers_iterator < kernel_data.film.pass_stride; per_sample_output_buffers_iterator++) {
+                       per_sample_output_buffers[per_sample_output_buffers_iterator] = 0.0f;
+               }
+
+               /* initialize random numbers and ray */
+               kernel_path_trace_setup(kg, rng_state, my_sample, pixel_x, pixel_y, &rng_coop[ray_index], &Ray_coop[ray_index]);
+
+               if(Ray_coop[ray_index].t != 0.0f) {
+                       /* Initialize throuput, L_transparent, Ray, PathState; These rays proceed with path-iteration*/
+                       throughput_coop[ray_index] = make_float3(1.0f, 1.0f, 1.0f);
+                       L_transparent_coop[ray_index] = 0.0f;
+                       path_radiance_init(&PathRadiance_coop[ray_index], kernel_data.film.use_light_pass);
+                       path_state_init(kg, &PathState_coop[ray_index], &rng_coop[ray_index], my_sample, &Ray_coop[ray_index]);
+#ifdef __KERNEL_DEBUG__
+                       debug_data_init(&debugdata_coop[ray_index]);
+#endif
+               } else {
+                       /*These rays do not participate in path-iteration */
+
+                       float4 L_rad = make_float4(0.0f, 0.0f, 0.0f, 0.0f);
+                       /* accumulate result in output buffer */
+                       kernel_write_pass_float4(per_sample_output_buffers, my_sample, L_rad);
+                       path_rng_end(kg, rng_state, rng_coop[ray_index]);
+
+                       ASSIGN_RAY_STATE(ray_state, ray_index, RAY_TO_REGENERATE);
+               }
+       }
+
+       /* Mark rest of the ray-state indices as RAY_INACTIVE */
+       if(thread_index < (get_global_size(0) * get_global_size(1)) - (sh * (sw * parallel_samples))) {
+               /* First assignment, hence we dont use ASSIGN_RAY_STATE macro */
+               ray_state[((sw * parallel_samples) * sh) + thread_index] = RAY_INACTIVE;
+       }
+}
index f532442ba41ea6d4651763b788bc3810f365309e..94ede397848ae6ea9686aceb5cddb9af44830d7d 100644 (file)
@@ -23,7 +23,7 @@ ccl_device_inline void debug_data_init(DebugData *debug_data)
 
 ccl_device_inline void kernel_write_debug_passes(KernelGlobals *kg,
                                                  ccl_global float *buffer,
-                                                 PathState *state,
+                                                 ccl_addr_space PathState *state,
                                                  DebugData *debug_data,
                                                  int sample)
 {
index e5fbd5b450e3881c0a522024f3418d1d59854f10..ae1e70f0167013cc240c4cd77ee325c9f2aa3c39 100644 (file)
@@ -18,7 +18,7 @@ CCL_NAMESPACE_BEGIN
 
 /* See "Tracing Ray Differentials", Homan Igehy, 1999. */
 
-ccl_device void differential_transfer(differential3 *dP_, const differential3 dP, float3 D, const differential3 dD, float3 Ng, float t)
+ccl_device void differential_transfer(ccl_addr_space differential3 *dP_, const differential3 dP, float3 D, const differential3 dD, float3 Ng, float t)
 {
        /* ray differential transfer through homogeneous medium, to
         * compute dPdx/dy at a shading point from the incoming ray */
@@ -31,7 +31,7 @@ ccl_device void differential_transfer(differential3 *dP_, const differential3 dP
        dP_->dy = tmpy - dot(tmpy, Ng)*tmp;
 }
 
-ccl_device void differential_incoming(differential3 *dI, const differential3 dD)
+ccl_device void differential_incoming(ccl_addr_space differential3 *dI, const differential3 dD)
 {
        /* compute dIdx/dy at a shading point, we just need to negate the
         * differential of the ray direction */
@@ -40,7 +40,7 @@ ccl_device void differential_incoming(differential3 *dI, const differential3 dD)
        dI->dy = -dD.dy;
 }
 
-ccl_device void differential_dudv(differential *du, differential *dv, float3 dPdu, float3 dPdv, differential3 dP, float3 Ng)
+ccl_device void differential_dudv(ccl_addr_space differential *du, ccl_addr_space differential *dv, float3 dPdu, float3 dPdv, differential3 dP, float3 Ng)
 {
        /* now we have dPdx/dy from the ray differential transfer, and dPdu/dv
         * from the primitive, we can compute dudx/dy and dvdx/dy. these are
diff --git a/intern/cycles/kernel/kernel_direct_lighting.cl b/intern/cycles/kernel/kernel_direct_lighting.cl
new file mode 100644 (file)
index 0000000..8bdc7dc
--- /dev/null
@@ -0,0 +1,137 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split.h"
+
+/*
+ * Note on kernel_ocl_path_trace_direct_lighting kernel.
+ * This is the eighth kernel in the ray tracing logic. This is the seventh
+ * of the path iteration kernels. This kernel takes care of direct lighting
+ * logic. However, the "shadow ray cast" part of direct lighting is handled
+ * in the next kernel.
+ *
+ * This kernels determines the rays for which a shadow_blocked() function associated with direct lighting should be executed.
+ * Those rays for which a shadow_blocked() function for direct-lighting must be executed, are marked with flag RAY_SHADOW_RAY_CAST_DL and
+ * enqueued into the queue QUEUE_SHADOW_RAY_CAST_DL_RAYS
+ *
+ * The input and output are as follows,
+ *
+ * rng_coop -----------------------------------------|--- kernel_ocl_path_trace_direct_lighting --|--- BSDFEval_coop
+ * PathState_coop -----------------------------------|                                            |--- ISLamp_coop
+ * shader_data --------------------------------------|                                            |--- LightRay_coop
+ * ray_state ----------------------------------------|                                            |--- ray_state
+ * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ---|                                            |
+ * kg (globals + data) ------------------------------|                                            |
+ * queuesize ----------------------------------------|                                            |
+ *
+ * note on shader_DL : shader_DL is neither input nor output to this kernel; shader_DL is filled and consumed in this kernel itself.
+ * Note on Queues :
+ * This kernel only reads from the QUEUE_ACTIVE_AND_REGENERATED_RAYS queue and processes
+ * only the rays of state RAY_ACTIVE; If a ray needs to execute the corresponding shadow_blocked
+ * part, after direct lighting, the ray is marked with RAY_SHADOW_RAY_CAST_DL flag.
+ *
+ * State of queues when this kernel is called :
+ * state of queues QUEUE_ACTIVE_AND_REGENERATED_RAYS and QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be same
+ * before and after this kernel call.
+ * QUEUE_SHADOW_RAY_CAST_DL_RAYS queue will be filled with rays for which a shadow_blocked function must be executed, after this
+ * kernel call. Before this kernel call the QUEUE_SHADOW_RAY_CAST_DL_RAYS will be empty.
+ */
+__kernel void kernel_ocl_path_trace_direct_lighting(
+       ccl_global char *globals,
+       ccl_constant KernelData *data,
+       ccl_global char *shader_data,           /* Required for direct lighting */
+       ccl_global char *shader_DL,             /* Required for direct lighting */
+       ccl_global uint *rng_coop,              /* Required for direct lighting */
+       ccl_global PathState *PathState_coop,   /* Required for direct lighting */
+       ccl_global int *ISLamp_coop,            /* Required for direct lighting */
+       ccl_global Ray *LightRay_coop,          /* Required for direct lighting */
+       ccl_global BsdfEval *BSDFEval_coop,     /* Required for direct lighting */
+       ccl_global char *ray_state,             /* Denotes the state of each ray */
+       ccl_global int *Queue_data,             /* Queue memory */
+       ccl_global int *Queue_index,            /* Tracks the number of elements in each queue */
+       int queuesize                           /* Size (capacity) of each queue */
+       )
+{
+       ccl_local unsigned int local_queue_atomics;
+       if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+               local_queue_atomics = 0;
+       }
+       barrier(CLK_LOCAL_MEM_FENCE);
+
+       char enqueue_flag = 0;
+       int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+       ray_index = get_ray_index(ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, Queue_data, queuesize, 0);
+
+#ifdef __COMPUTE_DEVICE_GPU__
+       /* If we are executing on a GPU device, we exit all threads that are not required
+        * If we are executing on a CPU device, then we need to keep all threads active
+        * since we have barrier() calls later in the kernel. CPU devices,
+        * expect all threads to execute barrier statement.
+        */
+       if(ray_index == QUEUE_EMPTY_SLOT)
+               return;
+#endif
+
+#ifndef __COMPUTE_DEVICE_GPU__
+       if(ray_index != QUEUE_EMPTY_SLOT) {
+#endif
+               if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+                       /* Load kernel globals structure and ShaderData structure */
+                       KernelGlobals *kg = (KernelGlobals *)globals;
+                       ShaderData *sd = (ShaderData *)shader_data;
+                       ShaderData *sd_DL  = (ShaderData *)shader_DL;
+
+                       ccl_global PathState *state = &PathState_coop[ray_index];
+
+                       /* direct lighting */
+#ifdef __EMISSION__
+                       if((kernel_data.integrator.use_direct_light && (ccl_fetch(sd, flag) & SD_BSDF_HAS_EVAL))) {
+                               /* sample illumination from lights to find path contribution */
+                               ccl_global RNG* rng = &rng_coop[ray_index];
+                               float light_t = path_state_rng_1D(kg, rng, state, PRNG_LIGHT);
+                               float light_u, light_v;
+                               path_state_rng_2D(kg, rng, state, PRNG_LIGHT_U, &light_u, &light_v);
+
+#ifdef __OBJECT_MOTION__
+                               light_ray.time = sd->time;
+#endif
+                               LightSample ls;
+                               light_sample(kg, light_t, light_u, light_v, ccl_fetch(sd, time), ccl_fetch(sd, P), state->bounce, &ls);
+
+                               Ray light_ray;
+                               BsdfEval L_light;
+                               bool is_lamp;
+                               if(direct_emission(kg, sd, &ls, &light_ray, &L_light, &is_lamp, state->bounce, state->transparent_bounce, sd_DL)) {
+                                       /* write intermediate data to global memory to access from the next kernel */
+                                       LightRay_coop[ray_index] = light_ray;
+                                       BSDFEval_coop[ray_index] = L_light;
+                                       ISLamp_coop[ray_index] = is_lamp;
+                                       /// mark ray state for next shadow kernel
+                                       ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_DL);
+                                       enqueue_flag = 1;
+                               }
+                       }
+#endif
+               }
+#ifndef __COMPUTE_DEVICE_GPU__
+       }
+#endif
+
+#ifdef __EMISSION__
+       /* Enqueue RAY_SHADOW_RAY_CAST_DL rays */
+       enqueue_ray_index_local(ray_index, QUEUE_SHADOW_RAY_CAST_DL_RAYS, enqueue_flag, queuesize, &local_queue_atomics, Queue_data, Queue_index);
+#endif
+}
index 6c5a5fac8c547060f1b750a60e1d93df5415f20c..1fee58353602f57cd603a13fadf2050af725267a 100644 (file)
 CCL_NAMESPACE_BEGIN
 
 /* Direction Emission */
-
 ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,
-       LightSample *ls, float3 I, differential3 dI, float t, float time, int bounce, int transparent_bounce)
+       LightSample *ls, float3 I, differential3 dI, float t, float time, int bounce, int transparent_bounce
+#ifdef __SPLIT_KERNEL__
+       ,ShaderData *sd_input
+#endif
+)
 {
        /* setup shading at emitter */
-       ShaderData sd;
+#ifdef __SPLIT_KERNEL__
+       ShaderData *sd = sd_input;
+#else
+       ShaderData sd_object;
+       ShaderData *sd = &sd_object;
+#endif
        float3 eval;
 
 #ifdef __BACKGROUND_MIS__
@@ -37,23 +45,23 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,
                ray.dP = differential3_zero();
                ray.dD = dI;
 
-               shader_setup_from_background(kg, &sd, &ray, bounce+1, transparent_bounce);
-               eval = shader_eval_background(kg, &sd, 0, SHADER_CONTEXT_EMISSION);
+               shader_setup_from_background(kg, sd, &ray, bounce+1, transparent_bounce);
+               eval = shader_eval_background(kg, sd, 0, SHADER_CONTEXT_EMISSION);
        }
        else
 #endif
        {
-               shader_setup_from_sample(kg, &sd, ls->P, ls->Ng, I, ls->shader, ls->object, ls->prim, ls->u, ls->v, t, time, bounce+1, transparent_bounce);
+               shader_setup_from_sample(kg, sd, ls->P, ls->Ng, I, ls->shader, ls->object, ls->prim, ls->u, ls->v, t, time, bounce+1, transparent_bounce);
 
-               ls->Ng = sd.Ng;
+               ls->Ng = ccl_fetch(sd, Ng);
 
                /* no path flag, we're evaluating this for all closures. that's weak but
                 * we'd have to do multiple evaluations otherwise */
-               shader_eval_surface(kg, &sd, 0.0f, 0, SHADER_CONTEXT_EMISSION);
+               shader_eval_surface(kg, sd, 0.0f, 0, SHADER_CONTEXT_EMISSION);
 
                /* evaluate emissive closure */
-               if(sd.flag & SD_EMISSION)
-                       eval = shader_emissive_eval(kg, &sd);
+               if(ccl_fetch(sd, flag) & SD_EMISSION)
+                       eval = shader_emissive_eval(kg, sd);
                else
                        eval = make_float3(0.0f, 0.0f, 0.0f);
        }
@@ -63,9 +71,14 @@ ccl_device_noinline float3 direct_emissive_eval(KernelGlobals *kg,
        return eval;
 }
 
+/* The argument sd_DL is meaningful only for split kernel. Other uses can just pass NULL */
 ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd,
        LightSample *ls, Ray *ray, BsdfEval *eval, bool *is_lamp,
-       int bounce, int transparent_bounce)
+       int bounce, int transparent_bounce
+#ifdef __SPLIT_KERNEL__
+       , ShaderData *sd_DL
+#endif
+       )
 {
        if(ls->pdf == 0.0f)
                return false;
@@ -74,7 +87,14 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd,
        differential3 dD = differential3_zero();
 
        /* evaluate closure */
-       float3 light_eval = direct_emissive_eval(kg, ls, -ls->D, dD, ls->t, sd->time, bounce, transparent_bounce);
+
+       float3 light_eval = direct_emissive_eval(kg, ls, -ls->D, dD, ls->t, ccl_fetch(sd, time),
+                                                bounce,
+                                                transparent_bounce
+#ifdef __SPLIT_KERNEL__
+                                                ,sd_DL
+#endif
+                                                );
 
        if(is_zero(light_eval))
                return false;
@@ -83,7 +103,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd,
        float bsdf_pdf;
 
 #ifdef __VOLUME__
-       if(sd->prim != PRIM_NONE)
+       if(ccl_fetch(sd, prim) != PRIM_NONE)
                shader_bsdf_eval(kg, sd, ls->D, eval, &bsdf_pdf);
        else
                shader_volume_phase_eval(kg, sd, ls->D, eval, &bsdf_pdf);
@@ -118,8 +138,8 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd,
 
        if(ls->shader & SHADER_CAST_SHADOW) {
                /* setup ray */
-               bool transmit = (dot(sd->Ng, ls->D) < 0.0f);
-               ray->P = ray_offset(sd->P, (transmit)? -sd->Ng: sd->Ng);
+               bool transmit = (dot(ccl_fetch(sd, Ng), ls->D) < 0.0f);
+               ray->P = ray_offset(ccl_fetch(sd, P), (transmit)? -ccl_fetch(sd, Ng): ccl_fetch(sd, Ng));
 
                if(ls->t == FLT_MAX) {
                        /* distant light */
@@ -132,7 +152,7 @@ ccl_device_noinline bool direct_emission(KernelGlobals *kg, ShaderData *sd,
                        ray->D = normalize_len(ray->D, &ray->t);
                }
 
-               ray->dP = sd->dP;
+               ray->dP = ccl_fetch(sd, dP);
                ray->dD = differential3_zero();
        }
        else {
@@ -154,14 +174,14 @@ ccl_device_noinline float3 indirect_primitive_emission(KernelGlobals *kg, Shader
        float3 L = shader_emissive_eval(kg, sd);
 
 #ifdef __HAIR__
-       if(!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS) && (sd->type & PRIMITIVE_ALL_TRIANGLE))
+       if(!(path_flag & PATH_RAY_MIS_SKIP) && (ccl_fetch(sd, flag) & SD_USE_MIS) && (ccl_fetch(sd, type) & PRIMITIVE_ALL_TRIANGLE))
 #else
-       if(!(path_flag & PATH_RAY_MIS_SKIP) && (sd->flag & SD_USE_MIS))
+       if(!(path_flag & PATH_RAY_MIS_SKIP) && (ccl_fetch(sd, flag) & SD_USE_MIS))
 #endif
        {
                /* multiple importance sampling, get triangle light pdf,
                 * and compute weight with respect to BSDF pdf */
-               float pdf = triangle_light_pdf(kg, sd->Ng, sd->I, t);
+               float pdf = triangle_light_pdf(kg, ccl_fetch(sd, Ng), ccl_fetch(sd, I), t);
                float mis_weight = power_heuristic(bsdf_pdf, pdf);
 
                return L*mis_weight;
@@ -172,7 +192,12 @@ ccl_device_noinline float3 indirect_primitive_emission(KernelGlobals *kg, Shader
 
 /* Indirect Lamp Emission */
 
-ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg, PathState *state, Ray *ray, float3 *emission)
+/* The argument sd is meaningful only for split kernel. Other uses can just pass NULL */
+ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg, PathState *state, Ray *ray, float3 *emission
+#ifdef __SPLIT_KERNEL__
+                                                ,ShaderData *sd
+#endif
+                                                )
 {
        bool hit_lamp = false;
 
@@ -196,7 +221,13 @@ ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg, PathState *st
                }
 #endif
 
-               float3 L = direct_emissive_eval(kg, &ls, -ray->D, ray->dD, ls.t, ray->time, state->bounce, state->transparent_bounce);
+               float3 L = direct_emissive_eval(kg, &ls, -ray->D, ray->dD, ls.t, ray->time,
+                                               state->bounce,
+                                               state->transparent_bounce
+#ifdef __SPLIT_KERNEL__
+                                               ,sd
+#endif
+                                               );
 
 #ifdef __VOLUME__
                if(state->volume_stack[0].shader != SHADER_NONE) {
@@ -225,7 +256,11 @@ ccl_device_noinline bool indirect_lamp_emission(KernelGlobals *kg, PathState *st
 
 /* Indirect Background */
 
-ccl_device_noinline float3 indirect_background(KernelGlobals *kg, PathState *state, Ray *ray)
+ccl_device_noinline float3 indirect_background(KernelGlobals *kg, ccl_addr_space PathState *state, ccl_addr_space Ray *ray
+#ifdef __SPLIT_KERNEL__
+                                               ,ShaderData *sd_global
+#endif
+                                               )
 {
 #ifdef __BACKGROUND__
        int shader = kernel_data.background.surface_shader;
@@ -241,11 +276,17 @@ ccl_device_noinline float3 indirect_background(KernelGlobals *kg, PathState *sta
                        return make_float3(0.0f, 0.0f, 0.0f);
        }
 
+#ifdef __SPLIT_KERNEL__
        /* evaluate background closure */
+       Ray priv_ray = *ray;
+       shader_setup_from_background(kg, sd_global, &priv_ray, state->bounce+1, state->transparent_bounce);
+       float3 L = shader_eval_background(kg, sd_global, state->flag, SHADER_CONTEXT_EMISSION);
+#else
        ShaderData sd;
        shader_setup_from_background(kg, &sd, ray, state->bounce+1, state->transparent_bounce);
 
        float3 L = shader_eval_background(kg, &sd, state->flag, SHADER_CONTEXT_EMISSION);
+#endif
 
 #ifdef __BACKGROUND_MIS__
        /* check if background light exists or if we should skip pdf */
index be2c879adb9594a6afbcb24976e17c006991f88a..17fa18909c46cf5f38e4ba9c90e09797cbeead57 100644 (file)
@@ -80,7 +80,7 @@ typedef struct KernelGlobals {} KernelGlobals;
 
 #ifdef __KERNEL_OPENCL__
 
-typedef struct KernelGlobals {
+typedef ccl_addr_space struct KernelGlobals {
        ccl_constant KernelData *data;
 
 #define KERNEL_TEX(type, ttype, name) \
diff --git a/intern/cycles/kernel/kernel_holdout_emission_blurring_pathtermination_ao.cl b/intern/cycles/kernel/kernel_holdout_emission_blurring_pathtermination_ao.cl
new file mode 100644 (file)
index 0000000..a2e5777
--- /dev/null
@@ -0,0 +1,283 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split.h"
+
+/*
+ * Note on kernel_ocl_path_trace_holdout_emission_blurring_pathtermination_ao kernel.
+ * This is the sixth kernel in the ray tracing logic. This is the fifth
+ * of the path iteration kernels. This kernel takes care of the logic to process
+ * "material of type holdout", indirect primitive emission, bsdf blurring,
+ * probabilistic path termination and AO.
+ *
+ * This kernels determines the rays for which a shadow_blocked() function associated with AO should be executed.
+ * Those rays for which a shadow_blocked() function for AO must be executed are marked with flag RAY_SHADOW_RAY_CAST_ao and
+ * enqueued into the queue QUEUE_SHADOW_RAY_CAST_AO_RAYS
+ *
+ * Ray state of rays that are terminated in this kernel are changed to RAY_UPDATE_BUFFER
+ *
+ * The input and output are as follows,
+ *
+ * rng_coop ---------------------------------------------|--- kernel_ocl_path_trace_holdout_emission_blurring_pathtermination_ao ---|--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * throughput_coop --------------------------------------|                                                                          |--- PathState_coop
+ * PathRadiance_coop ------------------------------------|                                                                          |--- throughput_coop
+ * Intersection_coop ------------------------------------|                                                                          |--- L_transparent_coop
+ * PathState_coop ---------------------------------------|                                                                          |--- per_sample_output_buffers
+ * L_transparent_coop -----------------------------------|                                                                          |--- PathRadiance_coop
+ * shader_data ------------------------------------------|                                                                          |--- ShaderData
+ * ray_state --------------------------------------------|                                                                          |--- ray_state
+ * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) -------|                                                                          |--- Queue_data (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---|                                                                          |--- AOAlpha_coop
+ * kg (globals + data) ----------------------------------|                                                                          |--- AOBSDF_coop
+ * parallel_samples -------------------------------------|                                                                          |--- AOLightRay_coop
+ * per_sample_output_buffers ----------------------------|                                                                          |
+ * sw ---------------------------------------------------|                                                                          |
+ * sh ---------------------------------------------------|                                                                          |
+ * sx ---------------------------------------------------|                                                                          |
+ * sy ---------------------------------------------------|                                                                          |
+ * stride -----------------------------------------------|                                                                          |
+ * work_array -------------------------------------------|                                                                          |
+ * queuesize --------------------------------------------|                                                                          |
+ * start_sample -----------------------------------------|                                                                          |
+ *
+ * Note on Queues :
+ * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS and processes only
+ * the rays of state RAY_ACTIVE.
+ * There are different points in this kernel where a ray may terminate and reach RAY_UPDATE_BUFFER
+ * state. These rays are enqueued into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will
+ * still be present in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has been
+ * changed to RAY_UPDATE_BUFFER, there is no problem.
+ *
+ * State of queues when this kernel is called :
+ * At entry,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE and RAY_REGENERATED rays
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE rays.
+ * QUEUE_SHADOW_RAY_CAST_AO_RAYS will be empty.
+ * At exit,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED and RAY_UPDATE_BUFFER rays
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays
+ * QUEUE_SHADOW_RAY_CAST_AO_RAYS will be filled with rays marked with flag RAY_SHADOW_RAY_CAST_AO
+ */
+
+__kernel void kernel_ocl_path_trace_holdout_emission_blurring_pathtermination_ao(
+       ccl_global char *globals,
+       ccl_constant KernelData *data,
+       ccl_global char *shader_data,               /* Required throughout the kernel except probabilistic path termination and AO */
+       ccl_global float *per_sample_output_buffers,
+       ccl_global uint *rng_coop,                  /* Required for "kernel_write_data_passes" and AO */
+       ccl_global float3 *throughput_coop,         /* Required for handling holdout material and AO */
+       ccl_global float *L_transparent_coop,       /* Required for handling holdout material */
+       PathRadiance *PathRadiance_coop, /* Required for "kernel_write_data_passes" and indirect primitive emission */
+       ccl_global PathState *PathState_coop,       /* Required throughout the kernel and AO */
+       Intersection *Intersection_coop, /* Required for indirect primitive emission */
+       ccl_global float3 *AOAlpha_coop,            /* Required for AO */
+       ccl_global float3 *AOBSDF_coop,             /* Required for AO */
+       ccl_global Ray *AOLightRay_coop,            /* Required for AO */
+       int sw, int sh, int sx, int sy, int stride,
+       ccl_global char *ray_state,                /* Denotes the state of each ray */
+       ccl_global unsigned int *work_array,       /* Denotes the work that each ray belongs to */
+       ccl_global int *Queue_data,                /* Queue memory */
+       ccl_global int *Queue_index,               /* Tracks the number of elements in each queue */
+       int queuesize,                             /* Size (capacity) of each queue */
+#ifdef __WORK_STEALING__
+       unsigned int start_sample,
+#endif
+       int parallel_samples                       /* Number of samples to be processed in parallel */
+       )
+{
+       ccl_local unsigned int local_queue_atomics_bg;
+       ccl_local unsigned int local_queue_atomics_ao;
+       if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+               local_queue_atomics_bg = 0;
+               local_queue_atomics_ao = 0;
+       }
+       barrier(CLK_LOCAL_MEM_FENCE);
+
+       char enqueue_flag = 0;
+       char enqueue_flag_AO_SHADOW_RAY_CAST = 0;
+       int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+       ray_index = get_ray_index(ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, Queue_data, queuesize, 0);
+
+#ifdef __COMPUTE_DEVICE_GPU__
+       /* If we are executing on a GPU device, we exit all threads that are not required
+        * If we are executing on a CPU device, then we need to keep all threads active
+        * since we have barrier() calls later in the kernel. CPU devices
+        * expect all threads to execute barrier statement.
+        */
+       if(ray_index == QUEUE_EMPTY_SLOT)
+               return;
+#endif
+
+#ifndef __COMPUTE_DEVICE_GPU__
+       if(ray_index != QUEUE_EMPTY_SLOT) {
+#endif
+               /* Load kernel globals structure and ShaderData structure */
+               KernelGlobals *kg = (KernelGlobals *)globals;
+               ShaderData *sd = (ShaderData *)shader_data;
+
+#ifdef __WORK_STEALING__
+               unsigned int my_work;
+               unsigned int pixel_x;
+               unsigned int pixel_y;
+#endif
+               unsigned int tile_x;
+               unsigned int tile_y;
+               int my_sample_tile;
+               unsigned int sample;
+
+               ccl_global RNG *rng = 0x0;
+               ccl_global PathState *state = 0x0;
+               float3 throughput;
+
+               if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+
+                       throughput = throughput_coop[ray_index];
+                       state = &PathState_coop[ray_index];
+                       rng = &rng_coop[ray_index];
+#ifdef __WORK_STEALING__
+                       my_work = work_array[ray_index];
+                       sample = get_my_sample(my_work, sw, sh, parallel_samples, ray_index) + start_sample;
+                       get_pixel_tile_position(&pixel_x, &pixel_y, &tile_x, &tile_y, my_work, sw, sh, sx, sy, parallel_samples, ray_index);
+                       my_sample_tile = 0;
+#else // __WORK_STEALING__
+                       sample = work_array[ray_index];
+                       /* buffer's stride is "stride"; Find x and y using ray_index */
+                       int tile_index = ray_index / parallel_samples;
+                       tile_x = tile_index % sw;
+                       tile_y = tile_index / sw;
+                       my_sample_tile = ray_index - (tile_index * parallel_samples);
+#endif // __WORK_STEALING__
+                       per_sample_output_buffers += (((tile_x + (tile_y * stride)) * parallel_samples) + my_sample_tile) * kernel_data.film.pass_stride;
+
+                       /* holdout */
+#ifdef __HOLDOUT__
+                       if((ccl_fetch(sd, flag) & (SD_HOLDOUT|SD_HOLDOUT_MASK)) && (state->flag & PATH_RAY_CAMERA)) {
+                               if(kernel_data.background.transparent) {
+                                       float3 holdout_weight;
+
+                                       if(ccl_fetch(sd, flag) & SD_HOLDOUT_MASK)
+                                               holdout_weight = make_float3(1.0f, 1.0f, 1.0f);
+                                       else
+                                               holdout_weight = shader_holdout_eval(kg, sd);
+
+                                       /* any throughput is ok, should all be identical here */
+                                       L_transparent_coop[ray_index] += average(holdout_weight*throughput);
+                               }
+
+                               if(ccl_fetch(sd, flag) & SD_HOLDOUT_MASK) {
+                                       ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+                                       enqueue_flag = 1;
+                               }
+                       }
+#endif
+               }
+
+               if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+
+                       PathRadiance *L = &PathRadiance_coop[ray_index];
+                       /* holdout mask objects do not write data passes */
+                       kernel_write_data_passes(kg, per_sample_output_buffers, L, sd, sample, state, throughput);
+
+                       /* blurring of bsdf after bounces, for rays that have a small likelihood
+                               * of following this particular path (diffuse, rough glossy) */
+                       if(kernel_data.integrator.filter_glossy != FLT_MAX) {
+                               float blur_pdf = kernel_data.integrator.filter_glossy*state->min_ray_pdf;
+
+                               if(blur_pdf < 1.0f) {
+                                       float blur_roughness = sqrtf(1.0f - blur_pdf)*0.5f;
+                                       shader_bsdf_blur(kg, sd, blur_roughness);
+                               }
+                       }
+
+#ifdef __EMISSION__
+                       /* emission */
+                       if(ccl_fetch(sd, flag) & SD_EMISSION) {
+                               /* todo: is isect.t wrong here for transparent surfaces? */
+                               float3 emission = indirect_primitive_emission(kg, sd, Intersection_coop[ray_index].t, state->flag, state->ray_pdf);
+                               path_radiance_accum_emission(L, throughput, emission, state->bounce);
+                       }
+#endif
+
+                       /* path termination. this is a strange place to put the termination, it's
+                        * mainly due to the mixed in MIS that we use. gives too many unneeded
+                        * shader evaluations, only need emission if we are going to terminate */
+                       float probability = path_state_terminate_probability(kg, state, throughput);
+
+                       if(probability == 0.0f) {
+                               ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+                               enqueue_flag = 1;
+                       }
+
+                       if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+                               if(probability != 1.0f) {
+                                       float terminate = path_state_rng_1D_for_decision(kg, rng, state, PRNG_TERMINATE);
+
+                                       if(terminate >= probability) {
+                                               ASSIGN_RAY_STATE(ray_state, ray_index, RAY_UPDATE_BUFFER);
+                                               enqueue_flag = 1;
+                                       } else {
+                                               throughput_coop[ray_index] = throughput/probability;
+                                       }
+                               }
+                       }
+               }
+
+#ifdef __AO__
+               if(IS_STATE(ray_state, ray_index, RAY_ACTIVE)) {
+                       /* ambient occlusion */
+                       if(kernel_data.integrator.use_ambient_occlusion || (ccl_fetch(sd, flag) & SD_AO)) {
+                               /* todo: solve correlation */
+                               float bsdf_u, bsdf_v;
+                               path_state_rng_2D(kg, rng, state, PRNG_BSDF_U, &bsdf_u, &bsdf_v);
+
+                               float ao_factor = kernel_data.background.ao_factor;
+                               float3 ao_N;
+                               AOBSDF_coop[ray_index] = shader_bsdf_ao(kg, sd, ao_factor, &ao_N);
+                               AOAlpha_coop[ray_index] = shader_bsdf_alpha(kg, sd);
+
+                               float3 ao_D;
+                               float ao_pdf;
+                               sample_cos_hemisphere(ao_N, bsdf_u, bsdf_v, &ao_D, &ao_pdf);
+
+                               if(dot(ccl_fetch(sd, Ng), ao_D) > 0.0f && ao_pdf != 0.0f) {
+                                       Ray _ray;
+                                       _ray.P = ray_offset(ccl_fetch(sd, P), ccl_fetch(sd, Ng));
+                                       _ray.D = ao_D;
+                                       _ray.t = kernel_data.background.ao_distance;
+#ifdef __OBJECT_MOTION__
+                                       _ray.time = ccl_fetch(sd, time);
+#endif
+                                       _ray.dP = ccl_fetch(sd, dP);
+                                       _ray.dD = differential3_zero();
+                                       AOLightRay_coop[ray_index] = _ray;
+
+                                       ADD_RAY_FLAG(ray_state, ray_index, RAY_SHADOW_RAY_CAST_AO);
+                                       enqueue_flag_AO_SHADOW_RAY_CAST = 1;
+                               }
+                       }
+               }
+#endif
+#ifndef __COMPUTE_DEVICE_GPU__
+       }
+#endif
+
+       /* Enqueue RAY_UPDATE_BUFFER rays */
+       enqueue_ray_index_local(ray_index, QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS, enqueue_flag, queuesize, &local_queue_atomics_bg, Queue_data, Queue_index);
+#ifdef __AO__
+       /* Enqueue to-shadow-ray-cast rays */
+       enqueue_ray_index_local(ray_index, QUEUE_SHADOW_RAY_CAST_AO_RAYS, enqueue_flag_AO_SHADOW_RAY_CAST, queuesize, &local_queue_atomics_ao, Queue_data, Queue_index);
+#endif
+}
diff --git a/intern/cycles/kernel/kernel_lamp_emission.cl b/intern/cycles/kernel/kernel_lamp_emission.cl
new file mode 100644 (file)
index 0000000..e7f8b22
--- /dev/null
@@ -0,0 +1,209 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split.h"
+
+/*
+ * Note on kernel_ocl_path_trace_lamp_emission
+ * This is the 3rd kernel in the ray-tracing logic. This is the second of the
+ * path-iteration kernels. This kernel takes care of the indirect lamp emission logic.
+ * This kernel operates on QUEUE_ACTIVE_AND_REGENERATED_RAYS. It processes rays of state RAY_ACTIVE
+ * and RAY_HIT_BACKGROUND.
+ * We will empty QUEUE_ACTIVE_AND_REGENERATED_RAYS queue in this kernel.
+ * The input/output of the kernel is as follows,
+ * Throughput_coop ------------------------------------|--- kernel_ocl_path_trace_lamp_emission --|--- PathRadiance_coop
+ * Ray_coop -------------------------------------------|                                          |--- Queue_data(QUEUE_ACTIVE_AND_REGENERATED_RAYS)
+ * PathState_coop -------------------------------------|                                          |--- Queue_index(QUEUE_ACTIVE_AND_REGENERATED_RAYS)
+ * kg (globals + data) --------------------------------|                                          |
+ * Intersection_coop ----------------------------------|                                          |
+ * ray_state ------------------------------------------|                                          |
+ * Queue_data (QUEUE_ACTIVE_AND_REGENERATED_RAYS) -----|                                          |
+ * Queue_index (QUEUE_ACTIVE_AND_REGENERATED_RAYS) ----|                                          |
+ * queuesize ------------------------------------------|                                          |
+ * use_queues_flag ------------------------------------|                                          |
+ * sw -------------------------------------------------|                                          |
+ * sh -------------------------------------------------|                                          |
+ * parallel_samples -----------------------------------|                                          |
+ *
+ * note : shader_data is neither input nor output. Its just filled and consumed in the same, kernel_ocl_path_trace_lamp_emission, kernel.
+ */
+__kernel void kernel_ocl_path_trace_lamp_emission(
+       ccl_global char *globals,
+       ccl_constant KernelData *data,
+       ccl_global char *shader_data,               /* Required for lamp emission */
+       ccl_global float3 *throughput_coop,         /* Required for lamp emission */
+       PathRadiance *PathRadiance_coop, /* Required for lamp emission */
+       ccl_global Ray *Ray_coop,                   /* Required for lamp emission */
+       ccl_global PathState *PathState_coop,       /* Required for lamp emission */
+       Intersection *Intersection_coop, /* Required for lamp emission */
+       ccl_global char *ray_state,                 /* Denotes the state of each ray */
+       int sw, int sh,
+       ccl_global int *Queue_data,                 /* Memory for queues */
+       ccl_global int *Queue_index,                /* Tracks the number of elements in queues */
+       int queuesize,                              /* Size (capacity) of queues */
+       ccl_global char *use_queues_flag,           /* used to decide if this kernel should use queues to fetch ray index */
+       int parallel_samples                        /* Number of samples to be processed in parallel */
+       )
+{
+       int x = get_global_id(0);
+       int y = get_global_id(1);
+
+       /* We will empty this queue in this kernel */
+       if(get_global_id(0) == 0 && get_global_id(1) == 0) {
+               Queue_index[QUEUE_ACTIVE_AND_REGENERATED_RAYS] = 0;
+       }
+
+       /* Fetch use_queues_flag */
+       ccl_local char local_use_queues_flag;
+       if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+               local_use_queues_flag = use_queues_flag[0];
+       }
+       barrier(CLK_LOCAL_MEM_FENCE);
+
+       int ray_index;
+       if(local_use_queues_flag) {
+               int thread_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+               ray_index = get_ray_index(thread_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, Queue_data, queuesize, 1);
+
+               if(ray_index == QUEUE_EMPTY_SLOT) {
+                       return;
+               }
+       } else {
+               if(x < (sw * parallel_samples) && y < sh){
+                       ray_index = x + y * (sw * parallel_samples);
+               } else {
+                       return;
+               }
+       }
+
+       if(IS_STATE(ray_state, ray_index, RAY_ACTIVE) || IS_STATE(ray_state, ray_index, RAY_HIT_BACKGROUND)) {
+               KernelGlobals *kg = (KernelGlobals *)globals;
+               ShaderData *sd = (ShaderData *)shader_data;
+               PathRadiance *L = &PathRadiance_coop[ray_index];
+
+               float3 throughput = throughput_coop[ray_index];
+               Ray ray = Ray_coop[ray_index];
+               PathState state = PathState_coop[ray_index];
+
+#ifdef __LAMP_MIS__
+               if(kernel_data.integrator.use_lamp_mis && !(state.flag & PATH_RAY_CAMERA)) {
+                       /* ray starting from previous non-transparent bounce */
+                       Ray light_ray;
+
+                       light_ray.P = ray.P - state.ray_t*ray.D;
+                       state.ray_t += Intersection_coop[ray_index].t;
+                       light_ray.D = ray.D;
+                       light_ray.t = state.ray_t;
+                       light_ray.time = ray.time;
+                       light_ray.dD = ray.dD;
+                       light_ray.dP = ray.dP;
+                       /* intersect with lamp */
+                       float3 emission;
+
+                       if(indirect_lamp_emission(kg, &state, &light_ray, &emission, sd)) {
+                               path_radiance_accum_emission(L, throughput, emission, state.bounce);
+                       }
+               }
+#endif
+               /* __VOLUME__ feature is disabled */
+#if 0
+#ifdef __VOLUME__
+               /* volume attenuation, emission, scatter */
+               if(state.volume_stack[0].shader != SHADER_NONE) {
+                       Ray volume_ray = ray;
+                       volume_ray.t = (hit)? isect.t: FLT_MAX;
+
+                       bool heterogeneous = volume_stack_is_heterogeneous(kg, state.volume_stack);
+
+#ifdef __VOLUME_DECOUPLED__
+                       int sampling_method = volume_stack_sampling_method(kg, state.volume_stack);
+                       bool decoupled = kernel_volume_use_decoupled(kg, heterogeneous, true, sampling_method);
+
+                       if(decoupled) {
+                               /* cache steps along volume for repeated sampling */
+                               VolumeSegment volume_segment;
+                               ShaderData volume_sd;
+
+                               shader_setup_from_volume(kg, &volume_sd, &volume_ray, state.bounce, state.transparent_bounce);
+                               kernel_volume_decoupled_record(kg, &state,
+                                       &volume_ray, &volume_sd, &volume_segment, heterogeneous);
+
+                               volume_segment.sampling_method = sampling_method;
+
+                               /* emission */
+                               if(volume_segment.closure_flag & SD_EMISSION)
+                                       path_radiance_accum_emission(&L, throughput, volume_segment.accum_emission, state.bounce);
+
+                               /* scattering */
+                               VolumeIntegrateResult result = VOLUME_PATH_ATTENUATED;
+
+                               if(volume_segment.closure_flag & SD_SCATTER) {
+                                       bool all = false;
+
+                                       /* direct light sampling */
+                                       kernel_branched_path_volume_connect_light(kg, rng, &volume_sd,
+                                               throughput, &state, &L, 1.0f, all, &volume_ray, &volume_segment);
+
+                                       /* indirect sample. if we use distance sampling and take just
+                                        * one sample for direct and indirect light, we could share
+                                        * this computation, but makes code a bit complex */
+                                       float rphase = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_PHASE);
+                                       float rscatter = path_state_rng_1D_for_decision(kg, rng, &state, PRNG_SCATTER_DISTANCE);
+
+                                       result = kernel_volume_decoupled_scatter(kg,
+                                               &state, &volume_ray, &volume_sd, &throughput,
+                                               rphase, rscatter, &volume_segment, NULL, true);
+                               }
+
+                               if(result != VOLUME_PATH_SCATTERED)
+                                       throughput *= volume_segment.accum_transmittance;
+
+                               /* free cached steps */
+                               kernel_volume_decoupled_free(kg, &volume_segment);
+
+                               if(result == VOLUME_PATH_SCATTERED) {
+                                       if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, &L, &ray))
+                                               continue;
+                                       else
+                                               break;
+                               }
+                       }
+                       else
+#endif
+                       {
+                               /* integrate along volume segment with distance sampling */
+                               ShaderData volume_sd;
+                               VolumeIntegrateResult result = kernel_volume_integrate(
+                                       kg, &state, &volume_sd, &volume_ray, &L, &throughput, rng, heterogeneous);
+
+#ifdef __VOLUME_SCATTER__
+                               if(result == VOLUME_PATH_SCATTERED) {
+                                       /* direct lighting */
+                                       kernel_path_volume_connect_light(kg, rng, &volume_sd, throughput, &state, &L);
+
+                                       /* indirect light bounce */
+                                       if(kernel_path_volume_bounce(kg, rng, &volume_sd, &throughput, &state, &L, &ray))
+                                               continue;
+                                       else
+                                               break;
+                               }
+#endif
+                       }
+               }
+#endif
+#endif
+       }
+}
diff --git a/intern/cycles/kernel/kernel_next_iteration_setup.cl b/intern/cycles/kernel/kernel_next_iteration_setup.cl
new file mode 100644 (file)
index 0000000..49562ca
--- /dev/null
@@ -0,0 +1,176 @@
+/*
+ * Copyright 2011-2015 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernel_split.h"
+
+/*
+ * Note on kernel_ocl_path_trace_setup_next_iteration kernel.
+ * This is the tenth kernel in the ray tracing logic. This is the ninth
+ * of the path iteration kernels. This kernel takes care of setting up
+ * Ray for the next iteration of path-iteration and accumulating radiance
+ * corresponding to AO and direct-lighting
+ *
+ * Ray state of rays that are terminated in this kernel are changed to RAY_UPDATE_BUFFER
+ *
+ * The input and output are as follows,
+ *
+ * rng_coop ---------------------------------------------|--- kernel_ocl_path_trace_setup_next_iteration -|--- Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS)
+ * throughput_coop --------------------------------------|                                                |--- Queue_data (QUEUE_HITBF_BUFF_UPDATE_TOREGEN_RAYS)
+ * PathRadiance_coop ------------------------------------|                                                |--- throughput_coop
+ * PathState_coop ---------------------------------------|                                                |--- PathRadiance_coop
+ * shader_data ------------------------------------------|                                                |--- PathState_coop
+ * ray_state --------------------------------------------|                                                |--- ray_state
+ * Queue_data (QUEUE_ACTIVE_AND_REGENERATD_RAYS) --------|                                                |--- Ray_coop
+ * Queue_index (QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS) ---|                                                |--- use_queues_flag
+ * Ray_coop ---------------------------------------------|                                                |
+ * kg (globals + data) ----------------------------------|                                                |
+ * LightRay_dl_coop -------------------------------------|
+ * ISLamp_coop ------------------------------------------|
+ * BSDFEval_coop ----------------------------------------|
+ * LightRay_ao_coop -------------------------------------|
+ * AOBSDF_coop ------------------------------------------|
+ * AOAlpha_coop -----------------------------------------|
+ *
+ * Note on queues,
+ * This kernel fetches rays from the queue QUEUE_ACTIVE_AND_REGENERATED_RAYS and processes only
+ * the rays of state RAY_ACTIVE.
+ * There are different points in this kernel where a ray may terminate and reach RAY_UPDATE_BUFF
+ * state. These rays are enqueued into QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS queue. These rays will
+ * still be present in QUEUE_ACTIVE_AND_REGENERATED_RAYS queue, but since their ray-state has been
+ * changed to RAY_UPDATE_BUFF, there is no problem.
+ *
+ * State of queues when this kernel is called :
+ * At entry,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED, RAY_UPDATE_BUFFER rays.
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and RAY_UPDATE_BUFFER rays
+ * At exit,
+ * QUEUE_ACTIVE_AND_REGENERATED_RAYS will be filled with RAY_ACTIVE, RAY_REGENERATED and more RAY_UPDATE_BUFFER rays.
+ * QUEUE_HITBG_BUFF_UPDATE_TOREGEN_RAYS will be filled with RAY_TO_REGENERATE and more RAY_UPDATE_BUFFER rays
+ */
+
+__kernel void kernel_ocl_path_trace_setup_next_iteration(
+       ccl_global char *globals,
+       ccl_constant KernelData *data,
+       ccl_global char *shader_data,               /* Required for setting up ray for next iteration */
+       ccl_global uint *rng_coop,                  /* Required for setting up ray for next iteration */
+       ccl_global float3 *throughput_coop,         /* Required for setting up ray for next iteration */
+       PathRadiance *PathRadiance_coop, /* Required for setting up ray for next iteration */
+       ccl_global Ray *Ray_coop,                   /* Required for setting up ray for next iteration */
+       ccl_global PathState *PathState_coop,       /* Required for setting up ray for next iteration */
+       ccl_global Ray *LightRay_dl_coop,           /* Required for radiance update - direct lighting */
+       ccl_global int *ISLamp_coop,                /* Required for radiance update - direct lighting */
+       ccl_global BsdfEval *BSDFEval_coop,         /* Required for radiance update - direct lighting */
+       ccl_global Ray *LightRay_ao_coop,           /* Required for radiance update - AO */
+       ccl_global float3 *AOBSDF_coop,             /* Required for radiance update - AO */
+       ccl_global float3 *AOAlpha_coop,            /* Required for radiance update - AO */
+       ccl_global char *ray_state,                 /* Denotes the state of each ray */
+       ccl_global int *Queue_data,                 /* Queue memory */
+       ccl_global int *Queue_index,                /* Tracks the number of elements in each queue */
+       int queuesize,                              /* Size (capacity) of each queue */
+       ccl_global char *use_queues_flag            /* flag to decide if scene_intersect kernel should use queues to fetch ray index */
+       )
+{
+
+       ccl_local unsigned int local_queue_atomics;
+       if(get_local_id(0) == 0 && get_local_id(1) == 0) {
+               local_queue_atomics = 0;
+       }
+       barrier(CLK_LOCAL_MEM_FENCE);
+
+       if(get_global_id(0) == 0 && get_global_id(1) == 0) {
+               /* If we are here, then it means that scene-intersect kernel
+               * has already been executed atleast once. From the next time,
+               * scene-intersect kernel may operate on queues to fetch ray index
+               */
+               use_queues_flag[0] = 1;
+
+               /* Mark queue indices of QUEUE_SHADOW_RAY_CAST_AO_RAYS and QUEUE_SHADOW_RAY_CAST_DL_RAYS
+                * queues that were made empty during the previous kernel
+                */
+               Queue_index[QUEUE_SHADOW_RAY_CAST_AO_RAYS] = 0;
+               Queue_index[QUEUE_SHADOW_RAY_CAST_DL_RAYS] = 0;
+       }
+
+       char enqueue_flag = 0;
+       int ray_index = get_global_id(1) * get_global_size(0) + get_global_id(0);
+       ray_index = get_ray_index(ray_index, QUEUE_ACTIVE_AND_REGENERATED_RAYS, Queue_data, queuesize, 0);
+
+#ifdef __COMPUTE_DEVICE_GPU__
+       /* If we are executing on a GPU device, we exit all threads that are not required
+        * If we are executing on a CPU device, then we need to keep all threads active
+        * since we have barrier() calls later in the kernel. CPU devices,
+        * expect all threads to execute barrier statement.
+        */
+       if(ray_index == QUEUE_EMPTY_SLOT)
+               return;
+#endif
+
+#ifndef __COMPUTE_DEVICE_GPU__