intern/cycles/device/device_opencl.cpp
1 /*
2  * Copyright 2011-2013 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #ifdef WITH_OPENCL
18
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22
23 #include "clew.h"
24
25 #include "device.h"
26 #include "device_intern.h"
27
28 #include "buffers.h"
29
30 #include "util_foreach.h"
31 #include "util_logging.h"
32 #include "util_map.h"
33 #include "util_math.h"
34 #include "util_md5.h"
35 #include "util_opengl.h"
36 #include "util_path.h"
37 #include "util_time.h"
38
39 CCL_NAMESPACE_BEGIN
40
41 #define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p))
42
43 /* Macro declarations used with split kernel */
44
45 /* Macro to enable/disable work-stealing */
46 #define __WORK_STEALING__
47
48 #define SPLIT_KERNEL_LOCAL_SIZE_X 64
49 #define SPLIT_KERNEL_LOCAL_SIZE_Y 1
50
51 /* This value may be tuned according to the scene being rendered.
52  *
53  * Tuning PATH_ITER_INC_FACTOR in proportion to the expected number of
54  * ray bounces improves performance.
55  */
56 #define PATH_ITER_INC_FACTOR 8
57
58 /* When allocating global memory in chunks, we may not be able to
59  * allocate exactly CL_DEVICE_MAX_MEM_ALLOC_SIZE bytes per chunk,
60  * since some bytes may be needed for aligning the chunks of memory.
61  * This is the amount of memory that we dedicate for that purpose.
62  */
63 #define DATA_ALLOCATION_MEM_FACTOR 5000000 //5MB
64
65 struct OpenCLPlatformDevice {
66         OpenCLPlatformDevice(cl_platform_id platform_id,
67                              cl_device_id device_id)
68           : platform_id(platform_id), device_id(device_id) {}
69         cl_platform_id platform_id;
70         cl_device_id device_id;
71 };
72
73 namespace {
74
75 cl_device_type opencl_device_type()
76 {
77         char *device = getenv("CYCLES_OPENCL_TEST");
78
79         if(device) {
80                 if(strcmp(device, "ALL") == 0)
81                         return CL_DEVICE_TYPE_ALL;
82                 else if(strcmp(device, "DEFAULT") == 0)
83                         return CL_DEVICE_TYPE_DEFAULT;
84                 else if(strcmp(device, "CPU") == 0)
85                         return CL_DEVICE_TYPE_CPU;
86                 else if(strcmp(device, "GPU") == 0)
87                         return CL_DEVICE_TYPE_GPU;
88                 else if(strcmp(device, "ACCELERATOR") == 0)
89                         return CL_DEVICE_TYPE_ACCELERATOR;
90         }
91
92         return CL_DEVICE_TYPE_ALL;
93 }
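/* Example (a sketch, assuming a POSIX shell): restricting Cycles to GPU
 * devices can be done by setting the variable before launching Blender:
 *
 *   CYCLES_OPENCL_TEST=GPU blender
 *
 * Unrecognized values fall through to CL_DEVICE_TYPE_ALL. Note that setting
 * CYCLES_OPENCL_TEST also makes opencl_get_usable_devices() below consider
 * devices from all platforms.
 */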
94
95 bool opencl_kernel_use_debug()
96 {
97         return (getenv("CYCLES_OPENCL_DEBUG") != NULL);
98 }
99
100 bool opencl_kernel_use_advanced_shading(const string& platform)
101 {
102         /* keep this in sync with kernel_types.h! */
103         if(platform == "NVIDIA CUDA")
104                 return true;
105         else if(platform == "Apple")
106                 return false;
107         else if(platform == "AMD Accelerated Parallel Processing")
108                 return true;
109         else if(platform == "Intel(R) OpenCL")
110                 return true;
111
112         return false;
113 }
114
115 bool opencl_kernel_use_split(const string& platform_name,
116                              const cl_device_type device_type)
117 {
118         if(getenv("CYCLES_OPENCL_SPLIT_KERNEL_TEST") != NULL) {
119                 return true;
120         }
121         /* TODO(sergey): Replace string lookups with a more enum-like API,
122          * similar to the device/vendor checks in Blender's GPU module.
123          */
124         if(platform_name == "AMD Accelerated Parallel Processing" &&
125            device_type == CL_DEVICE_TYPE_GPU)
126         {
127                 return true;
128         }
129         return false;
130 }
131
132 bool opencl_device_supported(const string& platform_name,
133                              const cl_device_id device_id)
134 {
135         cl_device_type device_type;
136         clGetDeviceInfo(device_id,
137                         CL_DEVICE_TYPE,
138                         sizeof(cl_device_type),
139                         &device_type,
140                         NULL);
141         if(platform_name == "AMD Accelerated Parallel Processing" &&
142            device_type == CL_DEVICE_TYPE_GPU)
143         {
144                 return true;
145         }
146         return false;
147 }
148
149 bool opencl_platform_version_check(cl_platform_id platform,
150                                    string *error = NULL)
151 {
152         const int req_major = 1, req_minor = 1;
153         int major, minor;
154         char version[256];
155         clGetPlatformInfo(platform,
156                           CL_PLATFORM_VERSION,
157                           sizeof(version),
158                           &version,
159                           NULL);
160         if(sscanf(version, "OpenCL %d.%d", &major, &minor) < 2) {
161                 if(error != NULL) {
162                         *error = string_printf("OpenCL: failed to parse platform version string (%s).", version);
163                 }
164                 return false;
165         }
166         if(!((major == req_major && minor >= req_minor) || (major > req_major))) {
167                 if(error != NULL) {
168                         *error = string_printf("OpenCL: platform version 1.1 or later required, found %d.%d", major, minor);
169                 }
170                 return false;
171         }
172         if(error != NULL) {
173                 *error = "";
174         }
175         return true;
176 }
177
178 bool opencl_device_version_check(cl_device_id device,
179                                  string *error = NULL)
180 {
181         const int req_major = 1, req_minor = 1;
182         int major, minor;
183         char version[256];
184         clGetDeviceInfo(device,
185                         CL_DEVICE_OPENCL_C_VERSION,
186                         sizeof(version),
187                         &version,
188                         NULL);
189         if(sscanf(version, "OpenCL C %d.%d", &major, &minor) < 2) {
190                 if(error != NULL) {
191                         *error = string_printf("OpenCL: failed to parse OpenCL C version string (%s).", version);
192                 }
193                 return false;
194         }
195         if(!((major == req_major && minor >= req_minor) || (major > req_major))) {
196                 if(error != NULL) {
197                         *error = string_printf("OpenCL: C version 1.1 or later required, found %d.%d", major, minor);
198                 }
199                 return false;
200         }
201         if(error != NULL) {
202                 *error = "";
203         }
204         return true;
205 }
206
207 void opencl_get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices)
208 {
209         const bool force_all_platforms =
210                 (getenv("CYCLES_OPENCL_TEST") != NULL) ||
211                 (getenv("CYCLES_OPENCL_SPLIT_KERNEL_TEST") != NULL);
212         const cl_device_type device_type = opencl_device_type();
213
214         vector<cl_device_id> device_ids;
215         cl_uint num_devices = 0;
216         vector<cl_platform_id> platform_ids;
217         cl_uint num_platforms = 0;
218
219         /* Number of the devices added to the device info list. */
220         cl_uint num_added_devices = 0;
221
222         /* Get devices. */
223         if(clGetPlatformIDs(0, NULL, &num_platforms) != CL_SUCCESS ||
224            num_platforms == 0)
225         {
226                 return;
227         }
228         platform_ids.resize(num_platforms);
229         if(clGetPlatformIDs(num_platforms, &platform_ids[0], NULL) != CL_SUCCESS) {
230                 return;
231         }
232         /* Devices are numbered consecutively across platforms. */
233         int num_base = 0;
234         for(int platform = 0;
235             platform < num_platforms;
236             platform++, num_base += num_added_devices)
237         {
238                 cl_platform_id platform_id = platform_ids[platform];
239                 num_devices = num_added_devices = 0;
240                 if(clGetDeviceIDs(platform_id,
241                                   device_type,
242                                   0,
243                                   NULL,
244                                   &num_devices) != CL_SUCCESS || num_devices == 0)
245                 {
246                         continue;
247                 }
248                 device_ids.resize(num_devices);
249                 if(clGetDeviceIDs(platform_id,
250                                   device_type,
251                                   num_devices,
252                                   &device_ids[0],
253                                   NULL) != CL_SUCCESS)
254                 {
255                         continue;
256                 }
257                 if(!opencl_platform_version_check(platform_ids[platform])) {
258                         continue;
259                 }
260                 char pname[256];
261                 clGetPlatformInfo(platform_id,
262                                   CL_PLATFORM_NAME,
263                                   sizeof(pname),
264                                   &pname,
265                                   NULL);
266                 string platform_name = pname;
267                 for(int num = 0; num < num_devices; num++) {
268                         cl_device_id device_id = device_ids[num];
269                         if(!opencl_device_version_check(device_id)) {
270                                 continue;
271                         }
272                         if(force_all_platforms ||
273                            opencl_device_supported(platform_name, device_id))
274                         {
275                                 usable_devices->push_back(OpenCLPlatformDevice(platform_id,
276                                                                                device_id));
277                         }
278                 }
279         }
280 }
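/* Note: DeviceInfo::num is later used as an index into this flat list (see
 * the OpenCLDeviceBase constructor below), so the enumeration order needs to
 * be consistent between listing devices and creating a device.
 */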
281
282 }  /* namespace */
283
284 /* thread safe cache for contexts and programs */
285 class OpenCLCache
286 {
287         struct Slot
288         {
289                 thread_mutex *mutex;
290                 cl_context context;
291                 /* cl_program for shader, bake, film_convert kernels (used in OpenCLDeviceBase) */
292                 cl_program ocl_dev_base_program;
293                 /* cl_program for megakernel (used in OpenCLDeviceMegaKernel) */
294                 cl_program ocl_dev_megakernel_program;
295
296                 Slot() : mutex(NULL), context(NULL), ocl_dev_base_program(NULL), ocl_dev_megakernel_program(NULL) {}
297
298                 Slot(const Slot &rhs)
299                         : mutex(rhs.mutex)
300                         , context(rhs.context)
301                         , ocl_dev_base_program(rhs.ocl_dev_base_program)
302                         , ocl_dev_megakernel_program(rhs.ocl_dev_megakernel_program)
303                 {
304                         /* copy can only happen in map insert, assert that */
305                         assert(mutex == NULL);
306                 }
307
308                 ~Slot()
309                 {
310                         delete mutex;
311                         mutex = NULL;
312                 }
313         };
314
315         /* key is combination of platform ID and device ID */
316         typedef pair<cl_platform_id, cl_device_id> PlatformDevicePair;
317
318         /* map of Slot objects */
319         typedef map<PlatformDevicePair, Slot> CacheMap;
320         CacheMap cache;
321
322         thread_mutex cache_lock;
323
324         /* lazy instantiate */
325         static OpenCLCache &global_instance()
326         {
327                 static OpenCLCache instance;
328                 return instance;
329         }
330
331         OpenCLCache()
332         {
333         }
334
335         ~OpenCLCache()
336         {
337                 /* Intel OpenCL bug raises SIGABRT due to pure virtual call
338                  * so this is disabled. It's not necessary to free objects
339                  * at process exit anyway.
340                  * http://software.intel.com/en-us/forums/topic/370083#comments */
341
342                 //flush();
343         }
344
345         /* Lookup something in the cache. If this returns NULL, slot_locker
346          * will be left holding the slot lock; slot_locker should refer to a
347          * default-constructed thread_scoped_lock. */
348         template<typename T>
349         static T get_something(cl_platform_id platform, cl_device_id device,
350                 T Slot::*member, thread_scoped_lock &slot_locker)
351         {
352                 assert(platform != NULL);
353
354                 OpenCLCache &self = global_instance();
355
356                 thread_scoped_lock cache_lock(self.cache_lock);
357
358                 pair<CacheMap::iterator,bool> ins = self.cache.insert(
359                         CacheMap::value_type(PlatformDevicePair(platform, device), Slot()));
360
361                 Slot &slot = ins.first->second;
362
363                 /* create slot lock only while holding cache lock */
364                 if(!slot.mutex)
365                         slot.mutex = new thread_mutex;
366
367                 /* need to unlock cache before locking slot, to allow store to complete */
368                 cache_lock.unlock();
369
370                 /* lock the slot */
371                 slot_locker = thread_scoped_lock(*slot.mutex);
372
373                 /* If the thing isn't cached */
374                 if(slot.*member == NULL) {
375                         /* return with the caller's lock holder holding the slot lock */
376                         return NULL;
377                 }
378
379                 /* the item was already cached, release the slot lock */
380                 slot_locker.unlock();
381
382                 return slot.*member;
383         }
384
385         /* store something in the cache. you MUST have tried to get the item before storing to it */
386         template<typename T>
387         static void store_something(cl_platform_id platform, cl_device_id device, T thing,
388                 T Slot::*member, thread_scoped_lock &slot_locker)
389         {
390                 assert(platform != NULL);
391                 assert(device != NULL);
392                 assert(thing != NULL);
393
394                 OpenCLCache &self = global_instance();
395
396                 thread_scoped_lock cache_lock(self.cache_lock);
397                 CacheMap::iterator i = self.cache.find(PlatformDevicePair(platform, device));
398                 cache_lock.unlock();
399
400                 /* sanity check: the item must have been looked up before storing */
401                 assert(i != self.cache.end());
402
403                 Slot &slot = i->second;
404                 assert(slot.*member == NULL);
405
406                 slot.*member = thing;
407
408                 /* unlock the slot */
409                 slot_locker.unlock();
410         }
411
412 public:
413
414         enum ProgramName {
415                 OCL_DEV_BASE_PROGRAM,
416                 OCL_DEV_MEGAKERNEL_PROGRAM,
417         };
418
419         /* see get_something comment */
420         static cl_context get_context(cl_platform_id platform, cl_device_id device,
421                 thread_scoped_lock &slot_locker)
422         {
423                 cl_context context = get_something<cl_context>(platform, device, &Slot::context, slot_locker);
424
425                 if(!context)
426                         return NULL;
427
428                 /* caller is going to release it when done with it, so retain it */
429                 cl_int ciErr = clRetainContext(context);
430                 assert(ciErr == CL_SUCCESS);
431                 (void)ciErr;
432
433                 return context;
434         }
435
436         /* see get_something comment */
437         static cl_program get_program(cl_platform_id platform, cl_device_id device, ProgramName program_name,
438                 thread_scoped_lock &slot_locker)
439         {
440                 cl_program program = NULL;
441
442                 if(program_name == OCL_DEV_BASE_PROGRAM) {
443                         /* Get program related to OpenCLDeviceBase */
444                         program = get_something<cl_program>(platform, device, &Slot::ocl_dev_base_program, slot_locker);
445                 }
446                 else if(program_name == OCL_DEV_MEGAKERNEL_PROGRAM) {
447                         /* Get program related to megakernel */
448                         program = get_something<cl_program>(platform, device, &Slot::ocl_dev_megakernel_program, slot_locker);
449                 } else {
450                         assert(!"Invalid program name");
451                 }
452
453                 if(!program)
454                         return NULL;
455
456                 /* caller is going to release it when done with it, so retain it */
457                 cl_int ciErr = clRetainProgram(program);
458                 assert(ciErr == CL_SUCCESS);
459                 (void)ciErr;
460
461                 return program;
462         }
463
464         /* see store_something comment */
465         static void store_context(cl_platform_id platform, cl_device_id device, cl_context context,
466                 thread_scoped_lock &slot_locker)
467         {
468                 store_something<cl_context>(platform, device, context, &Slot::context, slot_locker);
469
470                 /* increment reference count in OpenCL.
471                  * The caller is going to release the object when done with it. */
472                 cl_int ciErr = clRetainContext(context);
473                 assert(ciErr == CL_SUCCESS);
474                 (void)ciErr;
475         }
476
477         /* see store_something comment */
478         static void store_program(cl_platform_id platform, cl_device_id device, cl_program program, ProgramName program_name,
479                 thread_scoped_lock &slot_locker)
480         {
481                 if(program_name == OCL_DEV_BASE_PROGRAM) {
482                         store_something<cl_program>(platform, device, program, &Slot::ocl_dev_base_program, slot_locker);
483                 }
484                 else if(program_name == OCL_DEV_MEGAKERNEL_PROGRAM) {
485                         store_something<cl_program>(platform, device, program, &Slot::ocl_dev_megakernel_program, slot_locker);
486                 } else {
487                         assert(!"Invalid program name");
488                         return;
489                 }
490
491                 /* increment reference count in OpenCL.
492                  * The caller is going to release the object when done with it. */
493                 cl_int ciErr = clRetainProgram(program);
494                 assert(ciErr == CL_SUCCESS);
495                 (void)ciErr;
496         }
497
498         /* discard all cached contexts and programs
499          * the parameter is a temporary workaround. See OpenCLCache::~OpenCLCache */
500         static void flush()
501         {
502                 OpenCLCache &self = global_instance();
503                 thread_scoped_lock cache_lock(self.cache_lock);
504
505                 foreach(CacheMap::value_type &item, self.cache) {
506                         if(item.second.ocl_dev_base_program != NULL)
507                                 clReleaseProgram(item.second.ocl_dev_base_program);
508                         if(item.second.ocl_dev_megakernel_program != NULL)
509                                 clReleaseProgram(item.second.ocl_dev_megakernel_program);
510                         if(item.second.context != NULL)
511                                 clReleaseContext(item.second.context);
512                 }
513
514                 self.cache.clear();
515         }
516 };
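/* Typical usage pattern for the cache (a sketch of what the device code below
 * does with contexts; programs follow the same get/compile/store protocol):
 *
 *   thread_scoped_lock cache_locker;
 *   cl_context context = OpenCLCache::get_context(platform, device, cache_locker);
 *   if(context == NULL) {
 *           // Cache miss: cache_locker still holds the slot lock, so only one
 *           // thread creates the context, then publishes it for the others.
 *           context = clCreateContext(...);
 *           OpenCLCache::store_context(platform, device, context, cache_locker);
 *   }
 */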
517
518 class OpenCLDeviceBase : public Device
519 {
520 public:
521         DedicatedTaskPool task_pool;
522         cl_context cxContext;
523         cl_command_queue cqCommandQueue;
524         cl_platform_id cpPlatform;
525         cl_device_id cdDevice;
526         cl_program cpProgram;
527         cl_kernel ckFilmConvertByteKernel;
528         cl_kernel ckFilmConvertHalfFloatKernel;
529         cl_kernel ckShaderKernel;
530         cl_kernel ckBakeKernel;
531         cl_int ciErr;
532
533         typedef map<string, device_vector<uchar>*> ConstMemMap;
534         typedef map<string, device_ptr> MemMap;
535
536         ConstMemMap const_mem_map;
537         MemMap mem_map;
538         device_ptr null_mem;
539
540         bool device_initialized;
541         string platform_name;
542
543         bool opencl_error(cl_int err)
544         {
545                 if(err != CL_SUCCESS) {
546                         string message = string_printf("OpenCL error (%d): %s", err, clewErrorString(err));
547                         if(error_msg == "")
548                                 error_msg = message;
549                         fprintf(stderr, "%s\n", message.c_str());
550                         return true;
551                 }
552
553                 return false;
554         }
555
556         void opencl_error(const string& message)
557         {
558                 if(error_msg == "")
559                         error_msg = message;
560                 fprintf(stderr, "%s\n", message.c_str());
561         }
562
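        /* Wrap an OpenCL call, recording and printing the error on failure without
         * aborting. The trailing (void)0 is there so that the macro invocation has
         * to be terminated with a semicolon at the call site. */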
563 #define opencl_assert(stmt) \
564         { \
565                 cl_int err = stmt; \
566                 \
567                 if(err != CL_SUCCESS) { \
568                         string message = string_printf("OpenCL error: %s in %s", clewErrorString(err), #stmt); \
569                         if(error_msg == "") \
570                                 error_msg = message; \
571                         fprintf(stderr, "%s\n", message.c_str()); \
572                 } \
573         } (void)0
574
575         void opencl_assert_err(cl_int err, const char* where)
576         {
577                 if(err != CL_SUCCESS) {
578                         string message = string_printf("OpenCL error (%d): %s in %s", err, clewErrorString(err), where);
579                         if(error_msg == "")
580                                 error_msg = message;
581                         fprintf(stderr, "%s\n", message.c_str());
582 #ifndef NDEBUG
583                         abort();
584 #endif
585                 }
586         }
587
588         OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool background_)
589         : Device(info, stats, background_)
590         {
591                 cpPlatform = NULL;
592                 cdDevice = NULL;
593                 cxContext = NULL;
594                 cqCommandQueue = NULL;
595                 cpProgram = NULL;
596                 ckFilmConvertByteKernel = NULL;
597                 ckFilmConvertHalfFloatKernel = NULL;
598                 ckShaderKernel = NULL;
599                 ckBakeKernel = NULL;
600                 null_mem = 0;
601                 device_initialized = false;
602
603                 vector<OpenCLPlatformDevice> usable_devices;
604                 opencl_get_usable_devices(&usable_devices);
605                 if(usable_devices.size() == 0) {
606                         opencl_error("OpenCL: no devices found.");
607                         return;
608                 }
609                 assert(info.num < usable_devices.size());
610                 OpenCLPlatformDevice& platform_device = usable_devices[info.num];
611                 cpPlatform = platform_device.platform_id;
612                 cdDevice = platform_device.device_id;
613                 char name[256];
614                 clGetPlatformInfo(cpPlatform, CL_PLATFORM_NAME, sizeof(name), &name, NULL);
615                 platform_name = name;
616
617                 {
618                         /* try to use cached context */
619                         thread_scoped_lock cache_locker;
620                         cxContext = OpenCLCache::get_context(cpPlatform, cdDevice, cache_locker);
621
622                         if(cxContext == NULL) {
623                                 /* create context properties array to specify platform */
624                                 const cl_context_properties context_props[] = {
625                                         CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform,
626                                         0, 0
627                                 };
628
629                                 /* create context */
630                                 cxContext = clCreateContext(context_props, 1, &cdDevice,
631                                         context_notify_callback, cdDevice, &ciErr);
632
633                                 if(opencl_error(ciErr)) {
634                                         opencl_error("OpenCL: clCreateContext failed");
635                                         return;
636                                 }
637
638                                 /* cache it */
639                                 OpenCLCache::store_context(cpPlatform, cdDevice, cxContext, cache_locker);
640                         }
641                 }
642
643                 cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr);
644                 if(opencl_error(ciErr))
645                         return;
646
647                 null_mem = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_ONLY, 1, NULL, &ciErr);
648                 if(opencl_error(ciErr))
649                         return;
650
651                 fprintf(stderr, "Device init success\n");
652                 device_initialized = true;
653         }
654
655         static void CL_CALLBACK context_notify_callback(const char *err_info,
656                 const void * /*private_info*/, size_t /*cb*/, void *user_data)
657         {
658                 char name[256];
659                 clGetDeviceInfo((cl_device_id)user_data, CL_DEVICE_NAME, sizeof(name), &name, NULL);
660
661                 fprintf(stderr, "OpenCL error (%s): %s\n", name, err_info);
662         }
663
664         bool opencl_version_check()
665         {
666                 string error;
667                 if(!opencl_platform_version_check(cpPlatform, &error)) {
668                         opencl_error(error);
669                         return false;
670                 }
671                 if(!opencl_device_version_check(cdDevice, &error)) {
672                         opencl_error(error);
673                         return false;
674                 }
675                 return true;
676         }
677
678         bool load_binary(const string& /*kernel_path*/,
679                          const string& clbin,
680                          string custom_kernel_build_options,
681                          cl_program *program,
682                          const string *debug_src = NULL)
683         {
684                 /* read binary into memory */
685                 vector<uint8_t> binary;
686
687                 if(!path_read_binary(clbin, binary)) {
688                         opencl_error(string_printf("OpenCL failed to read cached binary %s.", clbin.c_str()));
689                         return false;
690                 }
691
692                 /* create program */
693                 cl_int status;
694                 size_t size = binary.size();
695                 const uint8_t *bytes = &binary[0];
696
697                 *program = clCreateProgramWithBinary(cxContext, 1, &cdDevice,
698                         &size, &bytes, &status, &ciErr);
699
700                 if(opencl_error(status) || opencl_error(ciErr)) {
701                         opencl_error(string_printf("OpenCL failed to create program from cached binary %s.", clbin.c_str()));
702                         return false;
703                 }
704
705                 if(!build_kernel(program, custom_kernel_build_options, debug_src))
706                         return false;
707
708                 return true;
709         }
710
711         bool save_binary(cl_program *program, const string& clbin)
712         {
713                 size_t size = 0;
714                 clGetProgramInfo(*program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL);
715
716                 if(!size)
717                         return false;
718
719                 vector<uint8_t> binary(size);
720                 uint8_t *bytes = &binary[0];
721
722                 clGetProgramInfo(*program, CL_PROGRAM_BINARIES, sizeof(uint8_t*), &bytes, NULL);
723
724                 if(!path_write_binary(clbin, binary)) {
725                         opencl_error(string_printf("OpenCL failed to write cached binary %s.", clbin.c_str()));
726                         return false;
727                 }
728
729                 return true;
730         }
731
732         bool build_kernel(cl_program *kernel_program,
733                           string custom_kernel_build_options,
734                           const string *debug_src = NULL)
735         {
736                 string build_options;
737                 build_options = kernel_build_options(debug_src) + custom_kernel_build_options;
738
739                 ciErr = clBuildProgram(*kernel_program, 0, NULL, build_options.c_str(), NULL, NULL);
740
741                 /* show warnings even if build is successful */
742                 size_t ret_val_size = 0;
743
744                 clGetProgramBuildInfo(*kernel_program, cdDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
745
746                 if(ret_val_size > 1) {
747                         vector<char> build_log(ret_val_size + 1);
748                         clGetProgramBuildInfo(*kernel_program, cdDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, &build_log[0], NULL);
749
750                         build_log[ret_val_size] = '\0';
751                         /* Skip meaningless empty output from the NVidia compiler. */
752                         if(!(ret_val_size == 2 && build_log[0] == '\n')) {
753                                 fprintf(stderr, "OpenCL kernel build output:\n");
754                                 fprintf(stderr, "%s\n", &build_log[0]);
755                         }
756                 }
757
758                 if(ciErr != CL_SUCCESS) {
759                         opencl_error("OpenCL build failed: errors in console");
760                         return false;
761                 }
762
763                 return true;
764         }
765
766         bool compile_kernel(const string& kernel_path,
767                             string source,
768                             string custom_kernel_build_options,
769                             cl_program *kernel_program,
770                             const string *debug_src = NULL)
771         {
772                 /* We compile kernels consisting of many files. Unfortunately, OpenCL
773                  * kernel caches do not seem to recognize changes in included files,
774                  * so we force a recompile on changes by adding the MD5 hash of all files. */
775                 source = path_source_replace_includes(source, kernel_path);
776
777                 if(debug_src)
778                         path_write_text(*debug_src, source);
779
780                 size_t source_len = source.size();
781                 const char *source_str = source.c_str();
782
783                 *kernel_program = clCreateProgramWithSource(cxContext, 1, &source_str, &source_len, &ciErr);
784
785                 if(opencl_error(ciErr))
786                         return false;
787
788                 double starttime = time_dt();
789                 printf("Compiling OpenCL kernel ...\n");
790                 /* TODO(sergey): Report which kernel is being compiled
791                  * as well (megakernel or which of split kernels etc..).
792                  */
793                 printf("Build flags: %s\n", custom_kernel_build_options.c_str());
794
795                 if(!build_kernel(kernel_program, custom_kernel_build_options, debug_src))
796                         return false;
797
798                 printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);
799
800                 return true;
801         }
802
803         string device_md5_hash(string kernel_custom_build_options = "")
804         {
805                 MD5Hash md5;
806                 char version[256], driver[256], name[256], vendor[256];
807
808                 clGetPlatformInfo(cpPlatform, CL_PLATFORM_VENDOR, sizeof(vendor), &vendor, NULL);
809                 clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL);
810                 clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(name), &name, NULL);
811                 clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(driver), &driver, NULL);
812
813                 md5.append((uint8_t*)vendor, strlen(vendor));
814                 md5.append((uint8_t*)version, strlen(version));
815                 md5.append((uint8_t*)name, strlen(name));
816                 md5.append((uint8_t*)driver, strlen(driver));
817
818                 string options = kernel_build_options();
819                 options += kernel_custom_build_options;
820                 md5.append((uint8_t*)options.c_str(), options.size());
821
822                 return md5.get_hex();
823         }
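        /* This hash, combined with the MD5 of the kernel sources, names the cached
         * binary on disk (cycles_kernel_<device_md5>_<kernel_md5>.clbin in
         * load_kernels() below), so any change in driver, device or build options
         * invalidates the cached binary. */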
824
825         bool load_kernels(const DeviceRequestedFeatures& /*requested_features*/)
826         {
827                 /* verify if device was initialized */
828                 if(!device_initialized) {
829                         fprintf(stderr, "OpenCL: failed to initialize device.\n");
830                         return false;
831                 }
832
833                 /* try to use cached kernel */
834                 thread_scoped_lock cache_locker;
835                 cpProgram = OpenCLCache::get_program(cpPlatform, cdDevice, OpenCLCache::OCL_DEV_BASE_PROGRAM, cache_locker);
836
837                 if(!cpProgram) {
838                         /* verify we have right opencl version */
839                         if(!opencl_version_check())
840                                 return false;
841
842                         /* md5 hash to detect changes */
843                         string kernel_path = path_get("kernel");
844                         string kernel_md5 = path_files_md5_hash(kernel_path);
845                         string device_md5 = device_md5_hash();
846
847                         /* path to cached binary */
848                         string clbin = string_printf("cycles_kernel_%s_%s.clbin", device_md5.c_str(), kernel_md5.c_str());
849                         clbin = path_user_get(path_join("cache", clbin));
850
851                         /* path to preprocessed source for debugging */
852                         string clsrc, *debug_src = NULL;
853
854                         if(opencl_kernel_use_debug()) {
855                                 clsrc = string_printf("cycles_kernel_%s_%s.cl", device_md5.c_str(), kernel_md5.c_str());
856                                 clsrc = path_user_get(path_join("cache", clsrc));
857                                 debug_src = &clsrc;
858                         }
859
860                         /* if exists already, try use it */
861                         if(path_exists(clbin) && load_binary(kernel_path, clbin, "", &cpProgram)) {
862                                 /* kernel loaded from binary */
863                         }
864                         else {
865
866                                 string init_kernel_source = "#include \"kernels/opencl/kernel.cl\" // " + kernel_md5 + "\n";
867
868                                 /* if does not exist or loading binary failed, compile kernel */
869                                 if(!compile_kernel(kernel_path, init_kernel_source, "", &cpProgram, debug_src))
870                                         return false;
871
872                                 /* save binary for reuse */
873                                 if(!save_binary(&cpProgram, clbin))
874                                         return false;
875                         }
876
877                         /* cache the program */
878                         OpenCLCache::store_program(cpPlatform, cdDevice, cpProgram, OpenCLCache::OCL_DEV_BASE_PROGRAM, cache_locker);
879                 }
880
881                 /* find kernels */
882                 ckFilmConvertByteKernel = clCreateKernel(cpProgram, "kernel_ocl_convert_to_byte", &ciErr);
883                 if(opencl_error(ciErr))
884                         return false;
885
886                 ckFilmConvertHalfFloatKernel = clCreateKernel(cpProgram, "kernel_ocl_convert_to_half_float", &ciErr);
887                 if(opencl_error(ciErr))
888                         return false;
889
890                 ckShaderKernel = clCreateKernel(cpProgram, "kernel_ocl_shader", &ciErr);
891                 if(opencl_error(ciErr))
892                         return false;
893
894                 ckBakeKernel = clCreateKernel(cpProgram, "kernel_ocl_bake", &ciErr);
895                 if(opencl_error(ciErr))
896                         return false;
897
898                 return true;
899         }
900
901         ~OpenCLDeviceBase()
902         {
903                 task_pool.stop();
904
905                 if(null_mem)
906                         clReleaseMemObject(CL_MEM_PTR(null_mem));
907
908                 ConstMemMap::iterator mt;
909                 for(mt = const_mem_map.begin(); mt != const_mem_map.end(); mt++) {
910                         mem_free(*(mt->second));
911                         delete mt->second;
912                 }
913
914                 if(ckFilmConvertByteKernel)
915                         clReleaseKernel(ckFilmConvertByteKernel);
916                 if(ckFilmConvertHalfFloatKernel)
917                         clReleaseKernel(ckFilmConvertHalfFloatKernel);
918                 if(ckShaderKernel)
919                         clReleaseKernel(ckShaderKernel);
920                 if(ckBakeKernel)
921                         clReleaseKernel(ckBakeKernel);
922                 if(cpProgram)
923                         clReleaseProgram(cpProgram);
924                 if(cqCommandQueue)
925                         clReleaseCommandQueue(cqCommandQueue);
926                 if(cxContext)
927                         clReleaseContext(cxContext);
928         }
929
930         void mem_alloc(device_memory& mem, MemoryType type)
931         {
932                 size_t size = mem.memory_size();
933
934                 cl_mem_flags mem_flag;
935                 void *mem_ptr = NULL;
936
937                 if(type == MEM_READ_ONLY)
938                         mem_flag = CL_MEM_READ_ONLY;
939                 else if(type == MEM_WRITE_ONLY)
940                         mem_flag = CL_MEM_WRITE_ONLY;
941                 else
942                         mem_flag = CL_MEM_READ_WRITE;
943
944                 /* Zero-size allocation might be invoked by render, but is not really
945                  * supported by OpenCL. Using NULL as the device pointer also doesn't
946                  * really work for some reason, so for the time being we use a special
947                  * case with the null_mem buffer.
948                  */
949                 if(size != 0) {
950                         mem.device_pointer = (device_ptr)clCreateBuffer(cxContext,
951                                                                         mem_flag,
952                                                                         size,
953                                                                         mem_ptr,
954                                                                         &ciErr);
955                         opencl_assert_err(ciErr, "clCreateBuffer");
956                 }
957                 else {
958                         mem.device_pointer = null_mem;
959                 }
960
961                 stats.mem_alloc(size);
962                 mem.device_size = size;
963         }
964
965         void mem_copy_to(device_memory& mem)
966         {
967                 /* this is blocking */
968                 size_t size = mem.memory_size();
969                 if(size != 0) {
970                         opencl_assert(clEnqueueWriteBuffer(cqCommandQueue,
971                                                            CL_MEM_PTR(mem.device_pointer),
972                                                            CL_TRUE,
973                                                            0,
974                                                            size,
975                                                            (void*)mem.data_pointer,
976                                                            0,
977                                                            NULL, NULL));
978                 }
979         }
980
981         void mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
982         {
983                 size_t offset = elem*y*w;
984                 size_t size = elem*w*h;
985                 assert(size != 0);
986                 opencl_assert(clEnqueueReadBuffer(cqCommandQueue,
987                                                   CL_MEM_PTR(mem.device_pointer),
988                                                   CL_TRUE,
989                                                   offset,
990                                                   size,
991                                                   (uchar*)mem.data_pointer + offset,
992                                                   0,
993                                                   NULL, NULL));
994         }
995
996         void mem_zero(device_memory& mem)
997         {
998                 if(mem.device_pointer) {
999                         memset((void*)mem.data_pointer, 0, mem.memory_size());
1000                         mem_copy_to(mem);
1001                 }
1002         }
1003
1004         void mem_free(device_memory& mem)
1005         {
1006                 if(mem.device_pointer) {
1007                         if(mem.device_pointer != null_mem) {
1008                                 opencl_assert(clReleaseMemObject(CL_MEM_PTR(mem.device_pointer)));
1009                         }
1010                         mem.device_pointer = 0;
1011
1012                         stats.mem_free(mem.device_size);
1013                         mem.device_size = 0;
1014                 }
1015         }
1016
1017         void const_copy_to(const char *name, void *host, size_t size)
1018         {
1019                 ConstMemMap::iterator i = const_mem_map.find(name);
1020
1021                 if(i == const_mem_map.end()) {
1022                         device_vector<uchar> *data = new device_vector<uchar>();
1023                         data->copy((uchar*)host, size);
1024
1025                         mem_alloc(*data, MEM_READ_ONLY);
1026                         i = const_mem_map.insert(ConstMemMap::value_type(name, data)).first;
1027                 }
1028                 else {
1029                         device_vector<uchar> *data = i->second;
1030                         data->copy((uchar*)host, size);
1031                 }
1032
1033                 mem_copy_to(*i->second);
1034         }
1035
1036         void tex_alloc(const char *name,
1037                        device_memory& mem,
1038                        InterpolationType /*interpolation*/,
1039                        bool /*periodic*/)
1040         {
1041                 VLOG(1) << "Texture allocate: " << name << ", " << mem.memory_size() << " bytes.";
1042                 mem_alloc(mem, MEM_READ_ONLY);
1043                 mem_copy_to(mem);
1044                 assert(mem_map.find(name) == mem_map.end());
1045                 mem_map.insert(MemMap::value_type(name, mem.device_pointer));
1046         }
1047
1048         void tex_free(device_memory& mem)
1049         {
1050                 if(mem.device_pointer) {
1051                         foreach(const MemMap::value_type& value, mem_map) {
1052                                 if(value.second == mem.device_pointer) {
1053                                         mem_map.erase(value.first);
1054                                         break;
1055                                 }
1056                         }
1057
1058                         mem_free(mem);
1059                 }
1060         }
1061
1062         size_t global_size_round_up(int group_size, int global_size)
1063         {
1064                 int r = global_size % group_size;
1065                 return global_size + ((r == 0)? 0: group_size - r);
1066         }
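        /* For example, global_size_round_up(8, 13) == 16: the global work size is
         * padded up to the next multiple of the work-group size. */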
1067
1068         void enqueue_kernel(cl_kernel kernel, size_t w, size_t h)
1069         {
1070                 size_t workgroup_size, max_work_items[3];
1071
1072                 clGetKernelWorkGroupInfo(kernel, cdDevice,
1073                         CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL);
1074                 clGetDeviceInfo(cdDevice,
1075                         CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, max_work_items, NULL);
1076         
1077                 /* try to divide evenly over 2 dimensions */
1078                 size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1);
1079                 size_t local_size[2] = {sqrt_workgroup_size, sqrt_workgroup_size};
1080
1081                 /* some implementations have max size 1 on 2nd dimension */
1082                 if(local_size[1] > max_work_items[1]) {
1083                         local_size[0] = workgroup_size/max_work_items[1];
1084                         local_size[1] = max_work_items[1];
1085                 }
1086
1087                 size_t global_size[2] = {global_size_round_up(local_size[0], w), global_size_round_up(local_size[1], h)};
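                /* Note: the computed local_size is currently not passed to
                 * clEnqueueNDRangeKernel below (the local work size argument is NULL),
                 * so the driver picks the actual work-group size; global_size only
                 * needs to be rounded up so that every pixel is covered. */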
1088
1089                 /* run kernel */
1090                 opencl_assert(clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL));
1091                 opencl_assert(clFlush(cqCommandQueue));
1092         }
1093
1094         void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name)
1095         {
1096                 cl_mem ptr;
1097
1098                 MemMap::iterator i = mem_map.find(name);
1099                 if(i != mem_map.end()) {
1100                         ptr = CL_MEM_PTR(i->second);
1101                 }
1102                 else {
1103                         /* work around NULL not working, even though the spec says otherwise */
1104                         ptr = CL_MEM_PTR(null_mem);
1105                 }
1106                 
1107                 opencl_assert(clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void*)&ptr));
1108         }
1109
1110         void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half)
1111         {
1112                 /* cast arguments to cl types */
1113                 cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
1114                 cl_mem d_rgba = (rgba_byte)? CL_MEM_PTR(rgba_byte): CL_MEM_PTR(rgba_half);
1115                 cl_mem d_buffer = CL_MEM_PTR(buffer);
1116                 cl_int d_x = task.x;
1117                 cl_int d_y = task.y;
1118                 cl_int d_w = task.w;
1119                 cl_int d_h = task.h;
1120                 cl_float d_sample_scale = 1.0f/(task.sample + 1);
1121                 cl_int d_offset = task.offset;
1122                 cl_int d_stride = task.stride;
1123
1124
1125                 cl_kernel ckFilmConvertKernel = (rgba_byte)? ckFilmConvertByteKernel: ckFilmConvertHalfFloatKernel;
1126
1127                 cl_uint start_arg_index =
1128                         kernel_set_args(ckFilmConvertKernel,
1129                                         0,
1130                                         d_data,
1131                                         d_rgba,
1132                                         d_buffer);
1133
1134 #define KERNEL_TEX(type, ttype, name) \
1135         set_kernel_arg_mem(ckFilmConvertKernel, &start_arg_index, #name);
1136 #include "kernel_textures.h"
1137 #undef KERNEL_TEX
1138
1139                 start_arg_index += kernel_set_args(ckFilmConvertKernel,
1140                                                    start_arg_index,
1141                                                    d_sample_scale,
1142                                                    d_x,
1143                                                    d_y,
1144                                                    d_w,
1145                                                    d_h,
1146                                                    d_offset,
1147                                                    d_stride);
1148
1149                 enqueue_kernel(ckFilmConvertKernel, d_w, d_h);
1150         }
1151
1152         void shader(DeviceTask& task)
1153         {
1154                 /* cast arguments to cl types */
1155                 cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
1156                 cl_mem d_input = CL_MEM_PTR(task.shader_input);
1157                 cl_mem d_output = CL_MEM_PTR(task.shader_output);
1158                 cl_int d_shader_eval_type = task.shader_eval_type;
1159                 cl_int d_shader_x = task.shader_x;
1160                 cl_int d_shader_w = task.shader_w;
1161                 cl_int d_offset = task.offset;
1162
1163                 cl_kernel kernel;
1164
1165                 if(task.shader_eval_type >= SHADER_EVAL_BAKE)
1166                         kernel = ckBakeKernel;
1167                 else
1168                         kernel = ckShaderKernel;
1169
1170                 for(int sample = 0; sample < task.num_samples; sample++) {
1171
1172                         if(task.get_cancel())
1173                                 break;
1174
1175                         cl_int d_sample = sample;
1176
1177                         cl_uint start_arg_index =
1178                                 kernel_set_args(kernel,
1179                                                 0,
1180                                                 d_data,
1181                                                 d_input,
1182                                                 d_output);
1183
1184 #define KERNEL_TEX(type, ttype, name) \
1185                 set_kernel_arg_mem(kernel, &start_arg_index, #name);
1186 #include "kernel_textures.h"
1187 #undef KERNEL_TEX
1188
1189                         start_arg_index += kernel_set_args(kernel,
1190                                                            start_arg_index,
1191                                                            d_shader_eval_type,
1192                                                            d_shader_x,
1193                                                            d_shader_w,
1194                                                            d_offset,
1195                                                            d_sample);
1196
1197                         enqueue_kernel(kernel, task.shader_w, 1);
1198
1199                         task.update_progress(NULL);
1200                 }
1201         }
1202
1203         class OpenCLDeviceTask : public DeviceTask {
1204         public:
1205                 OpenCLDeviceTask(OpenCLDeviceBase *device, DeviceTask& task)
1206                 : DeviceTask(task)
1207                 {
1208                         run = function_bind(&OpenCLDeviceBase::thread_run,
1209                                             device,
1210                                             this);
1211                 }
1212         };
1213
1214         int get_split_task_count(DeviceTask& /*task*/)
1215         {
1216                 return 1;
1217         }
1218
1219         void task_add(DeviceTask& task)
1220         {
1221                 task_pool.push(new OpenCLDeviceTask(this, task));
1222         }
1223
1224         void task_wait()
1225         {
1226                 task_pool.wait();
1227         }
1228
1229         void task_cancel()
1230         {
1231                 task_pool.cancel();
1232         }
1233
1234         virtual void thread_run(DeviceTask * /*task*/) = 0;
1235
1236 protected:
1237
1238         string kernel_build_options(const string *debug_src = NULL)
1239         {
1240                 string build_options = " -cl-fast-relaxed-math ";
1241
1242                 if(platform_name == "NVIDIA CUDA") {
1243                         build_options += "-D__KERNEL_OPENCL_NVIDIA__ "
1244                                          "-cl-nv-maxrregcount=32 "
1245                                          "-cl-nv-verbose ";
1246
1247                         uint compute_capability_major, compute_capability_minor;
1248                         clGetDeviceInfo(cdDevice, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,
1249                                         sizeof(cl_uint), &compute_capability_major, NULL);
1250                         clGetDeviceInfo(cdDevice, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,
1251                                         sizeof(cl_uint), &compute_capability_minor, NULL);
1252
1253                         build_options += string_printf("-D__COMPUTE_CAPABILITY__=%u ",
1254                                                        compute_capability_major * 100 +
1255                                                        compute_capability_minor * 10);
1256                 }
1257
1258                 else if(platform_name == "Apple")
1259                         build_options += "-D__KERNEL_OPENCL_APPLE__ ";
1260
1261                 else if(platform_name == "AMD Accelerated Parallel Processing")
1262                         build_options += "-D__KERNEL_OPENCL_AMD__ ";
1263
1264                 else if(platform_name == "Intel(R) OpenCL") {
1265                         build_options += "-D__KERNEL_OPENCL_INTEL_CPU__ ";
1266
1267                         /* Options for GDB source-level kernel debugging.
1268                          * This currently segfaults on Linux.
1269                          */
1270                         if(opencl_kernel_use_debug() && debug_src)
1271                                 build_options += "-g -s \"" + *debug_src + "\" ";
1272                 }
1273
1274                 if(opencl_kernel_use_debug())
1275                         build_options += "-D__KERNEL_OPENCL_DEBUG__ ";
1276
1277 #ifdef WITH_CYCLES_DEBUG
1278                 build_options += "-D__KERNEL_DEBUG__ ";
1279 #endif
1280
1281                 return build_options;
1282         }
1283
1284         class ArgumentWrapper {
1285         public:
1286                 ArgumentWrapper() : size(0), pointer(NULL) {}
1287                 template <typename T>
1288                 ArgumentWrapper(T& argument) : size(sizeof(argument)),
1289                                                pointer(&argument) { }
1290                 size_t size;
1291                 void *pointer;
1292         };
1293
1294         /* TODO(sergey): In the future we can use variadic templates, once
1295          * C++0x is allowed. That should allow us to clean this up a bit.
1296          */
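        /* Example (taken from film_convert() above): a call such as
         *
         *   kernel_set_args(ckFilmConvertKernel, 0, d_data, d_rgba, d_buffer);
         *
         * forwards each argument's size and address to clSetKernelArg() in order,
         * and returns the number of arguments that were set. */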
1297         int kernel_set_args(cl_kernel kernel,
1298                             int start_argument_index,
1299                             const ArgumentWrapper& arg1 = ArgumentWrapper(),
1300                             const ArgumentWrapper& arg2 = ArgumentWrapper(),
1301                             const ArgumentWrapper& arg3 = ArgumentWrapper(),
1302                             const ArgumentWrapper& arg4 = ArgumentWrapper(),
1303                             const ArgumentWrapper& arg5 = ArgumentWrapper(),
1304                             const ArgumentWrapper& arg6 = ArgumentWrapper(),
1305                             const ArgumentWrapper& arg7 = ArgumentWrapper(),
1306                             const ArgumentWrapper& arg8 = ArgumentWrapper(),
1307                             const ArgumentWrapper& arg9 = ArgumentWrapper(),
1308                             const ArgumentWrapper& arg10 = ArgumentWrapper(),
1309                             const ArgumentWrapper& arg11 = ArgumentWrapper(),
1310                             const ArgumentWrapper& arg12 = ArgumentWrapper(),
1311                             const ArgumentWrapper& arg13 = ArgumentWrapper(),
1312                             const ArgumentWrapper& arg14 = ArgumentWrapper(),
1313                             const ArgumentWrapper& arg15 = ArgumentWrapper(),
1314                             const ArgumentWrapper& arg16 = ArgumentWrapper(),
1315                             const ArgumentWrapper& arg17 = ArgumentWrapper(),
1316                             const ArgumentWrapper& arg18 = ArgumentWrapper(),
1317                             const ArgumentWrapper& arg19 = ArgumentWrapper(),
1318                             const ArgumentWrapper& arg20 = ArgumentWrapper(),
1319                             const ArgumentWrapper& arg21 = ArgumentWrapper(),
1320                             const ArgumentWrapper& arg22 = ArgumentWrapper(),
1321                             const ArgumentWrapper& arg23 = ArgumentWrapper(),
1322                             const ArgumentWrapper& arg24 = ArgumentWrapper(),
1323                             const ArgumentWrapper& arg25 = ArgumentWrapper(),
1324                             const ArgumentWrapper& arg26 = ArgumentWrapper(),
1325                             const ArgumentWrapper& arg27 = ArgumentWrapper(),
1326                             const ArgumentWrapper& arg28 = ArgumentWrapper(),
1327                             const ArgumentWrapper& arg29 = ArgumentWrapper(),
1328                             const ArgumentWrapper& arg30 = ArgumentWrapper(),
1329                             const ArgumentWrapper& arg31 = ArgumentWrapper(),
1330                             const ArgumentWrapper& arg32 = ArgumentWrapper(),
1331                             const ArgumentWrapper& arg33 = ArgumentWrapper())
1332         {
1333                 int current_arg_index = 0;
1334 #define FAKE_VARARG_HANDLE_ARG(arg) \
1335                 do { \
1336                         if(arg.pointer != NULL) { \
1337                                 opencl_assert(clSetKernelArg( \
1338                                         kernel, \
1339                                         start_argument_index + current_arg_index, \
1340                                         arg.size, arg.pointer)); \
1341                                 ++current_arg_index; \
1342                         } \
1343                         else { \
1344                                 return current_arg_index; \
1345                         } \
1346                 } while(false)
1347                 FAKE_VARARG_HANDLE_ARG(arg1);
1348                 FAKE_VARARG_HANDLE_ARG(arg2);
1349                 FAKE_VARARG_HANDLE_ARG(arg3);
1350                 FAKE_VARARG_HANDLE_ARG(arg4);
1351                 FAKE_VARARG_HANDLE_ARG(arg5);
1352                 FAKE_VARARG_HANDLE_ARG(arg6);
1353                 FAKE_VARARG_HANDLE_ARG(arg7);
1354                 FAKE_VARARG_HANDLE_ARG(arg8);
1355                 FAKE_VARARG_HANDLE_ARG(arg9);
1356                 FAKE_VARARG_HANDLE_ARG(arg10);
1357                 FAKE_VARARG_HANDLE_ARG(arg11);
1358                 FAKE_VARARG_HANDLE_ARG(arg12);
1359                 FAKE_VARARG_HANDLE_ARG(arg13);
1360                 FAKE_VARARG_HANDLE_ARG(arg14);
1361                 FAKE_VARARG_HANDLE_ARG(arg15);
1362                 FAKE_VARARG_HANDLE_ARG(arg16);
1363                 FAKE_VARARG_HANDLE_ARG(arg17);
1364                 FAKE_VARARG_HANDLE_ARG(arg18);
1365                 FAKE_VARARG_HANDLE_ARG(arg19);
1366                 FAKE_VARARG_HANDLE_ARG(arg20);
1367                 FAKE_VARARG_HANDLE_ARG(arg21);
1368                 FAKE_VARARG_HANDLE_ARG(arg22);
1369                 FAKE_VARARG_HANDLE_ARG(arg23);
1370                 FAKE_VARARG_HANDLE_ARG(arg24);
1371                 FAKE_VARARG_HANDLE_ARG(arg25);
1372                 FAKE_VARARG_HANDLE_ARG(arg26);
1373                 FAKE_VARARG_HANDLE_ARG(arg27);
1374                 FAKE_VARARG_HANDLE_ARG(arg28);
1375                 FAKE_VARARG_HANDLE_ARG(arg29);
1376                 FAKE_VARARG_HANDLE_ARG(arg30);
1377                 FAKE_VARARG_HANDLE_ARG(arg31);
1378                 FAKE_VARARG_HANDLE_ARG(arg32);
1379                 FAKE_VARARG_HANDLE_ARG(arg33);
1380 #undef FAKE_VARARG_HANDLE_ARG
1381                 return current_arg_index;
1382         }
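	/* Illustrative usage, with hypothetical argument names: kernel_set_args()
	 * forwards each non-empty ArgumentWrapper to clSetKernelArg(), stops at
	 * the first defaulted (empty) argument and returns how many arguments it
	 * set. For a kernel taking (data, buffer, sample):
	 *
	 *   int num_set = kernel_set_args(kernel, 0, d_data, d_buffer, d_sample);
	 *   // num_set == 3; the next free argument index is 0 + num_set.
	 */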
1383
1384         inline void release_kernel_safe(cl_kernel kernel)
1385         {
1386                 if(kernel) {
1387                         clReleaseKernel(kernel);
1388                 }
1389         }
1390
1391         inline void release_mem_object_safe(cl_mem mem)
1392         {
1393                 if(mem != NULL) {
1394                         clReleaseMemObject(mem);
1395                 }
1396         }
1397
1398         inline void release_program_safe(cl_program program)
1399         {
1400                 if(program) {
1401                         clReleaseProgram(program);
1402                 }
1403         }
1404
1405         string build_options_from_requested_features(
1406                 const DeviceRequestedFeatures& requested_features)
1407         {
1408                 string build_options = "";
1409                 if(requested_features.experimental) {
1410                         build_options += " -D__KERNEL_EXPERIMENTAL__";
1411                 }
1412                 build_options += " -D__NODES_MAX_GROUP__=" +
1413                         string_printf("%d", requested_features.max_nodes_group);
1414                 build_options += " -D__NODES_FEATURES__=" +
1415                         string_printf("%d", requested_features.nodes_features);
1416                 build_options += string_printf(" -D__MAX_CLOSURE__=%d",
1417                                                requested_features.max_closure);
1418                 if(!requested_features.use_hair) {
1419                         build_options += " -D__NO_HAIR__";
1420                 }
1421                 if(!requested_features.use_object_motion) {
1422                         build_options += " -D__NO_OBJECT_MOTION__";
1423                 }
1424                 if(!requested_features.use_camera_motion) {
1425                         build_options += " -D__NO_CAMERA_MOTION__";
1426                 }
1427                 return build_options;
1428         }
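	/* Illustrative example of the resulting string, for an assumed request
	 * with experimental features off, max_nodes_group=2, nodes_features=3,
	 * max_closure=64, hair disabled and both motion features enabled:
	 *
	 *   " -D__NODES_MAX_GROUP__=2 -D__NODES_FEATURES__=3 -D__MAX_CLOSURE__=64 -D__NO_HAIR__"
	 *
	 * The actual values come from DeviceRequestedFeatures at kernel load time.
	 */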
1429 };
1430
1431 class OpenCLDeviceMegaKernel : public OpenCLDeviceBase
1432 {
1433 public:
1434         cl_kernel ckPathTraceKernel;
1435         cl_program path_trace_program;
1436
1437         OpenCLDeviceMegaKernel(DeviceInfo& info, Stats &stats, bool background_)
1438         : OpenCLDeviceBase(info, stats, background_)
1439         {
1440                 ckPathTraceKernel = NULL;
1441                 path_trace_program = NULL;
1442         }
1443
1444         bool load_kernels(const DeviceRequestedFeatures& requested_features)
1445         {
1446                 /* Get shader, bake and film convert kernels.
1447                  * This also verifies that OpenCL was actually initialized.
1448                  */
1449                 if(!OpenCLDeviceBase::load_kernels(requested_features)) {
1450                         return false;
1451                 }
1452
1453                 /* Try to use cached kernel. */
1454                 thread_scoped_lock cache_locker;
1455                 path_trace_program = OpenCLCache::get_program(cpPlatform,
1456                                                               cdDevice,
1457                                                               OpenCLCache::OCL_DEV_MEGAKERNEL_PROGRAM,
1458                                                               cache_locker);
1459
1460                 if(!path_trace_program) {
1461                         /* Verify we have the right OpenCL version. */
1462                         if(!opencl_version_check())
1463                                 return false;
1464
1465                         /* Calculate md5 hash to detect changes. */
1466                         string kernel_path = path_get("kernel");
1467                         string kernel_md5 = path_files_md5_hash(kernel_path);
1468                         string custom_kernel_build_options = "-D__COMPILE_ONLY_MEGAKERNEL__ ";
1469                         string device_md5 = device_md5_hash(custom_kernel_build_options);
1470
1471                         /* Path to cached binary. */
1472                         string clbin = string_printf("cycles_kernel_%s_%s.clbin",
1473                                                      device_md5.c_str(),
1474                                                      kernel_md5.c_str());
1475                         clbin = path_user_get(path_join("cache", clbin));
1476
1477                         /* Path to preprocessed source for debugging. */
1478                         string clsrc, *debug_src = NULL;
1479                         if(opencl_kernel_use_debug()) {
1480                                 clsrc = string_printf("cycles_kernel_%s_%s.cl",
1481                                                       device_md5.c_str(),
1482                                                       kernel_md5.c_str());
1483                                 clsrc = path_user_get(path_join("cache", clsrc));
1484                                 debug_src = &clsrc;
1485                         }
1486
1487                         /* If it exists already, try to use it. */
1488                         if(path_exists(clbin) && load_binary(kernel_path,
1489                                                              clbin,
1490                                                              custom_kernel_build_options,
1491                                                              &path_trace_program,
1492                                                              debug_src)) {
1493                                 /* Kernel loaded from binary, nothing to do. */
1494                         }
1495                         else {
1496                                 string init_kernel_source = "#include \"kernels/opencl/kernel.cl\" // " +
1497                                                             kernel_md5 + "\n";
1498                                 /* If it does not exist or loading the binary failed, compile the kernel. */
1499                                 if(!compile_kernel(kernel_path,
1500                                                    init_kernel_source,
1501                                                    custom_kernel_build_options,
1502                                                    &path_trace_program,
1503                                                    debug_src))
1504                                 {
1505                                         return false;
1506                                 }
1507                                 /* Save binary for reuse. */
1508                                 if(!save_binary(&path_trace_program, clbin)) {
1509                                         return false;
1510                                 }
1511                         }
1512                         /* Cache the program. */
1513                         OpenCLCache::store_program(cpPlatform,
1514                                                    cdDevice,
1515                                                    path_trace_program,
1516                                                    OpenCLCache::OCL_DEV_MEGAKERNEL_PROGRAM,
1517                                                    cache_locker);
1518                 }
1519
1520                 /* Find kernels. */
1521                 ckPathTraceKernel = clCreateKernel(path_trace_program,
1522                                                    "kernel_ocl_path_trace",
1523                                                    &ciErr);
1524                 if(opencl_error(ciErr))
1525                         return false;
1526                 return true;
1527         }
1528
1529         ~OpenCLDeviceMegaKernel()
1530         {
1531                 task_pool.stop();
1532                 release_kernel_safe(ckPathTraceKernel);
1533                 release_program_safe(path_trace_program);
1534         }
1535
1536         void path_trace(RenderTile& rtile, int sample)
1537         {
1538                 /* Cast arguments to cl types. */
1539                 cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
1540                 cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
1541                 cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state);
1542                 cl_int d_x = rtile.x;
1543                 cl_int d_y = rtile.y;
1544                 cl_int d_w = rtile.w;
1545                 cl_int d_h = rtile.h;
1546                 cl_int d_offset = rtile.offset;
1547                 cl_int d_stride = rtile.stride;
1548
1549                 /* Sample arguments. */
1550                 cl_int d_sample = sample;
1551
1552                 cl_uint start_arg_index =
1553                         kernel_set_args(ckPathTraceKernel,
1554                                         0,
1555                                         d_data,
1556                                         d_buffer,
1557                                         d_rng_state);
1558
1559 #define KERNEL_TEX(type, ttype, name) \
1560                 set_kernel_arg_mem(ckPathTraceKernel, &start_arg_index, #name);
1561 #include "kernel_textures.h"
1562 #undef KERNEL_TEX
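		/* Each KERNEL_TEX(type, ttype, name) entry from kernel_textures.h
		 * expands to a set_kernel_arg_mem() call above, binding the named
		 * texture's cl_mem as the next kernel argument and advancing
		 * start_arg_index, so the sample/tile arguments below continue
		 * from the correct index.
		 */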
1563
1564                 start_arg_index += kernel_set_args(ckPathTraceKernel,
1565                                                    start_arg_index,
1566                                                    d_sample,
1567                                                    d_x,
1568                                                    d_y,
1569                                                    d_w,
1570                                                    d_h,
1571                                                    d_offset,
1572                                                    d_stride);
1573
1574                 enqueue_kernel(ckPathTraceKernel, d_w, d_h);
1575         }
1576
1577         void thread_run(DeviceTask *task)
1578         {
1579                 if(task->type == DeviceTask::FILM_CONVERT) {
1580                         film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half);
1581                 }
1582                 else if(task->type == DeviceTask::SHADER) {
1583                         shader(*task);
1584                 }
1585                 else if(task->type == DeviceTask::PATH_TRACE) {
1586                         RenderTile tile;
1587                         /* Keep rendering tiles until done. */
1588                         while(task->acquire_tile(this, tile)) {
1589                                 int start_sample = tile.start_sample;
1590                                 int end_sample = tile.start_sample + tile.num_samples;
1591
1592                                 for(int sample = start_sample; sample < end_sample; sample++) {
1593                                         if(task->get_cancel()) {
1594                                                 if(task->need_finish_queue == false)
1595                                                         break;
1596                                         }
1597
1598                                         path_trace(tile, sample);
1599
1600                                         tile.sample = sample + 1;
1601
1602                                         task->update_progress(&tile);
1603                                 }
1604
1605                                 /* Complete kernel execution before releasing the tile.
1606                                  * This helps in multi-device renders:
1607                                  * the device that reaches the critical-section function
1608                                  * release_tile() waits (stalling other devices from entering
1609                                  * release_tile()) for all of its kernels to complete. If device1 (a
1610                                  * slow-rendering device) reached release_tile() first, it would
1611                                  * stall device2 (a fast-rendering device) from proceeding to render
1612                                  * the next tile.
1613                                  */
1614                                 clFinish(cqCommandQueue);
1615
1616                                 task->release_tile(tile);
1617                         }
1618                 }
1619         }
1620 };
1621
1622 /* TODO(sergey): This is to keep tile splitting at the OpenCL level working
1623  * for now, since without it viewport rendering does not work as it
1624  * should.
1625  *
1626  * Ideally this would be done at a higher level, but we need to get ready
1627  * for the merge rather soon, so let's keep the split logic private to
1628  * this file.
1629  */
1630 class SplitRenderTile : public RenderTile {
1631 public:
1632         SplitRenderTile()
1633                 : RenderTile(),
1634                   buffer_offset_x(0),
1635                   buffer_offset_y(0),
1636                   rng_state_offset_x(0),
1637                   rng_state_offset_y(0),
1638                   buffer_rng_state_stride(0) {}
1639
1640         explicit SplitRenderTile(RenderTile& tile)
1641                 : RenderTile(),
1642                   buffer_offset_x(0),
1643                   buffer_offset_y(0),
1644                   rng_state_offset_x(0),
1645                   rng_state_offset_y(0),
1646                   buffer_rng_state_stride(0)
1647         {
1648                 x = tile.x;
1649                 y = tile.y;
1650                 w = tile.w;
1651                 h = tile.h;
1652                 start_sample = tile.start_sample;
1653                 num_samples = tile.num_samples;
1654                 sample = tile.sample;
1655                 resolution = tile.resolution;
1656                 offset = tile.offset;
1657                 stride = tile.stride;
1658                 buffer = tile.buffer;
1659                 rng_state = tile.rng_state;
1660                 buffers = tile.buffers;
1661         }
1662
1663         /* The split kernel is constrained by device global memory;
1664          * hence it cannot render big tile sizes in
1665          * one go. If the user sets a big tile size ("big" here being
1666          * relative to the available device global memory),
1667          * we split the tile further and then call path_trace on
1668          * each of those split tiles. The following variables
1669          * assist in achieving that purpose.
1670          */
1671         int buffer_offset_x;
1672         int buffer_offset_y;
1673         int rng_state_offset_x;
1674         int rng_state_offset_y;
1675         int buffer_rng_state_stride;
1676 };
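/* Assumed illustration of the offsets above: if a 1024x1024 user tile were
 * split into 512x512 sub-tiles, the sub-tile starting at (512, 0) within the
 * parent tile would carry buffer_offset_x=512, buffer_offset_y=0 and
 * buffer_rng_state_stride=1024 (the parent tile's stride), so the split kernel
 * can index into the parent's buffer and rng_state.
 */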
1677
1678 /* OpenCLDeviceSplitKernel's declaration/definition. */
1679 class OpenCLDeviceSplitKernel : public OpenCLDeviceBase
1680 {
1681 public:
1682         /* Kernel declarations. */
1683         cl_kernel ckPathTraceKernel_data_init;
1684         cl_kernel ckPathTraceKernel_scene_intersect;
1685         cl_kernel ckPathTraceKernel_lamp_emission;
1686         cl_kernel ckPathTraceKernel_queue_enqueue;
1687         cl_kernel ckPathTraceKernel_background_buffer_update;
1688         cl_kernel ckPathTraceKernel_shader_eval;
1689         cl_kernel ckPathTraceKernel_holdout_emission_blurring_pathtermination_ao;
1690         cl_kernel ckPathTraceKernel_direct_lighting;
1691         cl_kernel ckPathTraceKernel_shadow_blocked;
1692         cl_kernel ckPathTraceKernel_next_iteration_setup;
1693         cl_kernel ckPathTraceKernel_sum_all_radiance;
1694
1695         /* cl_program declarations. */
1696         cl_program data_init_program;
1697         cl_program scene_intersect_program;
1698         cl_program lamp_emission_program;
1699         cl_program queue_enqueue_program;
1700         cl_program background_buffer_update_program;
1701         cl_program shader_eval_program;
1702         cl_program holdout_emission_blurring_pathtermination_ao_program;
1703         cl_program direct_lighting_program;
1704         cl_program shadow_blocked_program;
1705         cl_program next_iteration_setup_program;
1706         cl_program sum_all_radiance_program;
1707
1708         /* Global memory variables [porting]. This memory is used for
1709          * co-operation between different kernels: data written by one
1710          * kernel is available to another kernel via this global
1711          * memory.
1712          */
1713         cl_mem rng_coop;
1714         cl_mem throughput_coop;
1715         cl_mem L_transparent_coop;
1716         cl_mem PathRadiance_coop;
1717         cl_mem Ray_coop;
1718         cl_mem PathState_coop;
1719         cl_mem Intersection_coop;
1720         cl_mem kgbuffer;  /* KernelGlobals buffer. */
1721
1722         /* Global buffers for ShaderData. */
1723         cl_mem sd;             /* ShaderData used in the main path-iteration loop. */
1724         cl_mem sd_DL_shadow;   /* ShaderData used in Direct Lighting and
1725                                 * shadow_blocked kernel.
1726                                 */
1727
1728         /* Global buffers of each member of ShaderData. */
1729         cl_mem P_sd;
1730         cl_mem P_sd_DL_shadow;
1731         cl_mem N_sd;
1732         cl_mem N_sd_DL_shadow;
1733         cl_mem Ng_sd;
1734         cl_mem Ng_sd_DL_shadow;
1735         cl_mem I_sd;
1736         cl_mem I_sd_DL_shadow;
1737         cl_mem shader_sd;
1738         cl_mem shader_sd_DL_shadow;
1739         cl_mem flag_sd;
1740         cl_mem flag_sd_DL_shadow;
1741         cl_mem prim_sd;
1742         cl_mem prim_sd_DL_shadow;
1743         cl_mem type_sd;
1744         cl_mem type_sd_DL_shadow;
1745         cl_mem u_sd;
1746         cl_mem u_sd_DL_shadow;
1747         cl_mem v_sd;
1748         cl_mem v_sd_DL_shadow;
1749         cl_mem object_sd;
1750         cl_mem object_sd_DL_shadow;
1751         cl_mem time_sd;
1752         cl_mem time_sd_DL_shadow;
1753         cl_mem ray_length_sd;
1754         cl_mem ray_length_sd_DL_shadow;
1755         cl_mem ray_depth_sd;
1756         cl_mem ray_depth_sd_DL_shadow;
1757         cl_mem transparent_depth_sd;
1758         cl_mem transparent_depth_sd_DL_shadow;
1759
1760         /* Ray differentials. */
1761         cl_mem dP_sd, dI_sd;
1762         cl_mem dP_sd_DL_shadow, dI_sd_DL_shadow;
1763         cl_mem du_sd, dv_sd;
1764         cl_mem du_sd_DL_shadow, dv_sd_DL_shadow;
1765
1766         /* Dp/Du */
1767         cl_mem dPdu_sd, dPdv_sd;
1768         cl_mem dPdu_sd_DL_shadow, dPdv_sd_DL_shadow;
1769
1770         /* Object motion. */
1771         cl_mem ob_tfm_sd, ob_itfm_sd;
1772         cl_mem ob_tfm_sd_DL_shadow, ob_itfm_sd_DL_shadow;
1773
1774         cl_mem closure_sd;
1775         cl_mem closure_sd_DL_shadow;
1776         cl_mem num_closure_sd;
1777         cl_mem num_closure_sd_DL_shadow;
1778         cl_mem randb_closure_sd;
1779         cl_mem randb_closure_sd_DL_shadow;
1780         cl_mem ray_P_sd;
1781         cl_mem ray_P_sd_DL_shadow;
1782         cl_mem ray_dP_sd;
1783         cl_mem ray_dP_sd_DL_shadow;
1784
1785         /* Global memory required for shadow blocked and accum_radiance. */
1786         cl_mem BSDFEval_coop;
1787         cl_mem ISLamp_coop;
1788         cl_mem LightRay_coop;
1789         cl_mem AOAlpha_coop;
1790         cl_mem AOBSDF_coop;
1791         cl_mem AOLightRay_coop;
1792         cl_mem Intersection_coop_AO;
1793         cl_mem Intersection_coop_DL;
1794
1795 #ifdef WITH_CYCLES_DEBUG
1796         /* DebugData memory */
1797         cl_mem debugdata_coop;
1798 #endif
1799
1800         /* Global state array that tracks ray state. */
1801         cl_mem ray_state;
1802
1803         /* Per sample buffers. */
1804         cl_mem per_sample_output_buffers;
1805
1806         /* Denotes which sample each ray is being processed for. */
1807         cl_mem work_array;
1808
1809         /* Queue */
1810         cl_mem Queue_data;  /* Array of size queuesize * num_queues * sizeof(int). */
1811         cl_mem Queue_index; /* Array of size num_queues * sizeof(int);
1812                              * Tracks the size of each queue.
1813                              */
1814
1815         /* Flag to make the scene_intersect and lamp_emission kernels use queues. */
1816         cl_mem use_queues_flag;
1817
1818         /* Amount of memory in output buffer associated with one pixel/thread. */
1819         size_t per_thread_output_buffer_size;
1820
1821         /* Total allocatable device memory. */
1822         size_t total_allocatable_memory;
1823
1824         /* Host-side copy of ray_state; used to check for host path-iteration
1825          * termination.
1826          */
1827         char *hostRayStateArray;
1828
1829         /* Number of path-iterations to be done in one shot. */
1830         unsigned int PathIteration_times;
1831
1832 #ifdef __WORK_STEALING__
1833         /* Work pool, with one entry per work group. */
1834         cl_mem work_pool_wgs;
1835
1836         /* Maximum number of work groups possible for the current tile size. */
1837         unsigned int max_work_groups;
1838 #endif
1839
1840         /* max_closure value for which the kernels are currently loaded. */
1841         int current_max_closure;
1842
1843         /* Marked true in the constructor and false at the end of path_trace(). */
1844         bool first_tile;
1845
1846         OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_)
1847         : OpenCLDeviceBase(info, stats, background_)
1848         {
1849                 background = background_;
1850
1851                 /* Initialize kernels. */
1852                 ckPathTraceKernel_data_init = NULL;
1853                 ckPathTraceKernel_scene_intersect = NULL;
1854                 ckPathTraceKernel_lamp_emission = NULL;
1855                 ckPathTraceKernel_background_buffer_update = NULL;
1856                 ckPathTraceKernel_shader_eval = NULL;
1857                 ckPathTraceKernel_holdout_emission_blurring_pathtermination_ao = NULL;
1858                 ckPathTraceKernel_direct_lighting = NULL;
1859                 ckPathTraceKernel_shadow_blocked = NULL;
1860                 ckPathTraceKernel_next_iteration_setup = NULL;
1861                 ckPathTraceKernel_sum_all_radiance = NULL;
1862                 ckPathTraceKernel_queue_enqueue = NULL;
1863
1864                 /* Initialize program. */
1865                 data_init_program = NULL;
1866                 scene_intersect_program = NULL;
1867                 lamp_emission_program = NULL;
1868                 queue_enqueue_program = NULL;
1869                 background_buffer_update_program = NULL;
1870                 shader_eval_program = NULL;
1871                 holdout_emission_blurring_pathtermination_ao_program = NULL;
1872                 direct_lighting_program = NULL;
1873                 shadow_blocked_program = NULL;
1874                 next_iteration_setup_program = NULL;
1875                 sum_all_radiance_program = NULL;
1876
1877                 /* Initialize cl_mem variables. */
1878                 kgbuffer = NULL;
1879                 sd = NULL;
1880                 sd_DL_shadow = NULL;
1881
1882                 P_sd = NULL;
1883                 P_sd_DL_shadow = NULL;
1884                 N_sd = NULL;
1885                 N_sd_DL_shadow = NULL;
1886                 Ng_sd = NULL;
1887                 Ng_sd_DL_shadow = NULL;
1888                 I_sd = NULL;
1889                 I_sd_DL_shadow = NULL;
1890                 shader_sd = NULL;
1891                 shader_sd_DL_shadow = NULL;
1892                 flag_sd = NULL;
1893                 flag_sd_DL_shadow = NULL;
1894                 prim_sd = NULL;
1895                 prim_sd_DL_shadow = NULL;
1896                 type_sd = NULL;
1897                 type_sd_DL_shadow = NULL;
1898                 u_sd = NULL;
1899                 u_sd_DL_shadow = NULL;
1900                 v_sd = NULL;
1901                 v_sd_DL_shadow = NULL;
1902                 object_sd = NULL;
1903                 object_sd_DL_shadow = NULL;
1904                 time_sd = NULL;
1905                 time_sd_DL_shadow = NULL;
1906                 ray_length_sd = NULL;
1907                 ray_length_sd_DL_shadow = NULL;
1908                 ray_depth_sd = NULL;
1909                 ray_depth_sd_DL_shadow = NULL;
1910                 transparent_depth_sd = NULL;
1911                 transparent_depth_sd_DL_shadow = NULL;
1912
1913                 /* Ray differentials. */
1914                 dP_sd = NULL;
1915                 dI_sd = NULL;
1916                 dP_sd_DL_shadow = NULL;
1917                 dI_sd_DL_shadow = NULL;
1918                 du_sd = NULL;
1919                 dv_sd = NULL;
1920                 du_sd_DL_shadow = NULL;
1921                 dv_sd_DL_shadow = NULL;
1922
1923                 /* Dp/Du */
1924                 dPdu_sd = NULL;
1925                 dPdv_sd = NULL;
1926                 dPdu_sd_DL_shadow = NULL;
1927                 dPdv_sd_DL_shadow = NULL;
1928
1929                 /* Object motion. */
1930                 ob_tfm_sd = NULL;
1931                 ob_itfm_sd = NULL;
1932                 ob_tfm_sd_DL_shadow = NULL;
1933                 ob_itfm_sd_DL_shadow = NULL;
1934
1935                 closure_sd = NULL;
1936                 closure_sd_DL_shadow = NULL;
1937                 num_closure_sd = NULL;
1938                 num_closure_sd_DL_shadow = NULL;
1939                 randb_closure_sd = NULL;
1940                 randb_closure_sd_DL_shadow = NULL;
1941                 ray_P_sd = NULL;
1942                 ray_P_sd_DL_shadow = NULL;
1943                 ray_dP_sd = NULL;
1944                 ray_dP_sd_DL_shadow = NULL;
1945
1946                 rng_coop = NULL;
1947                 throughput_coop = NULL;
1948                 L_transparent_coop = NULL;
1949                 PathRadiance_coop = NULL;
1950                 Ray_coop = NULL;
1951                 PathState_coop = NULL;
1952                 Intersection_coop = NULL;
1953                 ray_state = NULL;
1954
1955                 AOAlpha_coop = NULL;
1956                 AOBSDF_coop = NULL;
1957                 AOLightRay_coop = NULL;
1958                 BSDFEval_coop = NULL;
1959                 ISLamp_coop = NULL;
1960                 LightRay_coop = NULL;
1961                 Intersection_coop_AO = NULL;
1962                 Intersection_coop_DL = NULL;
1963
1964 #ifdef WITH_CYCLES_DEBUG
1965                 debugdata_coop = NULL;
1966 #endif
1967
1968                 work_array = NULL;
1969
1970                 /* Queue. */
1971                 Queue_data = NULL;
1972                 Queue_index = NULL;
1973                 use_queues_flag = NULL;
1974
1975                 per_sample_output_buffers = NULL;
1976
1977                 per_thread_output_buffer_size = 0;
1978                 hostRayStateArray = NULL;
1979                 PathIteration_times = PATH_ITER_INC_FACTOR;
1980 #ifdef __WORK_STEALING__
1981                 work_pool_wgs = NULL;
1982                 max_work_groups = 0;
1983 #endif
1984                 current_max_closure = -1;
1985                 first_tile = true;
1986
1987                 /* Get the maximum size of a single memory allocation on the device. */
1988                 ciErr = clGetDeviceInfo(cdDevice,
1989                                         CL_DEVICE_MAX_MEM_ALLOC_SIZE,
1990                                         sizeof(size_t),
1991                                         &total_allocatable_memory,
1992                                         NULL);
1993                 assert(ciErr == CL_SUCCESS);
1994                 if(platform_name == "AMD Accelerated Parallel Processing") {
1995                         /* This value is tweakable; the AMD platform does not seem to
1996                          * give maximum performance when all of CL_DEVICE_MAX_MEM_ALLOC_SIZE
1997                          * is used for further computation.
1998                          */
1999                         total_allocatable_memory /= 2;
2000                 }
2001         }
2002
2003         /* TODO(sergey): Seems really close to load_kernel(),
2004          * could it be de-duplicated?
2005          */
2006         bool load_split_kernel(string kernel_path,
2007                                string kernel_init_source,
2008                                string clbin,
2009                                string custom_kernel_build_options,
2010                                cl_program *program,
2011                                const string *debug_src = NULL)
2012         {
2013                 if(!opencl_version_check())
2014                         return false;
2015
2016                 clbin = path_user_get(path_join("cache", clbin));
2017
2018                 /* If it exists already, try to use it. */
2019                 if(path_exists(clbin) && load_binary(kernel_path,
2020                                                      clbin,
2021                                                      custom_kernel_build_options,
2022                                                      program,
2023                                                      debug_src)) {
2024                         /* Kernel loaded from binary. */
2025                 }
2026                 else {
2027                         /* If it does not exist or loading the binary failed, compile the kernel. */
2028                         if(!compile_kernel(kernel_path,
2029                                            kernel_init_source,
2030                                            custom_kernel_build_options,
2031                                            program,
2032                                            debug_src))
2033                         {
2034                                 return false;
2035                         }
2036                         /* Save binary for reuse. */
2037                         if(!save_binary(program, clbin)) {
2038                                 return false;
2039                         }
2040                 }
2041                 return true;
2042         }
2043
2044         /* Split kernel utility functions. */
2045         size_t get_tex_size(const char *tex_name)
2046         {
2047                 cl_mem ptr;
2048                 size_t ret_size = 0;
2049                 MemMap::iterator i = mem_map.find(tex_name);
2050                 if(i != mem_map.end()) {
2051                         ptr = CL_MEM_PTR(i->second);
2052                         ciErr = clGetMemObjectInfo(ptr,
2053                                                    CL_MEM_SIZE,
2054                                                    sizeof(ret_size),
2055                                                    &ret_size,
2056                                                    NULL);
2057                         assert(ciErr == CL_SUCCESS);
2058                 }
2059                 return ret_size;
2060         }
2061
2062         size_t get_shader_closure_size(int max_closure)
2063         {
2064                 return (sizeof(ShaderClosure) * max_closure);
2065         }
2066
2067         size_t get_shader_data_size(size_t shader_closure_size)
2068         {
2069                 /* ShaderData size without accounting for ShaderClosure array. */
2070                 size_t shader_data_size =
2071                         sizeof(ShaderData) - (sizeof(ShaderClosure) * MAX_CLOSURE);
2072                 return (shader_data_size + shader_closure_size);
2073         }
2074
2075         /* Returns size of KernelGlobals structure associated with OpenCL. */
2076         size_t get_KernelGlobals_size()
2077         {
2078                 /* Dummy copy of the OpenCL KernelGlobals from kernel_globals.h, used
2079                  * to fetch its size.
2080                  */
2081                 typedef struct KernelGlobals {
2082                         ccl_constant KernelData *data;
2083 #define KERNEL_TEX(type, ttype, name) \
2084         ccl_global type *name;
2085 #include "kernel_textures.h"
2086 #undef KERNEL_TEX
2087                 } KernelGlobals;
2088
2089                 return sizeof(KernelGlobals);
2090         }
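	/* For illustration: each KERNEL_TEX(type, ttype, name) entry in
	 * kernel_textures.h expands to a "ccl_global type *name;" member of the
	 * dummy struct above, so the reported size is the KernelData pointer plus
	 * one pointer per registered texture.
	 */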
2091
2092         /* Returns the size of the structure-of-arrays implementation of ShaderData. */
2093         size_t get_shaderdata_soa_size()
2094         {
2095                 size_t shader_soa_size = 0;
2096
2097 #define SD_VAR(type, what) shader_soa_size += sizeof(void *);
2098 #define SD_CLOSURE_VAR(type, what, max_closure) shader_soa_size += sizeof(void *);
2099                 #include "kernel_shaderdata_vars.h"
2100 #undef SD_VAR
2101 #undef SD_CLOSURE_VAR
2102
2103                 return shader_soa_size;
2104         }
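	/* Note: every SD_VAR / SD_CLOSURE_VAR entry in kernel_shaderdata_vars.h
	 * contributes exactly one pointer-sized slot to the total above, i.e. the
	 * SOA descriptor stores one pointer per ShaderData member.
	 */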
2105
2106         bool load_kernels(const DeviceRequestedFeatures& requested_features)
2107         {
2108                 /* Get shader, bake and film_convert kernels.
2109                  * This also verifies that OpenCL was actually initialized.
2110                  */
2111                 if(!OpenCLDeviceBase::load_kernels(requested_features)) {
2112                         return false;
2113                 }
2114
2115                 string kernel_path = path_get("kernel");
2116                 string kernel_md5 = path_files_md5_hash(kernel_path);
2117                 string device_md5;
2118                 string kernel_init_source;
2119                 string clbin;
2120                 string clsrc, *debug_src = NULL;
2121
2122                 string build_options = "-D__SPLIT_KERNEL__";
2123 #ifdef __WORK_STEALING__
2124                 build_options += " -D__WORK_STEALING__";
2125 #endif
2126                 build_options += build_options_from_requested_features(requested_features);
2127
2128                 /* Set compute device build option. */
2129                 cl_device_type device_type;
2130                 ciErr = clGetDeviceInfo(cdDevice,
2131                                         CL_DEVICE_TYPE,
2132                                         sizeof(cl_device_type),
2133                                         &device_type,
2134                                         NULL);
2135                 assert(ciErr == CL_SUCCESS);
2136                 if(device_type == CL_DEVICE_TYPE_GPU) {
2137                         build_options += " -D__COMPUTE_DEVICE_GPU__";
2138                 }
2139
2140 #define GLUE(a, b) a ## b
2141 #define LOAD_KERNEL(name) \
2142         do { \
2143                 kernel_init_source = "#include \"kernels/opencl/kernel_" #name ".cl\" // " + \
2144                                      kernel_md5 + "\n"; \
2145                 device_md5 = device_md5_hash(build_options); \
2146                 clbin = string_printf("cycles_kernel_%s_%s_" #name ".clbin", \
2147                                       device_md5.c_str(), kernel_md5.c_str()); \
2148                 if(opencl_kernel_use_debug()) { \
2149                         clsrc = string_printf("cycles_kernel_%s_%s_" #name ".cl", \
2150                                               device_md5.c_str(), kernel_md5.c_str()); \
2151                         clsrc = path_user_get(path_join("cache", clsrc)); \
2152                         debug_src = &clsrc; \
2153                 } \
2154                 if(!load_split_kernel(kernel_path, kernel_init_source, clbin, \
2155                                       build_options, \
2156                                       &GLUE(name, _program), \
2157                                       debug_src)) \
2158                 { \
2159                         fprintf(stderr, "Failed to compile %s\n", #name); \
2160                         return false; \
2161                 } \
2162         } while(false)
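		/* For illustration, LOAD_KERNEL(data_init) generates an include line
		 * of the form
		 *
		 *   #include "kernels/opencl/kernel_data_init.cl" // <kernel_md5>
		 *
		 * caches the compiled binary as
		 * cycles_kernel_<device_md5>_<kernel_md5>_data_init.clbin and stores
		 * the program in data_init_program via GLUE(data_init, _program).
		 */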
2163
2164                 LOAD_KERNEL(data_init);
2165                 LOAD_KERNEL(scene_intersect);
2166                 LOAD_KERNEL(lamp_emission);
2167                 LOAD_KERNEL(queue_enqueue);
2168                 LOAD_KERNEL(background_buffer_update);
2169                 LOAD_KERNEL(shader_eval);
2170                 LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao);
2171                 LOAD_KERNEL(direct_lighting);
2172                 LOAD_KERNEL(shadow_blocked);
2173                 LOAD_KERNEL(next_iteration_setup);
2174                 LOAD_KERNEL(sum_all_radiance);
2175
2176 #undef LOAD_KERNEL
2177
2178 #define FIND_KERNEL(name) \
2179         do { \
2180                 GLUE(ckPathTraceKernel_, name) = \
2181                         clCreateKernel(GLUE(name, _program), \
2182                                        "kernel_ocl_path_trace_"  #name, &ciErr); \
2183                 if(opencl_error(ciErr)) { \
2184                         fprintf(stderr, "Missing kernel kernel_ocl_path_trace_%s\n", #name); \
2185                         return false; \
2186                 } \
2187         } while(false)
2188
2189                 FIND_KERNEL(data_init);
2190                 FIND_KERNEL(scene_intersect);
2191                 FIND_KERNEL(lamp_emission);
2192                 FIND_KERNEL(queue_enqueue);
2193                 FIND_KERNEL(background_buffer_update);
2194                 FIND_KERNEL(shader_eval);
2195                 FIND_KERNEL(holdout_emission_blurring_pathtermination_ao);
2196                 FIND_KERNEL(direct_lighting);
2197                 FIND_KERNEL(shadow_blocked);
2198                 FIND_KERNEL(next_iteration_setup);
2199                 FIND_KERNEL(sum_all_radiance);
2200 #undef FIND_KERNEL
2201 #undef GLUE
2202
2203                 current_max_closure = requested_features.max_closure;
2204
2205                 return true;
2206         }
2207
2208         ~OpenCLDeviceSplitKernel()
2209         {
2210                 task_pool.stop();
2211
2212                 /* Release kernels */
2213                 release_kernel_safe(ckPathTraceKernel_data_init);
2214                 release_kernel_safe(ckPathTraceKernel_scene_intersect);
2215                 release_kernel_safe(ckPathTraceKernel_lamp_emission);
2216                 release_kernel_safe(ckPathTraceKernel_queue_enqueue);
2217                 release_kernel_safe(ckPathTraceKernel_background_buffer_update);
2218                 release_kernel_safe(ckPathTraceKernel_shader_eval);
2219                 release_kernel_safe(ckPathTraceKernel_holdout_emission_blurring_pathtermination_ao);
2220                 release_kernel_safe(ckPathTraceKernel_direct_lighting);
2221                 release_kernel_safe(ckPathTraceKernel_shadow_blocked);
2222                 release_kernel_safe(ckPathTraceKernel_next_iteration_setup);
2223                 release_kernel_safe(ckPathTraceKernel_sum_all_radiance);
2224
2225                 /* Release global memory */
2226                 release_mem_object_safe(P_sd);
2227                 release_mem_object_safe(P_sd_DL_shadow);
2228                 release_mem_object_safe(N_sd);
2229                 release_mem_object_safe(N_sd_DL_shadow);
2230                 release_mem_object_safe(Ng_sd);
2231                 release_mem_object_safe(Ng_sd_DL_shadow);
2232                 release_mem_object_safe(I_sd);
2233                 release_mem_object_safe(I_sd_DL_shadow);
2234                 release_mem_object_safe(shader_sd);
2235                 release_mem_object_safe(shader_sd_DL_shadow);
2236                 release_mem_object_safe(flag_sd);
2237                 release_mem_object_safe(flag_sd_DL_shadow);
2238                 release_mem_object_safe(prim_sd);
2239                 release_mem_object_safe(prim_sd_DL_shadow);
2240                 release_mem_object_safe(type_sd);
2241                 release_mem_object_safe(type_sd_DL_shadow);
2242                 release_mem_object_safe(u_sd);
2243                 release_mem_object_safe(u_sd_DL_shadow);
2244                 release_mem_object_safe(v_sd);
2245                 release_mem_object_safe(v_sd_DL_shadow);
2246                 release_mem_object_safe(object_sd);
2247                 release_mem_object_safe(object_sd_DL_shadow);
2248                 release_mem_object_safe(time_sd);
2249                 release_mem_object_safe(time_sd_DL_shadow);
2250                 release_mem_object_safe(ray_length_sd);
2251                 release_mem_object_safe(ray_length_sd_DL_shadow);
2252                 release_mem_object_safe(ray_depth_sd);
2253                 release_mem_object_safe(ray_depth_sd_DL_shadow);
2254                 release_mem_object_safe(transparent_depth_sd);
2255                 release_mem_object_safe(transparent_depth_sd_DL_shadow);
2256
2257                 /* Ray differentials. */
2258                 release_mem_object_safe(dP_sd);
2259                 release_mem_object_safe(dP_sd_DL_shadow);
2260                 release_mem_object_safe(dI_sd);
2261                 release_mem_object_safe(dI_sd_DL_shadow);
2262                 release_mem_object_safe(du_sd);
2263                 release_mem_object_safe(du_sd_DL_shadow);
2264                 release_mem_object_safe(dv_sd);
2265                 release_mem_object_safe(dv_sd_DL_shadow);
2266
2267                 /* Dp/Du */
2268                 release_mem_object_safe(dPdu_sd);
2269                 release_mem_object_safe(dPdu_sd_DL_shadow);
2270                 release_mem_object_safe(dPdv_sd);
2271                 release_mem_object_safe(dPdv_sd_DL_shadow);
2272
2273                 /* Object motion. */
2274                 release_mem_object_safe(ob_tfm_sd);
2275                 release_mem_object_safe(ob_itfm_sd);
2276
2277                 release_mem_object_safe(ob_tfm_sd_DL_shadow);
2278                 release_mem_object_safe(ob_itfm_sd_DL_shadow);
2279
2280                 release_mem_object_safe(closure_sd);
2281                 release_mem_object_safe(closure_sd_DL_shadow);
2282                 release_mem_object_safe(num_closure_sd);
2283                 release_mem_object_safe(num_closure_sd_DL_shadow);
2284                 release_mem_object_safe(randb_closure_sd);
2285                 release_mem_object_safe(randb_closure_sd_DL_shadow);
2286                 release_mem_object_safe(ray_P_sd);
2287                 release_mem_object_safe(ray_P_sd_DL_shadow);
2288                 release_mem_object_safe(ray_dP_sd);
2289                 release_mem_object_safe(ray_dP_sd_DL_shadow);
2290                 release_mem_object_safe(rng_coop);
2291                 release_mem_object_safe(throughput_coop);
2292                 release_mem_object_safe(L_transparent_coop);
2293                 release_mem_object_safe(PathRadiance_coop);
2294                 release_mem_object_safe(Ray_coop);
2295                 release_mem_object_safe(PathState_coop);
2296                 release_mem_object_safe(Intersection_coop);
2297                 release_mem_object_safe(kgbuffer);
2298                 release_mem_object_safe(sd);
2299                 release_mem_object_safe(sd_DL_shadow);
2300                 release_mem_object_safe(ray_state);
2301                 release_mem_object_safe(AOAlpha_coop);
2302                 release_mem_object_safe(AOBSDF_coop);
2303                 release_mem_object_safe(AOLightRay_coop);
2304                 release_mem_object_safe(BSDFEval_coop);
2305                 release_mem_object_safe(ISLamp_coop);
2306                 release_mem_object_safe(LightRay_coop);
2307                 release_mem_object_safe(Intersection_coop_AO);
2308                 release_mem_object_safe(Intersection_coop_DL);
2309 #ifdef WITH_CYCLES_DEBUG
2310                 release_mem_object_safe(debugdata_coop);
2311 #endif
2312                 release_mem_object_safe(use_queues_flag);
2313                 release_mem_object_safe(Queue_data);
2314                 release_mem_object_safe(Queue_index);
2315                 release_mem_object_safe(work_array);
2316 #ifdef __WORK_STEALING__
2317                 release_mem_object_safe(work_pool_wgs);
2318 #endif
2319                 release_mem_object_safe(per_sample_output_buffers);
2320
2321                 /* Release programs */
2322                 release_program_safe(data_init_program);
2323                 release_program_safe(scene_intersect_program);
2324                 release_program_safe(lamp_emission_program);
2325                 release_program_safe(queue_enqueue_program);
2326                 release_program_safe(background_buffer_update_program);
2327                 release_program_safe(shader_eval_program);
2328                 release_program_safe(holdout_emission_blurring_pathtermination_ao_program);
2329                 release_program_safe(direct_lighting_program);
2330                 release_program_safe(shadow_blocked_program);
2331                 release_program_safe(next_iteration_setup_program);
2332                 release_program_safe(sum_all_radiance_program);
2333
2334                 if(hostRayStateArray != NULL) {
2335                         free(hostRayStateArray);
2336                 }
2337         }
2338
2339         void path_trace(SplitRenderTile& rtile, int2 max_render_feasible_tile_size)
2340         {
2341                 /* Cast arguments to cl types. */
2342                 cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
2343                 cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
2344                 cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state);
2345                 cl_int d_x = rtile.x;
2346                 cl_int d_y = rtile.y;
2347                 cl_int d_w = rtile.w;
2348                 cl_int d_h = rtile.h;
2349                 cl_int d_offset = rtile.offset;
2350                 cl_int d_stride = rtile.stride;
2351
2352                 /* Make sure that the chosen render-feasible tile size is a multiple of
2353                  * the local work size dimensions.
2354                  */
2355                 assert(max_render_feasible_tile_size.x % SPLIT_KERNEL_LOCAL_SIZE_X == 0);
2356                 assert(max_render_feasible_tile_size.y % SPLIT_KERNEL_LOCAL_SIZE_Y == 0);
2357
2358                 size_t global_size[2];
2359                 size_t local_size[2] = {SPLIT_KERNEL_LOCAL_SIZE_X,
2360                                         SPLIT_KERNEL_LOCAL_SIZE_Y};
2361
2362                 /* Set the range of samples to be processed for every ray in
2363                  * the path-regeneration logic.
2364                  */
2365                 cl_int start_sample = rtile.start_sample;
2366                 cl_int end_sample = rtile.start_sample + rtile.num_samples;
2367                 cl_int num_samples = rtile.num_samples;
2368
2369 #ifdef __WORK_STEALING__
2370                 global_size[0] = (((d_w - 1) / local_size[0]) + 1) * local_size[0];
2371                 global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1];
2372                 unsigned int num_parallel_samples = 1;
2373 #else
2374                 global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1];
2375                 unsigned int num_threads = max_render_feasible_tile_size.x *
2376                                            max_render_feasible_tile_size.y;
2377                 unsigned int num_tile_columns_possible = num_threads / global_size[1];
2378                 /* Estimate the number of samples that can be
2379                  * processed in parallel.
2380                  */
2381                 unsigned int num_parallel_samples = min(num_tile_columns_possible / d_w,
2382                                                         rtile.num_samples);
2383                 /* Wavefront size in AMD is 64.
2384                  * TODO(sergey): What about other platforms?
2385                  */
2386                 if(num_parallel_samples >= 64) {
2387                         /* TODO(sergey): Could use generic round-up here. */
2388                         num_parallel_samples = (num_parallel_samples / 64) * 64;
2389                 }
2390                 assert(num_parallel_samples != 0);
2391
2392                 global_size[0] = d_w * num_parallel_samples;
2393 #endif  /* __WORK_STEALING__ */
2394
2395                 assert(global_size[0] * global_size[1] <=
2396                        max_render_feasible_tile_size.x * max_render_feasible_tile_size.y);
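		/* Worked example with assumed numbers: for a 997x311 tile and the
		 * 64x1 local size, the work-stealing path above rounds the global
		 * size up to the nearest multiple of the local size:
		 *
		 *   global_size[0] = (((997 - 1) / 64) + 1) * 64 = 1024
		 *   global_size[1] = (((311 - 1) / 1) + 1) * 1   = 311
		 */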
2397
2398                 /* Allocate all required global memory once. */
2399                 if(first_tile) {
2400                         size_t num_global_elements = max_render_feasible_tile_size.x *
2401                                                      max_render_feasible_tile_size.y;
2402                         /* TODO(sergey): This will actually over-allocate if a
2403                          * particular kernel does not support multi-closure.
2404                          */
2405                         size_t ShaderClosure_size = get_shader_closure_size(current_max_closure);
2406
2407 #ifdef __WORK_STEALING__
2408                         /* Calculate the maximum number of work groups. */
2409                         size_t max_global_size[2];
2410                         size_t tile_x = max_render_feasible_tile_size.x;
2411                         size_t tile_y = max_render_feasible_tile_size.y;
2412                         max_global_size[0] = (((tile_x - 1) / local_size[0]) + 1) * local_size[0];
2413                         max_global_size[1] = (((tile_y - 1) / local_size[1]) + 1) * local_size[1];
2414                         max_work_groups = (max_global_size[0] * max_global_size[1]) /
2415                                           (local_size[0] * local_size[1]);
2416                         /* Allocate work_pool_wgs memory. */
2417                         work_pool_wgs = mem_alloc(max_work_groups * sizeof(unsigned int));
2418 #endif  /* __WORK_STEALING__ */
2419
2420                         /* Allocate queue_index memory only once. */
2421                         Queue_index = mem_alloc(NUM_QUEUES * sizeof(int));
2422                         use_queues_flag = mem_alloc(sizeof(char));
2423                         kgbuffer = mem_alloc(get_KernelGlobals_size());
2424
2425                         /* Create global buffers for ShaderData. */
2426                         sd = mem_alloc(get_shaderdata_soa_size());
2427                         sd_DL_shadow = mem_alloc(get_shaderdata_soa_size());
2428                         P_sd = mem_alloc(num_global_elements * sizeof(float3));
2429                         P_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
2430                         N_sd = mem_alloc(num_global_elements * sizeof(float3));
2431                         N_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
2432                         Ng_sd = mem_alloc(num_global_elements * sizeof(float3));
2433                         Ng_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
2434                         I_sd = mem_alloc(num_global_elements * sizeof(float3));
2435                         I_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
2436                         shader_sd = mem_alloc(num_global_elements * sizeof(int));
2437                         shader_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
2438                         flag_sd = mem_alloc(num_global_elements * sizeof(int));
2439                         flag_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
2440                         prim_sd = mem_alloc(num_global_elements * sizeof(int));
2441                         prim_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
2442                         type_sd = mem_alloc(num_global_elements * sizeof(int));
2443                         type_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
2444                         u_sd = mem_alloc(num_global_elements * sizeof(float));
2445                         u_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float));
2446                         v_sd = mem_alloc(num_global_elements * sizeof(float));
2447                         v_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float));
2448                         object_sd = mem_alloc(num_global_elements * sizeof(int));
2449                         object_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
2450                         time_sd = mem_alloc(num_global_elements * sizeof(float));
2451                         time_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float));
2452                         ray_length_sd = mem_alloc(num_global_elements * sizeof(float));
2453                         ray_length_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float));
2454                         ray_depth_sd = mem_alloc(num_global_elements * sizeof(int));
2455                         ray_depth_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
2456                         transparent_depth_sd = mem_alloc(num_global_elements * sizeof(int));
2457                         transparent_depth_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
2458
2459                         /* Ray differentials. */
2460                         dP_sd = mem_alloc(num_global_elements * sizeof(differential3));
2461                         dP_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential3));
2462                         dI_sd = mem_alloc(num_global_elements * sizeof(differential3));
2463                         dI_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential3));
2464                         du_sd = mem_alloc(num_global_elements * sizeof(differential));
2465                         du_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential));
2466                         dv_sd = mem_alloc(num_global_elements * sizeof(differential));
2467                         dv_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential));
2468
2469                         /* Dp/Du */
2470                         dPdu_sd = mem_alloc(num_global_elements * sizeof(float3));
2471                         dPdu_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
2472                         dPdv_sd = mem_alloc(num_global_elements * sizeof(float3));
2473                         dPdv_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
2474
2475                         /* Object motion. */
2476                         ob_tfm_sd = mem_alloc(num_global_elements * sizeof(Transform));
2477                         ob_tfm_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(Transform));
2478                         ob_itfm_sd = mem_alloc(num_global_elements * sizeof(Transform));
2479                         ob_itfm_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(Transform));
2480
2481                         closure_sd = mem_alloc(num_global_elements * ShaderClosure_size);
2482                         closure_sd_DL_shadow = mem_alloc(num_global_elements * 2 * ShaderClosure_size);
2483                         num_closure_sd = mem_alloc(num_global_elements * sizeof(int));
2484                         num_closure_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
2485                         randb_closure_sd = mem_alloc(num_global_elements * sizeof(float));
2486                         randb_closure_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float));
2487                         ray_P_sd = mem_alloc(num_global_elements * sizeof(float3));
2488                         ray_P_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
2489                         ray_dP_sd = mem_alloc(num_global_elements * sizeof(differential3));
2490                         ray_dP_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential3));
2491
2492                         /* Creation of global memory buffers which are shared among
2493                          * the kernels.
2494                          */
2495                         rng_coop = mem_alloc(num_global_elements * sizeof(RNG));
2496                         throughput_coop = mem_alloc(num_global_elements * sizeof(float3));
2497                         L_transparent_coop = mem_alloc(num_global_elements * sizeof(float));
2498                         PathRadiance_coop = mem_alloc(num_global_elements * sizeof(PathRadiance));
2499                         Ray_coop = mem_alloc(num_global_elements * sizeof(Ray));
2500                         PathState_coop = mem_alloc(num_global_elements * sizeof(PathState));
2501                         Intersection_coop = mem_alloc(num_global_elements * sizeof(Intersection));
2502                         AOAlpha_coop = mem_alloc(num_global_elements * sizeof(float3));
2503                         AOBSDF_coop = mem_alloc(num_global_elements * sizeof(float3));
2504                         AOLightRay_coop = mem_alloc(num_global_elements * sizeof(Ray));
2505                         BSDFEval_coop = mem_alloc(num_global_elements * sizeof(BsdfEval));
2506                         ISLamp_coop = mem_alloc(num_global_elements * sizeof(int));
2507                         LightRay_coop = mem_alloc(num_global_elements * sizeof(Ray));
2508                         Intersection_coop_AO = mem_alloc(num_global_elements * sizeof(Intersection));
2509                         Intersection_coop_DL = mem_alloc(num_global_elements * sizeof(Intersection));
2510
2511 #ifdef WITH_CYCLES_DEBUG
2512                         debugdata_coop = mem_alloc(num_global_elements * sizeof(DebugData));
2513 #endif
2514
2515                         ray_state = mem_alloc(num_global_elements * sizeof(char));
2516
2517                         hostRayStateArray = (char *)calloc(num_global_elements, sizeof(char));
2518                         assert(hostRayStateArray != NULL && "Can't create hostRayStateArray memory");
2519
2520                         Queue_data = mem_alloc(num_global_elements * (NUM_QUEUES * sizeof(int)+sizeof(int)));
2521                         work_array = mem_alloc(num_global_elements * sizeof(unsigned int));
2522                         per_sample_output_buffers = mem_alloc(num_global_elements *
2523                                                               per_thread_output_buffer_size);
2524                 }
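                      /* All buffers above are allocated only once (see the "only once" comment)
                       * and reused for later tiles; each is sized from num_global_elements,
                       * i.e. one slot per ray/path in flight. The *_DL_shadow variants appear
                       * to hold twice as many elements to match the doubled global work size
                       * used when enqueuing the shadow_blocked kernel further down
                       * (AO + direct-lighting shadow rays). */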
2525
2526                 cl_int dQueue_size = global_size[0] * global_size[1];
2527                 cl_int total_num_rays = global_size[0] * global_size[1];
2528
2529                 cl_uint start_arg_index =
2530                         kernel_set_args(ckPathTraceKernel_data_init,
2531                                         0,
2532                                         kgbuffer,
2533                                         sd,
2534                                         sd_DL_shadow,
2535                                         P_sd,
2536                                         P_sd_DL_shadow,
2537                                         N_sd,
2538                                         N_sd_DL_shadow,
2539                                         Ng_sd,
2540                                         Ng_sd_DL_shadow,
2541                                         I_sd,
2542                                         I_sd_DL_shadow,
2543                                         shader_sd,
2544                                         shader_sd_DL_shadow,
2545                                         flag_sd,
2546                                         flag_sd_DL_shadow,
2547                                         prim_sd,
2548                                         prim_sd_DL_shadow,
2549                                         type_sd,
2550                                         type_sd_DL_shadow,
2551                                         u_sd,
2552                                         u_sd_DL_shadow,
2553                                         v_sd,
2554                                         v_sd_DL_shadow,
2555                                         object_sd,
2556                                         object_sd_DL_shadow,
2557                                         time_sd,
2558                                         time_sd_DL_shadow,
2559                                         ray_length_sd,
2560                                         ray_length_sd_DL_shadow,
2561                                         ray_depth_sd,
2562                                         ray_depth_sd_DL_shadow,
2563                                         transparent_depth_sd,
2564                                         transparent_depth_sd_DL_shadow);
2565
2566                 /* Ray differentials. */
2567                 start_arg_index +=
2568                         kernel_set_args(ckPathTraceKernel_data_init,
2569                                         start_arg_index,
2570                                         dP_sd,
2571                                         dP_sd_DL_shadow,
2572                                         dI_sd,
2573                                         dI_sd_DL_shadow,
2574                                         du_sd,
2575                                         du_sd_DL_shadow,
2576                                         dv_sd,
2577                                         dv_sd_DL_shadow);
2578
2579                 /* Dp/Du */
2580                 start_arg_index +=
2581                         kernel_set_args(ckPathTraceKernel_data_init,
2582                                         start_arg_index,
2583                                         dPdu_sd,
2584                                         dPdu_sd_DL_shadow,
2585                                         dPdv_sd,
2586                                         dPdv_sd_DL_shadow);
2587
2588                 /* Object motion. */
2589                 start_arg_index +=
2590                         kernel_set_args(ckPathTraceKernel_data_init,
2591                                         start_arg_index,
2592                                         ob_tfm_sd,
2593                                         ob_tfm_sd_DL_shadow,
2594                                         ob_itfm_sd,
2595                                         ob_itfm_sd_DL_shadow);
2596
2597                 start_arg_index +=
2598                         kernel_set_args(ckPathTraceKernel_data_init,
2599                                         start_arg_index,
2600                                         closure_sd,
2601                                         closure_sd_DL_shadow,
2602                                         num_closure_sd,
2603                                         num_closure_sd_DL_shadow,
2604                                         randb_closure_sd,
2605                                         randb_closure_sd_DL_shadow,
2606                                         ray_P_sd,
2607                                         ray_P_sd_DL_shadow,
2608                                         ray_dP_sd,
2609                                         ray_dP_sd_DL_shadow,
2610                                         d_data,
2611                                         per_sample_output_buffers,
2612                                         d_rng_state,
2613                                         rng_coop,
2614                                         throughput_coop,
2615                                         L_transparent_coop,
2616                                         PathRadiance_coop,
2617                                         Ray_coop,
2618                                         PathState_coop,
2619                                         ray_state);
2620
2621 /* TODO(sergey): Avoid map lookup here. */
2622 #define KERNEL_TEX(type, ttype, name) \
2623         set_kernel_arg_mem(ckPathTraceKernel_data_init, &start_arg_index, #name);
2624 #include "kernel_textures.h"
2625 #undef KERNEL_TEX
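/* The include above is the usual X-macro trick: kernel_textures.h lists every
 * KERNEL_TEX(type, ttype, name) entry, so re-including it with the temporary
 * #define expands to one set_kernel_arg_mem() call per texture, which looks the
 * buffer up by name (hence the TODO about the map lookup) and advances
 * start_arg_index so the textures become consecutive kernel arguments. */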
2626
2627                 start_arg_index +=
2628                         kernel_set_args(ckPathTraceKernel_data_init,
2629                                         start_arg_index,
2630                                         start_sample,
2631                                         d_x,
2632                                         d_y,
2633                                         d_w,
2634                                         d_h,
2635                                         d_offset,
2636                                         d_stride,
2637                                         rtile.rng_state_offset_x,
2638                                         rtile.rng_state_offset_y,
2639                                         rtile.buffer_rng_state_stride,
2640                                         Queue_data,
2641                                         Queue_index,
2642                                         dQueue_size,
2643                                         use_queues_flag,
2644                                         work_array,
2645 #ifdef __WORK_STEALING__
2646                                         work_pool_wgs,
2647                                         num_samples,
2648 #endif
2649 #ifdef WITH_CYCLES_DEBUG
2650                                         debugdata_coop,
2651 #endif
2652                                         num_parallel_samples);
2653
2654                 kernel_set_args(ckPathTraceKernel_scene_intersect,
2655                                 0,
2656                                 kgbuffer,
2657                                 d_data,
2658                                 rng_coop,
2659                                 Ray_coop,
2660                                 PathState_coop,
2661                                 Intersection_coop,
2662                                 ray_state,
2663                                 d_w,
2664                                 d_h,
2665                                 Queue_data,
2666                                 Queue_index,
2667                                 dQueue_size,
2668                                 use_queues_flag,
2669 #ifdef WITH_CYCLES_DEBUG
2670                                 debugdata_coop,
2671 #endif
2672                                 num_parallel_samples);
2673
2674                 kernel_set_args(ckPathTraceKernel_lamp_emission,
2675                                 0,
2676                                 kgbuffer,
2677                                 d_data,
2678                                 sd,
2679                                 throughput_coop,
2680                                 PathRadiance_coop,
2681                                 Ray_coop,
2682                                 PathState_coop,
2683                                 Intersection_coop,
2684                                 ray_state,
2685                                 d_w,
2686                                 d_h,
2687                                 Queue_data,
2688                                 Queue_index,
2689                                 dQueue_size,
2690                                 use_queues_flag,
2691                                 num_parallel_samples);
2692
2693                 kernel_set_args(ckPathTraceKernel_queue_enqueue,
2694                                 0,
2695                                 Queue_data,
2696                                 Queue_index,
2697                                 ray_state,
2698                                 dQueue_size);
2699
2700                 kernel_set_args(ckPathTraceKernel_background_buffer_update,
2701                                  0,
2702                                  kgbuffer,
2703                                  d_data,
2704                                  sd,
2705                                  per_sample_output_buffers,
2706                                  d_rng_state,
2707                                  rng_coop,
2708                                  throughput_coop,
2709                                  PathRadiance_coop,
2710                                  Ray_coop,
2711                                  PathState_coop,
2712                                  L_transparent_coop,
2713                                  ray_state,
2714                                  d_w,
2715                                  d_h,
2716                                  d_x,
2717                                  d_y,
2718                                  d_stride,
2719                                  rtile.rng_state_offset_x,
2720                                  rtile.rng_state_offset_y,
2721                                  rtile.buffer_rng_state_stride,
2722                                  work_array,
2723                                  Queue_data,
2724                                  Queue_index,
2725                                  dQueue_size,
2726                                  end_sample,
2727                                  start_sample,
2728 #ifdef __WORK_STEALING__
2729                                  work_pool_wgs,
2730                                  num_samples,
2731 #endif
2732 #ifdef WITH_CYCLES_DEBUG
2733                                  debugdata_coop,
2734 #endif
2735                                  num_parallel_samples);
2736
2737                 kernel_set_args(ckPathTraceKernel_shader_eval,
2738                                 0,
2739                                 kgbuffer,
2740                                 d_data,
2741                                 sd,
2742                                 rng_coop,
2743                                 Ray_coop,
2744                                 PathState_coop,
2745                                 Intersection_coop,
2746                                 ray_state,
2747                                 Queue_data,
2748                                 Queue_index,
2749                                 dQueue_size);
2750
2751                 kernel_set_args(ckPathTraceKernel_holdout_emission_blurring_pathtermination_ao,
2752                                 0,
2753                                 kgbuffer,
2754                                 d_data,
2755                                 sd,
2756                                 per_sample_output_buffers,
2757                                 rng_coop,
2758                                 throughput_coop,
2759                                 L_transparent_coop,
2760                                 PathRadiance_coop,
2761                                 PathState_coop,
2762                                 Intersection_coop,
2763                                 AOAlpha_coop,
2764                                 AOBSDF_coop,
2765                                 AOLightRay_coop,
2766                                 d_w,
2767                                 d_h,
2768                                 d_x,
2769                                 d_y,
2770                                 d_stride,
2771                                 ray_state,
2772                                 work_array,
2773                                 Queue_data,
2774                                 Queue_index,
2775                                 dQueue_size,
2776 #ifdef __WORK_STEALING__
2777                                 start_sample,
2778 #endif
2779                                 num_parallel_samples);
2780
2781                 kernel_set_args(ckPathTraceKernel_direct_lighting,
2782                                 0,
2783                                 kgbuffer,
2784                                 d_data,
2785                                 sd,
2786                                 sd_DL_shadow,
2787                                 rng_coop,
2788                                 PathState_coop,
2789                                 ISLamp_coop,
2790                                 LightRay_coop,
2791                                 BSDFEval_coop,
2792                                 ray_state,
2793                                 Queue_data,
2794                                 Queue_index,
2795                                 dQueue_size);
2796
2797                 kernel_set_args(ckPathTraceKernel_shadow_blocked,
2798                                 0,
2799                                 kgbuffer,
2800                                 d_data,
2801                                 sd_DL_shadow,
2802                                 PathState_coop,
2803                                 LightRay_coop,
2804                                 AOLightRay_coop,
2805                                 Intersection_coop_AO,
2806                                 Intersection_coop_DL,
2807                                 ray_state,
2808                                 Queue_data,
2809                                 Queue_index,
2810                                 dQueue_size,
2811                                 total_num_rays);
2812
2813                 kernel_set_args(ckPathTraceKernel_next_iteration_setup,
2814                                 0,
2815                                 kgbuffer,
2816                                 d_data,
2817                                 sd,
2818                                 rng_coop,
2819                                 throughput_coop,
2820                                 PathRadiance_coop,
2821                                 Ray_coop,
2822                                 PathState_coop,
2823                                 LightRay_coop,
2824                                 ISLamp_coop,
2825                                 BSDFEval_coop,
2826                                 AOLightRay_coop,
2827                                 AOBSDF_coop,
2828                                 AOAlpha_coop,
2829                                 ray_state,
2830                                 Queue_data,
2831                                 Queue_index,
2832                                 dQueue_size,
2833                                 use_queues_flag);
2834
2835                 kernel_set_args(ckPathTraceKernel_sum_all_radiance,
2836                                 0,
2837                                 d_data,
2838                                 d_buffer,
2839                                 per_sample_output_buffers,
2840                                 num_parallel_samples,
2841                                 d_w,
2842                                 d_h,
2843                                 d_stride,
2844                                 rtile.buffer_offset_x,
2845                                 rtile.buffer_offset_y,
2846                                 rtile.buffer_rng_state_stride,
2847                                 start_sample);
2848
2849                 /* Macro for enqueuing split kernels. */
2850 #define GLUE(a, b) a ## b
2851 #define ENQUEUE_SPLIT_KERNEL(kernelName, globalSize, localSize) \
2852                 { \
2853                         ciErr = clEnqueueNDRangeKernel(cqCommandQueue, \
2854                                                        GLUE(ckPathTraceKernel_, \
2855                                                             kernelName), \
2856                                                        2, \
2857                                                        NULL, \
2858                                                        globalSize, \
2859                                                        localSize, \
2860                                                        0, \
2861                                                        NULL, \
2862                                                        NULL); \
2863                         opencl_assert_err(ciErr, "clEnqueueNDRangeKernel"); \
2864                         if(ciErr != CL_SUCCESS) { \
2865                                 string message = string_printf("OpenCL error: %s in clEnqueueNDRangeKernel()", \
2866                                                                clewErrorString(ciErr)); \
2867                                 opencl_error(message); \
2868                                 return; \
2869                         } \
2870                 } (void) 0
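                      /* GLUE token-pastes the kernel suffix onto ckPathTraceKernel_, and the
                       * trailing "(void) 0" lets the macro be invoked like a normal statement
                       * that requires a terminating semicolon, e.g.
                       *     ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size, local_size);
                       */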
2871
2872                 /* Enqueue ckPathTraceKernel_data_init kernel. */
2873                 ENQUEUE_SPLIT_KERNEL(data_init, global_size, local_size);
2874                 bool activeRaysAvailable = true;
2875
2876                         /* Record the number of times host intervention has been made. */
2877                 unsigned int numHostIntervention = 0;
2878                 unsigned int numNextPathIterTimes = PathIteration_times;
2879                 while(activeRaysAvailable) {
2880                         /* Twice the global work size of other kernels for
2881                          * ckPathTraceKernel_shadow_blocked. */
2882                         size_t global_size_shadow_blocked[2];
2883                         global_size_shadow_blocked[0] = global_size[0] * 2;
2884                         global_size_shadow_blocked[1] = global_size[1];
2885
2886                         /* Do path-iteration on the host: enqueue the path-iteration kernels. */
2887                         for(int PathIter = 0; PathIter < PathIteration_times; PathIter++) {
2888                                 ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size, local_size);
2889                                 ENQUEUE_SPLIT_KERNEL(lamp_emission, global_size, local_size);
2890                                 ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
2891                                 ENQUEUE_SPLIT_KERNEL(background_buffer_update, global_size, local_size);
2892                                 ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size);
2893                                 ENQUEUE_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao, global_size, local_size);
2894                                 ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size);
2895                                 ENQUEUE_SPLIT_KERNEL(shadow_blocked, global_size_shadow_blocked, local_size);
2896                                 ENQUEUE_SPLIT_KERNEL(next_iteration_setup, global_size, local_size);
2897                         }
2898
2899                         /* Read the ray state back into host memory to decide whether to
2900                          * exit path iteration on the host.
2901                          */
2902                         ciErr = clEnqueueReadBuffer(cqCommandQueue,
2903                                                     ray_state,
2904                                                     CL_TRUE,
2905                                                     0,
2906                                                     global_size[0] * global_size[1] * sizeof(char),
2907                                                     hostRayStateArray,
2908                                                     0,
2909                                                     NULL,
2910                                                     NULL);
2911                         assert(ciErr == CL_SUCCESS);
2912
2913                         activeRaysAvailable = false;
2914
2915                         for(int rayStateIter = 0;
2916                             rayStateIter < global_size[0] * global_size[1];
2917                             ++rayStateIter)
2918                         {
2919                                 if(int8_t(hostRayStateArray[rayStateIter]) != RAY_INACTIVE) {
2920                                         /* Not all rays are RAY_INACTIVE. */
2921                                         activeRaysAvailable = true;
2922                                         break;
2923                                 }
2924                         }
2925
2926                         if(activeRaysAvailable) {
2927                                 numHostIntervention++;
2928                                 PathIteration_times = PATH_ITER_INC_FACTOR;
2929                                 /* Host intervention was needed before all rays became RAY_INACTIVE;
2930                                  * do more initial iterations for the next tile.
2931                                  */
2932                                 numNextPathIterTimes += PATH_ITER_INC_FACTOR;
2933                         }
2934                 }
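                      /* The loop above enqueues PathIteration_times kernel passes per host
                       * round-trip. Whenever rays are still active after a round-trip it
                       * falls back to PATH_ITER_INC_FACTOR (8) passes per round-trip and
                       * raises the suggested initial count for the next tile; the check
                       * below handles the opposite case, where the initial count was
                       * already more than enough. */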
2935
2936                 /* Execute the sum_all_radiance kernel to accumulate radiance calculated in
2937                  * per_sample_output_buffers into RenderTile's output buffer.
2938                  */
2939                 size_t sum_all_radiance_local_size[2] = {16, 16};
2940                 size_t sum_all_radiance_global_size[2];
2941                 sum_all_radiance_global_size[0] =
2942                         (((d_w - 1) / sum_all_radiance_local_size[0]) + 1) *
2943                         sum_all_radiance_local_size[0];
2944                 sum_all_radiance_global_size[1] =
2945                         (((d_h - 1) / sum_all_radiance_local_size[1]) + 1) *
2946                         sum_all_radiance_local_size[1];
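                      /* Same round-up as for the path-trace kernels; e.g. a hypothetical
                       * 240x135 tile with 16x16 work-groups yields a 240x144 global size. */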
2947                 ENQUEUE_SPLIT_KERNEL(sum_all_radiance,
2948                                      sum_all_radiance_global_size,
2949                                      sum_all_radiance_local_size);
2950
2951 #undef ENQUEUE_SPLIT_KERNEL
2952 #undef GLUE
2953
2954                 if(numHostIntervention == 0) {
2955                         /* This means we are executing the kernel more times than required;
2956                          * avoid this for the next sample/tile.
2957                          */
2958                         PathIteration_times = ((numNextPathIterTimes - PATH_ITER_INC_FACTOR) <= 0) ?