Cycles: Move requested feature conversion to an own function
[blender.git] / intern / cycles / device / device_opencl.cpp
1 /*
2  * Copyright 2011-2013 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #ifdef WITH_OPENCL
18
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22
23 #include "clew.h"
24
25 #include "device.h"
26 #include "device_intern.h"
27
28 #include "buffers.h"
29
30 #include "util_foreach.h"
31 #include "util_logging.h"
32 #include "util_map.h"
33 #include "util_math.h"
34 #include "util_md5.h"
35 #include "util_opengl.h"
36 #include "util_path.h"
37 #include "util_time.h"
38
39 CCL_NAMESPACE_BEGIN
40
41 #define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p))
42
43 /* Macro declarations used with split kernel */
44
45 /* Macro to enable/disable work-stealing */
46 #define __WORK_STEALING__
47
48 #define SPLIT_KERNEL_LOCAL_SIZE_X 64
49 #define SPLIT_KERNEL_LOCAL_SIZE_Y 1
50
51 /* This value may be tuned according to the scene we are rendering.
52  *
53  * Modifying PATH_ITER_INC_FACTOR value proportional to number of expected
54  * ray-bounces will improve performance.
55  */
56 #define PATH_ITER_INC_FACTOR 8
57
/* We allocate global memory in chunks. We may not be able to
 * allocate exactly "CL_DEVICE_MAX_MEM_ALLOC_SIZE" bytes in chunks,
 * since some bytes may be needed for aligning chunks of memory.
 * This is the amount of memory that we dedicate for that purpose.
 */
63 #define DATA_ALLOCATION_MEM_FACTOR 5000000 //5MB
64
65 static cl_device_type opencl_device_type()
66 {
67         char *device = getenv("CYCLES_OPENCL_TEST");
68
69         if(device) {
70                 if(strcmp(device, "ALL") == 0)
71                         return CL_DEVICE_TYPE_ALL;
72                 else if(strcmp(device, "DEFAULT") == 0)
73                         return CL_DEVICE_TYPE_DEFAULT;
74                 else if(strcmp(device, "CPU") == 0)
75                         return CL_DEVICE_TYPE_CPU;
76                 else if(strcmp(device, "GPU") == 0)
77                         return CL_DEVICE_TYPE_GPU;
78                 else if(strcmp(device, "ACCELERATOR") == 0)
79                         return CL_DEVICE_TYPE_ACCELERATOR;
80         }
81
82         return CL_DEVICE_TYPE_ALL;
83 }
84
/* Whether preprocessed kernel source should be dumped for debugging,
 * controlled by the CYCLES_OPENCL_DEBUG environment variable. */
static bool opencl_kernel_use_debug()
{
	const char *flag = getenv("CYCLES_OPENCL_DEBUG");
	return flag != NULL;
}
89
90 static bool opencl_kernel_use_advanced_shading(const string& platform)
91 {
92         /* keep this in sync with kernel_types.h! */
93         if(platform == "NVIDIA CUDA")
94                 return true;
95         else if(platform == "Apple")
96                 return false;
97         else if(platform == "AMD Accelerated Parallel Processing")
98                 return true;
99         else if(platform == "Intel(R) OpenCL")
100                 return true;
101
102         return false;
103 }
104
105 /* thread safe cache for contexts and programs */
106 class OpenCLCache
107 {
108         struct Slot
109         {
110                 thread_mutex *mutex;
111                 cl_context context;
112                 /* cl_program for shader, bake, film_convert kernels (used in OpenCLDeviceBase) */
113                 cl_program ocl_dev_base_program;
114                 /* cl_program for megakernel (used in OpenCLDeviceMegaKernel) */
115                 cl_program ocl_dev_megakernel_program;
116
117                 Slot() : mutex(NULL), context(NULL), ocl_dev_base_program(NULL), ocl_dev_megakernel_program(NULL) {}
118
119                 Slot(const Slot &rhs)
120                         : mutex(rhs.mutex)
121                         , context(rhs.context)
122                         , ocl_dev_base_program(rhs.ocl_dev_base_program)
123                         , ocl_dev_megakernel_program(rhs.ocl_dev_megakernel_program)
124                 {
125                         /* copy can only happen in map insert, assert that */
126                         assert(mutex == NULL);
127                 }
128
129                 ~Slot()
130                 {
131                         delete mutex;
132                         mutex = NULL;
133                 }
134         };
135
136         /* key is combination of platform ID and device ID */
137         typedef pair<cl_platform_id, cl_device_id> PlatformDevicePair;
138
139         /* map of Slot objects */
140         typedef map<PlatformDevicePair, Slot> CacheMap;
141         CacheMap cache;
142
143         thread_mutex cache_lock;
144
145         /* lazy instantiate */
146         static OpenCLCache &global_instance()
147         {
148                 static OpenCLCache instance;
149                 return instance;
150         }
151
152         OpenCLCache()
153         {
154         }
155
156         ~OpenCLCache()
157         {
158                 /* Intel OpenCL bug raises SIGABRT due to pure virtual call
159                  * so this is disabled. It's not necessary to free objects
160                  * at process exit anyway.
161                  * http://software.intel.com/en-us/forums/topic/370083#comments */
162
163                 //flush();
164         }
165
166         /* lookup something in the cache. If this returns NULL, slot_locker
167          * will be holding a lock for the cache. slot_locker should refer to a
168          * default constructed thread_scoped_lock */
169         template<typename T>
170         static T get_something(cl_platform_id platform, cl_device_id device,
171                 T Slot::*member, thread_scoped_lock &slot_locker)
172         {
173                 assert(platform != NULL);
174
175                 OpenCLCache &self = global_instance();
176
177                 thread_scoped_lock cache_lock(self.cache_lock);
178
179                 pair<CacheMap::iterator,bool> ins = self.cache.insert(
180                         CacheMap::value_type(PlatformDevicePair(platform, device), Slot()));
181
182                 Slot &slot = ins.first->second;
183
184                 /* create slot lock only while holding cache lock */
185                 if(!slot.mutex)
186                         slot.mutex = new thread_mutex;
187
188                 /* need to unlock cache before locking slot, to allow store to complete */
189                 cache_lock.unlock();
190
191                 /* lock the slot */
192                 slot_locker = thread_scoped_lock(*slot.mutex);
193
194                 /* If the thing isn't cached */
195                 if(slot.*member == NULL) {
196                         /* return with the caller's lock holder holding the slot lock */
197                         return NULL;
198                 }
199
200                 /* the item was already cached, release the slot lock */
201                 slot_locker.unlock();
202
203                 return slot.*member;
204         }
205
206         /* store something in the cache. you MUST have tried to get the item before storing to it */
207         template<typename T>
208         static void store_something(cl_platform_id platform, cl_device_id device, T thing,
209                 T Slot::*member, thread_scoped_lock &slot_locker)
210         {
211                 assert(platform != NULL);
212                 assert(device != NULL);
213                 assert(thing != NULL);
214
215                 OpenCLCache &self = global_instance();
216
217                 thread_scoped_lock cache_lock(self.cache_lock);
218                 CacheMap::iterator i = self.cache.find(PlatformDevicePair(platform, device));
219                 cache_lock.unlock();
220
221                 Slot &slot = i->second;
222
223                 /* sanity check */
224                 assert(i != self.cache.end());
225                 assert(slot.*member == NULL);
226
227                 slot.*member = thing;
228
229                 /* unlock the slot */
230                 slot_locker.unlock();
231         }
232
233 public:
234
235         enum ProgramName {
236                 OCL_DEV_BASE_PROGRAM,
237                 OCL_DEV_MEGAKERNEL_PROGRAM,
238         };
239
240         /* see get_something comment */
241         static cl_context get_context(cl_platform_id platform, cl_device_id device,
242                 thread_scoped_lock &slot_locker)
243         {
244                 cl_context context = get_something<cl_context>(platform, device, &Slot::context, slot_locker);
245
246                 if(!context)
247                         return NULL;
248
249                 /* caller is going to release it when done with it, so retain it */
250                 cl_int ciErr = clRetainContext(context);
251                 assert(ciErr == CL_SUCCESS);
252                 (void)ciErr;
253
254                 return context;
255         }
256
257         /* see get_something comment */
258         static cl_program get_program(cl_platform_id platform, cl_device_id device, ProgramName program_name,
259                 thread_scoped_lock &slot_locker)
260         {
261                 cl_program program = NULL;
262
263                 if(program_name == OCL_DEV_BASE_PROGRAM) {
264                         /* Get program related to OpenCLDeviceBase */
265                         program = get_something<cl_program>(platform, device, &Slot::ocl_dev_base_program, slot_locker);
266                 }
267                 else if(program_name == OCL_DEV_MEGAKERNEL_PROGRAM) {
268                         /* Get program related to megakernel */
269                         program = get_something<cl_program>(platform, device, &Slot::ocl_dev_megakernel_program, slot_locker);
270                 } else {
271                         assert(!"Invalid program name");
272                 }
273
274                 if(!program)
275                         return NULL;
276
277                 /* caller is going to release it when done with it, so retain it */
278                 cl_int ciErr = clRetainProgram(program);
279                 assert(ciErr == CL_SUCCESS);
280                 (void)ciErr;
281
282                 return program;
283         }
284
285         /* see store_something comment */
286         static void store_context(cl_platform_id platform, cl_device_id device, cl_context context,
287                 thread_scoped_lock &slot_locker)
288         {
289                 store_something<cl_context>(platform, device, context, &Slot::context, slot_locker);
290
291                 /* increment reference count in OpenCL.
292                  * The caller is going to release the object when done with it. */
293                 cl_int ciErr = clRetainContext(context);
294                 assert(ciErr == CL_SUCCESS);
295                 (void)ciErr;
296         }
297
298         /* see store_something comment */
299         static void store_program(cl_platform_id platform, cl_device_id device, cl_program program, ProgramName program_name,
300                 thread_scoped_lock &slot_locker)
301         {
302                 if(program_name == OCL_DEV_BASE_PROGRAM) {
303                         store_something<cl_program>(platform, device, program, &Slot::ocl_dev_base_program, slot_locker);
304                 }
305                 else if(program_name == OCL_DEV_MEGAKERNEL_PROGRAM) {
306                         store_something<cl_program>(platform, device, program, &Slot::ocl_dev_megakernel_program, slot_locker);
307                 } else {
308                         assert(!"Invalid program name\n");
309                         return;
310                 }
311
312                 /* increment reference count in OpenCL.
313                  * The caller is going to release the object when done with it. */
314                 cl_int ciErr = clRetainProgram(program);
315                 assert(ciErr == CL_SUCCESS);
316                 (void)ciErr;
317         }
318
319         /* discard all cached contexts and programs
320          * the parameter is a temporary workaround. See OpenCLCache::~OpenCLCache */
321         static void flush()
322         {
323                 OpenCLCache &self = global_instance();
324                 thread_scoped_lock cache_lock(self.cache_lock);
325
326                 foreach(CacheMap::value_type &item, self.cache) {
327                         if(item.second.ocl_dev_base_program != NULL)
328                                 clReleaseProgram(item.second.ocl_dev_base_program);
329                         if(item.second.ocl_dev_megakernel_program != NULL)
330                                 clReleaseProgram(item.second.ocl_dev_megakernel_program);
331                         if(item.second.context != NULL)
332                                 clReleaseContext(item.second.context);
333                 }
334
335                 self.cache.clear();
336         }
337 };
338
339 class OpenCLDeviceBase : public Device
340 {
341 public:
342         DedicatedTaskPool task_pool;
343         cl_context cxContext;
344         cl_command_queue cqCommandQueue;
345         cl_platform_id cpPlatform;
346         cl_device_id cdDevice;
347         cl_program cpProgram;
348         cl_kernel ckFilmConvertByteKernel;
349         cl_kernel ckFilmConvertHalfFloatKernel;
350         cl_kernel ckShaderKernel;
351         cl_kernel ckBakeKernel;
352         cl_int ciErr;
353
354         typedef map<string, device_vector<uchar>*> ConstMemMap;
355         typedef map<string, device_ptr> MemMap;
356
357         ConstMemMap const_mem_map;
358         MemMap mem_map;
359         device_ptr null_mem;
360
361         bool device_initialized;
362         string platform_name;
363
364         bool opencl_error(cl_int err)
365         {
366                 if(err != CL_SUCCESS) {
367                         string message = string_printf("OpenCL error (%d): %s", err, clewErrorString(err));
368                         if(error_msg == "")
369                                 error_msg = message;
370                         fprintf(stderr, "%s\n", message.c_str());
371                         return true;
372                 }
373
374                 return false;
375         }
376
	/* Report a custom error message: prints it to stderr and records it as
	 * the device error if none was set yet (only the first error sticks). */
	void opencl_error(const string& message)
	{
		if(error_msg == "")
			error_msg = message;
		fprintf(stderr, "%s\n", message.c_str());
	}
383
/* Evaluate an OpenCL call and, on failure, record the first error message
 * on the device and print it to stderr. The braces plus trailing "(void)0"
 * make the macro behave like a single statement requiring a semicolon. */
#define opencl_assert(stmt) \
	{ \
		cl_int err = stmt; \
		\
		if(err != CL_SUCCESS) { \
			string message = string_printf("OpenCL error: %s in %s", clewErrorString(err), #stmt); \
			if(error_msg == "") \
				error_msg = message; \
			fprintf(stderr, "%s\n", message.c_str()); \
		} \
	} (void)0
395
	/* Check a status code returned through an output parameter (e.g. from
	 * clCreateBuffer). Records/prints the error like opencl_error() and,
	 * in debug builds, aborts so the failing call site ("where") can be
	 * caught in a debugger. */
	void opencl_assert_err(cl_int err, const char* where)
	{
		if(err != CL_SUCCESS) {
			string message = string_printf("OpenCL error (%d): %s in %s", err, clewErrorString(err), where);
			if(error_msg == "")
				error_msg = message;
			fprintf(stderr, "%s\n", message.c_str());
#ifndef NDEBUG
			abort();
#endif
		}
	}
408
	/* Locate the OpenCL platform/device pair selected by info.num, obtain a
	 * context for it (from the shared cache when possible) and create the
	 * command queue. On any failure an error is recorded and
	 * device_initialized remains false; load_kernels() checks that flag. */
	OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool background_)
	: Device(info, stats, background_)
	{
		cpPlatform = NULL;
		cdDevice = NULL;
		cxContext = NULL;
		cqCommandQueue = NULL;
		cpProgram = NULL;
		ckFilmConvertByteKernel = NULL;
		ckFilmConvertHalfFloatKernel = NULL;
		ckShaderKernel = NULL;
		ckBakeKernel = NULL;
		null_mem = 0;
		device_initialized = false;

		/* setup platform */
		cl_uint num_platforms;

		ciErr = clGetPlatformIDs(0, NULL, &num_platforms);
		if(opencl_error(ciErr))
			return;

		if(num_platforms == 0) {
			opencl_error("OpenCL: no platforms found.");
			return;
		}

		vector<cl_platform_id> platforms(num_platforms, NULL);

		ciErr = clGetPlatformIDs(num_platforms, &platforms[0], NULL);
		if(opencl_error(ciErr)) {
			fprintf(stderr, "clGetPlatformIDs failed \n");
			return;
		}

		/* info.num indexes devices across ALL platforms in enumeration
		 * order; num_base counts the devices of platforms already skipped. */
		int num_base = 0;
		int total_devices = 0;

		for(int platform = 0; platform < num_platforms; platform++) {
			cl_uint num_devices;

			if(opencl_error(clGetDeviceIDs(platforms[platform], opencl_device_type(), 0, NULL, &num_devices)))
				return;

			total_devices += num_devices;

			if(info.num - num_base >= num_devices) {
				/* num doesn't refer to a device in this platform */
				num_base += num_devices;
				continue;
			}

			/* device is in this platform */
			cpPlatform = platforms[platform];

			/* get devices */
			vector<cl_device_id> device_ids(num_devices, NULL);

			if(opencl_error(clGetDeviceIDs(cpPlatform, opencl_device_type(), num_devices, &device_ids[0], NULL))) {
				fprintf(stderr, "clGetDeviceIDs failed \n");
				return;
			}

			cdDevice = device_ids[info.num - num_base];

			char name[256];
			clGetPlatformInfo(cpPlatform, CL_PLATFORM_NAME, sizeof(name), &name, NULL);
			platform_name = name;

			break;
		}

		if(total_devices == 0) {
			opencl_error("OpenCL: no devices found.");
			return;
		}
		else if(!cdDevice) {
			opencl_error("OpenCL: specified device not found.");
			return;
		}

		{
			/* try to use cached context */
			thread_scoped_lock cache_locker;
			cxContext = OpenCLCache::get_context(cpPlatform, cdDevice, cache_locker);

			if(cxContext == NULL) {
				/* create context properties array to specify platform */
				const cl_context_properties context_props[] = {
					CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform,
					0, 0
				};

				/* create context */
				cxContext = clCreateContext(context_props, 1, &cdDevice,
					context_notify_callback, cdDevice, &ciErr);

				if(opencl_error(ciErr)) {
					opencl_error("OpenCL: clCreateContext failed");
					return;
				}

				/* cache it */
				OpenCLCache::store_context(cpPlatform, cdDevice, cxContext, cache_locker);
			}
		}

		cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr);
		if(opencl_error(ciErr))
			return;

		/* Dummy 1-byte buffer, used where a kernel argument needs a valid
		 * cl_mem but there is no actual data to pass. */
		null_mem = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_ONLY, 1, NULL, &ciErr);
		if(opencl_error(ciErr))
			return;

		fprintf(stderr, "Device init success\n");
		device_initialized = true;
	}
527
528         static void CL_CALLBACK context_notify_callback(const char *err_info,
529                 const void * /*private_info*/, size_t /*cb*/, void *user_data)
530         {
531                 char name[256];
532                 clGetDeviceInfo((cl_device_id)user_data, CL_DEVICE_NAME, sizeof(name), &name, NULL);
533
534                 fprintf(stderr, "OpenCL error (%s): %s\n", name, err_info);
535         }
536
537         bool opencl_version_check()
538         {
539                 char version[256];
540
541                 int major, minor, req_major = 1, req_minor = 1;
542
543                 clGetPlatformInfo(cpPlatform, CL_PLATFORM_VERSION, sizeof(version), &version, NULL);
544
545                 if(sscanf(version, "OpenCL %d.%d", &major, &minor) < 2) {
546                         opencl_error(string_printf("OpenCL: failed to parse platform version string (%s).", version));
547                         return false;
548                 }
549
550                 if(!((major == req_major && minor >= req_minor) || (major > req_major))) {
551                         opencl_error(string_printf("OpenCL: platform version 1.1 or later required, found %d.%d", major, minor));
552                         return false;
553                 }
554
555                 clGetDeviceInfo(cdDevice, CL_DEVICE_OPENCL_C_VERSION, sizeof(version), &version, NULL);
556
557                 if(sscanf(version, "OpenCL C %d.%d", &major, &minor) < 2) {
558                         opencl_error(string_printf("OpenCL: failed to parse OpenCL C version string (%s).", version));
559                         return false;
560                 }
561
562                 if(!((major == req_major && minor >= req_minor) || (major > req_major))) {
563                         opencl_error(string_printf("OpenCL: C version 1.1 or later required, found %d.%d", major, minor));
564                         return false;
565                 }
566
567                 return true;
568         }
569
570         bool load_binary(const string& /*kernel_path*/,
571                          const string& clbin,
572                          string custom_kernel_build_options,
573                          cl_program *program,
574                          const string *debug_src = NULL)
575         {
576                 /* read binary into memory */
577                 vector<uint8_t> binary;
578
579                 if(!path_read_binary(clbin, binary)) {
580                         opencl_error(string_printf("OpenCL failed to read cached binary %s.", clbin.c_str()));
581                         return false;
582                 }
583
584                 /* create program */
585                 cl_int status;
586                 size_t size = binary.size();
587                 const uint8_t *bytes = &binary[0];
588
589                 *program = clCreateProgramWithBinary(cxContext, 1, &cdDevice,
590                         &size, &bytes, &status, &ciErr);
591
592                 if(opencl_error(status) || opencl_error(ciErr)) {
593                         opencl_error(string_printf("OpenCL failed create program from cached binary %s.", clbin.c_str()));
594                         return false;
595                 }
596
597                 if(!build_kernel(program, custom_kernel_build_options, debug_src))
598                         return false;
599
600                 return true;
601         }
602
603         bool save_binary(cl_program *program, const string& clbin)
604         {
605                 size_t size = 0;
606                 clGetProgramInfo(*program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL);
607
608                 if(!size)
609                         return false;
610
611                 vector<uint8_t> binary(size);
612                 uint8_t *bytes = &binary[0];
613
614                 clGetProgramInfo(*program, CL_PROGRAM_BINARIES, sizeof(uint8_t*), &bytes, NULL);
615
616                 if(!path_write_binary(clbin, binary)) {
617                         opencl_error(string_printf("OpenCL failed to write cached binary %s.", clbin.c_str()));
618                         return false;
619                 }
620
621                 return true;
622         }
623
	/* Build an already-created program for this device and print the build
	 * log when it contains anything meaningful. Returns false when
	 * compilation failed; details will have been printed to the console. */
	bool build_kernel(cl_program *kernel_program,
	                  string custom_kernel_build_options,
	                  const string *debug_src = NULL)
	{
		string build_options;
		build_options = kernel_build_options(debug_src) + custom_kernel_build_options;

		ciErr = clBuildProgram(*kernel_program, 0, NULL, build_options.c_str(), NULL, NULL);

		/* show warnings even if build is successful */
		size_t ret_val_size = 0;

		clGetProgramBuildInfo(*kernel_program, cdDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);

		if(ret_val_size > 1) {
			vector<char> build_log(ret_val_size + 1);
			clGetProgramBuildInfo(*kernel_program, cdDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, &build_log[0], NULL);

			/* Terminate explicitly in case the driver did not. */
			build_log[ret_val_size] = '\0';
			/* Skip meaningless empty output from the NVidia compiler. */
			if(!(ret_val_size == 2 && build_log[0] == '\n')) {
				fprintf(stderr, "OpenCL kernel build output:\n");
				fprintf(stderr, "%s\n", &build_log[0]);
			}
		}

		if(ciErr != CL_SUCCESS) {
			opencl_error("OpenCL build failed: errors in console");
			return false;
		}

		return true;
	}
657
658         bool compile_kernel(const string& kernel_path,
659                             string source,
660                             string custom_kernel_build_options,
661                             cl_program *kernel_program,
662                             const string *debug_src = NULL)
663         {
664                 /* we compile kernels consisting of many files. unfortunately opencl
665                  * kernel caches do not seem to recognize changes in included files.
666                  * so we force recompile on changes by adding the md5 hash of all files */
667                 source = path_source_replace_includes(source, kernel_path);
668
669                 if(debug_src)
670                         path_write_text(*debug_src, source);
671
672                 size_t source_len = source.size();
673                 const char *source_str = source.c_str();
674
675                 *kernel_program = clCreateProgramWithSource(cxContext, 1, &source_str, &source_len, &ciErr);
676
677                 if(opencl_error(ciErr))
678                         return false;
679
680                 double starttime = time_dt();
681                 printf("Compiling OpenCL kernel ...\n");
682                 /* TODO(sergey): Report which kernel is being compiled
683                  * as well (megakernel or which of split kernels etc..).
684                  */
685                 printf("Build flags: %s\n", custom_kernel_build_options.c_str());
686
687                 if(!build_kernel(kernel_program, custom_kernel_build_options, debug_src))
688                         return false;
689
690                 printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);
691
692                 return true;
693         }
694
695         string device_md5_hash(string kernel_custom_build_options = "")
696         {
697                 MD5Hash md5;
698                 char version[256], driver[256], name[256], vendor[256];
699
700                 clGetPlatformInfo(cpPlatform, CL_PLATFORM_VENDOR, sizeof(vendor), &vendor, NULL);
701                 clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL);
702                 clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(name), &name, NULL);
703                 clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(driver), &driver, NULL);
704
705                 md5.append((uint8_t*)vendor, strlen(vendor));
706                 md5.append((uint8_t*)version, strlen(version));
707                 md5.append((uint8_t*)name, strlen(name));
708                 md5.append((uint8_t*)driver, strlen(driver));
709
710                 string options = kernel_build_options();
711                 options += kernel_custom_build_options;
712                 md5.append((uint8_t*)options.c_str(), options.size());
713
714                 return md5.get_hex();
715         }
716
	/* Load the base program — from the in-process cache, then from a cached
	 * on-disk binary, then by compiling from source — and create the film
	 * convert / shader / bake kernels from it.
	 * requested_features is unused here; subclasses use it to trim the
	 * kernel. Returns false on any failure (error already recorded). */
	bool load_kernels(const DeviceRequestedFeatures& /*requested_features*/)
	{
		/* verify if device was initialized */
		if(!device_initialized) {
			fprintf(stderr, "OpenCL: failed to initialize device.\n");
			return false;
		}

		/* try to use cached kernel */
		thread_scoped_lock cache_locker;
		cpProgram = OpenCLCache::get_program(cpPlatform, cdDevice, OpenCLCache::OCL_DEV_BASE_PROGRAM, cache_locker);

		if(!cpProgram) {
			/* verify we have right opencl version */
			if(!opencl_version_check())
				return false;

			/* md5 hash to detect changes */
			string kernel_path = path_get("kernel");
			string kernel_md5 = path_files_md5_hash(kernel_path);
			string device_md5 = device_md5_hash();

			/* path to cached binary */
			string clbin = string_printf("cycles_kernel_%s_%s.clbin", device_md5.c_str(), kernel_md5.c_str());
			clbin = path_user_get(path_join("cache", clbin));

			/* path to preprocessed source for debugging */
			string clsrc, *debug_src = NULL;

			if(opencl_kernel_use_debug()) {
				clsrc = string_printf("cycles_kernel_%s_%s.cl", device_md5.c_str(), kernel_md5.c_str());
				clsrc = path_user_get(path_join("cache", clsrc));
				debug_src = &clsrc;
			}

			/* if exists already, try use it */
			if(path_exists(clbin) && load_binary(kernel_path, clbin, "", &cpProgram)) {
				/* kernel loaded from binary */
			}
			else {
				/* The kernel_md5 comment forces the driver's own program
				 * cache to miss whenever any included kernel file changed. */
				string init_kernel_source = "#include \"kernels/opencl/kernel.cl\" // " + kernel_md5 + "\n";

				/* if does not exist or loading binary failed, compile kernel */
				if(!compile_kernel(kernel_path, init_kernel_source, "", &cpProgram, debug_src))
					return false;

				/* save binary for reuse */
				if(!save_binary(&cpProgram, clbin))
					return false;
			}

			/* cache the program */
			OpenCLCache::store_program(cpPlatform, cdDevice, cpProgram, OpenCLCache::OCL_DEV_BASE_PROGRAM, cache_locker);
		}

		/* find kernels */
		ckFilmConvertByteKernel = clCreateKernel(cpProgram, "kernel_ocl_convert_to_byte", &ciErr);
		if(opencl_error(ciErr))
			return false;

		ckFilmConvertHalfFloatKernel = clCreateKernel(cpProgram, "kernel_ocl_convert_to_half_float", &ciErr);
		if(opencl_error(ciErr))
			return false;

		ckShaderKernel = clCreateKernel(cpProgram, "kernel_ocl_shader", &ciErr);
		if(opencl_error(ciErr))
			return false;

		ckBakeKernel = clCreateKernel(cpProgram, "kernel_ocl_bake", &ciErr);
		if(opencl_error(ciErr))
			return false;

		return true;
	}
792
793         ~OpenCLDeviceBase()
794         {
795                 task_pool.stop();
796
797                 if(null_mem)
798                         clReleaseMemObject(CL_MEM_PTR(null_mem));
799
800                 ConstMemMap::iterator mt;
801                 for(mt = const_mem_map.begin(); mt != const_mem_map.end(); mt++) {
802                         mem_free(*(mt->second));
803                         delete mt->second;
804                 }
805
806                 if(ckFilmConvertByteKernel)
807                         clReleaseKernel(ckFilmConvertByteKernel);  
808                 if(ckFilmConvertHalfFloatKernel)
809                         clReleaseKernel(ckFilmConvertHalfFloatKernel);  
810                 if(ckShaderKernel)
811                         clReleaseKernel(ckShaderKernel);
812                 if(ckBakeKernel)
813                         clReleaseKernel(ckBakeKernel);
814                 if(cpProgram)
815                         clReleaseProgram(cpProgram);
816                 if(cqCommandQueue)
817                         clReleaseCommandQueue(cqCommandQueue);
818                 if(cxContext)
819                         clReleaseContext(cxContext);
820         }
821
822         void mem_alloc(device_memory& mem, MemoryType type)
823         {
824                 size_t size = mem.memory_size();
825
826                 cl_mem_flags mem_flag;
827                 void *mem_ptr = NULL;
828
829                 if(type == MEM_READ_ONLY)
830                         mem_flag = CL_MEM_READ_ONLY;
831                 else if(type == MEM_WRITE_ONLY)
832                         mem_flag = CL_MEM_WRITE_ONLY;
833                 else
834                         mem_flag = CL_MEM_READ_WRITE;
835
836                 mem.device_pointer = (device_ptr)clCreateBuffer(cxContext, mem_flag, size, mem_ptr, &ciErr);
837
838                 opencl_assert_err(ciErr, "clCreateBuffer");
839
840                 stats.mem_alloc(size);
841                 mem.device_size = size;
842         }
843
844         void mem_copy_to(device_memory& mem)
845         {
846                 /* this is blocking */
847                 size_t size = mem.memory_size();
848                 opencl_assert(clEnqueueWriteBuffer(cqCommandQueue, CL_MEM_PTR(mem.device_pointer), CL_TRUE, 0, size, (void*)mem.data_pointer, 0, NULL, NULL));
849         }
850
851         void mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
852         {
853                 size_t offset = elem*y*w;
854                 size_t size = elem*w*h;
855
856                 opencl_assert(clEnqueueReadBuffer(cqCommandQueue, CL_MEM_PTR(mem.device_pointer), CL_TRUE, offset, size, (uchar*)mem.data_pointer + offset, 0, NULL, NULL));
857         }
858
859         void mem_zero(device_memory& mem)
860         {
861                 if(mem.device_pointer) {
862                         memset((void*)mem.data_pointer, 0, mem.memory_size());
863                         mem_copy_to(mem);
864                 }
865         }
866
867         void mem_free(device_memory& mem)
868         {
869                 if(mem.device_pointer) {
870                         opencl_assert(clReleaseMemObject(CL_MEM_PTR(mem.device_pointer)));
871                         mem.device_pointer = 0;
872
873                         stats.mem_free(mem.device_size);
874                         mem.device_size = 0;
875                 }
876         }
877
878         void const_copy_to(const char *name, void *host, size_t size)
879         {
880                 ConstMemMap::iterator i = const_mem_map.find(name);
881
882                 if(i == const_mem_map.end()) {
883                         device_vector<uchar> *data = new device_vector<uchar>();
884                         data->copy((uchar*)host, size);
885
886                         mem_alloc(*data, MEM_READ_ONLY);
887                         i = const_mem_map.insert(ConstMemMap::value_type(name, data)).first;
888                 }
889                 else {
890                         device_vector<uchar> *data = i->second;
891                         data->copy((uchar*)host, size);
892                 }
893
894                 mem_copy_to(*i->second);
895         }
896
897         void tex_alloc(const char *name,
898                        device_memory& mem,
899                        InterpolationType /*interpolation*/,
900                        bool /*periodic*/)
901         {
902                 VLOG(1) << "Texture allocate: " << name << ", " << mem.memory_size() << " bytes.";
903                 mem_alloc(mem, MEM_READ_ONLY);
904                 mem_copy_to(mem);
905                 assert(mem_map.find(name) == mem_map.end());
906                 mem_map.insert(MemMap::value_type(name, mem.device_pointer));
907         }
908
909         void tex_free(device_memory& mem)
910         {
911                 if(mem.device_pointer) {
912                         foreach(const MemMap::value_type& value, mem_map) {
913                                 if(value.second == mem.device_pointer) {
914                                         mem_map.erase(value.first);
915                                         break;
916                                 }
917                         }
918
919                         mem_free(mem);
920                 }
921         }
922
923         size_t global_size_round_up(int group_size, int global_size)
924         {
925                 int r = global_size % group_size;
926                 return global_size + ((r == 0)? 0: group_size - r);
927         }
928
	/* Launch a 2D kernel over a w x h domain.
	 *
	 * The global size is rounded up to a multiple of a tentative local
	 * size derived from the kernel's maximum work-group size; the local
	 * size itself is passed as NULL so the driver picks one.
	 */
	void enqueue_kernel(cl_kernel kernel, size_t w, size_t h)
	{
		size_t workgroup_size, max_work_items[3];

		clGetKernelWorkGroupInfo(kernel, cdDevice,
			CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL);
		clGetDeviceInfo(cdDevice,
			CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, max_work_items, NULL);

		/* try to divide evenly over 2 dimensions */
		size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1);
		size_t local_size[2] = {sqrt_workgroup_size, sqrt_workgroup_size};

		/* some implementations have max size 1 on 2nd dimension */
		if(local_size[1] > max_work_items[1]) {
			local_size[0] = workgroup_size/max_work_items[1];
			local_size[1] = max_work_items[1];
		}

		size_t global_size[2] = {global_size_round_up(local_size[0], w), global_size_round_up(local_size[1], h)};

		/* run kernel; local work size is NULL, local_size above is only
		 * used for rounding up the global size. */
		opencl_assert(clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL));
		opencl_assert(clFlush(cqCommandQueue));
	}
954
955         void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name)
956         {
957                 cl_mem ptr;
958
959                 MemMap::iterator i = mem_map.find(name);
960                 if(i != mem_map.end()) {
961                         ptr = CL_MEM_PTR(i->second);
962                 }
963                 else {
964                         /* work around NULL not working, even though the spec says otherwise */
965                         ptr = CL_MEM_PTR(null_mem);
966                 }
967                 
968                 opencl_assert(clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void*)&ptr));
969         }
970
	/* Convert the rendered float buffer to displayable RGBA, using the
	 * byte kernel when rgba_byte is set and the half-float kernel
	 * otherwise.
	 */
	void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half)
	{
		/* cast arguments to cl types */
		cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
		/* Only one of the two output buffers is used per call. */
		cl_mem d_rgba = (rgba_byte)? CL_MEM_PTR(rgba_byte): CL_MEM_PTR(rgba_half);
		cl_mem d_buffer = CL_MEM_PTR(buffer);
		cl_int d_x = task.x;
		cl_int d_y = task.y;
		cl_int d_w = task.w;
		cl_int d_h = task.h;
		cl_float d_sample_scale = 1.0f/(task.sample + 1);
		cl_int d_offset = task.offset;
		cl_int d_stride = task.stride;

		cl_kernel ckFilmConvertKernel = (rgba_byte)? ckFilmConvertByteKernel: ckFilmConvertHalfFloatKernel;

		cl_uint start_arg_index =
			kernel_set_args(ckFilmConvertKernel,
			                0,
			                d_data,
			                d_rgba,
			                d_buffer);

		/* Bind every registered texture buffer as consecutive arguments,
		 * in the fixed order defined by kernel_textures.h. */
#define KERNEL_TEX(type, ttype, name) \
	set_kernel_arg_mem(ckFilmConvertKernel, &start_arg_index, #name);
#include "kernel_textures.h"
#undef KERNEL_TEX

		start_arg_index += kernel_set_args(ckFilmConvertKernel,
		                                   start_arg_index,
		                                   d_sample_scale,
		                                   d_x,
		                                   d_y,
		                                   d_w,
		                                   d_h,
		                                   d_offset,
		                                   d_stride);

		enqueue_kernel(ckFilmConvertKernel, d_w, d_h);
	}
1012
	/* Evaluate shaders (or bake) over a 1D range of input points,
	 * launching the kernel once per sample and updating task progress
	 * between launches.
	 */
	void shader(DeviceTask& task)
	{
		/* cast arguments to cl types */
		cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
		cl_mem d_input = CL_MEM_PTR(task.shader_input);
		cl_mem d_output = CL_MEM_PTR(task.shader_output);
		cl_int d_shader_eval_type = task.shader_eval_type;
		cl_int d_shader_x = task.shader_x;
		cl_int d_shader_w = task.shader_w;
		cl_int d_offset = task.offset;

		cl_kernel kernel;

		/* Baking has its own kernel entry point. */
		if(task.shader_eval_type >= SHADER_EVAL_BAKE)
			kernel = ckBakeKernel;
		else
			kernel = ckShaderKernel;

		for(int sample = 0; sample < task.num_samples; sample++) {

			if(task.get_cancel())
				break;

			cl_int d_sample = sample;

			cl_uint start_arg_index =
				kernel_set_args(kernel,
				                0,
				                d_data,
				                d_input,
				                d_output);

			/* Bind every registered texture buffer as consecutive arguments,
			 * in the fixed order defined by kernel_textures.h. */
#define KERNEL_TEX(type, ttype, name) \
		set_kernel_arg_mem(kernel, &start_arg_index, #name);
#include "kernel_textures.h"
#undef KERNEL_TEX

			start_arg_index += kernel_set_args(kernel,
			                                   start_arg_index,
			                                   d_shader_eval_type,
			                                   d_shader_x,
			                                   d_shader_w,
			                                   d_offset,
			                                   d_sample);

			enqueue_kernel(kernel, task.shader_w, 1);

			task.update_progress(NULL);
		}
	}
1063
	/* Device task which dispatches execution to the owning device's
	 * thread_run() when run by the task pool.
	 */
	class OpenCLDeviceTask : public DeviceTask {
	public:
		OpenCLDeviceTask(OpenCLDeviceBase *device, DeviceTask& task)
		: DeviceTask(task)
		{
			/* Bind the device-specific entry point for the task pool. */
			run = function_bind(&OpenCLDeviceBase::thread_run,
			                    device,
			                    this);
		}
	};
1074
	/* Non-split OpenCL devices execute a task as a single unit. */
	int get_split_task_count(DeviceTask& /*task*/)
	{
		return 1;
	}
1079
	/* Queue a task for asynchronous execution; the pool owns and
	 * deletes the created OpenCLDeviceTask. */
	void task_add(DeviceTask& task)
	{
		task_pool.push(new OpenCLDeviceTask(this, task));
	}
1084
	/* Block until all queued tasks have finished. */
	void task_wait()
	{
		task_pool.wait();
	}
1089
	/* Request cancellation of all queued tasks. */
	void task_cancel()
	{
		task_pool.cancel();
	}
1094
1095         virtual void thread_run(DeviceTask * /*task*/) = 0;
1096
1097 protected:
1098
	/* Build the OpenCL compiler options string, with platform-specific
	 * flags and optional debug defines.
	 *
	 * debug_src: path to the preprocessed kernel source, only used to
	 * enable gdb source-level debugging on the Intel CPU platform.
	 */
	string kernel_build_options(const string *debug_src = NULL)
	{
		string build_options = " -cl-fast-relaxed-math ";

		if(platform_name == "NVIDIA CUDA") {
			/* Cap register usage and request verbose compiler output. */
			build_options += "-D__KERNEL_OPENCL_NVIDIA__ "
			                 "-cl-nv-maxrregcount=32 "
			                 "-cl-nv-verbose ";

			uint compute_capability_major, compute_capability_minor;
			clGetDeviceInfo(cdDevice, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,
			                sizeof(cl_uint), &compute_capability_major, NULL);
			clGetDeviceInfo(cdDevice, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,
			                sizeof(cl_uint), &compute_capability_minor, NULL);

			/* Encodes e.g. capability 5.2 as 520. */
			build_options += string_printf("-D__COMPUTE_CAPABILITY__=%u ",
			                               compute_capability_major * 100 +
			                               compute_capability_minor * 10);
		}

		else if(platform_name == "Apple")
			build_options += "-D__KERNEL_OPENCL_APPLE__ ";

		else if(platform_name == "AMD Accelerated Parallel Processing")
			build_options += "-D__KERNEL_OPENCL_AMD__ ";

		else if(platform_name == "Intel(R) OpenCL") {
			build_options += "-D__KERNEL_OPENCL_INTEL_CPU__ ";

			/* Options for gdb source level kernel debugging.
			 * this segfaults on linux currently.
			 */
			if(opencl_kernel_use_debug() && debug_src)
				build_options += "-g -s \"" + *debug_src + "\" ";
		}

		if(opencl_kernel_use_debug())
			build_options += "-D__KERNEL_OPENCL_DEBUG__ ";

#ifdef WITH_CYCLES_DEBUG
		build_options += "-D__KERNEL_DEBUG__ ";
#endif

		return build_options;
	}
1144
	/* Type-erased kernel argument: remembers the size and address of
	 * any value so kernel_set_args() can forward it to clSetKernelArg.
	 * A default-constructed wrapper (NULL pointer) marks the end of
	 * the argument list.
	 */
	class ArgumentWrapper {
	public:
		ArgumentWrapper() : size(0), pointer(NULL) {}
		template <typename T>
		ArgumentWrapper(T& argument) : size(sizeof(argument)),
		                               pointer(&argument) { }
		/* Size of the wrapped value in bytes (0 if unused). */
		size_t size;
		/* Address of the wrapped value (NULL if unused). */
		void *pointer;
	};
1154
	/* TODO(sergey): In the future we can use variadic templates, once
	 * C++0x is allowed. Should allow to clean this up a bit.
	 *
	 * Set consecutive kernel arguments beginning at
	 * start_argument_index. Arguments are forwarded to clSetKernelArg
	 * until the first default-constructed ArgumentWrapper (NULL
	 * pointer) is hit; returns how many arguments were set, so callers
	 * can chain calls by advancing their own index.
	 */
	int kernel_set_args(cl_kernel kernel,
	                    int start_argument_index,
	                    const ArgumentWrapper& arg1 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg2 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg3 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg4 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg5 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg6 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg7 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg8 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg9 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg10 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg11 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg12 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg13 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg14 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg15 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg16 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg17 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg18 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg19 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg20 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg21 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg22 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg23 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg24 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg25 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg26 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg27 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg28 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg29 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg30 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg31 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg32 = ArgumentWrapper(),
	                    const ArgumentWrapper& arg33 = ArgumentWrapper())
	{
		int current_arg_index = 0;
		/* Set one argument, or return early at the first unused wrapper. */
#define FAKE_VARARG_HANDLE_ARG(arg) \
		do { \
			if(arg.pointer != NULL) { \
				opencl_assert(clSetKernelArg( \
					kernel, \
					start_argument_index + current_arg_index, \
					arg.size, arg.pointer)); \
				++current_arg_index; \
			} \
			else { \
				return current_arg_index; \
			} \
		} while(false)
		FAKE_VARARG_HANDLE_ARG(arg1);
		FAKE_VARARG_HANDLE_ARG(arg2);
		FAKE_VARARG_HANDLE_ARG(arg3);
		FAKE_VARARG_HANDLE_ARG(arg4);
		FAKE_VARARG_HANDLE_ARG(arg5);
		FAKE_VARARG_HANDLE_ARG(arg6);
		FAKE_VARARG_HANDLE_ARG(arg7);
		FAKE_VARARG_HANDLE_ARG(arg8);
		FAKE_VARARG_HANDLE_ARG(arg9);
		FAKE_VARARG_HANDLE_ARG(arg10);
		FAKE_VARARG_HANDLE_ARG(arg11);
		FAKE_VARARG_HANDLE_ARG(arg12);
		FAKE_VARARG_HANDLE_ARG(arg13);
		FAKE_VARARG_HANDLE_ARG(arg14);
		FAKE_VARARG_HANDLE_ARG(arg15);
		FAKE_VARARG_HANDLE_ARG(arg16);
		FAKE_VARARG_HANDLE_ARG(arg17);
		FAKE_VARARG_HANDLE_ARG(arg18);
		FAKE_VARARG_HANDLE_ARG(arg19);
		FAKE_VARARG_HANDLE_ARG(arg20);
		FAKE_VARARG_HANDLE_ARG(arg21);
		FAKE_VARARG_HANDLE_ARG(arg22);
		FAKE_VARARG_HANDLE_ARG(arg23);
		FAKE_VARARG_HANDLE_ARG(arg24);
		FAKE_VARARG_HANDLE_ARG(arg25);
		FAKE_VARARG_HANDLE_ARG(arg26);
		FAKE_VARARG_HANDLE_ARG(arg27);
		FAKE_VARARG_HANDLE_ARG(arg28);
		FAKE_VARARG_HANDLE_ARG(arg29);
		FAKE_VARARG_HANDLE_ARG(arg30);
		FAKE_VARARG_HANDLE_ARG(arg31);
		FAKE_VARARG_HANDLE_ARG(arg32);
		FAKE_VARARG_HANDLE_ARG(arg33);
#undef FAKE_VARARG_HANDLE_ARG
		return current_arg_index;
	}
1244
1245         inline void release_kernel_safe(cl_kernel kernel)
1246         {
1247                 if(kernel) {
1248                         clReleaseKernel(kernel);
1249                 }
1250         }
1251
1252         inline void release_mem_object_safe(cl_mem mem)
1253         {
1254                 if(mem != NULL) {
1255                         clReleaseMemObject(mem);
1256                 }
1257         }
1258
1259         inline void release_program_safe(cl_program program)
1260         {
1261                 if(program) {
1262                         clReleaseProgram(program);
1263                 }
1264         }
1265
1266         string build_options_from_requested_features(
1267                 const DeviceRequestedFeatures& requested_features)
1268         {
1269                 string build_options = "";
1270                 if(requested_features.experimental) {
1271                         build_options += " -D__KERNEL_EXPERIMENTAL__";
1272                 }
1273                 build_options += " -D__NODES_MAX_GROUP__=" +
1274                         string_printf("%d", requested_features.max_nodes_group);
1275                 build_options += " -D__NODES_FEATURES__=" +
1276                         string_printf("%d", requested_features.nodes_features);
1277                 build_options += string_printf(" -D__MAX_CLOSURE__=%d",
1278                                                requested_features.max_closure);
1279                 if(!requested_features.use_hair) {
1280                         build_options += " -D__NO_HAIR__";
1281                 }
1282                 if(!requested_features.use_object_motion) {
1283                         build_options += " -D__NO_OBJECT_MOTION__";
1284                 }
1285                 if(!requested_features.use_camera_motion) {
1286                         build_options += " -D__NO_CAMERA_MOTION__";
1287                 }
1288                 return build_options;
1289         }
1290 };
1291
1292 class OpenCLDeviceMegaKernel : public OpenCLDeviceBase
1293 {
1294 public:
1295         cl_kernel ckPathTraceKernel;
1296         cl_program path_trace_program;
1297
1298         OpenCLDeviceMegaKernel(DeviceInfo& info, Stats &stats, bool background_)
1299         : OpenCLDeviceBase(info, stats, background_)
1300         {
1301                 ckPathTraceKernel = NULL;
1302                 path_trace_program = NULL;
1303         }
1304
	/* Compile and load the mega-kernel path trace program on top of the
	 * base kernels (film convert, shader, bake).
	 *
	 * Lookup order: in-memory program cache, then on-disk binary keyed
	 * by device and kernel source md5 hashes, then a full source
	 * compile whose result is saved back to both caches.
	 */
	bool load_kernels(const DeviceRequestedFeatures& requested_features)
	{
		/* Get Shader, bake and film convert kernels.
		 * It'll also do verification of OpenCL actually initialized.
		 */
		if(!OpenCLDeviceBase::load_kernels(requested_features)) {
			return false;
		}

		/* Try to use cached kernel. */
		thread_scoped_lock cache_locker;
		path_trace_program = OpenCLCache::get_program(cpPlatform,
		                                              cdDevice,
		                                              OpenCLCache::OCL_DEV_MEGAKERNEL_PROGRAM,
		                                              cache_locker);

		if(!path_trace_program) {
			/* Verify we have right opencl version. */
			if(!opencl_version_check())
				return false;

			/* Calculate md5 hash to detect changes. */
			string kernel_path = path_get("kernel");
			string kernel_md5 = path_files_md5_hash(kernel_path);
			string custom_kernel_build_options = "-D__COMPILE_ONLY_MEGAKERNEL__ ";
			string device_md5 = device_md5_hash(custom_kernel_build_options);

			/* Path to cached binary. */
			string clbin = string_printf("cycles_kernel_%s_%s.clbin",
			                             device_md5.c_str(),
			                             kernel_md5.c_str());
			clbin = path_user_get(path_join("cache", clbin));

			/* Path to preprocessed source for debugging. */
			string clsrc, *debug_src = NULL;
			if(opencl_kernel_use_debug()) {
				clsrc = string_printf("cycles_kernel_%s_%s.cl",
				                      device_md5.c_str(),
				                      kernel_md5.c_str());
				clsrc = path_user_get(path_join("cache", clsrc));
				debug_src = &clsrc;
			}

			/* If exists already, try use it. */
			if(path_exists(clbin) && load_binary(kernel_path,
			                                     clbin,
			                                     custom_kernel_build_options,
			                                     &path_trace_program,
			                                     debug_src)) {
				/* Kernel loaded from binary, nothing to do. */
			}
			else {
				string init_kernel_source = "#include \"kernels/opencl/kernel.cl\" // " +
				                            kernel_md5 + "\n";
				/* If does not exist or loading binary failed, compile kernel. */
				if(!compile_kernel(kernel_path,
				                   init_kernel_source,
				                   custom_kernel_build_options,
				                   &path_trace_program,
				                   debug_src))
				{
					return false;
				}
				/* Save binary for reuse. */
				if(!save_binary(&path_trace_program, clbin)) {
					return false;
				}
			}
			/* Cache the program. */
			OpenCLCache::store_program(cpPlatform,
			                           cdDevice,
			                           path_trace_program,
			                           OpenCLCache::OCL_DEV_MEGAKERNEL_PROGRAM,
			                           cache_locker);
		}

		/* Find kernels. */
		ckPathTraceKernel = clCreateKernel(path_trace_program,
		                                   "kernel_ocl_path_trace",
		                                   &ciErr);
		if(opencl_error(ciErr))
			return false;
		return true;
	}
1389
	~OpenCLDeviceMegaKernel()
	{
		/* Stop queued tasks before releasing the program they use. */
		task_pool.stop();
		release_kernel_safe(ckPathTraceKernel);
		release_program_safe(path_trace_program);
	}
1396
	/* Launch the mega-kernel for one sample over the given tile. */
	void path_trace(RenderTile& rtile, int sample)
	{
		/* Cast arguments to cl types. */
		cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
		cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
		cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state);
		cl_int d_x = rtile.x;
		cl_int d_y = rtile.y;
		cl_int d_w = rtile.w;
		cl_int d_h = rtile.h;
		cl_int d_offset = rtile.offset;
		cl_int d_stride = rtile.stride;

		/* Sample arguments. */
		cl_int d_sample = sample;

		cl_uint start_arg_index =
			kernel_set_args(ckPathTraceKernel,
			                0,
			                d_data,
			                d_buffer,
			                d_rng_state);

		/* Bind every registered texture buffer as consecutive arguments,
		 * in the fixed order defined by kernel_textures.h. */
#define KERNEL_TEX(type, ttype, name) \
		set_kernel_arg_mem(ckPathTraceKernel, &start_arg_index, #name);
#include "kernel_textures.h"
#undef KERNEL_TEX

		start_arg_index += kernel_set_args(ckPathTraceKernel,
		                                   start_arg_index,
		                                   d_sample,
		                                   d_x,
		                                   d_y,
		                                   d_w,
		                                   d_h,
		                                   d_offset,
		                                   d_stride);

		enqueue_kernel(ckPathTraceKernel, d_w, d_h);
	}
1437
1438         void thread_run(DeviceTask *task)
1439         {
1440                 if(task->type == DeviceTask::FILM_CONVERT) {
1441                         film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half);
1442                 }
1443                 else if(task->type == DeviceTask::SHADER) {
1444                         shader(*task);
1445                 }
1446                 else if(task->type == DeviceTask::PATH_TRACE) {
1447                         RenderTile tile;
1448                         /* Keep rendering tiles until done. */
1449                         while(task->acquire_tile(this, tile)) {
1450                                 int start_sample = tile.start_sample;
1451                                 int end_sample = tile.start_sample + tile.num_samples;
1452
1453                                 for(int sample = start_sample; sample < end_sample; sample++) {
1454                                         if(task->get_cancel()) {
1455                                                 if(task->need_finish_queue == false)
1456                                                         break;
1457                                         }
1458
1459                                         path_trace(tile, sample);
1460
1461                                         tile.sample = sample + 1;
1462
1463                                         task->update_progress(&tile);
1464                                 }
1465
1466                                 /* Complete kernel execution before release tile */
1467                                 /* This helps in multi-device render;
1468                                  * The device that reaches the critical-section function
1469                                  * release_tile waits (stalling other devices from entering
1470                                  * release_tile) for all kernels to complete. If device1 (a
1471                                  * slow-render device) reaches release_tile first then it would
1472                                  * stall device2 (a fast-render device) from proceeding to render
1473                                  * next tile.
1474                                  */
1475                                 clFinish(cqCommandQueue);
1476
1477                                 task->release_tile(tile);
1478                         }
1479                 }
1480         }
1481 };
1482
1483 /* TODO(sergey): This is to keep tile split on OpenCL level working
1484  * for now, since without this view-port render does not work as it
1485  * should.
1486  *
1487  * Ideally it'll be done on the higher level, but we need to get ready
1488  * for merge rather soon, so let's keep split logic private here in
1489  * the file.
1490  */
1491 class SplitRenderTile : public RenderTile {
1492 public:
1493         SplitRenderTile()
1494                 : RenderTile(),
1495                   buffer_offset_x(0),
1496                   buffer_offset_y(0),
1497                   rng_state_offset_x(0),
1498                   rng_state_offset_y(0),
1499                   buffer_rng_state_stride(0) {}
1500
1501         explicit SplitRenderTile(RenderTile& tile)
1502                 : RenderTile(),
1503                   buffer_offset_x(0),
1504                   buffer_offset_y(0),
1505                   rng_state_offset_x(0),
1506                   rng_state_offset_y(0),
1507                   buffer_rng_state_stride(0)
1508         {
1509                 x = tile.x;
1510                 y = tile.y;
1511                 w = tile.w;
1512                 h = tile.h;
1513                 start_sample = tile.start_sample;
1514                 num_samples = tile.num_samples;
1515                 sample = tile.sample;
1516                 resolution = tile.resolution;
1517                 offset = tile.offset;
1518                 stride = tile.stride;
1519                 buffer = tile.buffer;
1520                 rng_state = tile.rng_state;
1521                 buffers = tile.buffers;
1522         }
1523
1524         /* Split kernel is device global memory constrained;
1525          * hence split kernel cant render big tile size's in
1526          * one go. If the user sets a big tile size (big tile size
1527          * is a term relative to the available device global memory),
1528          * we split the tile further and then call path_trace on
1529          * each of those split tiles. The following variables declared,
1530          * assist in achieving that purpose
1531          */
1532         int buffer_offset_x;
1533         int buffer_offset_y;
1534         int rng_state_offset_x;
1535         int rng_state_offset_y;
1536         int buffer_rng_state_stride;
1537 };
1538
1539 /* OpenCLDeviceSplitKernel's declaration/definition. */
1540 class OpenCLDeviceSplitKernel : public OpenCLDeviceBase
1541 {
1542 public:
	/* Kernel declaration — one cl_kernel per split-kernel pass. */
	cl_kernel ckPathTraceKernel_data_init;
	cl_kernel ckPathTraceKernel_scene_intersect;
	cl_kernel ckPathTraceKernel_lamp_emission;
	cl_kernel ckPathTraceKernel_queue_enqueue;
	cl_kernel ckPathTraceKernel_background_buffer_update;
	cl_kernel ckPathTraceKernel_shader_eval;
	cl_kernel ckPathTraceKernel_holdout_emission_blurring_pathtermination_ao;
	cl_kernel ckPathTraceKernel_direct_lighting;
	cl_kernel ckPathTraceKernel_shadow_blocked;
	cl_kernel ckPathTraceKernel_next_iteration_setup;
	cl_kernel ckPathTraceKernel_sum_all_radiance;

	/* cl_program declaration — each pass is compiled as its own program. */
	cl_program data_init_program;
	cl_program scene_intersect_program;
	cl_program lamp_emission_program;
	cl_program queue_enqueue_program;
	cl_program background_buffer_update_program;
	cl_program shader_eval_program;
	cl_program holdout_emission_blurring_pathtermination_ao_program;
	cl_program direct_lighting_program;
	cl_program shadow_blocked_program;
	cl_program next_iteration_setup_program;
	cl_program sum_all_radiance_program;

	/* Global memory variables [porting]; this memory is used for
	 * co-operation between different kernels; data written by one
	 * kernel is available to another kernel via this global
	 * memory.
	 */
	cl_mem rng_coop;
	cl_mem throughput_coop;
	cl_mem L_transparent_coop;
	cl_mem PathRadiance_coop;
	cl_mem Ray_coop;
	cl_mem PathState_coop;
	cl_mem Intersection_coop;
	cl_mem kgbuffer;  /* KernelGlobals buffer. */

	/* Global buffers for ShaderData. */
	cl_mem sd;             /* ShaderData used in the main path-iteration loop. */
	cl_mem sd_DL_shadow;   /* ShaderData used in Direct Lighting and
	                        * shadow_blocked kernel.
	                        */

	/* Global buffers of each member of ShaderData (structure-of-arrays
	 * layout; the *_DL_shadow variant belongs to sd_DL_shadow).
	 */
	cl_mem P_sd;
	cl_mem P_sd_DL_shadow;
	cl_mem N_sd;
	cl_mem N_sd_DL_shadow;
	cl_mem Ng_sd;
	cl_mem Ng_sd_DL_shadow;
	cl_mem I_sd;
	cl_mem I_sd_DL_shadow;
	cl_mem shader_sd;
	cl_mem shader_sd_DL_shadow;
	cl_mem flag_sd;
	cl_mem flag_sd_DL_shadow;
	cl_mem prim_sd;
	cl_mem prim_sd_DL_shadow;
	cl_mem type_sd;
	cl_mem type_sd_DL_shadow;
	cl_mem u_sd;
	cl_mem u_sd_DL_shadow;
	cl_mem v_sd;
	cl_mem v_sd_DL_shadow;
	cl_mem object_sd;
	cl_mem object_sd_DL_shadow;
	cl_mem time_sd;
	cl_mem time_sd_DL_shadow;
	cl_mem ray_length_sd;
	cl_mem ray_length_sd_DL_shadow;
	cl_mem ray_depth_sd;
	cl_mem ray_depth_sd_DL_shadow;
	cl_mem transparent_depth_sd;
	cl_mem transparent_depth_sd_DL_shadow;

	/* Ray differentials. */
	cl_mem dP_sd, dI_sd;
	cl_mem dP_sd_DL_shadow, dI_sd_DL_shadow;
	cl_mem du_sd, dv_sd;
	cl_mem du_sd_DL_shadow, dv_sd_DL_shadow;

	/* Dp/Du */
	cl_mem dPdu_sd, dPdv_sd;
	cl_mem dPdu_sd_DL_shadow, dPdv_sd_DL_shadow;

	/* Object motion. */
	cl_mem ob_tfm_sd, ob_itfm_sd;
	cl_mem ob_tfm_sd_DL_shadow, ob_itfm_sd_DL_shadow;

	cl_mem closure_sd;
	cl_mem closure_sd_DL_shadow;
	cl_mem num_closure_sd;
	cl_mem num_closure_sd_DL_shadow;
	cl_mem randb_closure_sd;
	cl_mem randb_closure_sd_DL_shadow;
	cl_mem ray_P_sd;
	cl_mem ray_P_sd_DL_shadow;
	cl_mem ray_dP_sd;
	cl_mem ray_dP_sd_DL_shadow;

	/* Global memory required for shadow blocked and accum_radiance. */
	cl_mem BSDFEval_coop;
	cl_mem ISLamp_coop;
	cl_mem LightRay_coop;
	cl_mem AOAlpha_coop;
	cl_mem AOBSDF_coop;
	cl_mem AOLightRay_coop;
	cl_mem Intersection_coop_AO;
	cl_mem Intersection_coop_DL;

#ifdef WITH_CYCLES_DEBUG
	/* DebugData memory */
	cl_mem debugdata_coop;
#endif

	/* Global state array that tracks ray state. */
	cl_mem ray_state;

	/* Per sample buffers. */
	cl_mem per_sample_output_buffers;

	/* Denotes which sample each ray is being processed for. */
	cl_mem work_array;

	/* Queue */
	cl_mem Queue_data;  /* Array of size queuesize * num_queues * sizeof(int). */
	cl_mem Queue_index; /* Array of size num_queues * sizeof(int);
	                     * Tracks the size of each queue.
	                     */

	/* Flag to make sceneintersect and lampemission kernel use queues. */
	cl_mem use_queues_flag;

	/* Amount of memory in output buffer associated with one pixel/thread. */
	size_t per_thread_output_buffer_size;

	/* Total allocatable available device memory. */
	size_t total_allocatable_memory;

	/* Host version of ray_state; used in checking host path-iteration
	 * termination.
	 */
	char *hostRayStateArray;

	/* Number of path-iterations to be done in one shot. */
	unsigned int PathIteration_times;

#ifdef __WORK_STEALING__
	/* Work pool with respect to each work group. */
	cl_mem work_pool_wgs;

	/* Denotes the maximum work groups possible w.r.t. current tile size. */
	unsigned int max_work_groups;
#endif

	/* clos_max value for which the kernels have been loaded currently. */
	int current_max_closure;

	/* Marked true in constructor and marked false at the end of path_trace(). */
	bool first_tile;
1706
	/* Constructor: NULL-initializes every kernel/program/buffer handle so
	 * the destructor's *_safe release helpers can run unconditionally,
	 * then queries how much device memory a single allocation may use.
	 */
	OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_)
	: OpenCLDeviceBase(info, stats, background_)
	{
		background = background_;

		/* Initialize kernels. */
		ckPathTraceKernel_data_init = NULL;
		ckPathTraceKernel_scene_intersect = NULL;
		ckPathTraceKernel_lamp_emission = NULL;
		ckPathTraceKernel_background_buffer_update = NULL;
		ckPathTraceKernel_shader_eval = NULL;
		ckPathTraceKernel_holdout_emission_blurring_pathtermination_ao = NULL;
		ckPathTraceKernel_direct_lighting = NULL;
		ckPathTraceKernel_shadow_blocked = NULL;
		ckPathTraceKernel_next_iteration_setup = NULL;
		ckPathTraceKernel_sum_all_radiance = NULL;
		ckPathTraceKernel_queue_enqueue = NULL;

		/* Initialize program. */
		data_init_program = NULL;
		scene_intersect_program = NULL;
		lamp_emission_program = NULL;
		queue_enqueue_program = NULL;
		background_buffer_update_program = NULL;
		shader_eval_program = NULL;
		holdout_emission_blurring_pathtermination_ao_program = NULL;
		direct_lighting_program = NULL;
		shadow_blocked_program = NULL;
		next_iteration_setup_program = NULL;
		sum_all_radiance_program = NULL;

		/* Initialize cl_mem variables. */
		kgbuffer = NULL;
		sd = NULL;
		sd_DL_shadow = NULL;

		P_sd = NULL;
		P_sd_DL_shadow = NULL;
		N_sd = NULL;
		N_sd_DL_shadow = NULL;
		Ng_sd = NULL;
		Ng_sd_DL_shadow = NULL;
		I_sd = NULL;
		I_sd_DL_shadow = NULL;
		shader_sd = NULL;
		shader_sd_DL_shadow = NULL;
		flag_sd = NULL;
		flag_sd_DL_shadow = NULL;
		prim_sd = NULL;
		prim_sd_DL_shadow = NULL;
		type_sd = NULL;
		type_sd_DL_shadow = NULL;
		u_sd = NULL;
		u_sd_DL_shadow = NULL;
		v_sd = NULL;
		v_sd_DL_shadow = NULL;
		object_sd = NULL;
		object_sd_DL_shadow = NULL;
		time_sd = NULL;
		time_sd_DL_shadow = NULL;
		ray_length_sd = NULL;
		ray_length_sd_DL_shadow = NULL;
		ray_depth_sd = NULL;
		ray_depth_sd_DL_shadow = NULL;
		transparent_depth_sd = NULL;
		transparent_depth_sd_DL_shadow = NULL;

		/* Ray differentials. */
		dP_sd = NULL;
		dI_sd = NULL;
		dP_sd_DL_shadow = NULL;
		dI_sd_DL_shadow = NULL;
		du_sd = NULL;
		dv_sd = NULL;
		du_sd_DL_shadow = NULL;
		dv_sd_DL_shadow = NULL;

		/* Dp/Du */
		dPdu_sd = NULL;
		dPdv_sd = NULL;
		dPdu_sd_DL_shadow = NULL;
		dPdv_sd_DL_shadow = NULL;

		/* Object motion. */
		ob_tfm_sd = NULL;
		ob_itfm_sd = NULL;
		ob_tfm_sd_DL_shadow = NULL;
		ob_itfm_sd_DL_shadow = NULL;

		closure_sd = NULL;
		closure_sd_DL_shadow = NULL;
		num_closure_sd = NULL;
		num_closure_sd_DL_shadow = NULL;
		randb_closure_sd = NULL;
		randb_closure_sd_DL_shadow = NULL;
		ray_P_sd = NULL;
		ray_P_sd_DL_shadow = NULL;
		ray_dP_sd = NULL;
		ray_dP_sd_DL_shadow = NULL;

		rng_coop = NULL;
		throughput_coop = NULL;
		L_transparent_coop = NULL;
		PathRadiance_coop = NULL;
		Ray_coop = NULL;
		PathState_coop = NULL;
		Intersection_coop = NULL;
		ray_state = NULL;

		AOAlpha_coop = NULL;
		AOBSDF_coop = NULL;
		AOLightRay_coop = NULL;
		BSDFEval_coop = NULL;
		ISLamp_coop = NULL;
		LightRay_coop = NULL;
		Intersection_coop_AO = NULL;
		Intersection_coop_DL = NULL;

#ifdef WITH_CYCLES_DEBUG
		debugdata_coop = NULL;
#endif

		work_array = NULL;

		/* Queue. */
		Queue_data = NULL;
		Queue_index = NULL;
		use_queues_flag = NULL;

		per_sample_output_buffers = NULL;

		per_thread_output_buffer_size = 0;
		hostRayStateArray = NULL;
		PathIteration_times = PATH_ITER_INC_FACTOR;
#ifdef __WORK_STEALING__
		work_pool_wgs = NULL;
		max_work_groups = 0;
#endif
		current_max_closure = -1;
		first_tile = true;

		/* Get device's maximum memory that can be allocated. */
		ciErr = clGetDeviceInfo(cdDevice,
		                        CL_DEVICE_MAX_MEM_ALLOC_SIZE,
		                        sizeof(size_t),
		                        &total_allocatable_memory,
		                        NULL);
		assert(ciErr == CL_SUCCESS);
		if(platform_name == "AMD Accelerated Parallel Processing") {
			/* This value is tweak-able; AMD platform does not seem to
			 * give maximum performance when all of CL_DEVICE_MAX_MEM_ALLOC_SIZE
			 * is considered for further computation.
			 */
			total_allocatable_memory /= 2;
		}
	}
1863
1864         /* TODO(sergey): Seems really close to load_kernel(),
1865          * could it be de-duplicated?
1866          */
1867         bool load_split_kernel(string kernel_path,
1868                                string kernel_init_source,
1869                                string clbin,
1870                                string custom_kernel_build_options,
1871                                cl_program *program,
1872                                const string *debug_src = NULL)
1873         {
1874                 if(!opencl_version_check())
1875                         return false;
1876
1877                 clbin = path_user_get(path_join("cache", clbin));
1878
1879                 /* If exists already, try use it. */
1880                 if(path_exists(clbin) && load_binary(kernel_path,
1881                                                      clbin,
1882                                                      custom_kernel_build_options,
1883                                                      program,
1884                                                      debug_src)) {
1885                         /* Kernel loaded from binary. */
1886                 }
1887                 else {
1888                         /* If does not exist or loading binary failed, compile kernel. */
1889                         if(!compile_kernel(kernel_path,
1890                                            kernel_init_source,
1891                                            custom_kernel_build_options,
1892                                            program,
1893                                            debug_src))
1894                         {
1895                                 return false;
1896                         }
1897                         /* Save binary for reuse. */
1898                         if(!save_binary(program, clbin)) {
1899                                 return false;
1900                         }
1901                 }
1902                 return true;
1903         }
1904
1905         /* Split kernel utility functions. */
1906         size_t get_tex_size(const char *tex_name)
1907         {
1908                 cl_mem ptr;
1909                 size_t ret_size = 0;
1910                 MemMap::iterator i = mem_map.find(tex_name);
1911                 if(i != mem_map.end()) {
1912                         ptr = CL_MEM_PTR(i->second);
1913                         ciErr = clGetMemObjectInfo(ptr,
1914                                                    CL_MEM_SIZE,
1915                                                    sizeof(ret_size),
1916                                                    &ret_size,
1917                                                    NULL);
1918                         assert(ciErr == CL_SUCCESS);
1919                 }
1920                 return ret_size;
1921         }
1922
1923         size_t get_shader_closure_size(int max_closure)
1924         {
1925                 return (sizeof(ShaderClosure) * max_closure);
1926         }
1927
1928         size_t get_shader_data_size(size_t shader_closure_size)
1929         {
1930                 /* ShaderData size without accounting for ShaderClosure array. */
1931                 size_t shader_data_size =
1932                         sizeof(ShaderData) - (sizeof(ShaderClosure) * MAX_CLOSURE);
1933                 return (shader_data_size + shader_closure_size);
1934         }
1935
	/* Returns size of KernelGlobals structure associated with OpenCL. */
	size_t get_KernelGlobals_size()
	{
		/* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to
		 * fetch its size: one constant data pointer plus one global pointer
		 * per registered texture. Must stay in sync with kernel_globals.h.
		 */
		typedef struct KernelGlobals {
			ccl_constant KernelData *data;
#define KERNEL_TEX(type, ttype, name) \
	ccl_global type *name;
#include "kernel_textures.h"
#undef KERNEL_TEX
		} KernelGlobals;

		return sizeof(KernelGlobals);
	}
1952
	/* Returns size of the structure-of-arrays implementation of ShaderData:
	 * one pointer per SD_VAR / SD_CLOSURE_VAR entry in
	 * kernel_shaderdata_vars.h.
	 */
	size_t get_shaderdata_soa_size()
	{
		size_t shader_soa_size = 0;

#define SD_VAR(type, what) shader_soa_size += sizeof(void *);
#define SD_CLOSURE_VAR(type, what, max_closure) shader_soa_size += sizeof(void *);
		#include "kernel_shaderdata_vars.h"
#undef SD_VAR
#undef SD_CLOSURE_VAR

		return shader_soa_size;
	}
1966
1967         bool load_kernels(const DeviceRequestedFeatures& requested_features)
1968         {
1969                 /* Get Shader, bake and film_convert kernels.
1970                  * It'll also do verification of OpenCL actually initialized.
1971                  */
1972                 if(!OpenCLDeviceBase::load_kernels(requested_features)) {
1973                         return false;
1974                 }
1975
1976                 string kernel_path = path_get("kernel");
1977                 string kernel_md5 = path_files_md5_hash(kernel_path);
1978                 string device_md5;
1979                 string kernel_init_source;
1980                 string clbin;
1981                 string clsrc, *debug_src = NULL;
1982
1983                 string build_options = "-D__SPLIT_KERNEL__";
1984 #ifdef __WORK_STEALING__
1985                 build_options += " -D__WORK_STEALING__";
1986 #endif
1987                 build_options += build_options_from_requested_features(requested_features);
1988
1989                 /* Set compute device build option. */
1990                 cl_device_type device_type;
1991                 ciErr = clGetDeviceInfo(cdDevice,
1992                                         CL_DEVICE_TYPE,
1993                                         sizeof(cl_device_type),
1994                                         &device_type,
1995                                         NULL);
1996                 assert(ciErr == CL_SUCCESS);
1997                 if(device_type == CL_DEVICE_TYPE_GPU) {
1998                         build_options += " -D__COMPUTE_DEVICE_GPU__";
1999                 }
2000
2001 #define GLUE(a, b) a ## b
2002 #define LOAD_KERNEL(name) \
2003         do { \
2004                 kernel_init_source = "#include \"kernels/opencl/kernel_" #name ".cl\" // " + \
2005                                      kernel_md5 + "\n"; \
2006                 device_md5 = device_md5_hash(build_options); \
2007                 clbin = string_printf("cycles_kernel_%s_%s_" #name ".clbin", \
2008                                       device_md5.c_str(), kernel_md5.c_str()); \
2009                 if(opencl_kernel_use_debug()) { \
2010                         clsrc = string_printf("cycles_kernel_%s_%s_" #name ".cl", \
2011                                               device_md5.c_str(), kernel_md5.c_str()); \
2012                         clsrc = path_user_get(path_join("cache", clsrc)); \
2013                         debug_src = &clsrc; \
2014                 } \
2015                 if(!load_split_kernel(kernel_path, kernel_init_source, clbin, \
2016                                       build_options, \
2017                                       &GLUE(name, _program), \
2018                                       debug_src)) \
2019                 { \
2020                         fprintf(stderr, "Faled to compile %s\n", #name); \
2021                         return false; \
2022                 } \
2023         } while(false)
2024
2025                 LOAD_KERNEL(data_init);
2026                 LOAD_KERNEL(scene_intersect);
2027                 LOAD_KERNEL(lamp_emission);
2028                 LOAD_KERNEL(queue_enqueue);
2029                 LOAD_KERNEL(background_buffer_update);
2030                 LOAD_KERNEL(shader_eval);
2031                 LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao);
2032                 LOAD_KERNEL(direct_lighting);
2033                 LOAD_KERNEL(shadow_blocked);
2034                 LOAD_KERNEL(next_iteration_setup);
2035                 LOAD_KERNEL(sum_all_radiance);
2036
2037 #undef LOAD_KERNEL
2038
2039 #define FIND_KERNEL(name) \
2040         do { \
2041                 GLUE(ckPathTraceKernel_, name) = \
2042                         clCreateKernel(GLUE(name, _program), \
2043                                        "kernel_ocl_path_trace_"  #name, &ciErr); \
2044                 if(opencl_error(ciErr)) { \
2045                         fprintf(stderr,"Missing kernel kernel_ocl_path_trace_%s\n", #name); \
2046                         return false; \
2047                 } \
2048         } while(false)
2049
2050                 FIND_KERNEL(data_init);
2051                 FIND_KERNEL(scene_intersect);
2052                 FIND_KERNEL(lamp_emission);
2053                 FIND_KERNEL(queue_enqueue);
2054                 FIND_KERNEL(background_buffer_update);
2055                 FIND_KERNEL(shader_eval);
2056                 FIND_KERNEL(holdout_emission_blurring_pathtermination_ao);
2057                 FIND_KERNEL(direct_lighting);
2058                 FIND_KERNEL(shadow_blocked);
2059                 FIND_KERNEL(next_iteration_setup);
2060                 FIND_KERNEL(sum_all_radiance);
2061 #undef FIND_KERNEL
2062 #undef GLUE
2063
2064                 current_max_closure = requested_features.max_closure;
2065
2066                 return true;
2067         }
2068
2069         ~OpenCLDeviceSplitKernel()
2070         {
2071                 task_pool.stop();
2072
2073                 /* Release kernels */
2074                 release_kernel_safe(ckPathTraceKernel_data_init);
2075                 release_kernel_safe(ckPathTraceKernel_scene_intersect);
2076                 release_kernel_safe(ckPathTraceKernel_lamp_emission);
2077                 release_kernel_safe(ckPathTraceKernel_queue_enqueue);
2078                 release_kernel_safe(ckPathTraceKernel_background_buffer_update);
2079                 release_kernel_safe(ckPathTraceKernel_shader_eval);
2080                 release_kernel_safe(ckPathTraceKernel_holdout_emission_blurring_pathtermination_ao);
2081                 release_kernel_safe(ckPathTraceKernel_direct_lighting);
2082                 release_kernel_safe(ckPathTraceKernel_shadow_blocked);
2083                 release_kernel_safe(ckPathTraceKernel_next_iteration_setup);
2084                 release_kernel_safe(ckPathTraceKernel_sum_all_radiance);
2085
2086                 /* Release global memory */
2087                 release_mem_object_safe(P_sd);
2088                 release_mem_object_safe(P_sd_DL_shadow);
2089                 release_mem_object_safe(N_sd);
2090                 release_mem_object_safe(N_sd_DL_shadow);
2091                 release_mem_object_safe(Ng_sd);
2092                 release_mem_object_safe(Ng_sd_DL_shadow);
2093                 release_mem_object_safe(I_sd);
2094                 release_mem_object_safe(I_sd_DL_shadow);
2095                 release_mem_object_safe(shader_sd);
2096                 release_mem_object_safe(shader_sd_DL_shadow);
2097                 release_mem_object_safe(flag_sd);
2098                 release_mem_object_safe(flag_sd_DL_shadow);
2099                 release_mem_object_safe(prim_sd);
2100                 release_mem_object_safe(prim_sd_DL_shadow);
2101                 release_mem_object_safe(type_sd);
2102                 release_mem_object_safe(type_sd_DL_shadow);
2103                 release_mem_object_safe(u_sd);
2104                 release_mem_object_safe(u_sd_DL_shadow);
2105                 release_mem_object_safe(v_sd);
2106                 release_mem_object_safe(v_sd_DL_shadow);
2107                 release_mem_object_safe(object_sd);
2108                 release_mem_object_safe(object_sd_DL_shadow);
2109                 release_mem_object_safe(time_sd);
2110                 release_mem_object_safe(time_sd_DL_shadow);
2111                 release_mem_object_safe(ray_length_sd);
2112                 release_mem_object_safe(ray_length_sd_DL_shadow);
2113                 release_mem_object_safe(ray_depth_sd);
2114                 release_mem_object_safe(ray_depth_sd_DL_shadow);
2115                 release_mem_object_safe(transparent_depth_sd);
2116                 release_mem_object_safe(transparent_depth_sd_DL_shadow);
2117
2118                 /* Ray differentials. */
2119                 release_mem_object_safe(dP_sd);
2120                 release_mem_object_safe(dP_sd_DL_shadow);
2121                 release_mem_object_safe(dI_sd);
2122                 release_mem_object_safe(dI_sd_DL_shadow);
2123                 release_mem_object_safe(du_sd);
2124                 release_mem_object_safe(du_sd_DL_shadow);
2125                 release_mem_object_safe(dv_sd);
2126                 release_mem_object_safe(dv_sd_DL_shadow);
2127
2128                 /* Dp/Du */
2129                 release_mem_object_safe(dPdu_sd);
2130                 release_mem_object_safe(dPdu_sd_DL_shadow);
2131                 release_mem_object_safe(dPdv_sd);
2132                 release_mem_object_safe(dPdv_sd_DL_shadow);
2133
2134                 /* Object motion. */
2135                 release_mem_object_safe(ob_tfm_sd);
2136                 release_mem_object_safe(ob_itfm_sd);
2137
2138                 release_mem_object_safe(ob_tfm_sd_DL_shadow);
2139                 release_mem_object_safe(ob_itfm_sd_DL_shadow);
2140
2141                 release_mem_object_safe(closure_sd);
2142                 release_mem_object_safe(closure_sd_DL_shadow);
2143                 release_mem_object_safe(num_closure_sd);
2144                 release_mem_object_safe(num_closure_sd_DL_shadow);
2145                 release_mem_object_safe(randb_closure_sd);
2146                 release_mem_object_safe(randb_closure_sd_DL_shadow);
2147                 release_mem_object_safe(ray_P_sd);
2148                 release_mem_object_safe(ray_P_sd_DL_shadow);
2149                 release_mem_object_safe(ray_dP_sd);
2150                 release_mem_object_safe(ray_dP_sd_DL_shadow);
2151                 release_mem_object_safe(rng_coop);
2152                 release_mem_object_safe(throughput_coop);
2153                 release_mem_object_safe(L_transparent_coop);
2154                 release_mem_object_safe(PathRadiance_coop);
2155                 release_mem_object_safe(Ray_coop);
2156                 release_mem_object_safe(PathState_coop);
2157                 release_mem_object_safe(Intersection_coop);
2158                 release_mem_object_safe(kgbuffer);
2159                 release_mem_object_safe(sd);
2160                 release_mem_object_safe(sd_DL_shadow);
2161                 release_mem_object_safe(ray_state);
2162                 release_mem_object_safe(AOAlpha_coop);
2163                 release_mem_object_safe(AOBSDF_coop);
2164                 release_mem_object_safe(AOLightRay_coop);
2165                 release_mem_object_safe(BSDFEval_coop);
2166                 release_mem_object_safe(ISLamp_coop);
2167                 release_mem_object_safe(LightRay_coop);
2168                 release_mem_object_safe(Intersection_coop_AO);
2169                 release_mem_object_safe(Intersection_coop_DL);
2170 #ifdef WITH_CYCLES_DEBUG
2171                 release_mem_object_safe(debugdata_coop);
2172 #endif
2173                 release_mem_object_safe(use_queues_flag);
2174                 release_mem_object_safe(Queue_data);
2175                 release_mem_object_safe(Queue_index);
2176                 release_mem_object_safe(work_array);
2177 #ifdef __WORK_STEALING__
2178                 release_mem_object_safe(work_pool_wgs);
2179 #endif
2180                 release_mem_object_safe(per_sample_output_buffers);
2181
2182                 /* Release programs */
2183                 release_program_safe(data_init_program);
2184                 release_program_safe(scene_intersect_program);
2185                 release_program_safe(lamp_emission_program);
2186                 release_program_safe(queue_enqueue_program);
2187                 release_program_safe(background_buffer_update_program);
2188                 release_program_safe(shader_eval_program);
2189                 release_program_safe(holdout_emission_blurring_pathtermination_ao_program);
2190                 release_program_safe(direct_lighting_program);
2191                 release_program_safe(shadow_blocked_program);
2192                 release_program_safe(next_iteration_setup_program);
2193                 release_program_safe(sum_all_radiance_program);
2194
2195                 if(hostRayStateArray != NULL) {
2196                         free(hostRayStateArray);
2197                 }
2198         }
2199
2200         void path_trace(SplitRenderTile& rtile, int2 max_render_feasible_tile_size)
2201         {
2202                 /* cast arguments to cl types */
2203                 cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
2204                 cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
2205                 cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state);
2206                 cl_int d_x = rtile.x;
2207                 cl_int d_y = rtile.y;
2208                 cl_int d_w = rtile.w;
2209                 cl_int d_h = rtile.h;
2210                 cl_int d_offset = rtile.offset;
2211                 cl_int d_stride = rtile.stride;
2212
2213                 /* Make sure that set render feasible tile size is a multiple of local
2214                  * work size dimensions.
2215                  */
2216                 assert(max_render_feasible_tile_size.x % SPLIT_KERNEL_LOCAL_SIZE_X == 0);
2217                 assert(max_render_feasible_tile_size.y % SPLIT_KERNEL_LOCAL_SIZE_Y == 0);
2218
2219                 size_t global_size[2];
2220                 size_t local_size[2] = {SPLIT_KERNEL_LOCAL_SIZE_X,
2221                                         SPLIT_KERNEL_LOCAL_SIZE_Y};
2222
2223                 /* Set the range of samples to be processed for every ray in
2224                  * path-regeneration logic.
2225                  */
2226                 cl_int start_sample = rtile.start_sample;
2227                 cl_int end_sample = rtile.start_sample + rtile.num_samples;
2228                 cl_int num_samples = rtile.num_samples;
2229
2230 #ifdef __WORK_STEALING__
2231                 global_size[0] = (((d_w - 1) / local_size[0]) + 1) * local_size[0];
2232                 global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1];
2233                 unsigned int num_parallel_samples = 1;
2234 #else
2235                 global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1];
2236                 unsigned int num_threads = max_render_feasible_tile_size.x *
2237                                            max_render_feasible_tile_size.y;
2238                 unsigned int num_tile_columns_possible = num_threads / global_size[1];
2239                 /* Estimate the number of samples that can be
2240                  * processed in parallel.
2241                  */
2242                 unsigned int num_parallel_samples = min(num_tile_columns_possible / d_w,
2243                                                         rtile.num_samples);
2244                 /* Wavefront size in AMD is 64.
2245                  * TODO(sergey): What about other platforms?
2246                  */
2247                 if(num_parallel_samples >= 64) {
2248                         /* TODO(sergey): Could use generic round-up here. */
2249                         num_parallel_samples = (num_parallel_samples / 64) * 64;
2250                 }
2251                 assert(num_parallel_samples != 0);
2252
2253                 global_size[0] = d_w * num_parallel_samples;
2254 #endif  /* __WORK_STEALING__ */
2255
2256                 assert(global_size[0] * global_size[1] <=
2257                        max_render_feasible_tile_size.x * max_render_feasible_tile_size.y);
2258
2259                 /* Allocate all required global memory once. */
2260                 if(first_tile) {
2261                         size_t num_global_elements = max_render_feasible_tile_size.x *
2262                                                      max_render_feasible_tile_size.y;
2263                         /* TODO(sergey): This will actually over-allocate if
2264                          * particular kernel does not support multiclosure.
2265                          */
2266                         size_t ShaderClosure_size = get_shader_closure_size(current_max_closure);
2267
2268 #ifdef __WORK_STEALING__
2269                         /* Calculate max groups */
2270                         size_t max_global_size[2];
2271                         size_t tile_x = max_render_feasible_tile_size.x;
2272                         size_t tile_y = max_render_feasible_tile_size.y;
2273                         max_global_size[0] = (((tile_x - 1) / local_size[0]) + 1) * local_size[0];
2274                         max_global_size[1] = (((tile_y - 1) / local_size[1]) + 1) * local_size[1];
2275                         max_work_groups = (max_global_size[0] * max_global_size[1]) /
2276                                           (local_size[0] * local_size[1]);
2277                         /* Allocate work_pool_wgs memory. */
2278                         work_pool_wgs = mem_alloc(max_work_groups * sizeof(unsigned int));
2279 #endif  /* __WORK_STEALING__ */
2280
2281                         /* Allocate queue_index memory only once. */
2282                         Queue_index = mem_alloc(NUM_QUEUES * sizeof(int));
2283                         use_queues_flag = mem_alloc(sizeof(char));
2284                         kgbuffer = mem_alloc(get_KernelGlobals_size());
2285
2286                         /* Create global buffers for ShaderData. */
2287                         sd = mem_alloc(get_shaderdata_soa_size());
2288                         sd_DL_shadow = mem_alloc(get_shaderdata_soa_size());
2289                         P_sd = mem_alloc(num_global_elements * sizeof(float3));
2290                         P_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
2291                         N_sd = mem_alloc(num_global_elements * sizeof(float3));
2292                         N_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
2293                         Ng_sd = mem_alloc(num_global_elements * sizeof(float3));
2294                         Ng_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
2295                         I_sd = mem_alloc(num_global_elements * sizeof(float3));
2296                         I_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
2297                         shader_sd = mem_alloc(num_global_elements * sizeof(int));
2298                         shader_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
2299                         flag_sd = mem_alloc(num_global_elements * sizeof(int));
2300                         flag_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
2301                         prim_sd = mem_alloc(num_global_elements * sizeof(int));
2302                         prim_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
2303                         type_sd = mem_alloc(num_global_elements * sizeof(int));
2304                         type_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
2305                         u_sd = mem_alloc(num_global_elements * sizeof(float));
2306                         u_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float));
2307                         v_sd = mem_alloc(num_global_elements * sizeof(float));
2308                         v_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float));
2309                         object_sd = mem_alloc(num_global_elements * sizeof(int));
2310                         object_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
2311                         time_sd = mem_alloc(num_global_elements * sizeof(float));
2312                         time_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float));
2313                         ray_length_sd = mem_alloc(num_global_elements * sizeof(float));
2314                         ray_length_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float));
2315                         ray_depth_sd = mem_alloc(num_global_elements * sizeof(int));
2316                         ray_depth_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
2317                         transparent_depth_sd = mem_alloc(num_global_elements * sizeof(int));
2318                         transparent_depth_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
2319
2320                         /* Ray differentials. */
2321                         dP_sd = mem_alloc(num_global_elements * sizeof(differential3));
2322                         dP_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential3));
2323                         dI_sd = mem_alloc(num_global_elements * sizeof(differential3));
2324                         dI_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential3));
2325                         du_sd = mem_alloc(num_global_elements * sizeof(differential));
2326                         du_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential));
2327                         dv_sd = mem_alloc(num_global_elements * sizeof(differential));
2328                         dv_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential));
2329
2330                         /* Dp/Du */
2331                         dPdu_sd = mem_alloc(num_global_elements * sizeof(float3));
2332                         dPdu_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
2333                         dPdv_sd = mem_alloc(num_global_elements * sizeof(float3));
2334                         dPdv_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
2335
2336                         /* Object motion. */
2337                         ob_tfm_sd = mem_alloc(num_global_elements * sizeof(Transform));
2338                         ob_tfm_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(Transform));
2339                         ob_itfm_sd = mem_alloc(num_global_elements * sizeof(Transform));
2340                         ob_itfm_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(Transform));
2341
2342                         closure_sd = mem_alloc(num_global_elements * ShaderClosure_size);
2343                         closure_sd_DL_shadow = mem_alloc(num_global_elements * 2 * ShaderClosure_size);
2344                         num_closure_sd = mem_alloc(num_global_elements * sizeof(int));
2345                         num_closure_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
2346                         randb_closure_sd = mem_alloc(num_global_elements * sizeof(float));
2347                         randb_closure_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float));
2348                         ray_P_sd = mem_alloc(num_global_elements * sizeof(float3));
2349                         ray_P_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
2350                         ray_dP_sd = mem_alloc(num_global_elements * sizeof(differential3));
2351                         ray_dP_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential3));
2352
2353                         /* Creation of global memory buffers which are shared among
2354                          * the kernels.
2355                          */
2356                         rng_coop = mem_alloc(num_global_elements * sizeof(RNG));
2357                         throughput_coop = mem_alloc(num_global_elements * sizeof(float3));
2358                         L_transparent_coop = mem_alloc(num_global_elements * sizeof(float));
2359                         PathRadiance_coop = mem_alloc(num_global_elements * sizeof(PathRadiance));
2360                         Ray_coop = mem_alloc(num_global_elements * sizeof(Ray));
2361                         PathState_coop = mem_alloc(num_global_elements * sizeof(PathState));
2362                         Intersection_coop = mem_alloc(num_global_elements * sizeof(Intersection));
2363                         AOAlpha_coop = mem_alloc(num_global_elements * sizeof(float3));
2364                         AOBSDF_coop = mem_alloc(num_global_elements * sizeof(float3));
2365                         AOLightRay_coop = mem_alloc(num_global_elements * sizeof(Ray));
2366                         BSDFEval_coop = mem_alloc(num_global_elements * sizeof(BsdfEval));
2367                         ISLamp_coop = mem_alloc(num_global_elements * sizeof(int));
2368                         LightRay_coop = mem_alloc(num_global_elements * sizeof(Ray));
2369                         Intersection_coop_AO = mem_alloc(num_global_elements * sizeof(Intersection));
2370                         Intersection_coop_DL = mem_alloc(num_global_elements * sizeof(Intersection));
2371
2372 #ifdef WITH_CYCLES_DEBUG
2373                         debugdata_coop = mem_alloc(num_global_elements * sizeof(DebugData));
2374 #endif
2375
2376                         ray_state = mem_alloc(num_global_elements * sizeof(char));
2377
2378                         hostRayStateArray = (char *)calloc(num_global_elements, sizeof(char));
2379                         assert(hostRayStateArray != NULL && "Can't create hostRayStateArray memory");
2380
2381                         Queue_data = mem_alloc(num_global_elements * (NUM_QUEUES * sizeof(int)+sizeof(int)));
2382                         work_array = mem_alloc(num_global_elements * sizeof(unsigned int));
2383                         per_sample_output_buffers = mem_alloc(num_global_elements *
2384                                                               per_thread_output_buffer_size);
2385                 }
2386
2387                 cl_int dQueue_size = global_size[0] * global_size[1];
2388                 cl_int total_num_rays = global_size[0] * global_size[1];
2389
2390                 cl_uint start_arg_index =
2391                         kernel_set_args(ckPathTraceKernel_data_init,
2392                                         0,
2393                                         kgbuffer,
2394                                         sd,
2395                                         sd_DL_shadow,
2396                                         P_sd,
2397                                         P_sd_DL_shadow,
2398                                         N_sd,
2399                                         N_sd_DL_shadow,
2400                                         Ng_sd,
2401                                         Ng_sd_DL_shadow,
2402                                         I_sd,
2403                                         I_sd_DL_shadow,
2404                                         shader_sd,
2405                                         shader_sd_DL_shadow,
2406                                         flag_sd,
2407                                         flag_sd_DL_shadow,
2408                                         prim_sd,
2409                                         prim_sd_DL_shadow,
2410                                         type_sd,
2411                                         type_sd_DL_shadow,
2412                                         u_sd,
2413                                         u_sd_DL_shadow,
2414                                         v_sd,
2415                                         v_sd_DL_shadow,
2416                                         object_sd,
2417                                         object_sd_DL_shadow,
2418                                         time_sd,
2419                                         time_sd_DL_shadow,
2420                                         ray_length_sd,
2421                                         ray_length_sd_DL_shadow,
2422                                         ray_depth_sd,
2423                                         ray_depth_sd_DL_shadow,
2424                                         transparent_depth_sd,
2425                                         transparent_depth_sd_DL_shadow);
2426
2427                 /* Ray differentials. */
2428                 start_arg_index +=
2429                         kernel_set_args(ckPathTraceKernel_data_init,
2430                                         start_arg_index,
2431                                         dP_sd,
2432                                         dP_sd_DL_shadow,
2433                                         dI_sd,
2434                                         dI_sd_DL_shadow,
2435                                         du_sd,
2436                                         du_sd_DL_shadow,
2437                                         dv_sd,
2438                                         dv_sd_DL_shadow);
2439
2440                 /* Dp/Du */
2441                 start_arg_index +=
2442                         kernel_set_args(ckPathTraceKernel_data_init,
2443                                         start_arg_index,
2444                                         dPdu_sd,
2445                                         dPdu_sd_DL_shadow,
2446                                         dPdv_sd,
2447                                         dPdv_sd_DL_shadow);
2448
2449                 /* Object motion. */
2450                 start_arg_index +=
2451                         kernel_set_args(ckPathTraceKernel_data_init,
2452                                         start_arg_index,
2453                                         ob_tfm_sd,
2454                                         ob_tfm_sd_DL_shadow,
2455                                         ob_itfm_sd,
2456                                         ob_itfm_sd_DL_shadow);
2457
2458                 start_arg_index +=
2459                         kernel_set_args(ckPathTraceKernel_data_init,
2460                                         start_arg_index,
2461                                         closure_sd,
2462                                         closure_sd_DL_shadow,
2463                                         num_closure_sd,
2464                                         num_closure_sd_DL_shadow,
2465                                         randb_closure_sd,
2466                                         randb_closure_sd_DL_shadow,
2467                                         ray_P_sd,
2468                                         ray_P_sd_DL_shadow,
2469                                         ray_dP_sd,
2470                                         ray_dP_sd_DL_shadow,
2471                                         d_data,
2472                                         per_sample_output_buffers,
2473                                         d_rng_state,
2474                                         rng_coop,
2475                                         throughput_coop,
2476                                         L_transparent_coop,
2477                                         PathRadiance_coop,
2478                                         Ray_coop,
2479                                         PathState_coop,
2480                                         ray_state);
2481
2482 /* TODO(sergey): Avoid map lookup here. */
2483 #define KERNEL_TEX(type, ttype, name) \
2484         set_kernel_arg_mem(ckPathTraceKernel_data_init, &start_arg_index, #name);
2485 #include "kernel_textures.h"
2486 #undef KERNEL_TEX
2487
2488                 start_arg_index +=
2489                         kernel_set_args(ckPathTraceKernel_data_init,
2490                                         start_arg_index,
2491                                         start_sample,
2492                                         d_x,
2493                                         d_y,
2494                                         d_w,
2495                                         d_h,
2496                                         d_offset,
2497                                         d_stride,
2498                                         rtile.rng_state_offset_x,
2499                                         rtile.rng_state_offset_y,
2500                                         rtile.buffer_rng_state_stride,
2501                                         Queue_data,
2502                                         Queue_index,
2503                                         dQueue_size,
2504                                         use_queues_flag,
2505                                         work_array,
2506 #ifdef __WORK_STEALING__
2507                                         work_pool_wgs,
2508                                         num_samples,
2509 #endif
2510 #ifdef WITH_CYCLES_DEBUG
2511                                         debugdata_coop,
2512 #endif
2513                                         num_parallel_samples);
2514
2515                 kernel_set_args(ckPathTraceKernel_scene_intersect,
2516                                 0,
2517                                 kgbuffer,
2518                                 d_data,
2519                                 rng_coop,
2520                                 Ray_coop,
2521                                 PathState_coop,
2522                                 Intersection_coop,
2523                                 ray_state,
2524                                 d_w,
2525                                 d_h,
2526                                 Queue_data,
2527                                 Queue_index,
2528                                 dQueue_size,
2529                                 use_queues_flag,
2530 #ifdef WITH_CYCLES_DEBUG
2531                                 debugdata_coop,
2532 #endif
2533                                 num_parallel_samples);
2534
2535                 kernel_set_args(ckPathTraceKernel_lamp_emission,
2536                                 0,
2537                                 kgbuffer,
2538                                 d_data,
2539                                 sd,
2540                                 throughput_coop,
2541                                 PathRadiance_coop,
2542                                 Ray_coop,
2543                                 PathState_coop,
2544                                 Intersection_coop,
2545                                 ray_state,
2546                                 d_w,
2547                                 d_h,
2548                                 Queue_data,
2549                                 Queue_index,
2550                                 dQueue_size,
2551                                 use_queues_flag,
2552                                 num_parallel_samples);
2553
2554                 kernel_set_args(ckPathTraceKernel_queue_enqueue,
2555                                 0,
2556                                 Queue_data,
2557                                 Queue_index,
2558                                 ray_state,
2559                                 dQueue_size);
2560
2561                 kernel_set_args(ckPathTraceKernel_background_buffer_update,
2562                                  0,
2563                                  kgbuffer,
2564                                  d_data,
2565                                  sd,
2566                                  per_sample_output_buffers,
2567                                  d_rng_state,
2568                                  rng_coop,
2569                                  throughput_coop,
2570                                  PathRadiance_coop,
2571                                  Ray_coop,
2572                                  PathState_coop,
2573                                  L_transparent_coop,
2574                                  ray_state,
2575                                  d_w,
2576                                  d_h,
2577                                  d_x,
2578                                  d_y,
2579                                  d_stride,
2580                                  rtile.rng_state_offset_x,
2581                                  rtile.rng_state_offset_y,
2582                                  rtile.buffer_rng_state_stride,
2583                                  work_array,
2584                                  Queue_data,
2585                                  Queue_index,
2586                                  dQueue_size,
2587                                  end_sample,
2588                                  start_sample,
2589 #ifdef __WORK_STEALING__
2590                                  work_pool_wgs,
2591                                  num_samples,
2592 #endif
2593 #ifdef WITH_CYCLES_DEBUG
2594                                  debugdata_coop,
2595 #endif
2596                                  num_parallel_samples);
2597
2598                 kernel_set_args(ckPathTraceKernel_shader_eval,
2599                                 0,
2600                                 kgbuffer,
2601                                 d_data,
2602                                 sd,
2603                                 rng_coop,
2604                                 Ray_coop,
2605                                 PathState_coop,
2606                                 Intersection_coop,
2607                                 ray_state,
2608                                 Queue_data,
2609                                 Queue_index,
2610                                 dQueue_size);
2611
2612                 kernel_set_args(ckPathTraceKernel_holdout_emission_blurring_pathtermination_ao,
2613                                 0,
2614                                 kgbuffer,
2615                                 d_data,
2616                                 sd,
2617                                 per_sample_output_buffers,
2618                                 rng_coop,
2619                                 throughput_coop,
2620                                 L_transparent_coop,
2621                                 PathRadiance_coop,
2622                                 PathState_coop,
2623                                 Intersection_coop,
2624                                 AOAlpha_coop,
2625                                 AOBSDF_coop,
2626                                 AOLightRay_coop,
2627                                 d_w,
2628                                 d_h,
2629                                 d_x,
2630                                 d_y,
2631                                 d_stride,
2632                                 ray_state,
2633                                 work_array,
2634                                 Queue_data,
2635                                 Queue_index,
2636                                 dQueue_size,
2637 #ifdef __WORK_STEALING__
2638                                 start_sample,
2639 #endif
2640                                 num_parallel_samples);
2641
2642                 kernel_set_args(ckPathTraceKernel_direct_lighting,
2643                                 0,
2644                                 kgbuffer,
2645                                 d_data,
2646                                 sd,
2647                                 sd_DL_shadow,
2648                                 rng_coop,
2649                                 PathState_coop,
2650                                 ISLamp_coop,
2651                                 LightRay_coop,
2652                                 BSDFEval_coop,
2653                                 ray_state,
2654                                 Queue_data,
2655                                 Queue_index,
2656                                 dQueue_size);
2657
2658                 kernel_set_args(ckPathTraceKernel_shadow_blocked,
2659                                 0,
2660                                 kgbuffer,
2661                                 d_data,
2662                                 sd_DL_shadow,
2663                                 PathState_coop,
2664                                 LightRay_coop,
2665                                 AOLightRay_coop,
2666                                 Intersection_coop_AO,
2667                                 Intersection_coop_DL,
2668                                 ray_state,
2669                                 Queue_data,
2670                                 Queue_index,
2671                                 dQueue_size,
2672                                 total_num_rays);
2673
2674                 kernel_set_args(ckPathTraceKernel_next_iteration_setup,
2675                                 0,
2676                                 kgbuffer,
2677                                 d_data,
2678                                 sd,
2679                                 rng_coop,
2680                                 throughput_coop,
2681                                 PathRadiance_coop,
2682                                 Ray_coop,
2683                                 PathState_coop,
2684                                 LightRay_coop,
2685                                 ISLamp_coop,
2686                                 BSDFEval_coop,
2687                                 AOLightRay_coop,
2688                                 AOBSDF_coop,
2689                                 AOAlpha_coop,
2690                                 ray_state,
2691                                 Queue_data,
2692                                 Queue_index,
2693                                 dQueue_size,
2694                                 use_queues_flag);
2695
2696                 kernel_set_args(ckPathTraceKernel_sum_all_radiance,
2697                                 0,
2698                                 d_data,
2699                                 d_buffer,
2700                                 per_sample_output_buffers,
2701                                 num_parallel_samples,
2702                                 d_w,
2703                                 d_h,
2704                                 d_stride,
2705                                 rtile.buffer_offset_x,
2706                                 rtile.buffer_offset_y,
2707                                 rtile.buffer_rng_state_stride,
2708                                 start_sample);
2709
2710                 /* Macro for Enqueuing split kernels. */
2711 #define GLUE(a, b) a ## b
2712 #define ENQUEUE_SPLIT_KERNEL(kernelName, globalSize, localSize) \
2713                 opencl_assert(clEnqueueNDRangeKernel(cqCommandQueue, \
2714                                                      GLUE(ckPathTraceKernel_, \
2715                                                           kernelName), \
2716                                                      2, \
2717                                                      NULL, \
2718                                                      globalSize, \
2719                                                      localSize, \
2720                                                      0, \
2721                                                      NULL, \
2722                                                      NULL))
2723
2724                 /* Enqueue ckPathTraceKernel_data_init kernel. */
2725                 ENQUEUE_SPLIT_KERNEL(data_init, global_size, local_size);
2726                 bool activeRaysAvailable = true;
2727
2728                 /* Record number of time host intervention has been made */
2729                 unsigned int numHostIntervention = 0;
2730                 unsigned int numNextPathIterTimes = PathIteration_times;
2731                 while(activeRaysAvailable) {
2732                         /* Twice the global work size of other kernels for
2733                          * ckPathTraceKernel_shadow_blocked_direct_lighting. */
2734                         size_t global_size_shadow_blocked[2];
2735                         global_size_shadow_blocked[0] = global_size[0] * 2;
2736                         global_size_shadow_blocked[1] = global_size[1];
2737
2738                         /* Do path-iteration in host [Enqueue Path-iteration kernels. */
2739                         for(int PathIter = 0; PathIter < PathIteration_times; PathIter++) {
2740                                 ENQUEUE_SPLIT_KERNEL(scene_intersect, global_size, local_size);
2741                                 ENQUEUE_SPLIT_KERNEL(lamp_emission, global_size, local_size);
2742                                 ENQUEUE_SPLIT_KERNEL(queue_enqueue, global_size, local_size);
2743                                 ENQUEUE_SPLIT_KERNEL(background_buffer_update, global_size, local_size);
2744                                 ENQUEUE_SPLIT_KERNEL(shader_eval, global_size, local_size);
2745                                 ENQUEUE_SPLIT_KERNEL(holdout_emission_blurring_pathtermination_ao, global_size, local_size);
2746                                 ENQUEUE_SPLIT_KERNEL(direct_lighting, global_size, local_size);
2747                                 ENQUEUE_SPLIT_KERNEL(shadow_blocked, global_size_shadow_blocked, local_size);
2748                                 ENQUEUE_SPLIT_KERNEL(next_iteration_setup, global_size, local_size);
2749                         }
2750
2751                         /* Read ray-state into Host memory to decide if we should exit
2752                          * path-iteration in host.
2753                          */
2754                         ciErr = clEnqueueReadBuffer(cqCommandQueue,
2755                                                     ray_state,
2756                                                     CL_TRUE,
2757                                                     0,
2758                                                     global_size[0] * global_size[1] * sizeof(char),
2759                                                     hostRayStateArray,
2760                                                     0,
2761                                                     NULL,
2762                                                     NULL);
2763                         assert(ciErr == CL_SUCCESS);
2764
2765                         activeRaysAvailable = false;
2766
2767                         for(int rayStateIter = 0;
2768                             rayStateIter < global_size[0] * global_size[1];
2769                             ++rayStateIter)
2770                         {
2771                                 if(int8_t(hostRayStateArray[rayStateIter]) != RAY_INACTIVE) {
2772                                         /* Not all rays are RAY_INACTIVE. */
2773                                         activeRaysAvailable = true;
2774                                         break;
2775                                 }
2776                         }
2777
2778                         if(activeRaysAvailable) {
2779                                 numHostIntervention++;
2780                                 PathIteration_times = PATH_ITER_INC_FACTOR;
2781                                 /* Host intervention done before all rays become RAY_INACTIVE;
2782                                  * Set do more initial iterations for the next tile.
2783                                  */
2784                                 numNextPathIterTimes += PATH_ITER_INC_FACTOR;
2785                         }
2786                 }
2787
2788                 /* Execute SumALLRadiance kernel to accumulate radiance calculated in
2789                  * per_sample_output_buffers into RenderTile's output buffer.
2790                  */
2791                 size_t sum_all_radiance_local_size[2] = {16, 16};
2792                 size_t sum_all_radiance_global_size[2];
2793                 sum_all_radiance_global_size[0] =
2794                         (((d_w - 1) / sum_all_radiance_local_size[0]) + 1) *
2795                         sum_all_radiance_local_size[0];
2796                 sum_all_radiance_global_size[1] =
2797                         (((d_h - 1) / sum_all_radiance_local_size[1]) + 1) *
2798                         sum_all_radiance_local_size[1];
2799                 ENQUEUE_SPLIT_KERNEL(sum_all_radiance,
2800                                      sum_all_radiance_global_size,
2801                                      sum_all_radiance_local_size);
2802
2803 #undef ENQUEUE_SPLIT_KERNEL
2804 #undef GLUE
2805
2806                 if(numHostIntervention == 0) {
2807                         /* This means that we are executing kernel more than required
2808                          * Must avoid this for the next sample/tile.
2809                          */
2810                         PathIteration_times = ((numNextPathIterTimes - PATH_ITER_INC_FACTOR) <= 0) ?
2811                         PATH_ITER_INC_FACTOR : numNextPathIterTimes - PATH_ITER_INC_FACTOR;
2812                 }
2813                 else {
2814                         /* Number of path-iterations done for this tile is set as
2815                          * Initial path-iteration times for the next tile
2816                          */
2817                         PathIteration_times = numNextPathIterTimes;
2818                 }
2819
2820                 first_tile = false;
2821         }
2822
2823         /* Calculates the amount of memory that has to be always
2824          * allocated in order for the split kernel to function.
2825          * This memory is tile/scene-property invariant (meaning,
2826          * the value returned by this function does not depend
2827          * on the user set tile size or scene properties.
2828          */
2829         size_t get_invariable_mem_allocated()
2830         {
2831                 size_t total_invariable_mem_allocated = 0;
2832                 size_t KernelGlobals_size = 0;
2833                 size_t ShaderData_SOA_size = 0;
2834
2835                 KernelGlobals_size = get_KernelGlobals_size();
2836                 ShaderData_SOA_size = get_shaderdata_soa_size();
2837
2838                 total_invariable_mem_allocated += KernelGlobals_size; /* KernelGlobals size */
2839                 total_invariable_mem_allocated += NUM_QUEUES * sizeof(unsigned int); /* Queue index size */
2840                 total_invariable_mem_allocated += sizeof(char); /* use_queues_flag size */
2841                 total_invariable_mem_allocated += ShaderData_SOA_size; /* sd size */
2842                 total_invariable_mem_allocated += ShaderData_SOA_size; /* sd_DL_shadow size */
2843
2844                 return total_invariable_mem_allocated;
2845         }
2846
2847         /* Calculate the memory that has-to-be/has-been allocated for
2848          * the split kernel to function.
2849          */
2850         size_t get_tile_specific_mem_allocated(const int2 tile_size)
2851         {
2852                 size_t tile_specific_mem_allocated = 0;
2853
2854                 /* Get required tile info */
2855                 unsigned int user_set_tile_w = tile_size.x;
2856                 unsigned int user_set_tile_h = tile_size.y;
2857
2858 #ifdef __WORK_STEALING__
2859                 /* Calculate memory to be allocated for work_pools in
2860                  * case of work_stealing.
2861                  */
2862                 size_t max_global_size[2];
2863                 size_t max_num_work_pools = 0;
2864                 max_global_size[0] =
2865                         (((user_set_tile_w - 1) / SPLIT_KERNEL_LOCAL_SIZE_X) + 1) *
2866                         SPLIT_KERNEL_LOCAL_SIZE_X;
2867                 max_global_size[1] =
2868                         (((user_set_tile_h - 1) / SPLIT_KERNEL_LOCAL_SIZE_Y) + 1) *
2869                         SPLIT_KERNEL_LOCAL_SIZE_Y;
2870                 max_num_work_pools =
2871                         (max_global_size[0] * max_global_size[1]) /
2872                         (SPLIT_KERNEL_LOCAL_SIZE_X * SPLIT_KERNEL_LOCAL_SIZE_Y);
2873                 tile_specific_mem_allocated += max_num_work_pools * sizeof(unsigned int);
2874 #endif
2875
2876                 tile_specific_mem_allocated +=
2877                         user_set_tile_w * user_set_tile_h * per_thread_output_buffer_size;
2878                 tile_specific_mem_allocated +=
2879                         user_set_tile_w * user_set_tile_h * sizeof(RNG);
2880
2881                 return tile_specific_mem_allocated;
2882         }
2883
	/* Calculates the texture memories and KernelData (d_data) memory
	 * that has been allocated.
	 *
	 * Sums the size of every kernel texture (via the KERNEL_TEX X-macro
	 * expansion of kernel_textures.h below) plus the actual allocated size
	 * of the d_data memory object as reported by the OpenCL runtime.
	 */
	size_t get_scene_specific_mem_allocated(cl_mem d_data)
	{
		size_t scene_specific_mem_allocated = 0;
		/* Calculate texture memories. */
		/* X-macro trick: including kernel_textures.h with KERNEL_TEX
		 * defined expands to one get_tex_size() accumulation per texture.
		 */
#define KERNEL_TEX(type, ttype, name) \
	scene_specific_mem_allocated += get_tex_size(#name);
#include "kernel_textures.h"
#undef KERNEL_TEX
		/* Query the runtime for the real allocated size of d_data rather
		 * than assuming sizeof(KernelData).
		 */
		size_t d_data_size;
		ciErr = clGetMemObjectInfo(d_data,
		                           CL_MEM_SIZE,
		                           sizeof(d_data_size),
		                           &d_data_size,
		                           NULL);
		assert(ciErr == CL_SUCCESS && "Can't get d_data mem object info");
		scene_specific_mem_allocated += d_data_size;
		return scene_specific_mem_allocated;
	}
2905
2906         /* Calculate the memory required for one thread in split kernel. */
2907         size_t get_per_thread_memory()
2908         {
2909                 size_t shader_closure_size = 0;
2910                 size_t shaderdata_volume = 0;
2911                 shader_closure_size = get_shader_closure_size(current_max_closure);
2912                 /* TODO(sergey): This will actually over-allocate if
2913                  * particular kernel does not support multiclosure.
2914                  */
2915                 shaderdata_volume = get_shader_data_size(shader_closure_size);
2916                 size_t retval = sizeof(RNG)
2917                         + sizeof(float3)          /* Throughput size */
2918                         + sizeof(float)           /* L transparent size */
2919                         + sizeof(char)            /* Ray state size */
2920                         + sizeof(unsigned int)    /* Work element size */
2921                         + sizeof(int)             /* ISLamp_size */
2922                         + sizeof(PathRadiance) + sizeof(Ray) + sizeof(PathState)
2923                         + sizeof(Intersection)    /* Overall isect */
2924                         + sizeof(Intersection)    /* Instersection_coop_AO */
2925                         + sizeof(Intersection)    /* Intersection coop DL */
2926                         + shaderdata_volume       /* Overall ShaderData */
2927                         + (shaderdata_volume * 2) /* ShaderData : DL and shadow */
2928                         + sizeof(Ray) + sizeof(BsdfEval)
2929                         + sizeof(float3)          /* AOAlpha size */
2930                         + sizeof(float3)          /* AOBSDF size */
2931                         + sizeof(Ray)
2932                         + (sizeof(int) * NUM_QUEUES)
2933                         + per_thread_output_buffer_size;
2934                 return retval;
2935         }
2936
2937         /* Considers the total memory available in the device and
2938          * and returns the maximum global work size possible.
2939          */
2940         size_t get_feasible_global_work_size(int2 tile_size, cl_mem d_data)
2941         {
2942                 /* Calculate invariably allocated memory. */
2943                 size_t invariable_mem_allocated = get_invariable_mem_allocated();
2944                 /* Calculate tile specific allocated memory. */
2945                 size_t tile_specific_mem_allocated =
2946                         get_tile_specific_mem_allocated(tile_size);
2947                 /* Calculate scene specific allocated memory. */
2948                 size_t scene_specific_mem_allocated =
2949                         get_scene_specific_mem_allocated(d_data);
2950                 /* Calculate total memory available for the threads in global work size. */
2951                 size_t available_memory = total_allocatable_memory
2952                         - invariable_mem_allocated
2953                         - tile_specific_mem_allocated
2954                         - scene_specific_mem_allocated
2955                         - DATA_ALLOCATION_MEM_FACTOR;
2956                 size_t per_thread_memory_required = get_per_thread_memory();
2957                 return (available_memory / per_thread_memory_required);
2958         }
2959
2960         /* Checks if the device has enough memory to render the whole tile;
2961          * If not, we should split single tile into multiple tiles of small size
2962          * and process them all.
2963          */
2964         bool need_to_split_tile(unsigned int d_w,
2965                                 unsigned int d_h,
2966                                 int2 max_render_feasible_tile_size)