a7157e2b041e31da5ef1f4eb6bfbe9afaaf0e8b1
[blender.git] / intern / cycles / device / device_opencl.cpp
1 /*
2  * Copyright 2011-2013 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #ifdef WITH_OPENCL
18
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22
23 #include "clew.h"
24
25 #include "device.h"
26 #include "device_intern.h"
27
28 #include "buffers.h"
29
30 #include "util_foreach.h"
31 #include "util_logging.h"
32 #include "util_map.h"
33 #include "util_math.h"
34 #include "util_md5.h"
35 #include "util_opengl.h"
36 #include "util_path.h"
37 #include "util_time.h"
38
39 CCL_NAMESPACE_BEGIN
40
41 #define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p))
42
43 /* Macro declarations used with split kernel */
44
45 /* Macro to enable/disable work-stealing */
46 #define __WORK_STEALING__
47
48 #define SPLIT_KERNEL_LOCAL_SIZE_X 64
49 #define SPLIT_KERNEL_LOCAL_SIZE_Y 1
50
51 /* This value may be tuned according to the scene we are rendering.
52  *
53  * Modifying PATH_ITER_INC_FACTOR value proportional to number of expected
54  * ray-bounces will improve performance.
55  */
56 #define PATH_ITER_INC_FACTOR 8
57
58 /* When allocate global memory in chunks. We may not be able to
59  * allocate exactly "CL_DEVICE_MAX_MEM_ALLOC_SIZE" bytes in chunks;
60  * Since some bytes may be needed for aligning chunks of memory;
61  * This is the amount of memory that we dedicate for that purpose.
62  */
63 #define DATA_ALLOCATION_MEM_FACTOR 5000000 //5MB
64
65 struct OpenCLPlatformDevice {
66         OpenCLPlatformDevice(cl_platform_id platform_id,
67                              const string& platform_name,
68                              cl_device_id device_id,
69                              cl_device_type device_type,
70                              const string& device_name)
71           : platform_id(platform_id),
72             platform_name(platform_name),
73             device_id(device_id),
74             device_type(device_type),
75             device_name(device_name) {}
76         cl_platform_id platform_id;
77         string platform_name;
78         cl_device_id device_id;
79         cl_device_type device_type;
80         string device_name;
81 };
82
83 namespace {
84
85 cl_device_type opencl_device_type()
86 {
87         char *device = getenv("CYCLES_OPENCL_TEST");
88
89         if(device) {
90                 if(strcmp(device, "NONE") == 0)
91                         return 0;
92                 if(strcmp(device, "ALL") == 0)
93                         return CL_DEVICE_TYPE_ALL;
94                 else if(strcmp(device, "DEFAULT") == 0)
95                         return CL_DEVICE_TYPE_DEFAULT;
96                 else if(strcmp(device, "CPU") == 0)
97                         return CL_DEVICE_TYPE_CPU;
98                 else if(strcmp(device, "GPU") == 0)
99                         return CL_DEVICE_TYPE_GPU;
100                 else if(strcmp(device, "ACCELERATOR") == 0)
101                         return CL_DEVICE_TYPE_ACCELERATOR;
102         }
103
104         return CL_DEVICE_TYPE_ALL;
105 }
106
107 bool opencl_kernel_use_debug()
108 {
109         return (getenv("CYCLES_OPENCL_DEBUG") != NULL);
110 }
111
112 bool opencl_kernel_use_advanced_shading(const string& platform)
113 {
114         /* keep this in sync with kernel_types.h! */
115         if(platform == "NVIDIA CUDA")
116                 return true;
117         else if(platform == "Apple")
118                 return true;
119         else if(platform == "AMD Accelerated Parallel Processing")
120                 return true;
121         else if(platform == "Intel(R) OpenCL")
122                 return true;
123         /* Make sure officially unsupported OpenCL platforms
124          * does not set up to use advanced shading.
125          */
126         return false;
127 }
128
129 bool opencl_kernel_use_split(const string& platform_name,
130                              const cl_device_type device_type)
131 {
132         if(getenv("CYCLES_OPENCL_SPLIT_KERNEL_TEST") != NULL) {
133                 return true;
134         }
135         /* TODO(sergey): Replace string lookups with more enum-like API,
136          * similar to device/vendor checks blender's gpu.
137          */
138         if(platform_name == "AMD Accelerated Parallel Processing" &&
139            device_type == CL_DEVICE_TYPE_GPU)
140         {
141                 return true;
142         }
143         return false;
144 }
145
146 bool opencl_device_supported(const string& platform_name,
147                              const cl_device_id device_id)
148 {
149         cl_device_type device_type;
150         clGetDeviceInfo(device_id,
151                         CL_DEVICE_TYPE,
152                         sizeof(cl_device_type),
153                         &device_type,
154                         NULL);
155         if(platform_name == "AMD Accelerated Parallel Processing" &&
156            device_type == CL_DEVICE_TYPE_GPU)
157         {
158                 return true;
159         }
160         if(platform_name == "Apple" && device_type == CL_DEVICE_TYPE_GPU) {
161                 return true;
162         }
163         return false;
164 }
165
166 bool opencl_platform_version_check(cl_platform_id platform,
167                                    string *error = NULL)
168 {
169         const int req_major = 1, req_minor = 1;
170         int major, minor;
171         char version[256];
172         clGetPlatformInfo(platform,
173                           CL_PLATFORM_VERSION,
174                           sizeof(version),
175                           &version,
176                           NULL);
177         if(sscanf(version, "OpenCL %d.%d", &major, &minor) < 2) {
178                 if(error != NULL) {
179                         *error = string_printf("OpenCL: failed to parse platform version string (%s).", version);
180                 }
181                 return false;
182         }
183         if(!((major == req_major && minor >= req_minor) || (major > req_major))) {
184                 if(error != NULL) {
185                         *error = string_printf("OpenCL: platform version 1.1 or later required, found %d.%d", major, minor);
186                 }
187                 return false;
188         }
189         if(error != NULL) {
190                 *error = "";
191         }
192         return true;
193 }
194
195 bool opencl_device_version_check(cl_device_id device,
196                                  string *error = NULL)
197 {
198         const int req_major = 1, req_minor = 1;
199         int major, minor;
200         char version[256];
201         clGetDeviceInfo(device,
202                         CL_DEVICE_OPENCL_C_VERSION,
203                         sizeof(version),
204                         &version,
205                         NULL);
206         if(sscanf(version, "OpenCL C %d.%d", &major, &minor) < 2) {
207                 if(error != NULL) {
208                         *error = string_printf("OpenCL: failed to parse OpenCL C version string (%s).", version);
209                 }
210                 return false;
211         }
212         if(!((major == req_major && minor >= req_minor) || (major > req_major))) {
213                 if(error != NULL) {
214                         *error = string_printf("OpenCL: C version 1.1 or later required, found %d.%d", major, minor);
215                 }
216                 return false;
217         }
218         if(error != NULL) {
219                 *error = "";
220         }
221         return true;
222 }
223
224 void opencl_get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices)
225 {
226         const bool force_all_platforms =
227                 (getenv("CYCLES_OPENCL_TEST") != NULL) ||
228                 (getenv("CYCLES_OPENCL_SPLIT_KERNEL_TEST")) != NULL;
229         const cl_device_type device_type = opencl_device_type();
230         static bool first_time = true;
231 #define FIRST_VLOG(severity) if(first_time) VLOG(severity)
232
233         usable_devices->clear();
234
235         if(device_type == 0) {
236                 FIRST_VLOG(2) << "OpenCL devices are forced to be disabled.";
237                 first_time = false;
238                 return;
239         }
240
241         vector<cl_device_id> device_ids;
242         cl_uint num_devices = 0;
243         vector<cl_platform_id> platform_ids;
244         cl_uint num_platforms = 0;
245
246         /* Get devices. */
247         if(clGetPlatformIDs(0, NULL, &num_platforms) != CL_SUCCESS ||
248            num_platforms == 0)
249         {
250                 FIRST_VLOG(2) << "No OpenCL platforms were found.";
251                 first_time = false;
252                 return;
253         }
254         platform_ids.resize(num_platforms);
255         if(clGetPlatformIDs(num_platforms, &platform_ids[0], NULL) != CL_SUCCESS) {
256                 FIRST_VLOG(2) << "Failed to fetch platform IDs from the driver..";
257                 first_time = false;
258                 return;
259         }
260         /* Devices are numbered consecutively across platforms. */
261         for(int platform = 0; platform < num_platforms; platform++) {
262                 cl_platform_id platform_id = platform_ids[platform];
263                 char pname[256];
264                 if(clGetPlatformInfo(platform_id,
265                                      CL_PLATFORM_NAME,
266                                      sizeof(pname),
267                                      &pname,
268                                      NULL) != CL_SUCCESS)
269                 {
270                         FIRST_VLOG(2) << "Failed to get platform name, ignoring.";
271                         continue;
272                 }
273                 string platform_name = pname;
274                 FIRST_VLOG(2) << "Enumerating devices for platform "
275                               << platform_name << ".";
276                 if(!opencl_platform_version_check(platform_id)) {
277                         FIRST_VLOG(2) << "Ignoring platform " << platform_name
278                                       << " due to too old compiler version.";
279                         continue;
280                 }
281                 num_devices = 0;
282                 if(clGetDeviceIDs(platform_id,
283                                   device_type,
284                                   0,
285                                   NULL,
286                                   &num_devices) != CL_SUCCESS || num_devices == 0)
287                 {
288                         FIRST_VLOG(2) << "Ignoring platform " << platform_name
289                                       << ", failed to fetch number of devices.";
290                         continue;
291                 }
292                 device_ids.resize(num_devices);
293                 if(clGetDeviceIDs(platform_id,
294                                   device_type,
295                                   num_devices,
296                                   &device_ids[0],
297                                   NULL) != CL_SUCCESS)
298                 {
299                         FIRST_VLOG(2) << "Ignoring platform " << platform_name
300                                       << ", failed to fetch devices list.";
301                         continue;
302                 }
303                 for(int num = 0; num < num_devices; num++) {
304                         cl_device_id device_id = device_ids[num];
305                         char device_name[1024] = "\0";
306                         if(clGetDeviceInfo(device_id,
307                                            CL_DEVICE_NAME,
308                                            sizeof(device_name),
309                                            &device_name,
310                                            NULL) != CL_SUCCESS)
311                         {
312                                 FIRST_VLOG(2) << "Failed to fetch device name, ignoring.";
313                                 continue;
314                         }
315                         if(!opencl_device_version_check(device_id)) {
316                                 FIRST_VLOG(2) << "Ignoring device " << device_name
317                                               << " due to old compiler version.";
318                                 continue;
319                         }
320                         if(force_all_platforms ||
321                            opencl_device_supported(platform_name, device_id))
322                         {
323                                 cl_device_type device_type;
324                                 if(clGetDeviceInfo(device_id,
325                                                    CL_DEVICE_TYPE,
326                                                    sizeof(cl_device_type),
327                                                    &device_type,
328                                                    NULL) != CL_SUCCESS)
329                                 {
330                                         FIRST_VLOG(2) << "Ignoring device " << device_name
331                                                       << ", failed to fetch device type.";
332                                         continue;
333                                 }
334                                 FIRST_VLOG(2) << "Adding new device " << device_name << ".";
335                                 usable_devices->push_back(OpenCLPlatformDevice(platform_id,
336                                                                                platform_name,
337                                                                                device_id,
338                                                                                device_type,
339                                                                                device_name));
340                         }
341                         else {
342                                 FIRST_VLOG(2) << "Ignoring device " << device_name
343                                               << ", not officially supported yet.";
344                         }
345                 }
346         }
347         first_time = false;
348 }
349
350 }  /* namespace */
351
352 /* Thread safe cache for contexts and programs.
353  *
354  * TODO(sergey): Make it more generous, so it can contain any type of program
355  * without hardcoding possible program types in the slot.
356  */
357 class OpenCLCache
358 {
359         struct Slot
360         {
361                 thread_mutex *mutex;
362                 cl_context context;
363                 /* cl_program for shader, bake, film_convert kernels (used in OpenCLDeviceBase) */
364                 cl_program ocl_dev_base_program;
365                 /* cl_program for megakernel (used in OpenCLDeviceMegaKernel) */
366                 cl_program ocl_dev_megakernel_program;
367
368                 Slot() : mutex(NULL),
369                          context(NULL),
370                          ocl_dev_base_program(NULL),
371                          ocl_dev_megakernel_program(NULL) {}
372
373                 Slot(const Slot& rhs)
374                     : mutex(rhs.mutex),
375                       context(rhs.context),
376                       ocl_dev_base_program(rhs.ocl_dev_base_program),
377                       ocl_dev_megakernel_program(rhs.ocl_dev_megakernel_program)
378                 {
379                         /* copy can only happen in map insert, assert that */
380                         assert(mutex == NULL);
381                 }
382
383                 ~Slot()
384                 {
385                         delete mutex;
386                         mutex = NULL;
387                 }
388         };
389
390         /* key is combination of platform ID and device ID */
391         typedef pair<cl_platform_id, cl_device_id> PlatformDevicePair;
392
393         /* map of Slot objects */
394         typedef map<PlatformDevicePair, Slot> CacheMap;
395         CacheMap cache;
396
397         thread_mutex cache_lock;
398
399         /* lazy instantiate */
400         static OpenCLCache &global_instance()
401         {
402                 static OpenCLCache instance;
403                 return instance;
404         }
405
406         OpenCLCache()
407         {
408         }
409
410         ~OpenCLCache()
411         {
412                 /* Intel OpenCL bug raises SIGABRT due to pure virtual call
413                  * so this is disabled. It's not necessary to free objects
414                  * at process exit anyway.
415                  * http://software.intel.com/en-us/forums/topic/370083#comments */
416
417                 //flush();
418         }
419
420         /* lookup something in the cache. If this returns NULL, slot_locker
421          * will be holding a lock for the cache. slot_locker should refer to a
422          * default constructed thread_scoped_lock */
423         template<typename T>
424         static T get_something(cl_platform_id platform,
425                                cl_device_id device,
426                                T Slot::*member,
427                                thread_scoped_lock& slot_locker)
428         {
429                 assert(platform != NULL);
430
431                 OpenCLCache& self = global_instance();
432
433                 thread_scoped_lock cache_lock(self.cache_lock);
434
435                 pair<CacheMap::iterator,bool> ins = self.cache.insert(
436                         CacheMap::value_type(PlatformDevicePair(platform, device), Slot()));
437
438                 Slot &slot = ins.first->second;
439
440                 /* create slot lock only while holding cache lock */
441                 if(!slot.mutex)
442                         slot.mutex = new thread_mutex;
443
444                 /* need to unlock cache before locking slot, to allow store to complete */
445                 cache_lock.unlock();
446
447                 /* lock the slot */
448                 slot_locker = thread_scoped_lock(*slot.mutex);
449
450                 /* If the thing isn't cached */
451                 if(slot.*member == NULL) {
452                         /* return with the caller's lock holder holding the slot lock */
453                         return NULL;
454                 }
455
456                 /* the item was already cached, release the slot lock */
457                 slot_locker.unlock();
458
459                 return slot.*member;
460         }
461
462         /* store something in the cache. you MUST have tried to get the item before storing to it */
463         template<typename T>
464         static void store_something(cl_platform_id platform,
465                                     cl_device_id device,
466                                     T thing,
467                                     T Slot::*member,
468                                     thread_scoped_lock& slot_locker)
469         {
470                 assert(platform != NULL);
471                 assert(device != NULL);
472                 assert(thing != NULL);
473
474                 OpenCLCache &self = global_instance();
475
476                 thread_scoped_lock cache_lock(self.cache_lock);
477                 CacheMap::iterator i = self.cache.find(PlatformDevicePair(platform, device));
478                 cache_lock.unlock();
479
480                 Slot &slot = i->second;
481
482                 /* sanity check */
483                 assert(i != self.cache.end());
484                 assert(slot.*member == NULL);
485
486                 slot.*member = thing;
487
488                 /* unlock the slot */
489                 slot_locker.unlock();
490         }
491
492 public:
493
494         enum ProgramName {
495                 OCL_DEV_BASE_PROGRAM,
496                 OCL_DEV_MEGAKERNEL_PROGRAM,
497         };
498
499         /* see get_something comment */
500         static cl_context get_context(cl_platform_id platform,
501                                       cl_device_id device,
502                                       thread_scoped_lock& slot_locker)
503         {
504                 cl_context context = get_something<cl_context>(platform,
505                                                                device,
506                                                                &Slot::context,
507                                                                slot_locker);
508
509                 if(!context)
510                         return NULL;
511
512                 /* caller is going to release it when done with it, so retain it */
513                 cl_int ciErr = clRetainContext(context);
514                 assert(ciErr == CL_SUCCESS);
515                 (void)ciErr;
516
517                 return context;
518         }
519
520         /* see get_something comment */
521         static cl_program get_program(cl_platform_id platform,
522                                       cl_device_id device,
523                                       ProgramName program_name,
524                                       thread_scoped_lock& slot_locker)
525         {
526                 cl_program program = NULL;
527
528                 switch(program_name) {
529                         case OCL_DEV_BASE_PROGRAM:
530                                 /* Get program related to OpenCLDeviceBase */
531                                 program = get_something<cl_program>(platform,
532                                                                     device,
533                                                                     &Slot::ocl_dev_base_program,
534                                                                     slot_locker);
535                                 break;
536                         case OCL_DEV_MEGAKERNEL_PROGRAM:
537                                 /* Get program related to megakernel */
538                                 program = get_something<cl_program>(platform,
539                                                                     device,
540                                                                     &Slot::ocl_dev_megakernel_program,
541                                                                     slot_locker);
542                                 break;
543                 default:
544                         assert(!"Invalid program name");
545                 }
546
547                 if(!program)
548                         return NULL;
549
550                 /* caller is going to release it when done with it, so retain it */
551                 cl_int ciErr = clRetainProgram(program);
552                 assert(ciErr == CL_SUCCESS);
553                 (void)ciErr;
554
555                 return program;
556         }
557
558         /* see store_something comment */
559         static void store_context(cl_platform_id platform,
560                                   cl_device_id device,
561                                   cl_context context,
562                                   thread_scoped_lock& slot_locker)
563         {
564                 store_something<cl_context>(platform,
565                                             device,
566                                             context,
567                                             &Slot::context,
568                                             slot_locker);
569
570                 /* increment reference count in OpenCL.
571                  * The caller is going to release the object when done with it. */
572                 cl_int ciErr = clRetainContext(context);
573                 assert(ciErr == CL_SUCCESS);
574                 (void)ciErr;
575         }
576
577         /* see store_something comment */
578         static void store_program(cl_platform_id platform,
579                                   cl_device_id device,
580                                   cl_program program,
581                                   ProgramName program_name,
582                                   thread_scoped_lock& slot_locker)
583         {
584                 switch (program_name) {
585                         case OCL_DEV_BASE_PROGRAM:
586                                 store_something<cl_program>(platform,
587                                                             device,
588                                                             program,
589                                                             &Slot::ocl_dev_base_program,
590                                                             slot_locker);
591                                 break;
592                         case OCL_DEV_MEGAKERNEL_PROGRAM:
593                                 store_something<cl_program>(platform,
594                                                             device,
595                                                             program,
596                                                             &Slot::ocl_dev_megakernel_program,
597                                                             slot_locker);
598                                 break;
599                         default:
600                                 assert(!"Invalid program name\n");
601                                 return;
602                 }
603
604                 /* Increment reference count in OpenCL.
605                  * The caller is going to release the object when done with it.
606                  */
607                 cl_int ciErr = clRetainProgram(program);
608                 assert(ciErr == CL_SUCCESS);
609                 (void)ciErr;
610         }
611
612         /* Discard all cached contexts and programs.  */
613         static void flush()
614         {
615                 OpenCLCache &self = global_instance();
616                 thread_scoped_lock cache_lock(self.cache_lock);
617
618                 foreach(CacheMap::value_type &item, self.cache) {
619                         if(item.second.ocl_dev_base_program != NULL)
620                                 clReleaseProgram(item.second.ocl_dev_base_program);
621                         if(item.second.ocl_dev_megakernel_program != NULL)
622                                 clReleaseProgram(item.second.ocl_dev_megakernel_program);
623                         if(item.second.context != NULL)
624                                 clReleaseContext(item.second.context);
625                 }
626
627                 self.cache.clear();
628         }
629 };
630
631 class OpenCLDeviceBase : public Device
632 {
633 public:
634         DedicatedTaskPool task_pool;
635         cl_context cxContext;
636         cl_command_queue cqCommandQueue;
637         cl_platform_id cpPlatform;
638         cl_device_id cdDevice;
639         cl_program cpProgram;
640         cl_kernel ckFilmConvertByteKernel;
641         cl_kernel ckFilmConvertHalfFloatKernel;
642         cl_kernel ckShaderKernel;
643         cl_kernel ckBakeKernel;
644         cl_int ciErr;
645
646         typedef map<string, device_vector<uchar>*> ConstMemMap;
647         typedef map<string, device_ptr> MemMap;
648
649         ConstMemMap const_mem_map;
650         MemMap mem_map;
651         device_ptr null_mem;
652
653         bool device_initialized;
654         string platform_name;
655
656         bool opencl_error(cl_int err)
657         {
658                 if(err != CL_SUCCESS) {
659                         string message = string_printf("OpenCL error (%d): %s", err, clewErrorString(err));
660                         if(error_msg == "")
661                                 error_msg = message;
662                         fprintf(stderr, "%s\n", message.c_str());
663                         return true;
664                 }
665
666                 return false;
667         }
668
669         void opencl_error(const string& message)
670         {
671                 if(error_msg == "")
672                         error_msg = message;
673                 fprintf(stderr, "%s\n", message.c_str());
674         }
675
676 #define opencl_assert(stmt) \
677         { \
678                 cl_int err = stmt; \
679                 \
680                 if(err != CL_SUCCESS) { \
681                         string message = string_printf("OpenCL error: %s in %s", clewErrorString(err), #stmt); \
682                         if(error_msg == "") \
683                                 error_msg = message; \
684                         fprintf(stderr, "%s\n", message.c_str()); \
685                 } \
686         } (void)0
687
688         void opencl_assert_err(cl_int err, const char* where)
689         {
690                 if(err != CL_SUCCESS) {
691                         string message = string_printf("OpenCL error (%d): %s in %s", err, clewErrorString(err), where);
692                         if(error_msg == "")
693                                 error_msg = message;
694                         fprintf(stderr, "%s\n", message.c_str());
695 #ifndef NDEBUG
696                         abort();
697 #endif
698                 }
699         }
700
701         OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool background_)
702         : Device(info, stats, background_)
703         {
704                 cpPlatform = NULL;
705                 cdDevice = NULL;
706                 cxContext = NULL;
707                 cqCommandQueue = NULL;
708                 cpProgram = NULL;
709                 ckFilmConvertByteKernel = NULL;
710                 ckFilmConvertHalfFloatKernel = NULL;
711                 ckShaderKernel = NULL;
712                 ckBakeKernel = NULL;
713                 null_mem = 0;
714                 device_initialized = false;
715
716                 vector<OpenCLPlatformDevice> usable_devices;
717                 opencl_get_usable_devices(&usable_devices);
718                 if(usable_devices.size() == 0) {
719                         opencl_error("OpenCL: no devices found.");
720                         return;
721                 }
722                 assert(info.num < usable_devices.size());
723                 OpenCLPlatformDevice& platform_device = usable_devices[info.num];
724                 cpPlatform = platform_device.platform_id;
725                 cdDevice = platform_device.device_id;
726                 platform_name = platform_device.platform_name;
727                 VLOG(2) << "Creating new Cycles device for OpenCL platform "
728                         << platform_name << ", device "
729                         << platform_device.device_name << ".";
730
731                 {
732                         /* try to use cached context */
733                         thread_scoped_lock cache_locker;
734                         cxContext = OpenCLCache::get_context(cpPlatform, cdDevice, cache_locker);
735
736                         if(cxContext == NULL) {
737                                 /* create context properties array to specify platform */
738                                 const cl_context_properties context_props[] = {
739                                         CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform,
740                                         0, 0
741                                 };
742
743                                 /* create context */
744                                 cxContext = clCreateContext(context_props, 1, &cdDevice,
745                                         context_notify_callback, cdDevice, &ciErr);
746
747                                 if(opencl_error(ciErr)) {
748                                         opencl_error("OpenCL: clCreateContext failed");
749                                         return;
750                                 }
751
752                                 /* cache it */
753                                 OpenCLCache::store_context(cpPlatform, cdDevice, cxContext, cache_locker);
754                         }
755                 }
756
757                 cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr);
758                 if(opencl_error(ciErr))
759                         return;
760
761                 null_mem = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_ONLY, 1, NULL, &ciErr);
762                 if(opencl_error(ciErr))
763                         return;
764
765                 fprintf(stderr, "Device init success\n");
766                 device_initialized = true;
767         }
768
769         static void CL_CALLBACK context_notify_callback(const char *err_info,
770                 const void * /*private_info*/, size_t /*cb*/, void *user_data)
771         {
772                 char name[256];
773                 clGetDeviceInfo((cl_device_id)user_data, CL_DEVICE_NAME, sizeof(name), &name, NULL);
774
775                 fprintf(stderr, "OpenCL error (%s): %s\n", name, err_info);
776         }
777
778         bool opencl_version_check()
779         {
780                 string error;
781                 if(!opencl_platform_version_check(cpPlatform, &error)) {
782                         opencl_error(error);
783                         return false;
784                 }
785                 if(!opencl_device_version_check(cdDevice, &error)) {
786                         opencl_error(error);
787                         return false;
788                 }
789                 return true;
790         }
791
792         bool load_binary(const string& /*kernel_path*/,
793                          const string& clbin,
794                          string custom_kernel_build_options,
795                          cl_program *program,
796                          const string *debug_src = NULL)
797         {
798                 /* read binary into memory */
799                 vector<uint8_t> binary;
800
801                 if(!path_read_binary(clbin, binary)) {
802                         opencl_error(string_printf("OpenCL failed to read cached binary %s.", clbin.c_str()));
803                         return false;
804                 }
805
806                 /* create program */
807                 cl_int status;
808                 size_t size = binary.size();
809                 const uint8_t *bytes = &binary[0];
810
811                 *program = clCreateProgramWithBinary(cxContext, 1, &cdDevice,
812                         &size, &bytes, &status, &ciErr);
813
814                 if(opencl_error(status) || opencl_error(ciErr)) {
815                         opencl_error(string_printf("OpenCL failed create program from cached binary %s.", clbin.c_str()));
816                         return false;
817                 }
818
819                 if(!build_kernel(program, custom_kernel_build_options, debug_src))
820                         return false;
821
822                 return true;
823         }
824
825         bool save_binary(cl_program *program, const string& clbin)
826         {
827                 size_t size = 0;
828                 clGetProgramInfo(*program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL);
829
830                 if(!size)
831                         return false;
832
833                 vector<uint8_t> binary(size);
834                 uint8_t *bytes = &binary[0];
835
836                 clGetProgramInfo(*program, CL_PROGRAM_BINARIES, sizeof(uint8_t*), &bytes, NULL);
837
838                 if(!path_write_binary(clbin, binary)) {
839                         opencl_error(string_printf("OpenCL failed to write cached binary %s.", clbin.c_str()));
840                         return false;
841                 }
842
843                 return true;
844         }
845
846         bool build_kernel(cl_program *kernel_program,
847                           string custom_kernel_build_options,
848                           const string *debug_src = NULL)
849         {
850                 string build_options;
851                 build_options = kernel_build_options(debug_src) + custom_kernel_build_options;
852
853                 ciErr = clBuildProgram(*kernel_program, 0, NULL, build_options.c_str(), NULL, NULL);
854
855                 /* show warnings even if build is successful */
856                 size_t ret_val_size = 0;
857
858                 clGetProgramBuildInfo(*kernel_program, cdDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
859
860                 if(ret_val_size > 1) {
861                         vector<char> build_log(ret_val_size + 1);
862                         clGetProgramBuildInfo(*kernel_program, cdDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, &build_log[0], NULL);
863
864                         build_log[ret_val_size] = '\0';
865                         /* Skip meaningless empty output from the NVidia compiler. */
866                         if(!(ret_val_size == 2 && build_log[0] == '\n')) {
867                                 fprintf(stderr, "OpenCL kernel build output:\n");
868                                 fprintf(stderr, "%s\n", &build_log[0]);
869                         }
870                 }
871
872                 if(ciErr != CL_SUCCESS) {
873                         opencl_error("OpenCL build failed: errors in console");
874                         return false;
875                 }
876
877                 return true;
878         }
879
880         bool compile_kernel(const string& kernel_path,
881                             string source,
882                             string custom_kernel_build_options,
883                             cl_program *kernel_program,
884                             const string *debug_src = NULL)
885         {
886                 /* we compile kernels consisting of many files. unfortunately opencl
887                  * kernel caches do not seem to recognize changes in included files.
888                  * so we force recompile on changes by adding the md5 hash of all files */
889                 source = path_source_replace_includes(source, kernel_path);
890
891                 if(debug_src)
892                         path_write_text(*debug_src, source);
893
894                 size_t source_len = source.size();
895                 const char *source_str = source.c_str();
896
897                 *kernel_program = clCreateProgramWithSource(cxContext, 1, &source_str, &source_len, &ciErr);
898
899                 if(opencl_error(ciErr))
900                         return false;
901
902                 double starttime = time_dt();
903                 printf("Compiling OpenCL kernel ...\n");
904                 /* TODO(sergey): Report which kernel is being compiled
905                  * as well (megakernel or which of split kernels etc..).
906                  */
907                 printf("Build flags: %s\n", custom_kernel_build_options.c_str());
908
909                 if(!build_kernel(kernel_program, custom_kernel_build_options, debug_src))
910                         return false;
911
912                 printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);
913
914                 return true;
915         }
916
917         string device_md5_hash(string kernel_custom_build_options = "")
918         {
919                 MD5Hash md5;
920                 char version[256], driver[256], name[256], vendor[256];
921
922                 clGetPlatformInfo(cpPlatform, CL_PLATFORM_VENDOR, sizeof(vendor), &vendor, NULL);
923                 clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL);
924                 clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(name), &name, NULL);
925                 clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(driver), &driver, NULL);
926
927                 md5.append((uint8_t*)vendor, strlen(vendor));
928                 md5.append((uint8_t*)version, strlen(version));
929                 md5.append((uint8_t*)name, strlen(name));
930                 md5.append((uint8_t*)driver, strlen(driver));
931
932                 string options = kernel_build_options();
933                 options += kernel_custom_build_options;
934                 md5.append((uint8_t*)options.c_str(), options.size());
935
936                 return md5.get_hex();
937         }
938
939         bool load_kernels(const DeviceRequestedFeatures& requested_features)
940         {
941                 /* Verify if device was initialized. */
942                 if(!device_initialized) {
943                         fprintf(stderr, "OpenCL: failed to initialize device.\n");
944                         return false;
945                 }
946
947                 /* Try to use cached kernel. */
948                 thread_scoped_lock cache_locker;
949                 cpProgram = load_cached_kernel(requested_features,
950                                                OpenCLCache::OCL_DEV_BASE_PROGRAM,
951                                                cache_locker);
952
953                 if(!cpProgram) {
954                         VLOG(2) << "No cached OpenCL kernel.";
955
956                         /* Verify we have right opencl version. */
957                         if(!opencl_version_check())
958                                 return false;
959
960                         string build_flags = build_options_for_base_program(requested_features);
961
962                         /* Calculate md5 hashes to detect changes. */
963                         string kernel_path = path_get("kernel");
964                         string kernel_md5 = path_files_md5_hash(kernel_path);
965                         string device_md5 = device_md5_hash(build_flags);
966
967                         /* Path to cached binary.
968                          *
969                          * TODO(sergey): Seems we could de-duplicate all this string_printf()
970                          * calls with some utility function which will give file name for a
971                          * given hashes..
972                          */
973                         string clbin = string_printf("cycles_kernel_%s_%s.clbin",
974                                                      device_md5.c_str(),
975                                                      kernel_md5.c_str());
976                         clbin = path_user_get(path_join("cache", clbin));
977
978                         /* path to preprocessed source for debugging */
979                         string clsrc, *debug_src = NULL;
980
981                         if(opencl_kernel_use_debug()) {
982                                 clsrc = string_printf("cycles_kernel_%s_%s.cl",
983                                                       device_md5.c_str(),
984                                                       kernel_md5.c_str());
985                                 clsrc = path_user_get(path_join("cache", clsrc));
986                                 debug_src = &clsrc;
987                         }
988
989                         /* If binary kernel exists already, try use it. */
990                         if(path_exists(clbin) && load_binary(kernel_path,
991                                                              clbin,
992                                                              build_flags,
993                                                              &cpProgram)) {
994                                 /* Kernel loaded from binary, nothing to do. */
995                                 VLOG(2) << "Loaded kernel from " << clbin << ".";
996                         }
997                         else {
998                                 VLOG(2) << "Kernel file " << clbin << " either doesn't exist or failed to be loaded by driver.";
999                                 string init_kernel_source = "#include \"kernels/opencl/kernel.cl\" // " + kernel_md5 + "\n";
1000
1001                                 /* If does not exist or loading binary failed, compile kernel. */
1002                                 if(!compile_kernel(kernel_path,
1003                                                    init_kernel_source,
1004                                                    build_flags,
1005                                                    &cpProgram,
1006                                                    debug_src))
1007                                 {
1008                                         return false;
1009                                 }
1010
1011                                 /* Save binary for reuse. */
1012                                 if(!save_binary(&cpProgram, clbin)) {
1013                                         return false;
1014                                 }
1015                         }
1016
1017                         /* Cache the program. */
1018                         store_cached_kernel(cpPlatform,
1019                                             cdDevice,
1020                                             cpProgram,
1021                                             OpenCLCache::OCL_DEV_BASE_PROGRAM,
1022                                             cache_locker);
1023                 }
1024                 else {
1025                         VLOG(2) << "Found cached OpenCL kernel.";
1026                 }
1027
1028                 /* Find kernels. */
1029 #define FIND_KERNEL(kernel_var, kernel_name) \
1030                 do { \
1031                         kernel_var = clCreateKernel(cpProgram, "kernel_ocl_" kernel_name, &ciErr); \
1032                         if(opencl_error(ciErr)) \
1033                                 return false; \
1034                 } while(0)
1035
1036                 FIND_KERNEL(ckFilmConvertByteKernel, "convert_to_byte");
1037                 FIND_KERNEL(ckFilmConvertHalfFloatKernel, "convert_to_half_float");
1038                 FIND_KERNEL(ckShaderKernel, "shader");
1039                 FIND_KERNEL(ckBakeKernel, "bake");
1040
1041 #undef FIND_KERNEL
1042                 return true;
1043         }
1044
1045         ~OpenCLDeviceBase()
1046         {
1047                 task_pool.stop();
1048
1049                 if(null_mem)
1050                         clReleaseMemObject(CL_MEM_PTR(null_mem));
1051
1052                 ConstMemMap::iterator mt;
1053                 for(mt = const_mem_map.begin(); mt != const_mem_map.end(); mt++) {
1054                         mem_free(*(mt->second));
1055                         delete mt->second;
1056                 }
1057
1058                 if(ckFilmConvertByteKernel)
1059                         clReleaseKernel(ckFilmConvertByteKernel);
1060                 if(ckFilmConvertHalfFloatKernel)
1061                         clReleaseKernel(ckFilmConvertHalfFloatKernel);
1062                 if(ckShaderKernel)
1063                         clReleaseKernel(ckShaderKernel);
1064                 if(ckBakeKernel)
1065                         clReleaseKernel(ckBakeKernel);
1066                 if(cpProgram)
1067                         clReleaseProgram(cpProgram);
1068                 if(cqCommandQueue)
1069                         clReleaseCommandQueue(cqCommandQueue);
1070                 if(cxContext)
1071                         clReleaseContext(cxContext);
1072         }
1073
1074         void mem_alloc(device_memory& mem, MemoryType type)
1075         {
1076                 size_t size = mem.memory_size();
1077
1078                 cl_mem_flags mem_flag;
1079                 void *mem_ptr = NULL;
1080
1081                 if(type == MEM_READ_ONLY)
1082                         mem_flag = CL_MEM_READ_ONLY;
1083                 else if(type == MEM_WRITE_ONLY)
1084                         mem_flag = CL_MEM_WRITE_ONLY;
1085                 else
1086                         mem_flag = CL_MEM_READ_WRITE;
1087
1088                 /* Zero-size allocation might be invoked by render, but not really
1089                  * supported by OpenCL. Using NULL as device pointer also doesn't really
1090                  * work for some reason, so for the time being we'll use special case
1091                  * will null_mem buffer.
1092                  */
1093                 if(size != 0) {
1094                         mem.device_pointer = (device_ptr)clCreateBuffer(cxContext,
1095                                                                         mem_flag,
1096                                                                         size,
1097                                                                         mem_ptr,
1098                                                                         &ciErr);
1099                         opencl_assert_err(ciErr, "clCreateBuffer");
1100                 }
1101                 else {
1102                         mem.device_pointer = null_mem;
1103                 }
1104
1105                 stats.mem_alloc(size);
1106                 mem.device_size = size;
1107         }
1108
1109         void mem_copy_to(device_memory& mem)
1110         {
1111                 /* this is blocking */
1112                 size_t size = mem.memory_size();
1113                 if(size != 0){
1114                         opencl_assert(clEnqueueWriteBuffer(cqCommandQueue,
1115                                                            CL_MEM_PTR(mem.device_pointer),
1116                                                            CL_TRUE,
1117                                                            0,
1118                                                            size,
1119                                                            (void*)mem.data_pointer,
1120                                                            0,
1121                                                            NULL, NULL));
1122                 }
1123         }
1124
1125         void mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
1126         {
1127                 size_t offset = elem*y*w;
1128                 size_t size = elem*w*h;
1129                 assert(size != 0);
1130                 opencl_assert(clEnqueueReadBuffer(cqCommandQueue,
1131                                                   CL_MEM_PTR(mem.device_pointer),
1132                                                   CL_TRUE,
1133                                                   offset,
1134                                                   size,
1135                                                   (uchar*)mem.data_pointer + offset,
1136                                                   0,
1137                                                   NULL, NULL));
1138         }
1139
1140         void mem_zero(device_memory& mem)
1141         {
1142                 if(mem.device_pointer) {
1143                         memset((void*)mem.data_pointer, 0, mem.memory_size());
1144                         mem_copy_to(mem);
1145                 }
1146         }
1147
1148         void mem_free(device_memory& mem)
1149         {
1150                 if(mem.device_pointer) {
1151                         if(mem.device_pointer != null_mem) {
1152                                 opencl_assert(clReleaseMemObject(CL_MEM_PTR(mem.device_pointer)));
1153                         }
1154                         mem.device_pointer = 0;
1155
1156                         stats.mem_free(mem.device_size);
1157                         mem.device_size = 0;
1158                 }
1159         }
1160
1161         void const_copy_to(const char *name, void *host, size_t size)
1162         {
1163                 ConstMemMap::iterator i = const_mem_map.find(name);
1164
1165                 if(i == const_mem_map.end()) {
1166                         device_vector<uchar> *data = new device_vector<uchar>();
1167                         data->copy((uchar*)host, size);
1168
1169                         mem_alloc(*data, MEM_READ_ONLY);
1170                         i = const_mem_map.insert(ConstMemMap::value_type(name, data)).first;
1171                 }
1172                 else {
1173                         device_vector<uchar> *data = i->second;
1174                         data->copy((uchar*)host, size);
1175                 }
1176
1177                 mem_copy_to(*i->second);
1178         }
1179
1180         void tex_alloc(const char *name,
1181                        device_memory& mem,
1182                        InterpolationType /*interpolation*/,
1183                        ExtensionType /*extension*/)
1184         {
1185                 VLOG(1) << "Texture allocate: " << name << ", " << mem.memory_size() << " bytes.";
1186                 mem_alloc(mem, MEM_READ_ONLY);
1187                 mem_copy_to(mem);
1188                 assert(mem_map.find(name) == mem_map.end());
1189                 mem_map.insert(MemMap::value_type(name, mem.device_pointer));
1190         }
1191
1192         void tex_free(device_memory& mem)
1193         {
1194                 if(mem.device_pointer) {
1195                         foreach(const MemMap::value_type& value, mem_map) {
1196                                 if(value.second == mem.device_pointer) {
1197                                         mem_map.erase(value.first);
1198                                         break;
1199                                 }
1200                         }
1201
1202                         mem_free(mem);
1203                 }
1204         }
1205
1206         size_t global_size_round_up(int group_size, int global_size)
1207         {
1208                 int r = global_size % group_size;
1209                 return global_size + ((r == 0)? 0: group_size - r);
1210         }
1211
1212         void enqueue_kernel(cl_kernel kernel, size_t w, size_t h)
1213         {
1214                 size_t workgroup_size, max_work_items[3];
1215
1216                 clGetKernelWorkGroupInfo(kernel, cdDevice,
1217                         CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL);
1218                 clGetDeviceInfo(cdDevice,
1219                         CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, max_work_items, NULL);
1220         
1221                 /* try to divide evenly over 2 dimensions */
1222                 size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1);
1223                 size_t local_size[2] = {sqrt_workgroup_size, sqrt_workgroup_size};
1224
1225                 /* some implementations have max size 1 on 2nd dimension */
1226                 if(local_size[1] > max_work_items[1]) {
1227                         local_size[0] = workgroup_size/max_work_items[1];
1228                         local_size[1] = max_work_items[1];
1229                 }
1230
1231                 size_t global_size[2] = {global_size_round_up(local_size[0], w), global_size_round_up(local_size[1], h)};
1232
1233                 /* run kernel */
1234                 opencl_assert(clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL));
1235                 opencl_assert(clFlush(cqCommandQueue));
1236         }
1237
1238         void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name)
1239         {
1240                 cl_mem ptr;
1241
1242                 MemMap::iterator i = mem_map.find(name);
1243                 if(i != mem_map.end()) {
1244                         ptr = CL_MEM_PTR(i->second);
1245                 }
1246                 else {
1247                         /* work around NULL not working, even though the spec says otherwise */
1248                         ptr = CL_MEM_PTR(null_mem);
1249                 }
1250                 
1251                 opencl_assert(clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void*)&ptr));
1252         }
1253
1254         void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half)
1255         {
1256                 /* cast arguments to cl types */
1257                 cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
1258                 cl_mem d_rgba = (rgba_byte)? CL_MEM_PTR(rgba_byte): CL_MEM_PTR(rgba_half);
1259                 cl_mem d_buffer = CL_MEM_PTR(buffer);
1260                 cl_int d_x = task.x;
1261                 cl_int d_y = task.y;
1262                 cl_int d_w = task.w;
1263                 cl_int d_h = task.h;
1264                 cl_float d_sample_scale = 1.0f/(task.sample + 1);
1265                 cl_int d_offset = task.offset;
1266                 cl_int d_stride = task.stride;
1267
1268
1269                 cl_kernel ckFilmConvertKernel = (rgba_byte)? ckFilmConvertByteKernel: ckFilmConvertHalfFloatKernel;
1270
1271                 cl_uint start_arg_index =
1272                         kernel_set_args(ckFilmConvertKernel,
1273                                         0,
1274                                         d_data,
1275                                         d_rgba,
1276                                         d_buffer);
1277
1278 #define KERNEL_TEX(type, ttype, name) \
1279         set_kernel_arg_mem(ckFilmConvertKernel, &start_arg_index, #name);
1280 #include "kernel_textures.h"
1281 #undef KERNEL_TEX
1282
1283                 start_arg_index += kernel_set_args(ckFilmConvertKernel,
1284                                                    start_arg_index,
1285                                                    d_sample_scale,
1286                                                    d_x,
1287                                                    d_y,
1288                                                    d_w,
1289                                                    d_h,
1290                                                    d_offset,
1291                                                    d_stride);
1292
1293                 enqueue_kernel(ckFilmConvertKernel, d_w, d_h);
1294         }
1295
1296         void shader(DeviceTask& task)
1297         {
1298                 /* cast arguments to cl types */
1299                 cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
1300                 cl_mem d_input = CL_MEM_PTR(task.shader_input);
1301                 cl_mem d_output = CL_MEM_PTR(task.shader_output);
1302                 cl_int d_shader_eval_type = task.shader_eval_type;
1303                 cl_int d_shader_x = task.shader_x;
1304                 cl_int d_shader_w = task.shader_w;
1305                 cl_int d_offset = task.offset;
1306
1307                 cl_kernel kernel;
1308
1309                 if(task.shader_eval_type >= SHADER_EVAL_BAKE)
1310                         kernel = ckBakeKernel;
1311                 else
1312                         kernel = ckShaderKernel;
1313
1314                 for(int sample = 0; sample < task.num_samples; sample++) {
1315
1316                         if(task.get_cancel())
1317                                 break;
1318
1319                         cl_int d_sample = sample;
1320
1321                         cl_uint start_arg_index =
1322                                 kernel_set_args(kernel,
1323                                                 0,
1324                                                 d_data,
1325                                                 d_input,
1326                                                 d_output);
1327
1328 #define KERNEL_TEX(type, ttype, name) \
1329                 set_kernel_arg_mem(kernel, &start_arg_index, #name);
1330 #include "kernel_textures.h"
1331 #undef KERNEL_TEX
1332
1333                         start_arg_index += kernel_set_args(kernel,
1334                                                            start_arg_index,
1335                                                            d_shader_eval_type,
1336                                                            d_shader_x,
1337                                                            d_shader_w,
1338                                                            d_offset,
1339                                                            d_sample);
1340
1341                         enqueue_kernel(kernel, task.shader_w, 1);
1342
1343                         task.update_progress(NULL);
1344                 }
1345         }
1346
1347         class OpenCLDeviceTask : public DeviceTask {
1348         public:
1349                 OpenCLDeviceTask(OpenCLDeviceBase *device, DeviceTask& task)
1350                 : DeviceTask(task)
1351                 {
1352                         run = function_bind(&OpenCLDeviceBase::thread_run,
1353                                             device,
1354                                             this);
1355                 }
1356         };
1357
1358         int get_split_task_count(DeviceTask& /*task*/)
1359         {
1360                 return 1;
1361         }
1362
1363         void task_add(DeviceTask& task)
1364         {
1365                 task_pool.push(new OpenCLDeviceTask(this, task));
1366         }
1367
1368         void task_wait()
1369         {
1370                 task_pool.wait();
1371         }
1372
1373         void task_cancel()
1374         {
1375                 task_pool.cancel();
1376         }
1377
1378         virtual void thread_run(DeviceTask * /*task*/) = 0;
1379
1380 protected:
1381         string kernel_build_options(const string *debug_src = NULL)
1382         {
1383                 string build_options = " -cl-fast-relaxed-math ";
1384
1385                 if(platform_name == "NVIDIA CUDA") {
1386                         build_options += "-D__KERNEL_OPENCL_NVIDIA__ "
1387                                          "-cl-nv-maxrregcount=32 "
1388                                          "-cl-nv-verbose ";
1389
1390                         uint compute_capability_major, compute_capability_minor;
1391                         clGetDeviceInfo(cdDevice, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,
1392                                         sizeof(cl_uint), &compute_capability_major, NULL);
1393                         clGetDeviceInfo(cdDevice, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,
1394                                         sizeof(cl_uint), &compute_capability_minor, NULL);
1395
1396                         build_options += string_printf("-D__COMPUTE_CAPABILITY__=%u ",
1397                                                        compute_capability_major * 100 +
1398                                                        compute_capability_minor * 10);
1399                 }
1400
1401                 else if(platform_name == "Apple")
1402                         build_options += "-D__KERNEL_OPENCL_APPLE__ ";
1403
1404                 else if(platform_name == "AMD Accelerated Parallel Processing")
1405                         build_options += "-D__KERNEL_OPENCL_AMD__ ";
1406
1407                 else if(platform_name == "Intel(R) OpenCL") {
1408                         build_options += "-D__KERNEL_OPENCL_INTEL_CPU__ ";
1409
1410                         /* Options for gdb source level kernel debugging.
1411                          * this segfaults on linux currently.
1412                          */
1413                         if(opencl_kernel_use_debug() && debug_src)
1414                                 build_options += "-g -s \"" + *debug_src + "\" ";
1415                 }
1416
1417                 if(opencl_kernel_use_debug())
1418                         build_options += "-D__KERNEL_OPENCL_DEBUG__ ";
1419
1420 #ifdef WITH_CYCLES_DEBUG
1421                 build_options += "-D__KERNEL_DEBUG__ ";
1422 #endif
1423
1424                 return build_options;
1425         }
1426
1427         class ArgumentWrapper {
1428         public:
1429                 ArgumentWrapper() : size(0), pointer(NULL) {}
1430                 template <typename T>
1431                 ArgumentWrapper(T& argument) : size(sizeof(argument)),
1432                                                pointer(&argument) { }
1433                 size_t size;
1434                 void *pointer;
1435         };
1436
1437         /* TODO(sergey): In the future we can use variadic templates, once
1438          * C++0x is allowed. Should allow to clean this up a bit.
1439          */
1440         int kernel_set_args(cl_kernel kernel,
1441                             int start_argument_index,
1442                             const ArgumentWrapper& arg1 = ArgumentWrapper(),
1443                             const ArgumentWrapper& arg2 = ArgumentWrapper(),
1444                             const ArgumentWrapper& arg3 = ArgumentWrapper(),
1445                             const ArgumentWrapper& arg4 = ArgumentWrapper(),
1446                             const ArgumentWrapper& arg5 = ArgumentWrapper(),
1447                             const ArgumentWrapper& arg6 = ArgumentWrapper(),
1448                             const ArgumentWrapper& arg7 = ArgumentWrapper(),
1449                             const ArgumentWrapper& arg8 = ArgumentWrapper(),
1450                             const ArgumentWrapper& arg9 = ArgumentWrapper(),
1451                             const ArgumentWrapper& arg10 = ArgumentWrapper(),
1452                             const ArgumentWrapper& arg11 = ArgumentWrapper(),
1453                             const ArgumentWrapper& arg12 = ArgumentWrapper(),
1454                             const ArgumentWrapper& arg13 = ArgumentWrapper(),
1455                             const ArgumentWrapper& arg14 = ArgumentWrapper(),
1456                             const ArgumentWrapper& arg15 = ArgumentWrapper(),
1457                             const ArgumentWrapper& arg16 = ArgumentWrapper(),
1458                             const ArgumentWrapper& arg17 = ArgumentWrapper(),
1459                             const ArgumentWrapper& arg18 = ArgumentWrapper(),
1460                             const ArgumentWrapper& arg19 = ArgumentWrapper(),
1461                             const ArgumentWrapper& arg20 = ArgumentWrapper(),
1462                             const ArgumentWrapper& arg21 = ArgumentWrapper(),
1463                             const ArgumentWrapper& arg22 = ArgumentWrapper(),
1464                             const ArgumentWrapper& arg23 = ArgumentWrapper(),
1465                             const ArgumentWrapper& arg24 = ArgumentWrapper(),
1466                             const ArgumentWrapper& arg25 = ArgumentWrapper(),
1467                             const ArgumentWrapper& arg26 = ArgumentWrapper(),
1468                             const ArgumentWrapper& arg27 = ArgumentWrapper(),
1469                             const ArgumentWrapper& arg28 = ArgumentWrapper(),
1470                             const ArgumentWrapper& arg29 = ArgumentWrapper(),
1471                             const ArgumentWrapper& arg30 = ArgumentWrapper(),
1472                             const ArgumentWrapper& arg31 = ArgumentWrapper(),
1473                             const ArgumentWrapper& arg32 = ArgumentWrapper(),
1474                             const ArgumentWrapper& arg33 = ArgumentWrapper())
1475         {
1476                 int current_arg_index = 0;
1477 #define FAKE_VARARG_HANDLE_ARG(arg) \
1478                 do { \
1479                         if(arg.pointer != NULL) { \
1480                                 opencl_assert(clSetKernelArg( \
1481                                         kernel, \
1482                                         start_argument_index + current_arg_index, \
1483                                         arg.size, arg.pointer)); \
1484                                 ++current_arg_index; \
1485                         } \
1486                         else { \
1487                                 return current_arg_index; \
1488                         } \
1489                 } while(false)
1490                 FAKE_VARARG_HANDLE_ARG(arg1);
1491                 FAKE_VARARG_HANDLE_ARG(arg2);
1492                 FAKE_VARARG_HANDLE_ARG(arg3);
1493                 FAKE_VARARG_HANDLE_ARG(arg4);
1494                 FAKE_VARARG_HANDLE_ARG(arg5);
1495                 FAKE_VARARG_HANDLE_ARG(arg6);
1496                 FAKE_VARARG_HANDLE_ARG(arg7);
1497                 FAKE_VARARG_HANDLE_ARG(arg8);
1498                 FAKE_VARARG_HANDLE_ARG(arg9);
1499                 FAKE_VARARG_HANDLE_ARG(arg10);
1500                 FAKE_VARARG_HANDLE_ARG(arg11);
1501                 FAKE_VARARG_HANDLE_ARG(arg12);
1502                 FAKE_VARARG_HANDLE_ARG(arg13);
1503                 FAKE_VARARG_HANDLE_ARG(arg14);
1504                 FAKE_VARARG_HANDLE_ARG(arg15);
1505                 FAKE_VARARG_HANDLE_ARG(arg16);
1506                 FAKE_VARARG_HANDLE_ARG(arg17);
1507                 FAKE_VARARG_HANDLE_ARG(arg18);
1508                 FAKE_VARARG_HANDLE_ARG(arg19);
1509                 FAKE_VARARG_HANDLE_ARG(arg20);
1510                 FAKE_VARARG_HANDLE_ARG(arg21);
1511                 FAKE_VARARG_HANDLE_ARG(arg22);
1512                 FAKE_VARARG_HANDLE_ARG(arg23);
1513                 FAKE_VARARG_HANDLE_ARG(arg24);
1514                 FAKE_VARARG_HANDLE_ARG(arg25);
1515                 FAKE_VARARG_HANDLE_ARG(arg26);
1516                 FAKE_VARARG_HANDLE_ARG(arg27);
1517                 FAKE_VARARG_HANDLE_ARG(arg28);
1518                 FAKE_VARARG_HANDLE_ARG(arg29);
1519                 FAKE_VARARG_HANDLE_ARG(arg30);
1520                 FAKE_VARARG_HANDLE_ARG(arg31);
1521                 FAKE_VARARG_HANDLE_ARG(arg32);
1522                 FAKE_VARARG_HANDLE_ARG(arg33);
1523 #undef FAKE_VARARG_HANDLE_ARG
1524                 return current_arg_index;
1525         }
1526
1527         inline void release_kernel_safe(cl_kernel kernel)
1528         {
1529                 if(kernel) {
1530                         clReleaseKernel(kernel);
1531                 }
1532         }
1533
1534         inline void release_mem_object_safe(cl_mem mem)
1535         {
1536                 if(mem != NULL) {
1537                         clReleaseMemObject(mem);
1538                 }
1539         }
1540
1541         inline void release_program_safe(cl_program program)
1542         {
1543                 if(program) {
1544                         clReleaseProgram(program);
1545                 }
1546         }
1547
1548         string build_options_from_requested_features(
1549                 const DeviceRequestedFeatures& requested_features)
1550         {
1551                 string build_options = "";
1552                 if(requested_features.experimental) {
1553                         build_options += " -D__KERNEL_EXPERIMENTAL__";
1554                 }
1555                 build_options += " -D__NODES_MAX_GROUP__=" +
1556                         string_printf("%d", requested_features.max_nodes_group);
1557                 build_options += " -D__NODES_FEATURES__=" +
1558                         string_printf("%d", requested_features.nodes_features);
1559                 build_options += string_printf(" -D__MAX_CLOSURE__=%d",
1560                                                requested_features.max_closure);
1561                 if(!requested_features.use_hair) {
1562                         build_options += " -D__NO_HAIR__";
1563                 }
1564                 if(!requested_features.use_object_motion) {
1565                         build_options += " -D__NO_OBJECT_MOTION__";
1566                 }
1567                 if(!requested_features.use_camera_motion) {
1568                         build_options += " -D__NO_CAMERA_MOTION__";
1569                 }
1570                 if(!requested_features.use_baking) {
1571                         build_options += " -D__NO_BAKING__";
1572                 }
1573                 return build_options;
1574         }
1575
1576         /* ** Those guys are for workign around some compiler-specific bugs ** */
1577
1578         virtual cl_program load_cached_kernel(
1579                 const DeviceRequestedFeatures& /*requested_features*/,
1580                 OpenCLCache::ProgramName program_name,
1581                 thread_scoped_lock& cache_locker)
1582         {
1583                 return OpenCLCache::get_program(cpPlatform,
1584                                                 cdDevice,
1585                                                 program_name,
1586                                                 cache_locker);
1587         }
1588
1589         virtual void store_cached_kernel(cl_platform_id platform,
1590                                          cl_device_id device,
1591                                          cl_program program,
1592                                          OpenCLCache::ProgramName program_name,
1593                                          thread_scoped_lock& cache_locker)
1594         {
1595                 OpenCLCache::store_program(platform,
1596                                            device,
1597                                            program,
1598                                            program_name,
1599                                            cache_locker);
1600         }
1601
1602         virtual string build_options_for_base_program(
1603                 const DeviceRequestedFeatures& /*requested_features*/)
1604         {
1605                 /* TODO(sergey): By default we compile all features, meaning
1606                  * mega kernel is not getting feature-based optimizations.
1607                  *
1608                  * Ideally we need always compile kernel with as less features
1609                  * enabled as possible to keep performance at it's max.
1610                  */
1611                 return "";
1612         }
1613 };
1614
1615 class OpenCLDeviceMegaKernel : public OpenCLDeviceBase
1616 {
1617 public:
1618         cl_kernel ckPathTraceKernel;
1619         cl_program path_trace_program;
1620
1621         OpenCLDeviceMegaKernel(DeviceInfo& info, Stats &stats, bool background_)
1622         : OpenCLDeviceBase(info, stats, background_)
1623         {
1624                 ckPathTraceKernel = NULL;
1625                 path_trace_program = NULL;
1626         }
1627
1628         bool load_kernels(const DeviceRequestedFeatures& requested_features)
1629         {
1630                 /* Get Shader, bake and film convert kernels.
1631                  * It'll also do verification of OpenCL actually initialized.
1632                  */
1633                 if(!OpenCLDeviceBase::load_kernels(requested_features)) {
1634                         return false;
1635                 }
1636
1637                 /* Try to use cached kernel. */
1638                 thread_scoped_lock cache_locker;
1639                 path_trace_program = OpenCLCache::get_program(cpPlatform,
1640                                                               cdDevice,
1641                                                               OpenCLCache::OCL_DEV_MEGAKERNEL_PROGRAM,
1642                                                               cache_locker);
1643
1644                 if(!path_trace_program) {
1645                         /* Verify we have right opencl version. */
1646                         if(!opencl_version_check())
1647                                 return false;
1648
1649                         /* Calculate md5 hash to detect changes. */
1650                         string kernel_path = path_get("kernel");
1651                         string kernel_md5 = path_files_md5_hash(kernel_path);
1652                         string custom_kernel_build_options = "-D__COMPILE_ONLY_MEGAKERNEL__ ";
1653                         string device_md5 = device_md5_hash(custom_kernel_build_options);
1654
1655                         /* Path to cached binary. */
1656                         string clbin = string_printf("cycles_kernel_%s_%s.clbin",
1657                                                      device_md5.c_str(),
1658                                                      kernel_md5.c_str());
1659                         clbin = path_user_get(path_join("cache", clbin));
1660
1661                         /* Path to preprocessed source for debugging. */
1662                         string clsrc, *debug_src = NULL;
1663                         if(opencl_kernel_use_debug()) {
1664                                 clsrc = string_printf("cycles_kernel_%s_%s.cl",
1665                                                       device_md5.c_str(),
1666                                                       kernel_md5.c_str());
1667                                 clsrc = path_user_get(path_join("cache", clsrc));
1668                                 debug_src = &clsrc;
1669                         }
1670
1671                         /* If exists already, try use it. */
1672                         if(path_exists(clbin) && load_binary(kernel_path,
1673                                                              clbin,
1674                                                              custom_kernel_build_options,
1675                                                              &path_trace_program,
1676                                                              debug_src)) {
1677                                 /* Kernel loaded from binary, nothing to do. */
1678                         }
1679                         else {
1680                                 string init_kernel_source = "#include \"kernels/opencl/kernel.cl\" // " +
1681                                                             kernel_md5 + "\n";
1682                                 /* If does not exist or loading binary failed, compile kernel. */
1683                                 if(!compile_kernel(kernel_path,
1684                                                    init_kernel_source,
1685                                                    custom_kernel_build_options,
1686                                                    &path_trace_program,
1687                                                    debug_src))
1688                                 {
1689                                         return false;
1690                                 }
1691                                 /* Save binary for reuse. */
1692                                 if(!save_binary(&path_trace_program, clbin)) {
1693                                         return false;
1694                                 }
1695                         }
1696                         /* Cache the program. */
1697                         OpenCLCache::store_program(cpPlatform,
1698                                                    cdDevice,
1699                                                    path_trace_program,
1700                                                    OpenCLCache::OCL_DEV_MEGAKERNEL_PROGRAM,
1701                                                    cache_locker);
1702                 }
1703
1704                 /* Find kernels. */
1705                 ckPathTraceKernel = clCreateKernel(path_trace_program,
1706                                                    "kernel_ocl_path_trace",
1707                                                    &ciErr);
1708                 if(opencl_error(ciErr))
1709                         return false;
1710                 return true;
1711         }
1712
1713         ~OpenCLDeviceMegaKernel()
1714         {
1715                 task_pool.stop();
1716                 release_kernel_safe(ckPathTraceKernel);
1717                 release_program_safe(path_trace_program);
1718         }
1719
1720         void path_trace(RenderTile& rtile, int sample)
1721         {
1722                 /* Cast arguments to cl types. */
1723                 cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
1724                 cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
1725                 cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state);
1726                 cl_int d_x = rtile.x;
1727                 cl_int d_y = rtile.y;
1728                 cl_int d_w = rtile.w;
1729                 cl_int d_h = rtile.h;
1730                 cl_int d_offset = rtile.offset;
1731                 cl_int d_stride = rtile.stride;
1732
1733                 /* Sample arguments. */
1734                 cl_int d_sample = sample;
1735
1736                 cl_uint start_arg_index =
1737                         kernel_set_args(ckPathTraceKernel,
1738                                         0,
1739                                         d_data,
1740                                         d_buffer,
1741                                         d_rng_state);
1742
1743 #define KERNEL_TEX(type, ttype, name) \
1744                 set_kernel_arg_mem(ckPathTraceKernel, &start_arg_index, #name);
1745 #include "kernel_textures.h"
1746 #undef KERNEL_TEX
1747
1748                 start_arg_index += kernel_set_args(ckPathTraceKernel,
1749                                                    start_arg_index,
1750                                                    d_sample,
1751                                                    d_x,
1752                                                    d_y,
1753                                                    d_w,
1754                                                    d_h,
1755                                                    d_offset,
1756                                                    d_stride);
1757
1758                 enqueue_kernel(ckPathTraceKernel, d_w, d_h);
1759         }
1760
1761         void thread_run(DeviceTask *task)
1762         {
1763                 if(task->type == DeviceTask::FILM_CONVERT) {
1764                         film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half);
1765                 }
1766                 else if(task->type == DeviceTask::SHADER) {
1767                         shader(*task);
1768                 }
1769                 else if(task->type == DeviceTask::PATH_TRACE) {
1770                         RenderTile tile;
1771                         /* Keep rendering tiles until done. */
1772                         while(task->acquire_tile(this, tile)) {
1773                                 int start_sample = tile.start_sample;
1774                                 int end_sample = tile.start_sample + tile.num_samples;
1775
1776                                 for(int sample = start_sample; sample < end_sample; sample++) {
1777                                         if(task->get_cancel()) {
1778                                                 if(task->need_finish_queue == false)
1779                                                         break;
1780                                         }
1781
1782                                         path_trace(tile, sample);
1783
1784                                         tile.sample = sample + 1;
1785
1786                                         task->update_progress(&tile);
1787                                 }
1788
1789                                 /* Complete kernel execution before release tile */
1790                                 /* This helps in multi-device render;
1791                                  * The device that reaches the critical-section function
1792                                  * release_tile waits (stalling other devices from entering
1793                                  * release_tile) for all kernels to complete. If device1 (a
1794                                  * slow-render device) reaches release_tile first then it would
1795                                  * stall device2 (a fast-render device) from proceeding to render
1796                                  * next tile.
1797                                  */
1798                                 clFinish(cqCommandQueue);
1799
1800                                 task->release_tile(tile);
1801                         }
1802                 }
1803         }
1804 };
1805
1806 /* TODO(sergey): This is to keep tile split on OpenCL level working
1807  * for now, since without this view-port render does not work as it
1808  * should.
1809  *
1810  * Ideally it'll be done on the higher level, but we need to get ready
1811  * for merge rather soon, so let's keep split logic private here in
1812  * the file.
1813  */
1814 class SplitRenderTile : public RenderTile {
1815 public:
1816         SplitRenderTile()
1817                 : RenderTile(),
1818                   buffer_offset_x(0),
1819                   buffer_offset_y(0),
1820                   rng_state_offset_x(0),
1821                   rng_state_offset_y(0),
1822                   buffer_rng_state_stride(0) {}
1823
1824         explicit SplitRenderTile(RenderTile& tile)
1825                 : RenderTile(),
1826                   buffer_offset_x(0),
1827                   buffer_offset_y(0),
1828                   rng_state_offset_x(0),
1829                   rng_state_offset_y(0),
1830                   buffer_rng_state_stride(0)
1831         {
1832                 x = tile.x;
1833                 y = tile.y;
1834                 w = tile.w;
1835                 h = tile.h;
1836                 start_sample = tile.start_sample;
1837                 num_samples = tile.num_samples;
1838                 sample = tile.sample;
1839                 resolution = tile.resolution;
1840                 offset = tile.offset;
1841                 stride = tile.stride;
1842                 buffer = tile.buffer;
1843                 rng_state = tile.rng_state;
1844                 buffers = tile.buffers;
1845         }
1846
1847         /* Split kernel is device global memory constrained;
1848          * hence split kernel cant render big tile size's in
1849          * one go. If the user sets a big tile size (big tile size
1850          * is a term relative to the available device global memory),
1851          * we split the tile further and then call path_trace on
1852          * each of those split tiles. The following variables declared,
1853          * assist in achieving that purpose
1854          */
1855         int buffer_offset_x;
1856         int buffer_offset_y;
1857         int rng_state_offset_x;
1858         int rng_state_offset_y;
1859         int buffer_rng_state_stride;
1860 };
1861
1862 /* OpenCLDeviceSplitKernel's declaration/definition. */
1863 class OpenCLDeviceSplitKernel : public OpenCLDeviceBase
1864 {
1865 public:
1866         /* Kernel declaration. */
1867         cl_kernel ckPathTraceKernel_data_init;
1868         cl_kernel ckPathTraceKernel_scene_intersect;
1869         cl_kernel ckPathTraceKernel_lamp_emission;
1870         cl_kernel ckPathTraceKernel_queue_enqueue;
1871         cl_kernel ckPathTraceKernel_background_buffer_update;
1872         cl_kernel ckPathTraceKernel_shader_eval;
1873         cl_kernel ckPathTraceKernel_holdout_emission_blurring_pathtermination_ao;
1874         cl_kernel ckPathTraceKernel_direct_lighting;
1875         cl_kernel ckPathTraceKernel_shadow_blocked;
1876         cl_kernel ckPathTraceKernel_next_iteration_setup;
1877         cl_kernel ckPathTraceKernel_sum_all_radiance;
1878
1879         /* cl_program declaration. */
1880         cl_program data_init_program;
1881         cl_program scene_intersect_program;
1882         cl_program lamp_emission_program;
1883         cl_program queue_enqueue_program;
1884         cl_program background_buffer_update_program;
1885         cl_program shader_eval_program;
1886         cl_program holdout_emission_blurring_pathtermination_ao_program;
1887         cl_program direct_lighting_program;
1888         cl_program shadow_blocked_program;
1889         cl_program next_iteration_setup_program;
1890         cl_program sum_all_radiance_program;
1891
1892         /* Global memory variables [porting]; These memory is used for
1893          * co-operation between different kernels; Data written by one
1894          * kernel will be available to another kernel via this global
1895          * memory.
1896          */
1897         cl_mem rng_coop;
1898         cl_mem throughput_coop;
1899         cl_mem L_transparent_coop;
1900         cl_mem PathRadiance_coop;
1901         cl_mem Ray_coop;
1902         cl_mem PathState_coop;
1903         cl_mem Intersection_coop;
1904         cl_mem kgbuffer;  /* KernelGlobals buffer. */
1905
1906         /* Global buffers for ShaderData. */
1907         cl_mem sd;             /* ShaderData used in the main path-iteration loop. */
1908         cl_mem sd_DL_shadow;   /* ShaderData used in Direct Lighting and
1909                                 * shadow_blocked kernel.
1910                                 */
1911
1912         /* Global buffers of each member of ShaderData. */
1913         cl_mem P_sd;
1914         cl_mem P_sd_DL_shadow;
1915         cl_mem N_sd;
1916         cl_mem N_sd_DL_shadow;
1917         cl_mem Ng_sd;
1918         cl_mem Ng_sd_DL_shadow;
1919         cl_mem I_sd;
1920         cl_mem I_sd_DL_shadow;
1921         cl_mem shader_sd;
1922         cl_mem shader_sd_DL_shadow;
1923         cl_mem flag_sd;
1924         cl_mem flag_sd_DL_shadow;
1925         cl_mem prim_sd;
1926         cl_mem prim_sd_DL_shadow;
1927         cl_mem type_sd;
1928         cl_mem type_sd_DL_shadow;
1929         cl_mem u_sd;
1930         cl_mem u_sd_DL_shadow;
1931         cl_mem v_sd;
1932         cl_mem v_sd_DL_shadow;
1933         cl_mem object_sd;
1934         cl_mem object_sd_DL_shadow;
1935         cl_mem time_sd;
1936         cl_mem time_sd_DL_shadow;
1937         cl_mem ray_length_sd;
1938         cl_mem ray_length_sd_DL_shadow;
1939         cl_mem ray_depth_sd;
1940         cl_mem ray_depth_sd_DL_shadow;
1941         cl_mem transparent_depth_sd;
1942         cl_mem transparent_depth_sd_DL_shadow;
1943
1944         /* Ray differentials. */
1945         cl_mem dP_sd, dI_sd;
1946         cl_mem dP_sd_DL_shadow, dI_sd_DL_shadow;
1947         cl_mem du_sd, dv_sd;
1948         cl_mem du_sd_DL_shadow, dv_sd_DL_shadow;
1949
1950         /* Dp/Du */
1951         cl_mem dPdu_sd, dPdv_sd;
1952         cl_mem dPdu_sd_DL_shadow, dPdv_sd_DL_shadow;
1953
1954         /* Object motion. */
1955         cl_mem ob_tfm_sd, ob_itfm_sd;
1956         cl_mem ob_tfm_sd_DL_shadow, ob_itfm_sd_DL_shadow;
1957
1958         cl_mem closure_sd;
1959         cl_mem closure_sd_DL_shadow;
1960         cl_mem num_closure_sd;
1961         cl_mem num_closure_sd_DL_shadow;
1962         cl_mem randb_closure_sd;
1963         cl_mem randb_closure_sd_DL_shadow;
1964         cl_mem ray_P_sd;
1965         cl_mem ray_P_sd_DL_shadow;
1966         cl_mem ray_dP_sd;
1967         cl_mem ray_dP_sd_DL_shadow;
1968
1969         /* Global memory required for shadow blocked and accum_radiance. */
1970         cl_mem BSDFEval_coop;
1971         cl_mem ISLamp_coop;
1972         cl_mem LightRay_coop;
1973         cl_mem AOAlpha_coop;
1974         cl_mem AOBSDF_coop;
1975         cl_mem AOLightRay_coop;
1976         cl_mem Intersection_coop_AO;
1977         cl_mem Intersection_coop_DL;
1978
1979 #ifdef WITH_CYCLES_DEBUG
1980         /* DebugData memory */
1981         cl_mem debugdata_coop;
1982 #endif
1983
1984         /* Global state array that tracks ray state. */
1985         cl_mem ray_state;
1986
1987         /* Per sample buffers. */
1988         cl_mem per_sample_output_buffers;
1989
1990         /* Denotes which sample each ray is being processed for. */
1991         cl_mem work_array;
1992
1993         /* Queue */
1994         cl_mem Queue_data;  /* Array of size queuesize * num_queues * sizeof(int). */
1995         cl_mem Queue_index; /* Array of size num_queues * sizeof(int);
1996                              * Tracks the size of each queue.
1997                              */
1998
1999         /* Flag to make sceneintersect and lampemission kernel use queues. */
2000         cl_mem use_queues_flag;
2001
2002         /* Amount of memory in output buffer associated with one pixel/thread. */
2003         size_t per_thread_output_buffer_size;
2004
2005         /* Total allocatable available device memory. */
2006         size_t total_allocatable_memory;
2007
2008         /* host version of ray_state; Used in checking host path-iteration
2009          * termination.
2010          */
2011         char *hostRayStateArray;
2012
2013         /* Number of path-iterations to be done in one shot. */
2014         unsigned int PathIteration_times;
2015
2016 #ifdef __WORK_STEALING__
2017         /* Work pool with respect to each work group. */
2018         cl_mem work_pool_wgs;
2019
2020         /* Denotes the maximum work groups possible w.r.t. current tile size. */
2021         unsigned int max_work_groups;
2022 #endif
2023
2024         /* clos_max value for which the kernels have been loaded currently. */
2025         int current_max_closure;
2026
2027         /* Marked True in constructor and marked false at the end of path_trace(). */
2028         bool first_tile;
2029
2030         OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_)
2031         : OpenCLDeviceBase(info, stats, background_)
2032         {
2033                 background = background_;
2034
2035                 /* Initialize kernels. */
2036                 ckPathTraceKernel_data_init = NULL;
2037                 ckPathTraceKernel_scene_intersect = NULL;
2038                 ckPathTraceKernel_lamp_emission = NULL;
2039                 ckPathTraceKernel_background_buffer_update = NULL;
2040                 ckPathTraceKernel_shader_eval = NULL;
2041                 ckPathTraceKernel_holdout_emission_blurring_pathtermination_ao = NULL;
2042                 ckPathTraceKernel_direct_lighting = NULL;
2043                 ckPathTraceKernel_shadow_blocked = NULL;
2044                 ckPathTraceKernel_next_iteration_setup = NULL;
2045                 ckPathTraceKernel_sum_all_radiance = NULL;
2046                 ckPathTraceKernel_queue_enqueue = NULL;
2047
2048                 /* Initialize program. */
2049                 data_init_program = NULL;
2050                 scene_intersect_program = NULL;
2051                 lamp_emission_program = NULL;
2052                 queue_enqueue_program = NULL;
2053                 background_buffer_update_program = NULL;
2054                 shader_eval_program = NULL;
2055                 holdout_emission_blurring_pathtermination_ao_program = NULL;
2056                 direct_lighting_program = NULL;
2057                 shadow_blocked_program = NULL;
2058                 next_iteration_setup_program = NULL;
2059                 sum_all_radiance_program = NULL;
2060
2061                 /* Initialize cl_mem variables. */
2062                 kgbuffer = NULL;
2063                 sd = NULL;
2064                 sd_DL_shadow = NULL;
2065
2066                 P_sd = NULL;
2067                 P_sd_DL_shadow = NULL;
2068                 N_sd = NULL;
2069                 N_sd_DL_shadow = NULL;
2070                 Ng_sd = NULL;
2071                 Ng_sd_DL_shadow = NULL;
2072                 I_sd = NULL;
2073                 I_sd_DL_shadow = NULL;
2074                 shader_sd = NULL;
2075                 shader_sd_DL_shadow = NULL;
2076                 flag_sd = NULL;
2077                 flag_sd_DL_shadow = NULL;
2078                 prim_sd = NULL;
2079                 prim_sd_DL_shadow = NULL;
2080                 type_sd = NULL;
2081                 type_sd_DL_shadow = NULL;
2082                 u_sd = NULL;
2083                 u_sd_DL_shadow = NULL;
2084                 v_sd = NULL;
2085                 v_sd_DL_shadow = NULL;
2086                 object_sd = NULL;
2087                 object_sd_DL_shadow = NULL;
2088                 time_sd = NULL;
2089                 time_sd_DL_shadow = NULL;
2090                 ray_length_sd = NULL;
2091                 ray_length_sd_DL_shadow = NULL;
2092                 ray_depth_sd = NULL;
2093                 ray_depth_sd_DL_shadow = NULL;
2094                 transparent_depth_sd = NULL;
2095                 transparent_depth_sd_DL_shadow = NULL;
2096
2097                 /* Ray differentials. */
2098                 dP_sd = NULL;
2099                 dI_sd = NULL;
2100                 dP_sd_DL_shadow = NULL;
2101                 dI_sd_DL_shadow = NULL;
2102                 du_sd = NULL;
2103                 dv_sd = NULL;
2104                 du_sd_DL_shadow = NULL;
2105                 dv_sd_DL_shadow = NULL;
2106
2107                 /* Dp/Du */
2108                 dPdu_sd = NULL;
2109                 dPdv_sd = NULL;
2110                 dPdu_sd_DL_shadow = NULL;
2111                 dPdv_sd_DL_shadow = NULL;
2112
2113                 /* Object motion. */
2114                 ob_tfm_sd = NULL;
2115                 ob_itfm_sd = NULL;
2116                 ob_tfm_sd_DL_shadow = NULL;
2117                 ob_itfm_sd_DL_shadow = NULL;
2118
2119                 closure_sd = NULL;
2120                 closure_sd_DL_shadow = NULL;
2121                 num_closure_sd = NULL;
2122                 num_closure_sd_DL_shadow = NULL;
2123                 randb_closure_sd = NULL;
2124                 randb_closure_sd_DL_shadow = NULL;
2125                 ray_P_sd = NULL;
2126                 ray_P_sd_DL_shadow = NULL;
2127                 ray_dP_sd = NULL;
2128                 ray_dP_sd_DL_shadow = NULL;
2129
2130                 rng_coop = NULL;
2131                 throughput_coop = NULL;
2132                 L_transparent_coop = NULL;
2133                 PathRadiance_coop = NULL;
2134                 Ray_coop = NULL;
2135                 PathState_coop = NULL;
2136                 Intersection_coop = NULL;
2137                 ray_state = NULL;
2138
2139                 AOAlpha_coop = NULL;
2140                 AOBSDF_coop = NULL;
2141                 AOLightRay_coop = NULL;
2142                 BSDFEval_coop = NULL;
2143                 ISLamp_coop = NULL;
2144                 LightRay_coop = NULL;
2145                 Intersection_coop_AO = NULL;
2146                 Intersection_coop_DL = NULL;
2147
2148 #ifdef WITH_CYCLES_DEBUG
2149                 debugdata_coop = NULL;
2150 #endif
2151
2152                 work_array = NULL;
2153
2154                 /* Queue. */
2155                 Queue_data = NULL;
2156                 Queue_index = NULL;
2157                 use_queues_flag = NULL;
2158
2159                 per_sample_output_buffers = NULL;
2160
2161                 per_thread_output_buffer_size = 0;
2162                 hostRayStateArray = NULL;
2163                 PathIteration_times = PATH_ITER_INC_FACTOR;
2164 #ifdef __WORK_STEALING__
2165                 work_pool_wgs = NULL;
2166                 max_work_groups = 0;
2167 #endif
2168                 current_max_closure = -1;
2169                 first_tile = true;
2170
2171                 /* Get device's maximum memory that can be allocated. */
2172                 ciErr = clGetDeviceInfo(cdDevice,
2173                                         CL_DEVICE_MAX_MEM_ALLOC_SIZE,
2174                                         sizeof(size_t),
2175                                         &total_allocatable_memory,
2176                                         NULL);
2177                 assert(ciErr == CL_SUCCESS);
2178                 if(platform_name == "AMD Accelerated Parallel Processing") {
2179                         /* This value is tweak-able; AMD platform does not seem to
2180                          * give maximum performance when all of CL_DEVICE_MAX_MEM_ALLOC_SIZE
2181                          * is considered for further computation.
2182                          */
2183                         total_allocatable_memory /= 2;
2184                 }
2185         }
2186
2187         /* TODO(sergey): Seems really close to load_kernel(),
2188          * could it be de-duplicated?
2189          */
2190         bool load_split_kernel(string kernel_path,
2191                                string kernel_init_source,
2192                                string clbin,
2193                                string custom_kernel_build_options,
2194                                cl_program *program,
2195                                const string *debug_src = NULL)
2196         {
2197                 if(!opencl_version_check())
2198                         return false;
2199
2200                 clbin = path_user_get(path_join("cache", clbin));
2201
2202                 /* If exists already, try use it. */
2203                 if(path_exists(clbin) && load_binary(kernel_path,
2204                                                      clbin,
2205                                                      custom_kernel_build_options,
2206                                                      program,
2207                                                      debug_src)) {
2208                         /* Kernel loaded from binary. */
2209                 }
2210                 else {
2211                         /* If does not exist or loading binary failed, compile kernel. */
2212                         if(!compile_kernel(kernel_path,
2213                                            kernel_init_source,
2214                                            custom_kernel_build_options,
2215                                            program,
2216                                            debug_src))
2217                         {
2218                                 return false;
2219                         }
2220                         /* Save binary for reuse. */
2221                         if(!save_binary(program, clbin)) {
2222                                 return false;
2223                         }
2224                 }
2225                 return true;
2226         }
2227
2228         /* Split kernel utility functions. */
2229         size_t get_tex_size(const char *tex_name)
2230         {
2231                 cl_mem ptr;
2232                 size_t ret_size = 0;
2233                 MemMap::iterator i = mem_map.find(tex_name);
2234                 if(i != mem_map.end()) {
2235                         ptr = CL_MEM_PTR(i->second);
2236                         ciErr = clGetMemObjectInfo(ptr,
2237                                                    CL_MEM_SIZE,
2238                                                    sizeof(ret_size),
2239                                                    &ret_size,
2240                                                    NULL);
2241                         assert(ciErr == CL_SUCCESS);
2242                 }
2243                 return ret_size;
2244         }
2245
2246         size_t get_shader_closure_size(int max_closure)
2247         {
2248                 return (sizeof(ShaderClosure) * max_closure);
2249         }
2250
2251         size_t get_shader_data_size(size_t shader_closure_size)
2252         {
2253                 /* ShaderData size without accounting for ShaderClosure array. */
2254                 size_t shader_data_size =
2255                         sizeof(ShaderData) - (sizeof(ShaderClosure) * MAX_CLOSURE);
2256                 return (shader_data_size + shader_closure_size);
2257         }
2258
2259         /* Returns size of KernelGlobals structure associated with OpenCL. */
2260         size_t get_KernelGlobals_size()
2261         {
2262                 /* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to
2263                  * fetch its size.
2264                  */
2265                 typedef struct KernelGlobals {
2266                         ccl_constant KernelData *data;
2267 #define KERNEL_TEX(type, ttype, name) \
2268         ccl_global type *name;
2269 #include "kernel_textures.h"
2270 #undef KERNEL_TEX
2271                 } KernelGlobals;
2272
2273                 return sizeof(KernelGlobals);
2274         }
2275
2276         /* Returns size of Structure of arrays implementation of. */
2277         size_t get_shaderdata_soa_size()
2278         {
2279                 size_t shader_soa_size = 0;
2280
2281 #define SD_VAR(type, what) shader_soa_size += sizeof(void *);
2282 #define SD_CLOSURE_VAR(type, what, max_closure) shader_soa_size += sizeof(void *);
2283                 #include "kernel_shaderdata_vars.h"
2284 #undef SD_VAR
2285 #undef SD_CLOSURE_VAR
2286
2287                 return shader_soa_size;
2288         }
2289
2290         bool load_kernels(const DeviceRequestedFeatures& requested_features)
2291         {
2292                 /* Get Shader, bake and film_convert kernels.
2293                  * It'll also do verification of OpenCL actually initialized.
2294                  */
2295                 if(!OpenCLDeviceBase::load_kernels(requested_features)) {
2296                         return false;
2297                 }
2298
2299                 string kernel_path = path_get("kernel");
2300                 string kernel_md5 = path_files_md5_hash(kernel_path);
2301                 string device_md5;
2302                 string kernel_init_source;
2303                 string clbin;
2304                 string clsrc, *debug_src = NULL;
2305
2306                 string build_options = "-D__SPLIT_KERNEL__";
2307 #ifdef __WORK_STEALING__
2308                 build_options += " -D__WORK_STEALING__";
2309 #endif
2310                 build_options += build_options_from_requested_features(requested_features);
2311
2312                 /* Set compute device build option. */
2313                 cl_device_type device_type;
2314                 ciErr = clGetDeviceInfo(cdDevice,
2315                                         CL_DEVICE_TYPE,
2316                                         sizeof(cl_device_type),
2317                                         &device_type,
2318                                         NULL);
2319                 assert(ciErr == CL_SUCCESS);
2320                 if(device_type == CL_DEVICE_TYPE_GPU) {
2321                         build_options += " -D__COMPUTE_DEVICE_GPU__";
2322                 }
2323
2324 #define GLUE(a, b) a ## b
2325 #define LOAD_KERNEL(name) \
2326         do { \
2327                 kernel_init_source = "#include \"kernels/opencl/kernel_" #name ".cl\" // " + \
2328                                      kernel_md5 + "\n"; \
2329                 device_md5 = device_md5_hash(build_options); \
2330                 clbin = string_printf("cycles_kernel_%s_%s_" #name ".clbin", \
2331                                       device_md5.c_str(), kernel_md5.c_str()); \
2332                 if(opencl_kernel_use_debug()) { \
2333                         clsrc = string_printf("cycles_kernel_%s_%s_" #name ".cl", \
2334                                               device_md5.c_str(), kernel_md5.c_str()); \
2335                         clsrc = path_user_get(path_join("cache", clsrc)); \
2336                         debug_src = &clsrc; \
2337                 } \
2338                 if(!load_split_kernel(kernel_path, kernel_init_source, clbin, \
2339                                       build_options, \
2340                                       &GLUE(name, _program), \
2341                                       debug_src)) \
2342                 { \
2343                         fprintf(stderr, "Faled to compile %s\n", #name); \
2344                         return false; \
2345                 } \
2346         } while(false)
2347
2348                 LOAD_KERNEL(data_init);
2349                 LOAD_KERNEL(scene_intersect);
2350                 LOAD_KERNEL(lamp_emission);
2351                 LOAD_KERNEL(queue_enqueue);
2352                 LOAD_KERNEL(background_buffer_update);
2353                 LOAD_KERNEL(shader_eval);
2354                 LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao);
2355                 LOAD_KERNEL(direct_lighting);
2356                 LOAD_KERNEL(shadow_blocked);
2357                 LOAD_KERNEL(next_iteration_setup);
2358                 LOAD_KERNEL(sum_all_radiance);
2359
2360 #undef LOAD_KERNEL
2361
2362 #define FIND_KERNEL(name) \
2363         do { \
2364                 GLUE(ckPathTraceKernel_, name) = \
2365                         clCreateKernel(GLUE(name, _program), \
2366                                        "kernel_ocl_path_trace_"  #name, &ciErr); \
2367                 if(opencl_error(ciErr)) { \
2368                         fprintf(stderr,"Missing kernel kernel_ocl_path_trace_%s\n", #name); \
2369                         return false; \
2370                 } \
2371         } while(false)
2372
2373                 FIND_KERNEL(data_init);
2374                 FIND_KERNEL(scene_intersect);
2375                 FIND_KERNEL(lamp_emission);
2376                 FIND_KERNEL(queue_enqueue);
2377                 FIND_KERNEL(background_buffer_update);
2378                 FIND_KERNEL(shader_eval);
2379                 FIND_KERNEL(holdout_emission_blurring_pathtermination_ao);
2380                 FIND_KERNEL(direct_lighting);
2381                 FIND_KERNEL(shadow_blocked);
2382                 FIND_KERNEL(next_iteration_setup);
2383                 FIND_KERNEL(sum_all_radiance);
2384 #undef FIND_KERNEL
2385 #undef GLUE
2386
2387                 current_max_closure = requested_features.max_closure;
2388
2389                 return true;
2390         }
2391
2392         ~OpenCLDeviceSplitKernel()
2393         {
2394                 task_pool.stop();
2395
2396                 /* Release kernels */
2397                 release_kernel_safe(ckPathTraceKernel_data_init);
2398                 release_kernel_safe(ckPathTraceKernel_scene_intersect);
2399                 release_kernel_safe(ckPathTraceKernel_lamp_emission);
2400                 release_kernel_safe(ckPathTraceKernel_queue_enqueue);
2401                 release_kernel_safe(ckPathTraceKernel_background_buffer_update);
2402                 release_kernel_safe(ckPathTraceKernel_shader_eval);
2403                 release_kernel_safe(ckPathTraceKernel_holdout_emission_blurring_pathtermination_ao);
2404                 release_kernel_safe(ckPathTraceKernel_direct_lighting);
2405                 release_kernel_safe(ckPathTraceKernel_shadow_blocked);
2406                 release_kernel_safe(ckPathTraceKernel_next_iteration_setup);
2407                 release_kernel_safe(ckPathTraceKernel_sum_all_radiance);
2408
2409                 /* Release global memory */
2410                 release_mem_object_safe(P_sd);
2411                 release_mem_object_safe(P_sd_DL_shadow);
2412                 release_mem_object_safe(N_sd);
2413                 release_mem_object_safe(N_sd_DL_shadow);
2414                 release_mem_object_safe(Ng_sd);
2415                 release_mem_object_safe(Ng_sd_DL_shadow);
2416                 release_mem_object_safe(I_sd);
2417                 release_mem_object_safe(I_sd_DL_shadow);
2418                 release_mem_object_safe(shader_sd);
2419                 release_mem_object_safe(shader_sd_DL_shadow);
2420                 release_mem_object_safe(flag_sd);
2421                 release_mem_object_safe(flag_sd_DL_shadow);
2422                 release_mem_object_safe(prim_sd);
2423                 release_mem_object_safe(prim_sd_DL_shadow);
2424                 release_mem_object_safe(type_sd);
2425                 release_mem_object_safe(type_sd_DL_shadow);
2426                 release_mem_object_safe(u_sd);
2427                 release_mem_object_safe(u_sd_DL_shadow);
2428                 release_mem_object_safe(v_sd);
2429                 release_mem_object_safe(v_sd_DL_shadow);
2430                 release_mem_object_safe(object_sd);
2431                 release_mem_object_safe(object_sd_DL_shadow);
2432                 release_mem_object_safe(time_sd);
2433                 release_mem_object_safe(time_sd_DL_shadow);
2434                 release_mem_object_safe(ray_length_sd);
2435                 release_mem_object_safe(ray_length_sd_DL_shadow);
2436                 release_mem_object_safe(ray_depth_sd);
2437                 release_mem_object_safe(ray_depth_sd_DL_shadow);
2438                 release_mem_object_safe(transparent_depth_sd);
2439                 release_mem_object_safe(transparent_depth_sd_DL_shadow);
2440
2441                 /* Ray differentials. */
2442                 release_mem_object_safe(dP_sd);
2443                 release_mem_object_safe(dP_sd_DL_shadow);
2444                 release_mem_object_safe(dI_sd);
2445                 release_mem_object_safe(dI_sd_DL_shadow);
2446                 release_mem_object_safe(du_sd);
2447                 release_mem_object_safe(du_sd_DL_shadow);
2448                 release_mem_object_safe(dv_sd);
2449                 release_mem_object_safe(dv_sd_DL_shadow);
2450
2451                 /* Dp/Du */
2452                 release_mem_object_safe(dPdu_sd);
2453                 release_mem_object_safe(dPdu_sd_DL_shadow);
2454                 release_mem_object_safe(dPdv_sd);
2455                 release_mem_object_safe(dPdv_sd_DL_shadow);
2456
2457                 /* Object motion. */
2458                 release_mem_object_safe(ob_tfm_sd);
2459                 release_mem_object_safe(ob_itfm_sd);
2460
2461                 release_mem_object_safe(ob_tfm_sd_DL_shadow);
2462                 release_mem_object_safe(ob_itfm_sd_DL_shadow);
2463
2464                 release_mem_object_safe(closure_sd);
2465                 release_mem_object_safe(closure_sd_DL_shadow);
2466                 release_mem_object_safe(num_closure_sd);
2467                 release_mem_object_safe(num_closure_sd_DL_shadow);
2468                 release_mem_object_safe(randb_closure_sd);
2469                 release_mem_object_safe(randb_closure_sd_DL_shadow);
2470                 release_mem_object_safe(ray_P_sd);
2471                 release_mem_object_safe(ray_P_sd_DL_shadow);
2472                 release_mem_object_safe(ray_dP_sd);
2473                 release_mem_object_safe(ray_dP_sd_DL_shadow);
2474                 release_mem_object_safe(rng_coop);
2475                 release_mem_object_safe(throughput_coop);
2476                 release_mem_object_safe(L_transparent_coop);
2477                 release_mem_object_safe(PathRadiance_coop);
2478                 release_mem_object_safe(Ray_coop);
2479                 release_mem_object_safe(PathState_coop);
2480                 release_mem_object_safe(Intersection_coop);
2481                 release_mem_object_safe(kgbuffer);
2482                 release_mem_object_safe(sd);
2483                 release_mem_object_safe(sd_DL_shadow);
2484                 release_mem_object_safe(ray_state);
2485                 release_mem_object_safe(AOAlpha_coop);
2486                 release_mem_object_safe(AOBSDF_coop);
2487                 release_mem_object_safe(AOLightRay_coop);
2488                 release_mem_object_safe(BSDFEval_coop);
2489                 release_mem_object_safe(ISLamp_coop);
2490                 release_mem_object_safe(LightRay_coop);
2491                 release_mem_object_safe(Intersection_coop_AO);
2492                 release_mem_object_safe(Intersection_coop_DL);
2493 #ifdef WITH_CYCLES_DEBUG
2494                 release_mem_object_safe(debugdata_coop);
2495 #endif
2496                 release_mem_object_safe(use_queues_flag);
2497                 release_mem_object_safe(Queue_data);
2498                 release_mem_object_safe(Queue_index);
2499                 release_mem_object_safe(work_array);
2500 #ifdef __WORK_STEALING__
2501                 release_mem_object_safe(work_pool_wgs);
2502 #endif
2503                 release_mem_object_safe(per_sample_output_buffers);
2504
2505                 /* Release programs */
2506                 release_program_safe(data_init_program);
2507                 release_program_safe(scene_intersect_program);
2508                 release_program_safe(lamp_emission_program);
2509                 release_program_safe(queue_enqueue_program);
2510                 release_program_safe(background_buffer_update_program);
2511                 release_program_safe(shader_eval_program);
2512                 release_program_safe(holdout_emission_blurring_pathtermination_ao_program);
2513                 release_program_safe(direct_lighting_program);
2514                 release_program_safe(shadow_blocked_program);
2515                 release_program_safe(next_iteration_setup_program);
2516                 release_program_safe(sum_all_radiance_program);
2517
2518                 if(hostRayStateArray != NULL) {
2519                         free(hostRayStateArray);
2520                 }
2521         }
2522
2523         void path_trace(SplitRenderTile& rtile, int2 max_render_feasible_tile_size)
2524         {
2525                 /* cast arguments to cl types */
2526                 cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
2527                 cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
2528                 cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state);
2529                 cl_int d_x = rtile.x;
2530                 cl_int d_y = rtile.y;
2531                 cl_int d_w = rtile.w;
2532                 cl_int d_h = rtile.h;
2533                 cl_int d_offset = rtile.offset;
2534                 cl_int d_stride = rtile.stride;
2535
2536                 /* Make sure that set render feasible tile size is a multiple of local
2537                  * work size dimensions.
2538                  */
2539                 assert(max_render_feasible_tile_size.x % SPLIT_KERNEL_LOCAL_SIZE_X == 0);
2540                 assert(max_render_feasible_tile_size.y % SPLIT_KERNEL_LOCAL_SIZE_Y == 0);
2541
2542                 size_t global_size[2];
2543                 size_t local_size[2] = {SPLIT_KERNEL_LOCAL_SIZE_X,
2544                                         SPLIT_KERNEL_LOCAL_SIZE_Y};
2545
2546                 /* Set the range of samples to be processed for every ray in
2547                  * path-regeneration logic.
2548                  */
2549                 cl_int start_sample = rtile.start_sample;
2550                 cl_int end_sample = rtile.start_sample + rtile.num_samples;
2551                 cl_int num_samples = rtile.num_samples;
2552
2553 #ifdef __WORK_STEALING__
2554                 global_size[0] = (((d_w - 1) / local_size[0]) + 1) * local_size[0];
2555                 global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1];
2556                 unsigned int num_parallel_samples = 1;
2557 #else
2558                 global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1];
2559                 unsigned int num_threads = max_render_feasible_tile_size.x *
2560                                            max_render_feasible_tile_size.y;
2561                 unsigned int num_tile_columns_possible = num_threads / global_size[1];
2562                 /* Estimate number of parallel samples that can be
2563                  * processed in parallel.
2564                  */
2565                 unsigned int num_parallel_samples = min(num_tile_columns_possible / d_w,
2566                                                         rtile.num_samples);
2567                 /* Wavefront size in AMD is 64.
2568                  * TODO(sergey): What about other platforms?
2569                  */
2570                 if(num_parallel_samples >= 64) {
2571                         /* TODO(sergey): Could use generic round-up here. */
2572                         num_parallel_samples = (num_parallel_samples / 64) * 64;
2573                 }
2574                 assert(num_parallel_samples != 0);
2575
2576                 global_size[0] = d_w * num_parallel_samples;
2577 #endif  /* __WORK_STEALING__ */
2578
2579                 assert(global_size[0] * global_size[1] <=
2580                        max_render_feasible_tile_size.x * max_render_feasible_tile_size.y);
2581
2582                 /* Allocate all required global memory once. */
2583                 if(first_tile) {
2584                         size_t num_global_elements = max_render_feasible_tile_size.x *
2585                                                      max_render_feasible_tile_size.y;
2586                         /* TODO(sergey): This will actually over-allocate if
2587                          * particular kernel does not support multiclosure.
2588                          */
2589                         size_t ShaderClosure_size = get_shader_closure_size(current_max_closure);
2590
2591 #ifdef __WORK_STEALING__
2592                         /* Calculate max groups */
2593                         size_t max_global_size[2];
2594                         size_t tile_x = max_render_feasible_tile_size.x;
2595                         size_t tile_y = max_render_feasible_tile_size.y;
2596                         max_global_size[0] = (((tile_x - 1) / local_size[0]) + 1) * local_size[0];
2597                         max_global_size[1] = (((tile_y - 1) / local_size[1]) + 1) * local_size[1];
2598                         max_work_groups = (max_global_size[0] * max_global_size[1]) /
2599                                           (local_size[0] * local_size[1]);
2600                         /* Allocate work_pool_wgs memory. */
2601                         work_pool_wgs = mem_alloc(max_work_groups * sizeof(unsigned int));
2602 #endif  /* __WORK_STEALING__ */
2603
2604                         /* Allocate queue_index memory only once. */
2605                         Queue_index = mem_alloc(NUM_QUEUES * sizeof(int));
2606                         use_queues_flag = mem_alloc(sizeof(char));
2607                         kgbuffer = mem_alloc(get_KernelGlobals_size());
2608
2609                         /* Create global buffers for ShaderData. */
2610                         sd = mem_alloc(get_shaderdata_soa_size());
2611                         sd_DL_shadow = mem_alloc(get_shaderdata_soa_size());
2612                         P_sd = mem_alloc(num_global_elements * sizeof(float3));
2613                         P_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
2614                         N_sd = mem_alloc(num_global_elements * sizeof(float3));
2615                         N_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
2616                         Ng_sd = mem_alloc(num_global_elements * sizeof(float3));
2617                         Ng_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
2618                         I_sd = mem_alloc(num_global_elements * sizeof(float3));
2619                         I_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
2620                         shader_sd = mem_alloc(num_global_elements * sizeof(int));
2621                         shader_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
2622                         flag_sd = mem_alloc(num_global_elements * sizeof(int));
2623                         flag_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
2624                         prim_sd = mem_alloc(num_global_elements * sizeof(int));
2625                         prim_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
2626                         type_sd = mem_alloc(num_global_elements * sizeof(int));
2627                         type_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
2628                         u_sd = mem_alloc(num_global_elements * sizeof(float));
2629                         u_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float));
2630                         v_sd = mem_alloc(num_global_elements * sizeof(float));
2631                         v_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float));
2632                         object_sd = mem_alloc(num_global_elements * sizeof(int));
2633                         object_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
2634                         time_sd = mem_alloc(num_global_elements * sizeof(float));
2635                         time_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float));
2636                         ray_length_sd = mem_alloc(num_global_elements * sizeof(float));
2637                         ray_length_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float));
2638                         ray_depth_sd = mem_alloc(num_global_elements * sizeof(int));
2639                         ray_depth_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
2640                         transparent_depth_sd = mem_alloc(num_global_elements * sizeof(int));
2641                         transparent_depth_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
2642
2643                         /* Ray differentials. */
2644                         dP_sd = mem_alloc(num_global_elements * sizeof(differential3));
2645                         dP_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential3));
2646                         dI_sd = mem_alloc(num_global_elements * sizeof(differential3));
2647                         dI_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential3));
2648                         du_sd = mem_alloc(num_global_elements * sizeof(differential));
2649                         du_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential));
2650                         dv_sd = mem_alloc(num_global_elements * sizeof(differential));
2651                         dv_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential));
2652
2653                         /* Dp/Du */
2654                         dPdu_sd = mem_alloc(num_global_elements * sizeof(float3));
2655                         dPdu_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
2656                         dPdv_sd = mem_alloc(num_global_elements * sizeof(float3));
2657                         dPdv_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
2658
2659                         /* Object motion. */
2660                         ob_tfm_sd = mem_alloc(num_global_elements * sizeof(Transform));
2661                         ob_tfm_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(Transform));
2662                         ob_itfm_sd = mem_alloc(num_global_elements * sizeof(Transform));
2663                         ob_itfm_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(Transform));
2664
2665                         closure_sd = mem_alloc(num_global_elements * ShaderClosure_size);
2666                         closure_sd_DL_shadow = mem_alloc(num_global_elements * 2 * ShaderClosure_size);
2667                         num_closure_sd = mem_alloc(num_global_elements * sizeof(int));
2668                         num_closure_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
2669                         randb_closure_sd = mem_alloc(num_global_elements * sizeof(float));
2670                         randb_closure_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float));
2671                         ray_P_sd = mem_alloc(num_global_elements * sizeof(float3));
2672                         ray_P_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
2673                         ray_dP_sd = mem_alloc(num_global_elements * sizeof(differential3));
2674                         ray_dP_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential3));
2675
2676                         /* Creation of global memory buffers which are shared among
2677                          * the kernels.
2678                          */
2679                         rng_coop = mem_alloc(num_global_elements * sizeof(RNG));
2680                         throughput_coop = mem_alloc(num_global_elements * sizeof(float3));
2681                         L_transparent_coop = mem_alloc(num_global_elements * sizeof(float));
2682                         PathRadiance_coop = mem_alloc(num_global_elements * sizeof(PathRadiance));
2683                         Ray_coop = mem_alloc(num_global_elements * sizeof(Ray));
2684                         PathState_coop = mem_alloc(num_global_elements * sizeof(PathState));
2685                         Intersection_coop = mem_alloc(num_global_elements * sizeof(Intersection));
2686                         AOAlpha_coop = mem_alloc(num_global_elements * sizeof(float3));
2687                         AOBSDF_coop = mem_alloc(num_global_elements * sizeof(float3));
2688                         AOLightRay_coop = mem_alloc(num_global_elements * sizeof(Ray));
2689                         BSDFEval_coop = mem_alloc(num_global_elements * sizeof(BsdfEval));
2690                         ISLamp_coop = mem_alloc(num_global_elements * sizeof(int));
2691                         LightRay_coop = mem_alloc(num_global_elements * sizeof(Ray));
2692                         Intersection_coop_AO = mem_alloc(num_global_elements * sizeof(Intersection));
2693                         Intersection_coop_DL = mem_alloc(num_global_elements * sizeof(Intersection));
2694
2695 #ifdef WITH_CYCLES_DEBUG
2696                         debugdata_coop = mem_alloc(num_global_elements * sizeof(DebugData));
2697 #endif
2698
2699                         ray_state = mem_alloc(num_global_elements * sizeof(char));
2700
2701                         hostRayStateArray = (char *)calloc(num_global_elements, sizeof(char));
2702                         assert(hostRayStateArray != NULL && "Can't create hostRayStateArray memory");
2703
2704                         Queue_data = mem_alloc(num_global_elements * (NUM_QUEUES * sizeof(int)+sizeof(int)));
2705                         work_array = mem_alloc(num_global_elements * sizeof(unsigned int));
2706                         per_sample_output_buffers = mem_alloc(num_global_elements *
2707                                                               per_thread_output_buffer_size);
2708                 }
2709
2710                 cl_int dQueue_size = global_size[0] * global_size[1];
2711                 cl_int total_num_rays = global_size[0] * global_size[1];
2712
2713                 cl_uint start_arg_index =
2714                         kernel_set_args(ckPathTraceKernel_data_init,
2715                                         0,
2716                                         kgbuffer,
2717                                         sd,
2718                                         sd_DL_shadow,
2719                                         P_sd,
2720                                         P_sd_DL_shadow,
2721                                         N_sd,
2722                                         N_sd_DL_shadow,
2723                                         Ng_sd,
2724                                         Ng_sd_DL_shadow,
2725                                         I_sd,
2726                                         I_sd_DL_shadow,
2727                                         shader_sd,
2728                                         shader_sd_DL_shadow,
2729                                         flag_sd,
2730                                         flag_sd_DL_shadow,
2731                                         prim_sd,
2732                                         prim_sd_DL_shadow,
2733                                         type_sd,
2734                                         type_sd_DL_shadow,
2735                                         u_sd,
2736                                         u_sd_DL_shadow,
2737                                         v_sd,
2738                                         v_sd_DL_shadow,
2739                                         object_sd,
2740                                         object_sd_DL_shadow,
2741                                         time_sd,
2742                                         time_sd_DL_shadow,
2743                                         ray_length_sd,
2744                                         ray_length_sd_DL_shadow,
2745                                         ray_depth_sd,
2746                                         ray_depth_sd_DL_shadow,
2747                                         transparent_depth_sd,
2748                                         transparent_depth_sd_DL_shadow);
2749
2750                 /* Ray differentials. */
2751                 start_arg_index +=
2752                         kernel_set_args(ckPathTraceKernel_data_init,
2753                                         start_arg_index,
2754                                         dP_sd,
2755                                         dP_sd_DL_shadow,
2756                                         dI_sd,
2757                                         dI_sd_DL_shadow,
2758                                         du_sd,
2759                                         du_sd_DL_shadow,
2760                                         dv_sd,
2761                                         dv_sd_DL_shadow);
2762
2763                 /* Dp/Du */
2764                 start_arg_index +=
2765                         kernel_set_args(ckPathTraceKernel_data_init,
2766                                         start_arg_index,
2767                                         dPdu_sd,
2768                                         dPdu_sd_DL_shadow,
2769                                         dPdv_sd,
2770                                         dPdv_sd_DL_shadow);
2771
2772                 /* Object motion. */
2773                 start_arg_index +=
2774                         kernel_set_args(ckPathTraceKernel_data_init,
2775                                         start_arg_index,
2776                                         ob_tfm_sd,
2777                                         ob_tfm_sd_DL_shadow,
2778                                         ob_itfm_sd,
2779                                         ob_itfm_sd_DL_shadow);
2780
2781                 start_arg_index +=
2782                         kernel_set_args(ckPathTraceKernel_data_init,
2783                                         start_arg_index,
2784                                         closure_sd,
2785                                         closure_sd_DL_shadow,
2786                                         num_closure_sd,
2787                                         num_closure_sd_DL_shadow,
2788                                         randb_closure_sd,
2789                                         randb_closure_sd_DL_shadow,
2790                                         ray_P_sd,
2791                                         ray_P_sd_DL_shadow,
2792                                         ray_dP_sd,
2793                                         ray_dP_sd_DL_shadow,
2794                                         d_data,
2795                                         per_sample_output_buffers,
2796                                         d_rng_state,
2797                                         rng_coop,
2798                                         throughput_coop,
2799                                         L_transparent_coop,
2800                                         PathRadiance_coop,
2801                                         Ray_coop,
2802                                         PathState_coop,
2803                                         ray_state);
2804
2805 /* TODO(segrey): Avoid map lookup here. */
2806 #define KERNEL_TEX(type, ttype, name) \
2807         set_kernel_arg_mem(ckPathTraceKernel_data_init, &start_arg_index, #name);
2808 #include "kernel_textures.h"
2809 #undef KERNEL_TEX
2810
2811                 start_arg_index +=
2812                         kernel_set_args(ckPathTraceKernel_data_init,
2813                                         start_arg_index,
2814                                         start_sample,
2815                                         d_x,
2816                                         d_y,
2817                                         d_w,
2818                                         d_h,
2819                                         d_offset,
2820                                         d_stride,
2821                                         rtile.rng_state_offset_x,
2822                                         rtile.rng_state_offset_y,
2823                                         rtile.buffer_rng_state_stride,
2824                                         Queue_data,
2825                                         Queue_index,
2826                                         dQueue_size,
2827                                         use_queues_flag,
2828                                         work_array,
2829 #ifdef __WORK_STEALING__
2830                                         work_pool_wgs,
2831                                         num_samples,
2832 #endif
2833 #ifdef WITH_CYCLES_DEBUG
2834                                         debugdata_coop,
2835 #endif
2836                                         num_parallel_samples);
2837
2838                 kernel_set_args(ckPathTraceKernel_scene_intersect,
2839                                 0,
2840                                 kgbuffer,
2841                                 d_data,
2842                                 rng_coop,
2843                                 Ray_coop,
2844                                 PathState_coop,
2845                                 Intersection_coop,
2846                                 ray_state,
2847                                 d_w,
2848                                 d_h,
2849                                 Queue_data,
2850                                 Queue_index,
2851                                 dQueue_size,
2852                                 use_queues_flag,
2853 #ifdef WITH_CYCLES_DEBUG
2854                                 debugdata_coop,
2855 #endif
2856                                 num_parallel_samples);
2857
2858                 kernel_set_args(ckPathTraceKernel_lamp_emission,
2859                                 0,
2860                                 kgbuffer,
2861                                 d_data,
2862                                 sd,
2863                                 throughput_coop,
2864                                 PathRadiance_coop,
2865                                 Ray_coop,
2866                                 PathState_coop,
2867                                 Intersection_coop,
2868                                 ray_state,
2869                                 d_w,
2870                                 d_h,
2871                                 Queue_data,
2872                                 Queue_index,
2873                                 dQueue_size,
2874                                 use_queues_flag,
2875                                 num_parallel_samples);
2876
2877                 kernel_set_args(ckPathTraceKernel_queue_enqueue,
2878                                 0,
2879                                 Queue_data,
2880                                 Queue_index,
2881                                 ray_state,
2882                                 dQueue_size);
2883
2884                 kernel_set_args(ckPathTraceKernel_background_buffer_update,
2885                                  0,
2886                                  kgbuffer,
2887                                  d_data,
2888                                  sd,
2889                                  per_sample_output_buffers,
2890                                  d_rng_state,
2891                                  rng_coop,
2892                                  throughput_coop,
2893                                  PathRadiance_coop,
2894                                  Ray_coop,
2895                                  PathState_coop,
2896                                  L_transparent_coop,
2897                                  ray_state,
2898                                  d_w,
2899                                  d_h,
2900                                  d_x,
2901                                  d_y,
2902                                  d_stride,
2903                                  rtile.rng_state_offset_x,
2904                                  rtile.rng_state_offset_y,
2905                                  rtile.buffer_rng_state_stride,
2906                                  work_array,
2907                                  Queue_data,
2908                                  Queue_index,
2909                                  dQueue_size,
2910                                  end_sample,
2911                                  start_sample,
2912 #ifdef __WORK_STEALING__
2913                                  work_pool_wgs,
2914                                  num_samples,
2915 #endif
2916 #ifdef WITH_CYCLES_DEBUG
2917                                  debugdata_coop,
2918 #endif
2919                                  num_parallel_samples);
2920
2921                 kernel_set_args(ckPathTraceKernel_shader_eval,
2922                                 0,
2923                                 kgbuffer,
2924                                 d_data,
2925                                 sd,
2926                                 rng_coop,
2927                                 Ray_coop,
2928                                 PathState_coop,
2929                                 Intersection_coop,
2930                                 ray_state,
2931                                 Queue_data,
2932                                 Queue_index,
2933                                 dQueue_size);
2934
2935                 kernel_set_args(ckPathTraceKernel_holdout_emission_blurring_pathtermination_ao,
2936                                 0,
2937                                 kgbuffer,
2938                                 d_data,
2939                                 sd,
2940                                 per_sample_output_buffers,
2941                                 rng_coop,
2942                                 throughput_coop,
2943                                 L_transparent_co