Cycles; Make baking a feature-specific option
[blender-staging.git] / intern / cycles / device / device_opencl.cpp
1 /*
2  * Copyright 2011-2013 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #ifdef WITH_OPENCL
18
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22
23 #include "clew.h"
24
25 #include "device.h"
26 #include "device_intern.h"
27
28 #include "buffers.h"
29
30 #include "util_foreach.h"
31 #include "util_logging.h"
32 #include "util_map.h"
33 #include "util_math.h"
34 #include "util_md5.h"
35 #include "util_opengl.h"
36 #include "util_path.h"
37 #include "util_time.h"
38
39 CCL_NAMESPACE_BEGIN
40
41 #define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p))
42
43 /* Macro declarations used with split kernel */
44
45 /* Macro to enable/disable work-stealing */
46 #define __WORK_STEALING__
47
48 #define SPLIT_KERNEL_LOCAL_SIZE_X 64
49 #define SPLIT_KERNEL_LOCAL_SIZE_Y 1
50
51 /* This value may be tuned according to the scene we are rendering.
52  *
53  * Modifying PATH_ITER_INC_FACTOR value proportional to number of expected
54  * ray-bounces will improve performance.
55  */
56 #define PATH_ITER_INC_FACTOR 8
57
58 /* When allocate global memory in chunks. We may not be able to
59  * allocate exactly "CL_DEVICE_MAX_MEM_ALLOC_SIZE" bytes in chunks;
60  * Since some bytes may be needed for aligning chunks of memory;
61  * This is the amount of memory that we dedicate for that purpose.
62  */
63 #define DATA_ALLOCATION_MEM_FACTOR 5000000 //5MB
64
65 struct OpenCLPlatformDevice {
66         OpenCLPlatformDevice(cl_platform_id platform_id,
67                              const string& platform_name,
68                              cl_device_id device_id,
69                              cl_device_type device_type,
70                              const string& device_name)
71           : platform_id(platform_id),
72             platform_name(platform_name),
73             device_id(device_id),
74             device_type(device_type),
75             device_name(device_name) {}
76         cl_platform_id platform_id;
77         string platform_name;
78         cl_device_id device_id;
79         cl_device_type device_type;
80         string device_name;
81 };
82
83 namespace {
84
85 cl_device_type opencl_device_type()
86 {
87         char *device = getenv("CYCLES_OPENCL_TEST");
88
89         if(device) {
90                 if(strcmp(device, "NONE") == 0)
91                         return 0;
92                 if(strcmp(device, "ALL") == 0)
93                         return CL_DEVICE_TYPE_ALL;
94                 else if(strcmp(device, "DEFAULT") == 0)
95                         return CL_DEVICE_TYPE_DEFAULT;
96                 else if(strcmp(device, "CPU") == 0)
97                         return CL_DEVICE_TYPE_CPU;
98                 else if(strcmp(device, "GPU") == 0)
99                         return CL_DEVICE_TYPE_GPU;
100                 else if(strcmp(device, "ACCELERATOR") == 0)
101                         return CL_DEVICE_TYPE_ACCELERATOR;
102         }
103
104         return CL_DEVICE_TYPE_ALL;
105 }
106
107 bool opencl_kernel_use_debug()
108 {
109         return (getenv("CYCLES_OPENCL_DEBUG") != NULL);
110 }
111
112 bool opencl_kernel_use_advanced_shading(const string& platform)
113 {
114         /* keep this in sync with kernel_types.h! */
115         if(platform == "NVIDIA CUDA")
116                 return true;
117         else if(platform == "Apple")
118                 return true;
119         else if(platform == "AMD Accelerated Parallel Processing")
120                 return true;
121         else if(platform == "Intel(R) OpenCL")
122                 return true;
123         /* Make sure officially unsupported OpenCL platforms
124          * does not set up to use advanced shading.
125          */
126         return false;
127 }
128
129 bool opencl_kernel_use_split(const string& platform_name,
130                              const cl_device_type device_type)
131 {
132         if(getenv("CYCLES_OPENCL_SPLIT_KERNEL_TEST") != NULL) {
133                 return true;
134         }
135         /* TODO(sergey): Replace string lookups with more enum-like API,
136          * similar to device/vendor checks blender's gpu.
137          */
138         if(platform_name == "AMD Accelerated Parallel Processing" &&
139            device_type == CL_DEVICE_TYPE_GPU)
140         {
141                 return true;
142         }
143         return false;
144 }
145
146 bool opencl_device_supported(const string& platform_name,
147                              const cl_device_id device_id)
148 {
149         cl_device_type device_type;
150         clGetDeviceInfo(device_id,
151                         CL_DEVICE_TYPE,
152                         sizeof(cl_device_type),
153                         &device_type,
154                         NULL);
155         if(platform_name == "AMD Accelerated Parallel Processing" &&
156            device_type == CL_DEVICE_TYPE_GPU)
157         {
158                 return true;
159         }
160         if(platform_name == "Apple" && device_type == CL_DEVICE_TYPE_GPU) {
161                 return true;
162         }
163         return false;
164 }
165
166 bool opencl_platform_version_check(cl_platform_id platform,
167                                    string *error = NULL)
168 {
169         const int req_major = 1, req_minor = 1;
170         int major, minor;
171         char version[256];
172         clGetPlatformInfo(platform,
173                           CL_PLATFORM_VERSION,
174                           sizeof(version),
175                           &version,
176                           NULL);
177         if(sscanf(version, "OpenCL %d.%d", &major, &minor) < 2) {
178                 if(error != NULL) {
179                         *error = string_printf("OpenCL: failed to parse platform version string (%s).", version);
180                 }
181                 return false;
182         }
183         if(!((major == req_major && minor >= req_minor) || (major > req_major))) {
184                 if(error != NULL) {
185                         *error = string_printf("OpenCL: platform version 1.1 or later required, found %d.%d", major, minor);
186                 }
187                 return false;
188         }
189         if(error != NULL) {
190                 *error = "";
191         }
192         return true;
193 }
194
195 bool opencl_device_version_check(cl_device_id device,
196                                  string *error = NULL)
197 {
198         const int req_major = 1, req_minor = 1;
199         int major, minor;
200         char version[256];
201         clGetDeviceInfo(device,
202                         CL_DEVICE_OPENCL_C_VERSION,
203                         sizeof(version),
204                         &version,
205                         NULL);
206         if(sscanf(version, "OpenCL C %d.%d", &major, &minor) < 2) {
207                 if(error != NULL) {
208                         *error = string_printf("OpenCL: failed to parse OpenCL C version string (%s).", version);
209                 }
210                 return false;
211         }
212         if(!((major == req_major && minor >= req_minor) || (major > req_major))) {
213                 if(error != NULL) {
214                         *error = string_printf("OpenCL: C version 1.1 or later required, found %d.%d", major, minor);
215                 }
216                 return false;
217         }
218         if(error != NULL) {
219                 *error = "";
220         }
221         return true;
222 }
223
224 void opencl_get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices)
225 {
226         const bool force_all_platforms =
227                 (getenv("CYCLES_OPENCL_TEST") != NULL) ||
228                 (getenv("CYCLES_OPENCL_SPLIT_KERNEL_TEST")) != NULL;
229         const cl_device_type device_type = opencl_device_type();
230         static bool first_time = true;
231 #define FIRST_VLOG(severity) if(first_time) VLOG(severity)
232
233         usable_devices->clear();
234
235         if(device_type == 0) {
236                 FIRST_VLOG(2) << "OpenCL devices are forced to be disabled.";
237                 first_time = false;
238                 return;
239         }
240
241         vector<cl_device_id> device_ids;
242         cl_uint num_devices = 0;
243         vector<cl_platform_id> platform_ids;
244         cl_uint num_platforms = 0;
245
246         /* Get devices. */
247         if(clGetPlatformIDs(0, NULL, &num_platforms) != CL_SUCCESS ||
248            num_platforms == 0)
249         {
250                 FIRST_VLOG(2) << "No OpenCL platforms were found.";
251                 first_time = false;
252                 return;
253         }
254         platform_ids.resize(num_platforms);
255         if(clGetPlatformIDs(num_platforms, &platform_ids[0], NULL) != CL_SUCCESS) {
256                 FIRST_VLOG(2) << "Failed to fetch platform IDs from the driver..";
257                 first_time = false;
258                 return;
259         }
260         /* Devices are numbered consecutively across platforms. */
261         for(int platform = 0; platform < num_platforms; platform++) {
262                 cl_platform_id platform_id = platform_ids[platform];
263                 char pname[256];
264                 if(clGetPlatformInfo(platform_id,
265                                      CL_PLATFORM_NAME,
266                                      sizeof(pname),
267                                      &pname,
268                                      NULL) != CL_SUCCESS)
269                 {
270                         FIRST_VLOG(2) << "Failed to get platform name, ignoring.";
271                         continue;
272                 }
273                 string platform_name = pname;
274                 FIRST_VLOG(2) << "Enumerating devices for platform "
275                               << platform_name << ".";
276                 if(!opencl_platform_version_check(platform_id)) {
277                         FIRST_VLOG(2) << "Ignoring platform " << platform_name
278                                       << " due to too old compiler version.";
279                         continue;
280                 }
281                 num_devices = 0;
282                 if(clGetDeviceIDs(platform_id,
283                                   device_type,
284                                   0,
285                                   NULL,
286                                   &num_devices) != CL_SUCCESS || num_devices == 0)
287                 {
288                         FIRST_VLOG(2) << "Ignoring platform " << platform_name
289                                       << ", failed to fetch number of devices.";
290                         continue;
291                 }
292                 device_ids.resize(num_devices);
293                 if(clGetDeviceIDs(platform_id,
294                                   device_type,
295                                   num_devices,
296                                   &device_ids[0],
297                                   NULL) != CL_SUCCESS)
298                 {
299                         FIRST_VLOG(2) << "Ignoring platform " << platform_name
300                                       << ", failed to fetch devices list.";
301                         continue;
302                 }
303                 for(int num = 0; num < num_devices; num++) {
304                         cl_device_id device_id = device_ids[num];
305                         char device_name[1024] = "\0";
306                         if(clGetDeviceInfo(device_id,
307                                            CL_DEVICE_NAME,
308                                            sizeof(device_name),
309                                            &device_name,
310                                            NULL) != CL_SUCCESS)
311                         {
312                                 FIRST_VLOG(2) << "Failed to fetch device name, ignoring.";
313                                 continue;
314                         }
315                         if(!opencl_device_version_check(device_id)) {
316                                 FIRST_VLOG(2) << "Ignoting device " << device_name
317                                               << " due to old compiler version.";
318                                 continue;
319                         }
320                         if(force_all_platforms ||
321                            opencl_device_supported(platform_name, device_id))
322                         {
323                                 cl_device_type device_type;
324                                 if(clGetDeviceInfo(device_id,
325                                                    CL_DEVICE_TYPE,
326                                                    sizeof(cl_device_type),
327                                                    &device_type,
328                                                    NULL) != CL_SUCCESS)
329                                 {
330                                         FIRST_VLOG(2) << "Ignoting device " << device_name
331                                                       << ", faield to fetch device type.";
332                                         continue;
333                                 }
334                                 FIRST_VLOG(2) << "Adding new device " << device_name << ".";
335                                 usable_devices->push_back(OpenCLPlatformDevice(platform_id,
336                                                                                platform_name,
337                                                                                device_id,
338                                                                                device_type,
339                                                                                device_name));
340                         }
341                         else {
342                                 FIRST_VLOG(2) << "Ignoting device " << device_name
343                                               << ", not officially supported yet.";
344                         }
345                 }
346         }
347         first_time = false;
348 }
349
350 }  /* namespace */
351
352 /* Thread safe cache for contexts and programs.
353  *
354  * TODO(sergey): Make it more generous, so it can contain any type of program
355  * without hardcoding possible program types in the slot.
356  */
357 class OpenCLCache
358 {
359         struct Slot
360         {
361                 thread_mutex *mutex;
362                 cl_context context;
363                 /* cl_program for shader, bake, film_convert kernels (used in OpenCLDeviceBase) */
364                 cl_program ocl_dev_base_program;
365                 /* cl_program for megakernel (used in OpenCLDeviceMegaKernel) */
366                 cl_program ocl_dev_megakernel_program;
367
368                 Slot() : mutex(NULL),
369                          context(NULL),
370                          ocl_dev_base_program(NULL),
371                          ocl_dev_megakernel_program(NULL) {}
372
373                 Slot(const Slot& rhs)
374                     : mutex(rhs.mutex),
375                       context(rhs.context),
376                       ocl_dev_base_program(rhs.ocl_dev_base_program),
377                       ocl_dev_megakernel_program(rhs.ocl_dev_megakernel_program)
378                 {
379                         /* copy can only happen in map insert, assert that */
380                         assert(mutex == NULL);
381                 }
382
383                 ~Slot()
384                 {
385                         delete mutex;
386                         mutex = NULL;
387                 }
388         };
389
390         /* key is combination of platform ID and device ID */
391         typedef pair<cl_platform_id, cl_device_id> PlatformDevicePair;
392
393         /* map of Slot objects */
394         typedef map<PlatformDevicePair, Slot> CacheMap;
395         CacheMap cache;
396
397         thread_mutex cache_lock;
398
399         /* lazy instantiate */
400         static OpenCLCache &global_instance()
401         {
402                 static OpenCLCache instance;
403                 return instance;
404         }
405
406         OpenCLCache()
407         {
408         }
409
410         ~OpenCLCache()
411         {
412                 /* Intel OpenCL bug raises SIGABRT due to pure virtual call
413                  * so this is disabled. It's not necessary to free objects
414                  * at process exit anyway.
415                  * http://software.intel.com/en-us/forums/topic/370083#comments */
416
417                 //flush();
418         }
419
420         /* lookup something in the cache. If this returns NULL, slot_locker
421          * will be holding a lock for the cache. slot_locker should refer to a
422          * default constructed thread_scoped_lock */
423         template<typename T>
424         static T get_something(cl_platform_id platform,
425                                cl_device_id device,
426                                T Slot::*member,
427                                thread_scoped_lock& slot_locker)
428         {
429                 assert(platform != NULL);
430
431                 OpenCLCache& self = global_instance();
432
433                 thread_scoped_lock cache_lock(self.cache_lock);
434
435                 pair<CacheMap::iterator,bool> ins = self.cache.insert(
436                         CacheMap::value_type(PlatformDevicePair(platform, device), Slot()));
437
438                 Slot &slot = ins.first->second;
439
440                 /* create slot lock only while holding cache lock */
441                 if(!slot.mutex)
442                         slot.mutex = new thread_mutex;
443
444                 /* need to unlock cache before locking slot, to allow store to complete */
445                 cache_lock.unlock();
446
447                 /* lock the slot */
448                 slot_locker = thread_scoped_lock(*slot.mutex);
449
450                 /* If the thing isn't cached */
451                 if(slot.*member == NULL) {
452                         /* return with the caller's lock holder holding the slot lock */
453                         return NULL;
454                 }
455
456                 /* the item was already cached, release the slot lock */
457                 slot_locker.unlock();
458
459                 return slot.*member;
460         }
461
462         /* store something in the cache. you MUST have tried to get the item before storing to it */
463         template<typename T>
464         static void store_something(cl_platform_id platform,
465                                     cl_device_id device,
466                                     T thing,
467                                     T Slot::*member,
468                                     thread_scoped_lock& slot_locker)
469         {
470                 assert(platform != NULL);
471                 assert(device != NULL);
472                 assert(thing != NULL);
473
474                 OpenCLCache &self = global_instance();
475
476                 thread_scoped_lock cache_lock(self.cache_lock);
477                 CacheMap::iterator i = self.cache.find(PlatformDevicePair(platform, device));
478                 cache_lock.unlock();
479
480                 Slot &slot = i->second;
481
482                 /* sanity check */
483                 assert(i != self.cache.end());
484                 assert(slot.*member == NULL);
485
486                 slot.*member = thing;
487
488                 /* unlock the slot */
489                 slot_locker.unlock();
490         }
491
492 public:
493
494         enum ProgramName {
495                 OCL_DEV_BASE_PROGRAM,
496                 OCL_DEV_MEGAKERNEL_PROGRAM,
497         };
498
499         /* see get_something comment */
500         static cl_context get_context(cl_platform_id platform,
501                                       cl_device_id device,
502                                       thread_scoped_lock& slot_locker)
503         {
504                 cl_context context = get_something<cl_context>(platform,
505                                                                device,
506                                                                &Slot::context,
507                                                                slot_locker);
508
509                 if(!context)
510                         return NULL;
511
512                 /* caller is going to release it when done with it, so retain it */
513                 cl_int ciErr = clRetainContext(context);
514                 assert(ciErr == CL_SUCCESS);
515                 (void)ciErr;
516
517                 return context;
518         }
519
520         /* see get_something comment */
521         static cl_program get_program(cl_platform_id platform,
522                                       cl_device_id device,
523                                       ProgramName program_name,
524                                       thread_scoped_lock& slot_locker)
525         {
526                 cl_program program = NULL;
527
528                 switch(program_name) {
529                         case OCL_DEV_BASE_PROGRAM:
530                                 /* Get program related to OpenCLDeviceBase */
531                                 program = get_something<cl_program>(platform,
532                                                                     device,
533                                                                     &Slot::ocl_dev_base_program,
534                                                                     slot_locker);
535                                 break;
536                         case OCL_DEV_MEGAKERNEL_PROGRAM:
537                                 /* Get program related to megakernel */
538                                 program = get_something<cl_program>(platform,
539                                                                     device,
540                                                                     &Slot::ocl_dev_megakernel_program,
541                                                                     slot_locker);
542                                 break;
543                 default:
544                         assert(!"Invalid program name");
545                 }
546
547                 if(!program)
548                         return NULL;
549
550                 /* caller is going to release it when done with it, so retain it */
551                 assert(clRetainProgram(program) == CL_SUCCESS);
552
553                 return program;
554         }
555
556         /* see store_something comment */
557         static void store_context(cl_platform_id platform,
558                                   cl_device_id device,
559                                   cl_context context,
560                                   thread_scoped_lock& slot_locker)
561         {
562                 store_something<cl_context>(platform,
563                                             device,
564                                             context,
565                                             &Slot::context,
566                                             slot_locker);
567
568                 /* increment reference count in OpenCL.
569                  * The caller is going to release the object when done with it. */
570                 assert(clRetainContext(context) == CL_SUCCESS);
571         }
572
573         /* see store_something comment */
574         static void store_program(cl_platform_id platform,
575                                   cl_device_id device,
576                                   cl_program program,
577                                   ProgramName program_name,
578                                   thread_scoped_lock& slot_locker)
579         {
580                 switch (program_name) {
581                         case OCL_DEV_BASE_PROGRAM:
582                                 store_something<cl_program>(platform,
583                                                             device,
584                                                             program,
585                                                             &Slot::ocl_dev_base_program,
586                                                             slot_locker);
587                                 break;
588                         case OCL_DEV_MEGAKERNEL_PROGRAM:
589                                 store_something<cl_program>(platform,
590                                                             device,
591                                                             program,
592                                                             &Slot::ocl_dev_megakernel_program,
593                                                             slot_locker);
594                                 break;
595                         default:
596                                 assert(!"Invalid program name\n");
597                                 return;
598                 }
599
600                 /* Increment reference count in OpenCL.
601                  * The caller is going to release the object when done with it.
602                  */
603                 assert(clRetainProgram(program) == CL_SUCCESS);
604         }
605
606         /* Discard all cached contexts and programs.  */
607         static void flush()
608         {
609                 OpenCLCache &self = global_instance();
610                 thread_scoped_lock cache_lock(self.cache_lock);
611
612                 foreach(CacheMap::value_type &item, self.cache) {
613                         if(item.second.ocl_dev_base_program != NULL)
614                                 clReleaseProgram(item.second.ocl_dev_base_program);
615                         if(item.second.ocl_dev_megakernel_program != NULL)
616                                 clReleaseProgram(item.second.ocl_dev_megakernel_program);
617                         if(item.second.context != NULL)
618                                 clReleaseContext(item.second.context);
619                 }
620
621                 self.cache.clear();
622         }
623 };
624
625 class OpenCLDeviceBase : public Device
626 {
627 public:
628         DedicatedTaskPool task_pool;
629         cl_context cxContext;
630         cl_command_queue cqCommandQueue;
631         cl_platform_id cpPlatform;
632         cl_device_id cdDevice;
633         cl_program cpProgram;
634         cl_kernel ckFilmConvertByteKernel;
635         cl_kernel ckFilmConvertHalfFloatKernel;
636         cl_kernel ckShaderKernel;
637         cl_kernel ckBakeKernel;
638         cl_int ciErr;
639
640         typedef map<string, device_vector<uchar>*> ConstMemMap;
641         typedef map<string, device_ptr> MemMap;
642
643         ConstMemMap const_mem_map;
644         MemMap mem_map;
645         device_ptr null_mem;
646
647         bool device_initialized;
648         string platform_name;
649
650         bool opencl_error(cl_int err)
651         {
652                 if(err != CL_SUCCESS) {
653                         string message = string_printf("OpenCL error (%d): %s", err, clewErrorString(err));
654                         if(error_msg == "")
655                                 error_msg = message;
656                         fprintf(stderr, "%s\n", message.c_str());
657                         return true;
658                 }
659
660                 return false;
661         }
662
663         void opencl_error(const string& message)
664         {
665                 if(error_msg == "")
666                         error_msg = message;
667                 fprintf(stderr, "%s\n", message.c_str());
668         }
669
670 #define opencl_assert(stmt) \
671         { \
672                 cl_int err = stmt; \
673                 \
674                 if(err != CL_SUCCESS) { \
675                         string message = string_printf("OpenCL error: %s in %s", clewErrorString(err), #stmt); \
676                         if(error_msg == "") \
677                                 error_msg = message; \
678                         fprintf(stderr, "%s\n", message.c_str()); \
679                 } \
680         } (void)0
681
682         void opencl_assert_err(cl_int err, const char* where)
683         {
684                 if(err != CL_SUCCESS) {
685                         string message = string_printf("OpenCL error (%d): %s in %s", err, clewErrorString(err), where);
686                         if(error_msg == "")
687                                 error_msg = message;
688                         fprintf(stderr, "%s\n", message.c_str());
689 #ifndef NDEBUG
690                         abort();
691 #endif
692                 }
693         }
694
695         OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool background_)
696         : Device(info, stats, background_)
697         {
698                 cpPlatform = NULL;
699                 cdDevice = NULL;
700                 cxContext = NULL;
701                 cqCommandQueue = NULL;
702                 cpProgram = NULL;
703                 ckFilmConvertByteKernel = NULL;
704                 ckFilmConvertHalfFloatKernel = NULL;
705                 ckShaderKernel = NULL;
706                 ckBakeKernel = NULL;
707                 null_mem = 0;
708                 device_initialized = false;
709
710                 vector<OpenCLPlatformDevice> usable_devices;
711                 opencl_get_usable_devices(&usable_devices);
712                 if(usable_devices.size() == 0) {
713                         opencl_error("OpenCL: no devices found.");
714                         return;
715                 }
716                 assert(info.num < usable_devices.size());
717                 OpenCLPlatformDevice& platform_device = usable_devices[info.num];
718                 cpPlatform = platform_device.platform_id;
719                 cdDevice = platform_device.device_id;
720                 platform_name = platform_device.platform_name;
721                 VLOG(2) << "Creating new Cycles device for OpenCL platform "
722                         << platform_name << ", device "
723                         << platform_device.device_name << ".";
724
725                 {
726                         /* try to use cached context */
727                         thread_scoped_lock cache_locker;
728                         cxContext = OpenCLCache::get_context(cpPlatform, cdDevice, cache_locker);
729
730                         if(cxContext == NULL) {
731                                 /* create context properties array to specify platform */
732                                 const cl_context_properties context_props[] = {
733                                         CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform,
734                                         0, 0
735                                 };
736
737                                 /* create context */
738                                 cxContext = clCreateContext(context_props, 1, &cdDevice,
739                                         context_notify_callback, cdDevice, &ciErr);
740
741                                 if(opencl_error(ciErr)) {
742                                         opencl_error("OpenCL: clCreateContext failed");
743                                         return;
744                                 }
745
746                                 /* cache it */
747                                 OpenCLCache::store_context(cpPlatform, cdDevice, cxContext, cache_locker);
748                         }
749                 }
750
751                 cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr);
752                 if(opencl_error(ciErr))
753                         return;
754
755                 null_mem = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_ONLY, 1, NULL, &ciErr);
756                 if(opencl_error(ciErr))
757                         return;
758
759                 fprintf(stderr, "Device init success\n");
760                 device_initialized = true;
761         }
762
763         static void CL_CALLBACK context_notify_callback(const char *err_info,
764                 const void * /*private_info*/, size_t /*cb*/, void *user_data)
765         {
766                 char name[256];
767                 clGetDeviceInfo((cl_device_id)user_data, CL_DEVICE_NAME, sizeof(name), &name, NULL);
768
769                 fprintf(stderr, "OpenCL error (%s): %s\n", name, err_info);
770         }
771
772         bool opencl_version_check()
773         {
774                 string error;
775                 if(!opencl_platform_version_check(cpPlatform, &error)) {
776                         opencl_error(error);
777                         return false;
778                 }
779                 if(!opencl_device_version_check(cdDevice, &error)) {
780                         opencl_error(error);
781                         return false;
782                 }
783                 return true;
784         }
785
786         bool load_binary(const string& /*kernel_path*/,
787                          const string& clbin,
788                          string custom_kernel_build_options,
789                          cl_program *program,
790                          const string *debug_src = NULL)
791         {
792                 /* read binary into memory */
793                 vector<uint8_t> binary;
794
795                 if(!path_read_binary(clbin, binary)) {
796                         opencl_error(string_printf("OpenCL failed to read cached binary %s.", clbin.c_str()));
797                         return false;
798                 }
799
800                 /* create program */
801                 cl_int status;
802                 size_t size = binary.size();
803                 const uint8_t *bytes = &binary[0];
804
805                 *program = clCreateProgramWithBinary(cxContext, 1, &cdDevice,
806                         &size, &bytes, &status, &ciErr);
807
808                 if(opencl_error(status) || opencl_error(ciErr)) {
809                         opencl_error(string_printf("OpenCL failed create program from cached binary %s.", clbin.c_str()));
810                         return false;
811                 }
812
813                 if(!build_kernel(program, custom_kernel_build_options, debug_src))
814                         return false;
815
816                 return true;
817         }
818
819         bool save_binary(cl_program *program, const string& clbin)
820         {
821                 size_t size = 0;
822                 clGetProgramInfo(*program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL);
823
824                 if(!size)
825                         return false;
826
827                 vector<uint8_t> binary(size);
828                 uint8_t *bytes = &binary[0];
829
830                 clGetProgramInfo(*program, CL_PROGRAM_BINARIES, sizeof(uint8_t*), &bytes, NULL);
831
832                 if(!path_write_binary(clbin, binary)) {
833                         opencl_error(string_printf("OpenCL failed to write cached binary %s.", clbin.c_str()));
834                         return false;
835                 }
836
837                 return true;
838         }
839
840         bool build_kernel(cl_program *kernel_program,
841                           string custom_kernel_build_options,
842                           const string *debug_src = NULL)
843         {
844                 string build_options;
845                 build_options = kernel_build_options(debug_src) + custom_kernel_build_options;
846
847                 ciErr = clBuildProgram(*kernel_program, 0, NULL, build_options.c_str(), NULL, NULL);
848
849                 /* show warnings even if build is successful */
850                 size_t ret_val_size = 0;
851
852                 clGetProgramBuildInfo(*kernel_program, cdDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
853
854                 if(ret_val_size > 1) {
855                         vector<char> build_log(ret_val_size + 1);
856                         clGetProgramBuildInfo(*kernel_program, cdDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, &build_log[0], NULL);
857
858                         build_log[ret_val_size] = '\0';
859                         /* Skip meaningless empty output from the NVidia compiler. */
860                         if(!(ret_val_size == 2 && build_log[0] == '\n')) {
861                                 fprintf(stderr, "OpenCL kernel build output:\n");
862                                 fprintf(stderr, "%s\n", &build_log[0]);
863                         }
864                 }
865
866                 if(ciErr != CL_SUCCESS) {
867                         opencl_error("OpenCL build failed: errors in console");
868                         return false;
869                 }
870
871                 return true;
872         }
873
874         bool compile_kernel(const string& kernel_path,
875                             string source,
876                             string custom_kernel_build_options,
877                             cl_program *kernel_program,
878                             const string *debug_src = NULL)
879         {
880                 /* we compile kernels consisting of many files. unfortunately opencl
881                  * kernel caches do not seem to recognize changes in included files.
882                  * so we force recompile on changes by adding the md5 hash of all files */
883                 source = path_source_replace_includes(source, kernel_path);
884
885                 if(debug_src)
886                         path_write_text(*debug_src, source);
887
888                 size_t source_len = source.size();
889                 const char *source_str = source.c_str();
890
891                 *kernel_program = clCreateProgramWithSource(cxContext, 1, &source_str, &source_len, &ciErr);
892
893                 if(opencl_error(ciErr))
894                         return false;
895
896                 double starttime = time_dt();
897                 printf("Compiling OpenCL kernel ...\n");
898                 /* TODO(sergey): Report which kernel is being compiled
899                  * as well (megakernel or which of split kernels etc..).
900                  */
901                 printf("Build flags: %s\n", custom_kernel_build_options.c_str());
902
903                 if(!build_kernel(kernel_program, custom_kernel_build_options, debug_src))
904                         return false;
905
906                 printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);
907
908                 return true;
909         }
910
911         string device_md5_hash(string kernel_custom_build_options = "")
912         {
913                 MD5Hash md5;
914                 char version[256], driver[256], name[256], vendor[256];
915
916                 clGetPlatformInfo(cpPlatform, CL_PLATFORM_VENDOR, sizeof(vendor), &vendor, NULL);
917                 clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL);
918                 clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(name), &name, NULL);
919                 clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(driver), &driver, NULL);
920
921                 md5.append((uint8_t*)vendor, strlen(vendor));
922                 md5.append((uint8_t*)version, strlen(version));
923                 md5.append((uint8_t*)name, strlen(name));
924                 md5.append((uint8_t*)driver, strlen(driver));
925
926                 string options = kernel_build_options();
927                 options += kernel_custom_build_options;
928                 md5.append((uint8_t*)options.c_str(), options.size());
929
930                 return md5.get_hex();
931         }
932
933         bool load_kernels(const DeviceRequestedFeatures& requested_features)
934         {
935                 /* Verify if device was initialized. */
936                 if(!device_initialized) {
937                         fprintf(stderr, "OpenCL: failed to initialize device.\n");
938                         return false;
939                 }
940
941                 /* Try to use cached kernel. */
942                 thread_scoped_lock cache_locker;
943                 cpProgram = load_cached_kernel(requested_features,
944                                                OpenCLCache::OCL_DEV_BASE_PROGRAM,
945                                                cache_locker);
946
947                 if(!cpProgram) {
948                         VLOG(2) << "No cached OpenCL kernel.";
949
950                         /* Verify we have right opencl version. */
951                         if(!opencl_version_check())
952                                 return false;
953
954                         string build_flags = build_options_for_base_program(requested_features);
955
956                         /* Calculate md5 hashes to detect changes. */
957                         string kernel_path = path_get("kernel");
958                         string kernel_md5 = path_files_md5_hash(kernel_path);
959                         string device_md5 = device_md5_hash(build_flags);
960
961                         /* Path to cached binary.
962                          *
963                          * TODO(sergey): Seems we could de-duplicate all this string_printf()
964                          * calls with some utility function which will give file name for a
965                          * given hashes..
966                          */
967                         string clbin = string_printf("cycles_kernel_%s_%s.clbin",
968                                                      device_md5.c_str(),
969                                                      kernel_md5.c_str());
970                         clbin = path_user_get(path_join("cache", clbin));
971
972                         /* path to preprocessed source for debugging */
973                         string clsrc, *debug_src = NULL;
974
975                         if(opencl_kernel_use_debug()) {
976                                 clsrc = string_printf("cycles_kernel_%s_%s.cl",
977                                                       device_md5.c_str(),
978                                                       kernel_md5.c_str());
979                                 clsrc = path_user_get(path_join("cache", clsrc));
980                                 debug_src = &clsrc;
981                         }
982
983                         /* If binary kernel exists already, try use it. */
984                         if(path_exists(clbin) && load_binary(kernel_path,
985                                                              clbin,
986                                                              build_flags,
987                                                              &cpProgram)) {
988                                 /* Kernel loaded from binary, nothing to do. */
989                                 VLOG(2) << "Loaded kernel from " << clbin << ".";
990                         }
991                         else {
992                                 string init_kernel_source = "#include \"kernels/opencl/kernel.cl\" // " + kernel_md5 + "\n";
993
994                                 /* If does not exist or loading binary failed, compile kernel. */
995                                 if(!compile_kernel(kernel_path,
996                                                    init_kernel_source,
997                                                    build_flags,
998                                                    &cpProgram,
999                                                    debug_src))
1000                                 {
1001                                         return false;
1002                                 }
1003
1004                                 /* Save binary for reuse. */
1005                                 if(!save_binary(&cpProgram, clbin)) {
1006                                         return false;
1007                                 }
1008                         }
1009
1010                         /* Cache the program. */
1011                         store_cached_kernel(cpPlatform,
1012                                             cdDevice,
1013                                             cpProgram,
1014                                             OpenCLCache::OCL_DEV_BASE_PROGRAM,
1015                                             cache_locker);
1016                 }
1017                 else {
1018                         VLOG(2) << "Found cached OpenCL kernel.";
1019                 }
1020
1021                 /* Find kernels. */
1022 #define FIND_KERNEL(kernel_var, kernel_name) \
1023                 do { \
1024                         kernel_var = clCreateKernel(cpProgram, "kernel_ocl_" kernel_name, &ciErr); \
1025                         if(opencl_error(ciErr)) \
1026                                 return false; \
1027                 } while(0)
1028
1029                 FIND_KERNEL(ckFilmConvertByteKernel, "convert_to_byte");
1030                 FIND_KERNEL(ckFilmConvertHalfFloatKernel, "convert_to_half_float");
1031                 FIND_KERNEL(ckShaderKernel, "shader");
1032                 FIND_KERNEL(ckBakeKernel, "bake");
1033
1034 #undef FIND_KERNEL
1035                 return true;
1036         }
1037
1038         ~OpenCLDeviceBase()
1039         {
1040                 task_pool.stop();
1041
1042                 if(null_mem)
1043                         clReleaseMemObject(CL_MEM_PTR(null_mem));
1044
1045                 ConstMemMap::iterator mt;
1046                 for(mt = const_mem_map.begin(); mt != const_mem_map.end(); mt++) {
1047                         mem_free(*(mt->second));
1048                         delete mt->second;
1049                 }
1050
1051                 if(ckFilmConvertByteKernel)
1052                         clReleaseKernel(ckFilmConvertByteKernel);
1053                 if(ckFilmConvertHalfFloatKernel)
1054                         clReleaseKernel(ckFilmConvertHalfFloatKernel);
1055                 if(ckShaderKernel)
1056                         clReleaseKernel(ckShaderKernel);
1057                 if(ckBakeKernel)
1058                         clReleaseKernel(ckBakeKernel);
1059                 if(cpProgram)
1060                         clReleaseProgram(cpProgram);
1061                 if(cqCommandQueue)
1062                         clReleaseCommandQueue(cqCommandQueue);
1063                 if(cxContext)
1064                         clReleaseContext(cxContext);
1065         }
1066
1067         void mem_alloc(device_memory& mem, MemoryType type)
1068         {
1069                 size_t size = mem.memory_size();
1070
1071                 cl_mem_flags mem_flag;
1072                 void *mem_ptr = NULL;
1073
1074                 if(type == MEM_READ_ONLY)
1075                         mem_flag = CL_MEM_READ_ONLY;
1076                 else if(type == MEM_WRITE_ONLY)
1077                         mem_flag = CL_MEM_WRITE_ONLY;
1078                 else
1079                         mem_flag = CL_MEM_READ_WRITE;
1080
1081                 /* Zero-size allocation might be invoked by render, but not really
1082                  * supported by OpenCL. Using NULL as device pointer also doesn't really
1083                  * work for some reason, so for the time being we'll use special case
1084                  * will null_mem buffer.
1085                  */
1086                 if(size != 0) {
1087                         mem.device_pointer = (device_ptr)clCreateBuffer(cxContext,
1088                                                                         mem_flag,
1089                                                                         size,
1090                                                                         mem_ptr,
1091                                                                         &ciErr);
1092                         opencl_assert_err(ciErr, "clCreateBuffer");
1093                 }
1094                 else {
1095                         mem.device_pointer = null_mem;
1096                 }
1097
1098                 stats.mem_alloc(size);
1099                 mem.device_size = size;
1100         }
1101
1102         void mem_copy_to(device_memory& mem)
1103         {
1104                 /* this is blocking */
1105                 size_t size = mem.memory_size();
1106                 if(size != 0){
1107                         opencl_assert(clEnqueueWriteBuffer(cqCommandQueue,
1108                                                            CL_MEM_PTR(mem.device_pointer),
1109                                                            CL_TRUE,
1110                                                            0,
1111                                                            size,
1112                                                            (void*)mem.data_pointer,
1113                                                            0,
1114                                                            NULL, NULL));
1115                 }
1116         }
1117
1118         void mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
1119         {
1120                 size_t offset = elem*y*w;
1121                 size_t size = elem*w*h;
1122                 assert(size != 0);
1123                 opencl_assert(clEnqueueReadBuffer(cqCommandQueue,
1124                                                   CL_MEM_PTR(mem.device_pointer),
1125                                                   CL_TRUE,
1126                                                   offset,
1127                                                   size,
1128                                                   (uchar*)mem.data_pointer + offset,
1129                                                   0,
1130                                                   NULL, NULL));
1131         }
1132
1133         void mem_zero(device_memory& mem)
1134         {
1135                 if(mem.device_pointer) {
1136                         memset((void*)mem.data_pointer, 0, mem.memory_size());
1137                         mem_copy_to(mem);
1138                 }
1139         }
1140
1141         void mem_free(device_memory& mem)
1142         {
1143                 if(mem.device_pointer) {
1144                         if(mem.device_pointer != null_mem) {
1145                                 opencl_assert(clReleaseMemObject(CL_MEM_PTR(mem.device_pointer)));
1146                         }
1147                         mem.device_pointer = 0;
1148
1149                         stats.mem_free(mem.device_size);
1150                         mem.device_size = 0;
1151                 }
1152         }
1153
1154         void const_copy_to(const char *name, void *host, size_t size)
1155         {
1156                 ConstMemMap::iterator i = const_mem_map.find(name);
1157
1158                 if(i == const_mem_map.end()) {
1159                         device_vector<uchar> *data = new device_vector<uchar>();
1160                         data->copy((uchar*)host, size);
1161
1162                         mem_alloc(*data, MEM_READ_ONLY);
1163                         i = const_mem_map.insert(ConstMemMap::value_type(name, data)).first;
1164                 }
1165                 else {
1166                         device_vector<uchar> *data = i->second;
1167                         data->copy((uchar*)host, size);
1168                 }
1169
1170                 mem_copy_to(*i->second);
1171         }
1172
1173         void tex_alloc(const char *name,
1174                        device_memory& mem,
1175                        InterpolationType /*interpolation*/,
1176                        bool /*periodic*/)
1177         {
1178                 VLOG(1) << "Texture allocate: " << name << ", " << mem.memory_size() << " bytes.";
1179                 mem_alloc(mem, MEM_READ_ONLY);
1180                 mem_copy_to(mem);
1181                 assert(mem_map.find(name) == mem_map.end());
1182                 mem_map.insert(MemMap::value_type(name, mem.device_pointer));
1183         }
1184
1185         void tex_free(device_memory& mem)
1186         {
1187                 if(mem.device_pointer) {
1188                         foreach(const MemMap::value_type& value, mem_map) {
1189                                 if(value.second == mem.device_pointer) {
1190                                         mem_map.erase(value.first);
1191                                         break;
1192                                 }
1193                         }
1194
1195                         mem_free(mem);
1196                 }
1197         }
1198
1199         size_t global_size_round_up(int group_size, int global_size)
1200         {
1201                 int r = global_size % group_size;
1202                 return global_size + ((r == 0)? 0: group_size - r);
1203         }
1204
1205         void enqueue_kernel(cl_kernel kernel, size_t w, size_t h)
1206         {
1207                 size_t workgroup_size, max_work_items[3];
1208
1209                 clGetKernelWorkGroupInfo(kernel, cdDevice,
1210                         CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL);
1211                 clGetDeviceInfo(cdDevice,
1212                         CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, max_work_items, NULL);
1213         
1214                 /* try to divide evenly over 2 dimensions */
1215                 size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1);
1216                 size_t local_size[2] = {sqrt_workgroup_size, sqrt_workgroup_size};
1217
1218                 /* some implementations have max size 1 on 2nd dimension */
1219                 if(local_size[1] > max_work_items[1]) {
1220                         local_size[0] = workgroup_size/max_work_items[1];
1221                         local_size[1] = max_work_items[1];
1222                 }
1223
1224                 size_t global_size[2] = {global_size_round_up(local_size[0], w), global_size_round_up(local_size[1], h)};
1225
1226                 /* run kernel */
1227                 opencl_assert(clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL));
1228                 opencl_assert(clFlush(cqCommandQueue));
1229         }
1230
1231         void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name)
1232         {
1233                 cl_mem ptr;
1234
1235                 MemMap::iterator i = mem_map.find(name);
1236                 if(i != mem_map.end()) {
1237                         ptr = CL_MEM_PTR(i->second);
1238                 }
1239                 else {
1240                         /* work around NULL not working, even though the spec says otherwise */
1241                         ptr = CL_MEM_PTR(null_mem);
1242                 }
1243                 
1244                 opencl_assert(clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void*)&ptr));
1245         }
1246
1247         void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half)
1248         {
1249                 /* cast arguments to cl types */
1250                 cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
1251                 cl_mem d_rgba = (rgba_byte)? CL_MEM_PTR(rgba_byte): CL_MEM_PTR(rgba_half);
1252                 cl_mem d_buffer = CL_MEM_PTR(buffer);
1253                 cl_int d_x = task.x;
1254                 cl_int d_y = task.y;
1255                 cl_int d_w = task.w;
1256                 cl_int d_h = task.h;
1257                 cl_float d_sample_scale = 1.0f/(task.sample + 1);
1258                 cl_int d_offset = task.offset;
1259                 cl_int d_stride = task.stride;
1260
1261
1262                 cl_kernel ckFilmConvertKernel = (rgba_byte)? ckFilmConvertByteKernel: ckFilmConvertHalfFloatKernel;
1263
1264                 cl_uint start_arg_index =
1265                         kernel_set_args(ckFilmConvertKernel,
1266                                         0,
1267                                         d_data,
1268                                         d_rgba,
1269                                         d_buffer);
1270
1271 #define KERNEL_TEX(type, ttype, name) \
1272         set_kernel_arg_mem(ckFilmConvertKernel, &start_arg_index, #name);
1273 #include "kernel_textures.h"
1274 #undef KERNEL_TEX
1275
1276                 start_arg_index += kernel_set_args(ckFilmConvertKernel,
1277                                                    start_arg_index,
1278                                                    d_sample_scale,
1279                                                    d_x,
1280                                                    d_y,
1281                                                    d_w,
1282                                                    d_h,
1283                                                    d_offset,
1284                                                    d_stride);
1285
1286                 enqueue_kernel(ckFilmConvertKernel, d_w, d_h);
1287         }
1288
1289         void shader(DeviceTask& task)
1290         {
1291                 /* cast arguments to cl types */
1292                 cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
1293                 cl_mem d_input = CL_MEM_PTR(task.shader_input);
1294                 cl_mem d_output = CL_MEM_PTR(task.shader_output);
1295                 cl_int d_shader_eval_type = task.shader_eval_type;
1296                 cl_int d_shader_x = task.shader_x;
1297                 cl_int d_shader_w = task.shader_w;
1298                 cl_int d_offset = task.offset;
1299
1300                 cl_kernel kernel;
1301
1302                 if(task.shader_eval_type >= SHADER_EVAL_BAKE)
1303                         kernel = ckBakeKernel;
1304                 else
1305                         kernel = ckShaderKernel;
1306
1307                 for(int sample = 0; sample < task.num_samples; sample++) {
1308
1309                         if(task.get_cancel())
1310                                 break;
1311
1312                         cl_int d_sample = sample;
1313
1314                         cl_uint start_arg_index =
1315                                 kernel_set_args(kernel,
1316                                                 0,
1317                                                 d_data,
1318                                                 d_input,
1319                                                 d_output);
1320
1321 #define KERNEL_TEX(type, ttype, name) \
1322                 set_kernel_arg_mem(kernel, &start_arg_index, #name);
1323 #include "kernel_textures.h"
1324 #undef KERNEL_TEX
1325
1326                         start_arg_index += kernel_set_args(kernel,
1327                                                            start_arg_index,
1328                                                            d_shader_eval_type,
1329                                                            d_shader_x,
1330                                                            d_shader_w,
1331                                                            d_offset,
1332                                                            d_sample);
1333
1334                         enqueue_kernel(kernel, task.shader_w, 1);
1335
1336                         task.update_progress(NULL);
1337                 }
1338         }
1339
1340         class OpenCLDeviceTask : public DeviceTask {
1341         public:
1342                 OpenCLDeviceTask(OpenCLDeviceBase *device, DeviceTask& task)
1343                 : DeviceTask(task)
1344                 {
1345                         run = function_bind(&OpenCLDeviceBase::thread_run,
1346                                             device,
1347                                             this);
1348                 }
1349         };
1350
1351         int get_split_task_count(DeviceTask& /*task*/)
1352         {
1353                 return 1;
1354         }
1355
1356         void task_add(DeviceTask& task)
1357         {
1358                 task_pool.push(new OpenCLDeviceTask(this, task));
1359         }
1360
1361         void task_wait()
1362         {
1363                 task_pool.wait();
1364         }
1365
1366         void task_cancel()
1367         {
1368                 task_pool.cancel();
1369         }
1370
1371         virtual void thread_run(DeviceTask * /*task*/) = 0;
1372
1373 protected:
1374         string kernel_build_options(const string *debug_src = NULL)
1375         {
1376                 string build_options = " -cl-fast-relaxed-math ";
1377
1378                 if(platform_name == "NVIDIA CUDA") {
1379                         build_options += "-D__KERNEL_OPENCL_NVIDIA__ "
1380                                          "-cl-nv-maxrregcount=32 "
1381                                          "-cl-nv-verbose ";
1382
1383                         uint compute_capability_major, compute_capability_minor;
1384                         clGetDeviceInfo(cdDevice, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,
1385                                         sizeof(cl_uint), &compute_capability_major, NULL);
1386                         clGetDeviceInfo(cdDevice, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,
1387                                         sizeof(cl_uint), &compute_capability_minor, NULL);
1388
1389                         build_options += string_printf("-D__COMPUTE_CAPABILITY__=%u ",
1390                                                        compute_capability_major * 100 +
1391                                                        compute_capability_minor * 10);
1392                 }
1393
1394                 else if(platform_name == "Apple")
1395                         build_options += "-D__KERNEL_OPENCL_APPLE__ ";
1396
1397                 else if(platform_name == "AMD Accelerated Parallel Processing")
1398                         build_options += "-D__KERNEL_OPENCL_AMD__ ";
1399
1400                 else if(platform_name == "Intel(R) OpenCL") {
1401                         build_options += "-D__KERNEL_OPENCL_INTEL_CPU__ ";
1402
1403                         /* Options for gdb source level kernel debugging.
1404                          * this segfaults on linux currently.
1405                          */
1406                         if(opencl_kernel_use_debug() && debug_src)
1407                                 build_options += "-g -s \"" + *debug_src + "\" ";
1408                 }
1409
1410                 if(opencl_kernel_use_debug())
1411                         build_options += "-D__KERNEL_OPENCL_DEBUG__ ";
1412
1413 #ifdef WITH_CYCLES_DEBUG
1414                 build_options += "-D__KERNEL_DEBUG__ ";
1415 #endif
1416
1417                 return build_options;
1418         }
1419
1420         class ArgumentWrapper {
1421         public:
1422                 ArgumentWrapper() : size(0), pointer(NULL) {}
1423                 template <typename T>
1424                 ArgumentWrapper(T& argument) : size(sizeof(argument)),
1425                                                pointer(&argument) { }
1426                 size_t size;
1427                 void *pointer;
1428         };
1429
1430         /* TODO(sergey): In the future we can use variadic templates, once
1431          * C++0x is allowed. Should allow to clean this up a bit.
1432          */
1433         int kernel_set_args(cl_kernel kernel,
1434                             int start_argument_index,
1435                             const ArgumentWrapper& arg1 = ArgumentWrapper(),
1436                             const ArgumentWrapper& arg2 = ArgumentWrapper(),
1437                             const ArgumentWrapper& arg3 = ArgumentWrapper(),
1438                             const ArgumentWrapper& arg4 = ArgumentWrapper(),
1439                             const ArgumentWrapper& arg5 = ArgumentWrapper(),
1440                             const ArgumentWrapper& arg6 = ArgumentWrapper(),
1441                             const ArgumentWrapper& arg7 = ArgumentWrapper(),
1442                             const ArgumentWrapper& arg8 = ArgumentWrapper(),
1443                             const ArgumentWrapper& arg9 = ArgumentWrapper(),
1444                             const ArgumentWrapper& arg10 = ArgumentWrapper(),
1445                             const ArgumentWrapper& arg11 = ArgumentWrapper(),
1446                             const ArgumentWrapper& arg12 = ArgumentWrapper(),
1447                             const ArgumentWrapper& arg13 = ArgumentWrapper(),
1448                             const ArgumentWrapper& arg14 = ArgumentWrapper(),
1449                             const ArgumentWrapper& arg15 = ArgumentWrapper(),
1450                             const ArgumentWrapper& arg16 = ArgumentWrapper(),
1451                             const ArgumentWrapper& arg17 = ArgumentWrapper(),
1452                             const ArgumentWrapper& arg18 = ArgumentWrapper(),
1453                             const ArgumentWrapper& arg19 = ArgumentWrapper(),
1454                             const ArgumentWrapper& arg20 = ArgumentWrapper(),
1455                             const ArgumentWrapper& arg21 = ArgumentWrapper(),
1456                             const ArgumentWrapper& arg22 = ArgumentWrapper(),
1457                             const ArgumentWrapper& arg23 = ArgumentWrapper(),
1458                             const ArgumentWrapper& arg24 = ArgumentWrapper(),
1459                             const ArgumentWrapper& arg25 = ArgumentWrapper(),
1460                             const ArgumentWrapper& arg26 = ArgumentWrapper(),
1461                             const ArgumentWrapper& arg27 = ArgumentWrapper(),
1462                             const ArgumentWrapper& arg28 = ArgumentWrapper(),
1463                             const ArgumentWrapper& arg29 = ArgumentWrapper(),
1464                             const ArgumentWrapper& arg30 = ArgumentWrapper(),
1465                             const ArgumentWrapper& arg31 = ArgumentWrapper(),
1466                             const ArgumentWrapper& arg32 = ArgumentWrapper(),
1467                             const ArgumentWrapper& arg33 = ArgumentWrapper())
1468         {
1469                 int current_arg_index = 0;
1470 #define FAKE_VARARG_HANDLE_ARG(arg) \
1471                 do { \
1472                         if(arg.pointer != NULL) { \
1473                                 opencl_assert(clSetKernelArg( \
1474                                         kernel, \
1475                                         start_argument_index + current_arg_index, \
1476                                         arg.size, arg.pointer)); \
1477                                 ++current_arg_index; \
1478                         } \
1479                         else { \
1480                                 return current_arg_index; \
1481                         } \
1482                 } while(false)
1483                 FAKE_VARARG_HANDLE_ARG(arg1);
1484                 FAKE_VARARG_HANDLE_ARG(arg2);
1485                 FAKE_VARARG_HANDLE_ARG(arg3);
1486                 FAKE_VARARG_HANDLE_ARG(arg4);
1487                 FAKE_VARARG_HANDLE_ARG(arg5);
1488                 FAKE_VARARG_HANDLE_ARG(arg6);
1489                 FAKE_VARARG_HANDLE_ARG(arg7);
1490                 FAKE_VARARG_HANDLE_ARG(arg8);
1491                 FAKE_VARARG_HANDLE_ARG(arg9);
1492                 FAKE_VARARG_HANDLE_ARG(arg10);
1493                 FAKE_VARARG_HANDLE_ARG(arg11);
1494                 FAKE_VARARG_HANDLE_ARG(arg12);
1495                 FAKE_VARARG_HANDLE_ARG(arg13);
1496                 FAKE_VARARG_HANDLE_ARG(arg14);
1497                 FAKE_VARARG_HANDLE_ARG(arg15);
1498                 FAKE_VARARG_HANDLE_ARG(arg16);
1499                 FAKE_VARARG_HANDLE_ARG(arg17);
1500                 FAKE_VARARG_HANDLE_ARG(arg18);
1501                 FAKE_VARARG_HANDLE_ARG(arg19);
1502                 FAKE_VARARG_HANDLE_ARG(arg20);
1503                 FAKE_VARARG_HANDLE_ARG(arg21);
1504                 FAKE_VARARG_HANDLE_ARG(arg22);
1505                 FAKE_VARARG_HANDLE_ARG(arg23);
1506                 FAKE_VARARG_HANDLE_ARG(arg24);
1507                 FAKE_VARARG_HANDLE_ARG(arg25);
1508                 FAKE_VARARG_HANDLE_ARG(arg26);
1509                 FAKE_VARARG_HANDLE_ARG(arg27);
1510                 FAKE_VARARG_HANDLE_ARG(arg28);
1511                 FAKE_VARARG_HANDLE_ARG(arg29);
1512                 FAKE_VARARG_HANDLE_ARG(arg30);
1513                 FAKE_VARARG_HANDLE_ARG(arg31);
1514                 FAKE_VARARG_HANDLE_ARG(arg32);
1515                 FAKE_VARARG_HANDLE_ARG(arg33);
1516 #undef FAKE_VARARG_HANDLE_ARG
1517                 return current_arg_index;
1518         }
1519
1520         inline void release_kernel_safe(cl_kernel kernel)
1521         {
1522                 if(kernel) {
1523                         clReleaseKernel(kernel);
1524                 }
1525         }
1526
1527         inline void release_mem_object_safe(cl_mem mem)
1528         {
1529                 if(mem != NULL) {
1530                         clReleaseMemObject(mem);
1531                 }
1532         }
1533
1534         inline void release_program_safe(cl_program program)
1535         {
1536                 if(program) {
1537                         clReleaseProgram(program);
1538                 }
1539         }
1540
1541         string build_options_from_requested_features(
1542                 const DeviceRequestedFeatures& requested_features)
1543         {
1544                 string build_options = "";
1545                 if(requested_features.experimental) {
1546                         build_options += " -D__KERNEL_EXPERIMENTAL__";
1547                 }
1548                 build_options += " -D__NODES_MAX_GROUP__=" +
1549                         string_printf("%d", requested_features.max_nodes_group);
1550                 build_options += " -D__NODES_FEATURES__=" +
1551                         string_printf("%d", requested_features.nodes_features);
1552                 build_options += string_printf(" -D__MAX_CLOSURE__=%d",
1553                                                requested_features.max_closure);
1554                 if(!requested_features.use_hair) {
1555                         build_options += " -D__NO_HAIR__";
1556                 }
1557                 if(!requested_features.use_object_motion) {
1558                         build_options += " -D__NO_OBJECT_MOTION__";
1559                 }
1560                 if(!requested_features.use_camera_motion) {
1561                         build_options += " -D__NO_CAMERA_MOTION__";
1562                 }
1563                 if(!requested_features.use_baking) {
1564                         build_options += " -D__NO_BAKING__";
1565                 }
1566                 return build_options;
1567         }
1568
1569         /* ** Those guys are for workign around some compiler-specific bugs ** */
1570
1571         virtual cl_program load_cached_kernel(
1572                 const DeviceRequestedFeatures& /*requested_features*/,
1573                 OpenCLCache::ProgramName program_name,
1574                 thread_scoped_lock& cache_locker)
1575         {
1576                 return OpenCLCache::get_program(cpPlatform,
1577                                                 cdDevice,
1578                                                 program_name,
1579                                                 cache_locker);
1580         }
1581
1582         virtual void store_cached_kernel(cl_platform_id platform,
1583                                          cl_device_id device,
1584                                          cl_program program,
1585                                          OpenCLCache::ProgramName program_name,
1586                                          thread_scoped_lock& cache_locker)
1587         {
1588                 OpenCLCache::store_program(platform,
1589                                            device,
1590                                            program,
1591                                            program_name,
1592                                            cache_locker);
1593         }
1594
1595         virtual string build_options_for_base_program(
1596                 const DeviceRequestedFeatures& /*requested_features*/)
1597         {
1598                 /* TODO(sergey): By default we compile all features, meaning
1599                  * mega kernel is not getting feature-based optimizations.
1600                  *
1601                  * Ideally we need always compile kernel with as less features
1602                  * enabed as possible to keep performance at it's max.
1603                  */
1604                 return "";
1605         }
1606 };
1607
1608 class OpenCLDeviceMegaKernel : public OpenCLDeviceBase
1609 {
1610 public:
1611         cl_kernel ckPathTraceKernel;
1612         cl_program path_trace_program;
1613
1614         OpenCLDeviceMegaKernel(DeviceInfo& info, Stats &stats, bool background_)
1615         : OpenCLDeviceBase(info, stats, background_)
1616         {
1617                 ckPathTraceKernel = NULL;
1618                 path_trace_program = NULL;
1619         }
1620
1621         bool load_kernels(const DeviceRequestedFeatures& requested_features)
1622         {
1623                 /* Get Shader, bake and film convert kernels.
1624                  * It'll also do verification of OpenCL actually initialized.
1625                  */
1626                 if(!OpenCLDeviceBase::load_kernels(requested_features)) {
1627                         return false;
1628                 }
1629
1630                 /* Try to use cached kernel. */
1631                 thread_scoped_lock cache_locker;
1632                 path_trace_program = OpenCLCache::get_program(cpPlatform,
1633                                                               cdDevice,
1634                                                               OpenCLCache::OCL_DEV_MEGAKERNEL_PROGRAM,
1635                                                               cache_locker);
1636
1637                 if(!path_trace_program) {
1638                         /* Verify we have right opencl version. */
1639                         if(!opencl_version_check())
1640                                 return false;
1641
1642                         /* Calculate md5 hash to detect changes. */
1643                         string kernel_path = path_get("kernel");
1644                         string kernel_md5 = path_files_md5_hash(kernel_path);
1645                         string custom_kernel_build_options = "-D__COMPILE_ONLY_MEGAKERNEL__ ";
1646                         string device_md5 = device_md5_hash(custom_kernel_build_options);
1647
1648                         /* Path to cached binary. */
1649                         string clbin = string_printf("cycles_kernel_%s_%s.clbin",
1650                                                      device_md5.c_str(),
1651                                                      kernel_md5.c_str());
1652                         clbin = path_user_get(path_join("cache", clbin));
1653
1654                         /* Path to preprocessed source for debugging. */
1655                         string clsrc, *debug_src = NULL;
1656                         if(opencl_kernel_use_debug()) {
1657                                 clsrc = string_printf("cycles_kernel_%s_%s.cl",
1658                                                       device_md5.c_str(),
1659                                                       kernel_md5.c_str());
1660                                 clsrc = path_user_get(path_join("cache", clsrc));
1661                                 debug_src = &clsrc;
1662                         }
1663
1664                         /* If exists already, try use it. */
1665                         if(path_exists(clbin) && load_binary(kernel_path,
1666                                                              clbin,
1667                                                              custom_kernel_build_options,
1668                                                              &path_trace_program,
1669                                                              debug_src)) {
1670                                 /* Kernel loaded from binary, nothing to do. */
1671                         }
1672                         else {
1673                                 string init_kernel_source = "#include \"kernels/opencl/kernel.cl\" // " +
1674                                                             kernel_md5 + "\n";
1675                                 /* If does not exist or loading binary failed, compile kernel. */
1676                                 if(!compile_kernel(kernel_path,
1677                                                    init_kernel_source,
1678                                                    custom_kernel_build_options,
1679                                                    &path_trace_program,
1680                                                    debug_src))
1681                                 {
1682                                         return false;
1683                                 }
1684                                 /* Save binary for reuse. */
1685                                 if(!save_binary(&path_trace_program, clbin)) {
1686                                         return false;
1687                                 }
1688                         }
1689                         /* Cache the program. */
1690                         OpenCLCache::store_program(cpPlatform,
1691                                                    cdDevice,
1692                                                    path_trace_program,
1693                                                    OpenCLCache::OCL_DEV_MEGAKERNEL_PROGRAM,
1694                                                    cache_locker);
1695                 }
1696
1697                 /* Find kernels. */
1698                 ckPathTraceKernel = clCreateKernel(path_trace_program,
1699                                                    "kernel_ocl_path_trace",
1700                                                    &ciErr);
1701                 if(opencl_error(ciErr))
1702                         return false;
1703                 return true;
1704         }
1705
1706         ~OpenCLDeviceMegaKernel()
1707         {
1708                 task_pool.stop();
1709                 release_kernel_safe(ckPathTraceKernel);
1710                 release_program_safe(path_trace_program);
1711         }
1712
1713         void path_trace(RenderTile& rtile, int sample)
1714         {
1715                 /* Cast arguments to cl types. */
1716                 cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
1717                 cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
1718                 cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state);
1719                 cl_int d_x = rtile.x;
1720                 cl_int d_y = rtile.y;
1721                 cl_int d_w = rtile.w;
1722                 cl_int d_h = rtile.h;
1723                 cl_int d_offset = rtile.offset;
1724                 cl_int d_stride = rtile.stride;
1725
1726                 /* Sample arguments. */
1727                 cl_int d_sample = sample;
1728
1729                 cl_uint start_arg_index =
1730                         kernel_set_args(ckPathTraceKernel,
1731                                         0,
1732                                         d_data,
1733                                         d_buffer,
1734                                         d_rng_state);
1735
1736 #define KERNEL_TEX(type, ttype, name) \
1737                 set_kernel_arg_mem(ckPathTraceKernel, &start_arg_index, #name);
1738 #include "kernel_textures.h"
1739 #undef KERNEL_TEX
1740
1741                 start_arg_index += kernel_set_args(ckPathTraceKernel,
1742                                                    start_arg_index,
1743                                                    d_sample,
1744                                                    d_x,
1745                                                    d_y,
1746                                                    d_w,
1747                                                    d_h,
1748                                                    d_offset,
1749                                                    d_stride);
1750
1751                 enqueue_kernel(ckPathTraceKernel, d_w, d_h);
1752         }
1753
1754         void thread_run(DeviceTask *task)
1755         {
1756                 if(task->type == DeviceTask::FILM_CONVERT) {
1757                         film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half);
1758                 }
1759                 else if(task->type == DeviceTask::SHADER) {
1760                         shader(*task);
1761                 }
1762                 else if(task->type == DeviceTask::PATH_TRACE) {
1763                         RenderTile tile;
1764                         /* Keep rendering tiles until done. */
1765                         while(task->acquire_tile(this, tile)) {
1766                                 int start_sample = tile.start_sample;
1767                                 int end_sample = tile.start_sample + tile.num_samples;
1768
1769                                 for(int sample = start_sample; sample < end_sample; sample++) {
1770                                         if(task->get_cancel()) {
1771                                                 if(task->need_finish_queue == false)
1772                                                         break;
1773                                         }
1774
1775                                         path_trace(tile, sample);
1776
1777                                         tile.sample = sample + 1;
1778
1779                                         task->update_progress(&tile);
1780                                 }
1781
1782                                 /* Complete kernel execution before release tile */
1783                                 /* This helps in multi-device render;
1784                                  * The device that reaches the critical-section function
1785                                  * release_tile waits (stalling other devices from entering
1786                                  * release_tile) for all kernels to complete. If device1 (a
1787                                  * slow-render device) reaches release_tile first then it would
1788                                  * stall device2 (a fast-render device) from proceeding to render
1789                                  * next tile.
1790                                  */
1791                                 clFinish(cqCommandQueue);
1792
1793                                 task->release_tile(tile);
1794                         }
1795                 }
1796         }
1797 };
1798
1799 /* TODO(sergey): This is to keep tile split on OpenCL level working
1800  * for now, since without this view-port render does not work as it
1801  * should.
1802  *
1803  * Ideally it'll be done on the higher level, but we need to get ready
1804  * for merge rather soon, so let's keep split logic private here in
1805  * the file.
1806  */
1807 class SplitRenderTile : public RenderTile {
1808 public:
1809         SplitRenderTile()
1810                 : RenderTile(),
1811                   buffer_offset_x(0),
1812                   buffer_offset_y(0),
1813                   rng_state_offset_x(0),
1814                   rng_state_offset_y(0),
1815                   buffer_rng_state_stride(0) {}
1816
1817         explicit SplitRenderTile(RenderTile& tile)
1818                 : RenderTile(),
1819                   buffer_offset_x(0),
1820                   buffer_offset_y(0),
1821                   rng_state_offset_x(0),
1822                   rng_state_offset_y(0),
1823                   buffer_rng_state_stride(0)
1824         {
1825                 x = tile.x;
1826                 y = tile.y;
1827                 w = tile.w;
1828                 h = tile.h;
1829                 start_sample = tile.start_sample;
1830                 num_samples = tile.num_samples;
1831                 sample = tile.sample;
1832                 resolution = tile.resolution;
1833                 offset = tile.offset;
1834                 stride = tile.stride;
1835                 buffer = tile.buffer;
1836                 rng_state = tile.rng_state;
1837                 buffers = tile.buffers;
1838         }
1839
1840         /* Split kernel is device global memory constrained;
1841          * hence split kernel cant render big tile size's in
1842          * one go. If the user sets a big tile size (big tile size
1843          * is a term relative to the available device global memory),
1844          * we split the tile further and then call path_trace on
1845          * each of those split tiles. The following variables declared,
1846          * assist in achieving that purpose
1847          */
1848         int buffer_offset_x;
1849         int buffer_offset_y;
1850         int rng_state_offset_x;
1851         int rng_state_offset_y;
1852         int buffer_rng_state_stride;
1853 };
1854
1855 /* OpenCLDeviceSplitKernel's declaration/definition. */
1856 class OpenCLDeviceSplitKernel : public OpenCLDeviceBase
1857 {
1858 public:
1859         /* Kernel declaration. */
1860         cl_kernel ckPathTraceKernel_data_init;
1861         cl_kernel ckPathTraceKernel_scene_intersect;
1862         cl_kernel ckPathTraceKernel_lamp_emission;
1863         cl_kernel ckPathTraceKernel_queue_enqueue;
1864         cl_kernel ckPathTraceKernel_background_buffer_update;
1865         cl_kernel ckPathTraceKernel_shader_eval;
1866         cl_kernel ckPathTraceKernel_holdout_emission_blurring_pathtermination_ao;
1867         cl_kernel ckPathTraceKernel_direct_lighting;
1868         cl_kernel ckPathTraceKernel_shadow_blocked;
1869         cl_kernel ckPathTraceKernel_next_iteration_setup;
1870         cl_kernel ckPathTraceKernel_sum_all_radiance;
1871
1872         /* cl_program declaration. */
1873         cl_program data_init_program;
1874         cl_program scene_intersect_program;
1875         cl_program lamp_emission_program;
1876         cl_program queue_enqueue_program;
1877         cl_program background_buffer_update_program;
1878         cl_program shader_eval_program;
1879         cl_program holdout_emission_blurring_pathtermination_ao_program;
1880         cl_program direct_lighting_program;
1881         cl_program shadow_blocked_program;
1882         cl_program next_iteration_setup_program;
1883         cl_program sum_all_radiance_program;
1884
1885         /* Global memory variables [porting]; These memory is used for
1886          * co-operation between different kernels; Data written by one
1887          * kernel will be available to another kernel via this global
1888          * memory.
1889          */
1890         cl_mem rng_coop;
1891         cl_mem throughput_coop;
1892         cl_mem L_transparent_coop;
1893         cl_mem PathRadiance_coop;
1894         cl_mem Ray_coop;
1895         cl_mem PathState_coop;
1896         cl_mem Intersection_coop;
1897         cl_mem kgbuffer;  /* KernelGlobals buffer. */
1898
1899         /* Global buffers for ShaderData. */
1900         cl_mem sd;             /* ShaderData used in the main path-iteration loop. */
1901         cl_mem sd_DL_shadow;   /* ShaderData used in Direct Lighting and
1902                                 * shadow_blocked kernel.
1903                                 */
1904
1905         /* Global buffers of each member of ShaderData. */
1906         cl_mem P_sd;
1907         cl_mem P_sd_DL_shadow;
1908         cl_mem N_sd;
1909         cl_mem N_sd_DL_shadow;
1910         cl_mem Ng_sd;
1911         cl_mem Ng_sd_DL_shadow;
1912         cl_mem I_sd;
1913         cl_mem I_sd_DL_shadow;
1914         cl_mem shader_sd;
1915         cl_mem shader_sd_DL_shadow;
1916         cl_mem flag_sd;
1917         cl_mem flag_sd_DL_shadow;
1918         cl_mem prim_sd;
1919         cl_mem prim_sd_DL_shadow;
1920         cl_mem type_sd;
1921         cl_mem type_sd_DL_shadow;
1922         cl_mem u_sd;
1923         cl_mem u_sd_DL_shadow;
1924         cl_mem v_sd;
1925         cl_mem v_sd_DL_shadow;
1926         cl_mem object_sd;
1927         cl_mem object_sd_DL_shadow;
1928         cl_mem time_sd;
1929         cl_mem time_sd_DL_shadow;
1930         cl_mem ray_length_sd;
1931         cl_mem ray_length_sd_DL_shadow;
1932         cl_mem ray_depth_sd;
1933         cl_mem ray_depth_sd_DL_shadow;
1934         cl_mem transparent_depth_sd;
1935         cl_mem transparent_depth_sd_DL_shadow;
1936
1937         /* Ray differentials. */
1938         cl_mem dP_sd, dI_sd;
1939         cl_mem dP_sd_DL_shadow, dI_sd_DL_shadow;
1940         cl_mem du_sd, dv_sd;
1941         cl_mem du_sd_DL_shadow, dv_sd_DL_shadow;
1942
1943         /* Dp/Du */
1944         cl_mem dPdu_sd, dPdv_sd;
1945         cl_mem dPdu_sd_DL_shadow, dPdv_sd_DL_shadow;
1946
1947         /* Object motion. */
1948         cl_mem ob_tfm_sd, ob_itfm_sd;
1949         cl_mem ob_tfm_sd_DL_shadow, ob_itfm_sd_DL_shadow;
1950
1951         cl_mem closure_sd;
1952         cl_mem closure_sd_DL_shadow;
1953         cl_mem num_closure_sd;
1954         cl_mem num_closure_sd_DL_shadow;
1955         cl_mem randb_closure_sd;
1956         cl_mem randb_closure_sd_DL_shadow;
1957         cl_mem ray_P_sd;
1958         cl_mem ray_P_sd_DL_shadow;
1959         cl_mem ray_dP_sd;
1960         cl_mem ray_dP_sd_DL_shadow;
1961
1962         /* Global memory required for shadow blocked and accum_radiance. */
1963         cl_mem BSDFEval_coop;
1964         cl_mem ISLamp_coop;
1965         cl_mem LightRay_coop;
1966         cl_mem AOAlpha_coop;
1967         cl_mem AOBSDF_coop;
1968         cl_mem AOLightRay_coop;
1969         cl_mem Intersection_coop_AO;
1970         cl_mem Intersection_coop_DL;
1971
1972 #ifdef WITH_CYCLES_DEBUG
1973         /* DebugData memory */
1974         cl_mem debugdata_coop;
1975 #endif
1976
1977         /* Global state array that tracks ray state. */
1978         cl_mem ray_state;
1979
1980         /* Per sample buffers. */
1981         cl_mem per_sample_output_buffers;
1982
1983         /* Denotes which sample each ray is being processed for. */
1984         cl_mem work_array;
1985
1986         /* Queue */
1987         cl_mem Queue_data;  /* Array of size queuesize * num_queues * sizeof(int). */
1988         cl_mem Queue_index; /* Array of size num_queues * sizeof(int);
1989                              * Tracks the size of each queue.
1990                              */
1991
1992         /* Flag to make sceneintersect and lampemission kernel use queues. */
1993         cl_mem use_queues_flag;
1994
1995         /* Amount of memory in output buffer associated with one pixel/thread. */
1996         size_t per_thread_output_buffer_size;
1997
1998         /* Total allocatable available device memory. */
1999         size_t total_allocatable_memory;
2000
2001         /* host version of ray_state; Used in checking host path-iteration
2002          * termination.
2003          */
2004         char *hostRayStateArray;
2005
2006         /* Number of path-iterations to be done in one shot. */
2007         unsigned int PathIteration_times;
2008
2009 #ifdef __WORK_STEALING__
2010         /* Work pool with respect to each work group. */
2011         cl_mem work_pool_wgs;
2012
2013         /* Denotes the maximum work groups possible w.r.t. current tile size. */
2014         unsigned int max_work_groups;
2015 #endif
2016
2017         /* clos_max value for which the kernels have been loaded currently. */
2018         int current_max_closure;
2019
2020         /* Marked True in constructor and marked false at the end of path_trace(). */
2021         bool first_tile;
2022
2023         OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_)
2024         : OpenCLDeviceBase(info, stats, background_)
2025         {
2026                 background = background_;
2027
2028                 /* Initialize kernels. */
2029                 ckPathTraceKernel_data_init = NULL;
2030                 ckPathTraceKernel_scene_intersect = NULL;
2031                 ckPathTraceKernel_lamp_emission = NULL;
2032                 ckPathTraceKernel_background_buffer_update = NULL;
2033                 ckPathTraceKernel_shader_eval = NULL;
2034                 ckPathTraceKernel_holdout_emission_blurring_pathtermination_ao = NULL;
2035                 ckPathTraceKernel_direct_lighting = NULL;
2036                 ckPathTraceKernel_shadow_blocked = NULL;
2037                 ckPathTraceKernel_next_iteration_setup = NULL;
2038                 ckPathTraceKernel_sum_all_radiance = NULL;
2039                 ckPathTraceKernel_queue_enqueue = NULL;
2040
2041                 /* Initialize program. */
2042                 data_init_program = NULL;
2043                 scene_intersect_program = NULL;
2044                 lamp_emission_program = NULL;
2045                 queue_enqueue_program = NULL;
2046                 background_buffer_update_program = NULL;
2047                 shader_eval_program = NULL;
2048                 holdout_emission_blurring_pathtermination_ao_program = NULL;
2049                 direct_lighting_program = NULL;
2050                 shadow_blocked_program = NULL;
2051                 next_iteration_setup_program = NULL;
2052                 sum_all_radiance_program = NULL;
2053
2054                 /* Initialize cl_mem variables. */
2055                 kgbuffer = NULL;
2056                 sd = NULL;
2057                 sd_DL_shadow = NULL;
2058
2059                 P_sd = NULL;
2060                 P_sd_DL_shadow = NULL;
2061                 N_sd = NULL;
2062                 N_sd_DL_shadow = NULL;
2063                 Ng_sd = NULL;
2064                 Ng_sd_DL_shadow = NULL;
2065                 I_sd = NULL;
2066                 I_sd_DL_shadow = NULL;
2067                 shader_sd = NULL;
2068                 shader_sd_DL_shadow = NULL;
2069                 flag_sd = NULL;
2070                 flag_sd_DL_shadow = NULL;
2071                 prim_sd = NULL;
2072                 prim_sd_DL_shadow = NULL;
2073                 type_sd = NULL;
2074                 type_sd_DL_shadow = NULL;
2075                 u_sd = NULL;
2076                 u_sd_DL_shadow = NULL;
2077                 v_sd = NULL;
2078                 v_sd_DL_shadow = NULL;
2079                 object_sd = NULL;
2080                 object_sd_DL_shadow = NULL;
2081                 time_sd = NULL;
2082                 time_sd_DL_shadow = NULL;
2083                 ray_length_sd = NULL;
2084                 ray_length_sd_DL_shadow = NULL;
2085                 ray_depth_sd = NULL;
2086                 ray_depth_sd_DL_shadow = NULL;
2087                 transparent_depth_sd = NULL;
2088                 transparent_depth_sd_DL_shadow = NULL;
2089
2090                 /* Ray differentials. */
2091                 dP_sd = NULL;
2092                 dI_sd = NULL;
2093                 dP_sd_DL_shadow = NULL;
2094                 dI_sd_DL_shadow = NULL;
2095                 du_sd = NULL;
2096                 dv_sd = NULL;
2097                 du_sd_DL_shadow = NULL;
2098                 dv_sd_DL_shadow = NULL;
2099
2100                 /* Dp/Du */
2101                 dPdu_sd = NULL;
2102                 dPdv_sd = NULL;
2103                 dPdu_sd_DL_shadow = NULL;
2104                 dPdv_sd_DL_shadow = NULL;
2105
2106                 /* Object motion. */
2107                 ob_tfm_sd = NULL;
2108                 ob_itfm_sd = NULL;
2109                 ob_tfm_sd_DL_shadow = NULL;
2110                 ob_itfm_sd_DL_shadow = NULL;
2111
2112                 closure_sd = NULL;
2113                 closure_sd_DL_shadow = NULL;
2114                 num_closure_sd = NULL;
2115                 num_closure_sd_DL_shadow = NULL;
2116                 randb_closure_sd = NULL;
2117                 randb_closure_sd_DL_shadow = NULL;
2118                 ray_P_sd = NULL;
2119                 ray_P_sd_DL_shadow = NULL;
2120                 ray_dP_sd = NULL;
2121                 ray_dP_sd_DL_shadow = NULL;
2122
2123                 rng_coop = NULL;
2124                 throughput_coop = NULL;
2125                 L_transparent_coop = NULL;
2126                 PathRadiance_coop = NULL;
2127                 Ray_coop = NULL;
2128                 PathState_coop = NULL;
2129                 Intersection_coop = NULL;
2130                 ray_state = NULL;
2131
2132                 AOAlpha_coop = NULL;
2133                 AOBSDF_coop = NULL;
2134                 AOLightRay_coop = NULL;
2135                 BSDFEval_coop = NULL;
2136                 ISLamp_coop = NULL;
2137                 LightRay_coop = NULL;
2138                 Intersection_coop_AO = NULL;
2139                 Intersection_coop_DL = NULL;
2140
2141 #ifdef WITH_CYCLES_DEBUG
2142                 debugdata_coop = NULL;
2143 #endif
2144
2145                 work_array = NULL;
2146
2147                 /* Queue. */
2148                 Queue_data = NULL;
2149                 Queue_index = NULL;
2150                 use_queues_flag = NULL;
2151
2152                 per_sample_output_buffers = NULL;
2153
2154                 per_thread_output_buffer_size = 0;
2155                 hostRayStateArray = NULL;
2156                 PathIteration_times = PATH_ITER_INC_FACTOR;
2157 #ifdef __WORK_STEALING__
2158                 work_pool_wgs = NULL;
2159                 max_work_groups = 0;
2160 #endif
2161                 current_max_closure = -1;
2162                 first_tile = true;
2163
2164                 /* Get device's maximum memory that can be allocated. */
2165                 ciErr = clGetDeviceInfo(cdDevice,
2166                                         CL_DEVICE_MAX_MEM_ALLOC_SIZE,
2167                                         sizeof(size_t),
2168                                         &total_allocatable_memory,
2169                                         NULL);
2170                 assert(ciErr == CL_SUCCESS);
2171                 if(platform_name == "AMD Accelerated Parallel Processing") {
2172                         /* This value is tweak-able; AMD platform does not seem to
2173                          * give maximum performance when all of CL_DEVICE_MAX_MEM_ALLOC_SIZE
2174                          * is considered for further computation.
2175                          */
2176                         total_allocatable_memory /= 2;
2177                 }
2178         }
2179
2180         /* TODO(sergey): Seems really close to load_kernel(),
2181          * could it be de-duplicated?
2182          */
2183         bool load_split_kernel(string kernel_path,
2184                                string kernel_init_source,
2185                                string clbin,
2186                                string custom_kernel_build_options,
2187                                cl_program *program,
2188                                const string *debug_src = NULL)
2189         {
2190                 if(!opencl_version_check())
2191                         return false;
2192
2193                 clbin = path_user_get(path_join("cache", clbin));
2194
2195                 /* If exists already, try use it. */
2196                 if(path_exists(clbin) && load_binary(kernel_path,
2197                                                      clbin,
2198                                                      custom_kernel_build_options,
2199                                                      program,
2200                                                      debug_src)) {
2201                         /* Kernel loaded from binary. */
2202                 }
2203                 else {
2204                         /* If does not exist or loading binary failed, compile kernel. */
2205                         if(!compile_kernel(kernel_path,
2206                                            kernel_init_source,
2207                                            custom_kernel_build_options,
2208                                            program,
2209                                            debug_src))
2210                         {
2211                                 return false;
2212                         }
2213                         /* Save binary for reuse. */
2214                         if(!save_binary(program, clbin)) {
2215                                 return false;
2216                         }
2217                 }
2218                 return true;
2219         }
2220
2221         /* Split kernel utility functions. */
2222         size_t get_tex_size(const char *tex_name)
2223         {
2224                 cl_mem ptr;
2225                 size_t ret_size = 0;
2226                 MemMap::iterator i = mem_map.find(tex_name);
2227                 if(i != mem_map.end()) {
2228                         ptr = CL_MEM_PTR(i->second);
2229                         ciErr = clGetMemObjectInfo(ptr,
2230                                                    CL_MEM_SIZE,
2231                                                    sizeof(ret_size),
2232                                                    &ret_size,
2233                                                    NULL);
2234                         assert(ciErr == CL_SUCCESS);
2235                 }
2236                 return ret_size;
2237         }
2238
2239         size_t get_shader_closure_size(int max_closure)
2240         {
2241                 return (sizeof(ShaderClosure) * max_closure);
2242         }
2243
2244         size_t get_shader_data_size(size_t shader_closure_size)
2245         {
2246                 /* ShaderData size without accounting for ShaderClosure array. */
2247                 size_t shader_data_size =
2248                         sizeof(ShaderData) - (sizeof(ShaderClosure) * MAX_CLOSURE);
2249                 return (shader_data_size + shader_closure_size);
2250         }
2251
2252         /* Returns size of KernelGlobals structure associated with OpenCL. */
2253         size_t get_KernelGlobals_size()
2254         {
2255                 /* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to
2256                  * fetch its size.
2257                  */
2258                 typedef struct KernelGlobals {
2259                         ccl_constant KernelData *data;
2260 #define KERNEL_TEX(type, ttype, name) \
2261         ccl_global type *name;
2262 #include "kernel_textures.h"
2263 #undef KERNEL_TEX
2264                 } KernelGlobals;
2265
2266                 return sizeof(KernelGlobals);
2267         }
2268
2269         /* Returns size of Structure of arrays implementation of. */
2270         size_t get_shaderdata_soa_size()
2271         {
2272                 size_t shader_soa_size = 0;
2273
2274 #define SD_VAR(type, what) shader_soa_size += sizeof(void *);
2275 #define SD_CLOSURE_VAR(type, what, max_closure) shader_soa_size += sizeof(void *);
2276                 #include "kernel_shaderdata_vars.h"
2277 #undef SD_VAR
2278 #undef SD_CLOSURE_VAR
2279
2280                 return shader_soa_size;
2281         }
2282
2283         bool load_kernels(const DeviceRequestedFeatures& requested_features)
2284         {
2285                 /* Get Shader, bake and film_convert kernels.
2286                  * It'll also do verification of OpenCL actually initialized.
2287                  */
2288                 if(!OpenCLDeviceBase::load_kernels(requested_features)) {
2289                         return false;
2290                 }
2291
2292                 string kernel_path = path_get("kernel");
2293                 string kernel_md5 = path_files_md5_hash(kernel_path);
2294                 string device_md5;
2295                 string kernel_init_source;
2296                 string clbin;
2297                 string clsrc, *debug_src = NULL;
2298
2299                 string build_options = "-D__SPLIT_KERNEL__";
2300 #ifdef __WORK_STEALING__
2301                 build_options += " -D__WORK_STEALING__";
2302 #endif
2303                 build_options += build_options_from_requested_features(requested_features);
2304
2305                 /* Set compute device build option. */
2306                 cl_device_type device_type;
2307                 ciErr = clGetDeviceInfo(cdDevice,
2308                                         CL_DEVICE_TYPE,
2309                                         sizeof(cl_device_type),
2310                                         &device_type,
2311                                         NULL);
2312                 assert(ciErr == CL_SUCCESS);
2313                 if(device_type == CL_DEVICE_TYPE_GPU) {
2314                         build_options += " -D__COMPUTE_DEVICE_GPU__";
2315                 }
2316
2317 #define GLUE(a, b) a ## b
2318 #define LOAD_KERNEL(name) \
2319         do { \
2320                 kernel_init_source = "#include \"kernels/opencl/kernel_" #name ".cl\" // " + \
2321                                      kernel_md5 + "\n"; \
2322                 device_md5 = device_md5_hash(build_options); \
2323                 clbin = string_printf("cycles_kernel_%s_%s_" #name ".clbin", \
2324                                       device_md5.c_str(), kernel_md5.c_str()); \
2325                 if(opencl_kernel_use_debug()) { \
2326                         clsrc = string_printf("cycles_kernel_%s_%s_" #name ".cl", \
2327                                               device_md5.c_str(), kernel_md5.c_str()); \
2328                         clsrc = path_user_get(path_join("cache", clsrc)); \
2329                         debug_src = &clsrc; \
2330                 } \
2331                 if(!load_split_kernel(kernel_path, kernel_init_source, clbin, \
2332                                       build_options, \
2333                                       &GLUE(name, _program), \
2334                                       debug_src)) \
2335                 { \
2336                         fprintf(stderr, "Faled to compile %s\n", #name); \
2337                         return false; \
2338                 } \
2339         } while(false)
2340
2341                 LOAD_KERNEL(data_init);
2342                 LOAD_KERNEL(scene_intersect);
2343                 LOAD_KERNEL(lamp_emission);
2344                 LOAD_KERNEL(queue_enqueue);
2345                 LOAD_KERNEL(background_buffer_update);
2346                 LOAD_KERNEL(shader_eval);
2347                 LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao);
2348                 LOAD_KERNEL(direct_lighting);
2349                 LOAD_KERNEL(shadow_blocked);
2350                 LOAD_KERNEL(next_iteration_setup);
2351                 LOAD_KERNEL(sum_all_radiance);
2352
2353 #undef LOAD_KERNEL
2354
2355 #define FIND_KERNEL(name) \
2356         do { \
2357                 GLUE(ckPathTraceKernel_, name) = \
2358                         clCreateKernel(GLUE(name, _program), \
2359                                        "kernel_ocl_path_trace_"  #name, &ciErr); \
2360                 if(opencl_error(ciErr)) { \
2361                         fprintf(stderr,"Missing kernel kernel_ocl_path_trace_%s\n", #name); \
2362                         return false; \
2363                 } \
2364         } while(false)
2365
2366                 FIND_KERNEL(data_init);
2367                 FIND_KERNEL(scene_intersect);
2368                 FIND_KERNEL(lamp_emission);
2369                 FIND_KERNEL(queue_enqueue);
2370                 FIND_KERNEL(background_buffer_update);
2371                 FIND_KERNEL(shader_eval);
2372                 FIND_KERNEL(holdout_emission_blurring_pathtermination_ao);
2373                 FIND_KERNEL(direct_lighting);
2374                 FIND_KERNEL(shadow_blocked);
2375                 FIND_KERNEL(next_iteration_setup);
2376                 FIND_KERNEL(sum_all_radiance);
2377 #undef FIND_KERNEL
2378 #undef GLUE
2379
2380                 current_max_closure = requested_features.max_closure;
2381
2382                 return true;
2383         }
2384
2385         ~OpenCLDeviceSplitKernel()
2386         {
2387                 task_pool.stop();
2388
2389                 /* Release kernels */
2390                 release_kernel_safe(ckPathTraceKernel_data_init);
2391                 release_kernel_safe(ckPathTraceKernel_scene_intersect);
2392                 release_kernel_safe(ckPathTraceKernel_lamp_emission);
2393                 release_kernel_safe(ckPathTraceKernel_queue_enqueue);
2394                 release_kernel_safe(ckPathTraceKernel_background_buffer_update);
2395                 release_kernel_safe(ckPathTraceKernel_shader_eval);
2396                 release_kernel_safe(ckPathTraceKernel_holdout_emission_blurring_pathtermination_ao);
2397                 release_kernel_safe(ckPathTraceKernel_direct_lighting);
2398                 release_kernel_safe(ckPathTraceKernel_shadow_blocked);
2399                 release_kernel_safe(ckPathTraceKernel_next_iteration_setup);
2400                 release_kernel_safe(ckPathTraceKernel_sum_all_radiance);
2401
2402                 /* Release global memory */
2403                 release_mem_object_safe(P_sd);
2404                 release_mem_object_safe(P_sd_DL_shadow);
2405                 release_mem_object_safe(N_sd);
2406                 release_mem_object_safe(N_sd_DL_shadow);
2407                 release_mem_object_safe(Ng_sd);
2408                 release_mem_object_safe(Ng_sd_DL_shadow);
2409                 release_mem_object_safe(I_sd);
2410                 release_mem_object_safe(I_sd_DL_shadow);
2411                 release_mem_object_safe(shader_sd);
2412                 release_mem_object_safe(shader_sd_DL_shadow);
2413                 release_mem_object_safe(flag_sd);
2414                 release_mem_object_safe(flag_sd_DL_shadow);
2415                 release_mem_object_safe(prim_sd);
2416                 release_mem_object_safe(prim_sd_DL_shadow);
2417                 release_mem_object_safe(type_sd);
2418                 release_mem_object_safe(type_sd_DL_shadow);
2419                 release_mem_object_safe(u_sd);
2420                 release_mem_object_safe(u_sd_DL_shadow);
2421                 release_mem_object_safe(v_sd);
2422                 release_mem_object_safe(v_sd_DL_shadow);
2423                 release_mem_object_safe(object_sd);
2424                 release_mem_object_safe(object_sd_DL_shadow);
2425                 release_mem_object_safe(time_sd);
2426                 release_mem_object_safe(time_sd_DL_shadow);
2427                 release_mem_object_safe(ray_length_sd);
2428                 release_mem_object_safe(ray_length_sd_DL_shadow);
2429                 release_mem_object_safe(ray_depth_sd);
2430                 release_mem_object_safe(ray_depth_sd_DL_shadow);
2431                 release_mem_object_safe(transparent_depth_sd);
2432                 release_mem_object_safe(transparent_depth_sd_DL_shadow);
2433
2434                 /* Ray differentials. */
2435                 release_mem_object_safe(dP_sd);
2436                 release_mem_object_safe(dP_sd_DL_shadow);
2437                 release_mem_object_safe(dI_sd);
2438                 release_mem_object_safe(dI_sd_DL_shadow);
2439                 release_mem_object_safe(du_sd);
2440                 release_mem_object_safe(du_sd_DL_shadow);
2441                 release_mem_object_safe(dv_sd);
2442                 release_mem_object_safe(dv_sd_DL_shadow);
2443
2444                 /* Dp/Du */
2445                 release_mem_object_safe(dPdu_sd);
2446                 release_mem_object_safe(dPdu_sd_DL_shadow);
2447                 release_mem_object_safe(dPdv_sd);
2448                 release_mem_object_safe(dPdv_sd_DL_shadow);
2449
2450                 /* Object motion. */
2451                 release_mem_object_safe(ob_tfm_sd);
2452                 release_mem_object_safe(ob_itfm_sd);
2453
2454                 release_mem_object_safe(ob_tfm_sd_DL_shadow);
2455                 release_mem_object_safe(ob_itfm_sd_DL_shadow);
2456
2457                 release_mem_object_safe(closure_sd);
2458                 release_mem_object_safe(closure_sd_DL_shadow);
2459                 release_mem_object_safe(num_closure_sd);
2460                 release_mem_object_safe(num_closure_sd_DL_shadow);
2461                 release_mem_object_safe(randb_closure_sd);
2462                 release_mem_object_safe(randb_closure_sd_DL_shadow);
2463                 release_mem_object_safe(ray_P_sd);
2464                 release_mem_object_safe(ray_P_sd_DL_shadow);
2465                 release_mem_object_safe(ray_dP_sd);
2466                 release_mem_object_safe(ray_dP_sd_DL_shadow);
2467                 release_mem_object_safe(rng_coop);
2468                 release_mem_object_safe(throughput_coop);
2469                 release_mem_object_safe(L_transparent_coop);
2470                 release_mem_object_safe(PathRadiance_coop);
2471                 release_mem_object_safe(Ray_coop);
2472                 release_mem_object_safe(PathState_coop);
2473                 release_mem_object_safe(Intersection_coop);
2474                 release_mem_object_safe(kgbuffer);
2475                 release_mem_object_safe(sd);
2476                 release_mem_object_safe(sd_DL_shadow);
2477                 release_mem_object_safe(ray_state);
2478                 release_mem_object_safe(AOAlpha_coop);
2479                 release_mem_object_safe(AOBSDF_coop);
2480                 release_mem_object_safe(AOLightRay_coop);
2481                 release_mem_object_safe(BSDFEval_coop);
2482                 release_mem_object_safe(ISLamp_coop);
2483                 release_mem_object_safe(LightRay_coop);
2484                 release_mem_object_safe(Intersection_coop_AO);
2485                 release_mem_object_safe(Intersection_coop_DL);
2486 #ifdef WITH_CYCLES_DEBUG
2487                 release_mem_object_safe(debugdata_coop);
2488 #endif
2489                 release_mem_object_safe(use_queues_flag);
2490                 release_mem_object_safe(Queue_data);
2491                 release_mem_object_safe(Queue_index);
2492                 release_mem_object_safe(work_array);
2493 #ifdef __WORK_STEALING__
2494                 release_mem_object_safe(work_pool_wgs);
2495 #endif
2496                 release_mem_object_safe(per_sample_output_buffers);
2497
2498                 /* Release programs */
2499                 release_program_safe(data_init_program);
2500                 release_program_safe(scene_intersect_program);
2501                 release_program_safe(lamp_emission_program);
2502                 release_program_safe(queue_enqueue_program);
2503                 release_program_safe(background_buffer_update_program);
2504                 release_program_safe(shader_eval_program);
2505                 release_program_safe(holdout_emission_blurring_pathtermination_ao_program);
2506                 release_program_safe(direct_lighting_program);
2507                 release_program_safe(shadow_blocked_program);
2508                 release_program_safe(next_iteration_setup_program);
2509                 release_program_safe(sum_all_radiance_program);
2510
2511                 if(hostRayStateArray != NULL) {
2512                         free(hostRayStateArray);
2513                 }
2514         }
2515
2516         void path_trace(SplitRenderTile& rtile, int2 max_render_feasible_tile_size)
2517         {
2518                 /* cast arguments to cl types */
2519                 cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
2520                 cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
2521                 cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state);
2522                 cl_int d_x = rtile.x;
2523                 cl_int d_y = rtile.y;
2524                 cl_int d_w = rtile.w;
2525                 cl_int d_h = rtile.h;
2526                 cl_int d_offset = rtile.offset;
2527                 cl_int d_stride = rtile.stride;
2528
2529                 /* Make sure that set render feasible tile size is a multiple of local
2530                  * work size dimensions.
2531                  */
2532                 assert(max_render_feasible_tile_size.x % SPLIT_KERNEL_LOCAL_SIZE_X == 0);
2533                 assert(max_render_feasible_tile_size.y % SPLIT_KERNEL_LOCAL_SIZE_Y == 0);
2534
2535                 size_t global_size[2];
2536                 size_t local_size[2] = {SPLIT_KERNEL_LOCAL_SIZE_X,
2537                                         SPLIT_KERNEL_LOCAL_SIZE_Y};
2538
2539                 /* Set the range of samples to be processed for every ray in
2540                  * path-regeneration logic.
2541                  */
2542                 cl_int start_sample = rtile.start_sample;
2543                 cl_int end_sample = rtile.start_sample + rtile.num_samples;
2544                 cl_int num_samples = rtile.num_samples;
2545
2546 #ifdef __WORK_STEALING__
2547                 global_size[0] = (((d_w - 1) / local_size[0]) + 1) * local_size[0];
2548                 global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1];
2549                 unsigned int num_parallel_samples = 1;
2550 #else
2551                 global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1];
2552                 unsigned int num_threads = max_render_feasible_tile_size.x *
2553                                            max_render_feasible_tile_size.y;
2554                 unsigned int num_tile_columns_possible = num_threads / global_size[1];
2555                 /* Estimate number of parallel samples that can be
2556                  * processed in parallel.
2557                  */
2558                 unsigned int num_parallel_samples = min(num_tile_columns_possible / d_w,
2559                                                         rtile.num_samples);
2560                 /* Wavefront size in AMD is 64.
2561                  * TODO(sergey): What about other platforms?
2562                  */
2563                 if(num_parallel_samples >= 64) {
2564                         /* TODO(sergey): Could use generic round-up here. */
2565                         num_parallel_samples = (num_parallel_samples / 64) * 64;
2566                 }
2567                 assert(num_parallel_samples != 0);
2568
2569                 global_size[0] = d_w * num_parallel_samples;
2570 #endif  /* __WORK_STEALING__ */
2571
2572                 assert(global_size[0] * global_size[1] <=
2573                        max_render_feasible_tile_size.x * max_render_feasible_tile_size.y);
2574
2575                 /* Allocate all required global memory once. */
2576                 if(first_tile) {
2577                         size_t num_global_elements = max_render_feasible_tile_size.x *
2578                                                      max_render_feasible_tile_size.y;
2579                         /* TODO(sergey): This will actually over-allocate if
2580                          * particular kernel does not support multiclosure.
2581                          */
2582                         size_t ShaderClosure_size = get_shader_closure_size(current_max_closure);
2583
2584 #ifdef __WORK_STEALING__
2585                         /* Calculate max groups */
2586                         size_t max_global_size[2];
2587                         size_t tile_x = max_render_feasible_tile_size.x;
2588                         size_t tile_y = max_render_feasible_tile_size.y;
2589                         max_global_size[0] = (((tile_x - 1) / local_size[0]) + 1) * local_size[0];
2590                         max_global_size[1] = (((tile_y - 1) / local_size[1]) + 1) * local_size[1];
2591                         max_work_groups = (max_global_size[0] * max_global_size[1]) /
2592                                           (local_size[0] * local_size[1]);
2593                         /* Allocate work_pool_wgs memory. */
2594                         work_pool_wgs = mem_alloc(max_work_groups * sizeof(unsigned int));
2595 #endif  /* __WORK_STEALING__ */
2596
2597                         /* Allocate queue_index memory only once. */
2598                         Queue_index = mem_alloc(NUM_QUEUES * sizeof(int));
2599                         use_queues_flag = mem_alloc(sizeof(char));
2600                         kgbuffer = mem_alloc(get_KernelGlobals_size());
2601
2602                         /* Create global buffers for ShaderData. */
2603                         sd = mem_alloc(get_shaderdata_soa_size());
2604                         sd_DL_shadow = mem_alloc(get_shaderdata_soa_size());
2605                         P_sd = mem_alloc(num_global_elements * sizeof(float3));
2606                         P_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
2607                         N_sd = mem_alloc(num_global_elements * sizeof(float3));
2608                         N_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
2609                         Ng_sd = mem_alloc(num_global_elements * sizeof(float3));
2610                         Ng_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
2611                         I_sd = mem_alloc(num_global_elements * sizeof(float3));
2612                         I_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
2613                         shader_sd = mem_alloc(num_global_elements * sizeof(int));
2614                         shader_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
2615                         flag_sd = mem_alloc(num_global_elements * sizeof(int));
2616                         flag_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
2617                         prim_sd = mem_alloc(num_global_elements * sizeof(int));
2618                         prim_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
2619                         type_sd = mem_alloc(num_global_elements * sizeof(int));
2620                         type_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
2621                         u_sd = mem_alloc(num_global_elements * sizeof(float));
2622                         u_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float));
2623                         v_sd = mem_alloc(num_global_elements * sizeof(float));
2624                         v_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float));
2625                         object_sd = mem_alloc(num_global_elements * sizeof(int));
2626                         object_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
2627                         time_sd = mem_alloc(num_global_elements * sizeof(float));
2628                         time_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float));
2629                         ray_length_sd = mem_alloc(num_global_elements * sizeof(float));
2630                         ray_length_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float));
2631                         ray_depth_sd = mem_alloc(num_global_elements * sizeof(int));
2632                         ray_depth_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
2633                         transparent_depth_sd = mem_alloc(num_global_elements * sizeof(int));
2634                         transparent_depth_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
2635
2636                         /* Ray differentials. */
2637                         dP_sd = mem_alloc(num_global_elements * sizeof(differential3));
2638                         dP_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential3));
2639                         dI_sd = mem_alloc(num_global_elements * sizeof(differential3));
2640                         dI_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential3));
2641                         du_sd = mem_alloc(num_global_elements * sizeof(differential));
2642                         du_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential));
2643                         dv_sd = mem_alloc(num_global_elements * sizeof(differential));
2644                         dv_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential));
2645
2646                         /* Dp/Du */
2647                         dPdu_sd = mem_alloc(num_global_elements * sizeof(float3));
2648                         dPdu_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
2649                         dPdv_sd = mem_alloc(num_global_elements * sizeof(float3));
2650                         dPdv_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
2651
2652                         /* Object motion. */
2653                         ob_tfm_sd = mem_alloc(num_global_elements * sizeof(Transform));
2654                         ob_tfm_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(Transform));
2655                         ob_itfm_sd = mem_alloc(num_global_elements * sizeof(Transform));
2656                         ob_itfm_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(Transform));
2657
2658                         closure_sd = mem_alloc(num_global_elements * ShaderClosure_size);
2659                         closure_sd_DL_shadow = mem_alloc(num_global_elements * 2 * ShaderClosure_size);
2660                         num_closure_sd = mem_alloc(num_global_elements * sizeof(int));
2661                         num_closure_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
2662                         randb_closure_sd = mem_alloc(num_global_elements * sizeof(float));
2663                         randb_closure_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float));
2664                         ray_P_sd = mem_alloc(num_global_elements * sizeof(float3));
2665                         ray_P_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
2666                         ray_dP_sd = mem_alloc(num_global_elements * sizeof(differential3));
2667                         ray_dP_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential3));
2668
2669                         /* Creation of global memory buffers which are shared among
2670                          * the kernels.
2671                          */
2672                         rng_coop = mem_alloc(num_global_elements * sizeof(RNG));
2673                         throughput_coop = mem_alloc(num_global_elements * sizeof(float3));
2674                         L_transparent_coop = mem_alloc(num_global_elements * sizeof(float));
2675                         PathRadiance_coop = mem_alloc(num_global_elements * sizeof(PathRadiance));
2676                         Ray_coop = mem_alloc(num_global_elements * sizeof(Ray));
2677                         PathState_coop = mem_alloc(num_global_elements * sizeof(PathState));
2678                         Intersection_coop = mem_alloc(num_global_elements * sizeof(Intersection));
2679                         AOAlpha_coop = mem_alloc(num_global_elements * sizeof(float3));
2680                         AOBSDF_coop = mem_alloc(num_global_elements * sizeof(float3));
2681                         AOLightRay_coop = mem_alloc(num_global_elements * sizeof(Ray));
2682                         BSDFEval_coop = mem_alloc(num_global_elements * sizeof(BsdfEval));
2683                         ISLamp_coop = mem_alloc(num_global_elements * sizeof(int));
2684                         LightRay_coop = mem_alloc(num_global_elements * sizeof(Ray));
2685                         Intersection_coop_AO = mem_alloc(num_global_elements * sizeof(Intersection));
2686                         Intersection_coop_DL = mem_alloc(num_global_elements * sizeof(Intersection));
2687
2688 #ifdef WITH_CYCLES_DEBUG
2689                         debugdata_coop = mem_alloc(num_global_elements * sizeof(DebugData));
2690 #endif
2691
2692                         ray_state = mem_alloc(num_global_elements * sizeof(char));
2693
2694                         hostRayStateArray = (char *)calloc(num_global_elements, sizeof(char));
2695                         assert(hostRayStateArray != NULL && "Can't create hostRayStateArray memory");
2696
2697                         Queue_data = mem_alloc(num_global_elements * (NUM_QUEUES * sizeof(int)+sizeof(int)));
2698                         work_array = mem_alloc(num_global_elements * sizeof(unsigned int));
2699                         per_sample_output_buffers = mem_alloc(num_global_elements *
2700                                                               per_thread_output_buffer_size);
2701                 }
2702
2703                 cl_int dQueue_size = global_size[0] * global_size[1];
2704                 cl_int total_num_rays = global_size[0] * global_size[1];
2705
2706                 cl_uint start_arg_index =
2707                         kernel_set_args(ckPathTraceKernel_data_init,
2708                                         0,
2709                                         kgbuffer,
2710                                         sd,
2711                                         sd_DL_shadow,
2712                                         P_sd,
2713                                         P_sd_DL_shadow,
2714                                         N_sd,
2715                                         N_sd_DL_shadow,
2716                                         Ng_sd,
2717                                         Ng_sd_DL_shadow,
2718                                         I_sd,
2719                                         I_sd_DL_shadow,
2720                                         shader_sd,
2721                                         shader_sd_DL_shadow,
2722                                         flag_sd,
2723                                         flag_sd_DL_shadow,
2724                                         prim_sd,
2725                                         prim_sd_DL_shadow,
2726                                         type_sd,
2727                                         type_sd_DL_shadow,
2728                                         u_sd,
2729                                         u_sd_DL_shadow,
2730                                         v_sd,
2731                                         v_sd_DL_shadow,
2732                                         object_sd,
2733                                         object_sd_DL_shadow,
2734                                         time_sd,
2735                                         time_sd_DL_shadow,
2736                                         ray_length_sd,
2737                                         ray_length_sd_DL_shadow,
2738                                         ray_depth_sd,
2739                                         ray_depth_sd_DL_shadow,
2740                                         transparent_depth_sd,
2741                                         transparent_depth_sd_DL_shadow);
2742
2743                 /* Ray differentials. */
2744                 start_arg_index +=
2745                         kernel_set_args(ckPathTraceKernel_data_init,
2746                                         start_arg_index,
2747                                         dP_sd,
2748                                         dP_sd_DL_shadow,
2749                                         dI_sd,
2750                                         dI_sd_DL_shadow,
2751                                         du_sd,
2752                                         du_sd_DL_shadow,
2753                                         dv_sd,
2754                                         dv_sd_DL_shadow);
2755
2756                 /* Dp/Du */
2757                 start_arg_index +=
2758                         kernel_set_args(ckPathTraceKernel_data_init,
2759                                         start_arg_index,
2760                                         dPdu_sd,
2761                                         dPdu_sd_DL_shadow,
2762                                         dPdv_sd,
2763                                         dPdv_sd_DL_shadow);
2764
2765                 /* Object motion. */
2766                 start_arg_index +=
2767                         kernel_set_args(ckPathTraceKernel_data_init,
2768                                         start_arg_index,
2769                                         ob_tfm_sd,
2770                                         ob_tfm_sd_DL_shadow,
2771                                         ob_itfm_sd,
2772                                         ob_itfm_sd_DL_shadow);
2773
2774                 start_arg_index +=
2775                         kernel_set_args(ckPathTraceKernel_data_init,
2776                                         start_arg_index,
2777                                         closure_sd,
2778                                         closure_sd_DL_shadow,
2779                                         num_closure_sd,
2780                                         num_closure_sd_DL_shadow,
2781                                         randb_closure_sd,
2782                                         randb_closure_sd_DL_shadow,
2783                                         ray_P_sd,
2784                                         ray_P_sd_DL_shadow,
2785                                         ray_dP_sd,
2786                                         ray_dP_sd_DL_shadow,
2787                                         d_data,
2788                                         per_sample_output_buffers,
2789                                         d_rng_state,
2790                                         rng_coop,
2791                                         throughput_coop,
2792                                         L_transparent_coop,
2793                                         PathRadiance_coop,
2794                                         Ray_coop,
2795                                         PathState_coop,
2796                                         ray_state);
2797
2798 /* TODO(segrey): Avoid map lookup here. */
2799 #define KERNEL_TEX(type, ttype, name) \
2800         set_kernel_arg_mem(ckPathTraceKernel_data_init, &start_arg_index, #name);
2801 #include "kernel_textures.h"
2802 #undef KERNEL_TEX
2803
2804                 start_arg_index +=
2805                         kernel_set_args(ckPathTraceKernel_data_init,
2806                                         start_arg_index,
2807                                         start_sample,
2808                                         d_x,
2809                                         d_y,
2810                                         d_w,
2811                                         d_h,
2812                                         d_offset,
2813                                         d_stride,
2814                                         rtile.rng_state_offset_x,
2815                                         rtile.rng_state_offset_y,
2816                                         rtile.buffer_rng_state_stride,
2817                                         Queue_data,
2818                                         Queue_index,
2819                                         dQueue_size,
2820                                         use_queues_flag,
2821                                         work_array,
2822 #ifdef __WORK_STEALING__
2823                                         work_pool_wgs,
2824                                         num_samples,
2825 #endif
2826 #ifdef WITH_CYCLES_DEBUG
2827                                         debugdata_coop,
2828 #endif
2829                                         num_parallel_samples);
2830
2831                 kernel_set_args(ckPathTraceKernel_scene_intersect,
2832                                 0,
2833                                 kgbuffer,
2834                                 d_data,
2835                                 rng_coop,
2836                                 Ray_coop,
2837                                 PathState_coop,
2838                                 Intersection_coop,
2839                                 ray_state,
2840                                 d_w,
2841                                 d_h,
2842                                 Queue_data,
2843                                 Queue_index,
2844                                 dQueue_size,
2845                                 use_queues_flag,
2846 #ifdef WITH_CYCLES_DEBUG
2847                                 debugdata_coop,
2848 #endif
2849                                 num_parallel_samples);
2850
2851                 kernel_set_args(ckPathTraceKernel_lamp_emission,
2852                                 0,
2853                                 kgbuffer,
2854                                 d_data,
2855                                 sd,
2856                                 throughput_coop,
2857                                 PathRadiance_coop,
2858                                 Ray_coop,
2859                                 PathState_coop,
2860                                 Intersection_coop,
2861                                 ray_state,
2862                                 d_w,
2863                                 d_h,
2864                                 Queue_data,
2865                                 Queue_index,
2866                                 dQueue_size,
2867                                 use_queues_flag,
2868                                 num_parallel_samples);
2869
2870                 kernel_set_args(ckPathTraceKernel_queue_enqueue,
2871                                 0,
2872                                 Queue_data,
2873                                 Queue_index,
2874                                 ray_state,
2875                                 dQueue_size);
2876
2877                 kernel_set_args(ckPathTraceKernel_background_buffer_update,
2878                                  0,
2879                                  kgbuffer,
2880                                  d_data,
2881                                  sd,
2882                                  per_sample_output_buffers,
2883                                  d_rng_state,
2884                                  rng_coop,
2885                                  throughput_coop,
2886                                  PathRadiance_coop,
2887                                  Ray_coop,
2888          &n