Cycles: Remove few function arguments needed only for the split kernel
[blender.git] / intern / cycles / device / device_opencl.cpp
1 /*
2  * Copyright 2011-2013 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #ifdef WITH_OPENCL
18
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22
23 #include "clew.h"
24
25 #include "device.h"
26 #include "device_intern.h"
27
28 #include "buffers.h"
29
30 #include "util_debug.h"
31 #include "util_foreach.h"
32 #include "util_logging.h"
33 #include "util_map.h"
34 #include "util_math.h"
35 #include "util_md5.h"
36 #include "util_opengl.h"
37 #include "util_path.h"
38 #include "util_time.h"
39
40 CCL_NAMESPACE_BEGIN
41
42 #define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p))
43
44 /* Macro declarations used with split kernel */
45
46 /* Macro to enable/disable work-stealing */
47 #define __WORK_STEALING__
48
49 #define SPLIT_KERNEL_LOCAL_SIZE_X 64
50 #define SPLIT_KERNEL_LOCAL_SIZE_Y 1
51
52 /* This value may be tuned according to the scene we are rendering.
53  *
54  * Modifying PATH_ITER_INC_FACTOR value proportional to number of expected
55  * ray-bounces will improve performance.
56  */
57 #define PATH_ITER_INC_FACTOR 8
58
59 /* When allocate global memory in chunks. We may not be able to
60  * allocate exactly "CL_DEVICE_MAX_MEM_ALLOC_SIZE" bytes in chunks;
61  * Since some bytes may be needed for aligning chunks of memory;
62  * This is the amount of memory that we dedicate for that purpose.
63  */
64 #define DATA_ALLOCATION_MEM_FACTOR 5000000 //5MB
65
66 struct OpenCLPlatformDevice {
67         OpenCLPlatformDevice(cl_platform_id platform_id,
68                              const string& platform_name,
69                              cl_device_id device_id,
70                              cl_device_type device_type,
71                              const string& device_name)
72           : platform_id(platform_id),
73             platform_name(platform_name),
74             device_id(device_id),
75             device_type(device_type),
76             device_name(device_name) {}
77         cl_platform_id platform_id;
78         string platform_name;
79         cl_device_id device_id;
80         cl_device_type device_type;
81         string device_name;
82 };
83
84 namespace {
85
86 cl_device_type opencl_device_type()
87 {
88         switch(DebugFlags().opencl.device_type)
89         {
90                 case DebugFlags::OpenCL::DEVICE_NONE:
91                         return 0;
92                 case DebugFlags::OpenCL::DEVICE_ALL:
93                         return CL_DEVICE_TYPE_ALL;
94                 case DebugFlags::OpenCL::DEVICE_DEFAULT:
95                         return CL_DEVICE_TYPE_DEFAULT;
96                 case DebugFlags::OpenCL::DEVICE_CPU:
97                         return CL_DEVICE_TYPE_CPU;
98                 case DebugFlags::OpenCL::DEVICE_GPU:
99                         return CL_DEVICE_TYPE_GPU;
100                 case DebugFlags::OpenCL::DEVICE_ACCELERATOR:
101                         return CL_DEVICE_TYPE_ACCELERATOR;
102                 default:
103                         return CL_DEVICE_TYPE_ALL;
104         }
105 }
106
107 inline bool opencl_kernel_use_debug()
108 {
109         return DebugFlags().opencl.debug;
110 }
111
112 bool opencl_kernel_use_advanced_shading(const string& platform)
113 {
114         /* keep this in sync with kernel_types.h! */
115         if(platform == "NVIDIA CUDA")
116                 return true;
117         else if(platform == "Apple")
118                 return true;
119         else if(platform == "AMD Accelerated Parallel Processing")
120                 return true;
121         else if(platform == "Intel(R) OpenCL")
122                 return true;
123         /* Make sure officially unsupported OpenCL platforms
124          * does not set up to use advanced shading.
125          */
126         return false;
127 }
128
129 bool opencl_kernel_use_split(const string& platform_name,
130                              const cl_device_type device_type)
131 {
132         if(DebugFlags().opencl.kernel_type == DebugFlags::OpenCL::KERNEL_SPLIT) {
133                 VLOG(1) << "Forcing split kernel to use.";
134                 return true;
135         }
136         if(DebugFlags().opencl.kernel_type == DebugFlags::OpenCL::KERNEL_MEGA) {
137                 VLOG(1) << "Forcing mega kernel to use.";
138                 return false;
139         }
140         /* TODO(sergey): Replace string lookups with more enum-like API,
141          * similar to device/vendor checks blender's gpu.
142          */
143         if(platform_name == "AMD Accelerated Parallel Processing" &&
144            device_type == CL_DEVICE_TYPE_GPU)
145         {
146                 return true;
147         }
148         return false;
149 }
150
151 bool opencl_device_supported(const string& platform_name,
152                              const cl_device_id device_id)
153 {
154         cl_device_type device_type;
155         clGetDeviceInfo(device_id,
156                         CL_DEVICE_TYPE,
157                         sizeof(cl_device_type),
158                         &device_type,
159                         NULL);
160         if(platform_name == "AMD Accelerated Parallel Processing" &&
161            device_type == CL_DEVICE_TYPE_GPU)
162         {
163                 return true;
164         }
165         if(platform_name == "Apple" && device_type == CL_DEVICE_TYPE_GPU) {
166                 return true;
167         }
168         return false;
169 }
170
171 bool opencl_platform_version_check(cl_platform_id platform,
172                                    string *error = NULL)
173 {
174         const int req_major = 1, req_minor = 1;
175         int major, minor;
176         char version[256];
177         clGetPlatformInfo(platform,
178                           CL_PLATFORM_VERSION,
179                           sizeof(version),
180                           &version,
181                           NULL);
182         if(sscanf(version, "OpenCL %d.%d", &major, &minor) < 2) {
183                 if(error != NULL) {
184                         *error = string_printf("OpenCL: failed to parse platform version string (%s).", version);
185                 }
186                 return false;
187         }
188         if(!((major == req_major && minor >= req_minor) || (major > req_major))) {
189                 if(error != NULL) {
190                         *error = string_printf("OpenCL: platform version 1.1 or later required, found %d.%d", major, minor);
191                 }
192                 return false;
193         }
194         if(error != NULL) {
195                 *error = "";
196         }
197         return true;
198 }
199
200 bool opencl_device_version_check(cl_device_id device,
201                                  string *error = NULL)
202 {
203         const int req_major = 1, req_minor = 1;
204         int major, minor;
205         char version[256];
206         clGetDeviceInfo(device,
207                         CL_DEVICE_OPENCL_C_VERSION,
208                         sizeof(version),
209                         &version,
210                         NULL);
211         if(sscanf(version, "OpenCL C %d.%d", &major, &minor) < 2) {
212                 if(error != NULL) {
213                         *error = string_printf("OpenCL: failed to parse OpenCL C version string (%s).", version);
214                 }
215                 return false;
216         }
217         if(!((major == req_major && minor >= req_minor) || (major > req_major))) {
218                 if(error != NULL) {
219                         *error = string_printf("OpenCL: C version 1.1 or later required, found %d.%d", major, minor);
220                 }
221                 return false;
222         }
223         if(error != NULL) {
224                 *error = "";
225         }
226         return true;
227 }
228
229 void opencl_get_usable_devices(vector<OpenCLPlatformDevice> *usable_devices)
230 {
231         const bool force_all_platforms =
232                 (DebugFlags().opencl.kernel_type != DebugFlags::OpenCL::KERNEL_DEFAULT);
233         const cl_device_type device_type = opencl_device_type();
234         static bool first_time = true;
235 #define FIRST_VLOG(severity) if(first_time) VLOG(severity)
236
237         usable_devices->clear();
238
239         if(device_type == 0) {
240                 FIRST_VLOG(2) << "OpenCL devices are forced to be disabled.";
241                 first_time = false;
242                 return;
243         }
244
245         vector<cl_device_id> device_ids;
246         cl_uint num_devices = 0;
247         vector<cl_platform_id> platform_ids;
248         cl_uint num_platforms = 0;
249
250         /* Get devices. */
251         if(clGetPlatformIDs(0, NULL, &num_platforms) != CL_SUCCESS ||
252            num_platforms == 0)
253         {
254                 FIRST_VLOG(2) << "No OpenCL platforms were found.";
255                 first_time = false;
256                 return;
257         }
258         platform_ids.resize(num_platforms);
259         if(clGetPlatformIDs(num_platforms, &platform_ids[0], NULL) != CL_SUCCESS) {
260                 FIRST_VLOG(2) << "Failed to fetch platform IDs from the driver..";
261                 first_time = false;
262                 return;
263         }
264         /* Devices are numbered consecutively across platforms. */
265         for(int platform = 0; platform < num_platforms; platform++) {
266                 cl_platform_id platform_id = platform_ids[platform];
267                 char pname[256];
268                 if(clGetPlatformInfo(platform_id,
269                                      CL_PLATFORM_NAME,
270                                      sizeof(pname),
271                                      &pname,
272                                      NULL) != CL_SUCCESS)
273                 {
274                         FIRST_VLOG(2) << "Failed to get platform name, ignoring.";
275                         continue;
276                 }
277                 string platform_name = pname;
278                 FIRST_VLOG(2) << "Enumerating devices for platform "
279                               << platform_name << ".";
280                 if(!opencl_platform_version_check(platform_id)) {
281                         FIRST_VLOG(2) << "Ignoring platform " << platform_name
282                                       << " due to too old compiler version.";
283                         continue;
284                 }
285                 num_devices = 0;
286                 if(clGetDeviceIDs(platform_id,
287                                   device_type,
288                                   0,
289                                   NULL,
290                                   &num_devices) != CL_SUCCESS || num_devices == 0)
291                 {
292                         FIRST_VLOG(2) << "Ignoring platform " << platform_name
293                                       << ", failed to fetch number of devices.";
294                         continue;
295                 }
296                 device_ids.resize(num_devices);
297                 if(clGetDeviceIDs(platform_id,
298                                   device_type,
299                                   num_devices,
300                                   &device_ids[0],
301                                   NULL) != CL_SUCCESS)
302                 {
303                         FIRST_VLOG(2) << "Ignoring platform " << platform_name
304                                       << ", failed to fetch devices list.";
305                         continue;
306                 }
307                 for(int num = 0; num < num_devices; num++) {
308                         cl_device_id device_id = device_ids[num];
309                         char device_name[1024] = "\0";
310                         if(clGetDeviceInfo(device_id,
311                                            CL_DEVICE_NAME,
312                                            sizeof(device_name),
313                                            &device_name,
314                                            NULL) != CL_SUCCESS)
315                         {
316                                 FIRST_VLOG(2) << "Failed to fetch device name, ignoring.";
317                                 continue;
318                         }
319                         if(!opencl_device_version_check(device_id)) {
320                                 FIRST_VLOG(2) << "Ignoring device " << device_name
321                                               << " due to old compiler version.";
322                                 continue;
323                         }
324                         if(force_all_platforms ||
325                            opencl_device_supported(platform_name, device_id))
326                         {
327                                 cl_device_type device_type;
328                                 if(clGetDeviceInfo(device_id,
329                                                    CL_DEVICE_TYPE,
330                                                    sizeof(cl_device_type),
331                                                    &device_type,
332                                                    NULL) != CL_SUCCESS)
333                                 {
334                                         FIRST_VLOG(2) << "Ignoring device " << device_name
335                                                       << ", failed to fetch device type.";
336                                         continue;
337                                 }
338                                 FIRST_VLOG(2) << "Adding new device " << device_name << ".";
339                                 usable_devices->push_back(OpenCLPlatformDevice(platform_id,
340                                                                                platform_name,
341                                                                                device_id,
342                                                                                device_type,
343                                                                                device_name));
344                         }
345                         else {
346                                 FIRST_VLOG(2) << "Ignoring device " << device_name
347                                               << ", not officially supported yet.";
348                         }
349                 }
350         }
351         first_time = false;
352 }
353
354 }  /* namespace */
355
356 /* Thread safe cache for contexts and programs.
357  *
358  * TODO(sergey): Make it more generous, so it can contain any type of program
359  * without hardcoding possible program types in the slot.
360  */
361 class OpenCLCache
362 {
363         struct Slot
364         {
365                 thread_mutex *mutex;
366                 cl_context context;
367                 /* cl_program for shader, bake, film_convert kernels (used in OpenCLDeviceBase) */
368                 cl_program ocl_dev_base_program;
369                 /* cl_program for megakernel (used in OpenCLDeviceMegaKernel) */
370                 cl_program ocl_dev_megakernel_program;
371
372                 Slot() : mutex(NULL),
373                          context(NULL),
374                          ocl_dev_base_program(NULL),
375                          ocl_dev_megakernel_program(NULL) {}
376
377                 Slot(const Slot& rhs)
378                     : mutex(rhs.mutex),
379                       context(rhs.context),
380                       ocl_dev_base_program(rhs.ocl_dev_base_program),
381                       ocl_dev_megakernel_program(rhs.ocl_dev_megakernel_program)
382                 {
383                         /* copy can only happen in map insert, assert that */
384                         assert(mutex == NULL);
385                 }
386
387                 ~Slot()
388                 {
389                         delete mutex;
390                         mutex = NULL;
391                 }
392         };
393
394         /* key is combination of platform ID and device ID */
395         typedef pair<cl_platform_id, cl_device_id> PlatformDevicePair;
396
397         /* map of Slot objects */
398         typedef map<PlatformDevicePair, Slot> CacheMap;
399         CacheMap cache;
400
401         thread_mutex cache_lock;
402
403         /* lazy instantiate */
404         static OpenCLCache &global_instance()
405         {
406                 static OpenCLCache instance;
407                 return instance;
408         }
409
410         OpenCLCache()
411         {
412         }
413
414         ~OpenCLCache()
415         {
416                 /* Intel OpenCL bug raises SIGABRT due to pure virtual call
417                  * so this is disabled. It's not necessary to free objects
418                  * at process exit anyway.
419                  * http://software.intel.com/en-us/forums/topic/370083#comments */
420
421                 //flush();
422         }
423
424         /* lookup something in the cache. If this returns NULL, slot_locker
425          * will be holding a lock for the cache. slot_locker should refer to a
426          * default constructed thread_scoped_lock */
427         template<typename T>
428         static T get_something(cl_platform_id platform,
429                                cl_device_id device,
430                                T Slot::*member,
431                                thread_scoped_lock& slot_locker)
432         {
433                 assert(platform != NULL);
434
435                 OpenCLCache& self = global_instance();
436
437                 thread_scoped_lock cache_lock(self.cache_lock);
438
439                 pair<CacheMap::iterator,bool> ins = self.cache.insert(
440                         CacheMap::value_type(PlatformDevicePair(platform, device), Slot()));
441
442                 Slot &slot = ins.first->second;
443
444                 /* create slot lock only while holding cache lock */
445                 if(!slot.mutex)
446                         slot.mutex = new thread_mutex;
447
448                 /* need to unlock cache before locking slot, to allow store to complete */
449                 cache_lock.unlock();
450
451                 /* lock the slot */
452                 slot_locker = thread_scoped_lock(*slot.mutex);
453
454                 /* If the thing isn't cached */
455                 if(slot.*member == NULL) {
456                         /* return with the caller's lock holder holding the slot lock */
457                         return NULL;
458                 }
459
460                 /* the item was already cached, release the slot lock */
461                 slot_locker.unlock();
462
463                 return slot.*member;
464         }
465
466         /* store something in the cache. you MUST have tried to get the item before storing to it */
467         template<typename T>
468         static void store_something(cl_platform_id platform,
469                                     cl_device_id device,
470                                     T thing,
471                                     T Slot::*member,
472                                     thread_scoped_lock& slot_locker)
473         {
474                 assert(platform != NULL);
475                 assert(device != NULL);
476                 assert(thing != NULL);
477
478                 OpenCLCache &self = global_instance();
479
480                 thread_scoped_lock cache_lock(self.cache_lock);
481                 CacheMap::iterator i = self.cache.find(PlatformDevicePair(platform, device));
482                 cache_lock.unlock();
483
484                 Slot &slot = i->second;
485
486                 /* sanity check */
487                 assert(i != self.cache.end());
488                 assert(slot.*member == NULL);
489
490                 slot.*member = thing;
491
492                 /* unlock the slot */
493                 slot_locker.unlock();
494         }
495
496 public:
497
498         enum ProgramName {
499                 OCL_DEV_BASE_PROGRAM,
500                 OCL_DEV_MEGAKERNEL_PROGRAM,
501         };
502
503         /* see get_something comment */
504         static cl_context get_context(cl_platform_id platform,
505                                       cl_device_id device,
506                                       thread_scoped_lock& slot_locker)
507         {
508                 cl_context context = get_something<cl_context>(platform,
509                                                                device,
510                                                                &Slot::context,
511                                                                slot_locker);
512
513                 if(!context)
514                         return NULL;
515
516                 /* caller is going to release it when done with it, so retain it */
517                 cl_int ciErr = clRetainContext(context);
518                 assert(ciErr == CL_SUCCESS);
519                 (void)ciErr;
520
521                 return context;
522         }
523
524         /* see get_something comment */
525         static cl_program get_program(cl_platform_id platform,
526                                       cl_device_id device,
527                                       ProgramName program_name,
528                                       thread_scoped_lock& slot_locker)
529         {
530                 cl_program program = NULL;
531
532                 switch(program_name) {
533                         case OCL_DEV_BASE_PROGRAM:
534                                 /* Get program related to OpenCLDeviceBase */
535                                 program = get_something<cl_program>(platform,
536                                                                     device,
537                                                                     &Slot::ocl_dev_base_program,
538                                                                     slot_locker);
539                                 break;
540                         case OCL_DEV_MEGAKERNEL_PROGRAM:
541                                 /* Get program related to megakernel */
542                                 program = get_something<cl_program>(platform,
543                                                                     device,
544                                                                     &Slot::ocl_dev_megakernel_program,
545                                                                     slot_locker);
546                                 break;
547                 default:
548                         assert(!"Invalid program name");
549                 }
550
551                 if(!program)
552                         return NULL;
553
554                 /* caller is going to release it when done with it, so retain it */
555                 cl_int ciErr = clRetainProgram(program);
556                 assert(ciErr == CL_SUCCESS);
557                 (void)ciErr;
558
559                 return program;
560         }
561
562         /* see store_something comment */
563         static void store_context(cl_platform_id platform,
564                                   cl_device_id device,
565                                   cl_context context,
566                                   thread_scoped_lock& slot_locker)
567         {
568                 store_something<cl_context>(platform,
569                                             device,
570                                             context,
571                                             &Slot::context,
572                                             slot_locker);
573
574                 /* increment reference count in OpenCL.
575                  * The caller is going to release the object when done with it. */
576                 cl_int ciErr = clRetainContext(context);
577                 assert(ciErr == CL_SUCCESS);
578                 (void)ciErr;
579         }
580
581         /* see store_something comment */
582         static void store_program(cl_platform_id platform,
583                                   cl_device_id device,
584                                   cl_program program,
585                                   ProgramName program_name,
586                                   thread_scoped_lock& slot_locker)
587         {
588                 switch(program_name) {
589                         case OCL_DEV_BASE_PROGRAM:
590                                 store_something<cl_program>(platform,
591                                                             device,
592                                                             program,
593                                                             &Slot::ocl_dev_base_program,
594                                                             slot_locker);
595                                 break;
596                         case OCL_DEV_MEGAKERNEL_PROGRAM:
597                                 store_something<cl_program>(platform,
598                                                             device,
599                                                             program,
600                                                             &Slot::ocl_dev_megakernel_program,
601                                                             slot_locker);
602                                 break;
603                         default:
604                                 assert(!"Invalid program name\n");
605                                 return;
606                 }
607
608                 /* Increment reference count in OpenCL.
609                  * The caller is going to release the object when done with it.
610                  */
611                 cl_int ciErr = clRetainProgram(program);
612                 assert(ciErr == CL_SUCCESS);
613                 (void)ciErr;
614         }
615
616         /* Discard all cached contexts and programs.  */
617         static void flush()
618         {
619                 OpenCLCache &self = global_instance();
620                 thread_scoped_lock cache_lock(self.cache_lock);
621
622                 foreach(CacheMap::value_type &item, self.cache) {
623                         if(item.second.ocl_dev_base_program != NULL)
624                                 clReleaseProgram(item.second.ocl_dev_base_program);
625                         if(item.second.ocl_dev_megakernel_program != NULL)
626                                 clReleaseProgram(item.second.ocl_dev_megakernel_program);
627                         if(item.second.context != NULL)
628                                 clReleaseContext(item.second.context);
629                 }
630
631                 self.cache.clear();
632         }
633 };
634
635 class OpenCLDeviceBase : public Device
636 {
637 public:
638         DedicatedTaskPool task_pool;
639         cl_context cxContext;
640         cl_command_queue cqCommandQueue;
641         cl_platform_id cpPlatform;
642         cl_device_id cdDevice;
643         cl_program cpProgram;
644         cl_kernel ckFilmConvertByteKernel;
645         cl_kernel ckFilmConvertHalfFloatKernel;
646         cl_kernel ckShaderKernel;
647         cl_kernel ckBakeKernel;
648         cl_int ciErr;
649
650         typedef map<string, device_vector<uchar>*> ConstMemMap;
651         typedef map<string, device_ptr> MemMap;
652
653         ConstMemMap const_mem_map;
654         MemMap mem_map;
655         device_ptr null_mem;
656
657         bool device_initialized;
658         string platform_name;
659
660         bool opencl_error(cl_int err)
661         {
662                 if(err != CL_SUCCESS) {
663                         string message = string_printf("OpenCL error (%d): %s", err, clewErrorString(err));
664                         if(error_msg == "")
665                                 error_msg = message;
666                         fprintf(stderr, "%s\n", message.c_str());
667                         return true;
668                 }
669
670                 return false;
671         }
672
673         void opencl_error(const string& message)
674         {
675                 if(error_msg == "")
676                         error_msg = message;
677                 fprintf(stderr, "%s\n", message.c_str());
678         }
679
680 #define opencl_assert(stmt) \
681         { \
682                 cl_int err = stmt; \
683                 \
684                 if(err != CL_SUCCESS) { \
685                         string message = string_printf("OpenCL error: %s in %s", clewErrorString(err), #stmt); \
686                         if(error_msg == "") \
687                                 error_msg = message; \
688                         fprintf(stderr, "%s\n", message.c_str()); \
689                 } \
690         } (void)0
691
692         void opencl_assert_err(cl_int err, const char* where)
693         {
694                 if(err != CL_SUCCESS) {
695                         string message = string_printf("OpenCL error (%d): %s in %s", err, clewErrorString(err), where);
696                         if(error_msg == "")
697                                 error_msg = message;
698                         fprintf(stderr, "%s\n", message.c_str());
699 #ifndef NDEBUG
700                         abort();
701 #endif
702                 }
703         }
704
705         OpenCLDeviceBase(DeviceInfo& info, Stats &stats, bool background_)
706         : Device(info, stats, background_)
707         {
708                 cpPlatform = NULL;
709                 cdDevice = NULL;
710                 cxContext = NULL;
711                 cqCommandQueue = NULL;
712                 cpProgram = NULL;
713                 ckFilmConvertByteKernel = NULL;
714                 ckFilmConvertHalfFloatKernel = NULL;
715                 ckShaderKernel = NULL;
716                 ckBakeKernel = NULL;
717                 null_mem = 0;
718                 device_initialized = false;
719
720                 vector<OpenCLPlatformDevice> usable_devices;
721                 opencl_get_usable_devices(&usable_devices);
722                 if(usable_devices.size() == 0) {
723                         opencl_error("OpenCL: no devices found.");
724                         return;
725                 }
726                 assert(info.num < usable_devices.size());
727                 OpenCLPlatformDevice& platform_device = usable_devices[info.num];
728                 cpPlatform = platform_device.platform_id;
729                 cdDevice = platform_device.device_id;
730                 platform_name = platform_device.platform_name;
731                 VLOG(2) << "Creating new Cycles device for OpenCL platform "
732                         << platform_name << ", device "
733                         << platform_device.device_name << ".";
734
735                 {
736                         /* try to use cached context */
737                         thread_scoped_lock cache_locker;
738                         cxContext = OpenCLCache::get_context(cpPlatform, cdDevice, cache_locker);
739
740                         if(cxContext == NULL) {
741                                 /* create context properties array to specify platform */
742                                 const cl_context_properties context_props[] = {
743                                         CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform,
744                                         0, 0
745                                 };
746
747                                 /* create context */
748                                 cxContext = clCreateContext(context_props, 1, &cdDevice,
749                                         context_notify_callback, cdDevice, &ciErr);
750
751                                 if(opencl_error(ciErr)) {
752                                         opencl_error("OpenCL: clCreateContext failed");
753                                         return;
754                                 }
755
756                                 /* cache it */
757                                 OpenCLCache::store_context(cpPlatform, cdDevice, cxContext, cache_locker);
758                         }
759                 }
760
761                 cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr);
762                 if(opencl_error(ciErr))
763                         return;
764
765                 null_mem = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_ONLY, 1, NULL, &ciErr);
766                 if(opencl_error(ciErr))
767                         return;
768
769                 fprintf(stderr, "Device init success\n");
770                 device_initialized = true;
771         }
772
773         static void CL_CALLBACK context_notify_callback(const char *err_info,
774                 const void * /*private_info*/, size_t /*cb*/, void *user_data)
775         {
776                 char name[256];
777                 clGetDeviceInfo((cl_device_id)user_data, CL_DEVICE_NAME, sizeof(name), &name, NULL);
778
779                 fprintf(stderr, "OpenCL error (%s): %s\n", name, err_info);
780         }
781
782         bool opencl_version_check()
783         {
784                 string error;
785                 if(!opencl_platform_version_check(cpPlatform, &error)) {
786                         opencl_error(error);
787                         return false;
788                 }
789                 if(!opencl_device_version_check(cdDevice, &error)) {
790                         opencl_error(error);
791                         return false;
792                 }
793                 return true;
794         }
795
796         bool load_binary(const string& /*kernel_path*/,
797                          const string& clbin,
798                          string custom_kernel_build_options,
799                          cl_program *program,
800                          const string *debug_src = NULL)
801         {
802                 /* read binary into memory */
803                 vector<uint8_t> binary;
804
805                 if(!path_read_binary(clbin, binary)) {
806                         opencl_error(string_printf("OpenCL failed to read cached binary %s.", clbin.c_str()));
807                         return false;
808                 }
809
810                 /* create program */
811                 cl_int status;
812                 size_t size = binary.size();
813                 const uint8_t *bytes = &binary[0];
814
815                 *program = clCreateProgramWithBinary(cxContext, 1, &cdDevice,
816                         &size, &bytes, &status, &ciErr);
817
818                 if(opencl_error(status) || opencl_error(ciErr)) {
819                         opencl_error(string_printf("OpenCL failed create program from cached binary %s.", clbin.c_str()));
820                         return false;
821                 }
822
823                 if(!build_kernel(program, custom_kernel_build_options, debug_src))
824                         return false;
825
826                 return true;
827         }
828
829         bool save_binary(cl_program *program, const string& clbin)
830         {
831                 size_t size = 0;
832                 clGetProgramInfo(*program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL);
833
834                 if(!size)
835                         return false;
836
837                 vector<uint8_t> binary(size);
838                 uint8_t *bytes = &binary[0];
839
840                 clGetProgramInfo(*program, CL_PROGRAM_BINARIES, sizeof(uint8_t*), &bytes, NULL);
841
842                 if(!path_write_binary(clbin, binary)) {
843                         opencl_error(string_printf("OpenCL failed to write cached binary %s.", clbin.c_str()));
844                         return false;
845                 }
846
847                 return true;
848         }
849
850         bool build_kernel(cl_program *kernel_program,
851                           string custom_kernel_build_options,
852                           const string *debug_src = NULL)
853         {
854                 string build_options;
855                 build_options = kernel_build_options(debug_src) + custom_kernel_build_options;
856
857                 ciErr = clBuildProgram(*kernel_program, 0, NULL, build_options.c_str(), NULL, NULL);
858
859                 /* show warnings even if build is successful */
860                 size_t ret_val_size = 0;
861
862                 clGetProgramBuildInfo(*kernel_program, cdDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
863
864                 if(ret_val_size > 1) {
865                         vector<char> build_log(ret_val_size + 1);
866                         clGetProgramBuildInfo(*kernel_program, cdDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, &build_log[0], NULL);
867
868                         build_log[ret_val_size] = '\0';
869                         /* Skip meaningless empty output from the NVidia compiler. */
870                         if(!(ret_val_size == 2 && build_log[0] == '\n')) {
871                                 fprintf(stderr, "OpenCL kernel build output:\n");
872                                 fprintf(stderr, "%s\n", &build_log[0]);
873                         }
874                 }
875
876                 if(ciErr != CL_SUCCESS) {
877                         opencl_error("OpenCL build failed: errors in console");
878                         return false;
879                 }
880
881                 return true;
882         }
883
884         bool compile_kernel(const string& kernel_path,
885                             string source,
886                             string custom_kernel_build_options,
887                             cl_program *kernel_program,
888                             const string *debug_src = NULL)
889         {
890                 /* we compile kernels consisting of many files. unfortunately opencl
891                  * kernel caches do not seem to recognize changes in included files.
892                  * so we force recompile on changes by adding the md5 hash of all files */
893                 source = path_source_replace_includes(source, kernel_path);
894
895                 if(debug_src)
896                         path_write_text(*debug_src, source);
897
898                 size_t source_len = source.size();
899                 const char *source_str = source.c_str();
900
901                 *kernel_program = clCreateProgramWithSource(cxContext, 1, &source_str, &source_len, &ciErr);
902
903                 if(opencl_error(ciErr))
904                         return false;
905
906                 double starttime = time_dt();
907                 printf("Compiling OpenCL kernel ...\n");
908                 /* TODO(sergey): Report which kernel is being compiled
909                  * as well (megakernel or which of split kernels etc..).
910                  */
911                 printf("Build flags: %s\n", custom_kernel_build_options.c_str());
912
913                 if(!build_kernel(kernel_program, custom_kernel_build_options, debug_src))
914                         return false;
915
916                 printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);
917
918                 return true;
919         }
920
921         string device_md5_hash(string kernel_custom_build_options = "")
922         {
923                 MD5Hash md5;
924                 char version[256], driver[256], name[256], vendor[256];
925
926                 clGetPlatformInfo(cpPlatform, CL_PLATFORM_VENDOR, sizeof(vendor), &vendor, NULL);
927                 clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL);
928                 clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(name), &name, NULL);
929                 clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(driver), &driver, NULL);
930
931                 md5.append((uint8_t*)vendor, strlen(vendor));
932                 md5.append((uint8_t*)version, strlen(version));
933                 md5.append((uint8_t*)name, strlen(name));
934                 md5.append((uint8_t*)driver, strlen(driver));
935
936                 string options = kernel_build_options();
937                 options += kernel_custom_build_options;
938                 md5.append((uint8_t*)options.c_str(), options.size());
939
940                 return md5.get_hex();
941         }
942
943         bool load_kernels(const DeviceRequestedFeatures& requested_features)
944         {
945                 /* Verify if device was initialized. */
946                 if(!device_initialized) {
947                         fprintf(stderr, "OpenCL: failed to initialize device.\n");
948                         return false;
949                 }
950
951                 /* Try to use cached kernel. */
952                 thread_scoped_lock cache_locker;
953                 cpProgram = load_cached_kernel(requested_features,
954                                                OpenCLCache::OCL_DEV_BASE_PROGRAM,
955                                                cache_locker);
956
957                 if(!cpProgram) {
958                         VLOG(2) << "No cached OpenCL kernel.";
959
960                         /* Verify we have right opencl version. */
961                         if(!opencl_version_check())
962                                 return false;
963
964                         string build_flags = build_options_for_base_program(requested_features);
965
966                         /* Calculate md5 hashes to detect changes. */
967                         string kernel_path = path_get("kernel");
968                         string kernel_md5 = path_files_md5_hash(kernel_path);
969                         string device_md5 = device_md5_hash(build_flags);
970
971                         /* Path to cached binary.
972                          *
973                          * TODO(sergey): Seems we could de-duplicate all this string_printf()
974                          * calls with some utility function which will give file name for a
975                          * given hashes..
976                          */
977                         string clbin = string_printf("cycles_kernel_%s_%s.clbin",
978                                                      device_md5.c_str(),
979                                                      kernel_md5.c_str());
980                         clbin = path_user_get(path_join("cache", clbin));
981
982                         /* path to preprocessed source for debugging */
983                         string clsrc, *debug_src = NULL;
984
985                         if(opencl_kernel_use_debug()) {
986                                 clsrc = string_printf("cycles_kernel_%s_%s.cl",
987                                                       device_md5.c_str(),
988                                                       kernel_md5.c_str());
989                                 clsrc = path_user_get(path_join("cache", clsrc));
990                                 debug_src = &clsrc;
991                         }
992
993                         /* If binary kernel exists already, try use it. */
994                         if(path_exists(clbin) && load_binary(kernel_path,
995                                                              clbin,
996                                                              build_flags,
997                                                              &cpProgram)) {
998                                 /* Kernel loaded from binary, nothing to do. */
999                                 VLOG(2) << "Loaded kernel from " << clbin << ".";
1000                         }
1001                         else {
1002                                 VLOG(2) << "Kernel file " << clbin << " either doesn't exist or failed to be loaded by driver.";
1003                                 string init_kernel_source = "#include \"kernels/opencl/kernel.cl\" // " + kernel_md5 + "\n";
1004
1005                                 /* If does not exist or loading binary failed, compile kernel. */
1006                                 if(!compile_kernel(kernel_path,
1007                                                    init_kernel_source,
1008                                                    build_flags,
1009                                                    &cpProgram,
1010                                                    debug_src))
1011                                 {
1012                                         return false;
1013                                 }
1014
1015                                 /* Save binary for reuse. */
1016                                 if(!save_binary(&cpProgram, clbin)) {
1017                                         return false;
1018                                 }
1019                         }
1020
1021                         /* Cache the program. */
1022                         store_cached_kernel(cpPlatform,
1023                                             cdDevice,
1024                                             cpProgram,
1025                                             OpenCLCache::OCL_DEV_BASE_PROGRAM,
1026                                             cache_locker);
1027                 }
1028                 else {
1029                         VLOG(2) << "Found cached OpenCL kernel.";
1030                 }
1031
1032                 /* Find kernels. */
1033 #define FIND_KERNEL(kernel_var, kernel_name) \
1034                 do { \
1035                         kernel_var = clCreateKernel(cpProgram, "kernel_ocl_" kernel_name, &ciErr); \
1036                         if(opencl_error(ciErr)) \
1037                                 return false; \
1038                 } while(0)
1039
1040                 FIND_KERNEL(ckFilmConvertByteKernel, "convert_to_byte");
1041                 FIND_KERNEL(ckFilmConvertHalfFloatKernel, "convert_to_half_float");
1042                 FIND_KERNEL(ckShaderKernel, "shader");
1043                 FIND_KERNEL(ckBakeKernel, "bake");
1044
1045 #undef FIND_KERNEL
1046                 return true;
1047         }
1048
1049         ~OpenCLDeviceBase()
1050         {
1051                 task_pool.stop();
1052
1053                 if(null_mem)
1054                         clReleaseMemObject(CL_MEM_PTR(null_mem));
1055
1056                 ConstMemMap::iterator mt;
1057                 for(mt = const_mem_map.begin(); mt != const_mem_map.end(); mt++) {
1058                         mem_free(*(mt->second));
1059                         delete mt->second;
1060                 }
1061
1062                 if(ckFilmConvertByteKernel)
1063                         clReleaseKernel(ckFilmConvertByteKernel);
1064                 if(ckFilmConvertHalfFloatKernel)
1065                         clReleaseKernel(ckFilmConvertHalfFloatKernel);
1066                 if(ckShaderKernel)
1067                         clReleaseKernel(ckShaderKernel);
1068                 if(ckBakeKernel)
1069                         clReleaseKernel(ckBakeKernel);
1070                 if(cpProgram)
1071                         clReleaseProgram(cpProgram);
1072                 if(cqCommandQueue)
1073                         clReleaseCommandQueue(cqCommandQueue);
1074                 if(cxContext)
1075                         clReleaseContext(cxContext);
1076         }
1077
1078         void mem_alloc(device_memory& mem, MemoryType type)
1079         {
1080                 size_t size = mem.memory_size();
1081
1082                 cl_mem_flags mem_flag;
1083                 void *mem_ptr = NULL;
1084
1085                 if(type == MEM_READ_ONLY)
1086                         mem_flag = CL_MEM_READ_ONLY;
1087                 else if(type == MEM_WRITE_ONLY)
1088                         mem_flag = CL_MEM_WRITE_ONLY;
1089                 else
1090                         mem_flag = CL_MEM_READ_WRITE;
1091
1092                 /* Zero-size allocation might be invoked by render, but not really
1093                  * supported by OpenCL. Using NULL as device pointer also doesn't really
1094                  * work for some reason, so for the time being we'll use special case
1095                  * will null_mem buffer.
1096                  */
1097                 if(size != 0) {
1098                         mem.device_pointer = (device_ptr)clCreateBuffer(cxContext,
1099                                                                         mem_flag,
1100                                                                         size,
1101                                                                         mem_ptr,
1102                                                                         &ciErr);
1103                         opencl_assert_err(ciErr, "clCreateBuffer");
1104                 }
1105                 else {
1106                         mem.device_pointer = null_mem;
1107                 }
1108
1109                 stats.mem_alloc(size);
1110                 mem.device_size = size;
1111         }
1112
1113         void mem_copy_to(device_memory& mem)
1114         {
1115                 /* this is blocking */
1116                 size_t size = mem.memory_size();
1117                 if(size != 0) {
1118                         opencl_assert(clEnqueueWriteBuffer(cqCommandQueue,
1119                                                            CL_MEM_PTR(mem.device_pointer),
1120                                                            CL_TRUE,
1121                                                            0,
1122                                                            size,
1123                                                            (void*)mem.data_pointer,
1124                                                            0,
1125                                                            NULL, NULL));
1126                 }
1127         }
1128
1129         void mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
1130         {
1131                 size_t offset = elem*y*w;
1132                 size_t size = elem*w*h;
1133                 assert(size != 0);
1134                 opencl_assert(clEnqueueReadBuffer(cqCommandQueue,
1135                                                   CL_MEM_PTR(mem.device_pointer),
1136                                                   CL_TRUE,
1137                                                   offset,
1138                                                   size,
1139                                                   (uchar*)mem.data_pointer + offset,
1140                                                   0,
1141                                                   NULL, NULL));
1142         }
1143
1144         void mem_zero(device_memory& mem)
1145         {
1146                 if(mem.device_pointer) {
1147                         memset((void*)mem.data_pointer, 0, mem.memory_size());
1148                         mem_copy_to(mem);
1149                 }
1150         }
1151
1152         void mem_free(device_memory& mem)
1153         {
1154                 if(mem.device_pointer) {
1155                         if(mem.device_pointer != null_mem) {
1156                                 opencl_assert(clReleaseMemObject(CL_MEM_PTR(mem.device_pointer)));
1157                         }
1158                         mem.device_pointer = 0;
1159
1160                         stats.mem_free(mem.device_size);
1161                         mem.device_size = 0;
1162                 }
1163         }
1164
1165         void const_copy_to(const char *name, void *host, size_t size)
1166         {
1167                 ConstMemMap::iterator i = const_mem_map.find(name);
1168
1169                 if(i == const_mem_map.end()) {
1170                         device_vector<uchar> *data = new device_vector<uchar>();
1171                         data->copy((uchar*)host, size);
1172
1173                         mem_alloc(*data, MEM_READ_ONLY);
1174                         i = const_mem_map.insert(ConstMemMap::value_type(name, data)).first;
1175                 }
1176                 else {
1177                         device_vector<uchar> *data = i->second;
1178                         data->copy((uchar*)host, size);
1179                 }
1180
1181                 mem_copy_to(*i->second);
1182         }
1183
1184         void tex_alloc(const char *name,
1185                        device_memory& mem,
1186                        InterpolationType /*interpolation*/,
1187                        ExtensionType /*extension*/)
1188         {
1189                 VLOG(1) << "Texture allocate: " << name << ", " << mem.memory_size() << " bytes.";
1190                 mem_alloc(mem, MEM_READ_ONLY);
1191                 mem_copy_to(mem);
1192                 assert(mem_map.find(name) == mem_map.end());
1193                 mem_map.insert(MemMap::value_type(name, mem.device_pointer));
1194         }
1195
1196         void tex_free(device_memory& mem)
1197         {
1198                 if(mem.device_pointer) {
1199                         foreach(const MemMap::value_type& value, mem_map) {
1200                                 if(value.second == mem.device_pointer) {
1201                                         mem_map.erase(value.first);
1202                                         break;
1203                                 }
1204                         }
1205
1206                         mem_free(mem);
1207                 }
1208         }
1209
1210         size_t global_size_round_up(int group_size, int global_size)
1211         {
1212                 int r = global_size % group_size;
1213                 return global_size + ((r == 0)? 0: group_size - r);
1214         }
1215
1216         void enqueue_kernel(cl_kernel kernel, size_t w, size_t h)
1217         {
1218                 size_t workgroup_size, max_work_items[3];
1219
1220                 clGetKernelWorkGroupInfo(kernel, cdDevice,
1221                         CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL);
1222                 clGetDeviceInfo(cdDevice,
1223                         CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, max_work_items, NULL);
1224         
1225                 /* try to divide evenly over 2 dimensions */
1226                 size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1);
1227                 size_t local_size[2] = {sqrt_workgroup_size, sqrt_workgroup_size};
1228
1229                 /* some implementations have max size 1 on 2nd dimension */
1230                 if(local_size[1] > max_work_items[1]) {
1231                         local_size[0] = workgroup_size/max_work_items[1];
1232                         local_size[1] = max_work_items[1];
1233                 }
1234
1235                 size_t global_size[2] = {global_size_round_up(local_size[0], w), global_size_round_up(local_size[1], h)};
1236
1237                 /* run kernel */
1238                 opencl_assert(clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL));
1239                 opencl_assert(clFlush(cqCommandQueue));
1240         }
1241
1242         void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name)
1243         {
1244                 cl_mem ptr;
1245
1246                 MemMap::iterator i = mem_map.find(name);
1247                 if(i != mem_map.end()) {
1248                         ptr = CL_MEM_PTR(i->second);
1249                 }
1250                 else {
1251                         /* work around NULL not working, even though the spec says otherwise */
1252                         ptr = CL_MEM_PTR(null_mem);
1253                 }
1254                 
1255                 opencl_assert(clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void*)&ptr));
1256         }
1257
1258         void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half)
1259         {
1260                 /* cast arguments to cl types */
1261                 cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
1262                 cl_mem d_rgba = (rgba_byte)? CL_MEM_PTR(rgba_byte): CL_MEM_PTR(rgba_half);
1263                 cl_mem d_buffer = CL_MEM_PTR(buffer);
1264                 cl_int d_x = task.x;
1265                 cl_int d_y = task.y;
1266                 cl_int d_w = task.w;
1267                 cl_int d_h = task.h;
1268                 cl_float d_sample_scale = 1.0f/(task.sample + 1);
1269                 cl_int d_offset = task.offset;
1270                 cl_int d_stride = task.stride;
1271
1272
1273                 cl_kernel ckFilmConvertKernel = (rgba_byte)? ckFilmConvertByteKernel: ckFilmConvertHalfFloatKernel;
1274
1275                 cl_uint start_arg_index =
1276                         kernel_set_args(ckFilmConvertKernel,
1277                                         0,
1278                                         d_data,
1279                                         d_rgba,
1280                                         d_buffer);
1281
1282 #define KERNEL_TEX(type, ttype, name) \
1283         set_kernel_arg_mem(ckFilmConvertKernel, &start_arg_index, #name);
1284 #include "kernel_textures.h"
1285 #undef KERNEL_TEX
1286
1287                 start_arg_index += kernel_set_args(ckFilmConvertKernel,
1288                                                    start_arg_index,
1289                                                    d_sample_scale,
1290                                                    d_x,
1291                                                    d_y,
1292                                                    d_w,
1293                                                    d_h,
1294                                                    d_offset,
1295                                                    d_stride);
1296
1297                 enqueue_kernel(ckFilmConvertKernel, d_w, d_h);
1298         }
1299
1300         void shader(DeviceTask& task)
1301         {
1302                 /* cast arguments to cl types */
1303                 cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
1304                 cl_mem d_input = CL_MEM_PTR(task.shader_input);
1305                 cl_mem d_output = CL_MEM_PTR(task.shader_output);
1306                 cl_mem d_output_luma = CL_MEM_PTR(task.shader_output_luma);
1307                 cl_int d_shader_eval_type = task.shader_eval_type;
1308                 cl_int d_shader_filter = task.shader_filter;
1309                 cl_int d_shader_x = task.shader_x;
1310                 cl_int d_shader_w = task.shader_w;
1311                 cl_int d_offset = task.offset;
1312
1313                 cl_kernel kernel;
1314
1315                 if(task.shader_eval_type >= SHADER_EVAL_BAKE)
1316                         kernel = ckBakeKernel;
1317                 else
1318                         kernel = ckShaderKernel;
1319
1320                 for(int sample = 0; sample < task.num_samples; sample++) {
1321
1322                         if(task.get_cancel())
1323                                 break;
1324
1325                         cl_int d_sample = sample;
1326
1327                         cl_uint start_arg_index =
1328                                 kernel_set_args(kernel,
1329                                                 0,
1330                                                 d_data,
1331                                                 d_input,
1332                                                 d_output);
1333
1334                         if(task.shader_eval_type < SHADER_EVAL_BAKE) {
1335                                 start_arg_index += kernel_set_args(kernel,
1336                                                                    start_arg_index,
1337                                                                    d_output_luma);
1338                         }
1339
1340 #define KERNEL_TEX(type, ttype, name) \
1341                         set_kernel_arg_mem(kernel, &start_arg_index, #name);
1342 #include "kernel_textures.h"
1343 #undef KERNEL_TEX
1344
1345                         start_arg_index += kernel_set_args(kernel,
1346                                                            start_arg_index,
1347                                                            d_shader_eval_type);
1348                         if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
1349                                 start_arg_index += kernel_set_args(kernel,
1350                                                                    start_arg_index,
1351                                                                    d_shader_filter);
1352                         }
1353                         start_arg_index += kernel_set_args(kernel,
1354                                                            start_arg_index,
1355                                                            d_shader_x,
1356                                                            d_shader_w,
1357                                                            d_offset,
1358                                                            d_sample);
1359
1360                         enqueue_kernel(kernel, task.shader_w, 1);
1361
1362                         task.update_progress(NULL);
1363                 }
1364         }
1365
1366         class OpenCLDeviceTask : public DeviceTask {
1367         public:
1368                 OpenCLDeviceTask(OpenCLDeviceBase *device, DeviceTask& task)
1369                 : DeviceTask(task)
1370                 {
1371                         run = function_bind(&OpenCLDeviceBase::thread_run,
1372                                             device,
1373                                             this);
1374                 }
1375         };
1376
1377         int get_split_task_count(DeviceTask& /*task*/)
1378         {
1379                 return 1;
1380         }
1381
1382         void task_add(DeviceTask& task)
1383         {
1384                 task_pool.push(new OpenCLDeviceTask(this, task));
1385         }
1386
1387         void task_wait()
1388         {
1389                 task_pool.wait();
1390         }
1391
1392         void task_cancel()
1393         {
1394                 task_pool.cancel();
1395         }
1396
1397         virtual void thread_run(DeviceTask * /*task*/) = 0;
1398
1399 protected:
1400         string kernel_build_options(const string *debug_src = NULL)
1401         {
1402                 string build_options = "-cl-fast-relaxed-math ";
1403
1404                 if(platform_name == "NVIDIA CUDA") {
1405                         build_options += "-D__KERNEL_OPENCL_NVIDIA__ "
1406                                          "-cl-nv-maxrregcount=32 "
1407                                          "-cl-nv-verbose ";
1408
1409                         uint compute_capability_major, compute_capability_minor;
1410                         clGetDeviceInfo(cdDevice, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,
1411                                         sizeof(cl_uint), &compute_capability_major, NULL);
1412                         clGetDeviceInfo(cdDevice, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,
1413                                         sizeof(cl_uint), &compute_capability_minor, NULL);
1414
1415                         build_options += string_printf("-D__COMPUTE_CAPABILITY__=%u ",
1416                                                        compute_capability_major * 100 +
1417                                                        compute_capability_minor * 10);
1418                 }
1419
1420                 else if(platform_name == "Apple")
1421                         build_options += "-D__KERNEL_OPENCL_APPLE__ ";
1422
1423                 else if(platform_name == "AMD Accelerated Parallel Processing")
1424                         build_options += "-D__KERNEL_OPENCL_AMD__ ";
1425
1426                 else if(platform_name == "Intel(R) OpenCL") {
1427                         build_options += "-D__KERNEL_OPENCL_INTEL_CPU__ ";
1428
1429                         /* Options for gdb source level kernel debugging.
1430                          * this segfaults on linux currently.
1431                          */
1432                         if(opencl_kernel_use_debug() && debug_src)
1433                                 build_options += "-g -s \"" + *debug_src + "\" ";
1434                 }
1435
1436                 if(opencl_kernel_use_debug())
1437                         build_options += "-D__KERNEL_OPENCL_DEBUG__ ";
1438
1439 #ifdef WITH_CYCLES_DEBUG
1440                 build_options += "-D__KERNEL_DEBUG__ ";
1441 #endif
1442
1443                 return build_options;
1444         }
1445
1446         class ArgumentWrapper {
1447         public:
1448                 ArgumentWrapper() : size(0), pointer(NULL) {}
1449                 template <typename T>
1450                 ArgumentWrapper(T& argument) : size(sizeof(argument)),
1451                                                pointer(&argument) { }
1452                 ArgumentWrapper(int argument) : size(sizeof(int)),
1453                                                 int_value(argument),
1454                                                 pointer(&int_value) { }
1455                 ArgumentWrapper(float argument) : size(sizeof(float)),
1456                                                   float_value(argument),
1457                                                   pointer(&float_value) { }
1458                 size_t size;
1459                 int int_value;
1460                 float float_value;
1461                 void *pointer;
1462         };
1463
1464         /* TODO(sergey): In the future we can use variadic templates, once
1465          * C++0x is allowed. Should allow to clean this up a bit.
1466          */
1467         int kernel_set_args(cl_kernel kernel,
1468                             int start_argument_index,
1469                             const ArgumentWrapper& arg1 = ArgumentWrapper(),
1470                             const ArgumentWrapper& arg2 = ArgumentWrapper(),
1471                             const ArgumentWrapper& arg3 = ArgumentWrapper(),
1472                             const ArgumentWrapper& arg4 = ArgumentWrapper(),
1473                             const ArgumentWrapper& arg5 = ArgumentWrapper(),
1474                             const ArgumentWrapper& arg6 = ArgumentWrapper(),
1475                             const ArgumentWrapper& arg7 = ArgumentWrapper(),
1476                             const ArgumentWrapper& arg8 = ArgumentWrapper(),
1477                             const ArgumentWrapper& arg9 = ArgumentWrapper(),
1478                             const ArgumentWrapper& arg10 = ArgumentWrapper(),
1479                             const ArgumentWrapper& arg11 = ArgumentWrapper(),
1480                             const ArgumentWrapper& arg12 = ArgumentWrapper(),
1481                             const ArgumentWrapper& arg13 = ArgumentWrapper(),
1482                             const ArgumentWrapper& arg14 = ArgumentWrapper(),
1483                             const ArgumentWrapper& arg15 = ArgumentWrapper(),
1484                             const ArgumentWrapper& arg16 = ArgumentWrapper(),
1485                             const ArgumentWrapper& arg17 = ArgumentWrapper(),
1486                             const ArgumentWrapper& arg18 = ArgumentWrapper(),
1487                             const ArgumentWrapper& arg19 = ArgumentWrapper(),
1488                             const ArgumentWrapper& arg20 = ArgumentWrapper(),
1489                             const ArgumentWrapper& arg21 = ArgumentWrapper(),
1490                             const ArgumentWrapper& arg22 = ArgumentWrapper(),
1491                             const ArgumentWrapper& arg23 = ArgumentWrapper(),
1492                             const ArgumentWrapper& arg24 = ArgumentWrapper(),
1493                             const ArgumentWrapper& arg25 = ArgumentWrapper(),
1494                             const ArgumentWrapper& arg26 = ArgumentWrapper(),
1495                             const ArgumentWrapper& arg27 = ArgumentWrapper(),
1496                             const ArgumentWrapper& arg28 = ArgumentWrapper(),
1497                             const ArgumentWrapper& arg29 = ArgumentWrapper(),
1498                             const ArgumentWrapper& arg30 = ArgumentWrapper(),
1499                             const ArgumentWrapper& arg31 = ArgumentWrapper(),
1500                             const ArgumentWrapper& arg32 = ArgumentWrapper(),
1501                             const ArgumentWrapper& arg33 = ArgumentWrapper())
1502         {
1503                 int current_arg_index = 0;
1504 #define FAKE_VARARG_HANDLE_ARG(arg) \
1505                 do { \
1506                         if(arg.pointer != NULL) { \
1507                                 opencl_assert(clSetKernelArg( \
1508                                         kernel, \
1509                                         start_argument_index + current_arg_index, \
1510                                         arg.size, arg.pointer)); \
1511                                 ++current_arg_index; \
1512                         } \
1513                         else { \
1514                                 return current_arg_index; \
1515                         } \
1516                 } while(false)
1517                 FAKE_VARARG_HANDLE_ARG(arg1);
1518                 FAKE_VARARG_HANDLE_ARG(arg2);
1519                 FAKE_VARARG_HANDLE_ARG(arg3);
1520                 FAKE_VARARG_HANDLE_ARG(arg4);
1521                 FAKE_VARARG_HANDLE_ARG(arg5);
1522                 FAKE_VARARG_HANDLE_ARG(arg6);
1523                 FAKE_VARARG_HANDLE_ARG(arg7);
1524                 FAKE_VARARG_HANDLE_ARG(arg8);
1525                 FAKE_VARARG_HANDLE_ARG(arg9);
1526                 FAKE_VARARG_HANDLE_ARG(arg10);
1527                 FAKE_VARARG_HANDLE_ARG(arg11);
1528                 FAKE_VARARG_HANDLE_ARG(arg12);
1529                 FAKE_VARARG_HANDLE_ARG(arg13);
1530                 FAKE_VARARG_HANDLE_ARG(arg14);
1531                 FAKE_VARARG_HANDLE_ARG(arg15);
1532                 FAKE_VARARG_HANDLE_ARG(arg16);
1533                 FAKE_VARARG_HANDLE_ARG(arg17);
1534                 FAKE_VARARG_HANDLE_ARG(arg18);
1535                 FAKE_VARARG_HANDLE_ARG(arg19);
1536                 FAKE_VARARG_HANDLE_ARG(arg20);
1537                 FAKE_VARARG_HANDLE_ARG(arg21);
1538                 FAKE_VARARG_HANDLE_ARG(arg22);
1539                 FAKE_VARARG_HANDLE_ARG(arg23);
1540                 FAKE_VARARG_HANDLE_ARG(arg24);
1541                 FAKE_VARARG_HANDLE_ARG(arg25);
1542                 FAKE_VARARG_HANDLE_ARG(arg26);
1543                 FAKE_VARARG_HANDLE_ARG(arg27);
1544                 FAKE_VARARG_HANDLE_ARG(arg28);
1545                 FAKE_VARARG_HANDLE_ARG(arg29);
1546                 FAKE_VARARG_HANDLE_ARG(arg30);
1547                 FAKE_VARARG_HANDLE_ARG(arg31);
1548                 FAKE_VARARG_HANDLE_ARG(arg32);
1549                 FAKE_VARARG_HANDLE_ARG(arg33);
1550 #undef FAKE_VARARG_HANDLE_ARG
1551                 return current_arg_index;
1552         }
1553
1554         inline void release_kernel_safe(cl_kernel kernel)
1555         {
1556                 if(kernel) {
1557                         clReleaseKernel(kernel);
1558                 }
1559         }
1560
1561         inline void release_mem_object_safe(cl_mem mem)
1562         {
1563                 if(mem != NULL) {
1564                         clReleaseMemObject(mem);
1565                 }
1566         }
1567
1568         inline void release_program_safe(cl_program program)
1569         {
1570                 if(program) {
1571                         clReleaseProgram(program);
1572                 }
1573         }
1574
1575         /* ** Those guys are for workign around some compiler-specific bugs ** */
1576
1577         virtual cl_program load_cached_kernel(
1578                 const DeviceRequestedFeatures& /*requested_features*/,
1579                 OpenCLCache::ProgramName program_name,
1580                 thread_scoped_lock& cache_locker)
1581         {
1582                 return OpenCLCache::get_program(cpPlatform,
1583                                                 cdDevice,
1584                                                 program_name,
1585                                                 cache_locker);
1586         }
1587
1588         virtual void store_cached_kernel(cl_platform_id platform,
1589                                          cl_device_id device,
1590                                          cl_program program,
1591                                          OpenCLCache::ProgramName program_name,
1592                                          thread_scoped_lock& cache_locker)
1593         {
1594                 OpenCLCache::store_program(platform,
1595                                            device,
1596                                            program,
1597                                            program_name,
1598                                            cache_locker);
1599         }
1600
1601         virtual string build_options_for_base_program(
1602                 const DeviceRequestedFeatures& /*requested_features*/)
1603         {
1604                 /* TODO(sergey): By default we compile all features, meaning
1605                  * mega kernel is not getting feature-based optimizations.
1606                  *
1607                  * Ideally we need always compile kernel with as less features
1608                  * enabled as possible to keep performance at it's max.
1609                  */
1610                 return "";
1611         }
1612 };
1613
1614 class OpenCLDeviceMegaKernel : public OpenCLDeviceBase
1615 {
1616 public:
1617         cl_kernel ckPathTraceKernel;
1618         cl_program path_trace_program;
1619
1620         OpenCLDeviceMegaKernel(DeviceInfo& info, Stats &stats, bool background_)
1621         : OpenCLDeviceBase(info, stats, background_)
1622         {
1623                 ckPathTraceKernel = NULL;
1624                 path_trace_program = NULL;
1625         }
1626
1627         bool load_kernels(const DeviceRequestedFeatures& requested_features)
1628         {
1629                 /* Get Shader, bake and film convert kernels.
1630                  * It'll also do verification of OpenCL actually initialized.
1631                  */
1632                 if(!OpenCLDeviceBase::load_kernels(requested_features)) {
1633                         return false;
1634                 }
1635
1636                 /* Try to use cached kernel. */
1637                 thread_scoped_lock cache_locker;
1638                 path_trace_program = OpenCLCache::get_program(cpPlatform,
1639                                                               cdDevice,
1640                                                               OpenCLCache::OCL_DEV_MEGAKERNEL_PROGRAM,
1641                                                               cache_locker);
1642
1643                 if(!path_trace_program) {
1644                         /* Verify we have right opencl version. */
1645                         if(!opencl_version_check())
1646                                 return false;
1647
1648                         /* Calculate md5 hash to detect changes. */
1649                         string kernel_path = path_get("kernel");
1650                         string kernel_md5 = path_files_md5_hash(kernel_path);
1651                         string custom_kernel_build_options = "-D__COMPILE_ONLY_MEGAKERNEL__ ";
1652                         string device_md5 = device_md5_hash(custom_kernel_build_options);
1653
1654                         /* Path to cached binary. */
1655                         string clbin = string_printf("cycles_kernel_%s_%s.clbin",
1656                                                      device_md5.c_str(),
1657                                                      kernel_md5.c_str());
1658                         clbin = path_user_get(path_join("cache", clbin));
1659
1660                         /* Path to preprocessed source for debugging. */
1661                         string clsrc, *debug_src = NULL;
1662                         if(opencl_kernel_use_debug()) {
1663                                 clsrc = string_printf("cycles_kernel_%s_%s.cl",
1664                                                       device_md5.c_str(),
1665                                                       kernel_md5.c_str());
1666                                 clsrc = path_user_get(path_join("cache", clsrc));
1667                                 debug_src = &clsrc;
1668                         }
1669
1670                         /* If exists already, try use it. */
1671                         if(path_exists(clbin) && load_binary(kernel_path,
1672                                                              clbin,
1673                                                              custom_kernel_build_options,
1674                                                              &path_trace_program,
1675                                                              debug_src)) {
1676                                 /* Kernel loaded from binary, nothing to do. */
1677                         }
1678                         else {
1679                                 string init_kernel_source = "#include \"kernels/opencl/kernel.cl\" // " +
1680                                                             kernel_md5 + "\n";
1681                                 /* If does not exist or loading binary failed, compile kernel. */
1682                                 if(!compile_kernel(kernel_path,
1683                                                    init_kernel_source,
1684                                                    custom_kernel_build_options,
1685                                                    &path_trace_program,
1686                                                    debug_src))
1687                                 {
1688                                         return false;
1689                                 }
1690                                 /* Save binary for reuse. */
1691                                 if(!save_binary(&path_trace_program, clbin)) {
1692                                         return false;
1693                                 }
1694                         }
1695                         /* Cache the program. */
1696                         OpenCLCache::store_program(cpPlatform,
1697                                                    cdDevice,
1698                                                    path_trace_program,
1699                                                    OpenCLCache::OCL_DEV_MEGAKERNEL_PROGRAM,
1700                                                    cache_locker);
1701                 }
1702
1703                 /* Find kernels. */
1704                 ckPathTraceKernel = clCreateKernel(path_trace_program,
1705                                                    "kernel_ocl_path_trace",
1706                                                    &ciErr);
1707                 if(opencl_error(ciErr))
1708                         return false;
1709                 return true;
1710         }
1711
1712         ~OpenCLDeviceMegaKernel()
1713         {
1714                 task_pool.stop();
1715                 release_kernel_safe(ckPathTraceKernel);
1716                 release_program_safe(path_trace_program);
1717         }
1718
1719         void path_trace(RenderTile& rtile, int sample)
1720         {
1721                 /* Cast arguments to cl types. */
1722                 cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
1723                 cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
1724                 cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state);
1725                 cl_int d_x = rtile.x;
1726                 cl_int d_y = rtile.y;
1727                 cl_int d_w = rtile.w;
1728                 cl_int d_h = rtile.h;
1729                 cl_int d_offset = rtile.offset;
1730                 cl_int d_stride = rtile.stride;
1731
1732                 /* Sample arguments. */
1733                 cl_int d_sample = sample;
1734
1735                 cl_uint start_arg_index =
1736                         kernel_set_args(ckPathTraceKernel,
1737                                         0,
1738                                         d_data,
1739                                         d_buffer,
1740                                         d_rng_state);
1741
1742 #define KERNEL_TEX(type, ttype, name) \
1743                 set_kernel_arg_mem(ckPathTraceKernel, &start_arg_index, #name);
1744 #include "kernel_textures.h"
1745 #undef KERNEL_TEX
1746
1747                 start_arg_index += kernel_set_args(ckPathTraceKernel,
1748                                                    start_arg_index,
1749                                                    d_sample,
1750                                                    d_x,
1751                                                    d_y,
1752                                                    d_w,
1753                                                    d_h,
1754                                                    d_offset,
1755                                                    d_stride);
1756
1757                 enqueue_kernel(ckPathTraceKernel, d_w, d_h);
1758         }
1759
1760         void thread_run(DeviceTask *task)
1761         {
1762                 if(task->type == DeviceTask::FILM_CONVERT) {
1763                         film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half);
1764                 }
1765                 else if(task->type == DeviceTask::SHADER) {
1766                         shader(*task);
1767                 }
1768                 else if(task->type == DeviceTask::PATH_TRACE) {
1769                         RenderTile tile;
1770                         /* Keep rendering tiles until done. */
1771                         while(task->acquire_tile(this, tile)) {
1772                                 int start_sample = tile.start_sample;
1773                                 int end_sample = tile.start_sample + tile.num_samples;
1774
1775                                 for(int sample = start_sample; sample < end_sample; sample++) {
1776                                         if(task->get_cancel()) {
1777                                                 if(task->need_finish_queue == false)
1778                                                         break;
1779                                         }
1780
1781                                         path_trace(tile, sample);
1782
1783                                         tile.sample = sample + 1;
1784
1785                                         task->update_progress(&tile);
1786                                 }
1787
1788                                 /* Complete kernel execution before release tile */
1789                                 /* This helps in multi-device render;
1790                                  * The device that reaches the critical-section function
1791                                  * release_tile waits (stalling other devices from entering
1792                                  * release_tile) for all kernels to complete. If device1 (a
1793                                  * slow-render device) reaches release_tile first then it would
1794                                  * stall device2 (a fast-render device) from proceeding to render
1795                                  * next tile.
1796                                  */
1797                                 clFinish(cqCommandQueue);
1798
1799                                 task->release_tile(tile);
1800                         }
1801                 }
1802         }
1803 };
1804
1805 /* TODO(sergey): This is to keep tile split on OpenCL level working
1806  * for now, since without this view-port render does not work as it
1807  * should.
1808  *
1809  * Ideally it'll be done on the higher level, but we need to get ready
1810  * for merge rather soon, so let's keep split logic private here in
1811  * the file.
1812  */
1813 class SplitRenderTile : public RenderTile {
1814 public:
1815         SplitRenderTile()
1816                 : RenderTile(),
1817                   buffer_offset_x(0),
1818                   buffer_offset_y(0),
1819                   rng_state_offset_x(0),
1820                   rng_state_offset_y(0),
1821                   buffer_rng_state_stride(0) {}
1822
1823         explicit SplitRenderTile(RenderTile& tile)
1824                 : RenderTile(),
1825                   buffer_offset_x(0),
1826                   buffer_offset_y(0),
1827                   rng_state_offset_x(0),
1828                   rng_state_offset_y(0),
1829                   buffer_rng_state_stride(0)
1830         {
1831                 x = tile.x;
1832                 y = tile.y;
1833                 w = tile.w;
1834                 h = tile.h;
1835                 start_sample = tile.start_sample;
1836                 num_samples = tile.num_samples;
1837                 sample = tile.sample;
1838                 resolution = tile.resolution;
1839                 offset = tile.offset;
1840                 stride = tile.stride;
1841                 buffer = tile.buffer;
1842                 rng_state = tile.rng_state;
1843                 buffers = tile.buffers;
1844         }
1845
1846         /* Split kernel is device global memory constrained;
1847          * hence split kernel cant render big tile size's in
1848          * one go. If the user sets a big tile size (big tile size
1849          * is a term relative to the available device global memory),
1850          * we split the tile further and then call path_trace on
1851          * each of those split tiles. The following variables declared,
1852          * assist in achieving that purpose
1853          */
1854         int buffer_offset_x;
1855         int buffer_offset_y;
1856         int rng_state_offset_x;
1857         int rng_state_offset_y;
1858         int buffer_rng_state_stride;
1859 };
1860
1861 /* OpenCLDeviceSplitKernel's declaration/definition. */
1862 class OpenCLDeviceSplitKernel : public OpenCLDeviceBase
1863 {
1864 public:
1865         /* Kernel declaration. */
1866         cl_kernel ckPathTraceKernel_data_init;
1867         cl_kernel ckPathTraceKernel_scene_intersect;
1868         cl_kernel ckPathTraceKernel_lamp_emission;
1869         cl_kernel ckPathTraceKernel_queue_enqueue;
1870         cl_kernel ckPathTraceKernel_background_buffer_update;
1871         cl_kernel ckPathTraceKernel_shader_eval;
1872         cl_kernel ckPathTraceKernel_holdout_emission_blurring_pathtermination_ao;
1873         cl_kernel ckPathTraceKernel_direct_lighting;
1874         cl_kernel ckPathTraceKernel_shadow_blocked;
1875         cl_kernel ckPathTraceKernel_next_iteration_setup;
1876         cl_kernel ckPathTraceKernel_sum_all_radiance;
1877
1878         /* cl_program declaration. */
1879         cl_program data_init_program;
1880         cl_program scene_intersect_program;
1881         cl_program lamp_emission_program;
1882         cl_program queue_enqueue_program;
1883         cl_program background_buffer_update_program;
1884         cl_program shader_eval_program;
1885         cl_program holdout_emission_blurring_pathtermination_ao_program;
1886         cl_program direct_lighting_program;
1887         cl_program shadow_blocked_program;
1888         cl_program next_iteration_setup_program;
1889         cl_program sum_all_radiance_program;
1890
1891         /* Global memory variables [porting]; These memory is used for
1892          * co-operation between different kernels; Data written by one
1893          * kernel will be available to another kernel via this global
1894          * memory.
1895          */
1896         cl_mem rng_coop;
1897         cl_mem throughput_coop;
1898         cl_mem L_transparent_coop;
1899         cl_mem PathRadiance_coop;
1900         cl_mem Ray_coop;
1901         cl_mem PathState_coop;
1902         cl_mem Intersection_coop;
1903         cl_mem kgbuffer;  /* KernelGlobals buffer. */
1904
1905         /* Global buffers for ShaderData. */
1906         cl_mem sd;             /* ShaderData used in the main path-iteration loop. */
1907         cl_mem sd_DL_shadow;   /* ShaderData used in Direct Lighting and
1908                                 * shadow_blocked kernel.
1909                                 */
1910
1911         /* Global buffers of each member of ShaderData. */
1912         cl_mem P_sd;
1913         cl_mem P_sd_DL_shadow;
1914         cl_mem N_sd;
1915         cl_mem N_sd_DL_shadow;
1916         cl_mem Ng_sd;
1917         cl_mem Ng_sd_DL_shadow;
1918         cl_mem I_sd;
1919         cl_mem I_sd_DL_shadow;
1920         cl_mem shader_sd;
1921         cl_mem shader_sd_DL_shadow;
1922         cl_mem flag_sd;
1923         cl_mem flag_sd_DL_shadow;
1924         cl_mem prim_sd;
1925         cl_mem prim_sd_DL_shadow;
1926         cl_mem type_sd;
1927         cl_mem type_sd_DL_shadow;
1928         cl_mem u_sd;
1929         cl_mem u_sd_DL_shadow;
1930         cl_mem v_sd;
1931         cl_mem v_sd_DL_shadow;
1932         cl_mem object_sd;
1933         cl_mem object_sd_DL_shadow;
1934         cl_mem time_sd;
1935         cl_mem time_sd_DL_shadow;
1936         cl_mem ray_length_sd;
1937         cl_mem ray_length_sd_DL_shadow;
1938
1939         /* Ray differentials. */
1940         cl_mem dP_sd, dI_sd;
1941         cl_mem dP_sd_DL_shadow, dI_sd_DL_shadow;
1942         cl_mem du_sd, dv_sd;
1943         cl_mem du_sd_DL_shadow, dv_sd_DL_shadow;
1944
1945         /* Dp/Du */
1946         cl_mem dPdu_sd, dPdv_sd;
1947         cl_mem dPdu_sd_DL_shadow, dPdv_sd_DL_shadow;
1948
1949         /* Object motion. */
1950         cl_mem ob_tfm_sd, ob_itfm_sd;
1951         cl_mem ob_tfm_sd_DL_shadow, ob_itfm_sd_DL_shadow;
1952
1953         cl_mem closure_sd;
1954         cl_mem closure_sd_DL_shadow;
1955         cl_mem num_closure_sd;
1956         cl_mem num_closure_sd_DL_shadow;
1957         cl_mem randb_closure_sd;
1958         cl_mem randb_closure_sd_DL_shadow;
1959         cl_mem ray_P_sd;
1960         cl_mem ray_P_sd_DL_shadow;
1961         cl_mem ray_dP_sd;
1962         cl_mem ray_dP_sd_DL_shadow;
1963
1964         /* Global memory required for shadow blocked and accum_radiance. */
1965         cl_mem BSDFEval_coop;
1966         cl_mem ISLamp_coop;
1967         cl_mem LightRay_coop;
1968         cl_mem AOAlpha_coop;
1969         cl_mem AOBSDF_coop;
1970         cl_mem AOLightRay_coop;
1971         cl_mem Intersection_coop_shadow;
1972
1973 #ifdef WITH_CYCLES_DEBUG
1974         /* DebugData memory */
1975         cl_mem debugdata_coop;
1976 #endif
1977
1978         /* Global state array that tracks ray state. */
1979         cl_mem ray_state;
1980
1981         /* Per sample buffers. */
1982         cl_mem per_sample_output_buffers;
1983
1984         /* Denotes which sample each ray is being processed for. */
1985         cl_mem work_array;
1986
1987         /* Queue */
1988         cl_mem Queue_data;  /* Array of size queuesize * num_queues * sizeof(int). */
1989         cl_mem Queue_index; /* Array of size num_queues * sizeof(int);
1990                              * Tracks the size of each queue.
1991                              */
1992
1993         /* Flag to make sceneintersect and lampemission kernel use queues. */
1994         cl_mem use_queues_flag;
1995
1996         /* Amount of memory in output buffer associated with one pixel/thread. */
1997         size_t per_thread_output_buffer_size;
1998
1999         /* Total allocatable available device memory. */
2000         size_t total_allocatable_memory;
2001
2002         /* host version of ray_state; Used in checking host path-iteration
2003          * termination.
2004          */
2005         char *hostRayStateArray;
2006
2007         /* Number of path-iterations to be done in one shot. */
2008         unsigned int PathIteration_times;
2009
2010 #ifdef __WORK_STEALING__
2011         /* Work pool with respect to each work group. */
2012         cl_mem work_pool_wgs;
2013
2014         /* Denotes the maximum work groups possible w.r.t. current tile size. */
2015         unsigned int max_work_groups;
2016 #endif
2017
2018         /* clos_max value for which the kernels have been loaded currently. */
2019         int current_max_closure;
2020
2021         /* Marked True in constructor and marked false at the end of path_trace(). */
2022         bool first_tile;
2023
2024         OpenCLDeviceSplitKernel(DeviceInfo& info, Stats &stats, bool background_)
2025         : OpenCLDeviceBase(info, stats, background_)
2026         {
2027                 background = background_;
2028
2029                 /* Initialize kernels. */
2030                 ckPathTraceKernel_data_init = NULL;
2031                 ckPathTraceKernel_scene_intersect = NULL;
2032                 ckPathTraceKernel_lamp_emission = NULL;
2033                 ckPathTraceKernel_background_buffer_update = NULL;
2034                 ckPathTraceKernel_shader_eval = NULL;
2035                 ckPathTraceKernel_holdout_emission_blurring_pathtermination_ao = NULL;
2036                 ckPathTraceKernel_direct_lighting = NULL;
2037                 ckPathTraceKernel_shadow_blocked = NULL;
2038                 ckPathTraceKernel_next_iteration_setup = NULL;
2039                 ckPathTraceKernel_sum_all_radiance = NULL;
2040                 ckPathTraceKernel_queue_enqueue = NULL;
2041
2042                 /* Initialize program. */
2043                 data_init_program = NULL;
2044                 scene_intersect_program = NULL;
2045                 lamp_emission_program = NULL;
2046                 queue_enqueue_program = NULL;
2047                 background_buffer_update_program = NULL;
2048                 shader_eval_program = NULL;
2049                 holdout_emission_blurring_pathtermination_ao_program = NULL;
2050                 direct_lighting_program = NULL;
2051                 shadow_blocked_program = NULL;
2052                 next_iteration_setup_program = NULL;
2053                 sum_all_radiance_program = NULL;
2054
2055                 /* Initialize cl_mem variables. */
2056                 kgbuffer = NULL;
2057                 sd = NULL;
2058                 sd_DL_shadow = NULL;
2059
2060                 P_sd = NULL;
2061                 P_sd_DL_shadow = NULL;
2062                 N_sd = NULL;
2063                 N_sd_DL_shadow = NULL;
2064                 Ng_sd = NULL;
2065                 Ng_sd_DL_shadow = NULL;
2066                 I_sd = NULL;
2067                 I_sd_DL_shadow = NULL;
2068                 shader_sd = NULL;
2069                 shader_sd_DL_shadow = NULL;
2070                 flag_sd = NULL;
2071                 flag_sd_DL_shadow = NULL;
2072                 prim_sd = NULL;
2073                 prim_sd_DL_shadow = NULL;
2074                 type_sd = NULL;
2075                 type_sd_DL_shadow = NULL;
2076                 u_sd = NULL;
2077                 u_sd_DL_shadow = NULL;
2078                 v_sd = NULL;
2079                 v_sd_DL_shadow = NULL;
2080                 object_sd = NULL;
2081                 object_sd_DL_shadow = NULL;
2082                 time_sd = NULL;
2083                 time_sd_DL_shadow = NULL;
2084                 ray_length_sd = NULL;
2085                 ray_length_sd_DL_shadow = NULL;
2086
2087                 /* Ray differentials. */
2088                 dP_sd = NULL;
2089                 dI_sd = NULL;
2090                 dP_sd_DL_shadow = NULL;
2091                 dI_sd_DL_shadow = NULL;
2092                 du_sd = NULL;
2093                 dv_sd = NULL;
2094                 du_sd_DL_shadow = NULL;
2095                 dv_sd_DL_shadow = NULL;
2096
2097                 /* Dp/Du */
2098                 dPdu_sd = NULL;
2099                 dPdv_sd = NULL;
2100                 dPdu_sd_DL_shadow = NULL;
2101                 dPdv_sd_DL_shadow = NULL;
2102
2103                 /* Object motion. */
2104                 ob_tfm_sd = NULL;
2105                 ob_itfm_sd = NULL;
2106                 ob_tfm_sd_DL_shadow = NULL;
2107                 ob_itfm_sd_DL_shadow = NULL;
2108
2109                 closure_sd = NULL;
2110                 closure_sd_DL_shadow = NULL;
2111                 num_closure_sd = NULL;
2112                 num_closure_sd_DL_shadow = NULL;
2113                 randb_closure_sd = NULL;
2114                 randb_closure_sd_DL_shadow = NULL;
2115                 ray_P_sd = NULL;
2116                 ray_P_sd_DL_shadow = NULL;
2117                 ray_dP_sd = NULL;
2118                 ray_dP_sd_DL_shadow = NULL;
2119
2120                 rng_coop = NULL;
2121                 throughput_coop = NULL;
2122                 L_transparent_coop = NULL;
2123                 PathRadiance_coop = NULL;
2124                 Ray_coop = NULL;
2125                 PathState_coop = NULL;
2126                 Intersection_coop = NULL;
2127                 ray_state = NULL;
2128
2129                 AOAlpha_coop = NULL;
2130                 AOBSDF_coop = NULL;
2131                 AOLightRay_coop = NULL;
2132                 BSDFEval_coop = NULL;
2133                 ISLamp_coop = NULL;
2134                 LightRay_coop = NULL;
2135                 Intersection_coop_shadow = NULL;
2136
2137 #ifdef WITH_CYCLES_DEBUG
2138                 debugdata_coop = NULL;
2139 #endif
2140
2141                 work_array = NULL;
2142
2143                 /* Queue. */
2144                 Queue_data = NULL;
2145                 Queue_index = NULL;
2146                 use_queues_flag = NULL;
2147
2148                 per_sample_output_buffers = NULL;
2149
2150                 per_thread_output_buffer_size = 0;
2151                 hostRayStateArray = NULL;
2152                 PathIteration_times = PATH_ITER_INC_FACTOR;
2153 #ifdef __WORK_STEALING__
2154                 work_pool_wgs = NULL;
2155                 max_work_groups = 0;
2156 #endif
2157                 current_max_closure = -1;
2158                 first_tile = true;
2159
2160                 /* Get device's maximum memory that can be allocated. */
2161                 ciErr = clGetDeviceInfo(cdDevice,
2162                                         CL_DEVICE_MAX_MEM_ALLOC_SIZE,
2163                                         sizeof(size_t),
2164                                         &total_allocatable_memory,
2165                                         NULL);
2166                 assert(ciErr == CL_SUCCESS);
2167                 if(platform_name == "AMD Accelerated Parallel Processing") {
2168                         /* This value is tweak-able; AMD platform does not seem to
2169                          * give maximum performance when all of CL_DEVICE_MAX_MEM_ALLOC_SIZE
2170                          * is considered for further computation.
2171                          */
2172                         total_allocatable_memory /= 2;
2173                 }
2174         }
2175
2176         /* TODO(sergey): Seems really close to load_kernel(),
2177          * could it be de-duplicated?
2178          */
2179         bool load_split_kernel(string kernel_path,
2180                                string kernel_init_source,
2181                                string clbin,
2182                                string custom_kernel_build_options,
2183                                cl_program *program,
2184                                const string *debug_src = NULL)
2185         {
2186                 if(!opencl_version_check())
2187                         return false;
2188
2189                 clbin = path_user_get(path_join("cache", clbin));
2190
2191                 /* If exists already, try use it. */
2192                 if(path_exists(clbin) && load_binary(kernel_path,
2193                                                      clbin,
2194                                                      custom_kernel_build_options,
2195                                                      program,
2196                                                      debug_src)) {
2197                         /* Kernel loaded from binary. */
2198                 }
2199                 else {
2200                         /* If does not exist or loading binary failed, compile kernel. */
2201                         if(!compile_kernel(kernel_path,
2202                                            kernel_init_source,
2203                                            custom_kernel_build_options,
2204                                            program,
2205                                            debug_src))
2206                         {
2207                                 return false;
2208                         }
2209                         /* Save binary for reuse. */
2210                         if(!save_binary(program, clbin)) {
2211                                 return false;
2212                         }
2213                 }
2214                 return true;
2215         }
2216
2217         /* Split kernel utility functions. */
2218         size_t get_tex_size(const char *tex_name)
2219         {
2220                 cl_mem ptr;
2221                 size_t ret_size = 0;
2222                 MemMap::iterator i = mem_map.find(tex_name);
2223                 if(i != mem_map.end()) {
2224                         ptr = CL_MEM_PTR(i->second);
2225                         ciErr = clGetMemObjectInfo(ptr,
2226                                                    CL_MEM_SIZE,
2227                                                    sizeof(ret_size),
2228                                                    &ret_size,
2229                                                    NULL);
2230                         assert(ciErr == CL_SUCCESS);
2231                 }
2232                 return ret_size;
2233         }
2234
2235         size_t get_shader_closure_size(int max_closure)
2236         {
2237                 return (sizeof(ShaderClosure) * max_closure);
2238         }
2239
2240         size_t get_shader_data_size(size_t shader_closure_size)
2241         {
2242                 /* ShaderData size without accounting for ShaderClosure array. */
2243                 size_t shader_data_size =
2244                         sizeof(ShaderData) - (sizeof(ShaderClosure) * MAX_CLOSURE);
2245                 return (shader_data_size + shader_closure_size);
2246         }
2247
2248         /* Returns size of KernelGlobals structure associated with OpenCL. */
2249         size_t get_KernelGlobals_size()
2250         {
2251                 /* Copy dummy KernelGlobals related to OpenCL from kernel_globals.h to
2252                  * fetch its size.
2253                  */
2254                 typedef struct KernelGlobals {
2255                         ccl_constant KernelData *data;
2256 #define KERNEL_TEX(type, ttype, name) \
2257         ccl_global type *name;
2258 #include "kernel_textures.h"
2259 #undef KERNEL_TEX
2260                         void *sd_input;
2261                         void *isect_shadow;
2262                 } KernelGlobals;
2263
2264                 return sizeof(KernelGlobals);
2265         }
2266
2267         /* Returns size of Structure of arrays implementation of. */
2268         size_t get_shaderdata_soa_size()
2269         {
2270                 size_t shader_soa_size = 0;
2271
2272 #define SD_VAR(type, what) shader_soa_size += sizeof(void *);
2273 #define SD_CLOSURE_VAR(type, what, max_closure) shader_soa_size += sizeof(void *);
2274                 #include "kernel_shaderdata_vars.h"
2275 #undef SD_VAR
2276 #undef SD_CLOSURE_VAR
2277
2278                 return shader_soa_size;
2279         }
2280
2281         bool load_kernels(const DeviceRequestedFeatures& requested_features)
2282         {
2283                 /* Get Shader, bake and film_convert kernels.
2284                  * It'll also do verification of OpenCL actually initialized.
2285                  */
2286                 if(!OpenCLDeviceBase::load_kernels(requested_features)) {
2287                         return false;
2288                 }
2289
2290                 string kernel_path = path_get("kernel");
2291                 string kernel_md5 = path_files_md5_hash(kernel_path);
2292                 string device_md5;
2293                 string kernel_init_source;
2294                 string clbin;
2295                 string clsrc, *debug_src = NULL;
2296
2297                 string build_options = "-D__SPLIT_KERNEL__ ";
2298 #ifdef __WORK_STEALING__
2299                 build_options += "-D__WORK_STEALING__ ";
2300 #endif
2301                 build_options += requested_features.get_build_options();
2302
2303                 /* Set compute device build option. */
2304                 cl_device_type device_type;
2305                 ciErr = clGetDeviceInfo(cdDevice,
2306                                         CL_DEVICE_TYPE,
2307                                         sizeof(cl_device_type),
2308                                         &device_type,
2309                                         NULL);
2310                 assert(ciErr == CL_SUCCESS);
2311                 if(device_type == CL_DEVICE_TYPE_GPU) {
2312                         build_options += " -D__COMPUTE_DEVICE_GPU__";
2313                 }
2314
2315 #define GLUE(a, b) a ## b
2316 #define LOAD_KERNEL(name) \
2317         do { \
2318                 kernel_init_source = "#include \"kernels/opencl/kernel_" #name ".cl\" // " + \
2319                                      kernel_md5 + "\n"; \
2320                 device_md5 = device_md5_hash(build_options); \
2321                 clbin = string_printf("cycles_kernel_%s_%s_" #name ".clbin", \
2322                                       device_md5.c_str(), kernel_md5.c_str()); \
2323                 if(opencl_kernel_use_debug()) { \
2324                         clsrc = string_printf("cycles_kernel_%s_%s_" #name ".cl", \
2325                                               device_md5.c_str(), kernel_md5.c_str()); \
2326                         clsrc = path_user_get(path_join("cache", clsrc)); \
2327                         debug_src = &clsrc; \
2328                 } \
2329                 if(!load_split_kernel(kernel_path, kernel_init_source, clbin, \
2330                                       build_options, \
2331                                       &GLUE(name, _program), \
2332                                       debug_src)) \
2333                 { \
2334                         fprintf(stderr, "Faled to compile %s\n", #name); \
2335                         return false; \
2336                 } \
2337         } while(false)
2338
2339                 LOAD_KERNEL(data_init);
2340                 LOAD_KERNEL(scene_intersect);
2341                 LOAD_KERNEL(lamp_emission);
2342                 LOAD_KERNEL(queue_enqueue);
2343                 LOAD_KERNEL(background_buffer_update);
2344                 LOAD_KERNEL(shader_eval);
2345                 LOAD_KERNEL(holdout_emission_blurring_pathtermination_ao);
2346                 LOAD_KERNEL(direct_lighting);
2347                 LOAD_KERNEL(shadow_blocked);
2348                 LOAD_KERNEL(next_iteration_setup);
2349                 LOAD_KERNEL(sum_all_radiance);
2350
2351 #undef LOAD_KERNEL
2352
2353 #define FIND_KERNEL(name) \
2354         do { \
2355                 GLUE(ckPathTraceKernel_, name) = \
2356                         clCreateKernel(GLUE(name, _program), \
2357                                        "kernel_ocl_path_trace_"  #name, &ciErr); \
2358                 if(opencl_error(ciErr)) { \
2359                         fprintf(stderr,"Missing kernel kernel_ocl_path_trace_%s\n", #name); \
2360                         return false; \
2361                 } \
2362         } while(false)
2363
2364                 FIND_KERNEL(data_init);
2365                 FIND_KERNEL(scene_intersect);
2366                 FIND_KERNEL(lamp_emission);
2367                 FIND_KERNEL(queue_enqueue);
2368                 FIND_KERNEL(background_buffer_update);
2369                 FIND_KERNEL(shader_eval);
2370                 FIND_KERNEL(holdout_emission_blurring_pathtermination_ao);
2371                 FIND_KERNEL(direct_lighting);
2372                 FIND_KERNEL(shadow_blocked);
2373                 FIND_KERNEL(next_iteration_setup);
2374                 FIND_KERNEL(sum_all_radiance);
2375 #undef FIND_KERNEL
2376 #undef GLUE
2377
2378                 current_max_closure = requested_features.max_closure;
2379
2380                 return true;
2381         }
2382
2383         ~OpenCLDeviceSplitKernel()
2384         {
2385                 task_pool.stop();
2386
2387                 /* Release kernels */
2388                 release_kernel_safe(ckPathTraceKernel_data_init);
2389                 release_kernel_safe(ckPathTraceKernel_scene_intersect);
2390                 release_kernel_safe(ckPathTraceKernel_lamp_emission);
2391                 release_kernel_safe(ckPathTraceKernel_queue_enqueue);
2392                 release_kernel_safe(ckPathTraceKernel_background_buffer_update);
2393                 release_kernel_safe(ckPathTraceKernel_shader_eval);
2394                 release_kernel_safe(ckPathTraceKernel_holdout_emission_blurring_pathtermination_ao);
2395                 release_kernel_safe(ckPathTraceKernel_direct_lighting);
2396                 release_kernel_safe(ckPathTraceKernel_shadow_blocked);
2397                 release_kernel_safe(ckPathTraceKernel_next_iteration_setup);
2398                 release_kernel_safe(ckPathTraceKernel_sum_all_radiance);
2399
2400                 /* Release global memory */
2401                 release_mem_object_safe(P_sd);
2402                 release_mem_object_safe(P_sd_DL_shadow);
2403                 release_mem_object_safe(N_sd);
2404                 release_mem_object_safe(N_sd_DL_shadow);
2405                 release_mem_object_safe(Ng_sd);
2406                 release_mem_object_safe(Ng_sd_DL_shadow);
2407                 release_mem_object_safe(I_sd);
2408                 release_mem_object_safe(I_sd_DL_shadow);
2409                 release_mem_object_safe(shader_sd);
2410                 release_mem_object_safe(shader_sd_DL_shadow);
2411                 release_mem_object_safe(flag_sd);
2412                 release_mem_object_safe(flag_sd_DL_shadow);
2413                 release_mem_object_safe(prim_sd);
2414                 release_mem_object_safe(prim_sd_DL_shadow);
2415                 release_mem_object_safe(type_sd);
2416                 release_mem_object_safe(type_sd_DL_shadow);
2417                 release_mem_object_safe(u_sd);
2418                 release_mem_object_safe(u_sd_DL_shadow);
2419                 release_mem_object_safe(v_sd);
2420                 release_mem_object_safe(v_sd_DL_shadow);
2421                 release_mem_object_safe(object_sd);
2422                 release_mem_object_safe(object_sd_DL_shadow);
2423                 release_mem_object_safe(time_sd);
2424                 release_mem_object_safe(time_sd_DL_shadow);
2425                 release_mem_object_safe(ray_length_sd);
2426                 release_mem_object_safe(ray_length_sd_DL_shadow);
2427
2428                 /* Ray differentials. */
2429                 release_mem_object_safe(dP_sd);
2430                 release_mem_object_safe(dP_sd_DL_shadow);
2431                 release_mem_object_safe(dI_sd);
2432                 release_mem_object_safe(dI_sd_DL_shadow);
2433                 release_mem_object_safe(du_sd);
2434                 release_mem_object_safe(du_sd_DL_shadow);
2435                 release_mem_object_safe(dv_sd);
2436                 release_mem_object_safe(dv_sd_DL_shadow);
2437
2438                 /* Dp/Du */
2439                 release_mem_object_safe(dPdu_sd);
2440                 release_mem_object_safe(dPdu_sd_DL_shadow);
2441                 release_mem_object_safe(dPdv_sd);
2442                 release_mem_object_safe(dPdv_sd_DL_shadow);
2443
2444                 /* Object motion. */
2445                 release_mem_object_safe(ob_tfm_sd);
2446                 release_mem_object_safe(ob_itfm_sd);
2447
2448                 release_mem_object_safe(ob_tfm_sd_DL_shadow);
2449                 release_mem_object_safe(ob_itfm_sd_DL_shadow);
2450
2451                 release_mem_object_safe(closure_sd);
2452                 release_mem_object_safe(closure_sd_DL_shadow);
2453                 release_mem_object_safe(num_closure_sd);
2454                 release_mem_object_safe(num_closure_sd_DL_shadow);
2455                 release_mem_object_safe(randb_closure_sd);
2456                 release_mem_object_safe(randb_closure_sd_DL_shadow);
2457                 release_mem_object_safe(ray_P_sd);
2458                 release_mem_object_safe(ray_P_sd_DL_shadow);
2459                 release_mem_object_safe(ray_dP_sd);
2460                 release_mem_object_safe(ray_dP_sd_DL_shadow);
2461                 release_mem_object_safe(rng_coop);
2462                 release_mem_object_safe(throughput_coop);
2463                 release_mem_object_safe(L_transparent_coop);
2464                 release_mem_object_safe(PathRadiance_coop);
2465                 release_mem_object_safe(Ray_coop);
2466                 release_mem_object_safe(PathState_coop);
2467                 release_mem_object_safe(Intersection_coop);
2468                 release_mem_object_safe(kgbuffer);
2469                 release_mem_object_safe(sd);
2470                 release_mem_object_safe(sd_DL_shadow);
2471                 release_mem_object_safe(ray_state);
2472                 release_mem_object_safe(AOAlpha_coop);
2473                 release_mem_object_safe(AOBSDF_coop);
2474                 release_mem_object_safe(AOLightRay_coop);
2475                 release_mem_object_safe(BSDFEval_coop);
2476                 release_mem_object_safe(ISLamp_coop);
2477                 release_mem_object_safe(LightRay_coop);
2478                 release_mem_object_safe(Intersection_coop_shadow);
2479 #ifdef WITH_CYCLES_DEBUG
2480                 release_mem_object_safe(debugdata_coop);
2481 #endif
2482                 release_mem_object_safe(use_queues_flag);
2483                 release_mem_object_safe(Queue_data);
2484                 release_mem_object_safe(Queue_index);
2485                 release_mem_object_safe(work_array);
2486 #ifdef __WORK_STEALING__
2487                 release_mem_object_safe(work_pool_wgs);
2488 #endif
2489                 release_mem_object_safe(per_sample_output_buffers);
2490
2491                 /* Release programs */
2492                 release_program_safe(data_init_program);
2493                 release_program_safe(scene_intersect_program);
2494                 release_program_safe(lamp_emission_program);
2495                 release_program_safe(queue_enqueue_program);
2496                 release_program_safe(background_buffer_update_program);
2497                 release_program_safe(shader_eval_program);
2498                 release_program_safe(holdout_emission_blurring_pathtermination_ao_program);
2499                 release_program_safe(direct_lighting_program);
2500                 release_program_safe(shadow_blocked_program);
2501                 release_program_safe(next_iteration_setup_program);
2502                 release_program_safe(sum_all_radiance_program);
2503
2504                 if(hostRayStateArray != NULL) {
2505                         free(hostRayStateArray);
2506                 }
2507         }
2508
2509         void path_trace(SplitRenderTile& rtile, int2 max_render_feasible_tile_size)
2510         {
2511                 /* cast arguments to cl types */
2512                 cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
2513                 cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
2514                 cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state);
2515                 cl_int d_x = rtile.x;
2516                 cl_int d_y = rtile.y;
2517                 cl_int d_w = rtile.w;
2518                 cl_int d_h = rtile.h;
2519                 cl_int d_offset = rtile.offset;
2520                 cl_int d_stride = rtile.stride;
2521
2522                 /* Make sure that set render feasible tile size is a multiple of local
2523                  * work size dimensions.
2524                  */
2525                 assert(max_render_feasible_tile_size.x % SPLIT_KERNEL_LOCAL_SIZE_X == 0);
2526                 assert(max_render_feasible_tile_size.y % SPLIT_KERNEL_LOCAL_SIZE_Y == 0);
2527
2528                 size_t global_size[2];
2529                 size_t local_size[2] = {SPLIT_KERNEL_LOCAL_SIZE_X,
2530                                         SPLIT_KERNEL_LOCAL_SIZE_Y};
2531
2532                 /* Set the range of samples to be processed for every ray in
2533                  * path-regeneration logic.
2534                  */
2535                 cl_int start_sample = rtile.start_sample;
2536                 cl_int end_sample = rtile.start_sample + rtile.num_samples;
2537                 cl_int num_samples = rtile.num_samples;
2538
2539 #ifdef __WORK_STEALING__
2540                 global_size[0] = (((d_w - 1) / local_size[0]) + 1) * local_size[0];
2541                 global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1];
2542                 unsigned int num_parallel_samples = 1;
2543 #else
2544                 global_size[1] = (((d_h - 1) / local_size[1]) + 1) * local_size[1];
2545                 unsigned int num_threads = max_render_feasible_tile_size.x *
2546                                            max_render_feasible_tile_size.y;
2547                 unsigned int num_tile_columns_possible = num_threads / global_size[1];
2548                 /* Estimate number of parallel samples that can be
2549                  * processed in parallel.
2550                  */
2551                 unsigned int num_parallel_samples = min(num_tile_columns_possible / d_w,
2552                                                         rtile.num_samples);
2553                 /* Wavefront size in AMD is 64.
2554                  * TODO(sergey): What about other platforms?
2555                  */
2556                 if(num_parallel_samples >= 64) {
2557                         /* TODO(sergey): Could use generic round-up here. */
2558                         num_parallel_samples = (num_parallel_samples / 64) * 64;
2559                 }
2560                 assert(num_parallel_samples != 0);
2561
2562                 global_size[0] = d_w * num_parallel_samples;
2563 #endif  /* __WORK_STEALING__ */
2564
2565                 assert(global_size[0] * global_size[1] <=
2566                        max_render_feasible_tile_size.x * max_render_feasible_tile_size.y);
2567
2568                 /* Allocate all required global memory once. */
2569                 if(first_tile) {
2570                         size_t num_global_elements = max_render_feasible_tile_size.x *
2571                                                      max_render_feasible_tile_size.y;
2572                         /* TODO(sergey): This will actually over-allocate if
2573                          * particular kernel does not support multiclosure.
2574                          */
2575                         size_t ShaderClosure_size = get_shader_closure_size(current_max_closure);
2576
2577 #ifdef __WORK_STEALING__
2578                         /* Calculate max groups */
2579                         size_t max_global_size[2];
2580                         size_t tile_x = max_render_feasible_tile_size.x;
2581                         size_t tile_y = max_render_feasible_tile_size.y;
2582                         max_global_size[0] = (((tile_x - 1) / local_size[0]) + 1) * local_size[0];
2583                         max_global_size[1] = (((tile_y - 1) / local_size[1]) + 1) * local_size[1];
2584                         max_work_groups = (max_global_size[0] * max_global_size[1]) /
2585                                           (local_size[0] * local_size[1]);
2586                         /* Allocate work_pool_wgs memory. */
2587                         work_pool_wgs = mem_alloc(max_work_groups * sizeof(unsigned int));
2588 #endif  /* __WORK_STEALING__ */
2589
2590                         /* Allocate queue_index memory only once. */
2591                         Queue_index = mem_alloc(NUM_QUEUES * sizeof(int));
2592                         use_queues_flag = mem_alloc(sizeof(char));
2593                         kgbuffer = mem_alloc(get_KernelGlobals_size());
2594
2595                         /* Create global buffers for ShaderData. */
2596                         sd = mem_alloc(get_shaderdata_soa_size());
2597                         sd_DL_shadow = mem_alloc(get_shaderdata_soa_size());
2598                         P_sd = mem_alloc(num_global_elements * sizeof(float3));
2599                         P_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
2600                         N_sd = mem_alloc(num_global_elements * sizeof(float3));
2601                         N_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
2602                         Ng_sd = mem_alloc(num_global_elements * sizeof(float3));
2603                         Ng_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
2604                         I_sd = mem_alloc(num_global_elements * sizeof(float3));
2605                         I_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
2606                         shader_sd = mem_alloc(num_global_elements * sizeof(int));
2607                         shader_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
2608                         flag_sd = mem_alloc(num_global_elements * sizeof(int));
2609                         flag_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
2610                         prim_sd = mem_alloc(num_global_elements * sizeof(int));
2611                         prim_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
2612                         type_sd = mem_alloc(num_global_elements * sizeof(int));
2613                         type_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
2614                         u_sd = mem_alloc(num_global_elements * sizeof(float));
2615                         u_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float));
2616                         v_sd = mem_alloc(num_global_elements * sizeof(float));
2617                         v_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float));
2618                         object_sd = mem_alloc(num_global_elements * sizeof(int));
2619                         object_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
2620                         time_sd = mem_alloc(num_global_elements * sizeof(float));
2621                         time_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float));
2622                         ray_length_sd = mem_alloc(num_global_elements * sizeof(float));
2623                         ray_length_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float));
2624
2625                         /* Ray differentials. */
2626                         dP_sd = mem_alloc(num_global_elements * sizeof(differential3));
2627                         dP_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential3));
2628                         dI_sd = mem_alloc(num_global_elements * sizeof(differential3));
2629                         dI_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential3));
2630                         du_sd = mem_alloc(num_global_elements * sizeof(differential));
2631                         du_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential));
2632                         dv_sd = mem_alloc(num_global_elements * sizeof(differential));
2633                         dv_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential));
2634
2635                         /* Dp/Du */
2636                         dPdu_sd = mem_alloc(num_global_elements * sizeof(float3));
2637                         dPdu_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
2638                         dPdv_sd = mem_alloc(num_global_elements * sizeof(float3));
2639                         dPdv_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
2640
2641                         /* Object motion. */
2642                         ob_tfm_sd = mem_alloc(num_global_elements * sizeof(Transform));
2643                         ob_tfm_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(Transform));
2644                         ob_itfm_sd = mem_alloc(num_global_elements * sizeof(Transform));
2645                         ob_itfm_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(Transform));
2646
2647                         closure_sd = mem_alloc(num_global_elements * ShaderClosure_size);
2648                         closure_sd_DL_shadow = mem_alloc(num_global_elements * 2 * ShaderClosure_size);
2649                         num_closure_sd = mem_alloc(num_global_elements * sizeof(int));
2650                         num_closure_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(int));
2651                         randb_closure_sd = mem_alloc(num_global_elements * sizeof(float));
2652                         randb_closure_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float));
2653                         ray_P_sd = mem_alloc(num_global_elements * sizeof(float3));
2654                         ray_P_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(float3));
2655                         ray_dP_sd = mem_alloc(num_global_elements * sizeof(differential3));
2656                         ray_dP_sd_DL_shadow = mem_alloc(num_global_elements * 2 * sizeof(differential3));
2657
2658                         /* Creation of global memory buffers which are shared among
2659                          * the kernels.
2660                          */
2661                         rng_coop = mem_alloc(num_global_elements * sizeof(RNG));
2662                         throughput_coop = mem_alloc(num_global_elements * sizeof(float3));
2663                         L_transparent_coop = mem_alloc(num_global_elements * sizeof(float));
2664                         PathRadiance_coop = mem_alloc(num_global_elements * sizeof(PathRadiance));
2665                         Ray_coop = mem_alloc(num_global_elements * sizeof(Ray));
2666                         PathState_coop = mem_alloc(num_global_elements * sizeof(PathState));
2667                         Intersection_coop = mem_alloc(num_global_elements * sizeof(Intersection));
2668                         AOAlpha_coop = mem_alloc(num_global_elements * sizeof(float3));
2669                         AOBSDF_coop = mem_alloc(num_global_elements * sizeof(float3));
2670                         AOLightRay_coop = mem_alloc(num_global_elements * sizeof(Ray));
2671                         BSDFEval_coop = mem_alloc(num_global_elements * sizeof(BsdfEval));
2672                         ISLamp_coop = mem_alloc(num_global_elements * sizeof(int));
2673                         LightRay_coop = mem_alloc(num_global_elements * sizeof(Ray));
2674                         Intersection_coop_shadow = mem_alloc(2 * num_global_elements * sizeof(Intersection));
2675
2676 #ifdef WITH_CYCLES_DEBUG
2677                         debugdata_coop = mem_alloc(num_global_elements * sizeof(DebugData));
2678 #endif
2679
2680                         ray_state = mem_alloc(num_global_elements * sizeof(char));
2681
2682                         hostRayStateArray = (char *)calloc(num_global_elements, sizeof(char));
2683                         assert(hostRayStateArray != NULL && "Can't create hostRayStateArray memory");
2684
2685                         Queue_data = mem_alloc(num_global_elements * (NUM_QUEUES * sizeof(int)+sizeof(int)));
2686                         work_array = mem_alloc(num_global_elements * sizeof(unsigned int));
2687                         per_sample_output_buffers = mem_alloc(num_global_elements *
2688                                                               per_thread_output_buffer_size);
2689                 }
2690
2691                 cl_int dQueue_size = global_size[0] * global_size[1];
2692                 cl_int total_num_rays = global_size[0] * global_size[1];
2693
2694                 cl_uint start_arg_index =
2695                         kernel_set_args(ckPathTraceKernel_data_init,
2696                                         0,
2697                                         kgbuffer,
2698                                         sd,
2699                                         sd_DL_shadow,
2700                                         P_sd,
2701                                         P_sd_DL_shadow,
2702                                         N_sd,
2703                                         N_sd_DL_shadow,
2704                                         Ng_sd,
2705                                         Ng_sd_DL_shadow,
2706                                         I_sd,
2707                                         I_sd_DL_shadow,
2708                                         shader_sd,
2709                                         shader_sd_DL_shadow,
2710                                         flag_sd,
2711                                         flag_sd_DL_shadow,
2712                                         prim_sd,
2713                                         prim_sd_DL_shadow,
2714                                         type_sd,
2715                                         type_sd_DL_shadow,
2716                                         u_sd,
2717                                         u_sd_DL_shadow,
2718                                         v_sd,
2719                                         v_sd_DL_shadow,
2720                                         object_sd,
2721                                         object_sd_DL_shadow,
2722                                         time_sd,
2723                                         time_sd_DL_shadow,
2724                                         ray_length_sd,
2725                                         ray_length_sd_DL_shadow);
2726
2727                 /* Ray differentials. */
2728                 start_arg_index +=
2729                         kernel_set_args(ckPathTraceKernel_data_init,
2730                                         start_arg_index,
2731                                         dP_sd,
2732                                         dP_sd_DL_shadow,
2733                                         dI_sd,
2734                                         dI_sd_DL_shadow,
2735                                         du_sd,
2736                                         du_sd_DL_shadow,
2737                                         dv_sd,
2738                                         dv_sd_DL_shadow);
2739
2740                 /* Dp/Du */
2741                 start_arg_index +=
2742                         kernel_set_args(ckPathTraceKernel_data_init,
2743                                         start_arg_index,
2744                                         dPdu_sd,
2745                                         dPdu_sd_DL_shadow,
2746                                         dPdv_sd,
2747                                         dPdv_sd_DL_shadow);
2748
2749                 /* Object motion. */
2750                 start_arg_index +=
2751                         kernel_set_args(ckPathTraceKernel_data_init,
2752                                         start_arg_index,
2753                                         ob_tfm_sd,
2754                                         ob_tfm_sd_DL_shadow,
2755                                         ob_itfm_sd,
2756                                         ob_itfm_sd_DL_shadow);
2757
2758                 start_arg_index +=
2759                         kernel_set_args(ckPathTraceKernel_data_init,
2760                                         start_arg_index,
2761                                         closure_sd,
2762                                         closure_sd_DL_shadow,
2763                                         num_closure_sd,
2764                                         num_closure_sd_DL_shadow,
2765                                         randb_closure_sd,
2766                                         randb_closure_sd_DL_shadow,
2767                                         ray_P_sd,
2768                                         ray_P_sd_DL_shadow,
2769                                         ray_dP_sd,
2770                                         ray_dP_sd_DL_shadow,
2771                                         d_data,
2772                                         per_sample_output_buffers,
2773                                         d_rng_state,
2774                                         rng_coop,
2775                                         throughput_coop,
2776                                         L_transparent_coop,
2777                                         PathRadiance_coop,
2778                                         Ray_coop,
2779                                         PathState_coop,
2780                                         Intersection_coop_shadow,
2781                                         ray_state);
2782
2783 /* TODO(sergey): Avoid map lookup here. */
2784 #define KERNEL_TEX(type, ttype, name) \
2785         set_kernel_arg_mem(ckPathTraceKernel_data_init, &start_arg_index, #name);
2786 #include "kernel_textures.h"
2787 #undef KERNEL_TEX
2788
2789                 start_arg_index +=
2790                         kernel_set_args(ckPathTraceKernel_data_init,
2791                                         start_arg_index,
2792                                         start_sample,
2793                                         d_x,
2794                                         d_y,
2795                                         d_w,
2796                                         d_h,
2797                                         d_offset,
2798                                         d_stride,
2799                                         rtile.rng_state_offset_x,
2800                                         rtile.rng_state_offset_y,
2801                                         rtile.buffer_rng_state_stride,
2802                                         Queue_data,
2803                                         Queue_index,
2804                                         dQueue_size,
2805                                         use_queues_flag,
2806                                         work_array,
2807 #ifdef __WORK_STEALING__
2808                                         work_pool_wgs,
2809                                         num_samples,
2810 #endif
2811 #ifdef WITH_CYCLES_DEBUG
2812                                         debugdata_coop,
2813 #endif
2814                                         num_parallel_samples);
2815
2816                 kernel_set_args(ckPathTraceKernel_scene_intersect,
2817                                 0,
2818                                 kgbuffer,
2819                                 d_data,
2820                                 rng_coop,
2821                                 Ray_coop,
2822                                 PathState_coop,
2823                                 Intersection_coop,
2824                                 ray_state,
2825                                 d_w,
2826                                 d_h,
2827                                 Queue_data,
2828                                 Queue_index,
2829                                 dQueue_size,
2830                                 use_queues_flag,
2831 #ifdef WITH_CYCLES_DEBUG
2832                                 debugdata_coop,
2833 #endif
2834                                 num_parallel_samples);
2835
2836                 kernel_set_args(ckPathTraceKernel_lamp_emission,
2837                                 0,
2838                                 kgbuffer,
2839                                 d_data,
2840                                 throughput_coop,
2841                                 PathRadiance_coop,
2842                                 Ray_coop,
2843                                 PathState_coop,
2844                                 Intersection_coop,
2845                                 ray_state,
2846                                 d_w,
2847                                 d_h,
2848                                 Queue_data,
2849                                 Queue_index,
2850                                 dQueue_size,
2851                                 use_queues_flag,
2852                                 num_parallel_samples);
2853
2854                 kernel_set_args(ckPathTraceKernel_queue_enqueue,
2855                                 0,
2856                                 Queue_data,
2857                                 Queue_index,
2858                                 ray_state,
2859                                 dQueue_size);
2860
2861                 kernel_set_args(ckPathTraceKernel_background_buffer_update,
2862                                  0,
2863                                  kgbuffer,
2864                                  d_data,
2865                                  per_sample_output_buffers,
2866                                  d_rng_state,
2867                                  rng_coop,
2868                                  throughput_coop,
2869                                  PathRadiance_coop,
2870                                  Ray_coop,
2871                                  PathState_coop,
2872                                  L_transparent_coop,
2873                                  ray_state,
2874                                  d_w,
2875                                  d_h,
2876                                  d_x,
2877                                  d_y,
2878                                  d_stride,
2879                                  rtile.rng_state_offset_x,
2880                                  rtile.rng_state_offset_y,
2881                                  rtile.buffer_rng_state_stride,
2882                                  work_array,
2883                                  Queue_data,
2884                                  Queue_index,
2885                                  dQueue_size,
2886                                  end_sample,
2887                                  start_sample,
2888 #ifdef __WORK_STEALING__
2889                                  work_pool_wgs,
2890                                  num_samples,
2891 #endif
2892 #ifdef WITH_CYCLES_DEBUG
2893                                  debugdata_coop,
2894 #endif
2895                                  num_parallel_samples);
2896
2897                 kernel_set_args(ckPathTraceKernel_shader_eval,
2898                                 0,
2899                                 kgbuffer,
2900                                 d_data,
2901                                 sd,
2902                                 rng_coop,
2903                                 Ray_coop,
2904                                 PathState_coop,
2905                                 Intersection_coop,
2906                                 ray_state,
2907                                 Queue_data,
2908                                 Queue_index,
2909                                 dQueue_size);
2910
2911                 kernel_set_args(ckPathTraceKernel_holdout_emission_blurring_pathtermination_ao,
2912                                 0,
2913                                 kgbuffer,
2914                                 d_data,
2915                                 sd,
2916                                 per_sample_output_buffers,
2917                                 rng_coop,
2918                                 throughput_coop,
2919                                 L_transparent_coop,
2920                                 PathRadiance_coop,
2921                                 PathState_coop,
2922                                 Intersection_coop,
2923                                 AOAlpha_coop,
2924                                 AOBSDF_coop,
2925                                 AOLightRay_coop,
2926                                 d_w,
2927                                 d_h,
2928                                 d_x,
2929                                 d_y,
2930                                 d_stride,
2931                                 ray_state,
2932                                 work_array,
2933                                 Queue_data,
2934                                 Queue_index,
2935                                 dQueue_size,
2936 #ifdef __WORK_STEALING__
2937                                 start_sample,
2938 #endif
2939                                 num_parallel_samples);
2940