Cycles: Code cleanup, spaces around keywords
[blender-staging.git] / intern / cycles / device / device_opencl.cpp
1 /*
2  * Copyright 2011-2013 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 #ifdef WITH_OPENCL
18
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22
23 #include "clew.h"
24
25 #include "device.h"
26 #include "device_intern.h"
27
28 #include "buffers.h"
29
30 #include "util_foreach.h"
31 #include "util_map.h"
32 #include "util_math.h"
33 #include "util_md5.h"
34 #include "util_opengl.h"
35 #include "util_path.h"
36 #include "util_time.h"
37
38 CCL_NAMESPACE_BEGIN
39
/* Turn an integer memory handle (such as a device_ptr that was produced by
 * casting the result of clCreateBuffer) back into a cl_mem, going through
 * uintptr_t for the integer-to-pointer conversion. */
#define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p))
41
42 static cl_device_type opencl_device_type()
43 {
44         char *device = getenv("CYCLES_OPENCL_TEST");
45
46         if(device) {
47                 if(strcmp(device, "ALL") == 0)
48                         return CL_DEVICE_TYPE_ALL;
49                 else if(strcmp(device, "DEFAULT") == 0)
50                         return CL_DEVICE_TYPE_DEFAULT;
51                 else if(strcmp(device, "CPU") == 0)
52                         return CL_DEVICE_TYPE_CPU;
53                 else if(strcmp(device, "GPU") == 0)
54                         return CL_DEVICE_TYPE_GPU;
55                 else if(strcmp(device, "ACCELERATOR") == 0)
56                         return CL_DEVICE_TYPE_ACCELERATOR;
57         }
58
59         return CL_DEVICE_TYPE_ALL;
60 }
61
/* Kernel debugging is switched on whenever CYCLES_OPENCL_DEBUG exists in the
 * environment; its value is not inspected. */
static bool opencl_kernel_use_debug()
{
        const char *debug_env = getenv("CYCLES_OPENCL_DEBUG");

        return debug_env != NULL;
}
66
67 static bool opencl_kernel_use_advanced_shading(const string& platform)
68 {
69         /* keep this in sync with kernel_types.h! */
70         if(platform == "NVIDIA CUDA")
71                 return true;
72         else if(platform == "Apple")
73                 return false;
74         else if(platform == "AMD Accelerated Parallel Processing")
75                 return false;
76         else if(platform == "Intel(R) OpenCL")
77                 return true;
78
79         return false;
80 }
81
82 static string opencl_kernel_build_options(const string& platform, const string *debug_src = NULL)
83 {
84         string build_options = " -cl-fast-relaxed-math ";
85
86         if(platform == "NVIDIA CUDA")
87                 build_options += "-D__KERNEL_OPENCL_NVIDIA__ -cl-nv-maxrregcount=32 -cl-nv-verbose ";
88
89         else if(platform == "Apple")
90                 build_options += "-D__KERNEL_OPENCL_APPLE__ ";
91
92         else if(platform == "AMD Accelerated Parallel Processing")
93                 build_options += "-D__KERNEL_OPENCL_AMD__ ";
94
95         else if(platform == "Intel(R) OpenCL") {
96                 build_options += "-D__KERNEL_OPENCL_INTEL_CPU__";
97
98                 /* options for gdb source level kernel debugging. this segfaults on linux currently */
99                 if(opencl_kernel_use_debug() && debug_src)
100                         build_options += "-g -s \"" + *debug_src + "\"";
101         }
102
103         if(opencl_kernel_use_debug())
104                 build_options += "-D__KERNEL_OPENCL_DEBUG__ ";
105
106 #ifdef WITH_CYCLES_DEBUG
107         build_options += "-D__KERNEL_DEBUG__ ";
108 #endif
109
110         return build_options;
111 }
112
113 /* thread safe cache for contexts and programs */
114 class OpenCLCache
115 {
116         struct Slot
117         {
118                 thread_mutex *mutex;
119                 cl_context context;
120                 cl_program program;
121
122                 Slot() : mutex(NULL), context(NULL), program(NULL) {}
123
124                 Slot(const Slot &rhs)
125                         : mutex(rhs.mutex)
126                         , context(rhs.context)
127                         , program(rhs.program)
128                 {
129                         /* copy can only happen in map insert, assert that */
130                         assert(mutex == NULL);
131                 }
132
133                 ~Slot()
134                 {
135                         delete mutex;
136                         mutex = NULL;
137                 }
138         };
139
140         /* key is combination of platform ID and device ID */
141         typedef pair<cl_platform_id, cl_device_id> PlatformDevicePair;
142
143         /* map of Slot objects */
144         typedef map<PlatformDevicePair, Slot> CacheMap;
145         CacheMap cache;
146
147         thread_mutex cache_lock;
148
149         /* lazy instantiate */
150         static OpenCLCache &global_instance()
151         {
152                 static OpenCLCache instance;
153                 return instance;
154         }
155
156         OpenCLCache()
157         {
158         }
159
160         ~OpenCLCache()
161         {
162                 /* Intel OpenCL bug raises SIGABRT due to pure virtual call
163                  * so this is disabled. It's not necessary to free objects
164                  * at process exit anyway.
165                  * http://software.intel.com/en-us/forums/topic/370083#comments */
166
167                 //flush();
168         }
169
170         /* lookup something in the cache. If this returns NULL, slot_locker
171          * will be holding a lock for the cache. slot_locker should refer to a
172          * default constructed thread_scoped_lock */
173         template<typename T>
174         static T get_something(cl_platform_id platform, cl_device_id device,
175                 T Slot::*member, thread_scoped_lock &slot_locker)
176         {
177                 assert(platform != NULL);
178
179                 OpenCLCache &self = global_instance();
180
181                 thread_scoped_lock cache_lock(self.cache_lock);
182
183                 pair<CacheMap::iterator,bool> ins = self.cache.insert(
184                         CacheMap::value_type(PlatformDevicePair(platform, device), Slot()));
185
186                 Slot &slot = ins.first->second;
187
188                 /* create slot lock only while holding cache lock */
189                 if(!slot.mutex)
190                         slot.mutex = new thread_mutex;
191
192                 /* need to unlock cache before locking slot, to allow store to complete */
193                 cache_lock.unlock();
194
195                 /* lock the slot */
196                 slot_locker = thread_scoped_lock(*slot.mutex);
197
198                 /* If the thing isn't cached */
199                 if(slot.*member == NULL) {
200                         /* return with the caller's lock holder holding the slot lock */
201                         return NULL;
202                 }
203
204                 /* the item was already cached, release the slot lock */
205                 slot_locker.unlock();
206
207                 return slot.*member;
208         }
209
210         /* store something in the cache. you MUST have tried to get the item before storing to it */
211         template<typename T>
212         static void store_something(cl_platform_id platform, cl_device_id device, T thing,
213                 T Slot::*member, thread_scoped_lock &slot_locker)
214         {
215                 assert(platform != NULL);
216                 assert(device != NULL);
217                 assert(thing != NULL);
218
219                 OpenCLCache &self = global_instance();
220
221                 thread_scoped_lock cache_lock(self.cache_lock);
222                 CacheMap::iterator i = self.cache.find(PlatformDevicePair(platform, device));
223                 cache_lock.unlock();
224
225                 Slot &slot = i->second;
226
227                 /* sanity check */
228                 assert(i != self.cache.end());
229                 assert(slot.*member == NULL);
230
231                 slot.*member = thing;
232
233                 /* unlock the slot */
234                 slot_locker.unlock();
235         }
236
237 public:
238         /* see get_something comment */
239         static cl_context get_context(cl_platform_id platform, cl_device_id device,
240                 thread_scoped_lock &slot_locker)
241         {
242                 cl_context context = get_something<cl_context>(platform, device, &Slot::context, slot_locker);
243
244                 if(!context)
245                         return NULL;
246
247                 /* caller is going to release it when done with it, so retain it */
248                 cl_int ciErr = clRetainContext(context);
249                 assert(ciErr == CL_SUCCESS);
250                 (void)ciErr;
251
252                 return context;
253         }
254
255         /* see get_something comment */
256         static cl_program get_program(cl_platform_id platform, cl_device_id device,
257                 thread_scoped_lock &slot_locker)
258         {
259                 cl_program program = get_something<cl_program>(platform, device, &Slot::program, slot_locker);
260
261                 if(!program)
262                         return NULL;
263
264                 /* caller is going to release it when done with it, so retain it */
265                 cl_int ciErr = clRetainProgram(program);
266                 assert(ciErr == CL_SUCCESS);
267                 (void)ciErr;
268
269                 return program;
270         }
271
272         /* see store_something comment */
273         static void store_context(cl_platform_id platform, cl_device_id device, cl_context context,
274                 thread_scoped_lock &slot_locker)
275         {
276                 store_something<cl_context>(platform, device, context, &Slot::context, slot_locker);
277
278                 /* increment reference count in OpenCL.
279                  * The caller is going to release the object when done with it. */
280                 cl_int ciErr = clRetainContext(context);
281                 assert(ciErr == CL_SUCCESS);
282                 (void)ciErr;
283         }
284
285         /* see store_something comment */
286         static void store_program(cl_platform_id platform, cl_device_id device, cl_program program,
287                 thread_scoped_lock &slot_locker)
288         {
289                 store_something<cl_program>(platform, device, program, &Slot::program, slot_locker);
290
291                 /* increment reference count in OpenCL.
292                  * The caller is going to release the object when done with it. */
293                 cl_int ciErr = clRetainProgram(program);
294                 assert(ciErr == CL_SUCCESS);
295                 (void)ciErr;
296         }
297
298         /* discard all cached contexts and programs
299          * the parameter is a temporary workaround. See OpenCLCache::~OpenCLCache */
300         static void flush()
301         {
302                 OpenCLCache &self = global_instance();
303                 thread_scoped_lock cache_lock(self.cache_lock);
304
305                 foreach(CacheMap::value_type &item, self.cache) {
306                         if(item.second.program != NULL)
307                                 clReleaseProgram(item.second.program);
308                         if(item.second.context != NULL)
309                                 clReleaseContext(item.second.context);
310                 }
311
312                 self.cache.clear();
313         }
314 };
315
316 class OpenCLDevice : public Device
317 {
318 public:
        /* stopped first thing in the destructor */
        DedicatedTaskPool task_pool;
        /* context, either taken from OpenCLCache or created by the constructor */
        cl_context cxContext;
        /* command queue created by the constructor for cdDevice */
        cl_command_queue cqCommandQueue;
        /* platform and device picked by the constructor from info.num */
        cl_platform_id cpPlatform;
        cl_device_id cdDevice;
        /* program and kernels, set up by load_kernels() */
        cl_program cpProgram;
        cl_kernel ckPathTraceKernel;
        cl_kernel ckFilmConvertByteKernel;
        cl_kernel ckFilmConvertHalfFloatKernel;
        cl_kernel ckShaderKernel;
        cl_kernel ckBakeKernel;
        /* result code of the most recent OpenCL call that stored into it */
        cl_int ciErr;

        typedef map<string, device_vector<uchar>*> ConstMemMap;
        typedef map<string, device_ptr> MemMap;

        /* entries are freed with mem_free() and deleted by the destructor */
        ConstMemMap const_mem_map;
        /* NOTE(review): not used in the part of the file visible here;
         * presumably maps names to device memory - confirm against users */
        MemMap mem_map;
        /* 1 byte read-only buffer created by the constructor, 0 if not created */
        device_ptr null_mem;

        /* true only once the constructor ran to completion; checked by load_kernels() */
        bool device_initialized;
        /* CL_PLATFORM_NAME of cpPlatform, used to select kernel build options */
        string platform_name;
341
342         bool opencl_error(cl_int err)
343         {
344                 if(err != CL_SUCCESS) {
345                         string message = string_printf("OpenCL error (%d): %s", err, clewErrorString(err));
346                         if(error_msg == "")
347                                 error_msg = message;
348                         fprintf(stderr, "%s\n", message.c_str());
349                         return true;
350                 }
351
352                 return false;
353         }
354
355         void opencl_error(const string& message)
356         {
357                 if(error_msg == "")
358                         error_msg = message;
359                 fprintf(stderr, "%s\n", message.c_str());
360         }
361
/* Evaluate an OpenCL call once and report a failure, without aborting.
 * On error the message, which contains the text of the call itself, is
 * printed to stderr and stored in error_msg unless a message is already set.
 * Wrapped in do/while(0) so that the macro behaves as a single statement,
 * including inside an unbraced if/else. */
#define opencl_assert(stmt) \
        do { \
                cl_int err = (stmt); \
                \
                if(err != CL_SUCCESS) { \
                        string message = string_printf("OpenCL error: %s in %s", clewErrorString(err), #stmt); \
                        if(error_msg == "") \
                                error_msg = message; \
                        fprintf(stderr, "%s\n", message.c_str()); \
                } \
        } while(0)
373
374         void opencl_assert_err(cl_int err, const char* where)
375         {
376                 if(err != CL_SUCCESS) {
377                         string message = string_printf("OpenCL error (%d): %s in %s", err, clewErrorString(err), where);
378                         if(error_msg == "")
379                                 error_msg = message;
380                         fprintf(stderr, "%s\n", message.c_str());
381 #ifndef NDEBUG
382                         abort();
383 #endif
384                 }
385         }
386
387         OpenCLDevice(DeviceInfo& info, Stats &stats, bool background_)
388         : Device(info, stats, background_)
389         {
390                 cpPlatform = NULL;
391                 cdDevice = NULL;
392                 cxContext = NULL;
393                 cqCommandQueue = NULL;
394                 cpProgram = NULL;
395                 ckPathTraceKernel = NULL;
396                 ckFilmConvertByteKernel = NULL;
397                 ckFilmConvertHalfFloatKernel = NULL;
398                 ckShaderKernel = NULL;
399                 ckBakeKernel = NULL;
400                 null_mem = 0;
401                 device_initialized = false;
402
403                 /* setup platform */
404                 cl_uint num_platforms;
405
406                 ciErr = clGetPlatformIDs(0, NULL, &num_platforms);
407                 if(opencl_error(ciErr))
408                         return;
409
410                 if(num_platforms == 0) {
411                         opencl_error("OpenCL: no platforms found.");
412                         return;
413                 }
414
415                 vector<cl_platform_id> platforms(num_platforms, NULL);
416
417                 ciErr = clGetPlatformIDs(num_platforms, &platforms[0], NULL);
418                 if(opencl_error(ciErr)) {
419                         fprintf(stderr, "clGetPlatformIDs failed \n");
420                         return;
421                 }
422
423                 int num_base = 0;
424                 int total_devices = 0;
425
426                 for(int platform = 0; platform < num_platforms; platform++) {
427                         cl_uint num_devices;
428
429                         if(opencl_error(clGetDeviceIDs(platforms[platform], opencl_device_type(), 0, NULL, &num_devices)))
430                                 return;
431
432                         total_devices += num_devices;
433
434                         if(info.num - num_base >= num_devices) {
435                                 /* num doesn't refer to a device in this platform */
436                                 num_base += num_devices;
437                                 continue;
438                         }
439
440                         /* device is in this platform */
441                         cpPlatform = platforms[platform];
442
443                         /* get devices */
444                         vector<cl_device_id> device_ids(num_devices, NULL);
445
446                         if(opencl_error(clGetDeviceIDs(cpPlatform, opencl_device_type(), num_devices, &device_ids[0], NULL))) {
447                                 fprintf(stderr, "clGetDeviceIDs failed \n");
448                                 return;
449                         }
450
451                         cdDevice = device_ids[info.num - num_base];
452
453                         char name[256];
454                         clGetPlatformInfo(cpPlatform, CL_PLATFORM_NAME, sizeof(name), &name, NULL);
455                         platform_name = name;
456
457                         break;
458                 }
459
460                 if(total_devices == 0) {
461                         opencl_error("OpenCL: no devices found.");
462                         return;
463                 }
464                 else if(!cdDevice) {
465                         opencl_error("OpenCL: specified device not found.");
466                         return;
467                 }
468
469                 {
470                         /* try to use cached context */
471                         thread_scoped_lock cache_locker;
472                         cxContext = OpenCLCache::get_context(cpPlatform, cdDevice, cache_locker);
473
474                         if(cxContext == NULL) {
475                                 /* create context properties array to specify platform */
476                                 const cl_context_properties context_props[] = {
477                                         CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform,
478                                         0, 0
479                                 };
480
481                                 /* create context */
482                                 cxContext = clCreateContext(context_props, 1, &cdDevice,
483                                         context_notify_callback, cdDevice, &ciErr);
484
485                                 if(opencl_error(ciErr)) {
486                                         opencl_error("OpenCL: clCreateContext failed");
487                                         return;
488                                 }
489
490                                 /* cache it */
491                                 OpenCLCache::store_context(cpPlatform, cdDevice, cxContext, cache_locker);
492                         }
493                 }
494
495                 cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr);
496                 if(opencl_error(ciErr))
497                         return;
498
499                 null_mem = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_ONLY, 1, NULL, &ciErr);
500                 if(opencl_error(ciErr))
501                         return;
502
503                 fprintf(stderr,"Device init succes\n");
504                 device_initialized = true;
505         }
506
507         static void CL_CALLBACK context_notify_callback(const char *err_info,
508                 const void * /*private_info*/, size_t /*cb*/, void *user_data)
509         {
510                 char name[256];
511                 clGetDeviceInfo((cl_device_id)user_data, CL_DEVICE_NAME, sizeof(name), &name, NULL);
512
513                 fprintf(stderr, "OpenCL error (%s): %s\n", name, err_info);
514         }
515
516         bool opencl_version_check()
517         {
518                 char version[256];
519
520                 int major, minor, req_major = 1, req_minor = 1;
521
522                 clGetPlatformInfo(cpPlatform, CL_PLATFORM_VERSION, sizeof(version), &version, NULL);
523
524                 if(sscanf(version, "OpenCL %d.%d", &major, &minor) < 2) {
525                         opencl_error(string_printf("OpenCL: failed to parse platform version string (%s).", version));
526                         return false;
527                 }
528
529                 if(!((major == req_major && minor >= req_minor) || (major > req_major))) {
530                         opencl_error(string_printf("OpenCL: platform version 1.1 or later required, found %d.%d", major, minor));
531                         return false;
532                 }
533
534                 clGetDeviceInfo(cdDevice, CL_DEVICE_OPENCL_C_VERSION, sizeof(version), &version, NULL);
535
536                 if(sscanf(version, "OpenCL C %d.%d", &major, &minor) < 2) {
537                         opencl_error(string_printf("OpenCL: failed to parse OpenCL C version string (%s).", version));
538                         return false;
539                 }
540
541                 if(!((major == req_major && minor >= req_minor) || (major > req_major))) {
542                         opencl_error(string_printf("OpenCL: C version 1.1 or later required, found %d.%d", major, minor));
543                         return false;
544                 }
545
546                 return true;
547         }
548
549         bool load_binary(const string& kernel_path, const string& clbin, const string *debug_src = NULL)
550         {
551                 /* read binary into memory */
552                 vector<uint8_t> binary;
553
554                 if(!path_read_binary(clbin, binary)) {
555                         opencl_error(string_printf("OpenCL failed to read cached binary %s.", clbin.c_str()));
556                         return false;
557                 }
558
559                 /* create program */
560                 cl_int status;
561                 size_t size = binary.size();
562                 const uint8_t *bytes = &binary[0];
563
564                 cpProgram = clCreateProgramWithBinary(cxContext, 1, &cdDevice,
565                         &size, &bytes, &status, &ciErr);
566
567                 if(opencl_error(status) || opencl_error(ciErr)) {
568                         opencl_error(string_printf("OpenCL failed create program from cached binary %s.", clbin.c_str()));
569                         return false;
570                 }
571
572                 if(!build_kernel(kernel_path, debug_src))
573                         return false;
574
575                 return true;
576         }
577
578         bool save_binary(const string& clbin)
579         {
580                 size_t size = 0;
581                 clGetProgramInfo(cpProgram, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL);
582
583                 if(!size)
584                         return false;
585
586                 vector<uint8_t> binary(size);
587                 uint8_t *bytes = &binary[0];
588
589                 clGetProgramInfo(cpProgram, CL_PROGRAM_BINARIES, sizeof(uint8_t*), &bytes, NULL);
590
591                 if(!path_write_binary(clbin, binary)) {
592                         opencl_error(string_printf("OpenCL failed to write cached binary %s.", clbin.c_str()));
593                         return false;
594                 }
595
596                 return true;
597         }
598
599         bool build_kernel(const string& /*kernel_path*/, const string *debug_src = NULL)
600         {
601                 string build_options = opencl_kernel_build_options(platform_name, debug_src);
602         
603                 ciErr = clBuildProgram(cpProgram, 0, NULL, build_options.c_str(), NULL, NULL);
604
605                 /* show warnings even if build is successful */
606                 size_t ret_val_size = 0;
607
608                 clGetProgramBuildInfo(cpProgram, cdDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
609
610                 if(ret_val_size > 1) {
611                         vector<char> build_log(ret_val_size+1);
612                         clGetProgramBuildInfo(cpProgram, cdDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, &build_log[0], NULL);
613
614                         build_log[ret_val_size] = '\0';
615                         fprintf(stderr, "OpenCL kernel build output:\n");
616                         fprintf(stderr, "%s\n", &build_log[0]);
617                 }
618
619                 if(ciErr != CL_SUCCESS) {
620                         opencl_error("OpenCL build failed: errors in console");
621                         return false;
622                 }
623
624                 return true;
625         }
626
627         bool compile_kernel(const string& kernel_path, const string& kernel_md5, const string *debug_src = NULL)
628         {
629                 /* we compile kernels consisting of many files. unfortunately opencl
630                  * kernel caches do not seem to recognize changes in included files.
631                  * so we force recompile on changes by adding the md5 hash of all files */
632                 string source = "#include \"kernel.cl\" // " + kernel_md5 + "\n";
633                 source = path_source_replace_includes(source, kernel_path);
634
635                 if(debug_src)
636                         path_write_text(*debug_src, source);
637
638                 size_t source_len = source.size();
639                 const char *source_str = source.c_str();
640
641                 cpProgram = clCreateProgramWithSource(cxContext, 1, &source_str, &source_len, &ciErr);
642
643                 if(opencl_error(ciErr))
644                         return false;
645
646                 double starttime = time_dt();
647                 printf("Compiling OpenCL kernel ...\n");
648
649                 if(!build_kernel(kernel_path, debug_src))
650                         return false;
651
652                 printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);
653
654                 return true;
655         }
656
657         string device_md5_hash()
658         {
659                 MD5Hash md5;
660                 char version[256], driver[256], name[256], vendor[256];
661
662                 clGetPlatformInfo(cpPlatform, CL_PLATFORM_VENDOR, sizeof(vendor), &vendor, NULL);
663                 clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL);
664                 clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(name), &name, NULL);
665                 clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(driver), &driver, NULL);
666
667                 md5.append((uint8_t*)vendor, strlen(vendor));
668                 md5.append((uint8_t*)version, strlen(version));
669                 md5.append((uint8_t*)name, strlen(name));
670                 md5.append((uint8_t*)driver, strlen(driver));
671
672                 string options = opencl_kernel_build_options(platform_name);
673                 md5.append((uint8_t*)options.c_str(), options.size());
674
675                 return md5.get_hex();
676         }
677
678         bool load_kernels(bool /*experimental*/)
679         {
680                 /* verify if device was initialized */
681                 if(!device_initialized) {
682                         fprintf(stderr, "OpenCL: failed to initialize device.\n");
683                         return false;
684                 }
685
686                 /* try to use cached kernel */
687                 thread_scoped_lock cache_locker;
688                 cpProgram = OpenCLCache::get_program(cpPlatform, cdDevice, cache_locker);
689
690                 if(!cpProgram) {
691                         /* verify we have right opencl version */
692                         if(!opencl_version_check())
693                                 return false;
694
695                         /* md5 hash to detect changes */
696                         string kernel_path = path_get("kernel");
697                         string kernel_md5 = path_files_md5_hash(kernel_path);
698                         string device_md5 = device_md5_hash();
699
700                         /* path to cached binary */
701                         string clbin = string_printf("cycles_kernel_%s_%s.clbin", device_md5.c_str(), kernel_md5.c_str());
702                         clbin = path_user_get(path_join("cache", clbin));
703
704                         /* path to preprocessed source for debugging */
705                         string clsrc, *debug_src = NULL;
706
707                         if(opencl_kernel_use_debug()) {
708                                 clsrc = string_printf("cycles_kernel_%s_%s.cl", device_md5.c_str(), kernel_md5.c_str());
709                                 clsrc = path_user_get(path_join("cache", clsrc));
710                                 debug_src = &clsrc;
711                         }
712
713                         /* if exists already, try use it */
714                         if(path_exists(clbin) && load_binary(kernel_path, clbin, debug_src)) {
715                                 /* kernel loaded from binary */
716                         }
717                         else {
718                                 /* if does not exist or loading binary failed, compile kernel */
719                                 if(!compile_kernel(kernel_path, kernel_md5, debug_src))
720                                         return false;
721
722                                 /* save binary for reuse */
723                                 if(!save_binary(clbin))
724                                         return false;
725                         }
726
727                         /* cache the program */
728                         OpenCLCache::store_program(cpPlatform, cdDevice, cpProgram, cache_locker);
729                 }
730
731                 /* find kernels */
732                 ckPathTraceKernel = clCreateKernel(cpProgram, "kernel_ocl_path_trace", &ciErr);
733                 if(opencl_error(ciErr))
734                         return false;
735
736                 ckFilmConvertByteKernel = clCreateKernel(cpProgram, "kernel_ocl_convert_to_byte", &ciErr);
737                 if(opencl_error(ciErr))
738                         return false;
739
740                 ckFilmConvertHalfFloatKernel = clCreateKernel(cpProgram, "kernel_ocl_convert_to_half_float", &ciErr);
741                 if(opencl_error(ciErr))
742                         return false;
743
744                 ckShaderKernel = clCreateKernel(cpProgram, "kernel_ocl_shader", &ciErr);
745                 if(opencl_error(ciErr))
746                         return false;
747
748                 ckBakeKernel = clCreateKernel(cpProgram, "kernel_ocl_bake", &ciErr);
749                 if(opencl_error(ciErr))
750                         return false;
751
752                 return true;
753         }
754
755         ~OpenCLDevice()
756         {
757                 task_pool.stop();
758
759                 if(null_mem)
760                         clReleaseMemObject(CL_MEM_PTR(null_mem));
761
762                 ConstMemMap::iterator mt;
763                 for(mt = const_mem_map.begin(); mt != const_mem_map.end(); mt++) {
764                         mem_free(*(mt->second));
765                         delete mt->second;
766                 }
767
768                 if(ckPathTraceKernel)
769                         clReleaseKernel(ckPathTraceKernel);  
770                 if(ckFilmConvertByteKernel)
771                         clReleaseKernel(ckFilmConvertByteKernel);  
772                 if(ckFilmConvertHalfFloatKernel)
773                         clReleaseKernel(ckFilmConvertHalfFloatKernel);  
774                 if(cpProgram)
775                         clReleaseProgram(cpProgram);
776                 if(cqCommandQueue)
777                         clReleaseCommandQueue(cqCommandQueue);
778                 if(cxContext)
779                         clReleaseContext(cxContext);
780         }
781
782         void mem_alloc(device_memory& mem, MemoryType type)
783         {
784                 size_t size = mem.memory_size();
785
786                 cl_mem_flags mem_flag;
787                 void *mem_ptr = NULL;
788
789                 if(type == MEM_READ_ONLY)
790                         mem_flag = CL_MEM_READ_ONLY;
791                 else if(type == MEM_WRITE_ONLY)
792                         mem_flag = CL_MEM_WRITE_ONLY;
793                 else
794                         mem_flag = CL_MEM_READ_WRITE;
795
796                 mem.device_pointer = (device_ptr)clCreateBuffer(cxContext, mem_flag, size, mem_ptr, &ciErr);
797
798                 opencl_assert_err(ciErr, "clCreateBuffer");
799
800                 stats.mem_alloc(size);
801                 mem.device_size = size;
802         }
803
804         void mem_copy_to(device_memory& mem)
805         {
806                 /* this is blocking */
807                 size_t size = mem.memory_size();
808                 opencl_assert(clEnqueueWriteBuffer(cqCommandQueue, CL_MEM_PTR(mem.device_pointer), CL_TRUE, 0, size, (void*)mem.data_pointer, 0, NULL, NULL));
809         }
810
811         void mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
812         {
813                 size_t offset = elem*y*w;
814                 size_t size = elem*w*h;
815
816                 opencl_assert(clEnqueueReadBuffer(cqCommandQueue, CL_MEM_PTR(mem.device_pointer), CL_TRUE, offset, size, (uchar*)mem.data_pointer + offset, 0, NULL, NULL));
817         }
818
819         void mem_zero(device_memory& mem)
820         {
821                 if(mem.device_pointer) {
822                         memset((void*)mem.data_pointer, 0, mem.memory_size());
823                         mem_copy_to(mem);
824                 }
825         }
826
827         void mem_free(device_memory& mem)
828         {
829                 if(mem.device_pointer) {
830                         opencl_assert(clReleaseMemObject(CL_MEM_PTR(mem.device_pointer)));
831                         mem.device_pointer = 0;
832
833                         stats.mem_free(mem.device_size);
834                         mem.device_size = 0;
835                 }
836         }
837
838         void const_copy_to(const char *name, void *host, size_t size)
839         {
840                 ConstMemMap::iterator i = const_mem_map.find(name);
841
842                 if(i == const_mem_map.end()) {
843                         device_vector<uchar> *data = new device_vector<uchar>();
844                         data->copy((uchar*)host, size);
845
846                         mem_alloc(*data, MEM_READ_ONLY);
847                         i = const_mem_map.insert(ConstMemMap::value_type(name, data)).first;
848                 }
849                 else {
850                         device_vector<uchar> *data = i->second;
851                         data->copy((uchar*)host, size);
852                 }
853
854                 mem_copy_to(*i->second);
855         }
856
857         void tex_alloc(const char *name,
858                        device_memory& mem,
859                        InterpolationType /*interpolation*/,
860                        bool /*periodic*/)
861         {
862                 mem_alloc(mem, MEM_READ_ONLY);
863                 mem_copy_to(mem);
864                 assert(mem_map.find(name) == mem_map.end());
865                 mem_map.insert(MemMap::value_type(name, mem.device_pointer));
866         }
867
868         void tex_free(device_memory& mem)
869         {
870                 if(mem.device_pointer) {
871                         foreach(const MemMap::value_type& value, mem_map) {
872                                 if(value.second == mem.device_pointer) {
873                                         mem_map.erase(value.first);
874                                         break;
875                                 }
876                         }
877
878                         mem_free(mem);
879                 }
880         }
881
882         size_t global_size_round_up(int group_size, int global_size)
883         {
884                 int r = global_size % group_size;
885                 return global_size + ((r == 0)? 0: group_size - r);
886         }
887
888         void enqueue_kernel(cl_kernel kernel, size_t w, size_t h)
889         {
890                 size_t workgroup_size, max_work_items[3];
891
892                 clGetKernelWorkGroupInfo(kernel, cdDevice,
893                         CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL);
894                 clGetDeviceInfo(cdDevice,
895                         CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, max_work_items, NULL);
896         
897                 /* try to divide evenly over 2 dimensions */
898                 size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1);
899                 size_t local_size[2] = {sqrt_workgroup_size, sqrt_workgroup_size};
900
901                 /* some implementations have max size 1 on 2nd dimension */
902                 if(local_size[1] > max_work_items[1]) {
903                         local_size[0] = workgroup_size/max_work_items[1];
904                         local_size[1] = max_work_items[1];
905                 }
906
907                 size_t global_size[2] = {global_size_round_up(local_size[0], w), global_size_round_up(local_size[1], h)};
908
909                 /* run kernel */
910                 opencl_assert(clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL));
911                 opencl_assert(clFlush(cqCommandQueue));
912         }
913
914         void path_trace(RenderTile& rtile, int sample)
915         {
916                 /* cast arguments to cl types */
917                 cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
918                 cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
919                 cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state);
920                 cl_int d_x = rtile.x;
921                 cl_int d_y = rtile.y;
922                 cl_int d_w = rtile.w;
923                 cl_int d_h = rtile.h;
924                 cl_int d_sample = sample;
925                 cl_int d_offset = rtile.offset;
926                 cl_int d_stride = rtile.stride;
927
928                 /* sample arguments */
929                 cl_uint narg = 0;
930
931                 opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_data), (void*)&d_data));
932                 opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_buffer), (void*)&d_buffer));
933                 opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_rng_state), (void*)&d_rng_state));
934
935 #define KERNEL_TEX(type, ttype, name) \
936         set_kernel_arg_mem(ckPathTraceKernel, &narg, #name);
937 #include "kernel_textures.h"
938
939                 opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_sample), (void*)&d_sample));
940                 opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_x), (void*)&d_x));
941                 opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_y), (void*)&d_y));
942                 opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_w), (void*)&d_w));
943                 opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_h), (void*)&d_h));
944                 opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_offset), (void*)&d_offset));
945                 opencl_assert(clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_stride), (void*)&d_stride));
946
947                 enqueue_kernel(ckPathTraceKernel, d_w, d_h);
948         }
949
950         void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name)
951         {
952                 cl_mem ptr;
953
954                 MemMap::iterator i = mem_map.find(name);
955                 if(i != mem_map.end()) {
956                         ptr = CL_MEM_PTR(i->second);
957                 }
958                 else {
959                         /* work around NULL not working, even though the spec says otherwise */
960                         ptr = CL_MEM_PTR(null_mem);
961                 }
962                 
963                 opencl_assert(clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void*)&ptr));
964         }
965
966         void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half)
967         {
968                 /* cast arguments to cl types */
969                 cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
970                 cl_mem d_rgba = (rgba_byte)? CL_MEM_PTR(rgba_byte): CL_MEM_PTR(rgba_half);
971                 cl_mem d_buffer = CL_MEM_PTR(buffer);
972                 cl_int d_x = task.x;
973                 cl_int d_y = task.y;
974                 cl_int d_w = task.w;
975                 cl_int d_h = task.h;
976                 cl_float d_sample_scale = 1.0f/(task.sample + 1);
977                 cl_int d_offset = task.offset;
978                 cl_int d_stride = task.stride;
979
980                 /* sample arguments */
981                 cl_uint narg = 0;
982
983
984                 cl_kernel ckFilmConvertKernel = (rgba_byte)? ckFilmConvertByteKernel: ckFilmConvertHalfFloatKernel;
985
986                 opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_data), (void*)&d_data));
987                 opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_rgba), (void*)&d_rgba));
988                 opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_buffer), (void*)&d_buffer));
989
990 #define KERNEL_TEX(type, ttype, name) \
991         set_kernel_arg_mem(ckFilmConvertKernel, &narg, #name);
992 #include "kernel_textures.h"
993
994                 opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_sample_scale), (void*)&d_sample_scale));
995                 opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_x), (void*)&d_x));
996                 opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_y), (void*)&d_y));
997                 opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_w), (void*)&d_w));
998                 opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_h), (void*)&d_h));
999                 opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_offset), (void*)&d_offset));
1000                 opencl_assert(clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_stride), (void*)&d_stride));
1001
1002
1003
1004                 enqueue_kernel(ckFilmConvertKernel, d_w, d_h);
1005         }
1006
1007         void shader(DeviceTask& task)
1008         {
1009                 /* cast arguments to cl types */
1010                 cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
1011                 cl_mem d_input = CL_MEM_PTR(task.shader_input);
1012                 cl_mem d_output = CL_MEM_PTR(task.shader_output);
1013                 cl_int d_shader_eval_type = task.shader_eval_type;
1014                 cl_int d_shader_x = task.shader_x;
1015                 cl_int d_shader_w = task.shader_w;
1016                 cl_int d_offset = task.offset;
1017
1018                 /* sample arguments */
1019                 cl_uint narg = 0;
1020
1021                 cl_kernel kernel;
1022
1023                 if(task.shader_eval_type >= SHADER_EVAL_BAKE)
1024                         kernel = ckBakeKernel;
1025                 else
1026                         kernel = ckShaderKernel;
1027
1028                 for(int sample = 0; sample < task.num_samples; sample++) {
1029
1030                         if(task.get_cancel())
1031                                 break;
1032
1033                         cl_int d_sample = sample;
1034
1035                         opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_data), (void*)&d_data));
1036                         opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_input), (void*)&d_input));
1037                         opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_output), (void*)&d_output));
1038
1039 #define KERNEL_TEX(type, ttype, name) \
1040                 set_kernel_arg_mem(kernel, &narg, #name);
1041 #include "kernel_textures.h"
1042
1043                         opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_eval_type), (void*)&d_shader_eval_type));
1044                         opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_x), (void*)&d_shader_x));
1045                         opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_shader_w), (void*)&d_shader_w));
1046                         opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_offset), (void*)&d_offset));
1047                         opencl_assert(clSetKernelArg(kernel, narg++, sizeof(d_sample), (void*)&d_sample));
1048
1049                         enqueue_kernel(kernel, task.shader_w, 1);
1050
1051                         task.update_progress(NULL);
1052                 }
1053         }
1054
1055         void thread_run(DeviceTask *task)
1056         {
1057                 if(task->type == DeviceTask::FILM_CONVERT) {
1058                         film_convert(*task, task->buffer, task->rgba_byte, task->rgba_half);
1059                 }
1060                 else if(task->type == DeviceTask::SHADER) {
1061                         shader(*task);
1062                 }
1063                 else if(task->type == DeviceTask::PATH_TRACE) {
1064                         RenderTile tile;
1065                         
1066                         /* keep rendering tiles until done */
1067                         while(task->acquire_tile(this, tile)) {
1068                                 int start_sample = tile.start_sample;
1069                                 int end_sample = tile.start_sample + tile.num_samples;
1070
1071                                 for(int sample = start_sample; sample < end_sample; sample++) {
1072                                         if(task->get_cancel()) {
1073                                                 if(task->need_finish_queue == false)
1074                                                         break;
1075                                         }
1076
1077                                         path_trace(tile, sample);
1078
1079                                         tile.sample = sample + 1;
1080
1081                                         task->update_progress(&tile);
1082                                 }
1083
1084                                 task->release_tile(tile);
1085                         }
1086                 }
1087         }
1088
1089         class OpenCLDeviceTask : public DeviceTask {
1090         public:
1091                 OpenCLDeviceTask(OpenCLDevice *device, DeviceTask& task)
1092                 : DeviceTask(task)
1093                 {
1094                         run = function_bind(&OpenCLDevice::thread_run, device, this);
1095                 }
1096         };
1097
1098         int get_split_task_count(DeviceTask& /*task*/)
1099         {
1100                 return 1;
1101         }
1102
1103         void task_add(DeviceTask& task)
1104         {
1105                 task_pool.push(new OpenCLDeviceTask(this, task));
1106         }
1107
1108         void task_wait()
1109         {
1110                 task_pool.wait();
1111         }
1112
1113         void task_cancel()
1114         {
1115                 task_pool.cancel();
1116         }
1117 };
1118
1119 Device *device_opencl_create(DeviceInfo& info, Stats &stats, bool background)
1120 {
1121         return new OpenCLDevice(info, stats, background);
1122 }
1123
1124 bool device_opencl_init(void) {
1125         static bool initialized = false;
1126         static bool result = false;
1127
1128         if(initialized)
1129                 return result;
1130
1131         initialized = true;
1132
1133         // OpenCL disabled for now, only works with this environment variable set
1134         if(!getenv("CYCLES_OPENCL_TEST")) {
1135                 result = false;
1136         }
1137         else {
1138                 result = clewInit() == CLEW_SUCCESS;
1139         }
1140
1141         return result;
1142 }
1143
1144 void device_opencl_info(vector<DeviceInfo>& devices)
1145 {
1146         vector<cl_device_id> device_ids;
1147         cl_uint num_devices = 0;
1148         vector<cl_platform_id> platform_ids;
1149         cl_uint num_platforms = 0;
1150
1151         /* get devices */
1152         if(clGetPlatformIDs(0, NULL, &num_platforms) != CL_SUCCESS || num_platforms == 0)
1153                 return;
1154         
1155         platform_ids.resize(num_platforms);
1156
1157         if(clGetPlatformIDs(num_platforms, &platform_ids[0], NULL) != CL_SUCCESS)
1158                 return;
1159
1160         /* devices are numbered consecutively across platforms */
1161         int num_base = 0;
1162
1163         for(int platform = 0; platform < num_platforms; platform++, num_base += num_devices) {
1164                 num_devices = 0;
1165                 if(clGetDeviceIDs(platform_ids[platform], opencl_device_type(), 0, NULL, &num_devices) != CL_SUCCESS || num_devices == 0)
1166                         continue;
1167
1168                 device_ids.resize(num_devices);
1169
1170                 if(clGetDeviceIDs(platform_ids[platform], opencl_device_type(), num_devices, &device_ids[0], NULL) != CL_SUCCESS)
1171                         continue;
1172
1173                 char pname[256];
1174                 clGetPlatformInfo(platform_ids[platform], CL_PLATFORM_NAME, sizeof(pname), &pname, NULL);
1175                 string platform_name = pname;
1176
1177                 /* add devices */
1178                 for(int num = 0; num < num_devices; num++) {
1179                         cl_device_id device_id = device_ids[num];
1180                         char name[1024] = "\0";
1181
1182                         if(clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(name), &name, NULL) != CL_SUCCESS)
1183                                 continue;
1184
1185                         DeviceInfo info;
1186
1187                         info.type = DEVICE_OPENCL;
1188                         info.description = string(name);
1189                         info.num = num_base + num;
1190                         info.id = string_printf("OPENCL_%d", info.num);
1191                         /* we don't know if it's used for display, but assume it is */
1192                         info.display_device = true;
1193                         info.advanced_shading = opencl_kernel_use_advanced_shading(platform_name);
1194                         info.pack_images = true;
1195
1196                         devices.push_back(info);
1197                 }
1198         }
1199 }
1200
1201 string device_opencl_capabilities(void)
1202 {
1203         /* TODO(sergey): Not implemented yet. */
1204         return "";
1205 }
1206
1207 CCL_NAMESPACE_END
1208
1209 #endif /* WITH_OPENCL */