Add BLI_strcpy_rlen and replace strcat, which was used in a misleading way.
[blender.git] / intern / cycles / device / device_opencl.cpp
1 /*
2  * Copyright 2011, Blender Foundation.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public License
6  * as published by the Free Software Foundation; either version 2
7  * of the License, or (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write to the Free Software Foundation,
16  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17  */
18
19 #ifdef WITH_OPENCL
20
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include <string.h>
24
25 #include "device.h"
26 #include "device_intern.h"
27
28 #include "buffers.h"
29
30 #include "util_foreach.h"
31 #include "util_map.h"
32 #include "util_math.h"
33 #include "util_md5.h"
34 #include "util_opencl.h"
35 #include "util_opengl.h"
36 #include "util_path.h"
37 #include "util_time.h"
38
39 CCL_NAMESPACE_BEGIN
40
/* Convert an integer handle (for example a device_ptr that was produced by
 * casting a cl_mem) back into the cl_mem expected by the OpenCL API. */
#define CL_MEM_PTR(p) ((cl_mem)(uintptr_t)(p))
42
43 static cl_device_type opencl_device_type()
44 {
45         char *device = getenv("CYCLES_OPENCL_TEST");
46
47         if(device) {
48                 if(strcmp(device, "ALL") == 0)
49                         return CL_DEVICE_TYPE_ALL;
50                 else if(strcmp(device, "DEFAULT") == 0)
51                         return CL_DEVICE_TYPE_DEFAULT;
52                 else if(strcmp(device, "CPU") == 0)
53                         return CL_DEVICE_TYPE_CPU;
54                 else if(strcmp(device, "GPU") == 0)
55                         return CL_DEVICE_TYPE_GPU;
56                 else if(strcmp(device, "ACCELERATOR") == 0)
57                         return CL_DEVICE_TYPE_ACCELERATOR;
58         }
59
60         return CL_DEVICE_TYPE_ALL;
61 }
62
/* Kernel debugging is enabled whenever CYCLES_OPENCL_DEBUG is present in the
 * environment; its value is not inspected. */
static bool opencl_kernel_use_debug()
{
        const char *debug_flag = getenv("CYCLES_OPENCL_DEBUG");

        return debug_flag != NULL;
}
67
/* Tell whether kernels for the given platform name are built with advanced
 * shading. Only the NVIDIA and Intel platforms enable it; Apple, AMD and any
 * unknown platform do not. */
static bool opencl_kernel_use_advanced_shading(const string& platform)
{
        /* keep this in sync with kernel_types.h! */
        const bool is_nvidia = (platform == "NVIDIA CUDA");
        const bool is_intel = (platform == "Intel(R) OpenCL");

        return is_nvidia || is_intel;
}
82
83 static string opencl_kernel_build_options(const string& platform, const string *debug_src = NULL)
84 {
85         string build_options = " -cl-fast-relaxed-math ";
86
87         if(platform == "NVIDIA CUDA")
88                 build_options += "-D__KERNEL_OPENCL_NVIDIA__ -cl-nv-maxrregcount=24 -cl-nv-verbose ";
89
90         else if(platform == "Apple")
91                 build_options += "-D__KERNEL_OPENCL_APPLE__ -Wno-missing-prototypes ";
92
93         else if(platform == "AMD Accelerated Parallel Processing")
94                 build_options += "-D__KERNEL_OPENCL_AMD__ ";
95
96         else if(platform == "Intel(R) OpenCL") {
97                 build_options += "-D__KERNEL_OPENCL_INTEL_CPU__";
98
99                 /* options for gdb source level kernel debugging. this segfaults on linux currently */
100                 if(opencl_kernel_use_debug() && debug_src)
101                         build_options += "-g -s \"" + *debug_src + "\"";
102         }
103
104         if(opencl_kernel_use_debug())
105                 build_options += "-D__KERNEL_OPENCL_DEBUG__ ";
106
107         if(opencl_kernel_use_advanced_shading(platform))
108                 build_options += "-D__KERNEL_OPENCL_NEED_ADVANCED_SHADING__ ";
109         
110         return build_options;
111 }
112
113 /* thread safe cache for contexts and programs */
114 class OpenCLCache
115 {
116         struct Slot
117         {
118                 thread_mutex *mutex;
119                 cl_context context;
120                 cl_program program;
121
122                 Slot() : mutex(NULL), context(NULL), program(NULL) {}
123
124                 Slot(const Slot &rhs)
125                         : mutex(rhs.mutex)
126                         , context(rhs.context)
127                         , program(rhs.program)
128                 {
129                         /* copy can only happen in map insert, assert that */
130                         assert(mutex == NULL);
131                 }
132
133                 ~Slot()
134                 {
135                         delete mutex;
136                         mutex = NULL;
137                 }
138         };
139
140         /* key is combination of platform ID and device ID */
141         typedef pair<cl_platform_id, cl_device_id> PlatformDevicePair;
142
143         /* map of Slot objects */
144         typedef map<PlatformDevicePair, Slot> CacheMap;
145         CacheMap cache;
146
147         thread_mutex cache_lock;
148
149         /* lazy instantiate */
150         static OpenCLCache &global_instance()
151         {
152                 static OpenCLCache instance;
153                 return instance;
154         }
155
156         OpenCLCache()
157         {
158         }
159
160         ~OpenCLCache()
161         {
162                 /* Intel OpenCL bug raises SIGABRT due to pure virtual call
163                  * so this is disabled. It's not necessary to free objects
164                  * at process exit anyway.
165                  * http://software.intel.com/en-us/forums/topic/370083#comments */
166
167                 //flush();
168         }
169
170         /* lookup something in the cache. If this returns NULL, slot_locker
171          * will be holding a lock for the cache. slot_locker should refer to a
172          * default constructed thread_scoped_lock */
173         template<typename T>
174         static T get_something(cl_platform_id platform, cl_device_id device,
175                 T Slot::*member, thread_scoped_lock &slot_locker)
176         {
177                 assert(platform != NULL);
178
179                 OpenCLCache &self = global_instance();
180
181                 thread_scoped_lock cache_lock(self.cache_lock);
182
183                 pair<CacheMap::iterator,bool> ins = self.cache.insert(
184                         CacheMap::value_type(PlatformDevicePair(platform, device), Slot()));
185
186                 Slot &slot = ins.first->second;
187
188                 /* create slot lock only while holding cache lock */
189                 if(!slot.mutex)
190                         slot.mutex = new thread_mutex;
191
192                 /* need to unlock cache before locking slot, to allow store to complete */
193                 cache_lock.unlock();
194
195                 /* lock the slot */
196                 slot_locker = thread_scoped_lock(*slot.mutex);
197
198                 /* If the thing isn't cached */
199                 if(slot.*member == NULL) {
200                         /* return with the caller's lock holder holding the slot lock */
201                         return NULL;
202                 }
203
204                 /* the item was already cached, release the slot lock */
205                 slot_locker.unlock();
206
207                 return slot.*member;
208         }
209
210         /* store something in the cache. you MUST have tried to get the item before storing to it */
211         template<typename T>
212         static void store_something(cl_platform_id platform, cl_device_id device, T thing,
213                 T Slot::*member, thread_scoped_lock &slot_locker)
214         {
215                 assert(platform != NULL);
216                 assert(device != NULL);
217                 assert(thing != NULL);
218
219                 OpenCLCache &self = global_instance();
220
221                 thread_scoped_lock cache_lock(self.cache_lock);
222                 CacheMap::iterator i = self.cache.find(PlatformDevicePair(platform, device));
223                 cache_lock.unlock();
224
225                 Slot &slot = i->second;
226
227                 /* sanity check */
228                 assert(i != self.cache.end());
229                 assert(slot.*member == NULL);
230
231                 slot.*member = thing;
232
233                 /* unlock the slot */
234                 slot_locker.unlock();
235         }
236
237 public:
238         /* see get_something comment */
239         static cl_context get_context(cl_platform_id platform, cl_device_id device,
240                 thread_scoped_lock &slot_locker)
241         {
242                 cl_context context = get_something<cl_context>(platform, device, &Slot::context, slot_locker);
243
244                 /* caller is going to release it when done with it, so retain it */
245                 cl_int ciErr = clRetainContext(context);
246                 assert(ciErr == CL_SUCCESS);
247                 (void)ciErr;
248
249                 return context;
250         }
251
252         /* see get_something comment */
253         static cl_program get_program(cl_platform_id platform, cl_device_id device,
254                 thread_scoped_lock &slot_locker)
255         {
256                 cl_program program = get_something<cl_program>(platform, device, &Slot::program, slot_locker);
257
258                 /* caller is going to release it when done with it, so retain it */
259                 cl_int ciErr = clRetainProgram(program);
260                 assert(ciErr == CL_SUCCESS);
261                 (void)ciErr;
262
263                 return program;
264         }
265
266         /* see store_something comment */
267         static void store_context(cl_platform_id platform, cl_device_id device, cl_context context,
268                 thread_scoped_lock &slot_locker)
269         {
270                 store_something<cl_context>(platform, device, context, &Slot::context, slot_locker);
271
272                 /* increment reference count in OpenCL.
273                  * The caller is going to release the object when done with it. */
274                 cl_int ciErr = clRetainContext(context);
275                 assert(ciErr == CL_SUCCESS);
276                 (void)ciErr;
277         }
278
279         /* see store_something comment */
280         static void store_program(cl_platform_id platform, cl_device_id device, cl_program program,
281                 thread_scoped_lock &slot_locker)
282         {
283                 store_something<cl_program>(platform, device, program, &Slot::program, slot_locker);
284
285                 /* increment reference count in OpenCL.
286                  * The caller is going to release the object when done with it. */
287                 cl_int ciErr = clRetainProgram(program);
288                 assert(ciErr == CL_SUCCESS);
289                 (void)ciErr;
290         }
291
292         /* discard all cached contexts and programs
293          * the parameter is a temporary workaround. See OpenCLCache::~OpenCLCache */
294         static void flush()
295         {
296                 OpenCLCache &self = global_instance();
297                 thread_scoped_lock cache_lock(self.cache_lock);
298
299                 foreach(CacheMap::value_type &item, self.cache) {
300                         if(item.second.program != NULL)
301                                 clReleaseProgram(item.second.program);
302                         if(item.second.context != NULL)
303                                 clReleaseContext(item.second.context);
304                 }
305
306                 self.cache.clear();
307         }
308 };
309
310 class OpenCLDevice : public Device
311 {
312 public:
        /* stopped first thing in the destructor */
        TaskPool task_pool;
        /* context taken from OpenCLCache, or created by the constructor on a cache miss */
        cl_context cxContext;
        /* command queue created by the constructor for cdDevice */
        cl_command_queue cqCommandQueue;
        /* platform and device picked by the constructor from info.num */
        cl_platform_id cpPlatform;
        cl_device_id cdDevice;
        /* program taken from OpenCLCache, loaded from a cached binary, or compiled from source */
        cl_program cpProgram;
        /* kernels looked up by name in load_kernels() */
        cl_kernel ckPathTraceKernel;
        cl_kernel ckFilmConvertKernel;
        /* status of the most recent OpenCL call that stored its result here */
        cl_int ciErr;

        typedef map<string, device_vector<uchar>*> ConstMemMap;
        typedef map<string, device_ptr> MemMap;

        /* NOTE(review): presumably keyed by the memory's name; the code that
         * fills these maps is outside this part of the file - confirm */
        ConstMemMap const_mem_map;
        MemMap mem_map;
        /* one byte read-only buffer created by the constructor */
        device_ptr null_mem;

        /* set to true only when the constructor ran to completion; checked by load_kernels() */
        bool device_initialized;
        /* CL_PLATFORM_NAME of cpPlatform, used to select the kernel build options */
        string platform_name;
332
333         const char *opencl_error_string(cl_int err)
334         {
335                 switch (err) {
336                         case CL_SUCCESS: return "Success!";
337                         case CL_DEVICE_NOT_FOUND: return "Device not found.";
338                         case CL_DEVICE_NOT_AVAILABLE: return "Device not available";
339                         case CL_COMPILER_NOT_AVAILABLE: return "Compiler not available";
340                         case CL_MEM_OBJECT_ALLOCATION_FAILURE: return "Memory object allocation failure";
341                         case CL_OUT_OF_RESOURCES: return "Out of resources";
342                         case CL_OUT_OF_HOST_MEMORY: return "Out of host memory";
343                         case CL_PROFILING_INFO_NOT_AVAILABLE: return "Profiling information not available";
344                         case CL_MEM_COPY_OVERLAP: return "Memory copy overlap";
345                         case CL_IMAGE_FORMAT_MISMATCH: return "Image format mismatch";
346                         case CL_IMAGE_FORMAT_NOT_SUPPORTED: return "Image format not supported";
347                         case CL_BUILD_PROGRAM_FAILURE: return "Program build failure";
348                         case CL_MAP_FAILURE: return "Map failure";
349                         case CL_INVALID_VALUE: return "Invalid value";
350                         case CL_INVALID_DEVICE_TYPE: return "Invalid device type";
351                         case CL_INVALID_PLATFORM: return "Invalid platform";
352                         case CL_INVALID_DEVICE: return "Invalid device";
353                         case CL_INVALID_CONTEXT: return "Invalid context";
354                         case CL_INVALID_QUEUE_PROPERTIES: return "Invalid queue properties";
355                         case CL_INVALID_COMMAND_QUEUE: return "Invalid command queue";
356                         case CL_INVALID_HOST_PTR: return "Invalid host pointer";
357                         case CL_INVALID_MEM_OBJECT: return "Invalid memory object";
358                         case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: return "Invalid image format descriptor";
359                         case CL_INVALID_IMAGE_SIZE: return "Invalid image size";
360                         case CL_INVALID_SAMPLER: return "Invalid sampler";
361                         case CL_INVALID_BINARY: return "Invalid binary";
362                         case CL_INVALID_BUILD_OPTIONS: return "Invalid build options";
363                         case CL_INVALID_PROGRAM: return "Invalid program";
364                         case CL_INVALID_PROGRAM_EXECUTABLE: return "Invalid program executable";
365                         case CL_INVALID_KERNEL_NAME: return "Invalid kernel name";
366                         case CL_INVALID_KERNEL_DEFINITION: return "Invalid kernel definition";
367                         case CL_INVALID_KERNEL: return "Invalid kernel";
368                         case CL_INVALID_ARG_INDEX: return "Invalid argument index";
369                         case CL_INVALID_ARG_VALUE: return "Invalid argument value";
370                         case CL_INVALID_ARG_SIZE: return "Invalid argument size";
371                         case CL_INVALID_KERNEL_ARGS: return "Invalid kernel arguments";
372                         case CL_INVALID_WORK_DIMENSION: return "Invalid work dimension";
373                         case CL_INVALID_WORK_GROUP_SIZE: return "Invalid work group size";
374                         case CL_INVALID_WORK_ITEM_SIZE: return "Invalid work item size";
375                         case CL_INVALID_GLOBAL_OFFSET: return "Invalid global offset";
376                         case CL_INVALID_EVENT_WAIT_LIST: return "Invalid event wait list";
377                         case CL_INVALID_EVENT: return "Invalid event";
378                         case CL_INVALID_OPERATION: return "Invalid operation";
379                         case CL_INVALID_GL_OBJECT: return "Invalid OpenGL object";
380                         case CL_INVALID_BUFFER_SIZE: return "Invalid buffer size";
381                         case CL_INVALID_MIP_LEVEL: return "Invalid mip-map level";
382                         default: return "Unknown";
383                 }
384         }
385
386         bool opencl_error(cl_int err)
387         {
388                 if(err != CL_SUCCESS) {
389                         string message = string_printf("OpenCL error (%d): %s", err, opencl_error_string(err));
390                         if(error_msg == "")
391                                 error_msg = message;
392                         fprintf(stderr, "%s\n", message.c_str());
393                         return true;
394                 }
395
396                 return false;
397         }
398
399         void opencl_error(const string& message)
400         {
401                 if(error_msg == "")
402                         error_msg = message;
403                 fprintf(stderr, "%s\n", message.c_str());
404         }
405
406         void opencl_assert(cl_int err)
407         {
408                 if(err != CL_SUCCESS) {
409                         string message = string_printf("OpenCL error (%d): %s", err, opencl_error_string(err));
410                         if(error_msg == "")
411                                 error_msg = message;
412                         fprintf(stderr, "%s\n", message.c_str());
413 #ifndef NDEBUG
414                         abort();
415 #endif
416                 }
417         }
418
419         OpenCLDevice(DeviceInfo& info, Stats &stats, bool background_)
420           : Device(stats)
421         {
422                 background = background_;
423                 cpPlatform = NULL;
424                 cdDevice = NULL;
425                 cxContext = NULL;
426                 cqCommandQueue = NULL;
427                 cpProgram = NULL;
428                 ckPathTraceKernel = NULL;
429                 ckFilmConvertKernel = NULL;
430                 null_mem = 0;
431                 device_initialized = false;
432
433                 /* setup platform */
434                 cl_uint num_platforms;
435
436                 ciErr = clGetPlatformIDs(0, NULL, &num_platforms);
437                 if(opencl_error(ciErr))
438                         return;
439
440                 if(num_platforms == 0) {
441                         opencl_error("OpenCL: no platforms found.");
442                         return;
443                 }
444
445                 vector<cl_platform_id> platforms(num_platforms, NULL);
446
447                 ciErr = clGetPlatformIDs(num_platforms, &platforms[0], NULL);
448                 if(opencl_error(ciErr))
449                         return;
450
451                 int num_base = 0;
452                 int total_devices = 0;
453
454                 for (int platform = 0; platform < num_platforms; platform++) {
455                         cl_uint num_devices;
456
457                         if(opencl_error(clGetDeviceIDs(platforms[platform], opencl_device_type(), 0, NULL, &num_devices)))
458                                 return;
459
460                         total_devices += num_devices;
461
462                         if(info.num - num_base >= num_devices) {
463                                 /* num doesn't refer to a device in this platform */
464                                 num_base += num_devices;
465                                 continue;
466                         }
467
468                         /* device is in this platform */
469                         cpPlatform = platforms[platform];
470
471                         /* get devices */
472                         vector<cl_device_id> device_ids(num_devices, NULL);
473
474                         if(opencl_error(clGetDeviceIDs(cpPlatform, opencl_device_type(), num_devices, &device_ids[0], NULL)))
475                                 return;
476
477                         cdDevice = device_ids[info.num - num_base];
478
479                         char name[256];
480                         clGetPlatformInfo(cpPlatform, CL_PLATFORM_NAME, sizeof(name), &name, NULL);
481                         platform_name = name;
482
483                         break;
484                 }
485
486                 if(total_devices == 0) {
487                         opencl_error("OpenCL: no devices found.");
488                         return;
489                 }
490                 else if(!cdDevice) {
491                         opencl_error("OpenCL: specified device not found.");
492                         return;
493                 }
494
495                 {
496                         /* try to use cached context */
497                         thread_scoped_lock cache_locker;
498                         cxContext = OpenCLCache::get_context(cpPlatform, cdDevice, cache_locker);
499
500                         if(cxContext == NULL) {
501                                 /* create context properties array to specify platform */
502                                 const cl_context_properties context_props[] = {
503                                         CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform,
504                                         0, 0
505                                 };
506
507                                 /* create context */
508                                 cxContext = clCreateContext(context_props, 1, &cdDevice,
509                                         context_notify_callback, cdDevice, &ciErr);
510
511                                 if(opencl_error(ciErr))
512                                         return;
513
514                                 /* cache it */
515                                 OpenCLCache::store_context(cpPlatform, cdDevice, cxContext, cache_locker);
516                         }
517                 }
518
519                 cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr);
520                 if(opencl_error(ciErr))
521                         return;
522
523                 null_mem = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_ONLY, 1, NULL, &ciErr);
524                 if(opencl_error(ciErr))
525                         return;
526
527                 device_initialized = true;
528         }
529
530         static void context_notify_callback(const char *err_info,
531                 const void *private_info, size_t cb, void *user_data)
532         {
533                 char name[256];
534                 clGetDeviceInfo((cl_device_id)user_data, CL_DEVICE_NAME, sizeof(name), &name, NULL);
535
536                 fprintf(stderr, "OpenCL error (%s): %s\n", name, err_info);
537         }
538
539         bool opencl_version_check()
540         {
541                 char version[256];
542
543                 int major, minor, req_major = 1, req_minor = 1;
544
545                 clGetPlatformInfo(cpPlatform, CL_PLATFORM_VERSION, sizeof(version), &version, NULL);
546
547                 if(sscanf(version, "OpenCL %d.%d", &major, &minor) < 2) {
548                         opencl_error(string_printf("OpenCL: failed to parse platform version string (%s).", version));
549                         return false;
550                 }
551
552                 if(!((major == req_major && minor >= req_minor) || (major > req_major))) {
553                         opencl_error(string_printf("OpenCL: platform version 1.1 or later required, found %d.%d", major, minor));
554                         return false;
555                 }
556
557                 clGetDeviceInfo(cdDevice, CL_DEVICE_OPENCL_C_VERSION, sizeof(version), &version, NULL);
558
559                 if(sscanf(version, "OpenCL C %d.%d", &major, &minor) < 2) {
560                         opencl_error(string_printf("OpenCL: failed to parse OpenCL C version string (%s).", version));
561                         return false;
562                 }
563
564                 if(!((major == req_major && minor >= req_minor) || (major > req_major))) {
565                         opencl_error(string_printf("OpenCL: C version 1.1 or later required, found %d.%d", major, minor));
566                         return false;
567                 }
568
569                 return true;
570         }
571
572         bool load_binary(const string& kernel_path, const string& clbin, const string *debug_src = NULL)
573         {
574                 /* read binary into memory */
575                 vector<uint8_t> binary;
576
577                 if(!path_read_binary(clbin, binary)) {
578                         opencl_error(string_printf("OpenCL failed to read cached binary %s.", clbin.c_str()));
579                         return false;
580                 }
581
582                 /* create program */
583                 cl_int status;
584                 size_t size = binary.size();
585                 const uint8_t *bytes = &binary[0];
586
587                 cpProgram = clCreateProgramWithBinary(cxContext, 1, &cdDevice,
588                         &size, &bytes, &status, &ciErr);
589
590                 if(opencl_error(status) || opencl_error(ciErr)) {
591                         opencl_error(string_printf("OpenCL failed create program from cached binary %s.", clbin.c_str()));
592                         return false;
593                 }
594
595                 if(!build_kernel(kernel_path, debug_src))
596                         return false;
597
598                 return true;
599         }
600
601         bool save_binary(const string& clbin)
602         {
603                 size_t size = 0;
604                 clGetProgramInfo(cpProgram, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL);
605
606                 if(!size)
607                         return false;
608
609                 vector<uint8_t> binary(size);
610                 uint8_t *bytes = &binary[0];
611
612                 clGetProgramInfo(cpProgram, CL_PROGRAM_BINARIES, sizeof(uint8_t*), &bytes, NULL);
613
614                 if(!path_write_binary(clbin, binary)) {
615                         opencl_error(string_printf("OpenCL failed to write cached binary %s.", clbin.c_str()));
616                         return false;
617                 }
618
619                 return true;
620         }
621
622         bool build_kernel(const string& kernel_path, const string *debug_src = NULL)
623         {
624                 string build_options = opencl_kernel_build_options(platform_name, debug_src);
625         
626                 ciErr = clBuildProgram(cpProgram, 0, NULL, build_options.c_str(), NULL, NULL);
627
628                 /* show warnings even if build is successful */
629                 size_t ret_val_size = 0;
630
631                 clGetProgramBuildInfo(cpProgram, cdDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
632
633                 if(ret_val_size > 1) {
634                         vector<char> build_log(ret_val_size+1);
635                         clGetProgramBuildInfo(cpProgram, cdDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, &build_log[0], NULL);
636
637                         build_log[ret_val_size] = '\0';
638                         fprintf(stderr, "OpenCL kernel build output:\n");
639                         fprintf(stderr, "%s\n", &build_log[0]);
640                 }
641
642                 if(ciErr != CL_SUCCESS) {
643                         opencl_error("OpenCL build failed: errors in console");
644                         return false;
645                 }
646
647                 return true;
648         }
649
650         bool compile_kernel(const string& kernel_path, const string& kernel_md5, const string *debug_src = NULL)
651         {
652                 /* we compile kernels consisting of many files. unfortunately opencl
653                  * kernel caches do not seem to recognize changes in included files.
654                  * so we force recompile on changes by adding the md5 hash of all files */
655                 string source = "#include \"kernel.cl\" // " + kernel_md5 + "\n";
656                 source = path_source_replace_includes(source, kernel_path);
657
658                 if(debug_src)
659                         path_write_text(*debug_src, source);
660
661                 size_t source_len = source.size();
662                 const char *source_str = source.c_str();
663
664                 cpProgram = clCreateProgramWithSource(cxContext, 1, &source_str, &source_len, &ciErr);
665
666                 if(opencl_error(ciErr))
667                         return false;
668
669                 double starttime = time_dt();
670                 printf("Compiling OpenCL kernel ...\n");
671
672                 if(!build_kernel(kernel_path, debug_src))
673                         return false;
674
675                 printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);
676
677                 return true;
678         }
679
680         string device_md5_hash()
681         {
682                 MD5Hash md5;
683                 char version[256], driver[256], name[256], vendor[256];
684
685                 clGetPlatformInfo(cpPlatform, CL_PLATFORM_VENDOR, sizeof(vendor), &vendor, NULL);
686                 clGetDeviceInfo(cdDevice, CL_DEVICE_VERSION, sizeof(version), &version, NULL);
687                 clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(name), &name, NULL);
688                 clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(driver), &driver, NULL);
689
690                 md5.append((uint8_t*)vendor, strlen(vendor));
691                 md5.append((uint8_t*)version, strlen(version));
692                 md5.append((uint8_t*)name, strlen(name));
693                 md5.append((uint8_t*)driver, strlen(driver));
694
695                 string options = opencl_kernel_build_options(platform_name);
696                 md5.append((uint8_t*)options.c_str(), options.size());
697
698                 return md5.get_hex();
699         }
700
701         bool load_kernels(bool experimental)
702         {
703                 /* verify if device was initialized */
704                 if(!device_initialized) {
705                         fprintf(stderr, "OpenCL: failed to initialize device.\n");
706                         return false;
707                 }
708
709                 /* try to use cached kernel */
710                 thread_scoped_lock cache_locker;
711                 cpProgram = OpenCLCache::get_program(cpPlatform, cdDevice, cache_locker);
712
713                 if(!cpProgram) {
714                         /* verify we have right opencl version */
715                         if(!opencl_version_check())
716                                 return false;
717
718                         /* md5 hash to detect changes */
719                         string kernel_path = path_get("kernel");
720                         string kernel_md5 = path_files_md5_hash(kernel_path);
721                         string device_md5 = device_md5_hash();
722
723                         /* path to cached binary */
724                         string clbin = string_printf("cycles_kernel_%s_%s.clbin", device_md5.c_str(), kernel_md5.c_str());
725                         clbin = path_user_get(path_join("cache", clbin));
726
727                         /* path to preprocessed source for debugging */
728                         string clsrc, *debug_src = NULL;
729
730                         if(opencl_kernel_use_debug()) {
731                                 clsrc = string_printf("cycles_kernel_%s_%s.cl", device_md5.c_str(), kernel_md5.c_str());
732                                 clsrc = path_user_get(path_join("cache", clsrc));
733                                 debug_src = &clsrc;
734                         }
735
736                         /* if exists already, try use it */
737                         if(path_exists(clbin) && load_binary(kernel_path, clbin, debug_src)) {
738                                 /* kernel loaded from binary */
739                         }
740                         else {
741                                 /* if does not exist or loading binary failed, compile kernel */
742                                 if(!compile_kernel(kernel_path, kernel_md5, debug_src))
743                                         return false;
744
745                                 /* save binary for reuse */
746                                 if(!save_binary(clbin))
747                                         return false;
748                         }
749
750                         /* cache the program */
751                         OpenCLCache::store_program(cpPlatform, cdDevice, cpProgram, cache_locker);
752                 }
753
754                 /* find kernels */
755                 ckPathTraceKernel = clCreateKernel(cpProgram, "kernel_ocl_path_trace", &ciErr);
756                 if(opencl_error(ciErr))
757                         return false;
758
759                 ckFilmConvertKernel = clCreateKernel(cpProgram, "kernel_ocl_tonemap", &ciErr);
760                 if(opencl_error(ciErr))
761                         return false;
762
763                 return true;
764         }
765
766         ~OpenCLDevice()
767         {
768                 task_pool.stop();
769
770                 if(null_mem)
771                         clReleaseMemObject(CL_MEM_PTR(null_mem));
772
773                 ConstMemMap::iterator mt;
774                 for(mt = const_mem_map.begin(); mt != const_mem_map.end(); mt++) {
775                         mem_free(*(mt->second));
776                         delete mt->second;
777                 }
778
779                 if(ckPathTraceKernel)
780                         clReleaseKernel(ckPathTraceKernel);  
781                 if(ckFilmConvertKernel)
782                         clReleaseKernel(ckFilmConvertKernel);  
783                 if(cpProgram)
784                         clReleaseProgram(cpProgram);
785                 if(cqCommandQueue)
786                         clReleaseCommandQueue(cqCommandQueue);
787                 if(cxContext)
788                         clReleaseContext(cxContext);
789         }
790
791         void mem_alloc(device_memory& mem, MemoryType type)
792         {
793                 size_t size = mem.memory_size();
794
795                 cl_mem_flags mem_flag;
796                 void *mem_ptr = NULL;
797
798                 if(type == MEM_READ_ONLY)
799                         mem_flag = CL_MEM_READ_ONLY;
800                 else if(type == MEM_WRITE_ONLY)
801                         mem_flag = CL_MEM_WRITE_ONLY;
802                 else
803                         mem_flag = CL_MEM_READ_WRITE;
804
805                 mem.device_pointer = (device_ptr)clCreateBuffer(cxContext, mem_flag, size, mem_ptr, &ciErr);
806
807                 opencl_assert(ciErr);
808
809                 stats.mem_alloc(size);
810         }
811
812         void mem_copy_to(device_memory& mem)
813         {
814                 /* this is blocking */
815                 size_t size = mem.memory_size();
816                 ciErr = clEnqueueWriteBuffer(cqCommandQueue, CL_MEM_PTR(mem.device_pointer), CL_TRUE, 0, size, (void*)mem.data_pointer, 0, NULL, NULL);
817                 opencl_assert(ciErr);
818         }
819
820         void mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
821         {
822                 size_t offset = elem*y*w;
823                 size_t size = elem*w*h;
824
825                 ciErr = clEnqueueReadBuffer(cqCommandQueue, CL_MEM_PTR(mem.device_pointer), CL_TRUE, offset, size, (uchar*)mem.data_pointer + offset, 0, NULL, NULL);
826                 opencl_assert(ciErr);
827         }
828
829         void mem_zero(device_memory& mem)
830         {
831                 if(mem.device_pointer) {
832                         memset((void*)mem.data_pointer, 0, mem.memory_size());
833                         mem_copy_to(mem);
834                 }
835         }
836
837         void mem_free(device_memory& mem)
838         {
839                 if(mem.device_pointer) {
840                         ciErr = clReleaseMemObject(CL_MEM_PTR(mem.device_pointer));
841                         mem.device_pointer = 0;
842                         opencl_assert(ciErr);
843
844                         stats.mem_free(mem.memory_size());
845                 }
846         }
847
848         void const_copy_to(const char *name, void *host, size_t size)
849         {
850                 ConstMemMap::iterator i = const_mem_map.find(name);
851
852                 if(i == const_mem_map.end()) {
853                         device_vector<uchar> *data = new device_vector<uchar>();
854                         data->copy((uchar*)host, size);
855
856                         mem_alloc(*data, MEM_READ_ONLY);
857                         i = const_mem_map.insert(ConstMemMap::value_type(name, data)).first;
858                 }
859                 else {
860                         device_vector<uchar> *data = i->second;
861                         data->copy((uchar*)host, size);
862                 }
863
864                 mem_copy_to(*i->second);
865         }
866
867         void tex_alloc(const char *name, device_memory& mem, bool interpolation, bool periodic)
868         {
869                 mem_alloc(mem, MEM_READ_ONLY);
870                 mem_copy_to(mem);
871                 assert(mem_map.find(name) == mem_map.end());
872                 mem_map.insert(MemMap::value_type(name, mem.device_pointer));
873         }
874
875         void tex_free(device_memory& mem)
876         {
877                 if(mem.data_pointer)
878                         mem_free(mem);
879         }
880
881         size_t global_size_round_up(int group_size, int global_size)
882         {
883                 int r = global_size % group_size;
884                 return global_size + ((r == 0)? 0: group_size - r);
885         }
886
887         void enqueue_kernel(cl_kernel kernel, size_t w, size_t h)
888         {
889                 size_t workgroup_size, max_work_items[3];
890
891                 clGetKernelWorkGroupInfo(kernel, cdDevice,
892                         CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &workgroup_size, NULL);
893                 clGetDeviceInfo(cdDevice,
894                         CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t)*3, max_work_items, NULL);
895         
896                 /* try to divide evenly over 2 dimensions */
897                 size_t sqrt_workgroup_size = max(sqrt((double)workgroup_size), 1.0);
898                 size_t local_size[2] = {sqrt_workgroup_size, sqrt_workgroup_size};
899
900                 /* some implementations have max size 1 on 2nd dimension */
901                 if(local_size[1] > max_work_items[1]) {
902                         local_size[0] = workgroup_size/max_work_items[1];
903                         local_size[1] = max_work_items[1];
904                 }
905
906                 size_t global_size[2] = {global_size_round_up(local_size[0], w), global_size_round_up(local_size[1], h)};
907
908                 /* run kernel */
909                 ciErr = clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_size, local_size, 0, NULL, NULL);
910                 opencl_assert(ciErr);
911                 opencl_assert(clFlush(cqCommandQueue));
912         }
913
914         void path_trace(RenderTile& rtile, int sample)
915         {
916                 /* cast arguments to cl types */
917                 cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
918                 cl_mem d_buffer = CL_MEM_PTR(rtile.buffer);
919                 cl_mem d_rng_state = CL_MEM_PTR(rtile.rng_state);
920                 cl_int d_x = rtile.x;
921                 cl_int d_y = rtile.y;
922                 cl_int d_w = rtile.w;
923                 cl_int d_h = rtile.h;
924                 cl_int d_sample = sample;
925                 cl_int d_offset = rtile.offset;
926                 cl_int d_stride = rtile.stride;
927
928                 /* sample arguments */
929                 cl_uint narg = 0;
930                 ciErr = 0;
931
932                 ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_data), (void*)&d_data);
933                 ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_buffer), (void*)&d_buffer);
934                 ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_rng_state), (void*)&d_rng_state);
935
936 #define KERNEL_TEX(type, ttype, name) \
937         ciErr |= set_kernel_arg_mem(ckPathTraceKernel, &narg, #name);
938 #include "kernel_textures.h"
939
940                 ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_sample), (void*)&d_sample);
941                 ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_x), (void*)&d_x);
942                 ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_y), (void*)&d_y);
943                 ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_w), (void*)&d_w);
944                 ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_h), (void*)&d_h);
945                 ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_offset), (void*)&d_offset);
946                 ciErr |= clSetKernelArg(ckPathTraceKernel, narg++, sizeof(d_stride), (void*)&d_stride);
947
948                 opencl_assert(ciErr);
949
950                 enqueue_kernel(ckPathTraceKernel, d_w, d_h);
951         }
952
953         cl_int set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name)
954         {
955                 cl_mem ptr;
956                 cl_int err = 0;
957
958                 MemMap::iterator i = mem_map.find(name);
959                 if(i != mem_map.end()) {
960                         ptr = CL_MEM_PTR(i->second);
961                 }
962                 else {
963                         /* work around NULL not working, even though the spec says otherwise */
964                         ptr = CL_MEM_PTR(null_mem);
965                 }
966                 
967                 err |= clSetKernelArg(kernel, (*narg)++, sizeof(ptr), (void*)&ptr);
968                 opencl_assert(err);
969
970                 return err;
971         }
972
973         void tonemap(DeviceTask& task, device_ptr buffer, device_ptr rgba)
974         {
975                 /* cast arguments to cl types */
976                 cl_mem d_data = CL_MEM_PTR(const_mem_map["__data"]->device_pointer);
977                 cl_mem d_rgba = CL_MEM_PTR(rgba);
978                 cl_mem d_buffer = CL_MEM_PTR(buffer);
979                 cl_int d_x = task.x;
980                 cl_int d_y = task.y;
981                 cl_int d_w = task.w;
982                 cl_int d_h = task.h;
983                 cl_int d_sample = task.sample;
984                 cl_int d_offset = task.offset;
985                 cl_int d_stride = task.stride;
986
987                 /* sample arguments */
988                 cl_uint narg = 0;
989                 ciErr = 0;
990
991                 ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_data), (void*)&d_data);
992                 ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_rgba), (void*)&d_rgba);
993                 ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_buffer), (void*)&d_buffer);
994
995 #define KERNEL_TEX(type, ttype, name) \
996         ciErr |= set_kernel_arg_mem(ckFilmConvertKernel, &narg, #name);
997 #include "kernel_textures.h"
998
999                 ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_sample), (void*)&d_sample);
1000                 ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_x), (void*)&d_x);
1001                 ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_y), (void*)&d_y);
1002                 ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_w), (void*)&d_w);
1003                 ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_h), (void*)&d_h);
1004                 ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_offset), (void*)&d_offset);
1005                 ciErr |= clSetKernelArg(ckFilmConvertKernel, narg++, sizeof(d_stride), (void*)&d_stride);
1006
1007                 opencl_assert(ciErr);
1008
1009                 enqueue_kernel(ckFilmConvertKernel, d_w, d_h);
1010         }
1011
1012         void thread_run(DeviceTask *task)
1013         {
1014                 if(task->type == DeviceTask::TONEMAP) {
1015                         tonemap(*task, task->buffer, task->rgba);
1016                 }
1017                 else if(task->type == DeviceTask::PATH_TRACE) {
1018                         RenderTile tile;
1019                         
1020                         /* keep rendering tiles until done */
1021                         while(task->acquire_tile(this, tile)) {
1022                                 int start_sample = tile.start_sample;
1023                                 int end_sample = tile.start_sample + tile.num_samples;
1024
1025                                 for(int sample = start_sample; sample < end_sample; sample++) {
1026                                         if(task->get_cancel()) {
1027                                                 if(task->need_finish_queue == false)
1028                                                         break;
1029                                         }
1030
1031                                         path_trace(tile, sample);
1032
1033                                         tile.sample = sample + 1;
1034
1035                                         //task->update_progress(tile);
1036                                 }
1037
1038                                 task->release_tile(tile);
1039                         }
1040                 }
1041         }
1042
1043         class OpenCLDeviceTask : public DeviceTask {
1044         public:
1045                 OpenCLDeviceTask(OpenCLDevice *device, DeviceTask& task)
1046                 : DeviceTask(task)
1047                 {
1048                         run = function_bind(&OpenCLDevice::thread_run, device, this);
1049                 }
1050         };
1051
1052         void task_add(DeviceTask& task)
1053         {
1054                 task_pool.push(new OpenCLDeviceTask(this, task));
1055         }
1056
1057         void task_wait()
1058         {
1059                 task_pool.wait_work();
1060         }
1061
1062         void task_cancel()
1063         {
1064                 task_pool.cancel();
1065         }
1066 };
1067
1068 Device *device_opencl_create(DeviceInfo& info, Stats &stats, bool background)
1069 {
1070         return new OpenCLDevice(info, stats, background);
1071 }
1072
1073 void device_opencl_info(vector<DeviceInfo>& devices)
1074 {
1075         vector<cl_device_id> device_ids;
1076         cl_uint num_devices = 0;
1077         vector<cl_platform_id> platform_ids;
1078         cl_uint num_platforms = 0;
1079
1080         /* get devices */
1081         if(clGetPlatformIDs(0, NULL, &num_platforms) != CL_SUCCESS || num_platforms == 0)
1082                 return;
1083         
1084         platform_ids.resize(num_platforms);
1085
1086         if(clGetPlatformIDs(num_platforms, &platform_ids[0], NULL) != CL_SUCCESS)
1087                 return;
1088
1089         /* devices are numbered consecutively across platforms */
1090         int num_base = 0;
1091
1092         for (int platform = 0; platform < num_platforms; platform++, num_base += num_devices) {
1093                 num_devices = 0;
1094                 if(clGetDeviceIDs(platform_ids[platform], opencl_device_type(), 0, NULL, &num_devices) != CL_SUCCESS || num_devices == 0)
1095                         continue;
1096
1097                 device_ids.resize(num_devices);
1098
1099                 if(clGetDeviceIDs(platform_ids[platform], opencl_device_type(), num_devices, &device_ids[0], NULL) != CL_SUCCESS)
1100                         continue;
1101
1102                 char pname[256];
1103                 clGetPlatformInfo(platform_ids[platform], CL_PLATFORM_NAME, sizeof(pname), &pname, NULL);
1104                 string platform_name = pname;
1105
1106                 /* add devices */
1107                 for(int num = 0; num < num_devices; num++) {
1108                         cl_device_id device_id = device_ids[num];
1109                         char name[1024] = "\0";
1110
1111                         if(clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(name), &name, NULL) != CL_SUCCESS)
1112                                 continue;
1113
1114                         DeviceInfo info;
1115
1116                         info.type = DEVICE_OPENCL;
1117                         info.description = string(name);
1118                         info.num = num_base + num;
1119                         info.id = string_printf("OPENCL_%d", info.num);
1120                         /* we don't know if it's used for display, but assume it is */
1121                         info.display_device = true;
1122                         info.advanced_shading = opencl_kernel_use_advanced_shading(platform_name);
1123                         info.pack_images = true;
1124
1125                         devices.push_back(info);
1126                 }
1127         }
1128 }
1129
1130 CCL_NAMESPACE_END
1131
1132 #endif /* WITH_OPENCL */
1133