Cycles OpenCL: keep the opencl context and program around for quicker rendering
authorBrecht Van Lommel <brechtvanlommel@pandora.be>
Fri, 31 May 2013 16:19:03 +0000 (16:19 +0000)
committerBrecht Van Lommel <brechtvanlommel@pandora.be>
Fri, 31 May 2013 16:19:03 +0000 (16:19 +0000)
the second time, as for example Intel CPU startup time is 9 seconds.

* Adds an cache for contexts and programs for each platform and device pair,
  which also ensure now no two threads try to compile and write the binary cache
  file at the same time.
* Change clFinish to clFlush so we don't block until the result is done, instead
  it will block at the moment we copy back memory.
* Fix error in Cycles time_sleep implementation, does not affect any active code
  though.
* Adds some (disabled) debugging code in the task scheduler.

Patch #35559 by Doug Gale.

intern/cycles/device/device_opencl.cpp
intern/cycles/util/util_opencl.cpp
intern/cycles/util/util_task.cpp
intern/cycles/util/util_time.cpp

index 1cd538d655fef850ef0b5b551911c28fb4ffd577..0b9881c0eb5ad7abc7dcfd0ec5eb2ca144d9e5c9 100644 (file)
@@ -104,12 +104,194 @@ static string opencl_kernel_build_options(const string& platform, const string *
        if(opencl_kernel_use_debug())
                build_options += "-D__KERNEL_OPENCL_DEBUG__ ";
 
-       if (opencl_kernel_use_advanced_shading(platform))
+       if(opencl_kernel_use_advanced_shading(platform))
                build_options += "-D__KERNEL_OPENCL_NEED_ADVANCED_SHADING__ ";
        
        return build_options;
 }
 
+/* thread safe cache for contexts and programs */
+class OpenCLCache
+{
+       struct Slot
+       {
+               thread_mutex *mutex;
+               cl_context context;
+               cl_program program;
+
+               Slot() : mutex(NULL), context(NULL), program(NULL) {}
+
+               Slot(const Slot &rhs)
+                       : mutex(rhs.mutex)
+                       , context(rhs.context)
+                       , program(rhs.program)
+               {
+                       /* copy can only happen in map insert, assert that */
+                       assert(mutex == NULL);
+               }
+
+               ~Slot()
+               {
+                       delete mutex;
+                       mutex = NULL;
+               }
+       };
+
+       /* key is combination of platform ID and device ID */
+       typedef pair<cl_platform_id, cl_device_id> PlatformDevicePair;
+
+       /* map of Slot objects */
+       typedef map<PlatformDevicePair, Slot> CacheMap;
+       CacheMap cache;
+
+       thread_mutex cache_lock;
+
+       /* lazy instantiate */
+       static OpenCLCache &global_instance()
+       {
+               static OpenCLCache instance;
+               return instance;
+       }
+
+       OpenCLCache()
+       {
+       }
+
+       ~OpenCLCache()
+       {
+               /* Intel OpenCL bug raises SIGABRT due to pure virtual call
+                * so this is disabled. It's not necessary to free objects
+                * at process exit anyway.
+                * http://software.intel.com/en-us/forums/topic/370083#comments */
+
+               //flush();
+       }
+
+       /* lookup something in the cache. If this returns NULL, slot_locker
+        * will be holding a lock for the cache. slot_locker should refer to a
+        * default constructed thread_scoped_lock */
+       template<typename T>
+       static T get_something(cl_platform_id platform, cl_device_id device,
+               T Slot::*member, cl_int (*retain_func)(T), thread_scoped_lock &slot_locker)
+       {
+               assert(platform != NULL);
+
+               OpenCLCache &self = global_instance();
+
+               thread_scoped_lock cache_lock(self.cache_lock);
+
+               pair<CacheMap::iterator,bool> ins = self.cache.insert(
+                       CacheMap::value_type(PlatformDevicePair(platform, device), Slot()));
+
+               Slot &slot = ins.first->second;
+
+               /* create slot lock only while holding cache lock */
+               if(!slot.mutex)
+                       slot.mutex = new thread_mutex;
+
+               /* need to unlock cache before locking slot, to allow store to complete */
+               cache_lock.unlock();
+
+               /* lock the slot */
+               slot_locker = thread_scoped_lock(*slot.mutex);
+
+               /* If the thing isn't cached */
+               if(slot.*member == NULL) {
+                       /* return with the caller's lock holder holding the slot lock */
+                       return NULL;
+               }
+
+               /* the item was already cached, release the slot lock */
+               slot_locker.unlock();
+
+               /* caller is going to release it when done with it, so retain it */
+               cl_int ciErr = retain_func(slot.*member);
+               assert(ciErr == CL_SUCCESS);
+               (void)ciErr;
+
+               return slot.*member;
+       }
+
+       /* store something in the cache. you MUST have tried to get the item before storing to it */
+       template<typename T>
+       static void store_something(cl_platform_id platform, cl_device_id device, T thing,
+               T Slot::*member, cl_int (*retain_func)(T), thread_scoped_lock &slot_locker)
+       {
+               assert(platform != NULL);
+               assert(device != NULL);
+               assert(thing != NULL);
+
+               OpenCLCache &self = global_instance();
+
+               thread_scoped_lock cache_lock(self.cache_lock);
+               CacheMap::iterator i = self.cache.find(PlatformDevicePair(platform, device));
+               cache_lock.unlock();
+
+               Slot &slot = i->second;
+
+               /* sanity check */
+               assert(i != self.cache.end());
+               assert(slot.*member == NULL);
+
+               slot.*member = thing;
+
+               /* unlock the slot */
+               slot_locker.unlock();
+
+               /* increment reference count in OpenCL.
+                * The caller is going to release the object when done with it. */
+               cl_int ciErr = retain_func(thing);
+               assert(ciErr == CL_SUCCESS);
+               (void)ciErr;
+       }
+
+public:
+       /* see get_something comment */
+       static cl_context get_context(cl_platform_id platform, cl_device_id device,
+               thread_scoped_lock &slot_locker)
+       {
+               return get_something(platform, device, &Slot::context, clRetainContext, slot_locker);
+       }
+
+       /* see get_something comment */
+       static cl_program get_program(cl_platform_id platform, cl_device_id device,
+               thread_scoped_lock &slot_locker)
+       {
+               return get_something(platform, device, &Slot::program, clRetainProgram, slot_locker);
+       }
+
+       /* see store_something comment */
+       static void store_context(cl_platform_id platform, cl_device_id device, cl_context context,
+               thread_scoped_lock &slot_locker)
+       {
+               store_something(platform, device, context, &Slot::context, clRetainContext, slot_locker);
+       }
+
+       /* see store_something comment */
+       static void store_program(cl_platform_id platform, cl_device_id device, cl_program program,
+               thread_scoped_lock &slot_locker)
+       {
+               store_something(platform, device, program, &Slot::program, clRetainProgram, slot_locker);
+       }
+
+       /* discard all cached contexts and programs
+        * the parameter is a temporary workaround. See OpenCLCache::~OpenCLCache */
+       static void flush()
+       {
+               OpenCLCache &self = global_instance();
+               thread_scoped_lock cache_lock(self.cache_lock);
+
+               foreach(CacheMap::value_type &item, self.cache) {
+                       if(item.second.program != NULL)
+                               clReleaseProgram(item.second.program);
+                       if(item.second.context != NULL)
+                               clReleaseContext(item.second.context);
+               }
+
+               self.cache.clear();
+       }
+};
+
 class OpenCLDevice : public Device
 {
 public:
@@ -290,21 +472,34 @@ public:
                        opencl_error("OpenCL: no devices found.");
                        return;
                }
-               else if (!cdDevice) {
+               else if(!cdDevice) {
                        opencl_error("OpenCL: specified device not found.");
                        return;
                }
 
-               /* Create context properties array to specify platform */
-               const cl_context_properties context_props[] = {
-                       CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform,
-                       0, 0
-               };
-
-               /* create context */
-               cxContext = clCreateContext(context_props, 1, &cdDevice, NULL, NULL, &ciErr);
-               if(opencl_error(ciErr))
-                       return;
+               {
+                       /* try to use cached context */
+                       thread_scoped_lock cache_locker;
+                       cxContext = OpenCLCache::get_context(cpPlatform, cdDevice, cache_locker);
+
+                       if(cxContext == NULL) {
+                               /* create context properties array to specify platform */
+                               const cl_context_properties context_props[] = {
+                                       CL_CONTEXT_PLATFORM, (cl_context_properties)cpPlatform,
+                                       0, 0
+                               };
+
+                               /* create context */
+                               cxContext = clCreateContext(context_props, 1, &cdDevice,
+                                       context_notify_callback, cdDevice, &ciErr);
+
+                               if(opencl_error(ciErr))
+                                       return;
+
+                               /* cache it */
+                               OpenCLCache::store_context(cpPlatform, cdDevice, cxContext, cache_locker);
+                       }
+               }
 
                cqCommandQueue = clCreateCommandQueue(cxContext, cdDevice, 0, &ciErr);
                if(opencl_error(ciErr))
@@ -317,6 +512,15 @@ public:
                device_initialized = true;
        }
 
+       static void context_notify_callback(const char *err_info,
+               const void *private_info, size_t cb, void *user_data)
+       {
+               char name[256];
+               clGetDeviceInfo((cl_device_id)user_data, CL_DEVICE_NAME, sizeof(name), &name, NULL);
+
+               fprintf(stderr, "OpenCL error (%s): %s\n", name, err_info);
+       }
+
        bool opencl_version_check()
        {
                char version[256];
@@ -436,7 +640,7 @@ public:
                string source = "#include \"kernel.cl\" // " + kernel_md5 + "\n";
                source = path_source_replace_includes(source, kernel_path);
 
-               if (debug_src)
+               if(debug_src)
                        path_write_text(*debug_src, source);
 
                size_t source_len = source.size();
@@ -487,39 +691,49 @@ public:
                        return false;
                }
 
-               /* verify we have right opencl version */
-               if(!opencl_version_check())
-                       return false;
+               /* try to use cached kernel */
+               thread_scoped_lock cache_locker;
+               cpProgram = OpenCLCache::get_program(cpPlatform, cdDevice, cache_locker);
 
-               /* md5 hash to detect changes */
-               string kernel_path = path_get("kernel");
-               string kernel_md5 = path_files_md5_hash(kernel_path);
-               string device_md5 = device_md5_hash();
+               if(!cpProgram) {
+                       /* verify we have right opencl version */
+                       if(!opencl_version_check())
+                               return false;
 
-               /* path to cached binary */
-               string clbin = string_printf("cycles_kernel_%s_%s.clbin", device_md5.c_str(), kernel_md5.c_str());
-               clbin = path_user_get(path_join("cache", clbin));
+                       /* md5 hash to detect changes */
+                       string kernel_path = path_get("kernel");
+                       string kernel_md5 = path_files_md5_hash(kernel_path);
+                       string device_md5 = device_md5_hash();
 
-               /* path to preprocessed source for debugging */
-               string clsrc, *debug_src = NULL;
-               
-               if (opencl_kernel_use_debug()) {
-                       clsrc = string_printf("cycles_kernel_%s_%s.cl", device_md5.c_str(), kernel_md5.c_str());
-                       clsrc = path_user_get(path_join("cache", clsrc));
-                       debug_src = &clsrc;
-               }
+                       /* path to cached binary */
+                       string clbin = string_printf("cycles_kernel_%s_%s.clbin", device_md5.c_str(), kernel_md5.c_str());
+                       clbin = path_user_get(path_join("cache", clbin));
 
-               /* if exists already, try use it */
-               if(path_exists(clbin) && load_binary(kernel_path, clbin, debug_src)) {
-                       /* kernel loaded from binary */
-               }
-               else {
-                       /* if does not exist or loading binary failed, compile kernel */
-                       if(!compile_kernel(kernel_path, kernel_md5, debug_src))
-                               return false;
+                       /* path to preprocessed source for debugging */
+                       string clsrc, *debug_src = NULL;
+
+                       if(opencl_kernel_use_debug()) {
+                               clsrc = string_printf("cycles_kernel_%s_%s.cl", device_md5.c_str(), kernel_md5.c_str());
+                               clsrc = path_user_get(path_join("cache", clsrc));
+                               debug_src = &clsrc;
+                       }
 
-                       /* save binary for reuse */
-                       save_binary(clbin);
+                       /* if exists already, try use it */
+                       if(path_exists(clbin) && load_binary(kernel_path, clbin, debug_src)) {
+                               /* kernel loaded from binary */
+                       }
+                       else {
+                               /* if does not exist or loading binary failed, compile kernel */
+                               if(!compile_kernel(kernel_path, kernel_md5, debug_src))
+                                       return false;
+
+                               /* save binary for reuse */
+                               if(!save_binary(clbin))
+                                       return false;
+                       }
+
+                       /* cache the program */
+                       OpenCLCache::store_program(cpPlatform, cdDevice, cpProgram, cache_locker);
                }
 
                /* find kernels */
@@ -563,12 +777,17 @@ public:
        {
                size_t size = mem.memory_size();
 
+               cl_mem_flags mem_flag;
+               void *mem_ptr = NULL;
+
                if(type == MEM_READ_ONLY)
-                       mem.device_pointer = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_ONLY, size, NULL, &ciErr);
+                       mem_flag = CL_MEM_READ_ONLY;
                else if(type == MEM_WRITE_ONLY)
-                       mem.device_pointer = (device_ptr)clCreateBuffer(cxContext, CL_MEM_WRITE_ONLY, size, NULL, &ciErr);
+                       mem_flag = CL_MEM_WRITE_ONLY;
                else
-                       mem.device_pointer = (device_ptr)clCreateBuffer(cxContext, CL_MEM_READ_WRITE, size, NULL, &ciErr);
+                       mem_flag = CL_MEM_READ_WRITE;
+
+               mem.device_pointer = (device_ptr)clCreateBuffer(cxContext, mem_flag, size, mem_ptr, &ciErr);
 
                opencl_assert(ciErr);
 
@@ -664,7 +883,7 @@ public:
                size_t local_size[2] = {sqrt_workgroup_size, sqrt_workgroup_size};
 
                /* some implementations have max size 1 on 2nd dimension */
-               if (local_size[1] > max_work_items[1]) {
+               if(local_size[1] > max_work_items[1]) {
                        local_size[0] = workgroup_size/max_work_items[1];
                        local_size[1] = max_work_items[1];
                }
@@ -674,7 +893,7 @@ public:
                /* run kernel */
                ciErr = clEnqueueNDRangeKernel(cqCommandQueue, kernel, 2, NULL, global_size, local_size, 0, NULL, NULL);
                opencl_assert(ciErr);
-               opencl_assert(clFinish(cqCommandQueue));
+               opencl_assert(clFlush(cqCommandQueue));
        }
 
        void path_trace(RenderTile& rtile, int sample)
@@ -789,7 +1008,7 @@ public:
                                int end_sample = tile.start_sample + tile.num_samples;
 
                                for(int sample = start_sample; sample < end_sample; sample++) {
-                                       if (task->get_cancel()) {
+                                       if(task->get_cancel()) {
                                                if(task->need_finish_queue == false)
                                                        break;
                                        }
@@ -798,7 +1017,7 @@ public:
 
                                        tile.sample = sample + 1;
 
-                                       task->update_progress(tile);
+                                       //task->update_progress(tile);
                                }
 
                                task->release_tile(tile);
index c146c14b10ca5e04cce462fd18b8da18217faa18..1e67afb3fa892d4cc35a9a8aebc3c170411dbe05 100644 (file)
@@ -114,6 +114,7 @@ PFNCLGETEXTENSIONFUNCTIONADDRESS    __clewGetExtensionFunctionAddress   = NULL;
 #endif  //  CLCC_GENERATE_DOCUMENTATION
 
 
+#if 0
 //! \brief Unloads OpenCL dynamic library, should not be called directly
 static void clewExit(void)
 {
@@ -124,6 +125,7 @@ static void clewExit(void)
                module = NULL;
        }
 }
+#endif
 
 //! \param path path to dynamic library to load
 //! \return CLEW_ERROR_OPEN_FAILED if the library could not be opened
@@ -138,7 +140,6 @@ int clLibraryInit()
 #else
        const char *path = "libOpenCL.so";
 #endif
-       int error = 0;
 
        // OpenCL disabled for now, only works with this environment variable set
        if(!getenv("CYCLES_OPENCL_TEST"))
@@ -159,8 +160,11 @@ int clLibraryInit()
                return 0;
        }
 
+       // Disabled because we retain OpenCL context and it's difficult to ensure
+       // this will exit after releasing the context
+#if 0
        //  Set unloading
-       error = atexit(clewExit);
+       int error = atexit(clewExit);
 
        if (error)
        {
@@ -170,6 +174,7 @@ int clLibraryInit()
 
                return 0;
        }
+#endif
 
        //  Determine function entry-points
        __clewGetPlatformIDs                = (PFNCLGETPLATFORMIDS              )CLCC_DYNLIB_IMPORT(module, "clGetPlatformIDs");
index 43f15ba0ce6efcf16422e76b57a5d671ee5a7226..abcb05561bdd6e7626f8f119daeaf91b2fa3715e 100644 (file)
 #include "util_system.h"
 #include "util_task.h"
 
+//#define THREADING_DEBUG_ENABLED
+
+#ifdef THREADING_DEBUG_ENABLED
+#include <stdio.h>
+#define THREADING_DEBUG(...) do { printf(__VA_ARGS__); fflush(stdout); } while(0)
+#else
+#define THREADING_DEBUG(...)
+#endif
+
 CCL_NAMESPACE_BEGIN
 
 /* Task Pool */
@@ -95,8 +104,11 @@ void TaskPool::wait_work()
                if(num == 0)
                        break;
 
-               if(!found_entry)
+               if(!found_entry) {
+                       THREADING_DEBUG("num==%d, Waiting for condition in TaskPool::wait_work !found_entry\n", num);
                        num_cond.wait(num_lock);
+                       THREADING_DEBUG("num==%d, condition wait done in TaskPool::wait_work !found_entry\n", num);
+               }
        }
 }
 
@@ -109,8 +121,11 @@ void TaskPool::cancel()
        {
                thread_scoped_lock num_lock(num_mutex);
 
-               while(num)
+               while(num) {
+                       THREADING_DEBUG("num==%d, Waiting for condition in TaskPool::cancel\n", num);
                        num_cond.wait(num_lock);
+                       THREADING_DEBUG("num==%d condition wait done in TaskPool::cancel\n", num);
+               }
        }
 
        do_cancel = false;
@@ -134,8 +149,10 @@ void TaskPool::num_decrease(int done)
        num -= done;
 
        assert(num >= 0);
-       if(num == 0)
+       if(num == 0) {
+               THREADING_DEBUG("num==%d, notifying all in TaskPool::num_decrease\n", num);
                num_cond.notify_all();
+       }
 
        num_mutex.unlock();
 }
@@ -144,6 +161,7 @@ void TaskPool::num_increase()
 {
        thread_scoped_lock num_lock(num_mutex);
        num++;
+       THREADING_DEBUG("num==%d, notifying all in TaskPool::num_increase\n", num);
        num_cond.notify_all();
 }
 
index 5f543fc7f919de54487de941644085810d6f3594..4edd59780a2e5a29b88e18b464d2b5da74e015a1 100644 (file)
@@ -58,12 +58,23 @@ double time_dt()
        return now.tv_sec + now.tv_usec*1e-6;
 }
 
+/* sleep t seconds */
 void time_sleep(double t)
 {
-       if(t >= 1.0)
-               sleep((int)t);
+       /* get whole seconds */
+       int s = (int)t;
 
-       usleep((int)(t*1e6));
+       if(s >= 1) {
+               sleep(s);
+
+               /* adjust parameter to remove whole seconds */
+               t -= s;
+       }
+
+       /* get microseconds */
+       int us = (int)(t * 1e6);
+       if (us > 0)
+               usleep(us);
 }
 
 CCL_NAMESPACE_END