ClangFormat: apply to source, most of intern
diff --git a/intern/cycles/device/device_cuda.cpp b/intern/cycles/device/device_cuda.cpp
index 3aa6bce..68bc3bd 100644
@@ -62,2144 +62,2242 @@ namespace {
 
 const char *cuewErrorString(CUresult result)
 {
-       /* We can only give the error code here without major code duplication; that
-        * should be enough, since dynamic loading is only disabled by folks
-        * who know what they're doing anyway.
-        *
-        * NOTE: Avoid calling from several threads.
-        */
-       static string error;
-       error = string_printf("%d", result);
-       return error.c_str();
+  /* We can only give the error code here without major code duplication; that
+   * should be enough, since dynamic loading is only disabled by folks
+   * who know what they're doing anyway.
+   *
+   * NOTE: Avoid calling from several threads.
+   */
+  static string error;
+  error = string_printf("%d", result);
+  return error.c_str();
 }
 
 const char *cuewCompilerPath()
 {
-       return CYCLES_CUDA_NVCC_EXECUTABLE;
+  return CYCLES_CUDA_NVCC_EXECUTABLE;
 }
 
 int cuewCompilerVersion()
 {
-       return (CUDA_VERSION / 100) + (CUDA_VERSION % 100 / 10);
+  return (CUDA_VERSION / 100) + (CUDA_VERSION % 100 / 10);
 }
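
For example, the CUDA 10.1 toolkit defines CUDA_VERSION as 10010, so the expression
above evaluates to 100 + 1 = 101: the same major * 10 + minor encoding that
compile_check_compiler() later splits apart again.

  /* Worked example, assuming CUDA_VERSION == 10010 (the CUDA 10.1 toolkit). */
  int version = (10010 / 100) + (10010 % 100 / 10); /* 100 + 1 == 101 */
  int major = version / 10;                         /* 10 */
  int minor = version % 10;                         /* 1 */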
 
-}  /* namespace */
-#endif  /* WITH_CUDA_DYNLOAD */
+} /* namespace */
+#endif /* WITH_CUDA_DYNLOAD */
 
 class CUDADevice;
 
 class CUDASplitKernel : public DeviceSplitKernel {
-       CUDADevice *device;
-public:
-       explicit CUDASplitKernel(CUDADevice *device);
-
-       virtual uint64_t state_buffer_size(device_memory& kg, device_memory& data, size_t num_threads);
-
-       virtual bool enqueue_split_kernel_data_init(const KernelDimensions& dim,
-                                                   RenderTile& rtile,
-                                                   int num_global_elements,
-                                                   device_memory& kernel_globals,
-                                                   device_memory& kernel_data_,
-                                                   device_memory& split_data,
-                                                   device_memory& ray_state,
-                                                   device_memory& queue_index,
-                                                   device_memory& use_queues_flag,
-                                                   device_memory& work_pool_wgs);
-
-       virtual SplitKernelFunction* get_split_kernel_function(const string& kernel_name,
-                                                              const DeviceRequestedFeatures&);
-       virtual int2 split_kernel_local_size();
-       virtual int2 split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask *task);
+  CUDADevice *device;
+
+ public:
+  explicit CUDASplitKernel(CUDADevice *device);
+
+  virtual uint64_t state_buffer_size(device_memory &kg, device_memory &data, size_t num_threads);
+
+  virtual bool enqueue_split_kernel_data_init(const KernelDimensions &dim,
+                                              RenderTile &rtile,
+                                              int num_global_elements,
+                                              device_memory &kernel_globals,
+                                              device_memory &kernel_data_,
+                                              device_memory &split_data,
+                                              device_memory &ray_state,
+                                              device_memory &queue_index,
+                                              device_memory &use_queues_flag,
+                                              device_memory &work_pool_wgs);
+
+  virtual SplitKernelFunction *get_split_kernel_function(const string &kernel_name,
+                                                         const DeviceRequestedFeatures &);
+  virtual int2 split_kernel_local_size();
+  virtual int2 split_kernel_global_size(device_memory &kg, device_memory &data, DeviceTask *task);
 };
 
 /* Utility to push/pop CUDA context. */
 class CUDAContextScope {
-public:
-       CUDAContextScope(CUDADevice *device);
-       ~CUDAContextScope();
+ public:
+  CUDAContextScope(CUDADevice *device);
+  ~CUDAContextScope();
 
-private:
-       CUDADevice *device;
+ private:
+  CUDADevice *device;
 };
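
The constructor and destructor bodies appear further down in the file; a minimal
sketch of the RAII pattern they implement (assuming the usual
cuCtxPushCurrent/cuCtxPopCurrent pairing, error checking omitted) looks like:

  CUDAContextScope::CUDAContextScope(CUDADevice *device) : device(device)
  {
    /* Bind this device's context to the calling thread. */
    cuCtxPushCurrent(device->cuContext);
  }

  CUDAContextScope::~CUDAContextScope()
  {
    /* Restore whichever context was current before. */
    cuCtxPopCurrent(NULL);
  }

With this, any method can make the context current for exactly its own scope by
declaring CUDAContextScope scope(this); the pop then happens on every return path.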
 
-class CUDADevice : public Device
-{
-public:
-       DedicatedTaskPool task_pool;
-       CUdevice cuDevice;
-       CUcontext cuContext;
-       CUmodule cuModule, cuFilterModule;
-       size_t device_texture_headroom;
-       size_t device_working_headroom;
-       bool move_texture_to_host;
-       size_t map_host_used;
-       size_t map_host_limit;
-       int can_map_host;
-       int cuDevId;
-       int cuDevArchitecture;
-       bool first_error;
-       CUDASplitKernel *split_kernel;
-
-       struct CUDAMem {
-               CUDAMem()
-               : texobject(0), array(0), map_host_pointer(0), free_map_host(false) {}
-
-               CUtexObject texobject;
-               CUarray array;
-               void *map_host_pointer;
-               bool free_map_host;
-       };
-       typedef map<device_memory*, CUDAMem> CUDAMemMap;
-       CUDAMemMap cuda_mem_map;
-
-       struct PixelMem {
-               GLuint cuPBO;
-               CUgraphicsResource cuPBOresource;
-               GLuint cuTexId;
-               int w, h;
-       };
-       map<device_ptr, PixelMem> pixel_mem_map;
-
-       /* Bindless Textures */
-       device_vector<TextureInfo> texture_info;
-       bool need_texture_info;
-
-       CUdeviceptr cuda_device_ptr(device_ptr mem)
-       {
-               return (CUdeviceptr)mem;
-       }
-
-       static bool have_precompiled_kernels()
-       {
-               string cubins_path = path_get("lib");
-               return path_exists(cubins_path);
-       }
-
-       virtual bool show_samples() const
-       {
-               /* The CUDADevice only processes one tile at a time, so showing samples is fine. */
-               return true;
-       }
-
-       virtual BVHLayoutMask get_bvh_layout_mask() const {
-               return BVH_LAYOUT_BVH2;
-       }
-
-/*#ifdef NDEBUG
+class CUDADevice : public Device {
+ public:
+  DedicatedTaskPool task_pool;
+  CUdevice cuDevice;
+  CUcontext cuContext;
+  CUmodule cuModule, cuFilterModule;
+  size_t device_texture_headroom;
+  size_t device_working_headroom;
+  bool move_texture_to_host;
+  size_t map_host_used;
+  size_t map_host_limit;
+  int can_map_host;
+  int cuDevId;
+  int cuDevArchitecture;
+  bool first_error;
+  CUDASplitKernel *split_kernel;
+
+  struct CUDAMem {
+    CUDAMem() : texobject(0), array(0), map_host_pointer(0), free_map_host(false)
+    {
+    }
+
+    CUtexObject texobject;
+    CUarray array;
+    void *map_host_pointer;
+    bool free_map_host;
+  };
+  typedef map<device_memory *, CUDAMem> CUDAMemMap;
+  CUDAMemMap cuda_mem_map;
+
+  struct PixelMem {
+    GLuint cuPBO;
+    CUgraphicsResource cuPBOresource;
+    GLuint cuTexId;
+    int w, h;
+  };
+  map<device_ptr, PixelMem> pixel_mem_map;
+
+  /* Bindless Textures */
+  device_vector<TextureInfo> texture_info;
+  bool need_texture_info;
+
+  CUdeviceptr cuda_device_ptr(device_ptr mem)
+  {
+    return (CUdeviceptr)mem;
+  }
+
+  static bool have_precompiled_kernels()
+  {
+    string cubins_path = path_get("lib");
+    return path_exists(cubins_path);
+  }
+
+  virtual bool show_samples() const
+  {
+    /* The CUDADevice only processes one tile at a time, so showing samples is fine. */
+    return true;
+  }
+
+  virtual BVHLayoutMask get_bvh_layout_mask() const
+  {
+    return BVH_LAYOUT_BVH2;
+  }
+
+  /*#ifdef NDEBUG
 #define cuda_abort()
 #else
 #define cuda_abort() abort()
 #endif*/
-       void cuda_error_documentation()
-       {
-               if(first_error) {
-                       fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n");
-                       fprintf(stderr, "https://docs.blender.org/manual/en/dev/render/cycles/gpu_rendering.html\n\n");
-                       first_error = false;
-               }
-       }
+  void cuda_error_documentation()
+  {
+    if (first_error) {
+      fprintf(stderr,
+              "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n");
+      fprintf(stderr,
+              "https://docs.blender.org/manual/en/dev/render/cycles/gpu_rendering.html\n\n");
+      first_error = false;
+    }
+  }
 
 #define cuda_assert(stmt) \
-       { \
-               CUresult result = stmt; \
-               \
-               if(result != CUDA_SUCCESS) { \
-                       string message = string_printf("CUDA error: %s in %s, line %d", cuewErrorString(result), #stmt, __LINE__); \
-                       if(error_msg == "") \
-                               error_msg = message; \
-                       fprintf(stderr, "%s\n", message.c_str()); \
-                       /*cuda_abort();*/ \
-                       cuda_error_documentation(); \
-               } \
-       } (void) 0
-
-       bool cuda_error_(CUresult result, const string& stmt)
-       {
-               if(result == CUDA_SUCCESS)
-                       return false;
-
-               string message = string_printf("CUDA error at %s: %s", stmt.c_str(), cuewErrorString(result));
-               if(error_msg == "")
-                       error_msg = message;
-               fprintf(stderr, "%s\n", message.c_str());
-               cuda_error_documentation();
-               return true;
-       }
+  { \
+    CUresult result = stmt; \
+\
+    if (result != CUDA_SUCCESS) { \
+      string message = string_printf( \
+          "CUDA error: %s in %s, line %d", cuewErrorString(result), #stmt, __LINE__); \
+      if (error_msg == "") \
+        error_msg = message; \
+      fprintf(stderr, "%s\n", message.c_str()); \
+      /*cuda_abort();*/ \
+      cuda_error_documentation(); \
+    } \
+  } \
+  (void)0
+
+  bool cuda_error_(CUresult result, const string &stmt)
+  {
+    if (result == CUDA_SUCCESS)
+      return false;
+
+    string message = string_printf("CUDA error at %s: %s", stmt.c_str(), cuewErrorString(result));
+    if (error_msg == "")
+      error_msg = message;
+    fprintf(stderr, "%s\n", message.c_str());
+    cuda_error_documentation();
+    return true;
+  }
 
 #define cuda_error(stmt) cuda_error_(stmt, #stmt)
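
The trailing (void)0 is the usual block-macro idiom: it makes the call site supply
the terminating semicolon, so cuda_assert() behaves like a single statement and
stays well-formed even in unbraced if/else chains. A contrived illustration:

  if (background)
    cuda_assert(cuCtxSynchronize()); /* This semicolon terminates the (void)0. */
  else
    cuda_assert(cuStreamSynchronize(0));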
 
-       void cuda_error_message(const string& message)
-       {
-               if(error_msg == "")
-                       error_msg = message;
-               fprintf(stderr, "%s\n", message.c_str());
-               cuda_error_documentation();
-       }
-
-       CUDADevice(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background_)
-       : Device(info, stats, profiler, background_),
-         texture_info(this, "__texture_info", MEM_TEXTURE)
-       {
-               first_error = true;
-               background = background_;
-
-               cuDevId = info.num;
-               cuDevice = 0;
-               cuContext = 0;
-
-               cuModule = 0;
-               cuFilterModule = 0;
-
-               split_kernel = NULL;
-
-               need_texture_info = false;
-
-               device_texture_headroom = 0;
-               device_working_headroom = 0;
-               move_texture_to_host = false;
-               map_host_limit = 0;
-               map_host_used = 0;
-               can_map_host = 0;
-
-               /* Initialize CUDA. */
-               if(cuda_error(cuInit(0)))
-                       return;
-
-               /* Setup device and context. */
-               if(cuda_error(cuDeviceGet(&cuDevice, cuDevId)))
-                       return;
-
-               /* CU_CTX_MAP_HOST for mapping host memory when out of device memory.
-                * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render,
-                * so we can predict which memory to map to host. */
-               cuda_assert(cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
-
-               unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX;
-               if(can_map_host) {
-                       ctx_flags |= CU_CTX_MAP_HOST;
-                       init_host_memory();
-               }
-
-               /* Create context. */
-               CUresult result;
-
-               if(background) {
-                       result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
-               }
-               else {
-                       result = cuGLCtxCreate(&cuContext, ctx_flags, cuDevice);
-
-                       if(result != CUDA_SUCCESS) {
-                               result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
-                               background = true;
-                       }
-               }
-
-               if(cuda_error_(result, "cuCtxCreate"))
-                       return;
-
-               int major, minor;
-               cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
-               cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
-               cuDevArchitecture = major*100 + minor*10;
-
-               /* Pop context set by cuCtxCreate. */
-               cuCtxPopCurrent(NULL);
-       }
-
-       ~CUDADevice()
-       {
-               task_pool.stop();
-
-               delete split_kernel;
-
-               texture_info.free();
-
-               cuda_assert(cuCtxDestroy(cuContext));
-       }
-
-       bool support_device(const DeviceRequestedFeatures& /*requested_features*/)
-       {
-               int major, minor;
-               cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
-               cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
-
-               /* We only support sm_30 and above */
-               if(major < 3) {
-                       cuda_error_message(string_printf("CUDA device supported only with compute capability 3.0 or up, found %d.%d.", major, minor));
-                       return false;
-               }
-
-               return true;
-       }
-
-       bool use_adaptive_compilation()
-       {
-               return DebugFlags().cuda.adaptive_compile;
-       }
-
-       bool use_split_kernel()
-       {
-               return DebugFlags().cuda.split_kernel;
-       }
-
-       /* Common NVCC flags which stay the same regardless of shading model
-        * or kernel sources MD5, and only depend on the compiler or compilation settings.
-        */
-       string compile_kernel_get_common_cflags(
-               const DeviceRequestedFeatures& requested_features,
-               bool filter=false, bool split=false)
-       {
-               const int machine = system_cpu_bits();
-               const string source_path = path_get("source");
-               const string include_path = source_path;
-               string cflags = string_printf("-m%d "
-                                             "--ptxas-options=\"-v\" "
-                                             "--use_fast_math "
-                                             "-DNVCC "
-                                              "-I\"%s\"",
-                                             machine,
-                                             include_path.c_str());
-               if(!filter && use_adaptive_compilation()) {
-                       cflags += " " + requested_features.get_build_options();
-               }
-               const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS");
-               if(extra_cflags) {
-                       cflags += string(" ") + string(extra_cflags);
-               }
+  void cuda_error_message(const string &message)
+  {
+    if (error_msg == "")
+      error_msg = message;
+    fprintf(stderr, "%s\n", message.c_str());
+    cuda_error_documentation();
+  }
+
+  CUDADevice(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background_)
+      : Device(info, stats, profiler, background_),
+        texture_info(this, "__texture_info", MEM_TEXTURE)
+  {
+    first_error = true;
+    background = background_;
+
+    cuDevId = info.num;
+    cuDevice = 0;
+    cuContext = 0;
+
+    cuModule = 0;
+    cuFilterModule = 0;
+
+    split_kernel = NULL;
+
+    need_texture_info = false;
+
+    device_texture_headroom = 0;
+    device_working_headroom = 0;
+    move_texture_to_host = false;
+    map_host_limit = 0;
+    map_host_used = 0;
+    can_map_host = 0;
+
+    /* Initialize CUDA. */
+    if (cuda_error(cuInit(0)))
+      return;
+
+    /* Setup device and context. */
+    if (cuda_error(cuDeviceGet(&cuDevice, cuDevId)))
+      return;
+
+    /* CU_CTX_MAP_HOST for mapping host memory when out of device memory.
+     * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render,
+     * so we can predict which memory to map to host. */
+    cuda_assert(
+        cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
+
+    unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX;
+    if (can_map_host) {
+      ctx_flags |= CU_CTX_MAP_HOST;
+      init_host_memory();
+    }
+
+    /* Create context. */
+    CUresult result;
+
+    if (background) {
+      result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
+    }
+    else {
+      result = cuGLCtxCreate(&cuContext, ctx_flags, cuDevice);
+
+      if (result != CUDA_SUCCESS) {
+        result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
+        background = true;
+      }
+    }
+
+    if (cuda_error_(result, "cuCtxCreate"))
+      return;
+
+    int major, minor;
+    cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
+    cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
+    cuDevArchitecture = major * 100 + minor * 10;
+
+    /* Pop context set by cuCtxCreate. */
+    cuCtxPopCurrent(NULL);
+  }
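
As a worked example of the architecture encoding: a compute capability 6.1 card
(e.g. a GTX 1080) yields cuDevArchitecture = 6 * 100 + 1 * 10 = 610.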
+
+  ~CUDADevice()
+  {
+    task_pool.stop();
+
+    delete split_kernel;
+
+    texture_info.free();
+
+    cuda_assert(cuCtxDestroy(cuContext));
+  }
+
+  bool support_device(const DeviceRequestedFeatures & /*requested_features*/)
+  {
+    int major, minor;
+    cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
+    cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
+
+    /* We only support sm_30 and above */
+    if (major < 3) {
+      cuda_error_message(string_printf(
+          "CUDA device supported only with compute capability 3.0 or up, found %d.%d.",
+          major,
+          minor));
+      return false;
+    }
+
+    return true;
+  }
+
+  bool use_adaptive_compilation()
+  {
+    return DebugFlags().cuda.adaptive_compile;
+  }
+
+  bool use_split_kernel()
+  {
+    return DebugFlags().cuda.split_kernel;
+  }
+
+  /* Common NVCC flags which stay the same regardless of shading model
+   * or kernel sources MD5, and only depend on the compiler or compilation settings.
+   */
+  string compile_kernel_get_common_cflags(const DeviceRequestedFeatures &requested_features,
+                                          bool filter = false,
+                                          bool split = false)
+  {
+    const int machine = system_cpu_bits();
+    const string source_path = path_get("source");
+    const string include_path = source_path;
+    string cflags = string_printf(
+        "-m%d "
+        "--ptxas-options=\"-v\" "
+        "--use_fast_math "
+        "-DNVCC "
+        "-I\"%s\"",
+        machine,
+        include_path.c_str());
+    if (!filter && use_adaptive_compilation()) {
+      cflags += " " + requested_features.get_build_options();
+    }
+    const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS");
+    if (extra_cflags) {
+      cflags += string(" ") + string(extra_cflags);
+    }
 #ifdef WITH_CYCLES_DEBUG
-               cflags += " -D__KERNEL_DEBUG__";
+    cflags += " -D__KERNEL_DEBUG__";
 #endif
 
-               if(split) {
-                       cflags += " -D__SPLIT__";
-               }
-
-               return cflags;
-       }
-
-       bool compile_check_compiler() {
-               const char *nvcc = cuewCompilerPath();
-               if(nvcc == NULL) {
-                       cuda_error_message("CUDA nvcc compiler not found. "
-                                          "Install CUDA toolkit in default location.");
-                       return false;
-               }
-               const int cuda_version = cuewCompilerVersion();
-               VLOG(1) << "Found nvcc " << nvcc
-                       << ", CUDA version " << cuda_version
-                       << ".";
-               const int major = cuda_version / 10, minor = cuda_version % 10;
-               if(cuda_version == 0) {
-                       cuda_error_message("CUDA nvcc compiler version could not be parsed.");
-                       return false;
-               }
-               if(cuda_version < 80) {
-                       printf("Unsupported CUDA version %d.%d detected, "
-                              "you need CUDA 8.0 or newer.\n",
-                              major, minor);
-                       return false;
-               }
-               else if(cuda_version != 101) {
-                       printf("CUDA version %d.%d detected, build may succeed but only "
-                              "CUDA 10.1 is officially supported.\n",
-                              major, minor);
-               }
-               return true;
-       }
-
-       string compile_kernel(const DeviceRequestedFeatures& requested_features,
-                             bool filter=false, bool split=false)
-       {
-               const char *name, *source;
-               if(filter) {
-                       name = "filter";
-                       source = "filter.cu";
-               }
-               else if(split) {
-                       name = "kernel_split";
-                       source = "kernel_split.cu";
-               }
-               else {
-                       name = "kernel";
-                       source = "kernel.cu";
-               }
-               /* Compute cubin name. */
-               int major, minor;
-               cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
-               cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
-
-               /* Attempt to use kernel provided with Blender. */
-               if(!use_adaptive_compilation()) {
-                       const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin",
-                                                                   name, major, minor));
-                       VLOG(1) << "Testing for pre-compiled kernel " << cubin << ".";
-                       if(path_exists(cubin)) {
-                               VLOG(1) << "Using precompiled kernel.";
-                               return cubin;
-                       }
-               }
-
-               const string common_cflags =
-                       compile_kernel_get_common_cflags(requested_features, filter, split);
-
-               /* Try to use locally compiled kernel. */
-               const string source_path = path_get("source");
-               const string kernel_md5 = path_files_md5_hash(source_path);
-
-               /* We include cflags in the MD5, so that changing the CUDA toolkit or other
-                * compiler command line arguments ensures the cubin gets re-built.
-                */
-               const string cubin_md5 = util_md5_string(kernel_md5 + common_cflags);
-
-               const string cubin_file = string_printf("cycles_%s_sm%d%d_%s.cubin",
-                                                       name, major, minor,
-                                                       cubin_md5.c_str());
-               const string cubin = path_cache_get(path_join("kernels", cubin_file));
-               VLOG(1) << "Testing for locally compiled kernel " << cubin << ".";
-               if(path_exists(cubin)) {
-                       VLOG(1) << "Using locally compiled kernel.";
-                       return cubin;
-               }
+    if (split) {
+      cflags += " -D__SPLIT__";
+    }
+
+    return cflags;
+  }
+
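
For a 64-bit build with adaptive compilation off and no extra environment flags,
the returned string comes out as something like (the include path is illustrative;
it is resolved from path_get("source") at runtime):

  -m64 --ptxas-options="-v" --use_fast_math -DNVCC -I"/path/to/cycles/source"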
+  bool compile_check_compiler()
+  {
+    const char *nvcc = cuewCompilerPath();
+    if (nvcc == NULL) {
+      cuda_error_message(
+          "CUDA nvcc compiler not found. "
+          "Install CUDA toolkit in default location.");
+      return false;
+    }
+    const int cuda_version = cuewCompilerVersion();
+    VLOG(1) << "Found nvcc " << nvcc << ", CUDA version " << cuda_version << ".";
+    const int major = cuda_version / 10, minor = cuda_version % 10;
+    if (cuda_version == 0) {
+      cuda_error_message("CUDA nvcc compiler version could not be parsed.");
+      return false;
+    }
+    if (cuda_version < 80) {
+      printf(
+          "Unsupported CUDA version %d.%d detected, "
+          "you need CUDA 8.0 or newer.\n",
+          major,
+          minor);
+      return false;
+    }
+    else if (cuda_version != 101) {
+      printf(
+          "CUDA version %d.%d detected, build may succeed but only "
+          "CUDA 10.1 is officially supported.\n",
+          major,
+          minor);
+    }
+    return true;
+  }
+
+  string compile_kernel(const DeviceRequestedFeatures &requested_features,
+                        bool filter = false,
+                        bool split = false)
+  {
+    const char *name, *source;
+    if (filter) {
+      name = "filter";
+      source = "filter.cu";
+    }
+    else if (split) {
+      name = "kernel_split";
+      source = "kernel_split.cu";
+    }
+    else {
+      name = "kernel";
+      source = "kernel.cu";
+    }
+    /* Compute cubin name. */
+    int major, minor;
+    cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
+    cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
+
+    /* Attempt to use kernel provided with Blender. */
+    if (!use_adaptive_compilation()) {
+      const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin", name, major, minor));
+      VLOG(1) << "Testing for pre-compiled kernel " << cubin << ".";
+      if (path_exists(cubin)) {
+        VLOG(1) << "Using precompiled kernel.";
+        return cubin;
+      }
+    }
+
+    const string common_cflags = compile_kernel_get_common_cflags(
+        requested_features, filter, split);
+
+    /* Try to use locally compiled kernel. */
+    const string source_path = path_get("source");
+    const string kernel_md5 = path_files_md5_hash(source_path);
+
+    /* We include cflags in the MD5, so that changing the CUDA toolkit or other
+     * compiler command line arguments ensures the cubin gets re-built.
+     */
+    const string cubin_md5 = util_md5_string(kernel_md5 + common_cflags);
+
+    const string cubin_file = string_printf(
+        "cycles_%s_sm%d%d_%s.cubin", name, major, minor, cubin_md5.c_str());
+    const string cubin = path_cache_get(path_join("kernels", cubin_file));
+    VLOG(1) << "Testing for locally compiled kernel " << cubin << ".";
+    if (path_exists(cubin)) {
+      VLOG(1) << "Using locally compiled kernel.";
+      return cubin;
+    }
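
As a concrete example, for the megakernel on an sm_61 card the two lookups above
probe, in order (cache directory and hash shown as placeholders):

  lib/kernel_sm_61.cubin                              (precompiled, shipped with Blender)
  <cache_dir>/kernels/cycles_kernel_sm61_<md5>.cubin  (previous local build)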
 
 #ifdef _WIN32
-               if(have_precompiled_kernels()) {
-                       if(major < 3) {
-                               cuda_error_message(string_printf(
-                                       "CUDA device requires compute capability 3.0 or up, "
-                                       "found %d.%d. Your GPU is not supported.",
-                                       major, minor));
-                       }
-                       else {
-                               cuda_error_message(string_printf(
-                                       "CUDA binary kernel for this graphics card compute "
-                                       "capability (%d.%d) not found.",
-                                       major, minor));
-                       }
-                       return "";
-               }
+    if (have_precompiled_kernels()) {
+      if (major < 3) {
+        cuda_error_message(
+            string_printf("CUDA device requires compute capability 3.0 or up, "
+                          "found %d.%d. Your GPU is not supported.",
+                          major,
+                          minor));
+      }
+      else {
+        cuda_error_message(
+            string_printf("CUDA binary kernel for this graphics card compute "
+                          "capability (%d.%d) not found.",
+                          major,
+                          minor));
+      }
+      return "";
+    }
 #endif
 
-               /* Compile. */
-               if(!compile_check_compiler()) {
-                       return "";
-               }
-               const char *nvcc = cuewCompilerPath();
-               const string kernel = path_join(
-                       path_join(source_path, "kernel"),
-                       path_join("kernels",
-                                 path_join("cuda", source)));
-               double starttime = time_dt();
-               printf("Compiling CUDA kernel ...\n");
-
-               path_create_directories(cubin);
-
-               string command = string_printf("\"%s\" "
-                                              "-arch=sm_%d%d "
-                                              "--cubin \"%s\" "
-                                              "-o \"%s\" "
-                                              "%s ",
-                                              nvcc,
-                                              major, minor,
-                                              kernel.c_str(),
-                                              cubin.c_str(),
-                                              common_cflags.c_str());
-
-               printf("%s\n", command.c_str());
-
-               if(system(command.c_str()) == -1) {
-                       cuda_error_message("Failed to execute compilation command, "
-                                          "see console for details.");
-                       return "";
-               }
-
-               /* Verify that compilation succeeded. */
-               if(!path_exists(cubin)) {
-                       cuda_error_message("CUDA kernel compilation failed, "
-                                          "see console for details.");
-                       return "";
-               }
-
-               printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);
-
-               return cubin;
-       }
-
-       bool load_kernels(const DeviceRequestedFeatures& requested_features)
-       {
-               /* TODO(sergey): Support kernel re-loading for CUDA devices.
-                *
-                * Currently re-loading the kernel will invalidate memory pointers,
-                * causing problems in cuCtxSynchronize.
-                */
-               if(cuFilterModule && cuModule) {
-                       VLOG(1) << "Skipping kernel reload, not currently supported.";
-                       return true;
-               }
-
-               /* check if cuda init succeeded */
-               if(cuContext == 0)
-                       return false;
-
-               /* check if GPU is supported */
-               if(!support_device(requested_features))
-                       return false;
-
-               /* get kernel */
-               string cubin = compile_kernel(requested_features, false, use_split_kernel());
-               if(cubin == "")
-                       return false;
-
-               string filter_cubin = compile_kernel(requested_features, true, false);
-               if(filter_cubin == "")
-                       return false;
-
-               /* open module */
-               CUDAContextScope scope(this);
-
-               string cubin_data;
-               CUresult result;
-
-               if(path_read_text(cubin, cubin_data))
-                       result = cuModuleLoadData(&cuModule, cubin_data.c_str());
-               else
-                       result = CUDA_ERROR_FILE_NOT_FOUND;
-
-               if(cuda_error_(result, "cuModuleLoad"))
-                       cuda_error_message(string_printf("Failed loading CUDA kernel %s.", cubin.c_str()));
-
-               if(path_read_text(filter_cubin, cubin_data))
-                       result = cuModuleLoadData(&cuFilterModule, cubin_data.c_str());
-               else
-                       result = CUDA_ERROR_FILE_NOT_FOUND;
-
-               if(cuda_error_(result, "cuModuleLoad"))
-                       cuda_error_message(string_printf("Failed loading CUDA kernel %s.", filter_cubin.c_str()));
-
-               if(result == CUDA_SUCCESS) {
-                       reserve_local_memory(requested_features);
-               }
-
-               return (result == CUDA_SUCCESS);
-       }
-
-       void reserve_local_memory(const DeviceRequestedFeatures& requested_features)
-       {
-               if(use_split_kernel()) {
-                       /* The split kernel mostly uses global memory and adaptive compilation,
-                        * so it is currently difficult to predict how much is needed. */
-                       return;
-               }
-
-               /* Together with CU_CTX_LMEM_RESIZE_TO_MAX, this reserves local memory
-                * needed for kernel launches, so that we can reliably figure out when
-                * to allocate scene data in mapped host memory. */
-               CUDAContextScope scope(this);
-
-               size_t total = 0, free_before = 0, free_after = 0;
-               cuMemGetInfo(&free_before, &total);
-
-               /* Get kernel function. */
-               CUfunction cuPathTrace;
-
-               if(requested_features.use_integrator_branched) {
-                       cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace"));
-               }
-               else {
-                       cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace"));
-               }
-
-               cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));
-
-               int min_blocks, num_threads_per_block;
-               cuda_assert(cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0));
-
-               /* Launch the kernel; using just 1 block appears sufficient to reserve
-                * memory for all multiprocessors. It would still be good to do this in
-                * parallel for the multi-GPU case to make it faster. */
-               CUdeviceptr d_work_tiles = 0;
-               uint total_work_size = 0;
-
-               void *args[] = {&d_work_tiles,
-                               &total_work_size};
-
-               cuda_assert(cuLaunchKernel(cuPathTrace,
-                                          1, 1, 1,
-                                          num_threads_per_block, 1, 1,
-                                          0, 0, args, 0));
-
-               cuda_assert(cuCtxSynchronize());
-
-               cuMemGetInfo(&free_after, &total);
-               VLOG(1) << "Local memory reserved "
-                       << string_human_readable_number(free_before - free_after) << " bytes. ("
-                       << string_human_readable_size(free_before - free_after) << ")";
+    /* Compile. */
+    if (!compile_check_compiler()) {
+      return "";
+    }
+    const char *nvcc = cuewCompilerPath();
+    const string kernel = path_join(path_join(source_path, "kernel"),
+                                    path_join("kernels", path_join("cuda", source)));
+    double starttime = time_dt();
+    printf("Compiling CUDA kernel ...\n");
+
+    path_create_directories(cubin);
+
+    string command = string_printf(
+        "\"%s\" "
+        "-arch=sm_%d%d "
+        "--cubin \"%s\" "
+        "-o \"%s\" "
+        "%s ",
+        nvcc,
+        major,
+        minor,
+        kernel.c_str(),
+        cubin.c_str(),
+        common_cflags.c_str());
+
+    printf("%s\n", command.c_str());
+
+    if (system(command.c_str()) == -1) {
+      cuda_error_message(
+          "Failed to execute compilation command, "
+          "see console for details.");
+      return "";
+    }
+
+    /* Verify that compilation succeeded. */
+    if (!path_exists(cubin)) {
+      cuda_error_message(
+          "CUDA kernel compilation failed, "
+          "see console for details.");
+      return "";
+    }
+
+    printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);
+
+    return cubin;
+  }
+
+  bool load_kernels(const DeviceRequestedFeatures &requested_features)
+  {
+    /* TODO(sergey): Support kernel re-loading for CUDA devices.
+     *
+     * Currently re-loading the kernel will invalidate memory pointers,
+     * causing problems in cuCtxSynchronize.
+     */
+    if (cuFilterModule && cuModule) {
+      VLOG(1) << "Skipping kernel reload, not currently supported.";
+      return true;
+    }
+
+    /* check if cuda init succeeded */
+    if (cuContext == 0)
+      return false;
+
+    /* check if GPU is supported */
+    if (!support_device(requested_features))
+      return false;
+
+    /* get kernel */
+    string cubin = compile_kernel(requested_features, false, use_split_kernel());
+    if (cubin == "")
+      return false;
+
+    string filter_cubin = compile_kernel(requested_features, true, false);
+    if (filter_cubin == "")
+      return false;
+
+    /* open module */
+    CUDAContextScope scope(this);
+
+    string cubin_data;
+    CUresult result;
+
+    if (path_read_text(cubin, cubin_data))
+      result = cuModuleLoadData(&cuModule, cubin_data.c_str());
+    else
+      result = CUDA_ERROR_FILE_NOT_FOUND;
+
+    if (cuda_error_(result, "cuModuleLoad"))
+      cuda_error_message(string_printf("Failed loading CUDA kernel %s.", cubin.c_str()));
+
+    if (path_read_text(filter_cubin, cubin_data))
+      result = cuModuleLoadData(&cuFilterModule, cubin_data.c_str());
+    else
+      result = CUDA_ERROR_FILE_NOT_FOUND;
+
+    if (cuda_error_(result, "cuModuleLoad"))
+      cuda_error_message(string_printf("Failed loading CUDA kernel %s.", filter_cubin.c_str()));
+
+    if (result == CUDA_SUCCESS) {
+      reserve_local_memory(requested_features);
+    }
+
+    return (result == CUDA_SUCCESS);
+  }
+
+  void reserve_local_memory(const DeviceRequestedFeatures &requested_features)
+  {
+    if (use_split_kernel()) {
+      /* The split kernel mostly uses global memory and adaptive compilation,
+       * so it is currently difficult to predict how much is needed. */
+      return;
+    }
+
+    /* Together with CU_CTX_LMEM_RESIZE_TO_MAX, this reserves local memory
+     * needed for kernel launches, so that we can reliably figure out when
+     * to allocate scene data in mapped host memory. */
+    CUDAContextScope scope(this);
+
+    size_t total = 0, free_before = 0, free_after = 0;
+    cuMemGetInfo(&free_before, &total);
+
+    /* Get kernel function. */
+    CUfunction cuPathTrace;
+
+    if (requested_features.use_integrator_branched) {
+      cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace"));
+    }
+    else {
+      cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace"));
+    }
+
+    cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));
+
+    int min_blocks, num_threads_per_block;
+    cuda_assert(cuOccupancyMaxPotentialBlockSize(
+        &min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0));
+
+    /* Launch the kernel; using just 1 block appears sufficient to reserve
+     * memory for all multiprocessors. It would still be good to do this in
+     * parallel for the multi-GPU case to make it faster. */
+    CUdeviceptr d_work_tiles = 0;
+    uint total_work_size = 0;
+
+    void *args[] = {&d_work_tiles, &total_work_size};
+
+    cuda_assert(cuLaunchKernel(cuPathTrace, 1, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
+
+    cuda_assert(cuCtxSynchronize());
+
+    cuMemGetInfo(&free_after, &total);
+    VLOG(1) << "Local memory reserved " << string_human_readable_number(free_before - free_after)
+            << " bytes. (" << string_human_readable_size(free_before - free_after) << ")";
 
 #if 0
-               /* For testing mapped host memory, fill up device memory. */
-               const size_t keep_mb = 1024;
-
-               while(free_after > keep_mb * 1024 * 1024LL) {
-                       CUdeviceptr tmp;
-                       cuda_assert(cuMemAlloc(&tmp, 10 * 1024 * 1024LL));
-                       cuMemGetInfo(&free_after, &total);
-               }
+    /* For testing mapped host memory, fill up device memory. */
+    const size_t keep_mb = 1024;
+
+    while(free_after > keep_mb * 1024 * 1024LL) {
+      CUdeviceptr tmp;
+      cuda_assert(cuMemAlloc(&tmp, 10 * 1024 * 1024LL));
+      cuMemGetInfo(&free_after, &total);
+    }
 #endif
-       }
-
-       void init_host_memory()
-       {
-               /* Limit amount of host mapped memory, because allocating too much can
-                * cause system instability. Leave at least half or 4 GB of system
-                * memory free, whichever is smaller. */
-               size_t default_limit = 4 * 1024 * 1024 * 1024LL;
-               size_t system_ram = system_physical_ram();
-
-               if(system_ram > 0) {
-                       if(system_ram / 2 > default_limit) {
-                               map_host_limit = system_ram - default_limit;
-                       }
-                       else {
-                               map_host_limit = system_ram / 2;
-                       }
-               }
-               else {
-                       VLOG(1) << "Mapped host memory disabled, failed to get system RAM";
-                       map_host_limit = 0;
-               }
-
-               /* Amount of device memory to keep free after texture memory
-                * and working memory allocations, respectively. We set the working
-                * memory headroom lower so that some space is left after all
-                * texture memory allocations. */
-               device_working_headroom = 32 * 1024 * 1024LL; // 32MB
-               device_texture_headroom = 128 * 1024 * 1024LL; // 128MB
-
-               VLOG(1) << "Mapped host memory limit set to "
-                       << string_human_readable_number(map_host_limit) << " bytes. ("
-                       << string_human_readable_size(map_host_limit) << ")";
-       }
-
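
Concretely: with 16 GB of system RAM, half of it (8 GB) exceeds the 4 GB default
limit, so map_host_limit becomes 16 GB - 4 GB = 12 GB; with 6 GB of RAM, half of
it (3 GB) does not, so the limit is simply 3 GB.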
-       void load_texture_info()
-       {
-               if(need_texture_info) {
-                       texture_info.copy_to_device();
-                       need_texture_info = false;
-               }
-       }
-
-       void move_textures_to_host(size_t size, bool for_texture)
-       {
-               /* Signal to reallocate textures in host memory only. */
-               move_texture_to_host = true;
-
-               while(size > 0) {
-                       /* Find suitable memory allocation to move. */
-                       device_memory *max_mem = NULL;
-                       size_t max_size = 0;
-                       bool max_is_image = false;
-
-                       foreach(CUDAMemMap::value_type& pair, cuda_mem_map) {
-                               device_memory& mem = *pair.first;
-                               CUDAMem *cmem = &pair.second;
-
-                               bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info);
-                               bool is_image = is_texture && (mem.data_height > 1);
-
-                               /* Can't move this type of memory. */
-                               if(!is_texture || cmem->array) {
-                                       continue;
-                               }
-
-                               /* Already in host memory. */
-                               if(cmem->map_host_pointer) {
-                                       continue;
-                               }
-
-                               /* For other textures, only move image textures. */
-                               if(for_texture && !is_image) {
-                                       continue;
-                               }
-
-                               /* Try to move largest allocation, prefer moving images. */
-                               if(is_image > max_is_image ||
-                                  (is_image == max_is_image && mem.device_size > max_size)) {
-                                       max_is_image = is_image;
-                                       max_size = mem.device_size;
-                                       max_mem = &mem;
-                               }
-                       }
-
-                       /* Move to host memory. This part is mutex protected since
-                        * multiple CUDA devices could be moving the memory. The
-                        * first one will do it, and the rest will adopt the pointer. */
-                       if(max_mem) {
-                               VLOG(1) << "Move memory from device to host: " << max_mem->name;
-
-                               static thread_mutex move_mutex;
-                               thread_scoped_lock lock(move_mutex);
-
-                               /* Preserve the original device pointer; in the multi-device case
-                                * we can't change it because the pointer mapping would break. */
-                               device_ptr prev_pointer = max_mem->device_pointer;
-                               size_t prev_size = max_mem->device_size;
-
-                               tex_free(*max_mem);
-                               tex_alloc(*max_mem);
-                               size = (max_size >= size)? 0: size - max_size;
-
-                               max_mem->device_pointer = prev_pointer;
-                               max_mem->device_size = prev_size;
-                       }
-                       else {
-                               break;
-                       }
-               }
-
-               /* Update texture info array with new pointers. */
-               load_texture_info();
-
-               move_texture_to_host = false;
-       }
-
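
Note the bool comparison in the selection test above: true compares greater than
false, so an image texture always outranks a non-image candidate and device_size
only breaks ties within the same class. A 200 MB image is therefore chosen for
eviction ahead of a 500 MB one-dimensional lookup table.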
-       CUDAMem *generic_alloc(device_memory& mem, size_t pitch_padding = 0)
-       {
-               CUDAContextScope scope(this);
-
-               CUdeviceptr device_pointer = 0;
-               size_t size = mem.memory_size() + pitch_padding;
-
-               CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY;
-               const char *status = "";
-
-               /* First try allocating in device memory, respecting headroom. We make
-                * an exception for texture info. It is small and frequently accessed,
-                * so treat it as working memory.
-                *
-                * If there is not enough room for working memory, we will try to move
-                * textures to host memory, assuming the performance impact would have
-                * been worse for working memory. */
-               bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info);
-               bool is_image = is_texture && (mem.data_height > 1);
-
-               size_t headroom = (is_texture)? device_texture_headroom:
-                                               device_working_headroom;
-
-               size_t total = 0, free = 0;
-               cuMemGetInfo(&free, &total);
-
-               /* Move textures to host memory if needed. */
-               if(!move_texture_to_host && !is_image && (size + headroom) >= free) {
-                       move_textures_to_host(size + headroom - free, is_texture);
-                       cuMemGetInfo(&free, &total);
-               }
-
-               /* Allocate in device memory. */
-               if(!move_texture_to_host && (size + headroom) < free) {
-                       mem_alloc_result = cuMemAlloc(&device_pointer, size);
-                       if(mem_alloc_result == CUDA_SUCCESS) {
-                               status = " in device memory";
-                       }
-               }
-
-               /* Fall back to mapped host memory if needed and possible. */
-               void *map_host_pointer = 0;
-               bool free_map_host = false;
-
-               if(mem_alloc_result != CUDA_SUCCESS && can_map_host &&
-                  map_host_used + size < map_host_limit) {
-                       if(mem.shared_pointer) {
-                               /* Another device already allocated host memory. */
-                               mem_alloc_result = CUDA_SUCCESS;
-                               map_host_pointer = mem.shared_pointer;
-                       }
-                       else {
-                               /* Allocate host memory ourselves. */
-                               mem_alloc_result = cuMemHostAlloc(&map_host_pointer, size,
-                                                                 CU_MEMHOSTALLOC_DEVICEMAP |
-                                                                 CU_MEMHOSTALLOC_WRITECOMBINED);
-                               mem.shared_pointer = map_host_pointer;
-                               free_map_host = true;
-                       }
-
-                       if(mem_alloc_result == CUDA_SUCCESS) {
-                               cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, mem.shared_pointer, 0));
-                               map_host_used += size;
-                               status = " in host memory";
-
-                               /* Replace host pointer with our host allocation. Only works if
-                                * CUDA memory layout is the same and has no pitch padding. Also
-                                * does not work if we move textures to host during a render,
-                                * since other devices might be using the memory. */
-                               if(!move_texture_to_host && pitch_padding == 0 &&
-                                  mem.host_pointer && mem.host_pointer != mem.shared_pointer) {
-                                       memcpy(mem.shared_pointer, mem.host_pointer, size);
-                                       mem.host_free();
-                                       mem.host_pointer = mem.shared_pointer;
-                               }
-                       }
-                       else {
-                               status = " failed, out of host memory";
-                       }
-               }
-               else if(mem_alloc_result != CUDA_SUCCESS) {
-                       status = " failed, out of device and host memory";
-               }
-
-               if(mem_alloc_result != CUDA_SUCCESS) {
-                       cuda_assert(mem_alloc_result);
-               }
-
-               if(mem.name) {
-                       VLOG(1) << "Buffer allocate: " << mem.name << ", "
-                                       << string_human_readable_number(mem.memory_size()) << " bytes. ("
-                                       << string_human_readable_size(mem.memory_size()) << ")"
-                                       << status;
-               }
-
-               mem.device_pointer = (device_ptr)device_pointer;
-               mem.device_size = size;
-               stats.mem_alloc(size);
-
-               if(!mem.device_pointer) {
-                       return NULL;
-               }
-
-               /* Insert into map of allocations. */
-               CUDAMem *cmem = &cuda_mem_map[&mem];
-               cmem->map_host_pointer = map_host_pointer;
-               cmem->free_map_host = free_map_host;
-               return cmem;
-       }
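
As a worked example of the policy above: with 256 MB free, a 200 MB working
buffer (32 MB headroom) is allocated on the device, since 200 + 32 < 256; a
240 MB data texture (128 MB headroom) is not, so image textures are first moved
to host to free 240 + 128 - 256 = 112 MB, and if that still fails the texture
itself falls back to mapped host memory, provided map_host_limit is not exceeded.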
-
-       void generic_copy_to(device_memory& mem)
-       {
-               if(mem.host_pointer && mem.device_pointer) {
-                       CUDAContextScope scope(this);
-
-                       if(mem.host_pointer != mem.shared_pointer) {
-                               cuda_assert(cuMemcpyHtoD(cuda_device_ptr(mem.device_pointer),
-                                                        mem.host_pointer,
-                                                        mem.memory_size()));
-                       }
-               }
-       }
-
-       void generic_free(device_memory& mem)
-       {
-               if(mem.device_pointer) {
-                       CUDAContextScope scope(this);
-                       const CUDAMem& cmem = cuda_mem_map[&mem];
-
-                       if(cmem.map_host_pointer) {
-                               /* Free host memory. */
-                               if(cmem.free_map_host) {
-                                       cuMemFreeHost(cmem.map_host_pointer);
-                                       if(mem.host_pointer == mem.shared_pointer) {
-                                               mem.host_pointer = 0;
-                                       }
-                                       mem.shared_pointer = 0;
-                               }
-
-                               map_host_used -= mem.device_size;
-                       }
-                       else {
-                               /* Free device memory. */
-                               cuMemFree(mem.device_pointer);
-                       }
-
-                       stats.mem_free(mem.device_size);
-                       mem.device_pointer = 0;
-                       mem.device_size = 0;
-
-                       cuda_mem_map.erase(cuda_mem_map.find(&mem));
-               }
-       }
-
-       void mem_alloc(device_memory& mem)
-       {
-               if(mem.type == MEM_PIXELS && !background) {
-                       pixels_alloc(mem);
-               }
-               else if(mem.type == MEM_TEXTURE) {
-                       assert(!"mem_alloc not supported for textures.");
-               }
-               else {
-                       generic_alloc(mem);
-               }
-       }
-
-       void mem_copy_to(device_memory& mem)
-       {
-               if(mem.type == MEM_PIXELS) {
-                       assert(!"mem_copy_to not supported for pixels.");
-               }
-               else if(mem.type == MEM_TEXTURE) {
-                       tex_free(mem);
-                       tex_alloc(mem);
-               }
-               else {
-                       if(!mem.device_pointer) {
-                               generic_alloc(mem);
-                       }
-
-                       generic_copy_to(mem);
-               }
-       }
-
-       void mem_copy_from(device_memory& mem, int y, int w, int h, int elem)
-       {
-               if(mem.type == MEM_PIXELS && !background) {
-                       pixels_copy_from(mem, y, w, h);
-               }
-               else if(mem.type == MEM_TEXTURE) {
-                       assert(!"mem_copy_from not supported for textures.");
-               }
-               else {
-                       CUDAContextScope scope(this);
-                       size_t offset = elem*y*w;
-                       size_t size = elem*w*h;
-
-                       if(mem.host_pointer && mem.device_pointer) {
-                               cuda_assert(cuMemcpyDtoH((uchar*)mem.host_pointer + offset,
-                                                                                (CUdeviceptr)(mem.device_pointer + offset), size));
-                       }
-                       else if(mem.host_pointer) {
-                               memset((char*)mem.host_pointer + offset, 0, size);
-                       }
-               }
-       }
-
-       void mem_zero(device_memory& mem)
-       {
-               if(!mem.device_pointer) {
-                       mem_alloc(mem);
-               }
-
-               if(mem.host_pointer) {
-                       memset(mem.host_pointer, 0, mem.memory_size());
-               }
-
-               if(mem.device_pointer &&
-                  (!mem.host_pointer || mem.host_pointer != mem.shared_pointer)) {
-                       CUDAContextScope scope(this);
-                       cuda_assert(cuMemsetD8(cuda_device_ptr(mem.device_pointer), 0, mem.memory_size()));
-               }
-       }
-
-       void mem_free(device_memory& mem)
-       {
-               if(mem.type == MEM_PIXELS && !background) {
-                       pixels_free(mem);
-               }
-               else if(mem.type == MEM_TEXTURE) {
-                       tex_free(mem);
-               }
-               else {
-                       generic_free(mem);
-               }
-       }
-
-       virtual device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int /*size*/)
-       {
-               return (device_ptr) (((char*) mem.device_pointer) + mem.memory_elements_size(offset));
-       }
-
-       void const_copy_to(const char *name, void *host, size_t size)
-       {
-               CUDAContextScope scope(this);
-               CUdeviceptr mem;
-               size_t bytes;
-
-               cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name));
-               //assert(bytes == size);
-               cuda_assert(cuMemcpyHtoD(mem, host, size));
-       }
-
-       void tex_alloc(device_memory& mem)
-       {
-               CUDAContextScope scope(this);
-
-               /* General variables for both architectures */
-               string bind_name = mem.name;
-               size_t dsize = datatype_size(mem.data_type);
-               size_t size = mem.memory_size();
-
-               CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
-               switch(mem.extension) {
-                       case EXTENSION_REPEAT:
-                               address_mode = CU_TR_ADDRESS_MODE_WRAP;
-                               break;
-                       case EXTENSION_EXTEND:
-                               address_mode = CU_TR_ADDRESS_MODE_CLAMP;
-                               break;
-                       case EXTENSION_CLIP:
-                               address_mode = CU_TR_ADDRESS_MODE_BORDER;
-                               break;
-                       default:
-                               assert(0);
-                               break;
-               }
-
-               CUfilter_mode filter_mode;
-               if(mem.interpolation == INTERPOLATION_CLOSEST) {
-                       filter_mode = CU_TR_FILTER_MODE_POINT;
-               }
-               else {
-                       filter_mode = CU_TR_FILTER_MODE_LINEAR;
-               }
-
-               /* Data Storage */
-               if(mem.interpolation == INTERPOLATION_NONE) {
-                       generic_alloc(mem);
-                       generic_copy_to(mem);
-
-                       CUdeviceptr cumem;
-                       size_t cubytes;
-
-                       cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, bind_name.c_str()));
-
-                       if(cubytes == 8) {
-                               /* 64 bit device pointer */
-                               uint64_t ptr = mem.device_pointer;
-                               cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
-                       }
-                       else {
-                               /* 32 bit device pointer */
-                               uint32_t ptr = (uint32_t)mem.device_pointer;
-                               cuda_assert(cuMemcpyHtoD(cumem, (void*)&ptr, cubytes));
-                       }
-                       return;
-               }
-
-               /* Image Texture Storage */
-               CUarray_format_enum format;
-               switch(mem.data_type) {
-                       case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break;
-                       case TYPE_UINT16: format = CU_AD_FORMAT_UNSIGNED_INT16; break;
-                       case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break;
-                       case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break;
-                       case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break;
-                       case TYPE_HALF: format = CU_AD_FORMAT_HALF; break;
-                       default: assert(0); return;
-               }
-
-               CUDAMem *cmem = NULL;
-               CUarray array_3d = NULL;
-               size_t src_pitch = mem.data_width * dsize * mem.data_elements;
-               size_t dst_pitch = src_pitch;
-
-               if(mem.data_depth > 1) {
-                       /* 3D texture using array, there is no API for linear memory. */
-                       CUDA_ARRAY3D_DESCRIPTOR desc;
-
-                       desc.Width = mem.data_width;
-                       desc.Height = mem.data_height;
-                       desc.Depth = mem.data_depth;
-                       desc.Format = format;
-                       desc.NumChannels = mem.data_elements;
-                       desc.Flags = 0;
-
-                       VLOG(1) << "Array 3D allocate: " << mem.name << ", "
-                               << string_human_readable_number(mem.memory_size()) << " bytes. ("
-                               << string_human_readable_size(mem.memory_size()) << ")";
-
-                       cuda_assert(cuArray3DCreate(&array_3d, &desc));
-
-                       if(!array_3d) {
-                               return;
-                       }
-
-                       CUDA_MEMCPY3D param;
-                       memset(&param, 0, sizeof(param));
-                       param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
-                       param.dstArray = array_3d;
-                       param.srcMemoryType = CU_MEMORYTYPE_HOST;
-                       param.srcHost = mem.host_pointer;
-                       param.srcPitch = src_pitch;
-                       param.WidthInBytes = param.srcPitch;
-                       param.Height = mem.data_height;
-                       param.Depth = mem.data_depth;
-
-                       cuda_assert(cuMemcpy3D(&param));
-
-                       mem.device_pointer = (device_ptr)array_3d;
-                       mem.device_size = size;
-                       stats.mem_alloc(size);
-
-                       cmem = &cuda_mem_map[&mem];
-                       cmem->texobject = 0;
-                       cmem->array = array_3d;
-               }
-               else if(mem.data_height > 0) {
-                       /* 2D texture, using pitch aligned linear memory. */
-                       int alignment = 0;
-                       cuda_assert(cuDeviceGetAttribute(&alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice));
-                       dst_pitch = align_up(src_pitch, alignment);
-                       size_t dst_size = dst_pitch * mem.data_height;
-
-                       cmem = generic_alloc(mem, dst_size - mem.memory_size());
-                       if(!cmem) {
-                               return;
-                       }
-
-                       CUDA_MEMCPY2D param;
-                       memset(&param, 0, sizeof(param));
-                       param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
-                       param.dstDevice = mem.device_pointer;
-                       param.dstPitch = dst_pitch;
-                       param.srcMemoryType = CU_MEMORYTYPE_HOST;
-                       param.srcHost = mem.host_pointer;
-                       param.srcPitch = src_pitch;
-                       param.WidthInBytes = param.srcPitch;
-                       param.Height = mem.data_height;
-
-                       cuda_assert(cuMemcpy2DUnaligned(&param));
-               }
-               else {
-                       /* 1D texture, using linear memory. */
-                       cmem = generic_alloc(mem);
-                       if(!cmem) {
-                               return;
-                       }
-
-                       cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size));
-               }
-
-               /* Kepler+, bindless textures. */
-               int flat_slot = 0;
-               if(string_startswith(mem.name, "__tex_image")) {
-                       int pos =  string(mem.name).rfind("_");
-                       flat_slot = atoi(mem.name + pos + 1);
-               }
-               else {
-                       assert(0);
-               }
-
-               CUDA_RESOURCE_DESC resDesc;
-               memset(&resDesc, 0, sizeof(resDesc));
-
-               if(array_3d) {
-                       resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
-                       resDesc.res.array.hArray = array_3d;
-                       resDesc.flags = 0;
-               }
-               else if(mem.data_height > 0) {
-                       resDesc.resType = CU_RESOURCE_TYPE_PITCH2D;
-                       resDesc.res.pitch2D.devPtr = mem.device_pointer;
-                       resDesc.res.pitch2D.format = format;
-                       resDesc.res.pitch2D.numChannels = mem.data_elements;
-                       resDesc.res.pitch2D.height = mem.data_height;
-                       resDesc.res.pitch2D.width = mem.data_width;
-                       resDesc.res.pitch2D.pitchInBytes = dst_pitch;
-               }
-               else {
-                       resDesc.resType = CU_RESOURCE_TYPE_LINEAR;
-                       resDesc.res.linear.devPtr = mem.device_pointer;
-                       resDesc.res.linear.format = format;
-                       resDesc.res.linear.numChannels = mem.data_elements;
-                       resDesc.res.linear.sizeInBytes = mem.device_size;
-               }
-
-               CUDA_TEXTURE_DESC texDesc;
-               memset(&texDesc, 0, sizeof(texDesc));
-               texDesc.addressMode[0] = address_mode;
-               texDesc.addressMode[1] = address_mode;
-               texDesc.addressMode[2] = address_mode;
-               texDesc.filterMode = filter_mode;
-               texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
-
-               cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));
-
-               /* Resize once */
-               if(flat_slot >= texture_info.size()) {
-                       /* Allocate some slots in advance, to reduce amount
-                        * of re-allocations. */
-                       texture_info.resize(flat_slot + 128);
-               }
-
-               /* Set Mapping and tag that we need to (re-)upload to device */
-               TextureInfo& info = texture_info[flat_slot];
-               info.data = (uint64_t)cmem->texobject;
-               info.cl_buffer = 0;
-               info.interpolation = mem.interpolation;
-               info.extension = mem.extension;
-               info.width = mem.data_width;
-               info.height = mem.data_height;
-               info.depth = mem.data_depth;
-               need_texture_info = true;
-       }
-
-       void tex_free(device_memory& mem)
-       {
-               if(mem.device_pointer) {
-                       CUDAContextScope scope(this);
-                       const CUDAMem& cmem = cuda_mem_map[&mem];
-
-                       if(cmem.texobject) {
-                               /* Free bindless texture. */
-                               cuTexObjectDestroy(cmem.texobject);
-                       }
-
-                       if(cmem.array) {
-                               /* Free array. */
-                               cuArrayDestroy(cmem.array);
-                               stats.mem_free(mem.device_size);
-                               mem.device_pointer = 0;
-                               mem.device_size = 0;
-
-                               cuda_mem_map.erase(cuda_mem_map.find(&mem));
-                       }
-                       else {
-                               generic_free(mem);
-                       }
-               }
-       }
-
-#define CUDA_GET_BLOCKSIZE(func, w, h)                                                                          \
-                       int threads_per_block;                                                                              \
-                       cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
-                       int threads = (int)sqrt((float)threads_per_block);                                                  \
-                       int xblocks = ((w) + threads - 1)/threads;                                                          \
-                       int yblocks = ((h) + threads - 1)/threads;
-
-#define CUDA_LAUNCH_KERNEL(func, args)                      \
-                       cuda_assert(cuLaunchKernel(func,                \
-                                                  xblocks, yblocks, 1, \
-                                                  threads, threads, 1, \
-                                                  0, 0, args, 0));
+  }
+
+  void init_host_memory()
+  {
+    /* Limit the amount of host-mapped memory, because allocating too much
+     * can cause system instability. Leave at least half of the system
+     * memory or 4 GB free, whichever amount is smaller. */
+    size_t default_limit = 4 * 1024 * 1024 * 1024LL;
+    size_t system_ram = system_physical_ram();
+
+    if (system_ram > 0) {
+      if (system_ram / 2 > default_limit) {
+        map_host_limit = system_ram - default_limit;
+      }
+      else {
+        map_host_limit = system_ram / 2;
+      }
+    }
+    else {
+      VLOG(1) << "Mapped host memory disabled, failed to get system RAM";
+      map_host_limit = 0;
+    }
+
+    /* Amount of device memory to keep free after texture memory
+     * and working memory allocations, respectively. The texture
+     * headroom is set higher than the working headroom so that
+     * some space is left for working memory even after all
+     * texture memory allocations. */
+    device_working_headroom = 32 * 1024 * 1024LL;   // 32MB
+    device_texture_headroom = 128 * 1024 * 1024LL;  // 128MB
+
+    VLOG(1) << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
+            << " bytes. (" << string_human_readable_size(map_host_limit) << ")";
+  }
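+
+  /* Worked example of the rule above (illustrative numbers, not from the
+   * source): with 16 GiB of system RAM, half (8 GiB) exceeds the 4 GiB
+   * default limit, so map_host_limit = 16 GiB - 4 GiB = 12 GiB. With 6 GiB
+   * of RAM, half (3 GiB) is below the default limit, so map_host_limit is
+   * 3 GiB. A minimal sketch of the same rule as a pure function
+   * (hypothetical helper, not part of Cycles):
+   *
+   *   static size_t map_host_limit_for(size_t system_ram)
+   *   {
+   *     const size_t default_limit = 4 * 1024 * 1024 * 1024LL;
+   *     if (system_ram == 0)
+   *       return 0;
+   *     return (system_ram / 2 > default_limit) ? system_ram - default_limit :
+   *                                               system_ram / 2;
+   *   }
+   */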
+
+  void load_texture_info()
+  {
+    if (need_texture_info) {
+      texture_info.copy_to_device();
+      need_texture_info = false;
+    }
+  }
+
+  void move_textures_to_host(size_t size, bool for_texture)
+  {
+    /* Signal to reallocate textures in host memory only. */
+    move_texture_to_host = true;
+
+    while (size > 0) {
+      /* Find suitable memory allocation to move. */
+      device_memory *max_mem = NULL;
+      size_t max_size = 0;
+      bool max_is_image = false;
+
+      foreach (CUDAMemMap::value_type &pair, cuda_mem_map) {
+        device_memory &mem = *pair.first;
+        CUDAMem *cmem = &pair.second;
+
+        bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info);
+        bool is_image = is_texture && (mem.data_height > 1);
+
+        /* Can't move this type of memory. */
+        if (!is_texture || cmem->array) {
+          continue;
+        }
+
+        /* Already in host memory. */
+        if (cmem->map_host_pointer) {
+          continue;
+        }
+
+        /* When making room for a texture allocation, only move image textures. */
+        if (for_texture && !is_image) {
+          continue;
+        }
+
+        /* Try to move the largest allocation, preferring to move images. */
+        if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
+          max_is_image = is_image;
+          max_size = mem.device_size;
+          max_mem = &mem;
+        }
+      }
+
+      /* Move to host memory. This part is mutex-protected, since
+       * multiple CUDA devices could be moving memory at the same
+       * time. The first one will do the move, and the rest will
+       * adopt the pointer. */
+      if (max_mem) {
+        VLOG(1) << "Move memory from device to host: " << max_mem->name;
+
+        static thread_mutex move_mutex;
+        thread_scoped_lock lock(move_mutex);
+
+        /* Preserve the original device pointer; in the multi-device case we
+         * cannot change it, because that would break the pointer mapping. */
+        device_ptr prev_pointer = max_mem->device_pointer;
+        size_t prev_size = max_mem->device_size;
+
+        tex_free(*max_mem);
+        tex_alloc(*max_mem);
+        size = (max_size >= size) ? 0 : size - max_size;
+
+        max_mem->device_pointer = prev_pointer;
+        max_mem->device_size = prev_size;
+      }
+      else {
+        break;
+      }
+    }
+
+    /* Update texture info array with new pointers. */
+    load_texture_info();
+
+    move_texture_to_host = false;
+  }
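+
+  /* The loop above is a greedy eviction policy: image textures are preferred
+   * over other textures, and among candidates of the same kind the largest
+   * allocation wins. A sketch of the ranking as a standalone predicate
+   * (hypothetical helper, shown only to make the ordering explicit):
+   *
+   *   static bool ranks_higher(bool is_image, size_t size,
+   *                            bool best_is_image, size_t best_size)
+   *   {
+   *     // bool comparison: an image (true) outranks a non-image (false).
+   *     return is_image > best_is_image ||
+   *            (is_image == best_is_image && size > best_size);
+   *   }
+   */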
+
+  CUDAMem *generic_alloc(device_memory &mem, size_t pitch_padding = 0)
+  {
+    CUDAContextScope scope(this);
+
+    CUdeviceptr device_pointer = 0;
+    size_t size = mem.memory_size() + pitch_padding;
+
+    CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY;
+    const char *status = "";
+
+    /* First try allocating in device memory, respecting headroom. We make
+     * an exception for texture info: it is small and frequently accessed,
+     * so treat it as working memory.
+     *
+     * If there is not enough room for working memory, we will try to move
+     * textures to host memory, assuming the performance impact would be
+     * worse for working memory. */
+    bool is_texture = (mem.type == MEM_TEXTURE) && (&mem != &texture_info);
+    bool is_image = is_texture && (mem.data_height > 1);
+
+    size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
+
+    size_t total = 0, free = 0;
+    cuMemGetInfo(&free, &total);
+
+    /* Move textures to host memory if needed. */
+    if (!move_texture_to_host && !is_image && (size + headroom) >= free) {
+      move_textures_to_host(size + headroom - free, is_texture);
+      cuMemGetInfo(&free, &total);
+    }
+
+    /* Allocate in device memory. */
+    if (!move_texture_to_host && (size + headroom) < free) {
+      mem_alloc_result = cuMemAlloc(&device_pointer, size);
+      if (mem_alloc_result == CUDA_SUCCESS) {
+        status = " in device memory";
+      }
+    }
+
+    /* Fall back to mapped host memory if needed and possible. */
+    void *map_host_pointer = 0;
+    bool free_map_host = false;
+
+    if (mem_alloc_result != CUDA_SUCCESS && can_map_host &&
+        map_host_used + size < map_host_limit) {
+      if (mem.shared_pointer) {
+        /* Another device already allocated host memory. */
+        mem_alloc_result = CUDA_SUCCESS;
+        map_host_pointer = mem.shared_pointer;
+      }
+      else {
+        /* Allocate host memory ourselves. */
+        mem_alloc_result = cuMemHostAlloc(
+            &map_host_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
+        mem.shared_pointer = map_host_pointer;
+        free_map_host = true;
+      }
+
+      if (mem_alloc_result == CUDA_SUCCESS) {
+        cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, mem.shared_pointer, 0));
+        map_host_used += size;
+        status = " in host memory";
+
+        /* Replace host pointer with our host allocation. Only works if
+         * CUDA memory layout is the same and has no pitch padding. Also
+         * does not work if we move textures to host during a render,
+         * since other devices might be using the memory. */
+        if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
+            mem.host_pointer != mem.shared_pointer) {
+          memcpy(mem.shared_pointer, mem.host_pointer, size);
+          mem.host_free();
+          mem.host_pointer = mem.shared_pointer;
+        }
+      }
+      else {
+        status = " failed, out of host memory";
+      }
+    }
+    else if (mem_alloc_result != CUDA_SUCCESS) {
+      status = " failed, out of device and host memory";
+    }
+
+    if (mem_alloc_result != CUDA_SUCCESS) {
+      cuda_assert(mem_alloc_result);
+    }
+
+    if (mem.name) {
+      VLOG(1) << "Buffer allocate: " << mem.name << ", "
+              << string_human_readable_number(mem.memory_size()) << " bytes. ("
+              << string_human_readable_size(mem.memory_size()) << ")" << status;
+    }
+
+    mem.device_pointer = (device_ptr)device_pointer;
+    mem.device_size = size;
+    stats.mem_alloc(size);
+
+    if (!mem.device_pointer) {
+      return NULL;
+    }
+
+    /* Insert into map of allocations. */
+    CUDAMem *cmem = &cuda_mem_map[&mem];
+    cmem->map_host_pointer = map_host_pointer;
+    cmem->free_map_host = free_map_host;
+    return cmem;
+  }
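+
+  /* Note on the fall-through above: allocation is attempted in three stages.
+   * (1) Device memory, if size plus headroom fits in free VRAM, possibly
+   * after evicting textures to the host. (2) Page-locked host memory mapped
+   * into the device address space, shared between devices through
+   * mem.shared_pointer. (3) Failure, reported through cuda_assert, with the
+   * status string logged alongside the buffer name. */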
+
+  void generic_copy_to(device_memory &mem)
+  {
+    if (mem.host_pointer && mem.device_pointer) {
+      CUDAContextScope scope(this);
+
+      if (mem.host_pointer != mem.shared_pointer) {
+        cuda_assert(cuMemcpyHtoD(
+            cuda_device_ptr(mem.device_pointer), mem.host_pointer, mem.memory_size()));
+      }
+    }
+  }
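+
+  /* When mem.host_pointer aliases mem.shared_pointer the buffer already
+   * lives in mapped host memory that the device reads directly, so the
+   * explicit host-to-device copy above is skipped. */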
+
+  void generic_free(device_memory &mem)
+  {
+    if (mem.device_pointer) {
+      CUDAContextScope scope(this);
+      const CUDAMem &cmem = cuda_mem_map[&mem];
+
+      if (cmem.map_host_pointer) {
+        /* Free host memory. */
+        if (cmem.free_map_host) {
+          cuMemFreeHost(cmem.map_host_pointer);
+          if (mem.host_pointer == mem.shared_pointer) {
+            mem.host_pointer = 0;
+          }
+          mem.shared_pointer = 0;
+        }
+
+        map_host_used -= mem.device_size;
+      }
+      else {
+        /* Free device memory. */
+        cuMemFree(mem.device_pointer);
+      }
+
+      stats.mem_free(mem.device_size);
+      mem.device_pointer = 0;
+      mem.device_size = 0;
+
+      cuda_mem_map.erase(cuda_mem_map.find(&mem));
+    }
+  }
+
+  void mem_alloc(device_memory &mem)
+  {
+    if (mem.type == MEM_PIXELS && !background) {
+      pixels_alloc(mem);
+    }
+    else if (mem.type == MEM_TEXTURE) {
+      assert(!"mem_alloc not supported for textures.");
+    }
+    else {
+      generic_alloc(mem);
+    }
+  }
+
+  void mem_copy_to(device_memory &mem)
+  {
+    if (mem.type == MEM_PIXELS) {
+      assert(!"mem_copy_to not supported for pixels.");
+    }
+    else if (mem.type == MEM_TEXTURE) {
+      tex_free(mem);
+      tex_alloc(mem);
+    }
+    else {
+      if (!mem.device_pointer) {
+        generic_alloc(mem);
+      }
+
+      generic_copy_to(mem);
+    }
+  }
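+
+  /* MEM_TEXTURE updates above are implemented as a free followed by a fresh
+   * tex_alloc rather than an in-place copy: the texture object, array and
+   * pitch all depend on the data layout, so updating in place would not be
+   * safe in general. */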
+
+  void mem_copy_from(device_memory &mem, int y, int w, int h, int elem)
+  {
+    if (mem.type == MEM_PIXELS && !background) {
+      pixels_copy_from(mem, y, w, h);
+    }
+    else if (mem.type == MEM_TEXTURE) {
+      assert(!"mem_copy_from not supported for textures.");
+    }
+    else {
+      CUDAContextScope scope(this);
+      size_t offset = elem * y * w;
+      size_t size = elem * w * h;
+
+      if (mem.host_pointer && mem.device_pointer) {
+        cuda_assert(cuMemcpyDtoH(
+            (uchar *)mem.host_pointer + offset, (CUdeviceptr)(mem.device_pointer + offset), size));
+      }
+      else if (mem.host_pointer) {
+        memset((char *)mem.host_pointer + offset, 0, size);
+      }
+    }
+  }
+
+  void mem_zero(device_memory &mem)
+  {
+    if (!mem.device_pointer) {
+      mem_alloc(mem);
+    }
+
+    if (mem.host_pointer) {
+      memset(mem.host_pointer, 0, mem.memory_size());
+    }
+
+    if (mem.device_pointer && (!mem.host_pointer || mem.host_pointer != mem.shared_pointer)) {
+      CUDAContextScope scope(this);
+      cuda_assert(cuMemsetD8(cuda_device_ptr(mem.device_pointer), 0, mem.memory_size()));
+    }
+  }
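+
+  /* In the mapped-host case (host_pointer == shared_pointer) the host-side
+   * memset above already cleared the memory the device sees, which is why
+   * the device-side cuMemsetD8 is skipped there. */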
+
+  void mem_free(device_memory &mem)
+  {
+    if (mem.type == MEM_PIXELS && !background) {
+      pixels_free(mem);
+    }
+    else if (mem.type == MEM_TEXTURE) {
+      tex_free(mem);
+    }
+    else {
+      generic_free(mem);
+    }
+  }
+
+  virtual device_ptr mem_alloc_sub_ptr(device_memory &mem, int offset, int /*size*/)
+  {
+    return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset));
+  }
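+
+  /* Illustrative example (assumed element type, not from the source): for a
+   * buffer of float4 elements, memory_elements_size(10) is 10 * 16 bytes,
+   * so the returned sub-pointer starts 160 bytes into the allocation. */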
+
+  void const_copy_to(const char *name, void *host, size_t size)
+  {
+    CUDAContextScope scope(this);
+    CUdeviceptr mem;
+    size_t bytes;
+
+    cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, name));
+    // assert(bytes == size);
+    cuda_assert(cuMemcpyHtoD(mem, host, size));
+  }
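+
+  /* Hedged usage sketch: callers pass the name of a constant global in the
+   * CUDA module together with matching host data, along the lines of
+   * (illustrative call, names and types assumed):
+   *
+   *   KernelData data = ...;
+   *   device->const_copy_to("__data", &data, sizeof(data));
+   */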
+
+  void tex_alloc(device_memory &mem)
+  {
+    CUDAContextScope scope(this);
+
+    /* General variables for both architectures */
+    string bind_name = mem.name;
+    size_t dsize = datatype_size(mem.data_type);
+    size_t size = mem.memory_size();
+
+    CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
+    switch (mem.extension) {
+      case EXTENSION_REPEAT:
+        address_mode = CU_TR_ADDRESS_MODE_WRAP;
+        break;
+      case EXTENSION_EXTEND:
+        address_mode = CU_TR_ADDRESS_MODE_CLAMP;
+        break;
+      case EXTENSION_CLIP:
+        address_mode = CU_TR_ADDRESS_MODE_BORDER;
+        break;
+      default:
+        assert(0);
+        break;
+    }
+
+    CUfilter_mode filter_mode;
+    if (mem.interpolation == INTERPOLATION_CLOSEST) {
+      filter_mode = CU_TR_FILTER_MODE_POINT;
+    }
+    else {
+      filter_mode = CU_TR_FILTER_MODE_LINEAR;
+    }
+
+    /* Data Storage */
+    if (mem.interpolation == INTERPOLATION_NONE) {
+      generic_alloc(mem);
+      generic_copy_to(mem);
+
+      CUdeviceptr cumem;
+      size_t cubytes;
+
+      cuda_assert(cuModuleGetGlobal(&cumem, &cubytes, cuModule, bind_name.c_str()));
+
+      if (cubytes == 8) {
+        /* 64 bit device pointer */
+        uint64_t ptr = mem.device_pointer;
+        cuda_assert(cuMemcpyHtoD(cumem, (void *)&ptr, cubytes));
+      }
+      else {
+        /* 32 bit device pointer */
+        uint32_t ptr = (uint32_t)mem.device_pointer;
+        cuda_assert(cuMemcpyHtoD(cumem, (void *)&ptr, cubytes));
+      }
+      return;
+    }
+
+    /* Image Texture Storage */
+    CUarray_format_enum format;
+    switch (mem.data_type) {
+      case TYPE_UCHAR:
+        format = CU_AD_FORMAT_UNSIGNED_INT8;
+        break;
+      case TYPE_UINT16:
+        format = CU_AD_FORMAT_UNSIGNED_INT16;
+        break;
+      case TYPE_UINT:
+        format = CU_AD_FORMAT_UNSIGNED_INT32;
+        break;
+      case TYPE_INT:
+        format = CU_AD_FORMAT_SIGNED_INT32;
+        break;
+      case TYPE_FLOAT:
+        format = CU_AD_FORMAT_FLOAT;
+        break;
+      case TYPE_HALF:
+        format = CU_AD_FORMAT_HALF;
+        break;
+      default:
+        assert(0);
+        return;
+    }
+
+    CUDAMem *cmem = NULL;
+    CUarray array_3d = NULL;
+    size_t src_pitch = mem.data_width * dsize * mem.data_elements;
+    size_t dst_pitch = src_pitch;
+
+    if (mem.data_depth > 1) {
+      /* 3D texture, using an array since there is no API for 3D linear memory. */
+      CUDA_ARRAY3D_DESCRIPTOR desc;
+
+      desc.Width = mem.data_width;
+      desc.Height = mem.data_height;
+      desc.Depth = mem.data_depth;
+      desc.Format = format;
+      desc.NumChannels = mem.data_elements;
+      desc.Flags = 0;
+
+      VLOG(1) << "Array 3D allocate: " << mem.name << ", "
+              << string_human_readable_number(mem.memory_size()) << " bytes. ("
+              << string_human_readable_size(mem.memory_size()) << ")";
+
+      cuda_assert(cuArray3DCreate(&array_3d, &desc));
+
+      if (!array_3d) {
+        return;
+      }
+
+      CUDA_MEMCPY3D param;
+      memset(&param, 0, sizeof(param));
+      param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+      param.dstArray = array_3d;
+      param.srcMemoryType = CU_MEMORYTYPE_HOST;
+      param.srcHost = mem.host_pointer;
+      param.srcPitch = src_pitch;
+      param.WidthInBytes = param.srcPitch;
+      param.Height = mem.data_height;
+      param.Depth = mem.data_depth;
+
+      cuda_assert(cuMemcpy3D(&param));
+
+      mem.device_pointer = (device_ptr)array_3d;
+      mem.device_size = size;
+      stats.mem_alloc(size);
+
+      cmem = &cuda_mem_map[&mem];
+      cmem->texobject = 0;
+      cmem->array = array_3d;
+    }
+    else if (mem.data_height > 0) {
+      /* 2D texture, using pitch-aligned linear memory. */
+      int alignment = 0;
+      cuda_assert(
+          cuDeviceGetAttribute(&alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice));
+      dst_pitch = align_up(src_pitch, alignment);
+      size_t dst_size = dst_pitch * mem.data_height;
+
+      cmem = generic_alloc(mem, dst_size - mem.memory_size());
+      if (!cmem) {
+        return;
+      }
+
+      CUDA_MEMCPY2D param;
+      memset(&param, 0, sizeof(param));
+      param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+      param.dstDevice = mem.device_pointer;
+      param.dstPitch = dst_pitch;
+      param.srcMemoryType = CU_MEMORYTYPE_HOST;
+      param.srcHost = mem.host_pointer;
+      param.srcPitch = src_pitch;
+      param.WidthInBytes = param.srcPitch;
+      param.Height = mem.data_height;
+
+      cuda_assert(cuMemcpy2DUnaligned(&param));
+    }
+    else {
+      /* 1D texture, using linear memory. */
+      cmem = generic_alloc(mem);
+      if (!cmem) {
+        return;
+      }
+
+      cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size));
+    }
+
+    /* Kepler+, bindless textures. */
+    int flat_slot = 0;
+    if (string_startswith(mem.name, "__tex_image")) {
+      int pos = string(mem.name).rfind("_");
+      flat_slot = atoi(mem.name + pos + 1);
+    }
+    else {
+      assert(0);
+    }
+
+    CUDA_RESOURCE_DESC resDesc;
+    memset(&resDesc, 0, sizeof(resDesc));
+
+    if (array_3d) {
+      resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
+      resDesc.res.array.hArray = array_3d;
+      resDesc.flags = 0;
+    }
+    else if (mem.data_height > 0) {
+      resDesc.resType = CU_RESOURCE_TYPE_PITCH2D;
+      resDesc.res.pitch2D.devPtr = mem.device_pointer;
+      resDesc.res.pitch2D.format = format;
+      resDesc.res.pitch2D.numChannels = mem.data_elements;
+      resDesc.res.pitch2D.height = mem.data_height;
+      resDesc.res.pitch2D.width = mem.data_width;
+      resDesc.res.pitch2D.pitchInBytes = dst_pitch;
+    }
+    else {
+      resDesc.resType = CU_RESOURCE_TYPE_LINEAR;
+      resDesc.res.linear.devPtr = mem.device_pointer;
+      resDesc.res.linear.format = format;
+      resDesc.res.linear.numChannels = mem.data_elements;
+      resDesc.res.linear.sizeInBytes = mem.device_size;
+    }
+
+    CUDA_TEXTURE_DESC texDesc;
+    memset(&texDesc, 0, sizeof(texDesc));
+    texDesc.addressMode[0] = address_mode;
+    texDesc.addressMode[1] = address_mode;
+    texDesc.addressMode[2] = address_mode;
+    texDesc.filterMode = filter_mode;
+    texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
+
+    cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));
+
+    /* Resize once */
+    if (flat_slot >= texture_info.size()) {
+      /* Allocate some slots in advance, to reduce the
+       * number of re-allocations. */
+      texture_info.resize(flat_slot + 128);
+    }
+
+    /* Set mapping and tag that we need to (re-)upload to the device. */
+    TextureInfo &info = texture_info[flat_slot];
+    info.data = (uint64_t)cmem->texobject;
+    info.cl_buffer = 0;
+    info.interpolation = mem.interpolation;
+    info.extension = mem.extension;
+    info.width = mem.data_width;
+    info.height = mem.data_height;
+    info.depth = mem.data_depth;
+    need_texture_info = true;
+  }
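+
+  /* Example of the slot parsing above (hypothetical texture name): for
+   * mem.name == "__tex_image_float4_099", rfind("_") locates the last
+   * underscore and atoi() yields flat_slot == 99, which indexes into
+   * texture_info. */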
+
+  void tex_free(device_memory &mem)
+  {
+    if (mem.device_pointer) {
+      CUDAContextScope scope(this);
+      const CUDAMem &cmem = cuda_mem_map[&mem];
+
+      if (cmem.texobject) {
+        /* Free bindless texture. */
+        cuTexObjectDestroy(cmem.texobject);
+      }
+
+      if (cmem.array) {
+        /* Free array. */
+        cuArrayDestroy(cmem.array);
+        stats.mem_free(mem.device_size);
+        mem.device_pointer = 0;
+        mem.device_size = 0;
+
+        cuda_mem_map.erase(cuda_mem_map.find(&mem));
+      }
+      else {
+        generic_free(mem);
+      }
+    }
+  }
+
+#define CUDA_GET_BLOCKSIZE(func, w, h) \
+  int threads_per_block; \
+  cuda_assert( \
+      cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
+  int threads = (int)sqrt((float)threads_per_block); \
+  int xblocks = ((w) + threads - 1) / threads; \
+  int yblocks = ((h) + threads - 1) / threads;
+
+#define CUDA_LAUNCH_KERNEL(func, args) \
+  cuda_assert(cuLaunchKernel(func, xblocks, yblocks, 1, threads, threads, 1, 0, 0, args, 0));
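+
+/* Hedged usage sketch: the two macros above are meant to be expanded as a
+ * pair inside a function body, e.g.
+ *
+ *   CUDA_GET_BLOCKSIZE(cuFunc, width, height);
+ *   void *args[] = {&param_a, &param_b};
+ *   CUDA_LAUNCH_KERNEL(cuFunc, args);
+ *
+ * CUDA_GET_BLOCKSIZE declares threads, xblocks and yblocks as locals that
+ * CUDA_LAUNCH_KERNEL then reads, which is why neither macro is wrapped in
+ * the usual do { } while (0). */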
 
 /* Similar as above, but for 1-dimensional blocks. */
-#define CUDA_GET_BLOCKSIZE_1D(func, w, h)                                                                       \
-                       int threads_per_block;                                                                              \
-                       cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
-                       int xblocks = ((w) + threads_per_block - 1)/threads_per_block;                                      \
-                       int yblocks = h;
-
-#define CUDA_LAUNCH_KERNEL_1D(func, args)                       \
-                       cuda_assert(cuLaunchKernel(func,                    \
-                                                  xblocks, yblocks, 1,     \
-                                                  threads_per_block, 1, 1, \
-                                                  0, 0, args, 0));
-
-       bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr,
-                                      DenoisingTask *task)
-       {
-               if(have_error())
-                       return false;
-
-               CUDAContextScope scope(this);
-
-               int stride = task->buffer.stride;
-               int w = task->buffer.width;
-               int h = task->buffer.h;
-               int r = task->nlm_state.r;
-               int f = task->nlm_state.f;
-               float a = task->nlm_state.a;
-               float k_2 = task->nlm_state.k_2;
-
-               int pass_stride = task->buffer.pass_stride;
-               int num_shifts = (2*r+1)*(2*r+1);
-               int channel_offset = task->nlm_state.is_color? task->buffer.pass_stride : 0;
-               int frame_offset = 0;
-
-               if(have_error())
-                       return false;
-
-               CUdeviceptr difference     = cuda_device_ptr(task->buffer.temporary_mem.device_pointer);
-               CUdeviceptr blurDifference = difference + sizeof(float)*pass_stride*num_shifts;
-               CUdeviceptr weightAccum = difference + 2*sizeof(float)*pass_stride*num_shifts;
-               CUdeviceptr scale_ptr = 0;
-
-               cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float)*pass_stride));
-               cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float)*pass_stride));
-
-               {
-                       CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput;
-                       cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
-                       cuda_assert(cuModuleGetFunction(&cuNLMBlur,           cuFilterModule, "kernel_cuda_filter_nlm_blur"));
-                       cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight,     cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
-                       cuda_assert(cuModuleGetFunction(&cuNLMUpdateOutput,   cuFilterModule, "kernel_cuda_filter_nlm_update_output"));
-
-                       cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
-                       cuda_assert(cuFuncSetCacheConfig(cuNLMBlur,           CU_FUNC_CACHE_PREFER_L1));
-                       cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight,     CU_FUNC_CACHE_PREFER_L1));
-                       cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput,   CU_FUNC_CACHE_PREFER_L1));
-
-                       CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w*h, num_shifts);
-
-                       void *calc_difference_args[] = {&guide_ptr, &variance_ptr, &scale_ptr, &difference, &w, &h, &stride, &pass_stride, &r, &channel_offset, &frame_offset, &a, &k_2};
-                       void *blur_args[]            = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
-                       void *calc_weight_args[]     = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
-                       void *update_output_args[]   = {&blurDifference, &image_ptr, &out_ptr, &weightAccum, &w, &h, &stride, &pass_stride, &channel_offset, &r, &f};
-
-                       CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
-                       CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
-                       CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
-                       CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
-                       CUDA_LAUNCH_KERNEL_1D(cuNLMUpdateOutput, update_output_args);
-               }
-
-               {
-                       CUfunction cuNLMNormalize;
-                       cuda_assert(cuModuleGetFunction(&cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize"));
-                       cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1));
-                       void *normalize_args[] = {&out_ptr, &weightAccum, &w, &h, &stride};
-                       CUDA_GET_BLOCKSIZE(cuNLMNormalize, w, h);
-                       CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args);
-                       cuda_assert(cuCtxSynchronize());
-               }
-
-               return !have_error();
-       }
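-
-       /* Note on the launch order above: non-local means runs as a fixed
-        * kernel pipeline (difference, blur, weight, blur again, output
-        * accumulation), and the final normalize pass divides the accumulated
-        * output by the accumulated per-pixel weights. */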
-
-       bool denoising_construct_transform(DenoisingTask *task)
-       {
-               if(have_error())
-                       return false;
-
-               CUDAContextScope scope(this);
-
-               CUfunction cuFilterConstructTransform;
-               cuda_assert(cuModuleGetFunction(&cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform"));
-               cuda_assert(cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED));
-               CUDA_GET_BLOCKSIZE(cuFilterConstructTransform,
-                                  task->storage.w,
-                                  task->storage.h);
-
-               void *args[] = {&task->buffer.mem.device_pointer,
-                               &task->tile_info_mem.device_pointer,
-                               &task->storage.transform.device_pointer,
-                               &task->storage.rank.device_pointer,
-                               &task->filter_area,
-                               &task->rect,
-                               &task->radius,
-                               &task->pca_threshold,
-                               &task->buffer.pass_stride,
-                               &task->buffer.frame_stride,
-                               &task->buffer.use_time};
-               CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args);
-               cuda_assert(cuCtxSynchronize());
-
-               return !have_error();
-       }
-
-       bool denoising_accumulate(device_ptr color_ptr,
-                                 device_ptr color_variance_ptr,
-                                 device_ptr scale_ptr,
-                                 int frame,
-                                 DenoisingTask *task)
-       {
-               if(have_error())
-                       return false;
-
-               CUDAContextScope scope(this);
-
-               int r = task->radius;
-               int f = 4;
-               float a = 1.0f;
-               float k_2 = task->nlm_k_2;
-
-               int w = task->reconstruction_state.source_w;
-               int h = task->reconstruction_state.source_h;
-               int stride = task->buffer.stride;
-               int frame_offset = frame * task->buffer.frame_stride;
-               int t = task->tile_info->frames[frame];
-
-               int pass_stride = task->buffer.pass_stride;
-               int num_shifts = (2*r+1)*(2*r+1);
-
-               if(have_error())
-                       return false;
-
-               CUdeviceptr difference     = cuda_device_ptr(task->buffer.temporary_mem.device_pointer);
-               CUdeviceptr blurDifference = difference + sizeof(float)*pass_stride*num_shifts;
-
-               CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian;
-               cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference,   cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
-               cuda_assert(cuModuleGetFunction(&cuNLMBlur,             cuFilterModule, "kernel_cuda_filter_nlm_blur"));
-               cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight,       cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
-               cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian"));
-
-               cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference,   CU_FUNC_CACHE_PREFER_L1));
-               cuda_assert(cuFuncSetCacheConfig(cuNLMBlur,             CU_FUNC_CACHE_PREFER_L1));
-               cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight,       CU_FUNC_CACHE_PREFER_L1));
-               cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));
-
-               CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference,
-                                    task->reconstruction_state.source_w * task->reconstruction_state.source_h,
-                                    num_shifts);
-
-               void *calc_difference_args[] = {&color_ptr,
-                                               &color_variance_ptr,
-                                               &scale_ptr,
-                                               &difference,
-                                               &w, &h,
-                                               &stride, &pass_stride,
-                                               &r, &pass_stride,
-                                               &frame_offset,
-                                               &a, &k_2};
-               void *blur_args[]            = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
-               void *calc_weight_args[]     = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
-               void *construct_gramian_args[] = {&t,
-                                                 &blurDifference,
-                                                 &task->buffer.mem.device_pointer,
-                                                 &task->storage.transform.device_pointer,
-                                                 &task->storage.rank.device_pointer,
-                                                 &task->storage.XtWX.device_pointer,
-                                                 &task->storage.XtWY.device_pointer,
-                                                 &task->reconstruction_state.filter_window,
-                                                 &w, &h, &stride,
-                                                 &pass_stride, &r,
-                                                 &f,
-                                                 &frame_offset,
-                                                 &task->buffer.use_time};
-
-               CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
-               CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
-               CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
-               CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
-               CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args);
-               cuda_assert(cuCtxSynchronize());
-
-               return !have_error();
-       }
-
-       bool denoising_solve(device_ptr output_ptr,
-                            DenoisingTask *task)
-       {
-               CUfunction cuFinalize;
-               cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize"));
-               cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1));
-               void *finalize_args[] = {&output_ptr,
-                                        &task->storage.rank.device_pointer,
-                                        &task->storage.XtWX.device_pointer,
-                                        &task->storage.XtWY.device_pointer,
-                                        &task->filter_area,
-                                        &task->reconstruction_state.buffer_params.x,
-                                        &task->render_buffer.samples};
-               CUDA_GET_BLOCKSIZE(cuFinalize,
-                                  task->reconstruction_state.source_w,
-                                  task->reconstruction_state.source_h);
-               CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
-               cuda_assert(cuCtxSynchronize());
-
-               return !have_error();
-       }
-
-       bool denoising_combine_halves(device_ptr a_ptr, device_ptr b_ptr,
-                                     device_ptr mean_ptr, device_ptr variance_ptr,
-                                     int r, int4 rect, DenoisingTask *task)
-       {
-               if(have_error())
-                       return false;
-
-               CUDAContextScope scope(this);
-
-               CUfunction cuFilterCombineHalves;
-               cuda_assert(cuModuleGetFunction(&cuFilterCombineHalves, cuFilterModule, "kernel_cuda_filter_combine_halves"));
-               cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1));
-               CUDA_GET_BLOCKSIZE(cuFilterCombineHalves,
-                                  task->rect.z-task->rect.x,
-                                  task->rect.w-task->rect.y);
-
-               void *args[] = {&mean_ptr,
-                               &variance_ptr,
-                               &a_ptr,
-                               &b_ptr,
-                               &rect,
-                               &r};
-               CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args);
-               cuda_assert(cuCtxSynchronize());
-
-               return !have_error();
-       }
-
-       bool denoising_divide_shadow(device_ptr a_ptr, device_ptr b_ptr,
-                                    device_ptr sample_variance_ptr, device_ptr sv_variance_ptr,
-                                    device_ptr buffer_variance_ptr, DenoisingTask *task)
-       {
-               if(have_error())
-                       return false;
-
-               CUDAContextScope scope(this);
-
-               CUfunction cuFilterDivideShadow;
-               cuda_assert(cuModuleGetFunction(&cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow"));
-               cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1));
-               CUDA_GET_BLOCKSIZE(cuFilterDivideShadow,
-                                  task->rect.z-task->rect.x,
-                                  task->rect.w-task->rect.y);
-
-               void *args[] = {&task->render_buffer.samples,
-                               &task->tile_info_mem.device_pointer,
-                               &a_ptr,
-                               &b_ptr,
-                               &sample_variance_ptr,
-                               &sv_variance_ptr,
-                               &buffer_variance_ptr,
-                               &task->rect,
-                               &task->render_buffer.pass_stride,
-                               &task->render_buffer.offset};
-               CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args);
-               cuda_assert(cuCtxSynchronize());
-
-               return !have_error();
-       }
-
-       bool denoising_get_feature(int mean_offset,
-                                  int variance_offset,
-                                  device_ptr mean_ptr,
-                                  device_ptr variance_ptr,
-                                  float scale,
-                                  DenoisingTask *task)
-       {
-               if(have_error())
-                       return false;
-
-               CUDAContextScope scope(this);
-
-               CUfunction cuFilterGetFeature;
-               cuda_assert(cuModuleGetFunction(&cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature"));
-               cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1));
-               CUDA_GET_BLOCKSIZE(cuFilterGetFeature,
-                                  task->rect.z-task->rect.x,
-                                  task->rect.w-task->rect.y);
-
-               void *args[] = {&task->render_buffer.samples,
-                               &task->tile_info_mem.device_pointer,
-                               &mean_offset,
-                               &variance_offset,
-                               &mean_ptr,
-                               &variance_ptr,
-                               &scale,
-                               &task->rect,
-                               &task->render_buffer.pass_stride,
-                               &task->render_buffer.offset};
-               CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args);
-               cuda_assert(cuCtxSynchronize());
-
-               return !have_error();
-       }
-
-       bool denoising_write_feature(int out_offset,
-                                    device_ptr from_ptr,
-                                    device_ptr buffer_ptr,
-                                    DenoisingTask *task)
-       {
-               if(have_error())
-                       return false;
-
-               CUDAContextScope scope(this);
-
-               CUfunction cuFilterWriteFeature;
-               cuda_assert(cuModuleGetFunction(&cuFilterWriteFeature, cuFilterModule, "kernel_cuda_filter_write_feature"));
-               cuda_assert(cuFuncSetCacheConfig(cuFilterWriteFeature, CU_FUNC_CACHE_PREFER_L1));
-               CUDA_GET_BLOCKSIZE(cuFilterWriteFeature,
-                                  task->filter_area.z,
-                                  task->filter_area.w);
-
-               void *args[] = {&task->render_buffer.samples,
-                               &task->reconstruction_state.buffer_params,
-                               &task->filter_area,
-                               &from_ptr,
-                               &buffer_ptr,
-                               &out_offset,
-                               &task->rect};
-               CUDA_LAUNCH_KERNEL(cuFilterWriteFeature, args);
-               cuda_assert(cuCtxSynchronize());
-
-               return !have_error();
-       }
-
-       bool denoising_detect_outliers(device_ptr image_ptr,
-                                      device_ptr variance_ptr,
-                                      device_ptr depth_ptr,
-                                      device_ptr output_ptr,
-                                      DenoisingTask *task)
-       {
-               if(have_error())
-                       return false;
-
-               CUDAContextScope scope(this);
-
-               CUfunction cuFilterDetectOutliers;
-               cuda_assert(cuModuleGetFunction(&cuFilterDetectOutliers, cuFilterModule, "kernel_cuda_filter_detect_outliers"));
-               cuda_assert(cuFuncSetCacheConfig(cuFilterDetectOutliers, CU_FUNC_CACHE_PREFER_L1));
-               CUDA_GET_BLOCKSIZE(cuFilterDetectOutliers,
-                                  task->rect.z-task->rect.x,
-                                  task->rect.w-task->rect.y);
-
-               void *args[] = {&image_ptr,
-                               &variance_ptr,
-                               &depth_ptr,
-                               &output_ptr,
-                               &task->rect,
-                               &task->buffer.pass_stride};
-
-               CUDA_LAUNCH_KERNEL(cuFilterDetectOutliers, args);
-               cuda_assert(cuCtxSynchronize());
-
-               return !have_error();
-       }
-
-       void denoise(RenderTile &rtile, DenoisingTask& denoising)
-       {
-               denoising.functions.construct_transform = function_bind(&CUDADevice::denoising_construct_transform, this, &denoising);
-               denoising.functions.accumulate = function_bind(&CUDADevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
-               denoising.functions.solve = function_bind(&CUDADevice::denoising_solve, this, _1, &denoising);
-               denoising.functions.divide_shadow = function_bind(&CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
-               denoising.functions.non_local_means = function_bind(&CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
-               denoising.functions.combine_halves = function_bind(&CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
-               denoising.functions.get_feature = function_bind(&CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
-               denoising.functions.write_feature = function_bind(&CUDADevice::denoising_write_feature, this, _1, _2, _3, &denoising);
-               denoising.functions.detect_outliers = function_bind(&CUDADevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
-
-               denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
-               denoising.render_buffer.samples = rtile.sample;
-               denoising.buffer.gpu_temporary_mem = true;
-
-               denoising.run_denoising(&rtile);
-       }
-
-       void path_trace(DeviceTask& task, RenderTile& rtile, device_vector<WorkTile>& work_tiles)
-       {
-               scoped_timer timer(&rtile.buffers->render_time);
-
-               if(have_error())
-                       return;
-
-               CUDAContextScope scope(this);
-               CUfunction cuPathTrace;
-
-               /* Get kernel function. */
-               if(task.integrator_branched) {
-                       cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace"));
-               }
-               else {
-                       cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace"));
-               }
-
-               if(have_error()) {
-                       return;
-               }
-
-               cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));
-
-               /* Allocate work tile. */
-               work_tiles.alloc(1);
-
-               WorkTile *wtile = work_tiles.data();
-               wtile->x = rtile.x;
-               wtile->y = rtile.y;
-               wtile->w = rtile.w;
-               wtile->h = rtile.h;
-               wtile->offset = rtile.offset;
-               wtile->stride = rtile.stride;
-               wtile->buffer = (float*)cuda_device_ptr(rtile.buffer);
-
-               /* Prepare work size. More step samples render faster, but for now we
-                * remain conservative for GPUs connected to a display to avoid driver
-                * timeouts and display freezing. */
-               int min_blocks, num_threads_per_block;
-               cuda_assert(cuOccupancyMaxPotentialBlockSize(&min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0));
-               if(!info.display_device) {
-                       min_blocks *= 8;
-               }
-
-               uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h);
-
-               /* Render all samples. */
-               int start_sample = rtile.start_sample;
-               int end_sample = rtile.start_sample + rtile.num_samples;
-
-               for(int sample = start_sample; sample < end_sample; sample += step_samples) {
-                       /* Setup and copy work tile to device. */
-                       wtile->start_sample = sample;
-                       wtile->num_samples = min(step_samples, end_sample - sample);
-                       work_tiles.copy_to_device();
-
-                       CUdeviceptr d_work_tiles = cuda_device_ptr(work_tiles.device_pointer);
-                       uint total_work_size = wtile->w * wtile->h * wtile->num_samples;
-                       uint num_blocks = divide_up(total_work_size, num_threads_per_block);
-
-                       /* Launch kernel. */
-                       void *args[] = {&d_work_tiles,
-                                       &total_work_size};
-
-                       cuda_assert(cuLaunchKernel(cuPathTrace,
-                                                  num_blocks, 1, 1,
-                                                  num_threads_per_block, 1, 1,
-                                                  0, 0, args, 0));
-
-                       cuda_assert(cuCtxSynchronize());
-
-                       /* Update progress. */
-                       rtile.sample = sample + wtile->num_samples;
-                       task.update_progress(&rtile, rtile.w*rtile.h*wtile->num_samples);
-
-                       if(task.get_cancel()) {
-                               if(task.need_finish_queue == false)
-                                       break;
-                       }
-               }
-       }
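-
-       /* Illustrative numbers for the work sizing above (assumed, not from
-        * the source): with 1024 max threads per block and min_blocks = 8
-        * from the occupancy query, a non-display device scales min_blocks to
-        * 64, so a 64x64 tile gets step_samples =
-        * divide_up(64 * 1024, 64 * 64) = 16 samples per launch. */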
-
-       void film_convert(DeviceTask& task, device_ptr buffer, device_ptr rgba_byte, device_ptr rgba_half)
-       {
-               if(have_error())
-                       return;
-
-               CUDAContextScope scope(this);
-
-               CUfunction cuFilmConvert;
-               CUdeviceptr d_rgba = map_pixels((rgba_byte)? rgba_byte: rgba_half);
-               CUdeviceptr d_buffer = cuda_device_ptr(buffer);
-
-               /* get kernel function */
-               if(rgba_half) {
-                       cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_half_float"));
-               }
-               else {
-                       cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_byte"));
-               }
-
-
-               float sample_scale = 1.0f/(task.sample + 1);
-
-               /* pass in parameters */
-               void *args[] = {&d_rgba,
-                               &d_buffer,
-                               &sample_scale,
-                               &task.x,
-                               &task.y,
-                               &task.w,
-                               &task.h,
-                               &task.offset,
-                               &task.stride};
-
-               /* launch kernel */
-               int threads_per_block;
-               cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuFilmConvert));
-
-               int xthreads = (int)sqrt(threads_per_block);
-               int ythreads = (int)sqrt(threads_per_block);
-               int xblocks = (task.w + xthreads - 1)/xthreads;
-               int yblocks = (task.h + ythreads - 1)/ythreads;
-
-               cuda_assert(cuFuncSetCacheConfig(cuFilmConvert, CU_FUNC_CACHE_PREFER_L1));
-
-               cuda_assert(cuLaunchKernel(cuFilmConvert,
-                                          xblocks , yblocks, 1, /* blocks */
-                                          xthreads, ythreads, 1, /* threads */
-                                          0, 0, args, 0));
-
-               unmap_pixels((rgba_byte)? rgba_byte: rgba_half);
-
-               cuda_assert(cuCtxSynchronize());
-       }
-
-       void shader(DeviceTask& task)
-       {
-               if(have_error())
-                       return;
-
-               CUDAContextScope scope(this);
-
-               CUfunction cuShader;
-               CUdeviceptr d_input = cuda_device_ptr(task.shader_input);
-               CUdeviceptr d_output = cuda_device_ptr(task.shader_output);
-
-               /* get kernel function */
-               if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
-                       cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_bake"));
-               }
-               else if(task.shader_eval_type == SHADER_EVAL_DISPLACE) {
-                       cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_displace"));
-               }
-               else {
-                       cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_background"));
-               }
-
-               /* do tasks in smaller chunks, so we can cancel them */
-               const int shader_chunk_size = 65536;
-               const int start = task.shader_x;
-               const int end = task.shader_x + task.shader_w;
-               int offset = task.offset;
-
-               bool canceled = false;
-               for(int sample = 0; sample < task.num_samples && !canceled; sample++) {
-                       for(int shader_x = start; shader_x < end; shader_x += shader_chunk_size) {
-                               int shader_w = min(shader_chunk_size, end - shader_x);
-
-                               /* pass in parameters */
-                               void *args[8];
-                               int arg = 0;
-                               args[arg++] = &d_input;
-                               args[arg++] = &d_output;
-                               args[arg++] = &task.shader_eval_type;
-                               if(task.shader_eval_type >= SHADER_EVAL_BAKE) {
-                                       args[arg++] = &task.shader_filter;
-                               }
-                               args[arg++] = &shader_x;
-                               args[arg++] = &shader_w;
-                               args[arg++] = &offset;
-                               args[arg++] = &sample;
-
-                               /* launch kernel */
-                               int threads_per_block;
-                               cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuShader));
-
-                               int xblocks = (shader_w + threads_per_block - 1)/threads_per_block;
-
-                               cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1));
-                               cuda_assert(cuLaunchKernel(cuShader,
-                                                          xblocks , 1, 1, /* blocks */
-                                                          threads_per_block, 1, 1, /* threads */
-                                                          0, 0, args, 0));
-
-                               cuda_assert(cuCtxSynchronize());
-
-                               if(task.get_cancel()) {
-                                       canceled = true;
-                                       break;
-                               }
-                       }
-
-                       task.update_progress(NULL);
-               }
-       }
-
-       CUdeviceptr map_pixels(device_ptr mem)
-       {
-               if(!background) {
-                       PixelMem pmem = pixel_mem_map[mem];
-                       CUdeviceptr buffer;
-
-                       size_t bytes;
-                       cuda_assert(cuGraphicsMapResources(1, &pmem.cuPBOresource, 0));
-                       cuda_assert(cuGraphicsResourceGetMappedPointer(&buffer, &bytes, pmem.cuPBOresource));
-
-                       return buffer;
-               }
-
-               return cuda_device_ptr(mem);
-       }
-
-       void unmap_pixels(device_ptr mem)
-       {
-               if(!background) {
-                       PixelMem pmem = pixel_mem_map[mem];
-
-                       cuda_assert(cuGraphicsUnmapResources(1, &pmem.cuPBOresource, 0));
-               }
-       }
-
-       void pixels_alloc(device_memory& mem)
-       {
-               PixelMem pmem;
-
-               pmem.w = mem.data_width;
-               pmem.h = mem.data_height;
-
-               CUDAContextScope scope(this);
-
-               glGenBuffers(1, &pmem.cuPBO);
-               glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
-               if(mem.data_type == TYPE_HALF)
-                       glBufferData(GL_PIXEL_UNPACK_BUFFER, pmem.w*pmem.h*sizeof(GLhalf)*4, NULL, GL_DYNAMIC_DRAW);
-               else
-                       glBufferData(GL_PIXEL_UNPACK_BUFFER, pmem.w*pmem.h*sizeof(uint8_t)*4, NULL, GL_DYNAMIC_DRAW);
-
-               glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
-
-               glActiveTexture(GL_TEXTURE0);
-               glGenTextures(1, &pmem.cuTexId);
-               glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
-               if(mem.data_type == TYPE_HALF)
-                       glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, pmem.w, pmem.h, 0, GL_RGBA, GL_HALF_FLOAT, NULL);
-               else
-                       glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
-               glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
-               glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
-               glBindTexture(GL_TEXTURE_2D, 0);
-
-               CUresult result = cuGraphicsGLRegisterBuffer(&pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
-
-               if(result == CUDA_SUCCESS) {
-                       mem.device_pointer = pmem.cuTexId;
-                       pixel_mem_map[mem.device_pointer] = pmem;
-
-                       mem.device_size = mem.memory_size();
-                       stats.mem_alloc(mem.device_size);
-
-                       return;
-               }
-               else {
-               /* failed to register buffer, fall back to no interop */
-                       glDeleteBuffers(1, &pmem.cuPBO);
-                       glDeleteTextures(1, &pmem.cuTexId);
-
-                       background = true;
-               }
-       }
-
-       void pixels_copy_from(device_memory& mem, int y, int w, int h)
-       {
-               PixelMem pmem = pixel_mem_map[mem.device_pointer];
-
-               CUDAContextScope scope(this);
-
-               glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
-               uchar *pixels = (uchar*)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY);
-               size_t offset = sizeof(uchar)*4*y*w;
-               memcpy((uchar*)mem.host_pointer + offset, pixels + offset, sizeof(uchar)*4*w*h);
-               glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
-               glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
-       }
-
-       void pixels_free(device_memory& mem)
-       {
-               if(mem.device_pointer) {
-                       PixelMem pmem = pixel_mem_map[mem.device_pointer];
-
-                       CUDAContextScope scope(this);
-
-                       cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource));
-                       glDeleteBuffers(1, &pmem.cuPBO);
-                       glDeleteTextures(1, &pmem.cuTexId);
-
-                       pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer));
-                       mem.device_pointer = 0;
-
-                       stats.mem_free(mem.device_size);
-                       mem.device_size = 0;
-               }
-       }
-
-       void draw_pixels(
-           device_memory& mem, int y,
-           int w, int h, int width, int height,
-           int dx, int dy, int dw, int dh, bool transparent,
-               const DeviceDrawParams &draw_params)
-       {
-               assert(mem.type == MEM_PIXELS);
-
-               if(!background) {
-                       const bool use_fallback_shader = (draw_params.bind_display_space_shader_cb == NULL);
-                       PixelMem pmem = pixel_mem_map[mem.device_pointer];
-                       float *vpointer;
-
-                       CUDAContextScope scope(this);
-
-               /* For multi-device rendering this takes the inefficient approach of allocating
-                * all pixels on every device even though each device renders only a subset. */
-                       size_t offset = 4*y*w;
-
-                       if(mem.data_type == TYPE_HALF)
-                               offset *= sizeof(GLhalf);
-                       else
-                               offset *= sizeof(uint8_t);
-
-                       glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
-                       glActiveTexture(GL_TEXTURE0);
-                       glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
-                       if(mem.data_type == TYPE_HALF) {
-                               glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_HALF_FLOAT, (void*)offset);
-                       }
-                       else {
-                               glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE, (void*)offset);
-                       }
-                       glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
-
-                       if(transparent) {
-                               glEnable(GL_BLEND);
-                               glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA);
-                       }
-
-                       GLint shader_program;
-                       if(use_fallback_shader) {
-                               if(!bind_fallback_display_space_shader(dw, dh)) {
-                                       return;
-                               }
-                               shader_program = fallback_shader_program;
-                       }
-                       else {
-                               draw_params.bind_display_space_shader_cb();
-                               glGetIntegerv(GL_CURRENT_PROGRAM, &shader_program);
-                       }
-
-                       if(!vertex_buffer) {
-                               glGenBuffers(1, &vertex_buffer);
-                       }
-
-                       glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer);
-                       /* invalidate old contents - avoids stalling if the buffer is still queued for rendering */
-                       glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW);
-
-                       vpointer = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY);
-
-                       if(vpointer) {
-                               /* texture coordinate - vertex pair */
-                               vpointer[0] = 0.0f;
-                               vpointer[1] = 0.0f;
-                               vpointer[2] = dx;
-                               vpointer[3] = dy;
-
-                               vpointer[4] = (float)w/(float)pmem.w;
-                               vpointer[5] = 0.0f;
-                               vpointer[6] = (float)width + dx;
-                               vpointer[7] = dy;
-
-                               vpointer[8] = (float)w/(float)pmem.w;
-                               vpointer[9] = (float)h/(float)pmem.h;
-                               vpointer[10] = (float)width + dx;
-                               vpointer[11] = (float)height + dy;
-
-                               vpointer[12] = 0.0f;
-                               vpointer[13] = (float)h/(float)pmem.h;
-                               vpointer[14] = dx;
-                               vpointer[15] = (float)height + dy;
-
-                               glUnmapBuffer(GL_ARRAY_BUFFER);
-                       }
-
-                       GLuint vertex_array_object;
-                       GLuint position_attribute, texcoord_attribute;
-
-                       glGenVertexArrays(1, &vertex_array_object);
-                       glBindVertexArray(vertex_array_object);
-
-                       texcoord_attribute = glGetAttribLocation(shader_program, "texCoord");
-                       position_attribute = glGetAttribLocation(shader_program, "pos");
-
-                       glEnableVertexAttribArray(texcoord_attribute);
-                       glEnableVertexAttribArray(position_attribute);
-
-                       glVertexAttribPointer(texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0);
-                       glVertexAttribPointer(position_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)(sizeof(float) * 2));
-
-                       glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
-
-                       if(use_fallback_shader) {
-                               glUseProgram(0);
-                       }
-                       else {
-                               draw_params.unbind_display_space_shader_cb();
-                       }
-
-                       if(transparent) {
-                               glDisable(GL_BLEND);
-                       }
-
-                       glBindTexture(GL_TEXTURE_2D, 0);
-
-                       return;
-               }
-
-               Device::draw_pixels(mem, y, w, h, width, height, dx, dy, dw, dh, transparent, draw_params);
-       }
-
-       void thread_run(DeviceTask *task)
-       {
-               CUDAContextScope scope(this);
-
-               if(task->type == DeviceTask::RENDER) {
-                       DeviceRequestedFeatures requested_features;
-                       if(use_split_kernel()) {
-                               if(split_kernel == NULL) {
-                                       split_kernel = new CUDASplitKernel(this);
-                                       split_kernel->load_kernels(requested_features);
-                               }
-                       }
-
-                       device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY);
-
-                       /* keep rendering tiles until done */
-                       RenderTile tile;
-                       DenoisingTask denoising(this, *task);
-
-                       while(task->acquire_tile(this, tile)) {
-                               if(tile.task == RenderTile::PATH_TRACE) {
-                                       if(use_split_kernel()) {
-                                               device_only_memory<uchar> void_buffer(this, "void_buffer");
-                                               split_kernel->path_trace(task, tile, void_buffer, void_buffer);
-                                       }
-                                       else {
-                                               path_trace(*task, tile, work_tiles);
-                                       }
-                               }
-                               else if(tile.task == RenderTile::DENOISE) {
-                                       tile.sample = tile.start_sample + tile.num_samples;
-
-                                       denoise(tile, denoising);
-
-                                       task->update_progress(&tile, tile.w*tile.h);
-                               }
-
-                               task->release_tile(tile);
-
-                               if(task->get_cancel()) {
-                                       if(task->need_finish_queue == false)
-                                               break;
-                               }
-                       }
-
-                       work_tiles.free();
-               }
-               else if(task->type == DeviceTask::SHADER) {
-                       shader(*task);
-
-                       cuda_assert(cuCtxSynchronize());
-               }
-       }
-
-       class CUDADeviceTask : public DeviceTask {
-       public:
-               CUDADeviceTask(CUDADevice *device, DeviceTask& task)
-               : DeviceTask(task)
-               {
-                       run = function_bind(&CUDADevice::thread_run, device, this);
-               }
-       };
-
-       int get_split_task_count(DeviceTask& /*task*/)
-       {
-               return 1;
-       }
-
-       void task_add(DeviceTask& task)
-       {
-               CUDAContextScope scope(this);
-
-               /* Load texture info. */
-               load_texture_info();
-
-               /* Synchronize all memory copies before executing task. */
-               cuda_assert(cuCtxSynchronize());
-
-               if(task.type == DeviceTask::FILM_CONVERT) {
-                       /* must be done in the main thread due to OpenGL access */
-                       film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
-               }
-               else {
-                       task_pool.push(new CUDADeviceTask(this, task));
-               }
-       }
-
-       void task_wait()
-       {
-               task_pool.wait();
-       }
-
-       void task_cancel()
-       {
-               task_pool.cancel();
-       }
-
-       friend class CUDASplitKernelFunction;
-       friend class CUDASplitKernel;
-       friend class CUDAContextScope;
+#define CUDA_GET_BLOCKSIZE_1D(func, w, h) \
+  int threads_per_block; \
+  cuda_assert( \
+      cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
+  int xblocks = ((w) + threads_per_block - 1) / threads_per_block; \
+  int yblocks = h;
+
+#define CUDA_LAUNCH_KERNEL_1D(func, args) \
+  cuda_assert(cuLaunchKernel(func, xblocks, yblocks, 1, threads_per_block, 1, 1, 0, 0, args, 0));
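The two _1D macros above pick the block size from the kernel's occupancy limit and launch a 2D grid with one block row per shift. As a sketch, the pair CUDA_GET_BLOCKSIZE_1D(func, w * h, num_shifts) / CUDA_LAUNCH_KERNEL_1D(func, args) expands roughly to the following, where func and args stand for an actual CUfunction and kernel argument array (the plain 2D CUDA_GET_BLOCKSIZE / CUDA_LAUNCH_KERNEL macros used by the denoising functions below are presumably defined earlier in this file):

  int threads_per_block;
  cuda_assert(
      cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func));
  int xblocks = ((w * h) + threads_per_block - 1) / threads_per_block; /* one thread per pixel */
  int yblocks = num_shifts;                                           /* one block row per shift */
  cuda_assert(cuLaunchKernel(func, xblocks, yblocks, 1, threads_per_block, 1, 1, 0, 0, args, 0));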
+
+  bool denoising_non_local_means(device_ptr image_ptr,
+                                 device_ptr guide_ptr,
+                                 device_ptr variance_ptr,
+                                 device_ptr out_ptr,
+                                 DenoisingTask *task)
+  {
+    if (have_error())
+      return false;
+
+    CUDAContextScope scope(this);
+
+    int stride = task->buffer.stride;
+    int w = task->buffer.width;
+    int h = task->buffer.h;
+    int r = task->nlm_state.r;
+    int f = task->nlm_state.f;
+    float a = task->nlm_state.a;
+    float k_2 = task->nlm_state.k_2;
+
+    int pass_stride = task->buffer.pass_stride;
+    int num_shifts = (2 * r + 1) * (2 * r + 1);
+    int channel_offset = task->nlm_state.is_color ? task->buffer.pass_stride : 0;
+    int frame_offset = 0;
+
+    if (have_error())
+      return false;
+
+    CUdeviceptr difference = cuda_device_ptr(task->buffer.temporary_mem.device_pointer);
+    CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts;
+    CUdeviceptr weightAccum = difference + 2 * sizeof(float) * pass_stride * num_shifts;
+    CUdeviceptr scale_ptr = 0;
+
+    cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float) * pass_stride));
+    cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float) * pass_stride));
+
+    {
+      CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput;
+      cuda_assert(cuModuleGetFunction(
+          &cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
+      cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
+      cuda_assert(cuModuleGetFunction(
+          &cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
+      cuda_assert(cuModuleGetFunction(
+          &cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output"));
+
+      cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
+      cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
+      cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
+      cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1));
+
+      CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w * h, num_shifts);
+
+      void *calc_difference_args[] = {&guide_ptr,
+                                      &variance_ptr,
+                                      &scale_ptr,
+                                      &difference,
+                                      &w,
+                                      &h,
+                                      &stride,
+                                      &pass_stride,
+                                      &r,
+                                      &channel_offset,
+                                      &frame_offset,
+                                      &a,
+                                      &k_2};
+      void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
+      void *calc_weight_args[] = {
+          &blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
+      void *update_output_args[] = {&blurDifference,
+                                    &image_ptr,
+                                    &out_ptr,
+                                    &weightAccum,
+                                    &w,
+                                    &h,
+                                    &stride,
+                                    &pass_stride,
+                                    &channel_offset,
+                                    &r,
+                                    &f};
+
+      CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
+      CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
+      CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
+      CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
+      CUDA_LAUNCH_KERNEL_1D(cuNLMUpdateOutput, update_output_args);
+    }
+
+    {
+      CUfunction cuNLMNormalize;
+      cuda_assert(cuModuleGetFunction(
+          &cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize"));
+      cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1));
+      void *normalize_args[] = {&out_ptr, &weightAccum, &w, &h, &stride};
+      CUDA_GET_BLOCKSIZE(cuNLMNormalize, w, h);
+      CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args);
+      cuda_assert(cuCtxSynchronize());
+    }
+
+    return !have_error();
+  }
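The three working buffers above are carved out of a single temporary allocation, and the kernels run in the order difference, blur, weight, blur, update-output, with the final normalize pass dividing the accumulated colors by the accumulated weights. A sketch of the layout of task->buffer.temporary_mem implied by the offsets above:

  /* Offsets in bytes, with num_shifts = (2 * r + 1) * (2 * r + 1):
   *
   *   difference      at 0                                             pass_stride * num_shifts floats
   *   blurDifference  at sizeof(float) * pass_stride * num_shifts      pass_stride * num_shifts floats
   *   weightAccum     at 2 * sizeof(float) * pass_stride * num_shifts  per-pixel NLM weight sums
   */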
+
+  bool denoising_construct_transform(DenoisingTask *task)
+  {
+    if (have_error())
+      return false;
+
+    CUDAContextScope scope(this);
+
+    CUfunction cuFilterConstructTransform;
+    cuda_assert(cuModuleGetFunction(
+        &cuFilterConstructTransform, cuFilterModule, "kernel_cuda_filter_construct_transform"));
+    cuda_assert(cuFuncSetCacheConfig(cuFilterConstructTransform, CU_FUNC_CACHE_PREFER_SHARED));
+    CUDA_GET_BLOCKSIZE(cuFilterConstructTransform, task->storage.w, task->storage.h);
+
+    void *args[] = {&task->buffer.mem.device_pointer,
+                    &task->tile_info_mem.device_pointer,
+                    &task->storage.transform.device_pointer,
+                    &task->storage.rank.device_pointer,
+                    &task->filter_area,
+                    &task->rect,
+                    &task->radius,
+                    &task->pca_threshold,
+                    &task->buffer.pass_stride,
+                    &task->buffer.frame_stride,
+                    &task->buffer.use_time};
+    CUDA_LAUNCH_KERNEL(cuFilterConstructTransform, args);
+    cuda_assert(cuCtxSynchronize());
+
+    return !have_error();
+  }
+
+  bool denoising_accumulate(device_ptr color_ptr,
+                            device_ptr color_variance_ptr,
+                            device_ptr scale_ptr,
+                            int frame,
+                            DenoisingTask *task)
+  {
+    if (have_error())
+      return false;
+
+    CUDAContextScope scope(this);
+
+    int r = task->radius;
+    int f = 4;
+    float a = 1.0f;
+    float k_2 = task->nlm_k_2;
+
+    int w = task->reconstruction_state.source_w;
+    int h = task->reconstruction_state.source_h;
+    int stride = task->buffer.stride;
+    int frame_offset = frame * task->buffer.frame_stride;
+    int t = task->tile_info->frames[frame];
+
+    int pass_stride = task->buffer.pass_stride;
+    int num_shifts = (2 * r + 1) * (2 * r + 1);
+
+    if (have_error())
+      return false;
+
+    CUdeviceptr difference = cuda_device_ptr(task->buffer.temporary_mem.device_pointer);
+    CUdeviceptr blurDifference = difference + sizeof(float) * pass_stride * num_shifts;
+
+    CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian;
+    cuda_assert(cuModuleGetFunction(
+        &cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
+    cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
+    cuda_assert(cuModuleGetFunction(
+        &cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
+    cuda_assert(cuModuleGetFunction(
+        &cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian"));
+
+    cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
+    cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
+    cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
+    cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));
+
+    CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference,
+                          task->reconstruction_state.source_w *
+                              task->reconstruction_state.source_h,
+                          num_shifts);
+
+    void *calc_difference_args[] = {&color_ptr,
+                                    &color_variance_ptr,
+                                    &scale_ptr,
+                                    &difference,
+                                    &w,
+                                    &h,
+                                    &stride,
+                                    &pass_stride,
+                                    &r,
+                                    &pass_stride,
+                                    &frame_offset,
+                                    &a,
+                                    &k_2};
+    void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
+    void *calc_weight_args[] = {
+        &blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
+    void *construct_gramian_args[] = {&t,
+                                      &blurDifference,
+                                      &task->buffer.mem.device_pointer,
+                                      &task->storage.transform.device_pointer,
+                                      &task->storage.rank.device_pointer,
+                                      &task->storage.XtWX.device_pointer,
+                                      &task->storage.XtWY.device_pointer,
+                                      &task->reconstruction_state.filter_window,
+                                      &w,
+                                      &h,
+                                      &stride,
+                                      &pass_stride,
+                                      &r,
+                                      &f,
+                                      &frame_offset,
+                                      &task->buffer.use_time};
+
+    CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
+    CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
+    CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
+    CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
+    CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args);
+    cuda_assert(cuCtxSynchronize());
+
+    return !have_error();
+  }
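The argument lists here mirror denoising_non_local_means() above, with one easy-to-miss difference worth flagging:

  /* In calc_difference_args the slot that carried &channel_offset above is
   * filled with &pass_stride a second time, i.e. for the color buffer the
   * per-channel offset appears to equal one pass stride. */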
+
+  bool denoising_solve(device_ptr output_ptr, DenoisingTask *task)
+  {
+    CUfunction cuFinalize;
+    cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize"));
+    cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1));
+    void *finalize_args[] = {&output_ptr,
+                             &task->storage.rank.device_pointer,
+                             &task->storage.XtWX.device_pointer,
+                             &task->storage.XtWY.device_pointer,
+                             &task->filter_area,
+                             &task->reconstruction_state.buffer_params.x,
+                             &task->render_buffer.samples};
+    CUDA_GET_BLOCKSIZE(
+        cuFinalize, task->reconstruction_state.source_w, task->reconstruction_state.source_h);
+    CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
+    cuda_assert(cuCtxSynchronize());
+
+    return !have_error();
+  }
+
+  bool denoising_combine_halves(device_ptr a_ptr,
+                                device_ptr b_ptr,
+                                device_ptr mean_ptr,
+                                device_ptr variance_ptr,
+                                int r,
+                                int4 rect,
+                                DenoisingTask *task)
+  {
+    if (have_error())
+      return false;
+
+    CUDAContextScope scope(this);
+
+    CUfunction cuFilterCombineHalves;
+    cuda_assert(cuModuleGetFunction(
+        &cuFilterCombineHalves, cuFilterModule, "kernel_cuda_filter_combine_halves"));
+    cuda_assert(cuFuncSetCacheConfig(cuFilterCombineHalves, CU_FUNC_CACHE_PREFER_L1));
+    CUDA_GET_BLOCKSIZE(
+        cuFilterCombineHalves, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
+
+    void *args[] = {&mean_ptr, &variance_ptr, &a_ptr, &b_ptr, &rect, &r};
+    CUDA_LAUNCH_KERNEL(cuFilterCombineHalves, args);
+    cuda_assert(cuCtxSynchronize());
+
+    return !have_error();
+  }
+
+  bool denoising_divide_shadow(device_ptr a_ptr,
+                               device_ptr b_ptr,
+                               device_ptr sample_variance_ptr,
+                               device_ptr sv_variance_ptr,
+                               device_ptr buffer_variance_ptr,
+                               DenoisingTask *task)
+  {
+    if (have_error())
+      return false;
+
+    CUDAContextScope scope(this);
+
+    CUfunction cuFilterDivideShadow;
+    cuda_assert(cuModuleGetFunction(
+        &cuFilterDivideShadow, cuFilterModule, "kernel_cuda_filter_divide_shadow"));
+    cuda_assert(cuFuncSetCacheConfig(cuFilterDivideShadow, CU_FUNC_CACHE_PREFER_L1));
+    CUDA_GET_BLOCKSIZE(
+        cuFilterDivideShadow, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
+
+    void *args[] = {&task->render_buffer.samples,
+                    &task->tile_info_mem.device_pointer,
+                    &a_ptr,
+                    &b_ptr,
+                    &sample_variance_ptr,
+                    &sv_variance_ptr,
+                    &buffer_variance_ptr,
+                    &task->rect,
+                    &task->render_buffer.pass_stride,
+                    &task->render_buffer.offset};
+    CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args);
+    cuda_assert(cuCtxSynchronize());
+
+    return !have_error();
+  }
+
+  bool denoising_get_feature(int mean_offset,
+                             int variance_offset,
+                             device_ptr mean_ptr,
+                             device_ptr variance_ptr,
+                             float scale,
+                             DenoisingTask *task)
+  {
+    if (have_error())
+      return false;
+
+    CUDAContextScope scope(this);
+
+    CUfunction cuFilterGetFeature;
+    cuda_assert(cuModuleGetFunction(
+        &cuFilterGetFeature, cuFilterModule, "kernel_cuda_filter_get_feature"));
+    cuda_assert(cuFuncSetCacheConfig(cuFilterGetFeature, CU_FUNC_CACHE_PREFER_L1));
+    CUDA_GET_BLOCKSIZE(
+        cuFilterGetFeature, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
+
+    void *args[] = {&task->render_buffer.samples,
+                    &task->tile_info_mem.device_pointer,
+                    &mean_offset,
+                    &variance_offset,
+                    &mean_ptr,
+                    &variance_ptr,
+                    &scale,
+                    &task->rect,
+                    &task->render_buffer.pass_stride,
+                    &task->render_buffer.offset};
+    CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args);
+    cuda_assert(cuCtxSynchronize());
+
+    return !have_error();
+  }
+
+  bool denoising_write_feature(int out_offset,
+                               device_ptr from_ptr,
+                               device_ptr buffer_ptr,
+                               DenoisingTask *task)
+  {
+    if (have_error())
+      return false;
+
+    CUDAContextScope scope(this);
+
+    CUfunction cuFilterWriteFeature;
+    cuda_assert(cuModuleGetFunction(
+        &cuFilterWriteFeature, cuFilterModule, "kernel_cuda_filter_write_feature"));
+    cuda_assert(cuFuncSetCacheConfig(cuFilterWriteFeature, CU_FUNC_CACHE_PREFER_L1));
+    CUDA_GET_BLOCKSIZE(cuFilterWriteFeature, task->filter_area.z, task->filter_area.w);
+
+    void *args[] = {&task->render_buffer.samples,
+                    &task->reconstruction_state.buffer_params,
+                    &task->filter_area,
+                    &from_ptr,
+                    &buffer_ptr,
+                    &out_offset,
+                    &task->rect};
+    CUDA_LAUNCH_KERNEL(cuFilterWriteFeature, args);
+    cuda_assert(cuCtxSynchronize());
+
+    return !have_error();
+  }
+
+  bool denoising_detect_outliers(device_ptr image_ptr,
+                                 device_ptr variance_ptr,
+                                 device_ptr depth_ptr,
+                                 device_ptr output_ptr,
+                                 DenoisingTask *task)
+  {
+    if (have_error())
+      return false;
+
+    CUDAContextScope scope(this);
+
+    CUfunction cuFilterDetectOutliers;
+    cuda_assert(cuModuleGetFunction(
+        &cuFilterDetectOutliers, cuFilterModule, "kernel_cuda_filter_detect_outliers"));
+    cuda_assert(cuFuncSetCacheConfig(cuFilterDetectOutliers, CU_FUNC_CACHE_PREFER_L1));
+    CUDA_GET_BLOCKSIZE(
+        cuFilterDetectOutliers, task->rect.z - task->rect.x, task->rect.w - task->rect.y);
+
+    void *args[] = {&image_ptr,
+                    &variance_ptr,
+                    &depth_ptr,
+                    &output_ptr,
+                    &task->rect,
+                    &task->buffer.pass_stride};
+
+    CUDA_LAUNCH_KERNEL(cuFilterDetectOutliers, args);
+    cuda_assert(cuCtxSynchronize());
+
+    return !have_error();
+  }
+
+  void denoise(RenderTile &rtile, DenoisingTask &denoising)
+  {
+    denoising.functions.construct_transform = function_bind(
+        &CUDADevice::denoising_construct_transform, this, &denoising);
+    denoising.functions.accumulate = function_bind(
+        &CUDADevice::denoising_accumulate, this, _1, _2, _3, _4, &denoising);
+    denoising.functions.solve = function_bind(&CUDADevice::denoising_solve, this, _1, &denoising);
+    denoising.functions.divide_shadow = function_bind(
+        &CUDADevice::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
+    denoising.functions.non_local_means = function_bind(
+        &CUDADevice::denoising_non_local_means, this, _1, _2, _3, _4, &denoising);
+    denoising.functions.combine_halves = function_bind(
+        &CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
+    denoising.functions.get_feature = function_bind(
+        &CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, _5, &denoising);
+    denoising.functions.write_feature = function_bind(
+        &CUDADevice::denoising_write_feature, this, _1, _2, _3, &denoising);
+    denoising.functions.detect_outliers = function_bind(
+        &CUDADevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
+
+    denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
+    denoising.render_buffer.samples = rtile.sample;
+    denoising.buffer.gpu_temporary_mem = true;
+
+    denoising.run_denoising(&rtile);
+  }
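For readers unfamiliar with Cycles' function_bind, the placeholders _1 ... _6 forward the caller's arguments while this and &denoising are bound up front. As a sketch, and assuming functions.solve is a function<bool(device_ptr)>, the solve callback above is roughly equivalent to:

  denoising.functions.solve = [this, &denoising](device_ptr output_ptr) {
    return denoising_solve(output_ptr, &denoising);
  };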
+
+  void path_trace(DeviceTask &task, RenderTile &rtile, device_vector<WorkTile> &work_tiles)
+  {
+    scoped_timer timer(&rtile.buffers->render_time);
+
+    if (have_error())
+      return;
+
+    CUDAContextScope scope(this);
+    CUfunction cuPathTrace;
+
+    /* Get kernel function. */
+    if (task.integrator_branched) {
+      cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_branched_path_trace"));
+    }
+    else {
+      cuda_assert(cuModuleGetFunction(&cuPathTrace, cuModule, "kernel_cuda_path_trace"));
+    }
+
+    if (have_error()) {
+      return;
+    }
+
+    cuda_assert(cuFuncSetCacheConfig(cuPathTrace, CU_FUNC_CACHE_PREFER_L1));
+
+    /* Allocate work tile. */
+    work_tiles.alloc(1);
+
+    WorkTile *wtile = work_tiles.data();
+    wtile->x = rtile.x;
+    wtile->y = rtile.y;
+    wtile->w = rtile.w;
+    wtile->h = rtile.h;
+    wtile->offset = rtile.offset;
+    wtile->stride = rtile.stride;
+    wtile->buffer = (float *)cuda_device_ptr(rtile.buffer);
+
+    /* Prepare work size. More step samples render faster, but for now we
+     * remain conservative for GPUs connected to a display to avoid driver
+     * timeouts and display freezing. */
+    int min_blocks, num_threads_per_block;
+    cuda_assert(cuOccupancyMaxPotentialBlockSize(
+        &min_blocks, &num_threads_per_block, cuPathTrace, NULL, 0, 0));
+    if (!info.display_device) {
+      min_blocks *= 8;
+    }
+
+    uint step_samples = divide_up(min_blocks * num_threads_per_block, wtile->w * wtile->h);
+
+    /* Render all samples. */
+    int start_sample = rtile.start_sample;
+    int end_sample = rtile.start_sample + rtile.num_samples;
+
+    for (int sample = start_sample; sample < end_sample; sample += step_samples) {
+      /* Setup and copy work tile to device. */
+      wtile->start_sample = sample;
+      wtile->num_samples = min(step_samples, end_sample - sample);
+      work_tiles.copy_to_device();
+
+      CUdeviceptr d_work_tiles = cuda_device_ptr(work_tiles.device_pointer);
+      uint total_work_size = wtile->w * wtile->h * wtile->num_samples;
+      uint num_blocks = divide_up(total_work_size, num_threads_per_block);
+
+      /* Launch kernel. */
+      void *args[] = {&d_work_tiles, &total_work_size};
+
+      cuda_assert(cuLaunchKernel(
+          cuPathTrace, num_blocks, 1, 1, num_threads_per_block, 1, 1, 0, 0, args, 0));
+
+      cuda_assert(cuCtxSynchronize());
+
+      /* Update progress. */
+      rtile.sample = sample + wtile->num_samples;
+      task.update_progress(&rtile, rtile.w * rtile.h * wtile->num_samples);
+
+      if (task.get_cancel()) {
+        if (task.need_finish_queue == false)
+          break;
+      }
+    }
+  }
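A worked example of the step-size heuristic, with assumed occupancy numbers:

  /* Say cuOccupancyMaxPotentialBlockSize() reports min_blocks = 40 and
   * num_threads_per_block = 256. On a non-display device min_blocks becomes
   * 320, so for a 128x128 tile:
   *
   *   step_samples = divide_up(320 * 256, 128 * 128)
   *                = divide_up(81920, 16384) = 5
   *
   * i.e. each launch renders 5 samples per pixel before the loop checks for
   * cancellation; on a display device it stays at divide_up(10240, 16384) = 1. */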
+
+  void film_convert(DeviceTask &task,
+                    device_ptr buffer,
+                    device_ptr rgba_byte,
+                    device_ptr rgba_half)
+  {
+    if (have_error())
+      return;
+
+    CUDAContextScope scope(this);
+
+    CUfunction cuFilmConvert;
+    CUdeviceptr d_rgba = map_pixels((rgba_byte) ? rgba_byte : rgba_half);
+    CUdeviceptr d_buffer = cuda_device_ptr(buffer);
+
+    /* get kernel function */
+    if (rgba_half) {
+      cuda_assert(
+          cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_half_float"));
+    }
+    else {
+      cuda_assert(cuModuleGetFunction(&cuFilmConvert, cuModule, "kernel_cuda_convert_to_byte"));
+    }
+
+    float sample_scale = 1.0f / (task.sample + 1);
+
+    /* pass in parameters */
+    void *args[] = {&d_rgba,
+                    &d_buffer,
+                    &sample_scale,
+                    &task.x,
+                    &task.y,
+                    &task.w,
+                    &task.h,
+                    &task.offset,
+                    &task.stride};
+
+    /* launch kernel */
+    int threads_per_block;
+    cuda_assert(cuFuncGetAttribute(
+        &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuFilmConvert));
+
+    int xthreads = (int)sqrt(threads_per_block);
+    int ythreads = (int)sqrt(threads_per_block);
+    int xblocks = (task.w + xthreads - 1) / xthreads;
+    int yblocks = (task.h + ythreads - 1) / ythreads;
+
+    cuda_assert(cuFuncSetCacheConfig(cuFilmConvert, CU_FUNC_CACHE_PREFER_L1));
+
+    cuda_assert(cuLaunchKernel(cuFilmConvert,
+                               xblocks,
+                               yblocks,
+                               1, /* blocks */
+                               xthreads,
+                               ythreads,
+                               1, /* threads */
+                               0,
+                               0,
+                               args,
+                               0));
+
+    unmap_pixels((rgba_byte) ? rgba_byte : rgba_half);
+
+    cuda_assert(cuCtxSynchronize());
+  }
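Unlike the 1D launches elsewhere, film conversion splits the block budget into a near-square 2D block. With assumed numbers:

  /* If the kernel allows threads_per_block = 1024, then
   * xthreads = ythreads = (int)sqrt(1024) = 32, and a 1920x1080 task launches
   *
   *   xblocks = (1920 + 31) / 32 = 60
   *   yblocks = (1080 + 31) / 32 = 34
   *
   * a 60x34 grid of 32x32 blocks; the kernel is expected to bounds-check the
   * partial last row. */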
+
+  void shader(DeviceTask &task)
+  {
+    if (have_error())
+      return;
+
+    CUDAContextScope scope(this);
+
+    CUfunction cuShader;
+    CUdeviceptr d_input = cuda_device_ptr(task.shader_input);
+    CUdeviceptr d_output = cuda_device_ptr(task.shader_output);
+
+    /* get kernel function */
+    if (task.shader_eval_type >= SHADER_EVAL_BAKE) {
+      cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_bake"));
+    }
+    else if (task.shader_eval_type == SHADER_EVAL_DISPLACE) {
+      cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_displace"));
+    }
+    else {
+      cuda_assert(cuModuleGetFunction(&cuShader, cuModule, "kernel_cuda_background"));
+    }
+
+    /* do tasks in smaller chunks, so we can cancel them */
+    const int shader_chunk_size = 65536;
+    const int start = task.shader_x;
+    const int end = task.shader_x + task.shader_w;
+    int offset = task.offset;
+
+    bool canceled = false;
+    for (int sample = 0; sample < task.num_samples && !canceled; sample++) {
+      for (int shader_x = start; shader_x < end; shader_x += shader_chunk_size) {
+        int shader_w = min(shader_chunk_size, end - shader_x);
+
+        /* pass in parameters */
+        void *args[8];
+        int arg = 0;
+        args[arg++] = &d_input;
+        args[arg++] = &d_output;
+        args[arg++] = &task.shader_eval_type;
+        if (task.shader_eval_type >= SHADER_EVAL_BAKE) {
+          args[arg++] = &task.shader_filter;
+        }
+        args[arg++] = &shader_x;
+        args[arg++] = &shader_w;
+        args[arg++] = &offset;
+        args[arg++] = &sample;
+
+        /* launch kernel */
+        int threads_per_block;
+        cuda_assert(cuFuncGetAttribute(
+            &threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, cuShader));
+
+        int xblocks = (shader_w + threads_per_block - 1) / threads_per_block;
+
+        cuda_assert(cuFuncSetCacheConfig(cuShader, CU_FUNC_CACHE_PREFER_L1));
+        cuda_assert(cuLaunchKernel(cuShader,
+                                   xblocks,
+                                   1,
+                                   1, /* blocks */
+                                   threads_per_block,
+                                   1,
+                                   1, /* threads */
+                                   0,
+                                   0,
+                                   args,
+                                   0));
+
+        cuda_assert(cuCtxSynchronize());
+
+        if (task.get_cancel()) {
+          canceled = true;
+          break;
+        }
+      }
+
+      task.update_progress(NULL);
+    }
+  }
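An example of the chunking, with assumed sizes:

  /* With task.shader_w = 200000 and shader_chunk_size = 65536, each sample
   * launches chunks of 65536, 65536, 65536 and 3392 elements, synchronizing
   * and polling task.get_cancel() after every chunk, so cancellation latency
   * is bounded by one chunk. */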
+
+  CUdeviceptr map_pixels(device_ptr mem)
+  {
+    if (!background) {
+      PixelMem pmem = pixel_mem_map[mem];
+      CUdeviceptr buffer;
+
+      size_t bytes;
+      cuda_assert(cuGraphicsMapResources(1, &pmem.cuPBOresource, 0));
+      cuda_assert(cuGraphicsResourceGetMappedPointer(&buffer, &bytes, pmem.cuPBOresource));
+
+      return buffer;
+    }
+
+    return cuda_device_ptr(mem);
+  }
+
+  void unmap_pixels(device_ptr mem)
+  {
+    if (!background) {
+      PixelMem pmem = pixel_mem_map[mem];
+
+      cuda_assert(cuGraphicsUnmapResources(1, &pmem.cuPBOresource, 0));
+    }
+  }
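The map/unmap pair brackets every CUDA-side access to the GL pixel buffer. A minimal sketch of the driver-API interop pattern, assuming resource was registered with cuGraphicsGLRegisterBuffer() as in pixels_alloc() below:

  CUgraphicsResource resource; /* registered elsewhere, see pixels_alloc() */
  CUdeviceptr ptr;
  size_t bytes;
  cuda_assert(cuGraphicsMapResources(1, &resource, 0));
  cuda_assert(cuGraphicsResourceGetMappedPointer(&ptr, &bytes, resource));
  /* ... write into ptr from CUDA while mapped ... */
  cuda_assert(cuGraphicsUnmapResources(1, &resource, 0));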
+
+  void pixels_alloc(device_memory &mem)
+  {
+    PixelMem pmem;
+
+    pmem.w = mem.data_width;
+    pmem.h = mem.data_height;
+
+    CUDAContextScope scope(this);
+
+    glGenBuffers(1, &pmem.cuPBO);
+    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
+    if (mem.data_type == TYPE_HALF)
+      glBufferData(
+          GL_PIXEL_UNPACK_BUFFER, pmem.w * pmem.h * sizeof(GLhalf) * 4, NULL, GL_DYNAMIC_DRAW);
+    else
+      glBufferData(
+          GL_PIXEL_UNPACK_BUFFER, pmem.w * pmem.h * sizeof(uint8_t) * 4, NULL, GL_DYNAMIC_DRAW);
+
+    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+
+    glActiveTexture(GL_TEXTURE0);
+    glGenTextures(1, &pmem.cuTexId);
+    glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
+    if (mem.data_type == TYPE_HALF)
+      glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, pmem.w, pmem.h, 0, GL_RGBA, GL_HALF_FLOAT, NULL);
+    else
+      glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, pmem.w, pmem.h, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
+    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+    glBindTexture(GL_TEXTURE_2D, 0);
+
+    CUresult result = cuGraphicsGLRegisterBuffer(
+        &pmem.cuPBOresource, pmem.cuPBO, CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE);
+
+    if (result == CUDA_SUCCESS) {
+      mem.device_pointer = pmem.cuTexId;
+      pixel_mem_map[mem.device_pointer] = pmem;
+
+      mem.device_size = mem.memory_size();
+      stats.mem_alloc(mem.device_size);
+
+      return;
+    }
+    else {
+      /* failed to register buffer, fall back to no interop */
+      glDeleteBuffers(1, &pmem.cuPBO);
+      glDeleteTextures(1, &pmem.cuTexId);
+
+      background = true;
+    }
+  }
+
+  void pixels_copy_from(device_memory &mem, int y, int w, int h)
+  {
+    PixelMem pmem = pixel_mem_map[mem.device_pointer];
+
+    CUDAContextScope scope(this);
+
+    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
+    uchar *pixels = (uchar *)glMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_READ_ONLY);
+    size_t offset = sizeof(uchar) * 4 * y * w;
+    memcpy((uchar *)mem.host_pointer + offset, pixels + offset, sizeof(uchar) * 4 * w * h);
+    glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
+    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+  }
+
+  void pixels_free(device_memory &mem)
+  {
+    if (mem.device_pointer) {
+      PixelMem pmem = pixel_mem_map[mem.device_pointer];
+
+      CUDAContextScope scope(this);
+
+      cuda_assert(cuGraphicsUnregisterResource(pmem.cuPBOresource));
+      glDeleteBuffers(1, &pmem.cuPBO);
+      glDeleteTextures(1, &pmem.cuTexId);
+
+      pixel_mem_map.erase(pixel_mem_map.find(mem.device_pointer));
+      mem.device_pointer = 0;
+
+      stats.mem_free(mem.device_size);
+      mem.device_size = 0;
+    }
+  }
+
+  void draw_pixels(device_memory &mem,
+                   int y,
+                   int w,
+                   int h,
+                   int width,
+                   int height,
+                   int dx,
+                   int dy,
+                   int dw,
+                   int dh,
+                   bool transparent,
+                   const DeviceDrawParams &draw_params)
+  {
+    assert(mem.type == MEM_PIXELS);
+
+    if (!background) {
+      const bool use_fallback_shader = (draw_params.bind_display_space_shader_cb == NULL);
+      PixelMem pmem = pixel_mem_map[mem.device_pointer];
+      float *vpointer;
+
+      CUDAContextScope scope(this);
+
+      /* For multi-device rendering this takes the inefficient approach of allocating
+       * all pixels on every device even though each device renders only a subset. */
+      size_t offset = 4 * y * w;
+
+      if (mem.data_type == TYPE_HALF)
+        offset *= sizeof(GLhalf);
+      else
+        offset *= sizeof(uint8_t);
+
+      glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pmem.cuPBO);
+      glActiveTexture(GL_TEXTURE0);
+      glBindTexture(GL_TEXTURE_2D, pmem.cuTexId);
+      if (mem.data_type == TYPE_HALF) {
+        glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_HALF_FLOAT, (void *)offset);
+      }
+      else {
+        glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, w, h, GL_RGBA, GL_UNSIGNED_BYTE, (void *)offset);
+      }
+      glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
+
+      if (transparent) {
+        glEnable(GL_BLEND);
+        glBlendFunc(GL_ONE, GL_ONE_MINUS_SRC_ALPHA);
+      }
+
+      GLint shader_program;
+      if (use_fallback_shader) {
+        if (!bind_fallback_display_space_shader(dw, dh)) {
+          return;
+        }
+        shader_program = fallback_shader_program;
+      }
+      else {
+        draw_params.bind_display_space_shader_cb();
+        glGetIntegerv(GL_CURRENT_PROGRAM, &shader_program);
+      }
+
+      if (!vertex_buffer) {
+        glGenBuffers(1, &vertex_buffer);
+      }
+
+      glBindBuffer(GL_ARRAY_BUFFER, vertex_buffer);
+      /* invalidate old contents - avoids stalling if the buffer is still queued for rendering */
+      glBufferData(GL_ARRAY_BUFFER, 16 * sizeof(float), NULL, GL_STREAM_DRAW);
+
+      vpointer = (float *)glMapBuffer(GL_ARRAY_BUFFER, GL_WRITE_ONLY);
+
+      if (vpointer) {
+        /* texture coordinate - vertex pair */
+        vpointer[0] = 0.0f;
+        vpointer[1] = 0.0f;
+        vpointer[2] = dx;
+        vpointer[3] = dy;
+
+        vpointer[4] = (float)w / (float)pmem.w;
+        vpointer[5] = 0.0f;
+        vpointer[6] = (float)width + dx;
+        vpointer[7] = dy;
+
+        vpointer[8] = (float)w / (float)pmem.w;
+        vpointer[9] = (float)h / (float)pmem.h;
+        vpointer[10] = (float)width + dx;
+        vpointer[11] = (float)height + dy;
+
+        vpointer[12] = 0.0f;
+        vpointer[13] = (float)h / (float)pmem.h;
+        vpointer[14] = dx;
+        vpointer[15] = (float)height + dy;
+
+        glUnmapBuffer(GL_ARRAY_BUFFER);
+      }
+
+      GLuint vertex_array_object;
+      GLuint position_attribute, texcoord_attribute;
+
+      glGenVertexArrays(1, &vertex_array_object);
+      glBindVertexArray(vertex_array_object);
+
+      texcoord_attribute = glGetAttribLocation(shader_program, "texCoord");
+      position_attribute = glGetAttribLocation(shader_program, "pos");
+
+      glEnableVertexAttribArray(texcoord_attribute);
+      glEnableVertexAttribArray(position_attribute);
+
+      glVertexAttribPointer(
+          texcoord_attribute, 2, GL_FLOAT, GL_FALSE, 4 * sizeof(float), (const GLvoid *)0);
+      glVertexAttribPointer(position_attribute,
+                            2,
+                            GL_FLOAT,
+                            GL_FALSE,
+                            4 * sizeof(float),
+                            (const GLvoid *)(sizeof(float) * 2));
+
+      glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
+
+      if (use_fallback_shader) {
+        glUseProgram(0);
+      }
+      else {
+        draw_params.unbind_display_space_shader_cb();
+      }
+
+      if (transparent) {
+        glDisable(GL_BLEND);
+      }
+
+      glBindTexture(GL_TEXTURE_2D, 0);
+
+      return;
+    }
+
+    Device::draw_pixels(mem, y, w, h, width, height, dx, dy, dw, dh, transparent, draw_params);
+  }
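The 16 floats written into the vertex buffer above describe a GL_TRIANGLE_FAN of four (u, v, x, y) vertices:

  /* With u = (float)w / pmem.w and v = (float)h / pmem.h:
   *
   *   (0, 0,  dx,          dy)            bottom-left
   *   (u, 0,  dx + width,  dy)            bottom-right
   *   (u, v,  dx + width,  dy + height)   top-right
   *   (0, v,  dx,          dy + height)   top-left
   *
   * u and v fall below 1.0 when the drawn region is smaller than the PBO
   * backing the texture. */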
+
+  void thread_run(DeviceTask *task)
+  {
+    CUDAContextScope scope(this);
+
+    if (task->type == DeviceTask::RENDER) {
+      DeviceRequestedFeatures requested_features;
+      if (use_split_kernel()) {
+        if (split_kernel == NULL) {
+          split_kernel = new CUDASplitKernel(this);
+          split_kernel->load_kernels(requested_features);
+        }
+      }
+
+      device_vector<WorkTile> work_tiles(this, "work_tiles", MEM_READ_ONLY);
+
+      /* keep rendering tiles until done */
+      RenderTile tile;
+      DenoisingTask denoising(this, *task);
+
+      while (task->acquire_tile(this, tile)) {
+        if (tile.task == RenderTile::PATH_TRACE) {
+          if (use_split_kernel()) {
+            device_only_memory<uchar> void_buffer(this, "void_buffer");
+            split_kernel->path_trace(task, tile, void_buffer, void_buffer);
+          }
+          else {
+            path_trace(*task, tile, work_tiles);
+          }
+        }
+        else if (tile.task == RenderTile::DENOISE) {
+          tile.sample = tile.start_sample + tile.num_samples;
+
+          denoise(tile, denoising);
+
+          task->update_progress(&tile, tile.w * tile.h);
+        }
+
+        task->release_tile(tile);
+
+        if (task->get_cancel()) {
+          if (task->need_finish_queue == false)
+            break;
+        }
+      }
+
+      work_tiles.free();
+    }
+    else if (task->type == DeviceTask::SHADER) {
+      shader(*task);
+
+      cuda_assert(cuCtxSynchronize());
+    }
+  }
+
+  class CUDADeviceTask : public DeviceTask {
+   public:
+    CUDADeviceTask(CUDADevice *device, DeviceTask &task) : DeviceTask(task)
+    {
+      run = function_bind(&CUDADevice::thread_run, device, this);
+    }
+  };
+
+  int get_split_task_count(DeviceTask & /*task*/)
+  {
+    return 1;
+  }
+
+  void task_add(DeviceTask &task)
+  {
+    CUDAContextScope scope(this);
+
+    /* Load texture info. */
+    load_texture_info();
+
+    /* Synchronize all memory copies before executing task. */
+    cuda_assert(cuCtxSynchronize());
+
+    if (task.type == DeviceTask::FILM_CONVERT) {
+      /* must be done in the main thread due to OpenGL access */
+      film_convert(task, task.buffer, task.rgba_byte, task.rgba_half);
+    }
+    else {
+      task_pool.push(new CUDADeviceTask(this, task));
+    }
+  }
+
+  void task_wait()
+  {
+    task_pool.wait();
+  }
+
+  void task_cancel()
+  {
+    task_pool.cancel();
+  }
+
+  friend class CUDASplitKernelFunction;
+  friend class CUDASplitKernel;
+  friend class CUDAContextScope;
 };
 
 /* redefine the cuda_assert macro so it can be used outside of the CUDADevice class
@@ -2207,496 +2305,501 @@ public:
  */
 #undef cuda_assert
 #define cuda_assert(stmt) \
-       { \
-               CUresult result = stmt; \
-               \
-               if(result != CUDA_SUCCESS) { \
-                       string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \
-                       if(device->error_msg == "") \
-                               device->error_msg = message; \
-                       fprintf(stderr, "%s\n", message.c_str()); \
-                       /*cuda_abort();*/ \
-                       device->cuda_error_documentation(); \
-               } \
-       } (void) 0
-
+  { \
+    CUresult result = stmt; \
+\
+    if (result != CUDA_SUCCESS) { \
+      string message = string_printf("CUDA error: %s in %s", cuewErrorString(result), #stmt); \
+      if (device->error_msg == "") \
+        device->error_msg = message; \
+      fprintf(stderr, "%s\n", message.c_str()); \
+      /*cuda_abort();*/ \
+      device->cuda_error_documentation(); \
+    } \
+  } \
+  (void)0
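The { ... } (void)0 shape makes the macro act like a single statement that still requires a trailing semicolon, so it composes safely with unbraced conditionals (use_cuda and do_something_else() are placeholders):

  if (use_cuda)
    cuda_assert(cuCtxSynchronize()); /* expands to { ... } (void)0; */
  else
    do_something_else();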
 
 /* CUDA context scope. */
 
-CUDAContextScope::CUDAContextScope(CUDADevice *device)
-: device(device)
+CUDAContextScope::CUDAContextScope(CUDADevice *device) : device(device)
 {
-       cuda_assert(cuCtxPushCurrent(device->cuContext));
+  cuda_assert(cuCtxPushCurrent(device->cuContext));
 }
 
 CUDAContextScope::~CUDAContextScope()
 {
-       cuda_assert(cuCtxPopCurrent(NULL));
+  cuda_assert(cuCtxPopCurrent(NULL));
 }
 
 /* split kernel */
 
-class CUDASplitKernelFunction : public SplitKernelFunction{
-       CUDADevice* device;
-       CUfunction func;
-public:
-       CUDASplitKernelFunction(CUDADevice *device, CUfunction func) : device(device), func(func) {}
-
-       /* enqueue the kernel, returns false if there is an error */
-       bool enqueue(const KernelDimensions &dim, device_memory &/*kg*/, device_memory &/*data*/)
-       {
-               return enqueue(dim, NULL);
-       }
-
-       /* enqueue the kernel, returns false if there is an error */
-       bool enqueue(const KernelDimensions &dim, void *args[])
-       {
-               if(device->have_error())
-                       return false;
-
-               CUDAContextScope scope(device);
-
-               /* we ignore dim.local_size for now, as this is faster */
-               int threads_per_block;
-               cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func));
-
-               int xblocks = (dim.global_size[0]*dim.global_size[1] + threads_per_block - 1)/threads_per_block;
-
-               cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1));
-
-               cuda_assert(cuLaunchKernel(func,
-                                          xblocks, 1, 1, /* blocks */
-                                          threads_per_block, 1, 1, /* threads */
-                                          0, 0, args, 0));
-
-               return !device->have_error();
-       }
+class CUDASplitKernelFunction : public SplitKernelFunction {
+  CUDADevice *device;
+  CUfunction func;
+
+ public:
+  CUDASplitKernelFunction(CUDADevice *device, CUfunction func) : device(device), func(func)
+  {
+  }
+
+  /* enqueue the kernel, returns false if there is an error */
+  bool enqueue(const KernelDimensions &dim, device_memory & /*kg*/, device_memory & /*data*/)
+  {
+    return enqueue(dim, NULL);
+  }
+
+  /* enqueue the kernel, returns false if there is an error */
+  bool enqueue(const KernelDimensions &dim, void *args[])
+  {
+    if (device->have_error())
+      return false;
+
+    CUDAContextScope scope(device);
+
+    /* we ignore dim.local_size for now, as this is faster */
+    int threads_per_block;
+    cuda_assert(
+        cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func));
+
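+    /* Integer ceiling division: launch enough blocks to cover the whole
+     * flattened global size. */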
+    int xblocks = (dim.global_size[0] * dim.global_size[1] + threads_per_block - 1) /
+                  threads_per_block;
+
+    cuda_assert(cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_L1));
+
+    cuda_assert(cuLaunchKernel(func,
+                               xblocks,
+                               1,
+                               1, /* blocks */
+                               threads_per_block,
+                               1,
+                               1, /* threads */
+                               0,
+                               0,
+                               args,
+                               0));
+
+    return !device->have_error();
+  }
 };
 
 CUDASplitKernel::CUDASplitKernel(CUDADevice *device) : DeviceSplitKernel(device), device(device)
 {
 }
 
-uint64_t CUDASplitKernel::state_buffer_size(device_memory& /*kg*/, device_memory& /*data*/, size_t num_threads)
+uint64_t CUDASplitKernel::state_buffer_size(device_memory & /*kg*/,
+                                            device_memory & /*data*/,
+                                            size_t num_threads)
 {
-       CUDAContextScope scope(device);
+  CUDAContextScope scope(device);
 
-       device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE);
-       size_buffer.alloc(1);
-       size_buffer.zero_to_device();
+  device_vector<uint64_t> size_buffer(device, "size_buffer", MEM_READ_WRITE);
+  size_buffer.alloc(1);
+  size_buffer.zero_to_device();
 
-       uint threads = num_threads;
-       CUdeviceptr d_size = device->cuda_device_ptr(size_buffer.device_pointer);
+  uint threads = num_threads;
+  CUdeviceptr d_size = device->cuda_device_ptr(size_buffer.device_pointer);
 
-       struct args_t {
-               uint* num_threads;
-               CUdeviceptr* size;
-       };
+  struct args_t {
+    uint *num_threads;
+    CUdeviceptr *size;
+  };
 
-       args_t args = {
-               &threads,
-               &d_size
-       };
+  args_t args = {&threads, &d_size};
 
-       CUfunction state_buffer_size;
-       cuda_assert(cuModuleGetFunction(&state_buffer_size, device->cuModule, "kernel_cuda_state_buffer_size"));
+  CUfunction state_buffer_size;
+  cuda_assert(
+      cuModuleGetFunction(&state_buffer_size, device->cuModule, "kernel_cuda_state_buffer_size"));
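+
+  /* Launch a single thread; the kernel computes the state buffer size
+   * needed for num_threads threads and writes it into size_buffer. */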
 
-       cuda_assert(cuLaunchKernel(state_buffer_size,
-                                  1, 1, 1,
-                                  1, 1, 1,
-                                  0, 0, (void**)&args, 0));
+  cuda_assert(cuLaunchKernel(state_buffer_size, 1, 1, 1, 1, 1, 1, 0, 0, (void **)&args, 0));
 
-       size_buffer.copy_from_device(0, 1, 1);
-       size_t size = size_buffer[0];
-       size_buffer.free();
+  size_buffer.copy_from_device(0, 1, 1);
+  size_t size = size_buffer[0];
+  size_buffer.free();
 
-       return size;
+  return size;
 }
 
-bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions& dim,
-                                    RenderTile& rtile,
-                                    int num_global_elements,
-                                    device_memory& /*kernel_globals*/,
-                                    device_memory& /*kernel_data*/,
-                                    device_memory& split_data,
-                                    device_memory& ray_state,
-                                    device_memory& queue_index,
-                                    device_memory& use_queues_flag,
-                                    device_memory& work_pool_wgs)
+bool CUDASplitKernel::enqueue_split_kernel_data_init(const KernelDimensions &dim,
+                                                     RenderTile &rtile,
+                                                     int num_global_elements,
+                                                     device_memory & /*kernel_globals*/,
+                                                     device_memory & /*kernel_data*/,
+                                                     device_memory &split_data,
+                                                     device_memory &ray_state,
+                                                     device_memory &queue_index,
+                                                     device_memory &use_queues_flag,
+                                                     device_memory &work_pool_wgs)
 {
-       CUDAContextScope scope(device);
-
-       CUdeviceptr d_split_data = device->cuda_device_ptr(split_data.device_pointer);
-       CUdeviceptr d_ray_state = device->cuda_device_ptr(ray_state.device_pointer);
-       CUdeviceptr d_queue_index = device->cuda_device_ptr(queue_index.device_pointer);
-       CUdeviceptr d_use_queues_flag = device->cuda_device_ptr(use_queues_flag.device_pointer);
-       CUdeviceptr d_work_pool_wgs = device->cuda_device_ptr(work_pool_wgs.device_pointer);
-
-       CUdeviceptr d_buffer = device->cuda_device_ptr(rtile.buffer);
-
-       int end_sample = rtile.start_sample + rtile.num_samples;
-       int queue_size = dim.global_size[0] * dim.global_size[1];
-
-       struct args_t {
-               CUdeviceptr* split_data_buffer;
-               int* num_elements;
-               CUdeviceptr* ray_state;
-               int* start_sample;
-               int* end_sample;
-               int* sx;
-               int* sy;
-               int* sw;
-               int* sh;
-               int* offset;
-               int* stride;
-               CUdeviceptr* queue_index;
-               int* queuesize;
-               CUdeviceptr* use_queues_flag;
-               CUdeviceptr* work_pool_wgs;
-               int* num_samples;
-               CUdeviceptr* buffer;
-       };
-
-       args_t args = {
-               &d_split_data,
-               &num_global_elements,
-               &d_ray_state,
-               &rtile.start_sample,
-               &end_sample,
-               &rtile.x,
-               &rtile.y,
-               &rtile.w,
-               &rtile.h,
-               &rtile.offset,
-               &rtile.stride,
-               &d_queue_index,
-               &queue_size,
-               &d_use_queues_flag,
-               &d_work_pool_wgs,
-               &rtile.num_samples,
-               &d_buffer
-       };
-
-       CUfunction data_init;
-       cuda_assert(cuModuleGetFunction(&data_init, device->cuModule, "kernel_cuda_path_trace_data_init"));
-       if(device->have_error()) {
-               return false;
-       }
-
-       CUDASplitKernelFunction(device, data_init).enqueue(dim, (void**)&args);
-
-       return !device->have_error();
+  CUDAContextScope scope(device);
+
+  CUdeviceptr d_split_data = device->cuda_device_ptr(split_data.device_pointer);
+  CUdeviceptr d_ray_state = device->cuda_device_ptr(ray_state.device_pointer);
+  CUdeviceptr d_queue_index = device->cuda_device_ptr(queue_index.device_pointer);
+  CUdeviceptr d_use_queues_flag = device->cuda_device_ptr(use_queues_flag.device_pointer);
+  CUdeviceptr d_work_pool_wgs = device->cuda_device_ptr(work_pool_wgs.device_pointer);
+
+  CUdeviceptr d_buffer = device->cuda_device_ptr(rtile.buffer);
+
+  int end_sample = rtile.start_sample + rtile.num_samples;
+  int queue_size = dim.global_size[0] * dim.global_size[1];
+
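+  /* cuLaunchKernel() takes kernel parameters as an array of pointers. A
+   * struct of pointers in parameter order has the same memory layout, so
+   * (void **)&args can be passed directly. */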
+  struct args_t {
+    CUdeviceptr *split_data_buffer;
+    int *num_elements;
+    CUdeviceptr *ray_state;
+    int *start_sample;
+    int *end_sample;
+    int *sx;
+    int *sy;
+    int *sw;
+    int *sh;
+    int *offset;
+    int *stride;
+    CUdeviceptr *queue_index;
+    int *queuesize;
+    CUdeviceptr *use_queues_flag;
+    CUdeviceptr *work_pool_wgs;
+    int *num_samples;
+    CUdeviceptr *buffer;
+  };
+
+  args_t args = {&d_split_data,
+                 &num_global_elements,
+                 &d_ray_state,
+                 &rtile.start_sample,
+                 &end_sample,
+                 &rtile.x,
+                 &rtile.y,
+                 &rtile.w,
+                 &rtile.h,
+                 &rtile.offset,
+                 &rtile.stride,
+                 &d_queue_index,
+                 &queue_size,
+                 &d_use_queues_flag,
+                 &d_work_pool_wgs,
+                 &rtile.num_samples,
+                 &d_buffer};
+
+  CUfunction data_init;
+  cuda_assert(
+      cuModuleGetFunction(&data_init, device->cuModule, "kernel_cuda_path_trace_data_init"));
+  if (device->have_error()) {
+    return false;
+  }
+
+  CUDASplitKernelFunction(device, data_init).enqueue(dim, (void **)&args);
+
+  return !device->have_error();
 }
 
-SplitKernelFunction* CUDASplitKernel::get_split_kernel_function(const string& kernel_name,
-                                                                const DeviceRequestedFeatures&)
+SplitKernelFunction *CUDASplitKernel::get_split_kernel_function(const string &kernel_name,
+                                                                const DeviceRequestedFeatures &)
 {
-       CUDAContextScope scope(device);
-       CUfunction func;
-
-       cuda_assert(cuModuleGetFunction(&func, device->cuModule, (string("kernel_cuda_") + kernel_name).data()));
-       if(device->have_error()) {
-               device->cuda_error_message(string_printf("kernel \"kernel_cuda_%s\" not found in module", kernel_name.data()));
-               return NULL;
-       }
-
-       return new CUDASplitKernelFunction(device, func);
+  CUDAContextScope scope(device);
+  CUfunction func;
+
+  cuda_assert(
+      cuModuleGetFunction(&func, device->cuModule, (string("kernel_cuda_") + kernel_name).data()));
+  if (device->have_error()) {
+    device->cuda_error_message(
+        string_printf("kernel \"kernel_cuda_%s\" not found in module", kernel_name.data()));
+    return NULL;
+  }
+
+  return new CUDASplitKernelFunction(device, func);
 }
 
 int2 CUDASplitKernel::split_kernel_local_size()
 {
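+  /* One CUDA warp: 32 threads wide, 1 high. */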
-       return make_int2(32, 1);
+  return make_int2(32, 1);
 }
 
-int2 CUDASplitKernel::split_kernel_global_size(device_memory& kg, device_memory& data, DeviceTask * /*task*/)
+int2 CUDASplitKernel::split_kernel_global_size(device_memory &kg,
+                                               device_memory &data,
+                                               DeviceTask * /*task*/)
 {
-       CUDAContextScope scope(device);
-       size_t free;
-       size_t total;
+  CUDAContextScope scope(device);
+  size_t free;
+  size_t total;
 
-       cuda_assert(cuMemGetInfo(&free, &total));
+  cuda_assert(cuMemGetInfo(&free, &total));
 
-       VLOG(1) << "Maximum device allocation size: "
-               << string_human_readable_number(free) << " bytes. ("
-               << string_human_readable_size(free) << ").";
+  VLOG(1) << "Maximum device allocation size: " << string_human_readable_number(free)
+          << " bytes. (" << string_human_readable_size(free) << ").";
 
-       size_t num_elements = max_elements_for_max_buffer_size(kg, data, free / 2);
-       size_t side = round_down((int)sqrt(num_elements), 32);
-       int2 global_size = make_int2(side, round_down(num_elements / side, 16));
-       VLOG(1) << "Global size: " << global_size << ".";
-       return global_size;
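+  /* Budget at most half of the free device memory for the split kernel
+   * state, and shape the global size into a warp-aligned, roughly square
+   * grid. */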
+  size_t num_elements = max_elements_for_max_buffer_size(kg, data, free / 2);
+  size_t side = round_down((int)sqrt(num_elements), 32);
+  int2 global_size = make_int2(side, round_down(num_elements / side, 16));
+  VLOG(1) << "Global size: " << global_size << ".";
+  return global_size;
 }
 
 bool device_cuda_init()
 {
 #ifdef WITH_CUDA_DYNLOAD
-       static bool initialized = false;
-       static bool result = false;
-
-       if(initialized)
-               return result;
-
-       initialized = true;
-       int cuew_result = cuewInit(CUEW_INIT_CUDA);
-       if(cuew_result == CUEW_SUCCESS) {
-               VLOG(1) << "CUEW initialization succeeded";
-               if(CUDADevice::have_precompiled_kernels()) {
-                       VLOG(1) << "Found precompiled kernels";
-                       result = true;
-               }
-#ifndef _WIN32
-               else if(cuewCompilerPath() != NULL) {
-                       VLOG(1) << "Found CUDA compiler " << cuewCompilerPath();
-                       result = true;
-               }
-               else {
-                       VLOG(1) << "Neither precompiled kernels nor CUDA compiler was found,"
-                               << " unable to use CUDA";
-               }
-#endif
-       }
-       else {
-               VLOG(1) << "CUEW initialization failed: "
-                       << ((cuew_result == CUEW_ERROR_ATEXIT_FAILED)
-                           ? "Error setting up atexit() handler"
-                           : "Error opening the library");
-       }
-
-       return result;
+  static bool initialized = false;
+  static bool result = false;
+
+  if (initialized)
+    return result;
+
+  initialized = true;
+  int cuew_result = cuewInit(CUEW_INIT_CUDA);
+  if (cuew_result == CUEW_SUCCESS) {
+    VLOG(1) << "CUEW initialization succeeded";
+    if (CUDADevice::have_precompiled_kernels()) {
+      VLOG(1) << "Found precompiled kernels";
+      result = true;
+    }
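+    /* On Windows the CUDA compiler path is not checked; only precompiled
+     * kernels can be used. */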
+#  ifndef _WIN32
+    else if (cuewCompilerPath() != NULL) {
+      VLOG(1) << "Found CUDA compiler " << cuewCompilerPath();
+      result = true;
+    }
+    else {
+      VLOG(1) << "Neither precompiled kernels nor CUDA compiler was found,"
+              << " unable to use CUDA";
+    }
+#  endif
+  }
+  else {
+    VLOG(1) << "CUEW initialization failed: "
+            << ((cuew_result == CUEW_ERROR_ATEXIT_FAILED) ? "Error setting up atexit() handler" :
+                                                            "Error opening the library");
+  }
+
+  return result;
 #else  /* WITH_CUDA_DYNLOAD */
-       return true;
-#endif  /* WITH_CUDA_DYNLOAD */
+  return true;
+#endif /* WITH_CUDA_DYNLOAD */
 }
 
-Device *device_cuda_create(DeviceInfo& info, Stats &stats, Profiler &profiler, bool background)
+Device *device_cuda_create(DeviceInfo &info, Stats &stats, Profiler &profiler, bool background)
 {
-       return new CUDADevice(info, stats, profiler, background);
+  return new CUDADevice(info, stats, profiler, background);
 }
 
 static CUresult device_cuda_safe_init()
 {
 #ifdef _WIN32
-       __try {
-               return cuInit(0);
-       }
-       __except(EXCEPTION_EXECUTE_HANDLER) {
-               /* Ignore crashes inside the CUDA driver and hope we can
-                * survive even with corrupted CUDA installs. */
-               fprintf(stderr, "Cycles CUDA: driver crashed, continuing without CUDA.\n");
-       }
-
-       return CUDA_ERROR_NO_DEVICE;
+  __try {
+    return cuInit(0);
+  }
+  __except (EXCEPTION_EXECUTE_HANDLER) {
+    /* Ignore crashes inside the CUDA driver and hope we can
+     * survive even with corrupted CUDA installs. */
+    fprintf(stderr, "Cycles CUDA: driver crashed, continuing without CUDA.\n");
+  }
+
+  return CUDA_ERROR_NO_DEVICE;
 #else
-       return cuInit(0);
+  return cuInit(0);
 #endif
 }
 
-void device_cuda_info(vector<DeviceInfo>& devices)
+void device_cuda_info(vector<DeviceInfo> &devices)
 {
-       CUresult result = device_cuda_safe_init();
-       if(result != CUDA_SUCCESS) {
-               if(result != CUDA_ERROR_NO_DEVICE)
-                       fprintf(stderr, "CUDA cuInit: %s\n", cuewErrorString(result));
-               return;
-       }
-
-       int count = 0;
-       result = cuDeviceGetCount(&count);
-       if(result != CUDA_SUCCESS) {
-               fprintf(stderr, "CUDA cuDeviceGetCount: %s\n", cuewErrorString(result));
-               return;
-       }
-
-       vector<DeviceInfo> display_devices;
-
-       for(int num = 0; num < count; num++) {
-               char name[256];
-
-               result = cuDeviceGetName(name, 256, num);
-               if(result != CUDA_SUCCESS) {
-                       fprintf(stderr, "CUDA cuDeviceGetName: %s\n", cuewErrorString(result));
-                       continue;
-               }
-
-               int major;
-               cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, num);
-               if(major < 3) {
-                       VLOG(1) << "Ignoring device \"" << name
-                               << "\", this graphics card is no longer supported.";
-                       continue;
-               }
-
-               DeviceInfo info;
-
-               info.type = DEVICE_CUDA;
-               info.description = string(name);
-               info.num = num;
-
-               info.has_half_images = (major >= 3);
-               info.has_volume_decoupled = false;
-
-               int pci_location[3] = {0, 0, 0};
-               cuDeviceGetAttribute(&pci_location[0], CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, num);
-               cuDeviceGetAttribute(&pci_location[1], CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, num);
-               cuDeviceGetAttribute(&pci_location[2], CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, num);
-               info.id = string_printf("CUDA_%s_%04x:%02x:%02x",
-                                       name,
-                                       (unsigned int)pci_location[0],
-                                       (unsigned int)pci_location[1],
-                                       (unsigned int)pci_location[2]);
-
-               /* If device has a kernel timeout and no compute preemption, we assume
-                * it is connected to a display and will freeze the display while doing
-                * computations. */
-               int timeout_attr = 0, preempt_attr = 0;
-               cuDeviceGetAttribute(&timeout_attr, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, num);
-               cuDeviceGetAttribute(&preempt_attr, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, num);
-
-               if(timeout_attr && !preempt_attr) {
-                       VLOG(1) << "Device is recognized as display.";
-                       info.description += " (Display)";
-                       info.display_device = true;
-                       display_devices.push_back(info);
-               }
-               else {
-                       devices.push_back(info);
-               }
-               VLOG(1) << "Added device \"" << name << "\" with id \"" << info.id << "\".";
-       }
-
-       if(!display_devices.empty())
-               devices.insert(devices.end(), display_devices.begin(), display_devices.end());
+  CUresult result = device_cuda_safe_init();
+  if (result != CUDA_SUCCESS) {
+    if (result != CUDA_ERROR_NO_DEVICE)
+      fprintf(stderr, "CUDA cuInit: %s\n", cuewErrorString(result));
+    return;
+  }
+
+  int count = 0;
+  result = cuDeviceGetCount(&count);
+  if (result != CUDA_SUCCESS) {
+    fprintf(stderr, "CUDA cuDeviceGetCount: %s\n", cuewErrorString(result));
+    return;
+  }
+
+  vector<DeviceInfo> display_devices;
+
+  for (int num = 0; num < count; num++) {
+    char name[256];
+
+    result = cuDeviceGetName(name, 256, num);
+    if (result != CUDA_SUCCESS) {
+      fprintf(stderr, "CUDA cuDeviceGetName: %s\n", cuewErrorString(result));
+      continue;
+    }
+
+    int major;
+    cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, num);
+    if (major < 3) {
+      VLOG(1) << "Ignoring device \"" << name << "\", this graphics card is no longer supported.";
+      continue;
+    }
+
+    DeviceInfo info;
+
+    info.type = DEVICE_CUDA;
+    info.description = string(name);
+    info.num = num;
+
+    info.has_half_images = (major >= 3);
+    info.has_volume_decoupled = false;
+
+    int pci_location[3] = {0, 0, 0};
+    cuDeviceGetAttribute(&pci_location[0], CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, num);
+    cuDeviceGetAttribute(&pci_location[1], CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, num);
+    cuDeviceGetAttribute(&pci_location[2], CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, num);
+    info.id = string_printf("CUDA_%s_%04x:%02x:%02x",
+                            name,
+                            (unsigned int)pci_location[0],
+                            (unsigned int)pci_location[1],
+                            (unsigned int)pci_location[2]);
+
+    /* If the device has a kernel timeout and no compute preemption, we assume
+     * it is connected to a display and will freeze the display while doing
+     * computations. */
+    int timeout_attr = 0, preempt_attr = 0;
+    cuDeviceGetAttribute(&timeout_attr, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, num);
+    cuDeviceGetAttribute(&preempt_attr, CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED, num);
+
+    if (timeout_attr && !preempt_attr) {
+      VLOG(1) << "Device is recognized as display.";
+      info.description += " (Display)";
+      info.display_device = true;
+      display_devices.push_back(info);
+    }
+    else {
+      devices.push_back(info);
+    }
+    VLOG(1) << "Added device \"" << name << "\" with id \"" << info.id << "\".";
+  }
+
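+  /* Append display devices at the end so non-display GPUs are listed
+   * first. */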
+  if (!display_devices.empty())
+    devices.insert(devices.end(), display_devices.begin(), display_devices.end());
 }
 
 string device_cuda_capabilities()
 {
-       CUresult result = device_cuda_safe_init();
-       if(result != CUDA_SUCCESS) {
-               if(result != CUDA_ERROR_NO_DEVICE) {
-                       return string("Error initializing CUDA: ") + cuewErrorString(result);
-               }
-               return "No CUDA device found\n";
-       }
-
-       int count;
-       result = cuDeviceGetCount(&count);
-       if(result != CUDA_SUCCESS) {
-               return string("Error getting devices: ") + cuewErrorString(result);
-       }
-
-       string capabilities = "";
-       for(int num = 0; num < count; num++) {
-               char name[256];
-               if(cuDeviceGetName(name, 256, num) != CUDA_SUCCESS) {
-                       continue;
-               }
-               capabilities += string("\t") + name + "\n";
-               int value;
+  CUresult result = device_cuda_safe_init();
+  if (result != CUDA_SUCCESS) {
+    if (result != CUDA_ERROR_NO_DEVICE) {
+      return string("Error initializing CUDA: ") + cuewErrorString(result);
+    }
+    return "No CUDA device found\n";
+  }
+
+  int count;
+  result = cuDeviceGetCount(&count);
+  if (result != CUDA_SUCCESS) {
+    return string("Error getting devices: ") + cuewErrorString(result);
+  }
+
+  string capabilities = "";
+  for (int num = 0; num < count; num++) {
+    char name[256];
+    if (cuDeviceGetName(name, 256, num) != CUDA_SUCCESS) {
+      continue;
+    }
+    capabilities += string("\t") + name + "\n";
+    int value;
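+    /* Append one device attribute to the report; attributes the driver
+     * cannot query are silently skipped. */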
 #define GET_ATTR(attr) \
-               { \
-                       if(cuDeviceGetAttribute(&value, \
-                                               CU_DEVICE_ATTRIBUTE_##attr, \
-                                               num) == CUDA_SUCCESS) \
-                       { \
-                               capabilities += string_printf("\t\tCU_DEVICE_ATTRIBUTE_" #attr "\t\t\t%d\n", \
-                                                             value); \
-                       } \
-               } (void) 0
-               /* TODO(sergey): Strip all attributes which are not useful for us
-                * or does not depend on the driver.
-                */
-               GET_ATTR(MAX_THREADS_PER_BLOCK);
-               GET_ATTR(MAX_BLOCK_DIM_X);
-               GET_ATTR(MAX_BLOCK_DIM_Y);
-               GET_ATTR(MAX_BLOCK_DIM_Z);
-               GET_ATTR(MAX_GRID_DIM_X);
-               GET_ATTR(MAX_GRID_DIM_Y);
-               GET_ATTR(MAX_GRID_DIM_Z);
-               GET_ATTR(MAX_SHARED_MEMORY_PER_BLOCK);
-               GET_ATTR(SHARED_MEMORY_PER_BLOCK);
-               GET_ATTR(TOTAL_CONSTANT_MEMORY);
-               GET_ATTR(WARP_SIZE);
-               GET_ATTR(MAX_PITCH);
-               GET_ATTR(MAX_REGISTERS_PER_BLOCK);
-               GET_ATTR(REGISTERS_PER_BLOCK);
-               GET_ATTR(CLOCK_RATE);
-               GET_ATTR(TEXTURE_ALIGNMENT);
-               GET_ATTR(GPU_OVERLAP);
-               GET_ATTR(MULTIPROCESSOR_COUNT);
-               GET_ATTR(KERNEL_EXEC_TIMEOUT);
-               GET_ATTR(INTEGRATED);
-               GET_ATTR(CAN_MAP_HOST_MEMORY);
-               GET_ATTR(COMPUTE_MODE);
-               GET_ATTR(MAXIMUM_TEXTURE1D_WIDTH);
-               GET_ATTR(MAXIMUM_TEXTURE2D_WIDTH);
-               GET_ATTR(MAXIMUM_TEXTURE2D_HEIGHT);
-               GET_ATTR(MAXIMUM_TEXTURE3D_WIDTH);
-               GET_ATTR(MAXIMUM_TEXTURE3D_HEIGHT);
-               GET_ATTR(MAXIMUM_TEXTURE3D_DEPTH);
-               GET_ATTR(MAXIMUM_TEXTURE2D_LAYERED_WIDTH);
-               GET_ATTR(MAXIMUM_TEXTURE2D_LAYERED_HEIGHT);
-               GET_ATTR(MAXIMUM_TEXTURE2D_LAYERED_LAYERS);
-               GET_ATTR(MAXIMUM_TEXTURE2D_ARRAY_WIDTH);
-               GET_ATTR(MAXIMUM_TEXTURE2D_ARRAY_HEIGHT);
-               GET_ATTR(MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES);
-               GET_ATTR(SURFACE_ALIGNMENT);
-               GET_ATTR(CONCURRENT_KERNELS);
-               GET_ATTR(ECC_ENABLED);
-               GET_ATTR(TCC_DRIVER);
-               GET_ATTR(MEMORY_CLOCK_RATE);
-               GET_ATTR(GLOBAL_MEMORY_BUS_WIDTH);
-               GET_ATTR(L2_CACHE_SIZE);
-               GET_ATTR(MAX_THREADS_PER_MULTIPROCESSOR);
-               GET_ATTR(ASYNC_ENGINE_COUNT);
-               GET_ATTR(UNIFIED_ADDRESSING);
-               GET_ATTR(MAXIMUM_TEXTURE1D_LAYERED_WIDTH);
-               GET_ATTR(MAXIMUM_TEXTURE1D_LAYERED_LAYERS);
-               GET_ATTR(CAN_TEX2D_GATHER);
-               GET_ATTR(MAXIMUM_TEXTURE2D_GATHER_WIDTH);
-               GET_ATTR(MAXIMUM_TEXTURE2D_GATHER_HEIGHT);
-               GET_ATTR(MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE);
-               GET_ATTR(MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE);
-               GET_ATTR(MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE);
-               GET_ATTR(TEXTURE_PITCH_ALIGNMENT);
-               GET_ATTR(MAXIMUM_TEXTURECUBEMAP_WIDTH);
-               GET_ATTR(MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH);
-               GET_ATTR(MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS);
-               GET_ATTR(MAXIMUM_SURFACE1D_WIDTH);
-               GET_ATTR(MAXIMUM_SURFACE2D_WIDTH);
-               GET_ATTR(MAXIMUM_SURFACE2D_HEIGHT);
-               GET_ATTR(MAXIMUM_SURFACE3D_WIDTH);
-               GET_ATTR(MAXIMUM_SURFACE3D_HEIGHT);
-               GET_ATTR(MAXIMUM_SURFACE3D_DEPTH);
-               GET_ATTR(MAXIMUM_SURFACE1D_LAYERED_WIDTH);
-               GET_ATTR(MAXIMUM_SURFACE1D_LAYERED_LAYERS);
-               GET_ATTR(MAXIMUM_SURFACE2D_LAYERED_WIDTH);
-               GET_ATTR(MAXIMUM_SURFACE2D_LAYERED_HEIGHT);
-               GET_ATTR(MAXIMUM_SURFACE2D_LAYERED_LAYERS);
-               GET_ATTR(MAXIMUM_SURFACECUBEMAP_WIDTH);
-               GET_ATTR(MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH);
-               GET_ATTR(MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS);
-               GET_ATTR(MAXIMUM_TEXTURE1D_LINEAR_WIDTH);
-               GET_ATTR(MAXIMUM_TEXTURE2D_LINEAR_WIDTH);
-               GET_ATTR(MAXIMUM_TEXTURE2D_LINEAR_HEIGHT);
-               GET_ATTR(MAXIMUM_TEXTURE2D_LINEAR_PITCH);
-               GET_ATTR(MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH);
-               GET_ATTR(MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT);
-               GET_ATTR(COMPUTE_CAPABILITY_MAJOR);
-               GET_ATTR(COMPUTE_CAPABILITY_MINOR);
-               GET_ATTR(MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH);
-               GET_ATTR(STREAM_PRIORITIES_SUPPORTED);
-               GET_ATTR(GLOBAL_L1_CACHE_SUPPORTED);
-               GET_ATTR(LOCAL_L1_CACHE_SUPPORTED);
-               GET_ATTR(MAX_SHARED_MEMORY_PER_MULTIPROCESSOR);
-               GET_ATTR(MAX_REGISTERS_PER_MULTIPROCESSOR);
-               GET_ATTR(MANAGED_MEMORY);
-               GET_ATTR(MULTI_GPU_BOARD);
-               GET_ATTR(MULTI_GPU_BOARD_GROUP_ID);
+  { \
+    if (cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_##attr, num) == CUDA_SUCCESS) { \
+      capabilities += string_printf("\t\tCU_DEVICE_ATTRIBUTE_" #attr "\t\t\t%d\n", value); \
+    } \
+  } \
+  (void)0
+    /* TODO(sergey): Strip all attributes that are not useful for us
+     * or do not depend on the driver.
+     */
+    GET_ATTR(MAX_THREADS_PER_BLOCK);
+    GET_ATTR(MAX_BLOCK_DIM_X);
+    GET_ATTR(MAX_BLOCK_DIM_Y);
+    GET_ATTR(MAX_BLOCK_DIM_Z);
+    GET_ATTR(MAX_GRID_DIM_X);
+    GET_ATTR(MAX_GRID_DIM_Y);
+    GET_ATTR(MAX_GRID_DIM_Z);
+    GET_ATTR(MAX_SHARED_MEMORY_PER_BLOCK);
+    GET_ATTR(SHARED_MEMORY_PER_BLOCK);
+    GET_ATTR(TOTAL_CONSTANT_MEMORY);
+    GET_ATTR(WARP_SIZE);
+    GET_ATTR(MAX_PITCH);
+    GET_ATTR(MAX_REGISTERS_PER_BLOCK);
+    GET_ATTR(REGISTERS_PER_BLOCK);
+    GET_ATTR(CLOCK_RATE);
+    GET_ATTR(TEXTURE_ALIGNMENT);
+    GET_ATTR(GPU_OVERLAP);
+    GET_ATTR(MULTIPROCESSOR_COUNT);
+    GET_ATTR(KERNEL_EXEC_TIMEOUT);
+    GET_ATTR(INTEGRATED);
+    GET_ATTR(CAN_MAP_HOST_MEMORY);
+    GET_ATTR(COMPUTE_MODE);
+    GET_ATTR(MAXIMUM_TEXTURE1D_WIDTH);
+    GET_ATTR(MAXIMUM_TEXTURE2D_WIDTH);
+    GET_ATTR(MAXIMUM_TEXTURE2D_HEIGHT);
+    GET_ATTR(MAXIMUM_TEXTURE3D_WIDTH);
+    GET_ATTR(MAXIMUM_TEXTURE3D_HEIGHT);
+    GET_ATTR(MAXIMUM_TEXTURE3D_DEPTH);
+    GET_ATTR(MAXIMUM_TEXTURE2D_LAYERED_WIDTH);
+    GET_ATTR(MAXIMUM_TEXTURE2D_LAYERED_HEIGHT);
+    GET_ATTR(MAXIMUM_TEXTURE2D_LAYERED_LAYERS);
+    GET_ATTR(MAXIMUM_TEXTURE2D_ARRAY_WIDTH);
+    GET_ATTR(MAXIMUM_TEXTURE2D_ARRAY_HEIGHT);
+    GET_ATTR(MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES);
+    GET_ATTR(SURFACE_ALIGNMENT);
+    GET_ATTR(CONCURRENT_KERNELS);
+    GET_ATTR(ECC_ENABLED);
+    GET_ATTR(TCC_DRIVER);
+    GET_ATTR(MEMORY_CLOCK_RATE);
+    GET_ATTR(GLOBAL_MEMORY_BUS_WIDTH);
+    GET_ATTR(L2_CACHE_SIZE);
+    GET_ATTR(MAX_THREADS_PER_MULTIPROCESSOR);
+    GET_ATTR(ASYNC_ENGINE_COUNT);
+    GET_ATTR(UNIFIED_ADDRESSING);
+    GET_ATTR(MAXIMUM_TEXTURE1D_LAYERED_WIDTH);
+    GET_ATTR(MAXIMUM_TEXTURE1D_LAYERED_LAYERS);
+    GET_ATTR(CAN_TEX2D_GATHER);
+    GET_ATTR(MAXIMUM_TEXTURE2D_GATHER_WIDTH);
+    GET_ATTR(MAXIMUM_TEXTURE2D_GATHER_HEIGHT);
+    GET_ATTR(MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE);
+    GET_ATTR(MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE);
+    GET_ATTR(MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE);
+    GET_ATTR(TEXTURE_PITCH_ALIGNMENT);
+    GET_ATTR(MAXIMUM_TEXTURECUBEMAP_WIDTH);
+    GET_ATTR(MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH);
+    GET_ATTR(MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS);
+    GET_ATTR(MAXIMUM_SURFACE1D_WIDTH);
+    GET_ATTR(MAXIMUM_SURFACE2D_WIDTH);
+    GET_ATTR(MAXIMUM_SURFACE2D_HEIGHT);
+    GET_ATTR(MAXIMUM_SURFACE3D_WIDTH);
+    GET_ATTR(MAXIMUM_SURFACE3D_HEIGHT);
+    GET_ATTR(MAXIMUM_SURFACE3D_DEPTH);
+    GET_ATTR(MAXIMUM_SURFACE1D_LAYERED_WIDTH);
+    GET_ATTR(MAXIMUM_SURFACE1D_LAYERED_LAYERS);
+    GET_ATTR(MAXIMUM_SURFACE2D_LAYERED_WIDTH);
+    GET_ATTR(MAXIMUM_SURFACE2D_LAYERED_HEIGHT);
+    GET_ATTR(MAXIMUM_SURFACE2D_LAYERED_LAYERS);
+    GET_ATTR(MAXIMUM_SURFACECUBEMAP_WIDTH);
+    GET_ATTR(MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH);
+    GET_ATTR(MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS);
+    GET_ATTR(MAXIMUM_TEXTURE1D_LINEAR_WIDTH);
+    GET_ATTR(MAXIMUM_TEXTURE2D_LINEAR_WIDTH);
+    GET_ATTR(MAXIMUM_TEXTURE2D_LINEAR_HEIGHT);
+    GET_ATTR(MAXIMUM_TEXTURE2D_LINEAR_PITCH);
+    GET_ATTR(MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH);
+    GET_ATTR(MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT);
+    GET_ATTR(COMPUTE_CAPABILITY_MAJOR);
+    GET_ATTR(COMPUTE_CAPABILITY_MINOR);
+    GET_ATTR(MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH);
+    GET_ATTR(STREAM_PRIORITIES_SUPPORTED);
+    GET_ATTR(GLOBAL_L1_CACHE_SUPPORTED);
+    GET_ATTR(LOCAL_L1_CACHE_SUPPORTED);
+    GET_ATTR(MAX_SHARED_MEMORY_PER_MULTIPROCESSOR);
+    GET_ATTR(MAX_REGISTERS_PER_MULTIPROCESSOR);
+    GET_ATTR(MANAGED_MEMORY);
+    GET_ATTR(MULTI_GPU_BOARD);
+    GET_ATTR(MULTI_GPU_BOARD_GROUP_ID);
 #undef GET_ATTR
-               capabilities += "\n";
-       }
+    capabilities += "\n";
+  }
 
-       return capabilities;
+  return capabilities;
 }
 
 CCL_NAMESPACE_END