Cycles Denoising: Pass tile buffers to every OpenCL kernel to conform to standard...
authorLukas Stockner <lukas.stockner@freenet.de>
Wed, 4 Jul 2018 12:02:38 +0000 (14:02 +0200)
committerLukas Stockner <lukas.stockner@freenet.de>
Wed, 4 Jul 2018 12:38:03 +0000 (14:38 +0200)
intern/cycles/device/device_cpu.cpp
intern/cycles/device/device_cuda.cpp
intern/cycles/device/device_denoising.cpp
intern/cycles/device/device_denoising.h
intern/cycles/device/opencl/opencl.h
intern/cycles/device/opencl/opencl_base.cpp
intern/cycles/kernel/filter/filter_defines.h
intern/cycles/kernel/filter/filter_prefilter.h
intern/cycles/kernel/kernels/opencl/filter.cl

index c54ac1e5933a01826ebb3dcedd74ce831539ef86..be0dcc207557ea08dbd22c1df968802004006150 100644 (file)
@@ -459,18 +459,6 @@ public:
                }
        };
 
-       bool denoising_set_tile_info(device_ptr *buffers, DenoisingTask *task)
-       {
-               TileInfo *tile_info = (TileInfo*) task->tile_info_mem.host_pointer;
-               for(int i = 0; i < 9; i++) {
-                       tile_info->buffers[i] = buffers[i];
-               }
-
-               task->tile_info_mem.copy_to_device();
-
-               return true;
-       }
-
        bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr,
                                       DenoisingTask *task)
        {
@@ -722,7 +710,6 @@ public:
                denoising.functions.combine_halves = function_bind(&CPUDevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
                denoising.functions.get_feature = function_bind(&CPUDevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
                denoising.functions.detect_outliers = function_bind(&CPUDevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
-               denoising.functions.set_tile_info = function_bind(&CPUDevice::denoising_set_tile_info, this, _1, &denoising);
 
                denoising.filter_area = make_int4(tile.x, tile.y, tile.w, tile.h);
                denoising.render_buffer.samples = tile.sample;
index c8e0ea4d896f313798b7167f89ab6705df0371b2..8294af340e8659032c0516284cbb39f549e5974c 100644 (file)
@@ -1251,18 +1251,6 @@ public:
                }
        }
 
-       bool denoising_set_tile_info(device_ptr *buffers, DenoisingTask *task)
-       {
-               TileInfo *tile_info = (TileInfo*) task->tile_info_mem.host_pointer;
-               for(int i = 0; i < 9; i++) {
-                       tile_info->buffers[i] = buffers[i];
-               }
-
-               task->tile_info_mem.copy_to_device();
-
-               return !have_error();
-       }
-
 #define CUDA_GET_BLOCKSIZE(func, w, h)                                                                          \
                        int threads_per_block;                                                                              \
                        cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
@@ -1622,7 +1610,6 @@ public:
                denoising.functions.combine_halves = function_bind(&CUDADevice::denoising_combine_halves, this, _1, _2, _3, _4, _5, _6, &denoising);
                denoising.functions.get_feature = function_bind(&CUDADevice::denoising_get_feature, this, _1, _2, _3, _4, &denoising);
                denoising.functions.detect_outliers = function_bind(&CUDADevice::denoising_detect_outliers, this, _1, _2, _3, _4, &denoising);
-               denoising.functions.set_tile_info = function_bind(&CUDADevice::denoising_set_tile_info, this, _1, &denoising);
 
                denoising.filter_area = make_int4(rtile.x, rtile.y, rtile.w, rtile.h);
                denoising.render_buffer.samples = rtile.sample;
index 318b14ab49991b48771e4e7005a8f6b72f63ac9e..c0d4634262d172e938af6ca74a014d21ec50177b 100644 (file)
@@ -62,11 +62,10 @@ void DenoisingTask::set_render_buffer(RenderTile *rtiles)
 {
        tile_info = (TileInfo*) tile_info_mem.alloc(sizeof(TileInfo)/sizeof(int));
 
-       device_ptr buffers[9];
        for(int i = 0; i < 9; i++) {
-               buffers[i] = rtiles[i].buffer;
                tile_info->offsets[i] = rtiles[i].offset;
                tile_info->strides[i] = rtiles[i].stride;
+               tile_info->buffers[i] = rtiles[i].buffer;
        }
        tile_info->x[0] = rtiles[3].x;
        tile_info->x[1] = rtiles[4].x;
@@ -81,7 +80,7 @@ void DenoisingTask::set_render_buffer(RenderTile *rtiles)
        target_buffer.stride = rtiles[9].stride;
        target_buffer.ptr    = rtiles[9].buffer;
 
-       functions.set_tile_info(buffers);
+       tile_info_mem.copy_to_device();
 }
 
 void DenoisingTask::setup_denoising_buffer()
index 1063d43d0397b8ffa2d6b5ed1404ff2c8e0128ea..e0da52867f1d97c2183e8c1293cd29e4622fbc0c 100644 (file)
@@ -89,7 +89,6 @@ public:
                              device_ptr depth_ptr,
                              device_ptr output_ptr
                              )> detect_outliers;
-               function<bool(device_ptr*)> set_tile_info;
                function<void(RenderTile *rtiles)> map_neighbor_tiles;
                function<void(RenderTile *rtiles)> unmap_neighbor_tiles;
        } functions;
index c550d738bd50fc0bd0de0d8029ff8d75bd7fa456..22e0503365cd8f2d8875a4418d8c4bcb405789fb 100644 (file)
@@ -436,8 +436,6 @@ protected:
                                       device_ptr depth_ptr,
                                       device_ptr output_ptr,
                                       DenoisingTask *task);
-       bool denoising_set_tile_info(device_ptr *buffers,
-                                    DenoisingTask *task);
 
        device_ptr mem_alloc_sub_ptr(device_memory& mem, int offset, int size);
        void mem_free_sub_ptr(device_ptr ptr);
index 50d371e4c55fe930f7bedc77aef6359041684216..9a50d2173210d856acc7a59a2ef3cdc417bc8c62 100644 (file)
@@ -246,7 +246,6 @@ bool OpenCLDeviceBase::load_kernels(const DeviceRequestedFeatures& requested_fea
        denoising_program.add_kernel(ustring("filter_nlm_normalize"));
        denoising_program.add_kernel(ustring("filter_nlm_construct_gramian"));
        denoising_program.add_kernel(ustring("filter_finalize"));
-       denoising_program.add_kernel(ustring("filter_set_tile_info"));
 
        vector<OpenCLProgram*> programs;
        programs.push_back(&base_program);
@@ -981,9 +980,16 @@ bool OpenCLDeviceBase::denoising_divide_shadow(device_ptr a_ptr,
 
        cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow"));
 
-       kernel_set_args(ckFilterDivideShadow, 0,
-                       task->render_buffer.samples,
-                       tile_info_mem,
+       int arg_ofs = kernel_set_args(ckFilterDivideShadow, 0,
+                                     task->render_buffer.samples,
+                                     tile_info_mem);
+       cl_mem buffers[9];
+       for(int i = 0; i < 9; i++) {
+               buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]);
+               arg_ofs += kernel_set_args(ckFilterDivideShadow, arg_ofs,
+                                          buffers[i]);
+       }
+       kernel_set_args(ckFilterDivideShadow, arg_ofs,
                        a_mem,
                        b_mem,
                        sample_variance_mem,
@@ -1012,9 +1018,16 @@ bool OpenCLDeviceBase::denoising_get_feature(int mean_offset,
 
        cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature"));
 
-       kernel_set_args(ckFilterGetFeature, 0,
-                       task->render_buffer.samples,
-                       tile_info_mem,
+       int arg_ofs = kernel_set_args(ckFilterGetFeature, 0,
+                                     task->render_buffer.samples,
+                                     tile_info_mem);
+       cl_mem buffers[9];
+       for(int i = 0; i < 9; i++) {
+               buffers[i] = CL_MEM_PTR(task->tile_info->buffers[i]);
+               arg_ofs += kernel_set_args(ckFilterGetFeature, arg_ofs,
+                                          buffers[i]);
+       }
+       kernel_set_args(ckFilterGetFeature, arg_ofs,
                        mean_offset,
                        variance_offset,
                        mean_mem,
@@ -1056,29 +1069,8 @@ bool OpenCLDeviceBase::denoising_detect_outliers(device_ptr image_ptr,
        return true;
 }
 
-bool OpenCLDeviceBase::denoising_set_tile_info(device_ptr *buffers,
-                                               DenoisingTask *task)
-{
-       task->tile_info_mem.copy_to_device();
-
-       cl_mem tile_info_mem = CL_MEM_PTR(task->tile_info_mem.device_pointer);
-
-       cl_kernel ckFilterSetTileInfo = denoising_program(ustring("filter_set_tile_info"));
-
-       kernel_set_args(ckFilterSetTileInfo, 0, tile_info_mem);
-       for(int i = 0; i < 9; i++) {
-               cl_mem buffer_mem = CL_MEM_PTR(buffers[i]);
-               kernel_set_args(ckFilterSetTileInfo, i+1, buffer_mem);
-       }
-
-       enqueue_kernel(ckFilterSetTileInfo, 1, 1);
-
-       return true;
-}
-
 void OpenCLDeviceBase::denoise(RenderTile &rtile, DenoisingTask& denoising)
 {
-       denoising.functions.set_tile_info = function_bind(&OpenCLDeviceBase::denoising_set_tile_info, this, _1, &denoising);
        denoising.functions.construct_transform = function_bind(&OpenCLDeviceBase::denoising_construct_transform, this, &denoising);
        denoising.functions.reconstruct = function_bind(&OpenCLDeviceBase::denoising_reconstruct, this, _1, _2, _3, &denoising);
        denoising.functions.divide_shadow = function_bind(&OpenCLDeviceBase::denoising_divide_shadow, this, _1, _2, _3, _4, _5, &denoising);
index 57d3d90594feb1cc7b80e10f58ca94b98c2af321..d48ea3ac1d61f20b9d033c22a67220c255d9c835 100644 (file)
@@ -35,4 +35,29 @@ typedef struct TileInfo {
 #endif
 } TileInfo;
 
+#ifdef __KERNEL_OPENCL__
+#  define CCL_FILTER_TILE_INFO ccl_global TileInfo* tile_info,  \
+                               ccl_global float *tile_buffer_1, \
+                               ccl_global float *tile_buffer_2, \
+                               ccl_global float *tile_buffer_3, \
+                               ccl_global float *tile_buffer_4, \
+                               ccl_global float *tile_buffer_5, \
+                               ccl_global float *tile_buffer_6, \
+                               ccl_global float *tile_buffer_7, \
+                               ccl_global float *tile_buffer_8, \
+                               ccl_global float *tile_buffer_9
+#  define CCL_FILTER_TILE_INFO_ARG tile_info, \
+                                   tile_buffer_1, tile_buffer_2, tile_buffer_3, \
+                                   tile_buffer_4, tile_buffer_5, tile_buffer_6, \
+                                   tile_buffer_7, tile_buffer_8, tile_buffer_9
+#  define ccl_get_tile_buffer(id) (tile_buffer_ ## id)
+#else
+#  ifdef __KERNEL_CUDA__
+#    define CCL_FILTER_TILE_INFO ccl_global TileInfo* tile_info
+#  else
+#    define CCL_FILTER_TILE_INFO TileInfo* tile_info
+#  endif
+#  define ccl_get_tile_buffer(id) (tile_info->buffers[id])
+#endif
+
 #endif /* __FILTER_DEFINES_H__*/
index 9513bf46bd7278c5eb1ef34eaa782e3ca1eb2ee8..3507f80df4657742da7ee966eee14e9894aa0c3a 100644 (file)
@@ -26,7 +26,7 @@ CCL_NAMESPACE_BEGIN
  * bufferVariance: The buffer-based variance of the shadow feature. Unbiased, but quite noisy.
  */
 ccl_device void kernel_filter_divide_shadow(int sample,
-                                            ccl_global TileInfo *tile_info,
+                                            CCL_FILTER_TILE_INFO,
                                             int x, int y,
                                             ccl_global float *unfilteredA,
                                             ccl_global float *unfilteredB,
@@ -43,7 +43,7 @@ ccl_device void kernel_filter_divide_shadow(int sample,
 
        int offset = tile_info->offsets[tile];
        int stride = tile_info->strides[tile];
-       const ccl_global float *ccl_restrict center_buffer = (ccl_global float*) tile_info->buffers[tile];
+       const ccl_global float *ccl_restrict center_buffer = (ccl_global float*) ccl_get_tile_buffer(tile);
        center_buffer += (y*stride + x + offset)*buffer_pass_stride;
        center_buffer += buffer_denoising_offset + 14;
 
@@ -79,7 +79,7 @@ ccl_device void kernel_filter_divide_shadow(int sample,
  * - rect: The prefilter area (lower pixels inclusive, upper pixels exclusive).
  */
 ccl_device void kernel_filter_get_feature(int sample,
-                                          ccl_global TileInfo *tile_info,
+                                          CCL_FILTER_TILE_INFO,
                                           int m_offset, int v_offset,
                                           int x, int y,
                                           ccl_global float *mean,
@@ -90,7 +90,7 @@ ccl_device void kernel_filter_get_feature(int sample,
        int xtile = (x < tile_info->x[1])? 0: ((x < tile_info->x[2])? 1: 2);
        int ytile = (y < tile_info->y[1])? 0: ((y < tile_info->y[2])? 1: 2);
        int tile = ytile*3+xtile;
-       ccl_global float *center_buffer = ((ccl_global float*) tile_info->buffers[tile]) + (tile_info->offsets[tile] + y*tile_info->strides[tile] + x)*buffer_pass_stride + buffer_denoising_offset;
+       ccl_global float *center_buffer = ((ccl_global float*) ccl_get_tile_buffer(tile)) + (tile_info->offsets[tile] + y*tile_info->strides[tile] + x)*buffer_pass_stride + buffer_denoising_offset;
 
        int buffer_w = align_up(rect.z - rect.x, 4);
        int idx = (y-rect.y)*buffer_w + (x - rect.x);
index d553ee6833ccdcc8d95a52cfb808ec94721ef1d9..3c75754fb3917853be7586f776908f4312eb2bb0 100644 (file)
@@ -23,7 +23,7 @@
 /* kernels */
 
 __kernel void kernel_ocl_filter_divide_shadow(int sample,
-                                              ccl_global TileInfo *tile_info,
+                                              CCL_FILTER_TILE_INFO,
                                               ccl_global float *unfilteredA,
                                               ccl_global float *unfilteredB,
                                               ccl_global float *sampleVariance,
@@ -37,7 +37,7 @@ __kernel void kernel_ocl_filter_divide_shadow(int sample,
        int y = prefilter_rect.y + get_global_id(1);
        if(x < prefilter_rect.z && y < prefilter_rect.w) {
                kernel_filter_divide_shadow(sample,
-                                           tiles,
+                                           CCL_FILTER_TILE_INFO_ARG,
                                            x, y,
                                            unfilteredA,
                                            unfilteredB,
@@ -51,7 +51,7 @@ __kernel void kernel_ocl_filter_divide_shadow(int sample,
 }
 
 __kernel void kernel_ocl_filter_get_feature(int sample,
-                                            ccl_global TileInfo *tile_info,
+                                            CCL_FILTER_TILE_INFO,
                                             int m_offset,
                                             int v_offset,
                                             ccl_global float *mean,
@@ -64,7 +64,7 @@ __kernel void kernel_ocl_filter_get_feature(int sample,
        int y = prefilter_rect.y + get_global_id(1);
        if(x < prefilter_rect.z && y < prefilter_rect.w) {
                kernel_filter_get_feature(sample,
-                                         tiles,
+                                         CCL_FILTER_TILE_INFO_ARG,
                                          m_offset, v_offset,
                                          x, y,
                                          mean, variance,
@@ -276,27 +276,3 @@ __kernel void kernel_ocl_filter_finalize(ccl_global float *buffer,
                                       buffer_params, sample);
        }
 }
-
-__kernel void kernel_ocl_filter_set_tile_info(ccl_global TileInfo* tile_info,
-                                              ccl_global float *buffer_1,
-                                              ccl_global float *buffer_2,
-                                              ccl_global float *buffer_3,
-                                              ccl_global float *buffer_4,
-                                              ccl_global float *buffer_5,
-                                              ccl_global float *buffer_6,
-                                              ccl_global float *buffer_7,
-                                              ccl_global float *buffer_8,
-                                              ccl_global float *buffer_9)
-{
-       if((get_global_id(0) == 0) && (get_global_id(1) == 0)) {
-               tile_info->buffers[0] = buffer_1;
-               tile_info->buffers[1] = buffer_2;
-               tile_info->buffers[2] = buffer_3;
-               tile_info->buffers[3] = buffer_4;
-               tile_info->buffers[4] = buffer_5;
-               tile_info->buffers[5] = buffer_6;
-               tile_info->buffers[6] = buffer_7;
-               tile_info->buffers[7] = buffer_8;
-               tile_info->buffers[8] = buffer_9;
-       }
-}