Cycles: Use existing shared temporary memory in reconstruction step of the denoiser
authorLukas Stockner <lukas.stockner@freenet.de>
Mon, 8 Oct 2018 20:13:40 +0000 (22:13 +0200)
committerLukas Stockner <lukas.stockner@freenet.de>
Mon, 8 Oct 2018 20:13:40 +0000 (22:13 +0200)
Previously the code allocated its own temporary memory, but it's possible to just use the existing shared one instead.

intern/cycles/device/device_cuda.cpp
intern/cycles/device/opencl/opencl_base.cpp
intern/cycles/kernel/kernels/cuda/filter.cu
intern/cycles/kernel/kernels/opencl/filter.cl

index 7d9a13ecc8871d837e2ce4db642fd77249be35b4..5b46d5a507d44ab13afcbc72a7fbc7023acecd29 100644 (file)
@@ -1397,18 +1397,14 @@ public:
                int h = task->reconstruction_state.source_h;
                int stride = task->buffer.stride;
 
-               int shift_stride = stride*h;
+               int pass_stride = task->buffer.pass_stride;
                int num_shifts = (2*r+1)*(2*r+1);
-               int mem_size = sizeof(float)*shift_stride*num_shifts;
-
-               device_only_memory<uchar> temporary_mem(this, "Denoising temporary_mem");
-               temporary_mem.alloc_to_device(2*mem_size);
 
                if(have_error())
                        return false;
 
-               CUdeviceptr difference     = cuda_device_ptr(temporary_mem.device_pointer);
-               CUdeviceptr blurDifference = difference + mem_size;
+               CUdeviceptr difference     = cuda_device_ptr(task->buffer.temporary_mem.device_pointer);
+               CUdeviceptr blurDifference = difference + sizeof(float)*pass_stride*num_shifts;
 
                {
                        CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian;
@@ -1426,9 +1422,9 @@ public:
                                             task->reconstruction_state.source_w * task->reconstruction_state.source_h,
                                             num_shifts);
 
-                       void *calc_difference_args[] = {&color_ptr, &color_variance_ptr, &difference, &w, &h, &stride, &shift_stride, &r, &task->buffer.pass_stride, &a, &k_2};
-                       void *blur_args[]            = {&difference, &blurDifference, &w, &h, &stride, &shift_stride, &r, &f};
-                       void *calc_weight_args[]     = {&blurDifference, &difference, &w, &h, &stride, &shift_stride, &r, &f};
+                       void *calc_difference_args[] = {&color_ptr, &color_variance_ptr, &difference, &w, &h, &stride, &pass_stride, &r, &pass_stride, &a, &k_2};
+                       void *blur_args[]            = {&difference, &blurDifference, &w, &h, &stride, &pass_stride, &r, &f};
+                       void *calc_weight_args[]     = {&blurDifference, &difference, &w, &h, &stride, &pass_stride, &r, &f};
                        void *construct_gramian_args[] = {&blurDifference,
                                                          &task->buffer.mem.device_pointer,
                                                          &task->storage.transform.device_pointer,
@@ -1437,9 +1433,8 @@ public:
                                                          &task->storage.XtWY.device_pointer,
                                                          &task->reconstruction_state.filter_window,
                                                          &w, &h, &stride,
-                                                         &shift_stride, &r,
-                                                         &f,
-                                                     &task->buffer.pass_stride};
+                                                         &pass_stride, &r,
+                                                         &f};
 
                        CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
                        CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
@@ -1448,8 +1443,6 @@ public:
                        CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args);
                }
 
-               temporary_mem.free();
-
                {
                        CUfunction cuFinalize;
                        cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize"));
index cc887134bb05ba37f67cc60af7e323b159e46220..75418dad1ccc9eb0c11d88a382e9f4e81babebb7 100644 (file)
@@ -865,38 +865,38 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
        int h = task->reconstruction_state.source_h;
        int stride = task->buffer.stride;
 
-       int shift_stride = stride*h;
-       int num_shifts = (2*task->radius + 1)*(2*task->radius + 1);
-       int mem_size = sizeof(float)*shift_stride*num_shifts;
+       int r = task->radius;
+       int pass_stride = task->buffer.pass_stride;
+       int num_shifts = (2*r-+1)*(2*r+1);
 
-       cl_mem difference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr);
-       opencl_assert_err(ciErr, "clCreateBuffer denoising_reconstruct");
-       cl_mem blurDifference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr);
-       opencl_assert_err(ciErr, "clCreateBuffer denoising_reconstruct");
+       device_sub_ptr difference(task->buffer.temporary_mem, 0, pass_stride*num_shifts);
+       device_sub_ptr blurDifference(task->buffer.temporary_mem, pass_stride*num_shifts, pass_stride*num_shifts);
+       cl_mem difference_mem = CL_MEM_PTR(*difference);
+       cl_mem blurDifference_mem = CL_MEM_PTR(*blurDifference);
 
        kernel_set_args(ckNLMCalcDifference, 0,
                        color_mem,
                        color_variance_mem,
-                       difference,
+                       difference_mem,
                        w, h, stride,
-                       shift_stride,
-                       task->radius,
-                       task->buffer.pass_stride,
+                       pass_stride,
+                       r,
+                       pass_stride,
                        1.0f, task->nlm_k_2);
        kernel_set_args(ckNLMBlur, 0,
-                       difference,
-                       blurDifference,
+                       difference_mem,
+                       blurDifference_mem,
                        w, h, stride,
-                       shift_stride,
-                       task->radius, 4);
+                       pass_stride,
+                       r, 4);
        kernel_set_args(ckNLMCalcWeight, 0,
-                       blurDifference,
-                       difference,
+                       blurDifference_mem,
+                       difference_mem,
                        w, h, stride,
-                       shift_stride,
-                       task->radius, 4);
+                       pass_stride,
+                       r, 4);
        kernel_set_args(ckNLMConstructGramian, 0,
-                       blurDifference,
+                       blurDifference_mem,
                        buffer_mem,
                        transform_mem,
                        rank_mem,
@@ -904,9 +904,8 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
                        XtWY_mem,
                        task->reconstruction_state.filter_window,
                        w, h, stride,
-                       shift_stride,
-                       task->radius, 4,
-                       task->buffer.pass_stride);
+                       pass_stride,
+                       r, 4);
 
        enqueue_kernel(ckNLMCalcDifference,   w*h, num_shifts, true);
        enqueue_kernel(ckNLMBlur,             w*h, num_shifts, true);
@@ -914,9 +913,6 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
        enqueue_kernel(ckNLMBlur,             w*h, num_shifts, true);
        enqueue_kernel(ckNLMConstructGramian, w*h, num_shifts, true, 256);
 
-       opencl_assert(clReleaseMemObject(difference));
-       opencl_assert(clReleaseMemObject(blurDifference));
-
        kernel_set_args(ckFinalize, 0,
                        output_mem,
                        rank_mem,
index 0561c40e6b18ba5b15143f46a9ef119d820265cd..b856cbde45c487fdab9d7c127465951bcf6e2008 100644 (file)
@@ -140,7 +140,7 @@ kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image,
                                        int w,
                                        int h,
                                        int stride,
-                                       int shift_stride,
+                                       int pass_stride,
                                        int r,
                                        int channel_offset,
                                        float a,
@@ -148,7 +148,7 @@ kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image,
 {
        int4 co, rect;
        int ofs;
-       if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+       if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
                kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w,
                                                  weight_image,
                                                  variance_image,
@@ -165,13 +165,13 @@ kernel_cuda_filter_nlm_blur(const float *ccl_restrict difference_image,
                             int w,
                             int h,
                             int stride,
-                            int shift_stride,
+                            int pass_stride,
                             int r,
                             int f)
 {
        int4 co, rect;
        int ofs;
-       if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+       if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
                kernel_filter_nlm_blur(co.x, co.y,
                                       difference_image + ofs,
                                       out_image + ofs,
@@ -186,13 +186,13 @@ kernel_cuda_filter_nlm_calc_weight(const float *ccl_restrict difference_image,
                                    int w,
                                    int h,
                                    int stride,
-                                   int shift_stride,
+                                   int pass_stride,
                                    int r,
                                    int f)
 {
        int4 co, rect;
        int ofs;
-       if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+       if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
                kernel_filter_nlm_calc_weight(co.x, co.y,
                                              difference_image + ofs,
                                              out_image + ofs,
@@ -209,13 +209,13 @@ kernel_cuda_filter_nlm_update_output(const float *ccl_restrict difference_image,
                                      int w,
                                      int h,
                                      int stride,
-                                     int shift_stride,
+                                     int pass_stride,
                                      int r,
                                      int f)
 {
        int4 co, rect;
        int ofs;
-       if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+       if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
                kernel_filter_nlm_update_output(co.x, co.y, co.z, co.w,
                                                difference_image + ofs,
                                                image,
@@ -252,14 +252,13 @@ kernel_cuda_filter_nlm_construct_gramian(const float *ccl_restrict difference_im
                                          int w,
                                          int h,
                                          int stride,
-                                         int shift_stride,
+                                         int pass_stride,
                                          int r,
-                                         int f,
-                                         int pass_stride)
+                                         int f)
 {
        int4 co, rect;
        int ofs;
-       if(get_nlm_coords_window(w, h, r, shift_stride, &rect, &co, &ofs, filter_window)) {
+       if(get_nlm_coords_window(w, h, r, pass_stride, &rect, &co, &ofs, filter_window)) {
                kernel_filter_nlm_construct_gramian(co.x, co.y,
                                                    co.z, co.w,
                                                    difference_image + ofs,
index 3c75754fb3917853be7586f776908f4312eb2bb0..a550f97f4eb6839d7f6266fa5c8567e71220adc3 100644 (file)
@@ -132,7 +132,7 @@ __kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_
                                                     int w,
                                                     int h,
                                                     int stride,
-                                                    int shift_stride,
+                                                    int pass_stride,
                                                     int r,
                                                     int channel_offset,
                                                     float a,
@@ -140,7 +140,7 @@ __kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_
 {
        int4 co, rect;
        int ofs;
-       if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+       if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
                kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w,
                                                  weight_image,
                                                  variance_image,
@@ -155,13 +155,13 @@ __kernel void kernel_ocl_filter_nlm_blur(const ccl_global float *ccl_restrict di
                                          int w,
                                          int h,
                                          int stride,
-                                         int shift_stride,
+                                         int pass_stride,
                                          int r,
                                          int f)
 {
        int4 co, rect;
        int ofs;
-       if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+       if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
                kernel_filter_nlm_blur(co.x, co.y,
                                       difference_image + ofs,
                                       out_image + ofs,
@@ -174,13 +174,13 @@ __kernel void kernel_ocl_filter_nlm_calc_weight(const ccl_global float *ccl_rest
                                                 int w,
                                                 int h,
                                                 int stride,
-                                                int shift_stride,
+                                                int pass_stride,
                                                 int r,
                                                 int f)
 {
        int4 co, rect;
        int ofs;
-       if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+       if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
                kernel_filter_nlm_calc_weight(co.x, co.y,
                                              difference_image + ofs,
                                              out_image + ofs,
@@ -195,13 +195,13 @@ __kernel void kernel_ocl_filter_nlm_update_output(const ccl_global float *ccl_re
                                                   int w,
                                                   int h,
                                                   int stride,
-                                                  int shift_stride,
+                                                  int pass_stride,
                                                   int r,
                                                   int f)
 {
        int4 co, rect;
        int ofs;
-       if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+       if(get_nlm_coords(w, h, r, pass_stride, &rect, &co, &ofs)) {
                kernel_filter_nlm_update_output(co.x, co.y, co.z, co.w,
                                                difference_image + ofs,
                                                image,
@@ -234,14 +234,13 @@ __kernel void kernel_ocl_filter_nlm_construct_gramian(const ccl_global float *cc
                                                       int w,
                                                       int h,
                                                       int stride,
-                                                      int shift_stride,
+                                                      int pass_stride,
                                                       int r,
-                                                      int f,
-                                                      int pass_stride)
+                                                      int f)
 {
        int4 co, rect;
        int ofs;
-       if(get_nlm_coords_window(w, h, r, shift_stride, &rect, &co, &ofs, filter_window)) {
+       if(get_nlm_coords_window(w, h, r, pass_stride, &rect, &co, &ofs, filter_window)) {
                kernel_filter_nlm_construct_gramian(co.x, co.y,
                                                    co.z, co.w,
                                                    difference_image + ofs,