Cycles: Improve denoising speed on GPUs with small tile sizes
[blender.git] / intern / cycles / device / opencl / opencl_base.cpp
index f43177247ef0f21f43ae18ba285b49f4db0e73ad..fe084edc90ec46140e3ec308549c9b9a846ec9d6 100644 (file)
@@ -560,7 +560,7 @@ size_t OpenCLDeviceBase::global_size_round_up(int group_size, int global_size)
        return global_size + ((r == 0)? 0: group_size - r);
 }
 
-void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size_t max_workgroup_size)
+void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, bool x_workgroups, size_t max_workgroup_size)
 {
        size_t workgroup_size, max_work_items[3];
 
@@ -574,8 +574,15 @@ void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size
        }
 
        /* Try to divide evenly over 2 dimensions. */
-       size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1);
-       size_t local_size[2] = {sqrt_workgroup_size, sqrt_workgroup_size};
+       size_t local_size[2];
+       if(x_workgroups) {
+               local_size[0] = workgroup_size;
+               local_size[1] = 1;
+       }
+       else {
+               size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1);
+               local_size[0] = local_size[1] = sqrt_workgroup_size;
+       }
 
        /* Some implementations have max size 1 on 2nd dimension. */
        if(local_size[1] > max_work_items[1]) {
@@ -731,17 +738,25 @@ bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr,
                                                  device_ptr out_ptr,
                                                  DenoisingTask *task)
 {
-       int4 rect = task->rect;
-       int w = rect.z-rect.x;
-       int h = rect.w-rect.y;
+
+       int stride = task->buffer.stride;
+       int w = task->buffer.width;
+       int h = task->buffer.h;
        int r = task->nlm_state.r;
        int f = task->nlm_state.f;
        float a = task->nlm_state.a;
        float k_2 = task->nlm_state.k_2;
 
-       cl_mem difference     = CL_MEM_PTR(task->nlm_state.temporary_1_ptr);
-       cl_mem blurDifference = CL_MEM_PTR(task->nlm_state.temporary_2_ptr);
-       cl_mem weightAccum    = CL_MEM_PTR(task->nlm_state.temporary_3_ptr);
+       int shift_stride = stride*h;
+       int num_shifts = (2*r+1)*(2*r+1);
+       int mem_size = sizeof(float)*shift_stride*num_shifts;
+
+       cl_mem weightAccum = CL_MEM_PTR(task->nlm_state.temporary_3_ptr);
+
+       cl_mem difference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr);
+       opencl_assert_err(ciErr, "clCreateBuffer denoising_non_local_means");
+       cl_mem blurDifference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr);
+       opencl_assert_err(ciErr, "clCreateBuffer denoising_non_local_means");
 
        cl_mem image_mem = CL_MEM_PTR(image_ptr);
        cl_mem guide_mem = CL_MEM_PTR(guide_ptr);
@@ -757,31 +772,45 @@ bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr,
        cl_kernel ckNLMUpdateOutput   = denoising_program(ustring("filter_nlm_update_output"));
        cl_kernel ckNLMNormalize      = denoising_program(ustring("filter_nlm_normalize"));
 
-       for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
-               int dy = i / (2*r+1) - r;
-               int dx = i % (2*r+1) - r;
-               int4 local_rect = make_int4(max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy));
-               kernel_set_args(ckNLMCalcDifference, 0,
-                               dx, dy, guide_mem, variance_mem,
-                               difference, local_rect, w, 0, a, k_2);
-               kernel_set_args(ckNLMBlur, 0,
-                               difference, blurDifference, local_rect, w, f);
-               kernel_set_args(ckNLMCalcWeight, 0,
-                               blurDifference, difference, local_rect, w, f);
-               kernel_set_args(ckNLMUpdateOutput, 0,
-                               dx, dy, blurDifference, image_mem,
-                               out_mem, weightAccum, local_rect, w, f);
-
-               enqueue_kernel(ckNLMCalcDifference, w, h);
-               enqueue_kernel(ckNLMBlur,           w, h);
-               enqueue_kernel(ckNLMCalcWeight,     w, h);
-               enqueue_kernel(ckNLMBlur,           w, h);
-               enqueue_kernel(ckNLMUpdateOutput,   w, h);
-       }
+       kernel_set_args(ckNLMCalcDifference, 0,
+                       guide_mem,
+                       variance_mem,
+                       difference,
+                       w, h, stride,
+                       shift_stride,
+                       r, 0, a, k_2);
+       kernel_set_args(ckNLMBlur, 0,
+                       difference,
+                       blurDifference,
+                       w, h, stride,
+                       shift_stride,
+                       r, f);
+       kernel_set_args(ckNLMCalcWeight, 0,
+                       blurDifference,
+                       difference,
+                       w, h, stride,
+                       shift_stride,
+                       r, f);
+       kernel_set_args(ckNLMUpdateOutput, 0,
+                       blurDifference,
+                       image_mem,
+                       out_mem,
+                       weightAccum,
+                       w, h, stride,
+                       shift_stride,
+                       r, f);
+
+       enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true);
+       enqueue_kernel(ckNLMBlur,           w*h, num_shifts, true);
+       enqueue_kernel(ckNLMCalcWeight,     w*h, num_shifts, true);
+       enqueue_kernel(ckNLMBlur,           w*h, num_shifts, true);
+       enqueue_kernel(ckNLMUpdateOutput,   w*h, num_shifts, true);
+
+       opencl_assert(clReleaseMemObject(difference));
+       opencl_assert(clReleaseMemObject(blurDifference));
 
-       int4 local_rect = make_int4(0, 0, w, h);
        kernel_set_args(ckNLMNormalize, 0,
-                       out_mem, weightAccum, local_rect, w);
+                       out_mem, weightAccum, w, h, stride);
        enqueue_kernel(ckNLMNormalize, w, h);
 
        return true;
@@ -837,81 +866,63 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
        cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian"));
        cl_kernel ckFinalize            = denoising_program(ustring("filter_finalize"));
 
-       cl_mem difference     = CL_MEM_PTR(task->reconstruction_state.temporary_1_ptr);
-       cl_mem blurDifference = CL_MEM_PTR(task->reconstruction_state.temporary_2_ptr);
-
-       int r = task->radius;
-       int f = 4;
-       float a = 1.0f;
-       for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
-               int dy = i / (2*r+1) - r;
-               int dx = i % (2*r+1) - r;
-
-               int local_rect[4] = {max(0, -dx), max(0, -dy),
-                                    task->reconstruction_state.source_w - max(0, dx),
-                                    task->reconstruction_state.source_h - max(0, dy)};
-
-               kernel_set_args(ckNLMCalcDifference, 0,
-                               dx, dy,
-                               color_mem,
-                               color_variance_mem,
-                               difference,
-                               local_rect,
-                               task->buffer.w,
-                               task->buffer.pass_stride,
-                               a, task->nlm_k_2);
-               enqueue_kernel(ckNLMCalcDifference,
-                              task->reconstruction_state.source_w,
-                              task->reconstruction_state.source_h);
-
-               kernel_set_args(ckNLMBlur, 0,
-                               difference,
-                               blurDifference,
-                               local_rect,
-                               task->buffer.w,
-                               f);
-               enqueue_kernel(ckNLMBlur,
-                              task->reconstruction_state.source_w,
-                              task->reconstruction_state.source_h);
-
-               kernel_set_args(ckNLMCalcWeight, 0,
-                               blurDifference,
-                               difference,
-                               local_rect,
-                               task->buffer.w,
-                               f);
-               enqueue_kernel(ckNLMCalcWeight,
-                              task->reconstruction_state.source_w,
-                              task->reconstruction_state.source_h);
-
-               /* Reuse previous arguments. */
-               enqueue_kernel(ckNLMBlur,
-                              task->reconstruction_state.source_w,
-                              task->reconstruction_state.source_h);
-
-               kernel_set_args(ckNLMConstructGramian, 0,
-                               dx, dy,
-                               blurDifference,
-                               buffer_mem,
-                               transform_mem,
-                               rank_mem,
-                               XtWX_mem,
-                               XtWY_mem,
-                               local_rect,
-                               task->reconstruction_state.filter_rect,
-                               task->buffer.w,
-                               task->buffer.h,
-                               f,
-                           task->buffer.pass_stride);
-               enqueue_kernel(ckNLMConstructGramian,
-                              task->reconstruction_state.source_w,
-                              task->reconstruction_state.source_h,
-                              256);
-       }
+       int w = task->reconstruction_state.source_w;
+       int h = task->reconstruction_state.source_h;
+       int stride = task->buffer.stride;
+
+       int shift_stride = stride*h;
+       int num_shifts = (2*task->radius + 1)*(2*task->radius + 1);
+       int mem_size = sizeof(float)*shift_stride*num_shifts;
+
+       cl_mem difference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr);
+       opencl_assert_err(ciErr, "clCreateBuffer denoising_reconstruct");
+       cl_mem blurDifference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr);
+       opencl_assert_err(ciErr, "clCreateBuffer denoising_reconstruct");
+
+       kernel_set_args(ckNLMCalcDifference, 0,
+                       color_mem,
+                       color_variance_mem,
+                       difference,
+                       w, h, stride,
+                       shift_stride,
+                       task->radius,
+                       task->buffer.pass_stride,
+                       1.0f, task->nlm_k_2);
+       kernel_set_args(ckNLMBlur, 0,
+                       difference,
+                       blurDifference,
+                       w, h, stride,
+                       shift_stride,
+                       task->radius, 4);
+       kernel_set_args(ckNLMCalcWeight, 0,
+                       blurDifference,
+                       difference,
+                       w, h, stride,
+                       shift_stride,
+                       task->radius, 4);
+       kernel_set_args(ckNLMConstructGramian, 0,
+                       blurDifference,
+                       buffer_mem,
+                       transform_mem,
+                       rank_mem,
+                       XtWX_mem,
+                       XtWY_mem,
+                       task->reconstruction_state.filter_window,
+                       w, h, stride,
+                       shift_stride,
+                       task->radius, 4,
+                       task->buffer.pass_stride);
+
+       enqueue_kernel(ckNLMCalcDifference,   w*h, num_shifts, true);
+       enqueue_kernel(ckNLMBlur,             w*h, num_shifts, true);
+       enqueue_kernel(ckNLMCalcWeight,       w*h, num_shifts, true);
+       enqueue_kernel(ckNLMBlur,             w*h, num_shifts, true);
+       enqueue_kernel(ckNLMConstructGramian, w*h, num_shifts, true, 256);
+
+       opencl_assert(clReleaseMemObject(difference));
+       opencl_assert(clReleaseMemObject(blurDifference));
 
        kernel_set_args(ckFinalize, 0,
-                       task->buffer.w,
-                       task->buffer.h,
                        output_mem,
                        rank_mem,
                        XtWX_mem,
@@ -919,9 +930,7 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
                        task->filter_area,
                        task->reconstruction_state.buffer_params,
                        task->render_buffer.samples);
-       enqueue_kernel(ckFinalize,
-                      task->reconstruction_state.source_w,
-                      task->reconstruction_state.source_h);
+       enqueue_kernel(ckFinalize, w, h);
 
        return true;
 }