Cycles: Improve denoising speed on GPUs with small tile sizes
[blender.git] / intern / cycles / kernel / kernels / cuda / filter.cu
index c8172355a7f5f33a8470de0ddf37f45c1d4507c3..035f0484488c23680a2bc3c55a693c97a9e584b8 100644 (file)
@@ -134,95 +134,140 @@ kernel_cuda_filter_construct_transform(float const* __restrict__ buffer,
 
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_calc_difference(int dx, int dy,
-                                       const float *ccl_restrict weight_image,
+kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image,
                                        const float *ccl_restrict variance_image,
                                        float *difference_image,
-                                       int4 rect, int w,
+                                       int w,
+                                       int h,
+                                       int stride,
+                                       int shift_stride,
+                                       int r,
                                        int channel_offset,
-                                       float a, float k_2)
+                                       float a,
+                                       float k_2)
 {
-       int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
-       int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
-       if(x < rect.z && y < rect.w) {
-               kernel_filter_nlm_calc_difference(x, y, dx, dy, weight_image, variance_image, difference_image, rect, w, channel_offset, a, k_2);
+       int4 co, rect;
+       int ofs;
+       if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+               kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w,
+                                                 weight_image,
+                                                 variance_image,
+                                                 difference_image + ofs,
+                                                 rect, stride,
+                                                 channel_offset, a, k_2);
        }
 }
 
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_blur(const float *ccl_restrict difference_image, float *out_image, int4 rect, int w, int f)
+kernel_cuda_filter_nlm_blur(const float *ccl_restrict difference_image,
+                            float *out_image,
+                            int w,
+                            int h,
+                            int stride,
+                            int shift_stride,
+                            int r,
+                            int f)
 {
-       int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
-       int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
-       if(x < rect.z && y < rect.w) {
-               kernel_filter_nlm_blur(x, y, difference_image, out_image, rect, w, f);
+       int4 co, rect;
+       int ofs;
+       if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+               kernel_filter_nlm_blur(co.x, co.y,
+                                      difference_image + ofs,
+                                      out_image + ofs,
+                                      rect, stride, f);
        }
 }
 
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_calc_weight(const float *ccl_restrict difference_image, float *out_image, int4 rect, int w, int f)
+kernel_cuda_filter_nlm_calc_weight(const float *ccl_restrict difference_image,
+                                   float *out_image,
+                                   int w,
+                                   int h,
+                                   int stride,
+                                   int shift_stride,
+                                   int r,
+                                   int f)
 {
-       int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
-       int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
-       if(x < rect.z && y < rect.w) {
-               kernel_filter_nlm_calc_weight(x, y, difference_image, out_image, rect, w, f);
+       int4 co, rect;
+       int ofs;
+       if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+               kernel_filter_nlm_calc_weight(co.x, co.y,
+                                             difference_image + ofs,
+                                             out_image + ofs,
+                                             rect, stride, f);
        }
 }
 
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_update_output(int dx, int dy,
-                                     const float *ccl_restrict difference_image,
+kernel_cuda_filter_nlm_update_output(const float *ccl_restrict difference_image,
                                      const float *ccl_restrict image,
-                                     float *out_image, float *accum_image,
-                                     int4 rect, int w,
+                                     float *out_image,
+                                     float *accum_image,
+                                     int w,
+                                     int h,
+                                     int stride,
+                                     int shift_stride,
+                                     int r,
                                      int f)
 {
-       int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
-       int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
-       if(x < rect.z && y < rect.w) {
-               kernel_filter_nlm_update_output(x, y, dx, dy, difference_image, image, out_image, accum_image, rect, w, f);
+       int4 co, rect;
+       int ofs;
+       if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+               kernel_filter_nlm_update_output(co.x, co.y, co.z, co.w,
+                                               difference_image + ofs,
+                                               image,
+                                               out_image,
+                                               accum_image,
+                                               rect, stride, f);
        }
 }
 
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_normalize(float *out_image, const float *ccl_restrict accum_image, int4 rect, int w)
+kernel_cuda_filter_nlm_normalize(float *out_image,
+                                 const float *ccl_restrict accum_image,
+                                 int w,
+                                 int h,
+                                 int stride)
 {
-       int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
-       int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
-       if(x < rect.z && y < rect.w) {
-               kernel_filter_nlm_normalize(x, y, out_image, accum_image, rect, w);
+       int x = blockDim.x*blockIdx.x + threadIdx.x;
+       int y = blockDim.y*blockIdx.y + threadIdx.y;
+       if(x < w && y < h) {
+               kernel_filter_nlm_normalize(x, y, out_image, accum_image, stride);
        }
 }
 
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_construct_gramian(int dx, int dy,
-                                         const float *ccl_restrict difference_image,
+kernel_cuda_filter_nlm_construct_gramian(const float *ccl_restrict difference_image,
                                          const float *ccl_restrict buffer,
                                          float const* __restrict__ transform,
                                          int *rank,
                                          float *XtWX,
                                          float3 *XtWY,
-                                         int4 rect,
-                                         int4 filter_rect,
-                                         int w, int h, int f,
+                                         int4 filter_window,
+                                         int w,
+                                         int h,
+                                         int stride,
+                                         int shift_stride,
+                                         int r,
+                                         int f,
                                          int pass_stride)
 {
-       int x = blockDim.x*blockIdx.x + threadIdx.x + max(0, rect.x-filter_rect.x);
-       int y = blockDim.y*blockIdx.y + threadIdx.y + max(0, rect.y-filter_rect.y);
-       if(x < min(filter_rect.z, rect.z-filter_rect.x) && y < min(filter_rect.w, rect.w-filter_rect.y)) {
-               kernel_filter_nlm_construct_gramian(x, y,
-                                                   dx, dy,
-                                                   difference_image,
+       int4 co, rect;
+       int ofs;
+       if(get_nlm_coords_window(w, h, r, shift_stride, &rect, &co, &ofs, filter_window)) {
+               kernel_filter_nlm_construct_gramian(co.x, co.y,
+                                                   co.z, co.w,
+                                                   difference_image + ofs,
                                                    buffer,
                                                    transform, rank,
                                                    XtWX, XtWY,
-                                                   rect, filter_rect,
-                                                   w, h, f,
+                                                   rect, filter_window,
+                                                   stride, f,
                                                    pass_stride,
                                                    threadIdx.y*blockDim.x + threadIdx.x);
        }
@@ -230,10 +275,12 @@ kernel_cuda_filter_nlm_construct_gramian(int dx, int dy,
 
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_finalize(int w, int h,
-                            float *buffer, int *rank,
-                            float *XtWX, float3 *XtWY,
-                            int4 filter_area, int4 buffer_params,
+kernel_cuda_filter_finalize(float *buffer,
+                            int *rank,
+                            float *XtWX,
+                            float3 *XtWY,
+                            int4 filter_area,
+                            int4 buffer_params,
                             int sample)
 {
        int x = blockDim.x*blockIdx.x + threadIdx.x;
@@ -243,7 +290,10 @@ kernel_cuda_filter_finalize(int w, int h,
                rank += storage_ofs;
                XtWX += storage_ofs;
                XtWY += storage_ofs;
-               kernel_filter_finalize(x, y, w, h, buffer, rank, filter_area.z*filter_area.w, XtWX, XtWY, buffer_params, sample);
+               kernel_filter_finalize(x, y, buffer, rank,
+                                      filter_area.z*filter_area.w,
+                                      XtWX, XtWY,
+                                      buffer_params, sample);
        }
 }