Cycles: Improve denoising speed on GPUs with small tile sizes
authorLukas Stockner <lukas.stockner@freenet.de>
Fri, 10 Nov 2017 03:34:14 +0000 (04:34 +0100)
committerLukas Stockner <lukas.stockner@freenet.de>
Thu, 30 Nov 2017 06:37:08 +0000 (07:37 +0100)
Previously, the NLM kernels would be launched once per offset with one thread per pixel.
However, with the smaller tile sizes that are now feasible, there wasn't enough work to fully occupy GPUs which results in a significant slowdown.

Therefore, the kernels are now launched in a single call that handles all offsets at once.
This has two downsides: Memory accesses to accumulating buffers are now atomic, and more importantly, the temporary memory now has to be allocated for every shift at once, increasing the required memory.
On the other hand, of course, the smaller tiles significantly reduce the size of the memory.

The main bottleneck right now is the construction of the transformation - there is nothing to be parallelized there, one thread per pixel is the maximum.
I tried to parallelize the SVD implementation by storing the matrix in shared memory and launching one block per pixel, but that wasn't really going anywhere.

To make the new code somewhat readable, the handling of rectangular regions was cleaned up a bit and commented, it should be easier to understand what's going on now.
Also, some variables have been renamed to make the difference between buffer width and stride more apparent, in addition to some general style cleanup.

18 files changed:
intern/cycles/device/device_cpu.cpp
intern/cycles/device/device_cuda.cpp
intern/cycles/device/device_denoising.cpp
intern/cycles/device/device_denoising.h
intern/cycles/device/opencl/opencl.h
intern/cycles/device/opencl/opencl_base.cpp
intern/cycles/kernel/CMakeLists.txt
intern/cycles/kernel/filter/filter_nlm_cpu.h
intern/cycles/kernel/filter/filter_nlm_gpu.h
intern/cycles/kernel/filter/filter_reconstruction.h
intern/cycles/kernel/kernels/cpu/filter_cpu.h
intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
intern/cycles/kernel/kernels/cuda/filter.cu
intern/cycles/kernel/kernels/opencl/filter.cl
intern/cycles/util/CMakeLists.txt
intern/cycles/util/util_math.h
intern/cycles/util/util_math_matrix.h
intern/cycles/util/util_rect.h [new file with mode: 0644]

index 999b9230d295be8f1dece574547a8708b47e943d..2d28ccd2b49cde6715635abac2e0dbbff21cfa9e 100644 (file)
@@ -190,9 +190,9 @@ public:
        KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int)>       filter_nlm_update_output_kernel;
        KernelFunctions<void(*)(float*, float*, int*, int)>                                      filter_nlm_normalize_kernel;
 
-       KernelFunctions<void(*)(float*, int, int, int, float*, int*, int*, int, int, float)>                              filter_construct_transform_kernel;
-       KernelFunctions<void(*)(int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int, int)> filter_nlm_construct_gramian_kernel;
-       KernelFunctions<void(*)(int, int, int, int, int, float*, int*, float*, float3*, int*, int)>                       filter_finalize_kernel;
+       KernelFunctions<void(*)(float*, int, int, int, float*, int*, int*, int, int, float)>                         filter_construct_transform_kernel;
+       KernelFunctions<void(*)(int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int)> filter_nlm_construct_gramian_kernel;
+       KernelFunctions<void(*)(int, int, int, float*, int*, float*, float3*, int*, int)>                            filter_finalize_kernel;
 
        KernelFunctions<void(*)(KernelGlobals *, ccl_constant KernelData*, ccl_global void*, int, ccl_global char*,
                               int, int, int, int, int, int, int, int, ccl_global int*, int,
@@ -565,13 +565,13 @@ public:
                                                            (float*) color_variance_ptr,
                                                            difference,
                                                            local_rect,
-                                                           task->buffer.w,
+                                                           task->buffer.stride,
                                                            task->buffer.pass_stride,
                                                            1.0f,
                                                            task->nlm_k_2);
-                       filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.w, 4);
-                       filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, task->buffer.w, 4);
-                       filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.w, 4);
+                       filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4);
+                       filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, task->buffer.stride, 4);
+                       filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4);
                        filter_nlm_construct_gramian_kernel()(dx, dy,
                                                              blurDifference,
                                                              (float*)  task->buffer.mem.device_pointer,
@@ -580,9 +580,8 @@ public:
                                                              (float*)  task->storage.XtWX.device_pointer,
                                                              (float3*) task->storage.XtWY.device_pointer,
                                                              local_rect,
-                                                             &task->reconstruction_state.filter_rect.x,
-                                                             task->buffer.w,
-                                                             task->buffer.h,
+                                                             &task->reconstruction_state.filter_window.x,
+                                                             task->buffer.stride,
                                                              4,
                                                              task->buffer.pass_stride);
                }
@@ -591,8 +590,6 @@ public:
                                filter_finalize_kernel()(x,
                                                         y,
                                                         y*task->filter_area.z + x,
-                                                        task->buffer.w,
-                                                        task->buffer.h,
                                                         (float*)  output_ptr,
                                                         (int*)    task->storage.rank.device_pointer,
                                                         (float*)  task->storage.XtWX.device_pointer,
index d8d787ba706b768662bf1df86bd749286bb24d22..a663da748df70006f1fe4b0c976c7a3421bc325f 100644 (file)
@@ -1087,6 +1087,19 @@ public:
                                                   threads, threads, 1, \
                                                   0, 0, args, 0));
 
+/* Similar as above, but for 1-dimensional blocks. */
+#define CUDA_GET_BLOCKSIZE_1D(func, w, h)                                                                       \
+                       int threads_per_block;                                                                              \
+                       cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
+                       int xblocks = ((w) + threads_per_block - 1)/threads_per_block;                                      \
+                       int yblocks = h;
+
+#define CUDA_LAUNCH_KERNEL_1D(func, args)                       \
+                       cuda_assert(cuLaunchKernel(func,                    \
+                                                  xblocks, yblocks, 1,     \
+                                                  threads_per_block, 1, 1, \
+                                                  0, 0, args, 0));
+
        bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr,
                                       DenoisingTask *task)
        {
@@ -1095,60 +1108,65 @@ public:
 
                CUDAContextScope scope(this);
 
-               int4 rect = task->rect;
-               int w = align_up(rect.z-rect.x, 4);
-               int h = rect.w-rect.y;
+               int stride = task->buffer.stride;
+               int w = task->buffer.width;
+               int h = task->buffer.h;
                int r = task->nlm_state.r;
                int f = task->nlm_state.f;
                float a = task->nlm_state.a;
                float k_2 = task->nlm_state.k_2;
 
-               CUdeviceptr difference     = task->nlm_state.temporary_1_ptr;
-               CUdeviceptr blurDifference = task->nlm_state.temporary_2_ptr;
-               CUdeviceptr weightAccum    = task->nlm_state.temporary_3_ptr;
+               int shift_stride = stride*h;
+               int num_shifts = (2*r+1)*(2*r+1);
+               int mem_size = sizeof(float)*shift_stride*2*num_shifts;
+               int channel_offset = 0;
 
-               cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float)*w*h));
-               cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float)*w*h));
+               CUdeviceptr temporary_mem;
+               cuda_assert(cuMemAlloc(&temporary_mem, mem_size));
+               CUdeviceptr difference     = temporary_mem;
+               CUdeviceptr blurDifference = temporary_mem + sizeof(float)*shift_stride * num_shifts;
 
-               CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput, cuNLMNormalize;
-               cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
-               cuda_assert(cuModuleGetFunction(&cuNLMBlur,           cuFilterModule, "kernel_cuda_filter_nlm_blur"));
-               cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight,     cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
-               cuda_assert(cuModuleGetFunction(&cuNLMUpdateOutput,   cuFilterModule, "kernel_cuda_filter_nlm_update_output"));
-               cuda_assert(cuModuleGetFunction(&cuNLMNormalize,      cuFilterModule, "kernel_cuda_filter_nlm_normalize"));
+               CUdeviceptr weightAccum = task->nlm_state.temporary_3_ptr;
+               cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float)*shift_stride));
+               cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float)*shift_stride));
 
-               cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
-               cuda_assert(cuFuncSetCacheConfig(cuNLMBlur,           CU_FUNC_CACHE_PREFER_L1));
-               cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight,     CU_FUNC_CACHE_PREFER_L1));
-               cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput,   CU_FUNC_CACHE_PREFER_L1));
-               cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize,      CU_FUNC_CACHE_PREFER_L1));
+               {
+                       CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput;
+                       cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
+                       cuda_assert(cuModuleGetFunction(&cuNLMBlur,           cuFilterModule, "kernel_cuda_filter_nlm_blur"));
+                       cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight,     cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
+                       cuda_assert(cuModuleGetFunction(&cuNLMUpdateOutput,   cuFilterModule, "kernel_cuda_filter_nlm_update_output"));
 
-               CUDA_GET_BLOCKSIZE(cuNLMCalcDifference, rect.z-rect.x, rect.w-rect.y);
+                       cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
+                       cuda_assert(cuFuncSetCacheConfig(cuNLMBlur,           CU_FUNC_CACHE_PREFER_L1));
+                       cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight,     CU_FUNC_CACHE_PREFER_L1));
+                       cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput,   CU_FUNC_CACHE_PREFER_L1));
 
-               int dx, dy;
-               int4 local_rect;
-               int channel_offset = 0;
-               void *calc_difference_args[] = {&dx, &dy, &guide_ptr, &variance_ptr, &difference, &local_rect, &w, &channel_offset, &a, &k_2};
-               void *blur_args[]            = {&difference, &blurDifference, &local_rect, &w, &f};
-               void *calc_weight_args[]     = {&blurDifference, &difference, &local_rect, &w, &f};
-               void *update_output_args[]   = {&dx, &dy, &blurDifference, &image_ptr, &out_ptr, &weightAccum, &local_rect, &w, &f};
-
-               for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
-                       dy = i / (2*r+1) - r;
-                       dx = i % (2*r+1) - r;
-                       local_rect = make_int4(max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy));
-
-                       CUDA_LAUNCH_KERNEL(cuNLMCalcDifference, calc_difference_args);
-                       CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
-                       CUDA_LAUNCH_KERNEL(cuNLMCalcWeight, calc_weight_args);
-                       CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
-                       CUDA_LAUNCH_KERNEL(cuNLMUpdateOutput, update_output_args);
-               }
-
-               local_rect = make_int4(0, 0, rect.z-rect.x, rect.w-rect.y);
-               void *normalize_args[] = {&out_ptr, &weightAccum, &local_rect, &w};
-               CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args);
-               cuda_assert(cuCtxSynchronize());
+                       CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w*h, num_shifts);
+
+                       void *calc_difference_args[] = {&guide_ptr, &variance_ptr, &difference, &w, &h, &stride, &shift_stride, &r, &channel_offset, &a, &k_2};
+                       void *blur_args[]            = {&difference, &blurDifference, &w, &h, &stride, &shift_stride, &r, &f};
+                       void *calc_weight_args[]     = {&blurDifference, &difference, &w, &h, &stride, &shift_stride, &r, &f};
+                       void *update_output_args[]   = {&blurDifference, &image_ptr, &out_ptr, &weightAccum, &w, &h, &stride, &shift_stride, &r, &f};
+
+                       CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
+                       CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
+                       CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
+                       CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
+                       CUDA_LAUNCH_KERNEL_1D(cuNLMUpdateOutput, update_output_args);
+               }
+
+               cuMemFree(temporary_mem);
+
+               {
+                       CUfunction cuNLMNormalize;
+                       cuda_assert(cuModuleGetFunction(&cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize"));
+                       cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1));
+                       void *normalize_args[] = {&out_ptr, &weightAccum, &w, &h, &stride};
+                       CUDA_GET_BLOCKSIZE(cuNLMNormalize, w, h);
+                       CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args);
+                       cuda_assert(cuCtxSynchronize());
+               }
 
                return !have_error();
        }
@@ -1194,91 +1212,81 @@ public:
                mem_zero(task->storage.XtWX);
                mem_zero(task->storage.XtWY);
 
-               CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian, cuFinalize;
-               cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference,   cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
-               cuda_assert(cuModuleGetFunction(&cuNLMBlur,             cuFilterModule, "kernel_cuda_filter_nlm_blur"));
-               cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight,       cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
-               cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian"));
-               cuda_assert(cuModuleGetFunction(&cuFinalize,            cuFilterModule, "kernel_cuda_filter_finalize"));
+               int r = task->radius;
+               int f = 4;
+               float a = 1.0f;
+               float k_2 = task->nlm_k_2;
 
-               cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference,   CU_FUNC_CACHE_PREFER_L1));
-               cuda_assert(cuFuncSetCacheConfig(cuNLMBlur,             CU_FUNC_CACHE_PREFER_L1));
-               cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight,       CU_FUNC_CACHE_PREFER_L1));
-               cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));
-               cuda_assert(cuFuncSetCacheConfig(cuFinalize,            CU_FUNC_CACHE_PREFER_L1));
+               int w = task->reconstruction_state.source_w;
+               int h = task->reconstruction_state.source_h;
+               int stride = task->buffer.stride;
 
-               CUDA_GET_BLOCKSIZE(cuNLMCalcDifference,
-                                  task->reconstruction_state.source_w,
-                                  task->reconstruction_state.source_h);
+               int shift_stride = stride*h;
+               int num_shifts = (2*r+1)*(2*r+1);
+               int mem_size = sizeof(float)*shift_stride*num_shifts;
 
-               CUdeviceptr difference     = task->reconstruction_state.temporary_1_ptr;
-               CUdeviceptr blurDifference = task->reconstruction_state.temporary_2_ptr;
+               CUdeviceptr temporary_mem;
+               cuda_assert(cuMemAlloc(&temporary_mem, 2*mem_size));
+               CUdeviceptr difference     = temporary_mem;
+               CUdeviceptr blurDifference = temporary_mem + mem_size;
 
-               int r = task->radius;
-               int f = 4;
-               float a = 1.0f;
-               for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
-                       int dy = i / (2*r+1) - r;
-                       int dx = i % (2*r+1) - r;
-
-                       int local_rect[4] = {max(0, -dx), max(0, -dy),
-                                            task->reconstruction_state.source_w - max(0, dx),
-                                            task->reconstruction_state.source_h - max(0, dy)};
-
-                       void *calc_difference_args[] = {&dx, &dy,
-                                                       &color_ptr,
-                                                       &color_variance_ptr,
-                                                       &difference,
-                                                       &local_rect,
-                                                       &task->buffer.w,
-                                                       &task->buffer.pass_stride,
-                                                       &a,
-                                                       &task->nlm_k_2};
-                       CUDA_LAUNCH_KERNEL(cuNLMCalcDifference, calc_difference_args);
-
-                       void *blur_args[] = {&difference,
-                                            &blurDifference,
-                                            &local_rect,
-                                            &task->buffer.w,
-                                            &f};
-                       CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
-
-                       void *calc_weight_args[] = {&blurDifference,
-                                                   &difference,
-                                                   &local_rect,
-                                                   &task->buffer.w,
-                                                   &f};
-                       CUDA_LAUNCH_KERNEL(cuNLMCalcWeight, calc_weight_args);
-
-                       /* Reuse previous arguments. */
-                       CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
-
-                       void *construct_gramian_args[] = {&dx, &dy,
-                                                         &blurDifference,
+               {
+                       CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian;
+                       cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference,   cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
+                       cuda_assert(cuModuleGetFunction(&cuNLMBlur,             cuFilterModule, "kernel_cuda_filter_nlm_blur"));
+                       cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight,       cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
+                       cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian"));
+
+                       cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference,   CU_FUNC_CACHE_PREFER_L1));
+                       cuda_assert(cuFuncSetCacheConfig(cuNLMBlur,             CU_FUNC_CACHE_PREFER_L1));
+                       cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight,       CU_FUNC_CACHE_PREFER_L1));
+                       cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));
+
+                       CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference,
+                                            task->reconstruction_state.source_w * task->reconstruction_state.source_h,
+                                            num_shifts);
+
+                       void *calc_difference_args[] = {&color_ptr, &color_variance_ptr, &difference, &w, &h, &stride, &shift_stride, &r, &task->buffer.pass_stride, &a, &k_2};
+                       void *blur_args[]            = {&difference, &blurDifference, &w, &h, &stride, &shift_stride, &r, &f};
+                       void *calc_weight_args[]     = {&blurDifference, &difference, &w, &h, &stride, &shift_stride, &r, &f};
+                       void *construct_gramian_args[] = {&blurDifference,
                                                          &task->buffer.mem.device_pointer,
                                                          &task->storage.transform.device_pointer,
                                                          &task->storage.rank.device_pointer,
                                                          &task->storage.XtWX.device_pointer,
                                                          &task->storage.XtWY.device_pointer,
-                                                         &local_rect,
-                                                         &task->reconstruction_state.filter_rect,
-                                                         &task->buffer.w,
-                                                         &task->buffer.h,
+                                                         &task->reconstruction_state.filter_window,
+                                                         &w, &h, &stride,
+                                                         &shift_stride, &r,
                                                          &f,
                                                      &task->buffer.pass_stride};
-                       CUDA_LAUNCH_KERNEL(cuNLMConstructGramian, construct_gramian_args);
-               }
-
-               void *finalize_args[] = {&task->buffer.w,
-                                        &task->buffer.h,
-                                        &output_ptr,
-                                                &task->storage.rank.device_pointer,
-                                                &task->storage.XtWX.device_pointer,
-                                                &task->storage.XtWY.device_pointer,
-                                                &task->filter_area,
-                                                &task->reconstruction_state.buffer_params.x,
-                                                &task->render_buffer.samples};
-               CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
+
+                       CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
+                       CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
+                       CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
+                       CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
+                       CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args);
+               }
+
+               cuMemFree(temporary_mem);
+
+               {
+                       CUfunction cuFinalize;
+                       cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize"));
+                       cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1));
+                       void *finalize_args[] = {&output_ptr,
+                                                        &task->storage.rank.device_pointer,
+                                                        &task->storage.XtWX.device_pointer,
+                                                        &task->storage.XtWY.device_pointer,
+                                                        &task->filter_area,
+                                                        &task->reconstruction_state.buffer_params.x,
+                                                        &task->render_buffer.samples};
+                       CUDA_GET_BLOCKSIZE(cuFinalize,
+                                          task->reconstruction_state.source_w,
+                                          task->reconstruction_state.source_h);
+                       CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
+               }
+
                cuda_assert(cuCtxSynchronize());
 
                return !have_error();
index 69c43e4a8cf909945c2cbf2e0a7d3310310fb458..1862deb9a61b2e9eab3e03dd917051936d050fd8 100644 (file)
@@ -57,10 +57,9 @@ void DenoisingTask::init_from_devicetask(const DeviceTask &task)
        render_buffer.denoising_clean_offset = task.pass_denoising_clean;
 
        /* Expand filter_area by radius pixels and clamp the result to the extent of the neighboring tiles */
-       rect = make_int4(max(tiles->x[0], filter_area.x - radius),
-                        max(tiles->y[0], filter_area.y - radius),
-                        min(tiles->x[3], filter_area.x + filter_area.z + radius),
-                        min(tiles->y[3], filter_area.y + filter_area.w + radius));
+       rect = rect_from_shape(filter_area.x, filter_area.y, filter_area.z, filter_area.w);
+       rect = rect_expand(rect, radius);
+       rect = rect_clip(rect, make_int4(tiles->x[0], tiles->y[0], tiles->x[3], tiles->y[3]));
 }
 
 void DenoisingTask::tiles_from_rendertiles(RenderTile *rtiles)
@@ -93,9 +92,10 @@ bool DenoisingTask::run_denoising()
 {
        /* Allocate denoising buffer. */
        buffer.passes = 14;
-       buffer.w = align_up(rect.z - rect.x, 4);
+       buffer.width = rect.z - rect.x;
+       buffer.stride = align_up(buffer.width, 4);
        buffer.h = rect.w - rect.y;
-       buffer.pass_stride = align_up(buffer.w * buffer.h, divide_up(device->mem_address_alignment(), sizeof(float)));
+       buffer.pass_stride = align_up(buffer.stride * buffer.h, divide_up(device->mem_address_alignment(), sizeof(float)));
        buffer.mem.alloc_to_device(buffer.pass_stride * buffer.passes, false);
 
        device_ptr null_ptr = (device_ptr) 0;
@@ -203,15 +203,17 @@ bool DenoisingTask::run_denoising()
 
        functions.construct_transform();
 
-       storage.temporary_1.alloc_to_device(buffer.w*buffer.h, false);
-       storage.temporary_2.alloc_to_device(buffer.w*buffer.h, false);
-       reconstruction_state.temporary_1_ptr = storage.temporary_1.device_pointer;
-       reconstruction_state.temporary_2_ptr = storage.temporary_2.device_pointer;
+       device_only_memory<float> temporary_1(device, "Denoising NLM temporary 1");
+       device_only_memory<float> temporary_2(device, "Denoising NLM temporary 2");
+       temporary_1.alloc_to_device(buffer.pass_stride, false);
+       temporary_2.alloc_to_device(buffer.pass_stride, false);
+       reconstruction_state.temporary_1_ptr = temporary_1.device_pointer;
+       reconstruction_state.temporary_2_ptr = temporary_2.device_pointer;
 
        storage.XtWX.alloc_to_device(storage.w*storage.h*XTWX_SIZE, false);
        storage.XtWY.alloc_to_device(storage.w*storage.h*XTWY_SIZE, false);
 
-       reconstruction_state.filter_rect = make_int4(filter_area.x-rect.x, filter_area.y-rect.y, storage.w, storage.h);
+       reconstruction_state.filter_window = rect_from_shape(filter_area.x-rect.x, filter_area.y-rect.y, storage.w, storage.h);
        int tile_coordinate_offset = filter_area.y*render_buffer.stride + filter_area.x;
        reconstruction_state.buffer_params = make_int4(render_buffer.offset + tile_coordinate_offset,
                                                       render_buffer.stride,
index ec4e7933cdcc3850904547ca115b158c1a473d80..77a82d0ad041763343058017dfe96bffc9e556ab 100644 (file)
@@ -94,7 +94,7 @@ public:
                device_ptr temporary_1_ptr; /* There two images are used as temporary storage. */
                device_ptr temporary_2_ptr;
 
-               int4 filter_rect;
+               int4 filter_window;
                int4 buffer_params;
 
                int source_w;
@@ -148,8 +148,9 @@ public:
        struct DenoiseBuffers {
                int pass_stride;
                int passes;
-               int w;
+               int stride;
                int h;
+               int width;
                device_only_memory<float> mem;
 
                DenoiseBuffers(Device *device)
index c02f8ffafe6f7c107cce7a87a10fbf26771e44a3..f38c2f65c1e7f450734e5ac1d9a08c5d2f243829 100644 (file)
@@ -353,7 +353,9 @@ public:
        void tex_free(device_memory& mem);
 
        size_t global_size_round_up(int group_size, int global_size);
-       void enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size_t max_workgroup_size = -1);
+       void enqueue_kernel(cl_kernel kernel, size_t w, size_t h,
+                           bool x_workgroups = false,
+                           size_t max_workgroup_size = -1);
        void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name);
        void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg);
 
index f43177247ef0f21f43ae18ba285b49f4db0e73ad..fe084edc90ec46140e3ec308549c9b9a846ec9d6 100644 (file)
@@ -560,7 +560,7 @@ size_t OpenCLDeviceBase::global_size_round_up(int group_size, int global_size)
        return global_size + ((r == 0)? 0: group_size - r);
 }
 
-void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size_t max_workgroup_size)
+void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, bool x_workgroups, size_t max_workgroup_size)
 {
        size_t workgroup_size, max_work_items[3];
 
@@ -574,8 +574,15 @@ void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size
        }
 
        /* Try to divide evenly over 2 dimensions. */
-       size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1);
-       size_t local_size[2] = {sqrt_workgroup_size, sqrt_workgroup_size};
+       size_t local_size[2];
+       if(x_workgroups) {
+               local_size[0] = workgroup_size;
+               local_size[1] = 1;
+       }
+       else {
+               size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1);
+               local_size[0] = local_size[1] = sqrt_workgroup_size;
+       }
 
        /* Some implementations have max size 1 on 2nd dimension. */
        if(local_size[1] > max_work_items[1]) {
@@ -731,17 +738,25 @@ bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr,
                                                  device_ptr out_ptr,
                                                  DenoisingTask *task)
 {
-       int4 rect = task->rect;
-       int w = rect.z-rect.x;
-       int h = rect.w-rect.y;
+
+       int stride = task->buffer.stride;
+       int w = task->buffer.width;
+       int h = task->buffer.h;
        int r = task->nlm_state.r;
        int f = task->nlm_state.f;
        float a = task->nlm_state.a;
        float k_2 = task->nlm_state.k_2;
 
-       cl_mem difference     = CL_MEM_PTR(task->nlm_state.temporary_1_ptr);
-       cl_mem blurDifference = CL_MEM_PTR(task->nlm_state.temporary_2_ptr);
-       cl_mem weightAccum    = CL_MEM_PTR(task->nlm_state.temporary_3_ptr);
+       int shift_stride = stride*h;
+       int num_shifts = (2*r+1)*(2*r+1);
+       int mem_size = sizeof(float)*shift_stride*num_shifts;
+
+       cl_mem weightAccum = CL_MEM_PTR(task->nlm_state.temporary_3_ptr);
+
+       cl_mem difference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr);
+       opencl_assert_err(ciErr, "clCreateBuffer denoising_non_local_means");
+       cl_mem blurDifference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr);
+       opencl_assert_err(ciErr, "clCreateBuffer denoising_non_local_means");
 
        cl_mem image_mem = CL_MEM_PTR(image_ptr);
        cl_mem guide_mem = CL_MEM_PTR(guide_ptr);
@@ -757,31 +772,45 @@ bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr,
        cl_kernel ckNLMUpdateOutput   = denoising_program(ustring("filter_nlm_update_output"));
        cl_kernel ckNLMNormalize      = denoising_program(ustring("filter_nlm_normalize"));
 
-       for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
-               int dy = i / (2*r+1) - r;
-               int dx = i % (2*r+1) - r;
-               int4 local_rect = make_int4(max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy));
-               kernel_set_args(ckNLMCalcDifference, 0,
-                               dx, dy, guide_mem, variance_mem,
-                               difference, local_rect, w, 0, a, k_2);
-               kernel_set_args(ckNLMBlur, 0,
-                               difference, blurDifference, local_rect, w, f);
-               kernel_set_args(ckNLMCalcWeight, 0,
-                               blurDifference, difference, local_rect, w, f);
-               kernel_set_args(ckNLMUpdateOutput, 0,
-                               dx, dy, blurDifference, image_mem,
-                               out_mem, weightAccum, local_rect, w, f);
-
-               enqueue_kernel(ckNLMCalcDifference, w, h);
-               enqueue_kernel(ckNLMBlur,           w, h);
-               enqueue_kernel(ckNLMCalcWeight,     w, h);
-               enqueue_kernel(ckNLMBlur,           w, h);
-               enqueue_kernel(ckNLMUpdateOutput,   w, h);
-       }
+       kernel_set_args(ckNLMCalcDifference, 0,
+                       guide_mem,
+                       variance_mem,
+                       difference,
+                       w, h, stride,
+                       shift_stride,
+                       r, 0, a, k_2);
+       kernel_set_args(ckNLMBlur, 0,
+                       difference,
+                       blurDifference,
+                       w, h, stride,
+                       shift_stride,
+                       r, f);
+       kernel_set_args(ckNLMCalcWeight, 0,
+                       blurDifference,
+                       difference,
+                       w, h, stride,
+                       shift_stride,
+                       r, f);
+       kernel_set_args(ckNLMUpdateOutput, 0,
+                       blurDifference,
+                       image_mem,
+                       out_mem,
+                       weightAccum,
+                       w, h, stride,
+                       shift_stride,
+                       r, f);
+
+       enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true);
+       enqueue_kernel(ckNLMBlur,           w*h, num_shifts, true);
+       enqueue_kernel(ckNLMCalcWeight,     w*h, num_shifts, true);
+       enqueue_kernel(ckNLMBlur,           w*h, num_shifts, true);
+       enqueue_kernel(ckNLMUpdateOutput,   w*h, num_shifts, true);
+
+       opencl_assert(clReleaseMemObject(difference));
+       opencl_assert(clReleaseMemObject(blurDifference));
 
-       int4 local_rect = make_int4(0, 0, w, h);
        kernel_set_args(ckNLMNormalize, 0,
-                       out_mem, weightAccum, local_rect, w);
+                       out_mem, weightAccum, w, h, stride);
        enqueue_kernel(ckNLMNormalize, w, h);
 
        return true;
@@ -837,81 +866,63 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
        cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian"));
        cl_kernel ckFinalize            = denoising_program(ustring("filter_finalize"));
 
-       cl_mem difference     = CL_MEM_PTR(task->reconstruction_state.temporary_1_ptr);
-       cl_mem blurDifference = CL_MEM_PTR(task->reconstruction_state.temporary_2_ptr);
-
-       int r = task->radius;
-       int f = 4;
-       float a = 1.0f;
-       for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
-               int dy = i / (2*r+1) - r;
-               int dx = i % (2*r+1) - r;
-
-               int local_rect[4] = {max(0, -dx), max(0, -dy),
-                                    task->reconstruction_state.source_w - max(0, dx),
-                                    task->reconstruction_state.source_h - max(0, dy)};
-
-               kernel_set_args(ckNLMCalcDifference, 0,
-                               dx, dy,
-                               color_mem,
-                               color_variance_mem,
-                               difference,
-                               local_rect,
-                               task->buffer.w,
-                               task->buffer.pass_stride,
-                               a, task->nlm_k_2);
-               enqueue_kernel(ckNLMCalcDifference,
-                              task->reconstruction_state.source_w,
-                              task->reconstruction_state.source_h);
-
-               kernel_set_args(ckNLMBlur, 0,
-                               difference,
-                               blurDifference,
-                               local_rect,
-                               task->buffer.w,
-                               f);
-               enqueue_kernel(ckNLMBlur,
-                              task->reconstruction_state.source_w,
-                              task->reconstruction_state.source_h);
-
-               kernel_set_args(ckNLMCalcWeight, 0,
-                               blurDifference,
-                               difference,
-                               local_rect,
-                               task->buffer.w,
-                               f);
-               enqueue_kernel(ckNLMCalcWeight,
-                              task->reconstruction_state.source_w,
-                              task->reconstruction_state.source_h);
-
-               /* Reuse previous arguments. */
-               enqueue_kernel(ckNLMBlur,
-                              task->reconstruction_state.source_w,
-                              task->reconstruction_state.source_h);
-
-               kernel_set_args(ckNLMConstructGramian, 0,
-                               dx, dy,
-                               blurDifference,
-                               buffer_mem,
-                               transform_mem,
-                               rank_mem,
-                               XtWX_mem,
-                               XtWY_mem,
-                               local_rect,
-                               task->reconstruction_state.filter_rect,
-                               task->buffer.w,
-                               task->buffer.h,
-                               f,
-                           task->buffer.pass_stride);
-               enqueue_kernel(ckNLMConstructGramian,
-                              task->reconstruction_state.source_w,
-                              task->reconstruction_state.source_h,
-                              256);
-       }
+       int w = task->reconstruction_state.source_w;
+       int h = task->reconstruction_state.source_h;
+       int stride = task->buffer.stride;
+
+       int shift_stride = stride*h;
+       int num_shifts = (2*task->radius + 1)*(2*task->radius + 1);
+       int mem_size = sizeof(float)*shift_stride*num_shifts;
+
+       cl_mem difference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr);
+       opencl_assert_err(ciErr, "clCreateBuffer denoising_reconstruct");
+       cl_mem blurDifference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr);
+       opencl_assert_err(ciErr, "clCreateBuffer denoising_reconstruct");
+
+       kernel_set_args(ckNLMCalcDifference, 0,
+                       color_mem,
+                       color_variance_mem,
+                       difference,
+                       w, h, stride,
+                       shift_stride,
+                       task->radius,
+                       task->buffer.pass_stride,
+                       1.0f, task->nlm_k_2);
+       kernel_set_args(ckNLMBlur, 0,
+                       difference,
+                       blurDifference,
+                       w, h, stride,
+                       shift_stride,
+                       task->radius, 4);
+       kernel_set_args(ckNLMCalcWeight, 0,
+                       blurDifference,
+                       difference,
+                       w, h, stride,
+                       shift_stride,
+                       task->radius, 4);
+       kernel_set_args(ckNLMConstructGramian, 0,
+                       blurDifference,
+                       buffer_mem,
+                       transform_mem,
+                       rank_mem,
+                       XtWX_mem,
+                       XtWY_mem,
+                       task->reconstruction_state.filter_window,
+                       w, h, stride,
+                       shift_stride,
+                       task->radius, 4,
+                       task->buffer.pass_stride);
+
+       enqueue_kernel(ckNLMCalcDifference,   w*h, num_shifts, true);
+       enqueue_kernel(ckNLMBlur,             w*h, num_shifts, true);
+       enqueue_kernel(ckNLMCalcWeight,       w*h, num_shifts, true);
+       enqueue_kernel(ckNLMBlur,             w*h, num_shifts, true);
+       enqueue_kernel(ckNLMConstructGramian, w*h, num_shifts, true, 256);
+
+       opencl_assert(clReleaseMemObject(difference));
+       opencl_assert(clReleaseMemObject(blurDifference));
 
        kernel_set_args(ckFinalize, 0,
-                       task->buffer.w,
-                       task->buffer.h,
                        output_mem,
                        rank_mem,
                        XtWX_mem,
@@ -919,9 +930,7 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
                        task->filter_area,
                        task->reconstruction_state.buffer_params,
                        task->render_buffer.samples);
-       enqueue_kernel(ckFinalize,
-                      task->reconstruction_state.source_w,
-                      task->reconstruction_state.source_h);
+       enqueue_kernel(ckFinalize, w, h);
 
        return true;
 }
index de056ce97f0c20d563b7ce6800e5a784ed27743c..5f10bdf204157f756ec8eacb7ba6deb537bffd73 100644 (file)
@@ -254,6 +254,7 @@ set(SRC_UTIL_HEADERS
        ../util/util_math_int3.h
        ../util/util_math_int4.h
        ../util/util_math_matrix.h
+       ../util/util_rect.h
        ../util/util_static_assert.h
        ../util/util_transform.h
        ../util/util_texture.h
index 5e989331bc2cacdca36f6f3db50a3f1b005ea897..e2da0fd872bce418731e78ab9face67bb3b57b28 100644 (file)
@@ -21,7 +21,7 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy,
                                                          const float *ccl_restrict variance_image,
                                                          float *difference_image,
                                                          int4 rect,
-                                                         int w,
+                                                         int stride,
                                                          int channel_offset,
                                                          float a,
                                                          float k_2)
@@ -31,15 +31,15 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy,
                        float diff = 0.0f;
                        int numChannels = channel_offset? 3 : 1;
                        for(int c = 0; c < numChannels; c++) {
-                               float cdiff = weight_image[c*channel_offset + y*w+x] - weight_image[c*channel_offset + (y+dy)*w+(x+dx)];
-                               float pvar = variance_image[c*channel_offset + y*w+x];
-                               float qvar = variance_image[c*channel_offset + (y+dy)*w+(x+dx)];
+                               float cdiff = weight_image[c*channel_offset + y*stride + x] - weight_image[c*channel_offset + (y+dy)*stride + (x+dx)];
+                               float pvar = variance_image[c*channel_offset + y*stride + x];
+                               float qvar = variance_image[c*channel_offset + (y+dy)*stride + (x+dx)];
                                diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar));
                        }
                        if(numChannels > 1) {
                                diff *= 1.0f/numChannels;
                        }
-                       difference_image[y*w+x] = diff;
+                       difference_image[y*stride + x] = diff;
                }
        }
 }
@@ -47,7 +47,7 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy,
 ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict difference_image,
                                               float *out_image,
                                               int4 rect,
-                                              int w,
+                                              int stride,
                                               int f)
 {
        int aligned_lowx = rect.x / 4;
@@ -56,17 +56,17 @@ ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict differen
                const int low = max(rect.y, y-f);
                const int high = min(rect.w, y+f+1);
                for(int x = rect.x; x < rect.z; x++) {
-                       out_image[y*w+x] = 0.0f;
+                       out_image[y*stride + x] = 0.0f;
                }
                for(int y1 = low; y1 < high; y1++) {
-                       float4* out_image4 = (float4*)(out_image + y*w);
-                       float4* difference_image4 = (float4*)(difference_image + y1*w);
+                       float4* out_image4 = (float4*)(out_image + y*stride);
+                       float4* difference_image4 = (float4*)(difference_image + y1*stride);
                        for(int x = aligned_lowx; x < aligned_highx; x++) {
                                out_image4[x] += difference_image4[x];
                        }
                }
                for(int x = rect.x; x < rect.z; x++) {
-                       out_image[y*w+x] *= 1.0f/(high - low);
+                       out_image[y*stride + x] *= 1.0f/(high - low);
                }
        }
 }
@@ -74,12 +74,12 @@ ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict differen
 ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict difference_image,
                                                      float *out_image,
                                                      int4 rect,
-                                                     int w,
+                                                     int stride,
                                                      int f)
 {
        for(int y = rect.y; y < rect.w; y++) {
                for(int x = rect.x; x < rect.z; x++) {
-                       out_image[y*w+x] = 0.0f;
+                       out_image[y*stride + x] = 0.0f;
                }
        }
        for(int dx = -f; dx <= f; dx++) {
@@ -87,7 +87,7 @@ ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict d
                int neg_dx = min(0, dx);
                for(int y = rect.y; y < rect.w; y++) {
                        for(int x = rect.x-neg_dx; x < rect.z-pos_dx; x++) {
-                               out_image[y*w+x] += difference_image[y*w+dx+x];
+                               out_image[y*stride + x] += difference_image[y*stride + x+dx];
                        }
                }
        }
@@ -95,7 +95,7 @@ ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict d
                for(int x = rect.x; x < rect.z; x++) {
                        const int low = max(rect.x, x-f);
                        const int high = min(rect.z, x+f+1);
-                       out_image[y*w+x] = fast_expf(-max(out_image[y*w+x] * (1.0f/(high - low)), 0.0f));
+                       out_image[y*stride + x] = fast_expf(-max(out_image[y*stride + x] * (1.0f/(high - low)), 0.0f));
                }
        }
 }
@@ -106,7 +106,7 @@ ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy,
                                                        float *out_image,
                                                        float *accum_image,
                                                        int4 rect,
-                                                       int w,
+                                                       int stride,
                                                        int f)
 {
        for(int y = rect.y; y < rect.w; y++) {
@@ -115,11 +115,11 @@ ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy,
                        const int high = min(rect.z, x+f+1);
                        float sum = 0.0f;
                        for(int x1 = low; x1 < high; x1++) {
-                               sum += difference_image[y*w+x1];
+                               sum += difference_image[y*stride + x1];
                        }
                        float weight = sum * (1.0f/(high - low));
-                       accum_image[y*w+x] += weight;
-                       out_image[y*w+x] += weight*image[(y+dy)*w+(x+dx)];
+                       accum_image[y*stride + x] += weight;
+                       out_image[y*stride + x] += weight*image[(y+dy)*stride + (x+dx)];
                }
        }
 }
@@ -132,31 +132,31 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy,
                                                            float *XtWX,
                                                            float3 *XtWY,
                                                            int4 rect,
-                                                           int4 filter_rect,
-                                                           int w, int h, int f,
+                                                           int4 filter_window,
+                                                           int stride, int f,
                                                            int pass_stride)
 {
+       int4 clip_area = rect_clip(rect, filter_window);
        /* fy and fy are in filter-window-relative coordinates, while x and y are in feature-window-relative coordinates. */
-       for(int fy = max(0, rect.y-filter_rect.y); fy < min(filter_rect.w, rect.w-filter_rect.y); fy++) {
-               int y = fy + filter_rect.y;
-               for(int fx = max(0, rect.x-filter_rect.x); fx < min(filter_rect.z, rect.z-filter_rect.x); fx++) {
-                       int x = fx + filter_rect.x;
+       for(int y = clip_area.y; y < clip_area.w; y++) {
+               for(int x = clip_area.x; x < clip_area.z; x++) {
                        const int low = max(rect.x, x-f);
                        const int high = min(rect.z, x+f+1);
                        float sum = 0.0f;
                        for(int x1 = low; x1 < high; x1++) {
-                               sum += difference_image[y*w+x1];
+                               sum += difference_image[y*stride + x1];
                        }
                        float weight = sum * (1.0f/(high - low));
 
-                       int storage_ofs = fy*filter_rect.z + fx;
+                       int storage_ofs = coord_to_local_index(filter_window, x, y);
                        float  *l_transform = transform + storage_ofs*TRANSFORM_SIZE;
                        float  *l_XtWX = XtWX + storage_ofs*XTWX_SIZE;
                        float3 *l_XtWY = XtWY + storage_ofs*XTWY_SIZE;
                        int    *l_rank = rank + storage_ofs;
 
                        kernel_filter_construct_gramian(x, y, 1,
-                                                       dx, dy, w, h,
+                                                       dx, dy,
+                                                       stride,
                                                        pass_stride,
                                                        buffer,
                                                        l_transform, l_rank,
index 2c5ac8070513f231ea50c26cfea3c6650c3de9cf..4ca49ea6733c138e61cd596909fd82f301b76848 100644 (file)
 
 CCL_NAMESPACE_BEGIN
 
+/* Determines pixel coordinates and offset for the current thread.
+ * Returns whether the thread should do any work.
+ *
+ * All coordinates are relative to the denoising buffer!
+ *
+ * Window is the rect that should be processed.
+ * co is filled with (x, y, dx, dy).
+ */
+ccl_device_inline bool get_nlm_coords_window(int w, int h, int r, int stride,
+                                             int4 *rect, int4 *co, int *ofs,
+                                             int4 window)
+{
+       /* Determine the pixel offset that this thread should apply. */
+       int s = 2*r+1;
+       int si = ccl_global_id(1);
+       int sx = si % s;
+       int sy = si / s;
+       if(sy >= s) {
+               return false;
+       }
+       co->z = sx-r;
+       co->w = sy-r;
+
+       /* Pixels still need to lie inside the denoising buffer after applying the offset,
+        * so determine the area for which this is the case. */
+       *rect = make_int4(max(0, -co->z),     max(0, -co->w),
+                     w - max(0,  co->z), h - max(0,  co->w));
+
+       /* Find the intersection of the area that we want to process (window) and the area
+        * that can be processed (rect) to get the final area for this offset. */
+       int4 clip_area = rect_clip(window, *rect);
+
+       /* If the radius is larger than one of the sides of the window,
+        * there will be shifts for which there is no usable pixel at all. */
+       if(!rect_is_valid(clip_area)) {
+               return false;
+       }
+
+       /* Map the linear thread index to pixels inside the clip area. */
+       int x, y;
+       if(!local_index_to_coord(clip_area, ccl_global_id(0), &x, &y)) {
+               return false;
+       }
+       co->x = x;
+       co->y = y;
+
+       *ofs = (sy*s + sx) * stride;
+
+       return true;
+}
+
+ccl_device_inline bool get_nlm_coords(int w, int h, int r, int stride,
+                                      int4 *rect, int4 *co, int *ofs)
+{
+       return get_nlm_coords_window(w, h, r, stride, rect, co, ofs, make_int4(0, 0, w, h));
+}
+
 ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y,
                                                          int dx, int dy,
                                                          const ccl_global float *ccl_restrict weight_image,
                                                          const ccl_global float *ccl_restrict variance_image,
                                                          ccl_global float *difference_image,
-                                                         int4 rect, int w,
+                                                         int4 rect, int stride,
                                                          int channel_offset,
                                                          float a, float k_2)
 {
        float diff = 0.0f;
        int numChannels = channel_offset? 3 : 1;
        for(int c = 0; c < numChannels; c++) {
-               float cdiff = weight_image[c*channel_offset + y*w+x] - weight_image[c*channel_offset + (y+dy)*w+(x+dx)];
-               float pvar = variance_image[c*channel_offset + y*w+x];
-               float qvar = variance_image[c*channel_offset + (y+dy)*w+(x+dx)];
+               float cdiff = weight_image[c*channel_offset + y*stride + x] - weight_image[c*channel_offset + (y+dy)*stride + (x+dx)];
+               float pvar = variance_image[c*channel_offset + y*stride + x];
+               float qvar = variance_image[c*channel_offset + (y+dy)*stride + (x+dx)];
                diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar));
        }
        if(numChannels > 1) {
                diff *= 1.0f/numChannels;
        }
-       difference_image[y*w+x] = diff;
+       difference_image[y*stride + x] = diff;
 }
 
 ccl_device_inline void kernel_filter_nlm_blur(int x, int y,
                                               const ccl_global float *ccl_restrict difference_image,
                                               ccl_global float *out_image,
-                                              int4 rect, int w, int f)
+                                              int4 rect, int stride, int f)
 {
        float sum = 0.0f;
        const int low = max(rect.y, y-f);
        const int high = min(rect.w, y+f+1);
        for(int y1 = low; y1 < high; y1++) {
-               sum += difference_image[y1*w+x];
+               sum += difference_image[y1*stride + x];
        }
        sum *= 1.0f/(high-low);
-       out_image[y*w+x] = sum;
+       out_image[y*stride + x] = sum;
 }
 
 ccl_device_inline void kernel_filter_nlm_calc_weight(int x, int y,
                                                      const ccl_global float *ccl_restrict difference_image,
                                                      ccl_global float *out_image,
-                                                     int4 rect, int w, int f)
+                                                     int4 rect, int stride, int f)
 {
        float sum = 0.0f;
        const int low = max(rect.x, x-f);
        const int high = min(rect.z, x+f+1);
        for(int x1 = low; x1 < high; x1++) {
-               sum += difference_image[y*w+x1];
+               sum += difference_image[y*stride + x1];
        }
        sum *= 1.0f/(high-low);
-       out_image[y*w+x] = fast_expf(-max(sum, 0.0f));
+       out_image[y*stride + x] = fast_expf(-max(sum, 0.0f));
 }
 
 ccl_device_inline void kernel_filter_nlm_update_output(int x, int y,
@@ -75,25 +132,25 @@ ccl_device_inline void kernel_filter_nlm_update_output(int x, int y,
                                                        const ccl_global float *ccl_restrict image,
                                                        ccl_global float *out_image,
                                                        ccl_global float *accum_image,
-                                                       int4 rect, int w, int f)
+                                                       int4 rect, int stride, int f)
 {
        float sum = 0.0f;
        const int low = max(rect.x, x-f);
        const int high = min(rect.z, x+f+1);
        for(int x1 = low; x1 < high; x1++) {
-               sum += difference_image[y*w+x1];
+               sum += difference_image[y*stride + x1];
        }
        sum *= 1.0f/(high-low);
        if(out_image) {
-               accum_image[y*w+x] += sum;
-               out_image[y*w+x] += sum*image[(y+dy)*w+(x+dx)];
+               atomic_add_and_fetch_float(accum_image + y*stride + x, sum);
+               atomic_add_and_fetch_float(out_image + y*stride + x, sum*image[(y+dy)*stride + (x+dx)]);
        }
        else {
-               accum_image[y*w+x] = sum;
+               accum_image[y*stride + x] = sum;
        }
 }
 
-ccl_device_inline void kernel_filter_nlm_construct_gramian(int fx, int fy,
+ccl_device_inline void kernel_filter_nlm_construct_gramian(int x, int y,
                                                            int dx, int dy,
                                                            const ccl_global float *ccl_restrict difference_image,
                                                            const ccl_global float *ccl_restrict buffer,
@@ -102,30 +159,31 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int fx, int fy,
                                                            ccl_global float *XtWX,
                                                            ccl_global float3 *XtWY,
                                                            int4 rect,
-                                                           int4 filter_rect,
-                                                           int w, int h, int f,
+                                                           int4 filter_window,
+                                                           int stride, int f,
                                                            int pass_stride,
                                                            int localIdx)
 {
-       int y = fy + filter_rect.y;
-       int x = fx + filter_rect.x;
        const int low = max(rect.x, x-f);
        const int high = min(rect.z, x+f+1);
        float sum = 0.0f;
        for(int x1 = low; x1 < high; x1++) {
-               sum += difference_image[y*w+x1];
+               sum += difference_image[y*stride + x1];
        }
        float weight = sum * (1.0f/(high - low));
 
-       int storage_ofs = fy*filter_rect.z + fx;
+       /* Reconstruction data is only stored for pixels inside the filter window,
+        * so compute the pixels's index in there. */
+       int storage_ofs = coord_to_local_index(filter_window, x, y);
        transform += storage_ofs;
        rank += storage_ofs;
        XtWX += storage_ofs;
        XtWY += storage_ofs;
 
        kernel_filter_construct_gramian(x, y,
-                                       filter_rect.z*filter_rect.w,
-                                       dx, dy, w, h,
+                                       rect_size(filter_window),
+                                       dx, dy,
+                                       stride,
                                        pass_stride,
                                        buffer,
                                        transform, rank,
@@ -136,9 +194,9 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int fx, int fy,
 ccl_device_inline void kernel_filter_nlm_normalize(int x, int y,
                                                    ccl_global float *out_image,
                                                    const ccl_global float *ccl_restrict accum_image,
-                                                   int4 rect, int w)
+                                                   int stride)
 {
-       out_image[y*w+x] /= accum_image[y*w+x];
+       out_image[y*stride + x] /= accum_image[y*stride + x];
 }
 
 CCL_NAMESPACE_END
index 25a3025056c0bc0469cb5d90ee2498ca0beef390..b7bf322f9ceb12f889d6690e9b5c5e2bb9fea9bd 100644 (file)
@@ -19,7 +19,7 @@ CCL_NAMESPACE_BEGIN
 ccl_device_inline void kernel_filter_construct_gramian(int x, int y,
                                                        int storage_stride,
                                                        int dx, int dy,
-                                                       int w, int h,
+                                                       int buffer_stride,
                                                        int pass_stride,
                                                        const ccl_global float *ccl_restrict buffer,
                                                        const ccl_global float *ccl_restrict transform,
@@ -33,8 +33,8 @@ ccl_device_inline void kernel_filter_construct_gramian(int x, int y,
                return;
        }
 
-       int p_offset =  y    *w +  x;
-       int q_offset = (y+dy)*w + (x+dx);
+       int p_offset =  y     * buffer_stride +  x;
+       int q_offset = (y+dy) * buffer_stride + (x+dx);
 
 #ifdef __KERNEL_GPU__
        const int stride = storage_stride;
@@ -65,7 +65,7 @@ ccl_device_inline void kernel_filter_construct_gramian(int x, int y,
        math_vec3_add_strided(XtWY, (*rank)+1, design_row, weight * q_color, stride);
 }
 
-ccl_device_inline void kernel_filter_finalize(int x, int y, int w, int h,
+ccl_device_inline void kernel_filter_finalize(int x, int y,
                                               ccl_global float *buffer,
                                               ccl_global int *rank,
                                               int storage_stride,
index bf13ba62806d8d9f6c4e98e29c8499b463da3cfb..4231aba88d7b8a09bb8878afbe7133cd12c9daf0 100644 (file)
@@ -74,7 +74,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
                                                            float *variance,
                                                            float *difference_image,
                                                            int* rect,
-                                                           int w,
+                                                           int stride,
                                                            int channel_offset,
                                                            float a,
                                                            float k_2);
@@ -82,13 +82,13 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
 void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *difference_image,
                                                 float *out_image,
                                                 int* rect,
-                                                int w,
+                                                int stride,
                                                 int f);
 
 void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *difference_image,
                                                        float *out_image,
                                                        int* rect,
-                                                       int w,
+                                                       int stride,
                                                        int f);
 
 void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
@@ -98,7 +98,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
                                                          float *out_image,
                                                          float *accum_image,
                                                          int* rect,
-                                                         int w,
+                                                         int stride,
                                                          int f);
 
 void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
@@ -110,22 +110,19 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
                                                              float *XtWX,
                                                              float3 *XtWY,
                                                              int *rect,
-                                                             int *filter_rect,
-                                                             int w,
-                                                             int h,
+                                                             int *filter_window,
+                                                             int stride,
                                                              int f,
                                                              int pass_stride);
 
 void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *out_image,
                                                      float *accum_image,
                                                      int* rect,
-                                                     int w);
+                                                     int stride);
 
 void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x,
                                                 int y,
                                                 int storage_ofs,
-                                                int w,
-                                                int h,
                                                 float *buffer,
                                                 int *rank,
                                                 float *XtWX,
index 2fbb0ea2bdb0f0a4ea4ff1a315ce9cda3871f841..ab39260784bc582ae49e5ca3e791b1d257b38b5c 100644 (file)
@@ -150,7 +150,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
                                                            float *variance,
                                                            float *difference_image,
                                                            int *rect,
-                                                           int w,
+                                                           int stride,
                                                            int channel_offset,
                                                            float a,
                                                            float k_2)
@@ -158,33 +158,33 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
 #ifdef KERNEL_STUB
        STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_difference);
 #else
-       kernel_filter_nlm_calc_difference(dx, dy, weight_image, variance, difference_image, load_int4(rect), w, channel_offset, a, k_2);
+       kernel_filter_nlm_calc_difference(dx, dy, weight_image, variance, difference_image, load_int4(rect), stride, channel_offset, a, k_2);
 #endif
 }
 
 void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *difference_image,
                                                 float *out_image,
                                                 int *rect,
-                                                int w,
+                                                int stride,
                                                 int f)
 {
 #ifdef KERNEL_STUB
        STUB_ASSERT(KERNEL_ARCH, filter_nlm_blur);
 #else
-       kernel_filter_nlm_blur(difference_image, out_image, load_int4(rect), w, f);
+       kernel_filter_nlm_blur(difference_image, out_image, load_int4(rect), stride, f);
 #endif
 }
 
 void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *difference_image,
                                                        float *out_image,
                                                        int *rect,
-                                                       int w,
+                                                       int stride,
                                                        int f)
 {
 #ifdef KERNEL_STUB
        STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_weight);
 #else
-       kernel_filter_nlm_calc_weight(difference_image, out_image, load_int4(rect), w, f);
+       kernel_filter_nlm_calc_weight(difference_image, out_image, load_int4(rect), stride, f);
 #endif
 }
 
@@ -195,13 +195,13 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
                                                          float *out_image,
                                                          float *accum_image,
                                                          int *rect,
-                                                         int w,
+                                                         int stride,
                                                          int f)
 {
 #ifdef KERNEL_STUB
        STUB_ASSERT(KERNEL_ARCH, filter_nlm_update_output);
 #else
-       kernel_filter_nlm_update_output(dx, dy, difference_image, image, out_image, accum_image, load_int4(rect), w, f);
+       kernel_filter_nlm_update_output(dx, dy, difference_image, image, out_image, accum_image, load_int4(rect), stride, f);
 #endif
 }
 
@@ -214,36 +214,33 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
                                                              float *XtWX,
                                                              float3 *XtWY,
                                                              int *rect,
-                                                             int *filter_rect,
-                                                             int w,
-                                                             int h,
+                                                             int *filter_window,
+                                                             int stride,
                                                              int f,
                                                              int pass_stride)
 {
 #ifdef KERNEL_STUB
        STUB_ASSERT(KERNEL_ARCH, filter_nlm_construct_gramian);
 #else
-    kernel_filter_nlm_construct_gramian(dx, dy, difference_image, buffer, transform, rank, XtWX, XtWY, load_int4(rect), load_int4(filter_rect), w, h, f, pass_stride);
+       kernel_filter_nlm_construct_gramian(dx, dy, difference_image, buffer, transform, rank, XtWX, XtWY, load_int4(rect), load_int4(filter_window), stride, f, pass_stride);
 #endif
 }
 
 void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *out_image,
                                                      float *accum_image,
                                                      int *rect,
-                                                     int w)
+                                                     int stride)
 {
 #ifdef KERNEL_STUB
        STUB_ASSERT(KERNEL_ARCH, filter_nlm_normalize);
 #else
-       kernel_filter_nlm_normalize(out_image, accum_image, load_int4(rect), w);
+       kernel_filter_nlm_normalize(out_image, accum_image, load_int4(rect), stride);
 #endif
 }
 
 void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x,
                                                 int y,
                                                 int storage_ofs,
-                                                int w,
-                                                int h,
                                                 float *buffer,
                                                 int *rank,
                                                 float *XtWX,
@@ -257,7 +254,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x,
        XtWX += storage_ofs*XTWX_SIZE;
        XtWY += storage_ofs*XTWY_SIZE;
        rank += storage_ofs;
-       kernel_filter_finalize(x, y, w, h, buffer, rank, 1, XtWX, XtWY, load_int4(buffer_params), sample);
+       kernel_filter_finalize(x, y, buffer, rank, 1, XtWX, XtWY, load_int4(buffer_params), sample);
 #endif
 }
 
index c8172355a7f5f33a8470de0ddf37f45c1d4507c3..035f0484488c23680a2bc3c55a693c97a9e584b8 100644 (file)
@@ -134,95 +134,140 @@ kernel_cuda_filter_construct_transform(float const* __restrict__ buffer,
 
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_calc_difference(int dx, int dy,
-                                       const float *ccl_restrict weight_image,
+kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image,
                                        const float *ccl_restrict variance_image,
                                        float *difference_image,
-                                       int4 rect, int w,
+                                       int w,
+                                       int h,
+                                       int stride,
+                                       int shift_stride,
+                                       int r,
                                        int channel_offset,
-                                       float a, float k_2)
+                                       float a,
+                                       float k_2)
 {
-       int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
-       int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
-       if(x < rect.z && y < rect.w) {
-               kernel_filter_nlm_calc_difference(x, y, dx, dy, weight_image, variance_image, difference_image, rect, w, channel_offset, a, k_2);
+       int4 co, rect;
+       int ofs;
+       if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+               kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w,
+                                                 weight_image,
+                                                 variance_image,
+                                                 difference_image + ofs,
+                                                 rect, stride,
+                                                 channel_offset, a, k_2);
        }
 }
 
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_blur(const float *ccl_restrict difference_image, float *out_image, int4 rect, int w, int f)
+kernel_cuda_filter_nlm_blur(const float *ccl_restrict difference_image,
+                            float *out_image,
+                            int w,
+                            int h,
+                            int stride,
+                            int shift_stride,
+                            int r,
+                            int f)
 {
-       int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
-       int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
-       if(x < rect.z && y < rect.w) {
-               kernel_filter_nlm_blur(x, y, difference_image, out_image, rect, w, f);
+       int4 co, rect;
+       int ofs;
+       if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+               kernel_filter_nlm_blur(co.x, co.y,
+                                      difference_image + ofs,
+                                      out_image + ofs,
+                                      rect, stride, f);
        }
 }
 
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_calc_weight(const float *ccl_restrict difference_image, float *out_image, int4 rect, int w, int f)
+kernel_cuda_filter_nlm_calc_weight(const float *ccl_restrict difference_image,
+                                   float *out_image,
+                                   int w,
+                                   int h,
+                                   int stride,
+                                   int shift_stride,
+                                   int r,
+                                   int f)
 {
-       int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
-       int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
-       if(x < rect.z && y < rect.w) {
-               kernel_filter_nlm_calc_weight(x, y, difference_image, out_image, rect, w, f);
+       int4 co, rect;
+       int ofs;
+       if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+               kernel_filter_nlm_calc_weight(co.x, co.y,
+                                             difference_image + ofs,
+                                             out_image + ofs,
+                                             rect, stride, f);
        }
 }
 
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_update_output(int dx, int dy,
-                                     const float *ccl_restrict difference_image,
+kernel_cuda_filter_nlm_update_output(const float *ccl_restrict difference_image,
                                      const float *ccl_restrict image,
-                                     float *out_image, float *accum_image,
-                                     int4 rect, int w,
+                                     float *out_image,
+                                     float *accum_image,
+                                     int w,
+                                     int h,
+                                     int stride,
+                                     int shift_stride,
+                                     int r,
                                      int f)
 {
-       int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
-       int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
-       if(x < rect.z && y < rect.w) {
-               kernel_filter_nlm_update_output(x, y, dx, dy, difference_image, image, out_image, accum_image, rect, w, f);
+       int4 co, rect;
+       int ofs;
+       if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+               kernel_filter_nlm_update_output(co.x, co.y, co.z, co.w,
+                                               difference_image + ofs,
+                                               image,
+                                               out_image,
+                                               accum_image,
+                                               rect, stride, f);
        }
 }
 
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_normalize(float *out_image, const float *ccl_restrict accum_image, int4 rect, int w)
+kernel_cuda_filter_nlm_normalize(float *out_image,
+                                 const float *ccl_restrict accum_image,
+                                 int w,
+                                 int h,
+                                 int stride)
 {
-       int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
-       int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
-       if(x < rect.z && y < rect.w) {
-               kernel_filter_nlm_normalize(x, y, out_image, accum_image, rect, w);
+       int x = blockDim.x*blockIdx.x + threadIdx.x;
+       int y = blockDim.y*blockIdx.y + threadIdx.y;
+       if(x < w && y < h) {
+               kernel_filter_nlm_normalize(x, y, out_image, accum_image, stride);
        }
 }
 
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_nlm_construct_gramian(int dx, int dy,
-                                         const float *ccl_restrict difference_image,
+kernel_cuda_filter_nlm_construct_gramian(const float *ccl_restrict difference_image,
                                          const float *ccl_restrict buffer,
                                          float const* __restrict__ transform,
                                          int *rank,
                                          float *XtWX,
                                          float3 *XtWY,
-                                         int4 rect,
-                                         int4 filter_rect,
-                                         int w, int h, int f,
+                                         int4 filter_window,
+                                         int w,
+                                         int h,
+                                         int stride,
+                                         int shift_stride,
+                                         int r,
+                                         int f,
                                          int pass_stride)
 {
-       int x = blockDim.x*blockIdx.x + threadIdx.x + max(0, rect.x-filter_rect.x);
-       int y = blockDim.y*blockIdx.y + threadIdx.y + max(0, rect.y-filter_rect.y);
-       if(x < min(filter_rect.z, rect.z-filter_rect.x) && y < min(filter_rect.w, rect.w-filter_rect.y)) {
-               kernel_filter_nlm_construct_gramian(x, y,
-                                                   dx, dy,
-                                                   difference_image,
+       int4 co, rect;
+       int ofs;
+       if(get_nlm_coords_window(w, h, r, shift_stride, &rect, &co, &ofs, filter_window)) {
+               kernel_filter_nlm_construct_gramian(co.x, co.y,
+                                                   co.z, co.w,
+                                                   difference_image + ofs,
                                                    buffer,
                                                    transform, rank,
                                                    XtWX, XtWY,
-                                                   rect, filter_rect,
-                                                   w, h, f,
+                                                   rect, filter_window,
+                                                   stride, f,
                                                    pass_stride,
                                                    threadIdx.y*blockDim.x + threadIdx.x);
        }
@@ -230,10 +275,12 @@ kernel_cuda_filter_nlm_construct_gramian(int dx, int dy,
 
 extern "C" __global__ void
 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
-kernel_cuda_filter_finalize(int w, int h,
-                            float *buffer, int *rank,
-                            float *XtWX, float3 *XtWY,
-                            int4 filter_area, int4 buffer_params,
+kernel_cuda_filter_finalize(float *buffer,
+                            int *rank,
+                            float *XtWX,
+                            float3 *XtWY,
+                            int4 filter_area,
+                            int4 buffer_params,
                             int sample)
 {
        int x = blockDim.x*blockIdx.x + threadIdx.x;
@@ -243,7 +290,10 @@ kernel_cuda_filter_finalize(int w, int h,
                rank += storage_ofs;
                XtWX += storage_ofs;
                XtWY += storage_ofs;
-               kernel_filter_finalize(x, y, w, h, buffer, rank, filter_area.z*filter_area.w, XtWX, XtWY, buffer_params, sample);
+               kernel_filter_finalize(x, y, buffer, rank,
+                                      filter_area.z*filter_area.w,
+                                      XtWX, XtWY,
+                                      buffer_params, sample);
        }
 }
 
index 7a7b596a35041616277869fe3721030258a1f8b4..2b77807c38bff6dbc3414c4ec26e912ff4e5b051 100644 (file)
@@ -126,113 +126,136 @@ __kernel void kernel_ocl_filter_construct_transform(const ccl_global float *ccl_
        }
 }
 
-__kernel void kernel_ocl_filter_nlm_calc_difference(int dx,
-                                                    int dy,
-                                                    const ccl_global float *ccl_restrict weight_image,
+__kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_restrict weight_image,
                                                     const ccl_global float *ccl_restrict variance_image,
                                                     ccl_global float *difference_image,
-                                                    int4 rect,
                                                     int w,
+                                                    int h,
+                                                    int stride,
+                                                    int shift_stride,
+                                                    int r,
                                                     int channel_offset,
                                                     float a,
                                                     float k_2)
 {
-       int x = get_global_id(0) + rect.x;
-       int y = get_global_id(1) + rect.y;
-       if(x < rect.z && y < rect.w) {
-               kernel_filter_nlm_calc_difference(x, y, dx, dy, weight_image, variance_image, difference_image, rect, w, channel_offset, a, k_2);
+       int4 co, rect;
+       int ofs;
+       if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+               kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w,
+                                                 weight_image,
+                                                 variance_image,
+                                                 difference_image + ofs,
+                                                 rect, stride,
+                                                 channel_offset, a, k_2);
        }
 }
 
 __kernel void kernel_ocl_filter_nlm_blur(const ccl_global float *ccl_restrict difference_image,
                                          ccl_global float *out_image,
-                                         int4 rect,
                                          int w,
+                                         int h,
+                                         int stride,
+                                         int shift_stride,
+                                         int r,
                                          int f)
 {
-       int x = get_global_id(0) + rect.x;
-       int y = get_global_id(1) + rect.y;
-       if(x < rect.z && y < rect.w) {
-               kernel_filter_nlm_blur(x, y, difference_image, out_image, rect, w, f);
+       int4 co, rect;
+       int ofs;
+       if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+               kernel_filter_nlm_blur(co.x, co.y,
+                                      difference_image + ofs,
+                                      out_image + ofs,
+                                      rect, stride, f);
        }
 }
 
 __kernel void kernel_ocl_filter_nlm_calc_weight(const ccl_global float *ccl_restrict difference_image,
                                                 ccl_global float *out_image,
-                                                int4 rect,
                                                 int w,
+                                                int h,
+                                                int stride,
+                                                int shift_stride,
+                                                int r,
                                                 int f)
 {
-       int x = get_global_id(0) + rect.x;
-       int y = get_global_id(1) + rect.y;
-       if(x < rect.z && y < rect.w) {
-               kernel_filter_nlm_calc_weight(x, y, difference_image, out_image, rect, w, f);
+       int4 co, rect;
+       int ofs;
+       if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+               kernel_filter_nlm_calc_weight(co.x, co.y,
+                                             difference_image + ofs,
+                                             out_image + ofs,
+                                             rect, stride, f);
        }
 }
 
-__kernel void kernel_ocl_filter_nlm_update_output(int dx,
-                                                  int dy,
-                                                  const ccl_global float *ccl_restrict difference_image,
+__kernel void kernel_ocl_filter_nlm_update_output(const ccl_global float *ccl_restrict difference_image,
                                                   const ccl_global float *ccl_restrict image,
                                                   ccl_global float *out_image,
                                                   ccl_global float *accum_image,
-                                                  int4 rect,
                                                   int w,
+                                                  int h,
+                                                  int stride,
+                                                  int shift_stride,
+                                                  int r,
                                                   int f)
 {
-       int x = get_global_id(0) + rect.x;
-       int y = get_global_id(1) + rect.y;
-       if(x < rect.z && y < rect.w) {
-               kernel_filter_nlm_update_output(x, y, dx, dy, difference_image, image, out_image, accum_image, rect, w, f);
+       int4 co, rect;
+       int ofs;
+       if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
+               kernel_filter_nlm_update_output(co.x, co.y, co.z, co.w,
+                                               difference_image + ofs,
+                                               image,
+                                               out_image,
+                                               accum_image,
+                                               rect, stride, f);
        }
 }
 
 __kernel void kernel_ocl_filter_nlm_normalize(ccl_global float *out_image,
                                               const ccl_global float *ccl_restrict accum_image,
-                                              int4 rect,
-                                              int w)
+                                              int w,
+                                              int h,
+                                              int stride)
 {
-       int x = get_global_id(0) + rect.x;
-       int y = get_global_id(1) + rect.y;
-       if(x < rect.z && y < rect.w) {
-               kernel_filter_nlm_normalize(x, y, out_image, accum_image, rect, w);
+       int x = get_global_id(0);
+       int y = get_global_id(1);
+       if(x < w && y < h) {
+               kernel_filter_nlm_normalize(x, y, out_image, accum_image, stride);
        }
 }
 
-__kernel void kernel_ocl_filter_nlm_construct_gramian(int dx,
-                                                      int dy,
-                                                      const ccl_global float *ccl_restrict difference_image,
+__kernel void kernel_ocl_filter_nlm_construct_gramian(const ccl_global float *ccl_restrict difference_image,
                                                       const ccl_global float *ccl_restrict buffer,
                                                       const ccl_global float *ccl_restrict transform,
                                                       ccl_global int *rank,
                                                       ccl_global float *XtWX,
                                                       ccl_global float3 *XtWY,
-                                                      int4 rect,
-                                                      int4 filter_rect,
+                                                      int4 filter_window,
                                                       int w,
                                                       int h,
+                                                      int stride,
+                                                      int shift_stride,
+                                                      int r,
                                                       int f,
                                                       int pass_stride)
 {
-       int x = get_global_id(0) + max(0, rect.x-filter_rect.x);
-       int y = get_global_id(1) + max(0, rect.y-filter_rect.y);
-       if(x < min(filter_rect.z, rect.z-filter_rect.x) && y < min(filter_rect.w, rect.w-filter_rect.y)) {
-               kernel_filter_nlm_construct_gramian(x, y,
-                                                   dx, dy,
-                                                   difference_image,
+       int4 co, rect;
+       int ofs;
+       if(get_nlm_coords_window(w, h, r, shift_stride, &rect, &co, &ofs, filter_window)) {
+               kernel_filter_nlm_construct_gramian(co.x, co.y,
+                                                   co.z, co.w,
+                                                   difference_image + ofs,
                                                    buffer,
                                                    transform, rank,
                                                    XtWX, XtWY,
-                                                   rect, filter_rect,
-                                                   w, h, f,
+                                                   rect, filter_window,
+                                                   stride, f,
                                                    pass_stride,
                                                    get_local_id(1)*get_local_size(0) + get_local_id(0));
        }
 }
 
-__kernel void kernel_ocl_filter_finalize(int w,
-                                         int h,
-                                         ccl_global float *buffer,
+__kernel void kernel_ocl_filter_finalize(ccl_global float *buffer,
                                          ccl_global int *rank,
                                          ccl_global float *XtWX,
                                          ccl_global float3 *XtWY,
@@ -247,7 +270,10 @@ __kernel void kernel_ocl_filter_finalize(int w,
                rank += storage_ofs;
                XtWX += storage_ofs;
                XtWY += storage_ofs;
-               kernel_filter_finalize(x, y, w, h, buffer, rank, filter_area.z*filter_area.w, XtWX, XtWY, buffer_params, sample);
+               kernel_filter_finalize(x, y, buffer, rank,
+                                      filter_area.z*filter_area.w,
+                                      XtWX, XtWY,
+                                      buffer_params, sample);
        }
 }
 
index 7f3747a0f5818aa2e901cf8c207042792b1786ae..bc9def7ca41218c5242e7a210be70dfe9239f5e4 100644 (file)
@@ -68,6 +68,7 @@ set(SRC_HEADERS
        util_path.h
        util_progress.h
        util_queue.h
+       util_rect.h
        util_set.h
        util_simd.h
        util_sky_model.cpp
index 39ce6a93982e2eb38144e4822233a373ad415375..d0e91a2a1c9f2875015dec114bdb2200539e4154 100644 (file)
@@ -320,6 +320,8 @@ CCL_NAMESPACE_END
 #include "util/util_math_float3.h"
 #include "util/util_math_float4.h"
 
+#include "util/util_rect.h"
+
 CCL_NAMESPACE_BEGIN
 
 #ifndef __KERNEL_OPENCL__
index b31dbe4fc670cfd5c365af0253b7cef9b1e47c2b..382dad64ea5e6939970f08c8319b248ecf58daf9 100644 (file)
@@ -98,7 +98,10 @@ ccl_device_inline void math_vec3_add(float3 *v, int n, float *x, float3 w)
 ccl_device_inline void math_vec3_add_strided(ccl_global float3 *v, int n, float *x, float3 w, int stride)
 {
        for(int i = 0; i < n; i++) {
-               v[i*stride] += w*x[i];
+               ccl_global float *elem = (ccl_global float*) (v + i*stride);
+               atomic_add_and_fetch_float(elem+0, w.x*x[i]);
+               atomic_add_and_fetch_float(elem+1, w.y*x[i]);
+               atomic_add_and_fetch_float(elem+2, w.z*x[i]);
        }
 }
 
@@ -136,7 +139,7 @@ ccl_device_inline void math_trimatrix_add_gramian_strided(ccl_global float *A,
 {
        for(int row = 0; row < n; row++) {
                for(int col = 0; col <= row; col++) {
-                       MATHS(A, row, col, stride) += v[row]*v[col]*weight;
+                       atomic_add_and_fetch_float(&MATHS(A, row, col, stride), v[row]*v[col]*weight);
                }
        }
 }
diff --git a/intern/cycles/util/util_rect.h b/intern/cycles/util/util_rect.h
new file mode 100644 (file)
index 0000000..17a55a1
--- /dev/null
@@ -0,0 +1,73 @@
+/*
+ * Copyright 2017 Blender Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __UTIL_RECT_H__
+#define __UTIL_RECT_H__
+
+#include "util/util_types.h"
+
+CCL_NAMESPACE_BEGIN
+
+/* Rectangles are represented as a int4 containing the coordinates of the lower-left and
+ * upper-right corners in the order (x0, y0, x1, y1). */
+
+ccl_device_inline int4 rect_from_shape(int x0, int y0, int w, int h)
+{
+       return make_int4(x0, y0, x0 + w, y0 + h);
+}
+
+ccl_device_inline int4 rect_expand(int4 rect, int d)
+{
+       return make_int4(rect.x - d, rect.y - d, rect.z + d, rect.w + d);
+}
+
+/* Returns the intersection of two rects. */
+ccl_device_inline int4 rect_clip(int4 a, int4 b)
+{
+       return make_int4(max(a.x, b.x), max(a.y, b.y), min(a.z, b.z), min(a.w, b.w));
+}
+
+ccl_device_inline bool rect_is_valid(int4 rect)
+{
+       return (rect.z > rect.x) && (rect.w > rect.y);
+}
+
+/* Returns the local row-major index of the pixel inside the rect. */
+ccl_device_inline int coord_to_local_index(int4 rect, int x, int y)
+{
+       int w = rect.z - rect.x;
+       return (y - rect.y) * w + (x - rect.x);
+}
+
+/* Finds the coordinates of a pixel given by its row-major index in the rect,
+ * and returns whether the pixel is inside it. */
+ccl_device_inline bool local_index_to_coord(int4 rect, int idx, int *x, int *y)
+{
+       int w = rect.z - rect.x;
+       *x = (idx % w) + rect.x;
+       *y = (idx / w) + rect.y;
+       return (*y < rect.w);
+}
+
+ccl_device_inline int rect_size(int4 rect)
+{
+       return (rect.z - rect.x) * (rect.w - rect.y);
+}
+
+CCL_NAMESPACE_END
+
+#endif /* __UTIL_RECT_H__ */
+