Code refactor: use split variance calculation for mega kernels too.
authorBrecht Van Lommel <brechtvanlommel@gmail.com>
Tue, 26 Sep 2017 23:03:50 +0000 (01:03 +0200)
committerBrecht Van Lommel <brechtvanlommel@gmail.com>
Wed, 4 Oct 2017 19:11:14 +0000 (21:11 +0200)
There is no significant difference in denoised benchmark scenes and
denoising ctests, so might as well make it all consistent.

intern/cycles/device/device_cpu.cpp
intern/cycles/device/device_cuda.cpp
intern/cycles/device/opencl/opencl_base.cpp
intern/cycles/kernel/filter/filter_prefilter.h
intern/cycles/kernel/kernel_passes.h
intern/cycles/kernel/kernels/cpu/filter_cpu.h
intern/cycles/kernel/kernels/cpu/filter_cpu_impl.h
intern/cycles/kernel/kernels/cuda/filter.cu
intern/cycles/kernel/kernels/opencl/filter.cl

index 72330b02a28fda3c29609848a9527cf1a3e04ce2..ff34f4f9ce471d7c7ef7fd8e9655121491f32667 100644 (file)
@@ -176,10 +176,10 @@ public:
        KernelFunctions<void(*)(KernelGlobals *, uchar4 *, float *, float, int, int, int, int)>       convert_to_byte_kernel;
        KernelFunctions<void(*)(KernelGlobals *, uint4 *, float4 *, float*, int, int, int, int, int)> shader_kernel;
 
-       KernelFunctions<void(*)(int, TilesInfo*, int, int, float*, float*, float*, float*, float*, int*, int, int, bool)> filter_divide_shadow_kernel;
-       KernelFunctions<void(*)(int, TilesInfo*, int, int, int, int, float*, float*, int*, int, int, bool)>               filter_get_feature_kernel;
-       KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)>                                     filter_detect_outliers_kernel;
-       KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)>                                     filter_combine_halves_kernel;
+       KernelFunctions<void(*)(int, TilesInfo*, int, int, float*, float*, float*, float*, float*, int*, int, int)> filter_divide_shadow_kernel;
+       KernelFunctions<void(*)(int, TilesInfo*, int, int, int, int, float*, float*, int*, int, int)>               filter_get_feature_kernel;
+       KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)>                               filter_detect_outliers_kernel;
+       KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int)>                               filter_combine_halves_kernel;
 
        KernelFunctions<void(*)(int, int, float*, float*, float*, int*, int, int, float, float)> filter_nlm_calc_difference_kernel;
        KernelFunctions<void(*)(float*, float*, int*, int, int)>                                 filter_nlm_blur_kernel;
@@ -563,8 +563,7 @@ public:
                                                              (float*) buffer_variance_ptr,
                                                              &task->rect.x,
                                                              task->render_buffer.pass_stride,
-                                                             task->render_buffer.denoising_data_offset,
-                                                             use_split_kernel);
+                                                             task->render_buffer.denoising_data_offset);
                        }
                }
                return true;
@@ -587,8 +586,7 @@ public:
                                                            (float*) variance_ptr,
                                                            &task->rect.x,
                                                            task->render_buffer.pass_stride,
-                                                           task->render_buffer.denoising_data_offset,
-                                                           use_split_kernel);
+                                                           task->render_buffer.denoising_data_offset);
                        }
                }
                return true;
index e5464dcf34ec2ffe7fae1c4bdfe909927c0497fb..54e012191ae220e2da23f0a045dab99adf5bd434 100644 (file)
@@ -1173,7 +1173,6 @@ public:
                                   task->rect.z-task->rect.x,
                                   task->rect.w-task->rect.y);
 
-               bool use_split_variance = use_split_kernel();
                void *args[] = {&task->render_buffer.samples,
                                &task->tiles_mem.device_pointer,
                                &a_ptr,
@@ -1183,8 +1182,7 @@ public:
                                &buffer_variance_ptr,
                                &task->rect,
                                &task->render_buffer.pass_stride,
-                               &task->render_buffer.denoising_data_offset,
-                               &use_split_variance};
+                               &task->render_buffer.denoising_data_offset};
                CUDA_LAUNCH_KERNEL(cuFilterDivideShadow, args);
                cuda_assert(cuCtxSynchronize());
 
@@ -1209,7 +1207,6 @@ public:
                                   task->rect.z-task->rect.x,
                                   task->rect.w-task->rect.y);
 
-               bool use_split_variance = use_split_kernel();
                void *args[] = {&task->render_buffer.samples,
                                &task->tiles_mem.device_pointer,
                                        &mean_offset,
@@ -1218,8 +1215,7 @@ public:
                                &variance_ptr,
                                &task->rect,
                                &task->render_buffer.pass_stride,
-                               &task->render_buffer.denoising_data_offset,
-                               &use_split_variance};
+                               &task->render_buffer.denoising_data_offset};
                CUDA_LAUNCH_KERNEL(cuFilterGetFeature, args);
                cuda_assert(cuCtxSynchronize());
 
index 7bdf81462b8a8a5fba28fce11f36cf3623b925a1..8095611f09977e7c03357878662d1eeb26d8a325 100644 (file)
@@ -982,7 +982,6 @@ bool OpenCLDeviceBase::denoising_divide_shadow(device_ptr a_ptr,
 
        cl_kernel ckFilterDivideShadow = denoising_program(ustring("filter_divide_shadow"));
 
-       char split_kernel = is_split_kernel()? 1 : 0;
        kernel_set_args(ckFilterDivideShadow, 0,
                        task->render_buffer.samples,
                        tiles_mem,
@@ -993,8 +992,7 @@ bool OpenCLDeviceBase::denoising_divide_shadow(device_ptr a_ptr,
                        buffer_variance_mem,
                        task->rect,
                        task->render_buffer.pass_stride,
-                       task->render_buffer.denoising_data_offset,
-                       split_kernel);
+                       task->render_buffer.denoising_data_offset);
        enqueue_kernel(ckFilterDivideShadow,
                       task->rect.z-task->rect.x,
                       task->rect.w-task->rect.y);
@@ -1015,7 +1013,6 @@ bool OpenCLDeviceBase::denoising_get_feature(int mean_offset,
 
        cl_kernel ckFilterGetFeature = denoising_program(ustring("filter_get_feature"));
 
-       char split_kernel = is_split_kernel()? 1 : 0;
        kernel_set_args(ckFilterGetFeature, 0,
                        task->render_buffer.samples,
                        tiles_mem,
@@ -1025,8 +1022,7 @@ bool OpenCLDeviceBase::denoising_get_feature(int mean_offset,
                        variance_mem,
                        task->rect,
                        task->render_buffer.pass_stride,
-                       task->render_buffer.denoising_data_offset,
-                       split_kernel);
+                       task->render_buffer.denoising_data_offset);
        enqueue_kernel(ckFilterGetFeature,
                       task->rect.z-task->rect.x,
                       task->rect.w-task->rect.y);
index 2aeb54a62be3c75ca1a7f3a1ac41bbd8becf80ad..eefcbfea23087b0aa335cb77cc03fbff39679b55 100644 (file)
@@ -35,8 +35,7 @@ ccl_device void kernel_filter_divide_shadow(int sample,
                                             ccl_global float *bufferVariance,
                                             int4 rect,
                                             int buffer_pass_stride,
-                                            int buffer_denoising_offset,
-                                            bool use_split_variance)
+                                            int buffer_denoising_offset)
 {
        int xtile = (x < tiles->x[1])? 0: ((x < tiles->x[2])? 1: 2);
        int ytile = (y < tiles->y[1])? 0: ((y < tiles->y[2])? 1: 2);
@@ -57,10 +56,12 @@ ccl_device void kernel_filter_divide_shadow(int sample,
        float varB = center_buffer[5];
        int odd_sample = (sample+1)/2;
        int even_sample = sample/2;
-       if(use_split_variance) {
-               varA = max(0.0f, varA - unfilteredA[idx]*unfilteredA[idx]*odd_sample);
-               varB = max(0.0f, varB - unfilteredB[idx]*unfilteredB[idx]*even_sample);
-       }
+
+       /* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance
+        * update does not work efficiently with atomics in the kernel. */
+       varA = max(0.0f, varA - unfilteredA[idx]*unfilteredA[idx]*odd_sample);
+       varB = max(0.0f, varB - unfilteredB[idx]*unfilteredB[idx]*even_sample);
+
        varA /= max(odd_sample - 1, 1);
        varB /= max(even_sample - 1, 1);
 
@@ -84,8 +85,7 @@ ccl_device void kernel_filter_get_feature(int sample,
                                           ccl_global float *mean,
                                           ccl_global float *variance,
                                           int4 rect, int buffer_pass_stride,
-                                          int buffer_denoising_offset,
-                                          bool use_split_variance)
+                                          int buffer_denoising_offset)
 {
        int xtile = (x < tiles->x[1])? 0: ((x < tiles->x[2])? 1: 2);
        int ytile = (y < tiles->y[1])? 0: ((y < tiles->y[2])? 1: 2);
@@ -97,12 +97,9 @@ ccl_device void kernel_filter_get_feature(int sample,
 
        mean[idx] = center_buffer[m_offset] / sample;
        if(sample > 1) {
-               if(use_split_variance) {
-                       variance[idx] = max(0.0f, (center_buffer[v_offset] - mean[idx]*mean[idx]*sample) / (sample * (sample-1)));
-               }
-               else {
-                       variance[idx] = center_buffer[v_offset] / (sample * (sample-1));
-               }
+               /* Approximate variance as E[x^2] - 1/N * (E[x])^2, since online variance
+                * update does not work efficiently with atomics in the kernel. */
+               variance[idx] = max(0.0f, (center_buffer[v_offset] - mean[idx]*mean[idx]*sample) / (sample * (sample-1)));
        }
        else {
                /* Can't compute variance with single sample, just set it very high. */
index fff7f4cfdb750198960e330840b55276fe6afade..bd756185e783c801a39d94a50967a71de236bae9 100644 (file)
@@ -67,18 +67,7 @@ ccl_device_inline void kernel_write_pass_float_variance(ccl_global float *buffer
 
        /* The online one-pass variance update that's used for the megakernel can't easily be implemented
         * with atomics, so for the split kernel the E[x^2] - 1/N * (E[x])^2 fallback is used. */
-#  ifdef __SPLIT_KERNEL__
        kernel_write_pass_float(buffer+1, sample, value*value);
-#  else
-       if(sample == 0) {
-               kernel_write_pass_float(buffer+1, sample, 0.0f);
-       }
-       else {
-               float new_mean = buffer[0] * (1.0f / (sample + 1));
-               float old_mean = (buffer[0] - value) * (1.0f / sample);
-               kernel_write_pass_float(buffer+1, sample, (value - new_mean) * (value - old_mean));
-       }
-#  endif
 }
 
 #  if defined(__SPLIT_KERNEL__)
@@ -95,19 +84,7 @@ ccl_device_inline void kernel_write_pass_float3_unaligned(ccl_global float *buff
 ccl_device_inline void kernel_write_pass_float3_variance(ccl_global float *buffer, int sample, float3 value)
 {
        kernel_write_pass_float3_unaligned(buffer, sample, value);
-#  ifdef __SPLIT_KERNEL__
        kernel_write_pass_float3_unaligned(buffer+3, sample, value*value);
-#  else
-       if(sample == 0) {
-               kernel_write_pass_float3_unaligned(buffer+3, sample, make_float3(0.0f, 0.0f, 0.0f));
-       }
-       else {
-               float3 sum = make_float3(buffer[0], buffer[1], buffer[2]);
-               float3 new_mean = sum * (1.0f / (sample + 1));
-               float3 old_mean = (sum - value) * (1.0f / sample);
-               kernel_write_pass_float3_unaligned(buffer+3, sample, (value - new_mean) * (value - old_mean));
-       }
-#  endif
 }
 
 ccl_device_inline void kernel_write_denoising_shadow(KernelGlobals *kg, ccl_global float *buffer,
@@ -125,18 +102,7 @@ ccl_device_inline void kernel_write_denoising_shadow(KernelGlobals *kg, ccl_glob
        kernel_write_pass_float(buffer+1, sample/2, path_total_shaded);
 
        float value = path_total_shaded / max(path_total, 1e-7f);
-#  ifdef __SPLIT_KERNEL__
        kernel_write_pass_float(buffer+2, sample/2, value*value);
-#  else
-       if(sample < 2) {
-               kernel_write_pass_float(buffer+2, sample/2, 0.0f);
-       }
-       else {
-               float old_value = (buffer[1] - path_total_shaded) / max(buffer[0] - path_total, 1e-7f);
-               float new_value = buffer[1] / max(buffer[0], 1e-7f);
-               kernel_write_pass_float(buffer+2, sample, (value - new_value) * (value - old_value));
-       }
-#  endif
 }
 #endif /* __DENOISING_FEATURES__ */
 
index 2ed713299fd88d526673fd60deb2e7b0296afb92..bf13ba62806d8d9f6c4e98e29c8499b463da3cfb 100644 (file)
@@ -27,8 +27,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample,
                                                      float *bufferV,
                                                      int* prefilter_rect,
                                                      int buffer_pass_stride,
-                                                     int buffer_denoising_offset,
-                                                     bool use_split_variance);
+                                                     int buffer_denoising_offset);
 
 void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
                                                    TilesInfo *tiles,
@@ -40,8 +39,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
                                                    float *variance,
                                                    int* prefilter_rect,
                                                    int buffer_pass_stride,
-                                                   int buffer_denoising_offset,
-                                                   bool use_split_variance);
+                                                   int buffer_denoising_offset);
 
 void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x, int y,
                                                        ccl_global float *image,
index 8dc1a8d583c015e6dd3ee49b6039dac31b9e1ede..2fbb0ea2bdb0f0a4ea4ff1a315ce9cda3871f841 100644 (file)
@@ -45,8 +45,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample,
                                                      float *bufferVariance,
                                                      int* prefilter_rect,
                                                      int buffer_pass_stride,
-                                                     int buffer_denoising_offset,
-                                                     bool use_split_variance)
+                                                     int buffer_denoising_offset)
 {
 #ifdef KERNEL_STUB
        STUB_ASSERT(KERNEL_ARCH, filter_divide_shadow);
@@ -60,8 +59,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample,
                                    bufferVariance,
                                    load_int4(prefilter_rect),
                                    buffer_pass_stride,
-                                   buffer_denoising_offset,
-                                   use_split_variance);
+                                   buffer_denoising_offset);
 #endif
 }
 
@@ -74,8 +72,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
                                                    float *mean, float *variance,
                                                    int* prefilter_rect,
                                                    int buffer_pass_stride,
-                                                   int buffer_denoising_offset,
-                                                   bool use_split_variance)
+                                                   int buffer_denoising_offset)
 {
 #ifdef KERNEL_STUB
        STUB_ASSERT(KERNEL_ARCH, filter_get_feature);
@@ -86,8 +83,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
                                  mean, variance,
                                  load_int4(prefilter_rect),
                                  buffer_pass_stride,
-                                 buffer_denoising_offset,
-                                 use_split_variance);
+                                 buffer_denoising_offset);
 #endif
 }
 
index 009c3fde9d576d4864561718390eaf4d52c9d6d2..c8172355a7f5f33a8470de0ddf37f45c1d4507c3 100644 (file)
@@ -37,8 +37,7 @@ kernel_cuda_filter_divide_shadow(int sample,
                                  float *bufferVariance,
                                  int4 prefilter_rect,
                                  int buffer_pass_stride,
-                                 int buffer_denoising_offset,
-                                 bool use_split_variance)
+                                 int buffer_denoising_offset)
 {
        int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
        int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
@@ -53,8 +52,7 @@ kernel_cuda_filter_divide_shadow(int sample,
                                            bufferVariance,
                                            prefilter_rect,
                                            buffer_pass_stride,
-                                           buffer_denoising_offset,
-                                           use_split_variance);
+                                           buffer_denoising_offset);
        }
 }
 
@@ -68,8 +66,7 @@ kernel_cuda_filter_get_feature(int sample,
                                float *variance,
                                int4 prefilter_rect,
                                int buffer_pass_stride,
-                               int buffer_denoising_offset,
-                               bool use_split_variance)
+                               int buffer_denoising_offset)
 {
        int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
        int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
@@ -81,8 +78,7 @@ kernel_cuda_filter_get_feature(int sample,
                                          mean, variance,
                                          prefilter_rect,
                                          buffer_pass_stride,
-                                         buffer_denoising_offset,
-                                         use_split_variance);
+                                         buffer_denoising_offset);
        }
 }
 
index f015ac47d8a767ab7d663dccba61ec60853823f3..7a7b596a35041616277869fe3721030258a1f8b4 100644 (file)
@@ -31,8 +31,7 @@ __kernel void kernel_ocl_filter_divide_shadow(int sample,
                                               ccl_global float *bufferVariance,
                                               int4 prefilter_rect,
                                               int buffer_pass_stride,
-                                              int buffer_denoising_offset,
-                                              char use_split_variance)
+                                              int buffer_denoising_offset)
 {
        int x = prefilter_rect.x + get_global_id(0);
        int y = prefilter_rect.y + get_global_id(1);
@@ -47,8 +46,7 @@ __kernel void kernel_ocl_filter_divide_shadow(int sample,
                                            bufferVariance,
                                            prefilter_rect,
                                            buffer_pass_stride,
-                                           buffer_denoising_offset,
-                                           use_split_variance);
+                                           buffer_denoising_offset);
        }
 }
 
@@ -60,8 +58,7 @@ __kernel void kernel_ocl_filter_get_feature(int sample,
                                             ccl_global float *variance,
                                             int4 prefilter_rect,
                                             int buffer_pass_stride,
-                                            int buffer_denoising_offset,
-                                            char use_split_variance)
+                                            int buffer_denoising_offset)
 {
        int x = prefilter_rect.x + get_global_id(0);
        int y = prefilter_rect.y + get_global_id(1);
@@ -73,8 +70,7 @@ __kernel void kernel_ocl_filter_get_feature(int sample,
                                          mean, variance,
                                          prefilter_rect,
                                          buffer_pass_stride,
-                                         buffer_denoising_offset,
-                                         use_split_variance);
+                                         buffer_denoising_offset);
        }
 }