Cycles: Improve denoising speed on GPUs with small tile sizes
[blender.git] / intern / cycles / kernel / filter / filter_reconstruction.h
index dc90f318570d1f0e847a52b75bd470422d70644e..b7bf322f9ceb12f889d6690e9b5c5e2bb9fea9bd 100644 (file)
@@ -19,11 +19,9 @@ CCL_NAMESPACE_BEGIN
 ccl_device_inline void kernel_filter_construct_gramian(int x, int y,
                                                        int storage_stride,
                                                        int dx, int dy,
-                                                       int w, int h,
+                                                       int buffer_stride,
                                                        int pass_stride,
                                                        const ccl_global float *ccl_restrict buffer,
-                                                       ccl_global float *color_pass,
-                                                       ccl_global float *variance_pass,
                                                        const ccl_global float *ccl_restrict transform,
                                                        ccl_global int *rank,
                                                        float weight,
@@ -31,33 +29,31 @@ ccl_device_inline void kernel_filter_construct_gramian(int x, int y,
                                                        ccl_global float3 *XtWY,
                                                        int localIdx)
 {
-       int p_offset =  y    *w +  x;
-       int q_offset = (y+dy)*w + (x+dx);
+       if(weight < 1e-3f) {
+               return;
+       }
 
-#ifdef __KERNEL_CPU__
-       const int stride = 1;
-       (void)storage_stride;
-       (void)localIdx;
-       float design_row[DENOISE_FEATURES+1];
-#elif defined(__KERNEL_CUDA__)
+       int p_offset =  y     * buffer_stride +  x;
+       int q_offset = (y+dy) * buffer_stride + (x+dx);
+
+#ifdef __KERNEL_GPU__
        const int stride = storage_stride;
+#else
+       const int stride = 1;
+       (void) storage_stride;
+#endif
+
+#ifdef __KERNEL_CUDA__
        ccl_local float shared_design_row[(DENOISE_FEATURES+1)*CCL_MAX_LOCAL_SIZE];
        ccl_local_param float *design_row = shared_design_row + localIdx*(DENOISE_FEATURES+1);
 #else
-       const int stride = storage_stride;
        float design_row[DENOISE_FEATURES+1];
 #endif
 
-       float3 p_color = filter_get_pixel_color(color_pass + p_offset, pass_stride);
-       float3 q_color = filter_get_pixel_color(color_pass + q_offset, pass_stride);
+       float3 q_color = filter_get_color(buffer + q_offset, pass_stride);
 
-       float p_std_dev = sqrtf(filter_get_pixel_variance(variance_pass + p_offset, pass_stride));
-       float q_std_dev = sqrtf(filter_get_pixel_variance(variance_pass + q_offset, pass_stride));
-
-       /* If the pixel was flagged as an outlier during prefiltering, skip it.
-        * Otherwise, perform the regular confidence interval test. */
-       if(ccl_get_feature(buffer + q_offset, 0) < 0.0f ||
-          average(fabs(p_color - q_color)) > 2.0f*(p_std_dev + q_std_dev + 1e-3f)) {
+       /* If the pixel was flagged as an outlier during prefiltering, skip it. */
+       if(ccl_get_feature(buffer + q_offset, 0) < 0.0f) {
                return;
        }
 
@@ -69,7 +65,7 @@ ccl_device_inline void kernel_filter_construct_gramian(int x, int y,
        math_vec3_add_strided(XtWY, (*rank)+1, design_row, weight * q_color, stride);
 }
 
-ccl_device_inline void kernel_filter_finalize(int x, int y, int w, int h,
+ccl_device_inline void kernel_filter_finalize(int x, int y,
                                               ccl_global float *buffer,
                                               ccl_global int *rank,
                                               int storage_stride,
@@ -78,13 +74,19 @@ ccl_device_inline void kernel_filter_finalize(int x, int y, int w, int h,
                                               int4 buffer_params,
                                               int sample)
 {
-#ifdef __KERNEL_CPU__
-       const int stride = 1;
-       (void)storage_stride;
-#else
+#ifdef __KERNEL_GPU__
        const int stride = storage_stride;
+#else
+       const int stride = 1;
+       (void) storage_stride;
 #endif
 
+       if(XtWX[0] < 1e-3f) {
+               /* There is not enough information to determine a denoised result.
+                * As a fallback, keep the original value of the pixel. */
+                return;
+       }
+
        /* The weighted average of pixel colors (essentially, the NLM-filtered image).
         * In case the solution of the linear model fails due to numerical issues,
         * fall back to this value. */
@@ -97,6 +99,9 @@ ccl_device_inline void kernel_filter_finalize(int x, int y, int w, int h,
                final_color = mean_color;
        }
 
+       /* Clamp pixel value to non-negative values. */
+       final_color = max(final_color, make_float3(0.0f, 0.0f, 0.0f));
+
        ccl_global float *combined_buffer = buffer + (y*buffer_params.y + x + buffer_params.x)*buffer_params.z;
        final_color *= sample;
        if(buffer_params.w) {
@@ -109,6 +114,4 @@ ccl_device_inline void kernel_filter_finalize(int x, int y, int w, int h,
        combined_buffer[2] = final_color.z;
 }
 
-#undef STORAGE_TYPE
-
 CCL_NAMESPACE_END