Cycles: Improve denoising speed on GPUs with small tile sizes
[blender.git] / intern / cycles / kernel / filter / filter_reconstruction.h
index 02f3802fa0cda2897c11323662d2a77d806f6f17..b7bf322f9ceb12f889d6690e9b5c5e2bb9fea9bd 100644 (file)
@@ -19,42 +19,41 @@ CCL_NAMESPACE_BEGIN
 ccl_device_inline void kernel_filter_construct_gramian(int x, int y,
                                                        int storage_stride,
                                                        int dx, int dy,
-                                                       int w, int h,
+                                                       int buffer_stride,
                                                        int pass_stride,
-                                                       ccl_global float ccl_restrict_ptr buffer,
-                                                       ccl_global float *color_pass,
-                                                       ccl_global float *variance_pass,
-                                                       ccl_global float ccl_restrict_ptr transform,
+                                                       const ccl_global float *ccl_restrict buffer,
+                                                       const ccl_global float *ccl_restrict transform,
                                                        ccl_global int *rank,
                                                        float weight,
                                                        ccl_global float *XtWX,
                                                        ccl_global float3 *XtWY,
                                                        int localIdx)
 {
-       int p_offset =  y    *w +  x;
-       int q_offset = (y+dy)*w + (x+dx);
+       if(weight < 1e-3f) {
+               return;
+       }
 
-#ifdef __KERNEL_CPU__
-       const int stride = 1;
-       (void)storage_stride;
-       (void)localIdx;
-       float design_row[DENOISE_FEATURES+1];
-#elif defined(__KERNEL_CUDA__)
+       int p_offset =  y     * buffer_stride +  x;
+       int q_offset = (y+dy) * buffer_stride + (x+dx);
+
+#ifdef __KERNEL_GPU__
        const int stride = storage_stride;
+#else
+       const int stride = 1;
+       (void) storage_stride;
+#endif
+
+#ifdef __KERNEL_CUDA__
        ccl_local float shared_design_row[(DENOISE_FEATURES+1)*CCL_MAX_LOCAL_SIZE];
        ccl_local_param float *design_row = shared_design_row + localIdx*(DENOISE_FEATURES+1);
 #else
-       const int stride = storage_stride;
        float design_row[DENOISE_FEATURES+1];
 #endif
 
-       float3 p_color = filter_get_pixel_color(color_pass + p_offset, pass_stride);
-       float3 q_color = filter_get_pixel_color(color_pass + q_offset, pass_stride);
-
-       float p_std_dev = sqrtf(filter_get_pixel_variance(variance_pass + p_offset, pass_stride));
-       float q_std_dev = sqrtf(filter_get_pixel_variance(variance_pass + q_offset, pass_stride));
+       float3 q_color = filter_get_color(buffer + q_offset, pass_stride);
 
-       if(average(fabs(p_color - q_color)) > 3.0f*(p_std_dev + q_std_dev + 1e-3f)) {
+       /* If the pixel was flagged as an outlier during prefiltering, skip it. */
+       if(ccl_get_feature(buffer + q_offset, 0) < 0.0f) {
                return;
        }
 
@@ -66,7 +65,7 @@ ccl_device_inline void kernel_filter_construct_gramian(int x, int y,
        math_vec3_add_strided(XtWY, (*rank)+1, design_row, weight * q_color, stride);
 }
 
-ccl_device_inline void kernel_filter_finalize(int x, int y, int w, int h,
+ccl_device_inline void kernel_filter_finalize(int x, int y,
                                               ccl_global float *buffer,
                                               ccl_global int *rank,
                                               int storage_stride,
@@ -75,16 +74,33 @@ ccl_device_inline void kernel_filter_finalize(int x, int y, int w, int h,
                                               int4 buffer_params,
                                               int sample)
 {
-#ifdef __KERNEL_CPU__
-       const int stride = 1;
-       (void)storage_stride;
-#else
+#ifdef __KERNEL_GPU__
        const int stride = storage_stride;
+#else
+       const int stride = 1;
+       (void) storage_stride;
 #endif
 
+       if(XtWX[0] < 1e-3f) {
+               /* There is not enough information to determine a denoised result.
+                * As a fallback, keep the original value of the pixel. */
+                return;
+       }
+
+       /* The weighted average of pixel colors (essentially, the NLM-filtered image).
+        * In case the solution of the linear model fails due to numerical issues,
+        * fall back to this value. */
+       float3 mean_color = XtWY[0]/XtWX[0];
+
        math_trimatrix_vec3_solve(XtWX, XtWY, (*rank)+1, stride);
 
        float3 final_color = XtWY[0];
+       if(!isfinite3_safe(final_color)) {
+               final_color = mean_color;
+       }
+
+       /* Clamp pixel value to non-negative values (negative components become zero). */
+       final_color = max(final_color, make_float3(0.0f, 0.0f, 0.0f));
 
        ccl_global float *combined_buffer = buffer + (y*buffer_params.y + x + buffer_params.x)*buffer_params.z;
        final_color *= sample;
@@ -98,6 +114,4 @@ ccl_device_inline void kernel_filter_finalize(int x, int y, int w, int h,
        combined_buffer[2] = final_color.z;
 }
 
-#undef STORAGE_TYPE
-
 CCL_NAMESPACE_END