Cycles: Improve denoising speed on GPUs with small tile sizes
[blender.git] / intern / cycles / kernel / filter / filter_reconstruction.h
1 /*
2  * Copyright 2011-2017 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 CCL_NAMESPACE_BEGIN
18
19 ccl_device_inline void kernel_filter_construct_gramian(int x, int y,
20                                                        int storage_stride,
21                                                        int dx, int dy,
22                                                        int buffer_stride,
23                                                        int pass_stride,
24                                                        const ccl_global float *ccl_restrict buffer,
25                                                        const ccl_global float *ccl_restrict transform,
26                                                        ccl_global int *rank,
27                                                        float weight,
28                                                        ccl_global float *XtWX,
29                                                        ccl_global float3 *XtWY,
30                                                        int localIdx)
31 {
32         if(weight < 1e-3f) {
33                 return;
34         }
35
36         int p_offset =  y     * buffer_stride +  x;
37         int q_offset = (y+dy) * buffer_stride + (x+dx);
38
39 #ifdef __KERNEL_GPU__
40         const int stride = storage_stride;
41 #else
42         const int stride = 1;
43         (void) storage_stride;
44 #endif
45
46 #ifdef __KERNEL_CUDA__
47         ccl_local float shared_design_row[(DENOISE_FEATURES+1)*CCL_MAX_LOCAL_SIZE];
48         ccl_local_param float *design_row = shared_design_row + localIdx*(DENOISE_FEATURES+1);
49 #else
50         float design_row[DENOISE_FEATURES+1];
51 #endif
52
53         float3 q_color = filter_get_color(buffer + q_offset, pass_stride);
54
55         /* If the pixel was flagged as an outlier during prefiltering, skip it. */
56         if(ccl_get_feature(buffer + q_offset, 0) < 0.0f) {
57                 return;
58         }
59
60         filter_get_design_row_transform(make_int2(x, y),       buffer + p_offset,
61                                         make_int2(x+dx, y+dy), buffer + q_offset,
62                                         pass_stride, *rank, design_row, transform, stride);
63
64         math_trimatrix_add_gramian_strided(XtWX, (*rank)+1, design_row, weight, stride);
65         math_vec3_add_strided(XtWY, (*rank)+1, design_row, weight * q_color, stride);
66 }
67
68 ccl_device_inline void kernel_filter_finalize(int x, int y,
69                                               ccl_global float *buffer,
70                                               ccl_global int *rank,
71                                               int storage_stride,
72                                               ccl_global float *XtWX,
73                                               ccl_global float3 *XtWY,
74                                               int4 buffer_params,
75                                               int sample)
76 {
77 #ifdef __KERNEL_GPU__
78         const int stride = storage_stride;
79 #else
80         const int stride = 1;
81         (void) storage_stride;
82 #endif
83
84         if(XtWX[0] < 1e-3f) {
85                 /* There is not enough information to determine a denoised result.
86                  * As a fallback, keep the original value of the pixel. */
87                  return;
88         }
89
90         /* The weighted average of pixel colors (essentially, the NLM-filtered image).
91          * In case the solution of the linear model fails due to numerical issues,
92          * fall back to this value. */
93         float3 mean_color = XtWY[0]/XtWX[0];
94
95         math_trimatrix_vec3_solve(XtWX, XtWY, (*rank)+1, stride);
96
97         float3 final_color = XtWY[0];
98         if(!isfinite3_safe(final_color)) {
99                 final_color = mean_color;
100         }
101
102         /* Clamp pixel value to positive values. */
103         final_color = max(final_color, make_float3(0.0f, 0.0f, 0.0f));
104
105         ccl_global float *combined_buffer = buffer + (y*buffer_params.y + x + buffer_params.x)*buffer_params.z;
106         final_color *= sample;
107         if(buffer_params.w) {
108                 final_color.x += combined_buffer[buffer_params.w+0];
109                 final_color.y += combined_buffer[buffer_params.w+1];
110                 final_color.z += combined_buffer[buffer_params.w+2];
111         }
112         combined_buffer[0] = final_color.x;
113         combined_buffer[1] = final_color.y;
114         combined_buffer[2] = final_color.z;
115 }
116
117 CCL_NAMESPACE_END