Cycles: Improve denoising speed on GPUs with small tile sizes
[blender.git] / intern / cycles / kernel / filter / filter_nlm_gpu.h
1 /*
2  * Copyright 2011-2017 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 CCL_NAMESPACE_BEGIN
18
19 /* Determines pixel coordinates and offset for the current thread.
20  * Returns whether the thread should do any work.
21  *
22  * All coordinates are relative to the denoising buffer!
23  *
24  * Window is the rect that should be processed.
25  * co is filled with (x, y, dx, dy).
26  */
27 ccl_device_inline bool get_nlm_coords_window(int w, int h, int r, int stride,
28                                              int4 *rect, int4 *co, int *ofs,
29                                              int4 window)
30 {
31         /* Determine the pixel offset that this thread should apply. */
32         int s = 2*r+1;
33         int si = ccl_global_id(1);
34         int sx = si % s;
35         int sy = si / s;
36         if(sy >= s) {
37                 return false;
38         }
39         co->z = sx-r;
40         co->w = sy-r;
41
42         /* Pixels still need to lie inside the denoising buffer after applying the offset,
43          * so determine the area for which this is the case. */
44         *rect = make_int4(max(0, -co->z),     max(0, -co->w),
45                       w - max(0,  co->z), h - max(0,  co->w));
46
47         /* Find the intersection of the area that we want to process (window) and the area
48          * that can be processed (rect) to get the final area for this offset. */
49         int4 clip_area = rect_clip(window, *rect);
50
51         /* If the radius is larger than one of the sides of the window,
52          * there will be shifts for which there is no usable pixel at all. */
53         if(!rect_is_valid(clip_area)) {
54                 return false;
55         }
56
57         /* Map the linear thread index to pixels inside the clip area. */
58         int x, y;
59         if(!local_index_to_coord(clip_area, ccl_global_id(0), &x, &y)) {
60                 return false;
61         }
62         co->x = x;
63         co->y = y;
64
65         *ofs = (sy*s + sx) * stride;
66
67         return true;
68 }
69
70 ccl_device_inline bool get_nlm_coords(int w, int h, int r, int stride,
71                                       int4 *rect, int4 *co, int *ofs)
72 {
73         return get_nlm_coords_window(w, h, r, stride, rect, co, ofs, make_int4(0, 0, w, h));
74 }
75
76 ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y,
77                                                          int dx, int dy,
78                                                          const ccl_global float *ccl_restrict weight_image,
79                                                          const ccl_global float *ccl_restrict variance_image,
80                                                          ccl_global float *difference_image,
81                                                          int4 rect, int stride,
82                                                          int channel_offset,
83                                                          float a, float k_2)
84 {
85         float diff = 0.0f;
86         int numChannels = channel_offset? 3 : 1;
87         for(int c = 0; c < numChannels; c++) {
88                 float cdiff = weight_image[c*channel_offset + y*stride + x] - weight_image[c*channel_offset + (y+dy)*stride + (x+dx)];
89                 float pvar = variance_image[c*channel_offset + y*stride + x];
90                 float qvar = variance_image[c*channel_offset + (y+dy)*stride + (x+dx)];
91                 diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar));
92         }
93         if(numChannels > 1) {
94                 diff *= 1.0f/numChannels;
95         }
96         difference_image[y*stride + x] = diff;
97 }
98
99 ccl_device_inline void kernel_filter_nlm_blur(int x, int y,
100                                               const ccl_global float *ccl_restrict difference_image,
101                                               ccl_global float *out_image,
102                                               int4 rect, int stride, int f)
103 {
104         float sum = 0.0f;
105         const int low = max(rect.y, y-f);
106         const int high = min(rect.w, y+f+1);
107         for(int y1 = low; y1 < high; y1++) {
108                 sum += difference_image[y1*stride + x];
109         }
110         sum *= 1.0f/(high-low);
111         out_image[y*stride + x] = sum;
112 }
113
114 ccl_device_inline void kernel_filter_nlm_calc_weight(int x, int y,
115                                                      const ccl_global float *ccl_restrict difference_image,
116                                                      ccl_global float *out_image,
117                                                      int4 rect, int stride, int f)
118 {
119         float sum = 0.0f;
120         const int low = max(rect.x, x-f);
121         const int high = min(rect.z, x+f+1);
122         for(int x1 = low; x1 < high; x1++) {
123                 sum += difference_image[y*stride + x1];
124         }
125         sum *= 1.0f/(high-low);
126         out_image[y*stride + x] = fast_expf(-max(sum, 0.0f));
127 }
128
129 ccl_device_inline void kernel_filter_nlm_update_output(int x, int y,
130                                                        int dx, int dy,
131                                                        const ccl_global float *ccl_restrict difference_image,
132                                                        const ccl_global float *ccl_restrict image,
133                                                        ccl_global float *out_image,
134                                                        ccl_global float *accum_image,
135                                                        int4 rect, int stride, int f)
136 {
137         float sum = 0.0f;
138         const int low = max(rect.x, x-f);
139         const int high = min(rect.z, x+f+1);
140         for(int x1 = low; x1 < high; x1++) {
141                 sum += difference_image[y*stride + x1];
142         }
143         sum *= 1.0f/(high-low);
144         if(out_image) {
145                 atomic_add_and_fetch_float(accum_image + y*stride + x, sum);
146                 atomic_add_and_fetch_float(out_image + y*stride + x, sum*image[(y+dy)*stride + (x+dx)]);
147         }
148         else {
149                 accum_image[y*stride + x] = sum;
150         }
151 }
152
153 ccl_device_inline void kernel_filter_nlm_construct_gramian(int x, int y,
154                                                            int dx, int dy,
155                                                            const ccl_global float *ccl_restrict difference_image,
156                                                            const ccl_global float *ccl_restrict buffer,
157                                                            const ccl_global float *ccl_restrict transform,
158                                                            ccl_global int *rank,
159                                                            ccl_global float *XtWX,
160                                                            ccl_global float3 *XtWY,
161                                                            int4 rect,
162                                                            int4 filter_window,
163                                                            int stride, int f,
164                                                            int pass_stride,
165                                                            int localIdx)
166 {
167         const int low = max(rect.x, x-f);
168         const int high = min(rect.z, x+f+1);
169         float sum = 0.0f;
170         for(int x1 = low; x1 < high; x1++) {
171                 sum += difference_image[y*stride + x1];
172         }
173         float weight = sum * (1.0f/(high - low));
174
175         /* Reconstruction data is only stored for pixels inside the filter window,
176          * so compute the pixels's index in there. */
177         int storage_ofs = coord_to_local_index(filter_window, x, y);
178         transform += storage_ofs;
179         rank += storage_ofs;
180         XtWX += storage_ofs;
181         XtWY += storage_ofs;
182
183         kernel_filter_construct_gramian(x, y,
184                                         rect_size(filter_window),
185                                         dx, dy,
186                                         stride,
187                                         pass_stride,
188                                         buffer,
189                                         transform, rank,
190                                         weight, XtWX, XtWY,
191                                         localIdx);
192 }
193
194 ccl_device_inline void kernel_filter_nlm_normalize(int x, int y,
195                                                    ccl_global float *out_image,
196                                                    const ccl_global float *ccl_restrict accum_image,
197                                                    int stride)
198 {
199         out_image[y*stride + x] /= accum_image[y*stride + x];
200 }
201
202 CCL_NAMESPACE_END