/* blender.git: intern/cycles/kernel/kernels/cuda/filter.cu
 * Cycles Denoising: Cleanup: Rename tiles to tile_info */
/*
 * Copyright 2011-2017 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* CUDA kernel entry points */

19 #ifdef __CUDA_ARCH__
20
21 #include "kernel_config.h"
22
23 #include "kernel/kernel_compat_cuda.h"
24
25 #include "kernel/filter/filter_kernel.h"
26
27 /* kernels */
28
/* Entry point for the shadow-divide prefilter step: forwards every pixel
 * inside prefilter_rect to kernel_filter_divide_shadow(). Launched with one
 * thread per pixel; out-of-rect threads exit immediately. */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_divide_shadow(int sample,
                                 TileInfo *tile_info,
                                 float *unfilteredA,
                                 float *unfilteredB,
                                 float *sampleVariance,
                                 float *sampleVarianceV,
                                 float *bufferVariance,
                                 int4 prefilter_rect,
                                 int buffer_pass_stride,
                                 int buffer_denoising_offset)
{
        const int px = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
        const int py = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
        if(px >= prefilter_rect.z || py >= prefilter_rect.w) {
                return;
        }
        kernel_filter_divide_shadow(sample,
                                    tile_info,
                                    px, py,
                                    unfilteredA,
                                    unfilteredB,
                                    sampleVariance,
                                    sampleVarianceV,
                                    bufferVariance,
                                    prefilter_rect,
                                    buffer_pass_stride,
                                    buffer_denoising_offset);
}
58
/* Entry point for feature extraction: for each pixel inside prefilter_rect,
 * kernel_filter_get_feature() fills the mean and variance buffers from the
 * passes selected by m_offset/v_offset. Out-of-rect threads exit early. */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_get_feature(int sample,
                               TileInfo *tile_info,
                               int m_offset,
                               int v_offset,
                               float *mean,
                               float *variance,
                               int4 prefilter_rect,
                               int buffer_pass_stride,
                               int buffer_denoising_offset)
{
        const int px = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
        const int py = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
        if(px >= prefilter_rect.z || py >= prefilter_rect.w) {
                return;
        }
        kernel_filter_get_feature(sample,
                                  tile_info,
                                  m_offset, v_offset,
                                  px, py,
                                  mean, variance,
                                  prefilter_rect,
                                  buffer_pass_stride,
                                  buffer_denoising_offset);
}
84
/* Entry point for outlier detection: runs kernel_filter_detect_outliers()
 * once per pixel inside prefilter_rect. */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_detect_outliers(float *image,
                                   float *variance,
                                   float *depth,
                                   float *output,
                                   int4 prefilter_rect,
                                   int pass_stride)
{
        const int px = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
        const int py = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
        const bool inside_rect = (px < prefilter_rect.z) && (py < prefilter_rect.w);
        if(inside_rect) {
                kernel_filter_detect_outliers(px, py, image, variance, depth, output, prefilter_rect, pass_stride);
        }
}
100
/* Entry point for combining the two half buffers a/b into mean and variance,
 * one thread per pixel of prefilter_rect (see kernel_filter_combine_halves()). */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_combine_halves(float *mean, float *variance, float *a, float *b, int4 prefilter_rect, int r)
{
        const int px = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
        const int py = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
        if(px >= prefilter_rect.z || py >= prefilter_rect.w) {
                return;
        }
        kernel_filter_combine_halves(px, py, mean, variance, a, b, prefilter_rect, r);
}
111
/* Entry point for constructing the per-pixel feature transform, one thread
 * per pixel of filter_area. Each thread writes its result through pointers
 * offset by its pixel index; the element stride filter_area.z*filter_area.w
 * is passed down so consecutive elements of one pixel's transform are that
 * far apart in memory. */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_construct_transform(const float *ccl_restrict buffer,
                                       float *transform, int *rank,
                                       int4 filter_area, int4 rect,
                                       int radius, float pca_threshold,
                                       int pass_stride)
{
        int x = blockDim.x*blockIdx.x + threadIdx.x;
        int y = blockDim.y*blockIdx.y + threadIdx.y;
        if(x < filter_area.z && y < filter_area.w) {
                /* Base pointers for this pixel's slot in the strided storage. */
                int *l_rank = rank + y*filter_area.z + x;
                float *l_transform = transform + y*filter_area.z + x;
                kernel_filter_construct_transform(buffer,
                                                  x + filter_area.x, y + filter_area.y,
                                                  rect, pass_stride,
                                                  l_transform, l_rank,
                                                  radius, pca_threshold,
                                                  filter_area.z*filter_area.w,
                                                  threadIdx.y*blockDim.x + threadIdx.x);
        }
}
134
/* NLM pass: compute the per-pixel weighted difference for one (pixel, shift)
 * pair. get_nlm_coords() maps this thread to its coordinates, clip rect and
 * buffer offset, and returns false when the thread has no work. */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image,
                                       const float *ccl_restrict variance_image,
                                       float *difference_image,
                                       int w,
                                       int h,
                                       int stride,
                                       int shift_stride,
                                       int r,
                                       int channel_offset,
                                       float a,
                                       float k_2)
{
        int4 coord, clip_rect;
        int offset;
        if(!get_nlm_coords(w, h, r, shift_stride, &clip_rect, &coord, &offset)) {
                return;
        }
        kernel_filter_nlm_calc_difference(coord.x, coord.y, coord.z, coord.w,
                                          weight_image,
                                          variance_image,
                                          difference_image + offset,
                                          clip_rect, stride,
                                          channel_offset, a, k_2);
}
160
/* NLM pass: separable blur of the difference image; one thread per
 * (pixel, shift) pair as assigned by get_nlm_coords(). */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_blur(const float *ccl_restrict difference_image,
                            float *out_image,
                            int w,
                            int h,
                            int stride,
                            int shift_stride,
                            int r,
                            int f)
{
        int4 coord, clip_rect;
        int offset;
        if(!get_nlm_coords(w, h, r, shift_stride, &clip_rect, &coord, &offset)) {
                return;
        }
        kernel_filter_nlm_blur(coord.x, coord.y,
                               difference_image + offset,
                               out_image + offset,
                               clip_rect, stride, f);
}
181
/* NLM pass: turn blurred differences into filter weights; one thread per
 * (pixel, shift) pair as assigned by get_nlm_coords(). */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_calc_weight(const float *ccl_restrict difference_image,
                                   float *out_image,
                                   int w,
                                   int h,
                                   int stride,
                                   int shift_stride,
                                   int r,
                                   int f)
{
        int4 coord, clip_rect;
        int offset;
        if(!get_nlm_coords(w, h, r, shift_stride, &clip_rect, &coord, &offset)) {
                return;
        }
        kernel_filter_nlm_calc_weight(coord.x, coord.y,
                                      difference_image + offset,
                                      out_image + offset,
                                      clip_rect, stride, f);
}
202
/* NLM pass: accumulate the weighted contribution of each (pixel, shift)
 * pair into out_image/accum_image. Threads without work (get_nlm_coords()
 * returned false) exit immediately. */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_update_output(const float *ccl_restrict difference_image,
                                     const float *ccl_restrict image,
                                     float *out_image,
                                     float *accum_image,
                                     int w,
                                     int h,
                                     int stride,
                                     int shift_stride,
                                     int r,
                                     int f)
{
        int4 coord, clip_rect;
        int offset;
        if(!get_nlm_coords(w, h, r, shift_stride, &clip_rect, &coord, &offset)) {
                return;
        }
        kernel_filter_nlm_update_output(coord.x, coord.y, coord.z, coord.w,
                                        difference_image + offset,
                                        image,
                                        out_image,
                                        accum_image,
                                        clip_rect, stride, f);
}
227
/* NLM pass: divide the accumulated output by the accumulated weights,
 * one thread per pixel of the w-by-h image. */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_normalize(float *out_image,
                                 const float *ccl_restrict accum_image,
                                 int w,
                                 int h,
                                 int stride)
{
        const int px = blockDim.x*blockIdx.x + threadIdx.x;
        const int py = blockDim.y*blockIdx.y + threadIdx.y;
        if(px >= w || py >= h) {
                return;
        }
        kernel_filter_nlm_normalize(px, py, out_image, accum_image, stride);
}
242
/* NLM pass: accumulate each (pixel, shift) pair's contribution into the
 * XtWX / XtWY least-squares system. get_nlm_coords_window() maps this thread
 * to its coordinates, clip rect and buffer offset restricted to
 * filter_window, and returns false when the thread has no work. */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_nlm_construct_gramian(const float *ccl_restrict difference_image,
                                         const float *ccl_restrict buffer,
                                         const float *ccl_restrict transform,
                                         int *rank,
                                         float *XtWX,
                                         float3 *XtWY,
                                         int4 filter_window,
                                         int w,
                                         int h,
                                         int stride,
                                         int shift_stride,
                                         int r,
                                         int f,
                                         int pass_stride)
{
        int4 co, rect;
        int ofs;
        if(get_nlm_coords_window(w, h, r, shift_stride, &rect, &co, &ofs, filter_window)) {
                kernel_filter_nlm_construct_gramian(co.x, co.y,
                                                    co.z, co.w,
                                                    difference_image + ofs,
                                                    buffer,
                                                    transform, rank,
                                                    XtWX, XtWY,
                                                    rect, filter_window,
                                                    stride, f,
                                                    pass_stride,
                                                    threadIdx.y*blockDim.x + threadIdx.x);
        }
}
275
/* Entry point for the final solve: one thread per pixel of filter_area,
 * with rank/XtWX/XtWY advanced to this pixel's storage slot. The element
 * stride filter_area.z*filter_area.w is passed on to the helper. */
extern "C" __global__ void
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
kernel_cuda_filter_finalize(float *buffer,
                            int *rank,
                            float *XtWX,
                            float3 *XtWY,
                            int4 filter_area,
                            int4 buffer_params,
                            int sample)
{
        const int px = blockDim.x*blockIdx.x + threadIdx.x;
        const int py = blockDim.y*blockIdx.y + threadIdx.y;
        if(px >= filter_area.z || py >= filter_area.w) {
                return;
        }
        const int storage_ofs = py*filter_area.z + px;
        kernel_filter_finalize(px, py, buffer, rank + storage_ofs,
                               filter_area.z*filter_area.w,
                               XtWX + storage_ofs, XtWY + storage_ofs,
                               buffer_params, sample);
}
299
300 #endif
301