/* Source provenance: blender.git, commit f812a6601c6c93891313038b69b4a8d10a99ab5d
 * intern/cycles/kernel/kernels/cuda/filter.cu */
/*
 * Copyright 2011-2017 Blender Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* CUDA kernel entry points */

#ifdef __CUDA_ARCH__

#include "kernel_config.h"

#include "kernel/kernel_compat_cuda.h"

#include "kernel/filter/filter_kernel.h"
27 /* kernels */
28
29 extern "C" __global__ void
30 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
31 kernel_cuda_filter_divide_shadow(int sample,
32                                  TilesInfo *tiles,
33                                  float *unfilteredA,
34                                  float *unfilteredB,
35                                  float *sampleVariance,
36                                  float *sampleVarianceV,
37                                  float *bufferVariance,
38                                  int4 prefilter_rect,
39                                  int buffer_pass_stride,
40                                  int buffer_denoising_offset,
41                                  bool use_split_variance)
42 {
43         int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
44         int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
45         if(x < prefilter_rect.z && y < prefilter_rect.w) {
46                 kernel_filter_divide_shadow(sample,
47                                             tiles,
48                                             x, y,
49                                             unfilteredA,
50                                             unfilteredB,
51                                             sampleVariance,
52                                             sampleVarianceV,
53                                             bufferVariance,
54                                             prefilter_rect,
55                                             buffer_pass_stride,
56                                             buffer_denoising_offset,
57                                             use_split_variance);
58         }
59 }
60
61 extern "C" __global__ void
62 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
63 kernel_cuda_filter_get_feature(int sample,
64                                TilesInfo *tiles,
65                                int m_offset,
66                                int v_offset,
67                                float *mean,
68                                float *variance,
69                                int4 prefilter_rect,
70                                int buffer_pass_stride,
71                                int buffer_denoising_offset,
72                                bool use_split_variance)
73 {
74         int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
75         int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
76         if(x < prefilter_rect.z && y < prefilter_rect.w) {
77                 kernel_filter_get_feature(sample,
78                                           tiles,
79                                           m_offset, v_offset,
80                                           x, y,
81                                           mean, variance,
82                                           prefilter_rect,
83                                           buffer_pass_stride,
84                                           buffer_denoising_offset,
85                                           use_split_variance);
86         }
87 }
88
89 extern "C" __global__ void
90 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
91 kernel_cuda_filter_detect_outliers(float *image,
92                                    float *variance,
93                                    float *depth,
94                                    float *output,
95                                    int4 prefilter_rect,
96                                    int pass_stride)
97 {
98         int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
99         int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
100         if(x < prefilter_rect.z && y < prefilter_rect.w) {
101                 kernel_filter_detect_outliers(x, y, image, variance, depth, output, prefilter_rect, pass_stride);
102         }
103 }
104
105 extern "C" __global__ void
106 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
107 kernel_cuda_filter_combine_halves(float *mean, float *variance, float *a, float *b, int4 prefilter_rect, int r)
108 {
109         int x = prefilter_rect.x + blockDim.x*blockIdx.x + threadIdx.x;
110         int y = prefilter_rect.y + blockDim.y*blockIdx.y + threadIdx.y;
111         if(x < prefilter_rect.z && y < prefilter_rect.w) {
112                 kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r);
113         }
114 }
115
116 extern "C" __global__ void
117 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
118 kernel_cuda_filter_construct_transform(float const* __restrict__ buffer,
119                                        float *transform, int *rank,
120                                        int4 filter_area, int4 rect,
121                                        int radius, float pca_threshold,
122                                        int pass_stride)
123 {
124         int x = blockDim.x*blockIdx.x + threadIdx.x;
125         int y = blockDim.y*blockIdx.y + threadIdx.y;
126         if(x < filter_area.z && y < filter_area.w) {
127                 int *l_rank = rank + y*filter_area.z + x;
128                 float *l_transform = transform + y*filter_area.z + x;
129                 kernel_filter_construct_transform(buffer,
130                                                   x + filter_area.x, y + filter_area.y,
131                                                   rect, pass_stride,
132                                                   l_transform, l_rank,
133                                                   radius, pca_threshold,
134                                                   filter_area.z*filter_area.w,
135                                                   threadIdx.y*blockDim.x + threadIdx.x);
136         }
137 }
138
139 extern "C" __global__ void
140 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
141 kernel_cuda_filter_nlm_calc_difference(int dx, int dy,
142                                        float ccl_restrict_ptr weightImage,
143                                        float ccl_restrict_ptr varianceImage,
144                                        float *differenceImage,
145                                        int4 rect, int w,
146                                        int channel_offset,
147                                        float a, float k_2) {
148         int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
149         int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
150         if(x < rect.z && y < rect.w) {
151                 kernel_filter_nlm_calc_difference(x, y, dx, dy, weightImage, varianceImage, differenceImage, rect, w, channel_offset, a, k_2);
152         }
153 }
154
155 extern "C" __global__ void
156 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
157 kernel_cuda_filter_nlm_blur(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f) {
158         int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
159         int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
160         if(x < rect.z && y < rect.w) {
161                 kernel_filter_nlm_blur(x, y, differenceImage, outImage, rect, w, f);
162         }
163 }
164
165 extern "C" __global__ void
166 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
167 kernel_cuda_filter_nlm_calc_weight(float ccl_restrict_ptr differenceImage, float *outImage, int4 rect, int w, int f) {
168         int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
169         int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
170         if(x < rect.z && y < rect.w) {
171                 kernel_filter_nlm_calc_weight(x, y, differenceImage, outImage, rect, w, f);
172         }
173 }
174
175 extern "C" __global__ void
176 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
177 kernel_cuda_filter_nlm_update_output(int dx, int dy,
178                                      float ccl_restrict_ptr differenceImage,
179                                      float ccl_restrict_ptr image,
180                                      float *outImage, float *accumImage,
181                                      int4 rect, int w,
182                                      int f) {
183         int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
184         int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
185         if(x < rect.z && y < rect.w) {
186                 kernel_filter_nlm_update_output(x, y, dx, dy, differenceImage, image, outImage, accumImage, rect, w, f);
187         }
188 }
189
190 extern "C" __global__ void
191 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
192 kernel_cuda_filter_nlm_normalize(float *outImage, float ccl_restrict_ptr accumImage, int4 rect, int w) {
193         int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
194         int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
195         if(x < rect.z && y < rect.w) {
196                 kernel_filter_nlm_normalize(x, y, outImage, accumImage, rect, w);
197         }
198 }
199
200 extern "C" __global__ void
201 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
202 kernel_cuda_filter_nlm_construct_gramian(int dx, int dy,
203                                          float ccl_restrict_ptr differenceImage,
204                                          float ccl_restrict_ptr buffer,
205                                          float *color_pass,
206                                          float *variance_pass,
207                                          float const* __restrict__ transform,
208                                          int *rank,
209                                          float *XtWX,
210                                          float3 *XtWY,
211                                          int4 rect,
212                                          int4 filter_rect,
213                                          int w, int h, int f,
214                                          int pass_stride) {
215         int x = blockDim.x*blockIdx.x + threadIdx.x + max(0, rect.x-filter_rect.x);
216         int y = blockDim.y*blockIdx.y + threadIdx.y + max(0, rect.y-filter_rect.y);
217         if(x < min(filter_rect.z, rect.z-filter_rect.x) && y < min(filter_rect.w, rect.w-filter_rect.y)) {
218                 kernel_filter_nlm_construct_gramian(x, y,
219                                                     dx, dy,
220                                                     differenceImage,
221                                                     buffer,
222                                                     color_pass, variance_pass,
223                                                     transform, rank,
224                                                     XtWX, XtWY,
225                                                     rect, filter_rect,
226                                                     w, h, f,
227                                                     pass_stride,
228                                                     threadIdx.y*blockDim.x + threadIdx.x);
229         }
230 }
231
232 extern "C" __global__ void
233 CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
234 kernel_cuda_filter_finalize(int w, int h,
235                             float *buffer, int *rank,
236                             float *XtWX, float3 *XtWY,
237                             int4 filter_area, int4 buffer_params,
238                             int sample) {
239         int x = blockDim.x*blockIdx.x + threadIdx.x;
240         int y = blockDim.y*blockIdx.y + threadIdx.y;
241         if(x < filter_area.z && y < filter_area.w) {
242                 int storage_ofs = y*filter_area.z+x;
243                 rank += storage_ofs;
244                 XtWX += storage_ofs;
245                 XtWY += storage_ofs;
246                 kernel_filter_finalize(x, y, w, h, buffer, rank, filter_area.z*filter_area.w, XtWX, XtWY, buffer_params, sample);
247         }
248 }
249
#endif