Cycles: Improve denoising speed on GPUs with small tile sizes
[blender.git] / intern / cycles / kernel / kernels / cpu / filter_cpu_impl.h
1 /*
2  * Copyright 2011-2017 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
/* Templated common implementation part of all CPU kernels.
 *
 * The idea is that particular .cpp files set the needed optimization flags and
 * simply include this file, without having to copy the actual implementation.
 */
22
23 #include "kernel/kernel_compat_cpu.h"
24
25 #include "kernel/filter/filter_kernel.h"
26
#ifdef KERNEL_STUB
#  include "util/util_debug.h"
/* In stub builds every kernel body is replaced by an assertion on the negation
 * of a string literal, which is always false; the assertion text itself is the
 * diagnostic.
 *
 * STUB_ASSERT stringifies `name` exactly as written, but hands `arch` to a
 * second macro so that it is macro-expanded before being stringified. With a
 * direct `#arch`, passing KERNEL_ARCH would always produce the literal text
 * "KERNEL_ARCH" instead of whatever KERNEL_ARCH is defined to.
 * If `arch` is not a macro, the result is the same as a direct `#arch`.
 *
 * STUB_ASSERT_IMPL is an implementation detail of STUB_ASSERT; its definition
 * is identical on every inclusion, so a repeated definition is harmless. */
#  define STUB_ASSERT_IMPL(arch, name_str) assert(!(name_str " kernel stub for architecture " #arch " was called!"))
#  define STUB_ASSERT(arch, name) STUB_ASSERT_IMPL(arch, #name)
#endif
31
32 CCL_NAMESPACE_BEGIN
33
34
35 /* Denoise filter */
36
37 void KERNEL_FUNCTION_FULL_NAME(filter_divide_shadow)(int sample,
38                                                      TilesInfo *tiles,
39                                                      int x,
40                                                      int y,
41                                                      float *unfilteredA,
42                                                      float *unfilteredB,
43                                                      float *sampleVariance,
44                                                      float *sampleVarianceV,
45                                                      float *bufferVariance,
46                                                      int* prefilter_rect,
47                                                      int buffer_pass_stride,
48                                                      int buffer_denoising_offset)
49 {
50 #ifdef KERNEL_STUB
51         STUB_ASSERT(KERNEL_ARCH, filter_divide_shadow);
52 #else
53         kernel_filter_divide_shadow(sample, tiles,
54                                     x, y,
55                                     unfilteredA,
56                                     unfilteredB,
57                                     sampleVariance,
58                                     sampleVarianceV,
59                                     bufferVariance,
60                                     load_int4(prefilter_rect),
61                                     buffer_pass_stride,
62                                     buffer_denoising_offset);
63 #endif
64 }
65
66 void KERNEL_FUNCTION_FULL_NAME(filter_get_feature)(int sample,
67                                                    TilesInfo *tiles,
68                                                    int m_offset,
69                                                    int v_offset,
70                                                    int x,
71                                                    int y,
72                                                    float *mean, float *variance,
73                                                    int* prefilter_rect,
74                                                    int buffer_pass_stride,
75                                                    int buffer_denoising_offset)
76 {
77 #ifdef KERNEL_STUB
78         STUB_ASSERT(KERNEL_ARCH, filter_get_feature);
79 #else
80         kernel_filter_get_feature(sample, tiles,
81                                   m_offset, v_offset,
82                                   x, y,
83                                   mean, variance,
84                                   load_int4(prefilter_rect),
85                                   buffer_pass_stride,
86                                   buffer_denoising_offset);
87 #endif
88 }
89
90 void KERNEL_FUNCTION_FULL_NAME(filter_detect_outliers)(int x, int y,
91                                                        ccl_global float *image,
92                                                        ccl_global float *variance,
93                                                        ccl_global float *depth,
94                                                        ccl_global float *output,
95                                                        int *rect,
96                                                        int pass_stride)
97 {
98 #ifdef KERNEL_STUB
99         STUB_ASSERT(KERNEL_ARCH, filter_detect_outliers);
100 #else
101         kernel_filter_detect_outliers(x, y, image, variance, depth, output, load_int4(rect), pass_stride);
102 #endif
103 }
104
105 void KERNEL_FUNCTION_FULL_NAME(filter_combine_halves)(int x, int y,
106                                                       float *mean,
107                                                       float *variance,
108                                                       float *a,
109                                                       float *b,
110                                                       int* prefilter_rect,
111                                                       int r)
112 {
113 #ifdef KERNEL_STUB
114         STUB_ASSERT(KERNEL_ARCH, filter_combine_halves);
115 #else
116         kernel_filter_combine_halves(x, y, mean, variance, a, b, load_int4(prefilter_rect), r);
117 #endif
118 }
119
120 void KERNEL_FUNCTION_FULL_NAME(filter_construct_transform)(float* buffer,
121                                                            int x,
122                                                            int y,
123                                                            int storage_ofs,
124                                                            float *transform,
125                                                            int *rank,
126                                                            int* prefilter_rect,
127                                                            int pass_stride,
128                                                            int radius,
129                                                            float pca_threshold)
130 {
131 #ifdef KERNEL_STUB
132         STUB_ASSERT(KERNEL_ARCH, filter_construct_transform);
133 #else
134   rank += storage_ofs;
135   transform += storage_ofs*TRANSFORM_SIZE;
136         kernel_filter_construct_transform(buffer,
137                                           x, y,
138                                           load_int4(prefilter_rect),
139                                           pass_stride,
140                                           transform,
141                                           rank,
142                                           radius,
143                                           pca_threshold);
144 #endif
145 }
146
147 void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
148                                                            int dy,
149                                                            float *weight_image,
150                                                            float *variance,
151                                                            float *difference_image,
152                                                            int *rect,
153                                                            int stride,
154                                                            int channel_offset,
155                                                            float a,
156                                                            float k_2)
157 {
158 #ifdef KERNEL_STUB
159         STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_difference);
160 #else
161         kernel_filter_nlm_calc_difference(dx, dy, weight_image, variance, difference_image, load_int4(rect), stride, channel_offset, a, k_2);
162 #endif
163 }
164
165 void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *difference_image,
166                                                 float *out_image,
167                                                 int *rect,
168                                                 int stride,
169                                                 int f)
170 {
171 #ifdef KERNEL_STUB
172         STUB_ASSERT(KERNEL_ARCH, filter_nlm_blur);
173 #else
174         kernel_filter_nlm_blur(difference_image, out_image, load_int4(rect), stride, f);
175 #endif
176 }
177
178 void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *difference_image,
179                                                        float *out_image,
180                                                        int *rect,
181                                                        int stride,
182                                                        int f)
183 {
184 #ifdef KERNEL_STUB
185         STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_weight);
186 #else
187         kernel_filter_nlm_calc_weight(difference_image, out_image, load_int4(rect), stride, f);
188 #endif
189 }
190
191 void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
192                                                          int dy,
193                                                          float *difference_image,
194                                                          float *image,
195                                                          float *out_image,
196                                                          float *accum_image,
197                                                          int *rect,
198                                                          int stride,
199                                                          int f)
200 {
201 #ifdef KERNEL_STUB
202         STUB_ASSERT(KERNEL_ARCH, filter_nlm_update_output);
203 #else
204         kernel_filter_nlm_update_output(dx, dy, difference_image, image, out_image, accum_image, load_int4(rect), stride, f);
205 #endif
206 }
207
208 void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
209                                                              int dy,
210                                                              float *difference_image,
211                                                              float *buffer,
212                                                              float *transform,
213                                                              int *rank,
214                                                              float *XtWX,
215                                                              float3 *XtWY,
216                                                              int *rect,
217                                                              int *filter_window,
218                                                              int stride,
219                                                              int f,
220                                                              int pass_stride)
221 {
222 #ifdef KERNEL_STUB
223         STUB_ASSERT(KERNEL_ARCH, filter_nlm_construct_gramian);
224 #else
225         kernel_filter_nlm_construct_gramian(dx, dy, difference_image, buffer, transform, rank, XtWX, XtWY, load_int4(rect), load_int4(filter_window), stride, f, pass_stride);
226 #endif
227 }
228
229 void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *out_image,
230                                                      float *accum_image,
231                                                      int *rect,
232                                                      int stride)
233 {
234 #ifdef KERNEL_STUB
235         STUB_ASSERT(KERNEL_ARCH, filter_nlm_normalize);
236 #else
237         kernel_filter_nlm_normalize(out_image, accum_image, load_int4(rect), stride);
238 #endif
239 }
240
241 void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x,
242                                                 int y,
243                                                 int storage_ofs,
244                                                 float *buffer,
245                                                 int *rank,
246                                                 float *XtWX,
247                                                 float3 *XtWY,
248                                                 int *buffer_params,
249                                                 int sample)
250 {
251 #ifdef KERNEL_STUB
252         STUB_ASSERT(KERNEL_ARCH, filter_finalize);
253 #else
254         XtWX += storage_ofs*XTWX_SIZE;
255         XtWY += storage_ofs*XTWY_SIZE;
256         rank += storage_ofs;
257         kernel_filter_finalize(x, y, buffer, rank, 1, XtWX, XtWY, load_int4(buffer_params), sample);
258 #endif
259 }
260
261 #undef KERNEL_STUB
262 #undef STUB_ASSERT
263 #undef KERNEL_ARCH
264
265 CCL_NAMESPACE_END