Cycles: Improve denoising speed on GPUs with small tile sizes
[blender.git] / intern / cycles / kernel / kernels / opencl / filter.cl
1 /*
2  * Copyright 2011-2017 Blender Foundation
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16
17 /* OpenCL kernel entry points */
18
19 #include "kernel/kernel_compat_opencl.h"
20
21 #include "kernel/filter/filter_kernel.h"
22
23 /* kernels */
24
25 __kernel void kernel_ocl_filter_divide_shadow(int sample,
26                                               ccl_global TilesInfo *tiles,
27                                               ccl_global float *unfilteredA,
28                                               ccl_global float *unfilteredB,
29                                               ccl_global float *sampleVariance,
30                                               ccl_global float *sampleVarianceV,
31                                               ccl_global float *bufferVariance,
32                                               int4 prefilter_rect,
33                                               int buffer_pass_stride,
34                                               int buffer_denoising_offset)
35 {
36         int x = prefilter_rect.x + get_global_id(0);
37         int y = prefilter_rect.y + get_global_id(1);
38         if(x < prefilter_rect.z && y < prefilter_rect.w) {
39                 kernel_filter_divide_shadow(sample,
40                                             tiles,
41                                             x, y,
42                                             unfilteredA,
43                                             unfilteredB,
44                                             sampleVariance,
45                                             sampleVarianceV,
46                                             bufferVariance,
47                                             prefilter_rect,
48                                             buffer_pass_stride,
49                                             buffer_denoising_offset);
50         }
51 }
52
53 __kernel void kernel_ocl_filter_get_feature(int sample,
54                                             ccl_global TilesInfo *tiles,
55                                             int m_offset,
56                                             int v_offset,
57                                             ccl_global float *mean,
58                                             ccl_global float *variance,
59                                             int4 prefilter_rect,
60                                             int buffer_pass_stride,
61                                             int buffer_denoising_offset)
62 {
63         int x = prefilter_rect.x + get_global_id(0);
64         int y = prefilter_rect.y + get_global_id(1);
65         if(x < prefilter_rect.z && y < prefilter_rect.w) {
66                 kernel_filter_get_feature(sample,
67                                           tiles,
68                                           m_offset, v_offset,
69                                           x, y,
70                                           mean, variance,
71                                           prefilter_rect,
72                                           buffer_pass_stride,
73                                           buffer_denoising_offset);
74         }
75 }
76
77 __kernel void kernel_ocl_filter_detect_outliers(ccl_global float *image,
78                                                 ccl_global float *variance,
79                                                 ccl_global float *depth,
80                                                 ccl_global float *output,
81                                                 int4 prefilter_rect,
82                                                 int pass_stride)
83 {
84         int x = prefilter_rect.x + get_global_id(0);
85         int y = prefilter_rect.y + get_global_id(1);
86         if(x < prefilter_rect.z && y < prefilter_rect.w) {
87                 kernel_filter_detect_outliers(x, y, image, variance, depth, output, prefilter_rect, pass_stride);
88         }
89 }
90
91 __kernel void kernel_ocl_filter_combine_halves(ccl_global float *mean,
92                                                ccl_global float *variance,
93                                                ccl_global float *a,
94                                                ccl_global float *b,
95                                                int4 prefilter_rect,
96                                                int r)
97 {
98         int x = prefilter_rect.x + get_global_id(0);
99         int y = prefilter_rect.y + get_global_id(1);
100         if(x < prefilter_rect.z && y < prefilter_rect.w) {
101                 kernel_filter_combine_halves(x, y, mean, variance, a, b, prefilter_rect, r);
102         }
103 }
104
105 __kernel void kernel_ocl_filter_construct_transform(const ccl_global float *ccl_restrict buffer,
106                                                     ccl_global float *transform,
107                                                     ccl_global int *rank,
108                                                     int4 filter_area,
109                                                     int4 rect,
110                                                     int pass_stride,
111                                                     int radius,
112                                                     float pca_threshold)
113 {
114         int x = get_global_id(0);
115         int y = get_global_id(1);
116         if(x < filter_area.z && y < filter_area.w) {
117                 ccl_global int *l_rank = rank + y*filter_area.z + x;
118                 ccl_global float *l_transform = transform + y*filter_area.z + x;
119                 kernel_filter_construct_transform(buffer,
120                                                   x + filter_area.x, y + filter_area.y,
121                                                   rect, pass_stride,
122                                                   l_transform, l_rank,
123                                                   radius, pca_threshold,
124                                                   filter_area.z*filter_area.w,
125                                                   get_local_id(1)*get_local_size(0) + get_local_id(0));
126         }
127 }
128
129 __kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_restrict weight_image,
130                                                     const ccl_global float *ccl_restrict variance_image,
131                                                     ccl_global float *difference_image,
132                                                     int w,
133                                                     int h,
134                                                     int stride,
135                                                     int shift_stride,
136                                                     int r,
137                                                     int channel_offset,
138                                                     float a,
139                                                     float k_2)
140 {
141         int4 co, rect;
142         int ofs;
143         if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
144                 kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w,
145                                                   weight_image,
146                                                   variance_image,
147                                                   difference_image + ofs,
148                                                   rect, stride,
149                                                   channel_offset, a, k_2);
150         }
151 }
152
153 __kernel void kernel_ocl_filter_nlm_blur(const ccl_global float *ccl_restrict difference_image,
154                                          ccl_global float *out_image,
155                                          int w,
156                                          int h,
157                                          int stride,
158                                          int shift_stride,
159                                          int r,
160                                          int f)
161 {
162         int4 co, rect;
163         int ofs;
164         if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
165                 kernel_filter_nlm_blur(co.x, co.y,
166                                        difference_image + ofs,
167                                        out_image + ofs,
168                                        rect, stride, f);
169         }
170 }
171
172 __kernel void kernel_ocl_filter_nlm_calc_weight(const ccl_global float *ccl_restrict difference_image,
173                                                 ccl_global float *out_image,
174                                                 int w,
175                                                 int h,
176                                                 int stride,
177                                                 int shift_stride,
178                                                 int r,
179                                                 int f)
180 {
181         int4 co, rect;
182         int ofs;
183         if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
184                 kernel_filter_nlm_calc_weight(co.x, co.y,
185                                               difference_image + ofs,
186                                               out_image + ofs,
187                                               rect, stride, f);
188         }
189 }
190
191 __kernel void kernel_ocl_filter_nlm_update_output(const ccl_global float *ccl_restrict difference_image,
192                                                   const ccl_global float *ccl_restrict image,
193                                                   ccl_global float *out_image,
194                                                   ccl_global float *accum_image,
195                                                   int w,
196                                                   int h,
197                                                   int stride,
198                                                   int shift_stride,
199                                                   int r,
200                                                   int f)
201 {
202         int4 co, rect;
203         int ofs;
204         if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
205                 kernel_filter_nlm_update_output(co.x, co.y, co.z, co.w,
206                                                 difference_image + ofs,
207                                                 image,
208                                                 out_image,
209                                                 accum_image,
210                                                 rect, stride, f);
211         }
212 }
213
214 __kernel void kernel_ocl_filter_nlm_normalize(ccl_global float *out_image,
215                                               const ccl_global float *ccl_restrict accum_image,
216                                               int w,
217                                               int h,
218                                               int stride)
219 {
220         int x = get_global_id(0);
221         int y = get_global_id(1);
222         if(x < w && y < h) {
223                 kernel_filter_nlm_normalize(x, y, out_image, accum_image, stride);
224         }
225 }
226
227 __kernel void kernel_ocl_filter_nlm_construct_gramian(const ccl_global float *ccl_restrict difference_image,
228                                                       const ccl_global float *ccl_restrict buffer,
229                                                       const ccl_global float *ccl_restrict transform,
230                                                       ccl_global int *rank,
231                                                       ccl_global float *XtWX,
232                                                       ccl_global float3 *XtWY,
233                                                       int4 filter_window,
234                                                       int w,
235                                                       int h,
236                                                       int stride,
237                                                       int shift_stride,
238                                                       int r,
239                                                       int f,
240                                                       int pass_stride)
241 {
242         int4 co, rect;
243         int ofs;
244         if(get_nlm_coords_window(w, h, r, shift_stride, &rect, &co, &ofs, filter_window)) {
245                 kernel_filter_nlm_construct_gramian(co.x, co.y,
246                                                     co.z, co.w,
247                                                     difference_image + ofs,
248                                                     buffer,
249                                                     transform, rank,
250                                                     XtWX, XtWY,
251                                                     rect, filter_window,
252                                                     stride, f,
253                                                     pass_stride,
254                                                     get_local_id(1)*get_local_size(0) + get_local_id(0));
255         }
256 }
257
258 __kernel void kernel_ocl_filter_finalize(ccl_global float *buffer,
259                                          ccl_global int *rank,
260                                          ccl_global float *XtWX,
261                                          ccl_global float3 *XtWY,
262                                          int4 filter_area,
263                                          int4 buffer_params,
264                                          int sample)
265 {
266         int x = get_global_id(0);
267         int y = get_global_id(1);
268         if(x < filter_area.z && y < filter_area.w) {
269                 int storage_ofs = y*filter_area.z+x;
270                 rank += storage_ofs;
271                 XtWX += storage_ofs;
272                 XtWY += storage_ofs;
273                 kernel_filter_finalize(x, y, buffer, rank,
274                                        filter_area.z*filter_area.w,
275                                        XtWX, XtWY,
276                                        buffer_params, sample);
277         }
278 }
279
280 __kernel void kernel_ocl_filter_set_tiles(ccl_global TilesInfo* tiles,
281                                           ccl_global float *buffer_1,
282                                           ccl_global float *buffer_2,
283                                           ccl_global float *buffer_3,
284                                           ccl_global float *buffer_4,
285                                           ccl_global float *buffer_5,
286                                           ccl_global float *buffer_6,
287                                           ccl_global float *buffer_7,
288                                           ccl_global float *buffer_8,
289                                           ccl_global float *buffer_9)
290 {
291         if((get_global_id(0) == 0) && (get_global_id(1) == 0)) {
292                 tiles->buffers[0] = buffer_1;
293                 tiles->buffers[1] = buffer_2;
294                 tiles->buffers[2] = buffer_3;
295                 tiles->buffers[3] = buffer_4;
296                 tiles->buffers[4] = buffer_5;
297                 tiles->buffers[5] = buffer_6;
298                 tiles->buffers[6] = buffer_7;
299                 tiles->buffers[7] = buffer_8;
300                 tiles->buffers[8] = buffer_9;
301         }
302 }